Publish Ropedia minimal task baseline weights
Browse files
- .gitattributes +1 -0
- README.md +107 -0
- artifacts/episode_task_suite/available_modalities.json +83 -0
- artifacts/episode_task_suite/caption_grounding/metrics.json +15 -0
- artifacts/episode_task_suite/caption_grounding/model.npz +3 -0
- artifacts/episode_task_suite/contact_prediction/metrics.json +19 -0
- artifacts/episode_task_suite/contact_prediction/model.npz +3 -0
- artifacts/episode_task_suite/cross_modal_retrieval/metrics.json +15 -0
- artifacts/episode_task_suite/cross_modal_retrieval/model.npz +3 -0
- artifacts/episode_task_suite/feature_manifest.json +104 -0
- artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json +15 -0
- artifacts/episode_task_suite/misalignment_detection/metrics.json +19 -0
- artifacts/episode_task_suite/misalignment_detection/model.npz +3 -0
- artifacts/episode_task_suite/modality_reconstruction/metrics.json +12 -0
- artifacts/episode_task_suite/next_action/metrics.json +24 -0
- artifacts/episode_task_suite/next_action/model.npz +3 -0
- artifacts/episode_task_suite/object_relevance/metrics.json +14 -0
- artifacts/episode_task_suite/object_relevance/model.npz +3 -0
- artifacts/episode_task_suite/temporal_order/metrics.json +19 -0
- artifacts/episode_task_suite/temporal_order/model.npz +3 -0
- artifacts/episode_task_suite/timeline_action/metrics.json +24 -0
- artifacts/episode_task_suite/timeline_action/model.npz +3 -0
- artifacts/episode_task_suite/timeline_subtask/metrics.json +24 -0
- artifacts/episode_task_suite/timeline_subtask/model.npz +3 -0
- artifacts/episode_task_suite/transition_detection/metrics.json +26 -0
- artifacts/episode_task_suite/transition_detection/model.npz +3 -0
- artifacts/min_action_model/metrics.json +11 -0
- artifacts/min_action_model/model.npz +3 -0
- artifacts/min_all_modalities_action_model/available_modalities.json +83 -0
- artifacts/min_all_modalities_action_model/feature_manifest.json +104 -0
- artifacts/min_all_modalities_action_model/metrics.json +13 -0
- artifacts/min_all_modalities_action_model/model.npz +3 -0
- artifacts/min_all_modalities_subtask_model/available_modalities.json +83 -0
- artifacts/min_all_modalities_subtask_model/feature_manifest.json +104 -0
- artifacts/min_all_modalities_subtask_model/metrics.json +13 -0
- artifacts/min_all_modalities_subtask_model/model.npz +3 -0
- artifacts/min_subtask_model/metrics.json +11 -0
- artifacts/min_subtask_model/model.npz +3 -0
- assets/task_architectures.svg +216 -0
- assets/task_suite_infographic.png +3 -0
- notes/all_modalities_model.md +148 -0
- notes/episode_task_suite.md +176 -0
- notes/min_action_model.md +85 -0
- notes/reproducibility_audit.md +124 -0
- scripts/episode_task_suite.py +776 -0
- scripts/generate_visualizations.py +474 -0
- scripts/render_task_suite_infographic.py +378 -0
- scripts/train_all_modalities_model.py +582 -0
- scripts/train_min_action_model.py +531 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/task_suite_infographic.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
library_name: numpy
|
| 4 |
+
tags:
|
| 5 |
+
- robotics
|
| 6 |
+
- embodied-ai
|
| 7 |
+
- multimodal
|
| 8 |
+
- ropedia
|
| 9 |
+
- xperience-10m
|
| 10 |
+
- baseline
|
| 11 |
+
- linear-model
|
| 12 |
+
- retrieval
|
| 13 |
+
metrics:
|
| 14 |
+
- accuracy
|
| 15 |
+
- f1
|
| 16 |
+
- mean-reciprocal-rank
|
| 17 |
+
- mean-squared-error
|
| 18 |
+
model-index:
|
| 19 |
+
- name: Ropedia Minimal Task Baselines
|
| 20 |
+
results:
|
| 21 |
+
- task:
|
| 22 |
+
type: robotics
|
| 23 |
+
name: Cross-modal retrieval
|
| 24 |
+
dataset:
|
| 25 |
+
type: ropedia-ai/xperience-10m-sample
|
| 26 |
+
name: Xperience-10M public sample episode
|
| 27 |
+
metrics:
|
| 28 |
+
- type: top_5_accuracy
|
| 29 |
+
value: 0.3764
|
| 30 |
+
name: top-5 retrieval accuracy
|
| 31 |
+
- type: mrr
|
| 32 |
+
value: 0.2634
|
| 33 |
+
name: mean reciprocal rank
|
| 34 |
+
- task:
|
| 35 |
+
type: robotics
|
| 36 |
+
name: Transition detection
|
| 37 |
+
dataset:
|
| 38 |
+
type: ropedia-ai/xperience-10m-sample
|
| 39 |
+
name: Xperience-10M public sample episode
|
| 40 |
+
metrics:
|
| 41 |
+
- type: f1
|
| 42 |
+
value: 0.6552
|
| 43 |
+
name: macro-F1
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
# Ropedia Minimal Task Baselines
|
| 47 |
+
|
| 48 |
+
This repo stores the minimal baseline weights and metrics for the 12-task Ropedia episode suite.
|
| 49 |
+
|
| 50 |
+
These are intentionally small, transparent baselines:
|
| 51 |
+
|
| 52 |
+
- z-score + linear softmax classifiers,
|
| 53 |
+
- dual ridge regression/projection heads,
|
| 54 |
+
- sigmoid multi-label logistic regression,
|
| 55 |
+
- cosine ranking for retrieval tasks.
|
| 56 |
+
|
| 57 |
+
They are not deep robot policies or foundation models. Their purpose is to make every input/output contract auditable before scaling to many episodes.
|
| 58 |
+
|
| 59 |
+
## Included
|
| 60 |
+
|
| 61 |
+
- `artifacts/**/model.npz`: minimal baseline weights, scalers, and labels
|
| 62 |
+
- `artifacts/**/metrics.json`: committed metrics
|
| 63 |
+
- `artifacts/**/feature_manifest.json`: feature block boundaries where relevant
|
| 64 |
+
- `scripts/*.py`: training and visualization scripts
|
| 65 |
+
- `notes/*.md`: interpretation and reproducibility notes
|
| 66 |
+
|
| 67 |
+
The companion artifact dataset repo stores CSV/JSON predictions and dashboard assets:
|
| 68 |
+
|
| 69 |
+
https://huggingface.co/datasets/cy0307/ropedia-episode-task-suite-artifacts
|
| 70 |
+
|
| 71 |
+
The public visual dashboard is here:
|
| 72 |
+
|
| 73 |
+
https://huggingface.co/spaces/cy0307/ropedia-episode-task-suite
|
| 74 |
+
|
| 75 |
+
## Minimal Architecture
|
| 76 |
+
|
| 77 |
+

|
| 78 |
+
|
| 79 |
+
## Metrics Snapshot
|
| 80 |
+
|
| 81 |
+
| Task | Minimal head | Main metric |
|
| 82 |
+
| --- | --- | ---: |
|
| 83 |
+
| `timeline_action` | linear softmax | 0.0500 macro-F1 |
|
| 84 |
+
| `timeline_subtask` | linear softmax | 0.0495 macro-F1 |
|
| 85 |
+
| `transition_detection` | linear softmax | 0.6552 macro-F1 |
|
| 86 |
+
| `next_action` | linear softmax | 0.0593 macro-F1 |
|
| 87 |
+
| `hand_trajectory_forecast` | ridge regression | 0.8223 MPJPE |
|
| 88 |
+
| `contact_prediction` | linear softmax | 1.0000 macro-F1 (degenerate: only one class in this episode; majority baseline is also 1.0) |
|
| 89 |
+
| `object_relevance` | multi-label logistic | 0.1839 micro-F1 |
|
| 90 |
+
| `caption_grounding` | ridge + cosine rank | 0.0172 MRR |
|
| 91 |
+
| `cross_modal_retrieval` | ridge + cosine rank | 0.3764 top-5 accuracy |
|
| 92 |
+
| `modality_reconstruction` | ridge regression | -0.0160 R2 |
|
| 93 |
+
| `temporal_order` | binary softmax | 0.5487 F1 |
|
| 94 |
+
| `misalignment_detection` | binary softmax | 0.4866 F1 |
|
| 95 |
+
|
| 96 |
+
## Data Notice
|
| 97 |
+
|
| 98 |
+
This repo does not redistribute raw Ropedia videos or raw `annotation.hdf5`. Download the original sample from Ropedia / Hugging Face and follow the dataset terms:
|
| 99 |
+
|
| 100 |
+
- https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample
|
| 101 |
+
- https://ropedia.com/dataset
|
| 102 |
+
|
| 103 |
+
## Source
|
| 104 |
+
|
| 105 |
+
GitHub:
|
| 106 |
+
|
| 107 |
+
https://github.com/ChaoYue0307/ropedia-episode-task-suite
|
artifacts/episode_task_suite/available_modalities.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"modality": "depth_confidence",
|
| 4 |
+
"shape": [
|
| 5 |
+
5821,
|
| 6 |
+
140
|
| 7 |
+
]
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"modality": "video/fisheye_cam0",
|
| 11 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
|
| 12 |
+
"shape": [
|
| 13 |
+
5821,
|
| 14 |
+
98
|
| 15 |
+
],
|
| 16 |
+
"exists": true
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"modality": "video/fisheye_cam1",
|
| 20 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
|
| 21 |
+
"shape": [
|
| 22 |
+
5821,
|
| 23 |
+
98
|
| 24 |
+
],
|
| 25 |
+
"exists": true
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"modality": "video/fisheye_cam2",
|
| 29 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
|
| 30 |
+
"shape": [
|
| 31 |
+
5821,
|
| 32 |
+
98
|
| 33 |
+
],
|
| 34 |
+
"exists": true
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"modality": "video/fisheye_cam3",
|
| 38 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
|
| 39 |
+
"shape": [
|
| 40 |
+
5821,
|
| 41 |
+
98
|
| 42 |
+
],
|
| 43 |
+
"exists": true
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"modality": "video/stereo_left",
|
| 47 |
+
"path": "data/sample/xperience-10m-sample/stereo_left.mp4",
|
| 48 |
+
"shape": [
|
| 49 |
+
5821,
|
| 50 |
+
98
|
| 51 |
+
],
|
| 52 |
+
"exists": true
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"modality": "video/stereo_right",
|
| 56 |
+
"path": "data/sample/xperience-10m-sample/stereo_right.mp4",
|
| 57 |
+
"shape": [
|
| 58 |
+
5821,
|
| 59 |
+
98
|
| 60 |
+
],
|
| 61 |
+
"exists": true
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"modality": "caption_text",
|
| 65 |
+
"shape": [
|
| 66 |
+
5821,
|
| 67 |
+
128
|
| 68 |
+
],
|
| 69 |
+
"fields": "objects,interaction"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"modality": "slam_point_cloud_static",
|
| 73 |
+
"shape": [
|
| 74 |
+
22
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"modality": "calibration_static",
|
| 79 |
+
"shape": [
|
| 80 |
+
117
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
]
|
artifacts/episode_task_suite/caption_grounding/metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mrr": 0.017183946083791223,
|
| 3 |
+
"median_rank": 167.0,
|
| 4 |
+
"mean_rank": 174.39367816091954,
|
| 5 |
+
"num_queries": 348,
|
| 6 |
+
"top1_accuracy": 0.0028735632183908046,
|
| 7 |
+
"top5_accuracy": 0.011494252873563218,
|
| 8 |
+
"top10_accuracy": 0.017241379310344827,
|
| 9 |
+
"task": "caption_grounding",
|
| 10 |
+
"input": "caption objects/interaction text query + candidate sensor windows",
|
| 11 |
+
"output": "matching time window",
|
| 12 |
+
"split": "chronological",
|
| 13 |
+
"num_train_windows": 813,
|
| 14 |
+
"num_test_windows": 348
|
| 15 |
+
}
|
artifacts/episode_task_suite/caption_grounding/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:032da1fd5b5142b449e758a13bf5a450bb9ac22afde032bebf194987f97c1341
|
| 3 |
+
size 14459176
|
artifacts/episode_task_suite/contact_prediction/metrics.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 1.0,
|
| 3 |
+
"balanced_accuracy": 1.0,
|
| 4 |
+
"macro_f1": 1.0,
|
| 5 |
+
"weighted_f1": 1.0,
|
| 6 |
+
"num_eval_windows": 348,
|
| 7 |
+
"num_classes": 1,
|
| 8 |
+
"task": "contact_prediction",
|
| 9 |
+
"input": "all non-contact/non-caption-label modalities -> any body contact",
|
| 10 |
+
"split": "chronological",
|
| 11 |
+
"num_windows": 1161,
|
| 12 |
+
"num_train_windows": 813,
|
| 13 |
+
"num_test_windows": 348,
|
| 14 |
+
"feature_dim": 7335,
|
| 15 |
+
"majority_baseline_accuracy": 1.0,
|
| 16 |
+
"train_final_accuracy": 1.0,
|
| 17 |
+
"train_final_loss": 0.0005947681493125856,
|
| 18 |
+
"unseen_test_classes": []
|
| 19 |
+
}
|
artifacts/episode_task_suite/contact_prediction/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:050d2139076c55b251c2c23b62d6c58023cc7fb1c0431ded6795e775c9300a7b
|
| 3 |
+
size 82797
|
artifacts/episode_task_suite/cross_modal_retrieval/metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mrr": 0.26335984006618296,
|
| 3 |
+
"median_rank": 12.5,
|
| 4 |
+
"mean_rank": 43.33045977011494,
|
| 5 |
+
"num_queries": 348,
|
| 6 |
+
"top1_accuracy": 0.14942528735632185,
|
| 7 |
+
"top5_accuracy": 0.3764367816091954,
|
| 8 |
+
"top10_accuracy": 0.47413793103448276,
|
| 9 |
+
"task": "cross_modal_retrieval",
|
| 10 |
+
"input": "motion/IMU/camera query",
|
| 11 |
+
"output": "matching depth/video window",
|
| 12 |
+
"split": "chronological",
|
| 13 |
+
"num_train_windows": 813,
|
| 14 |
+
"num_test_windows": 348
|
| 15 |
+
}
|
artifacts/episode_task_suite/cross_modal_retrieval/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc5b2d0bc4350c4348be1e6098f9793a8ed5e479bad9ee20351bf2991c71347a
|
| 3 |
+
size 41310574
|
artifacts/episode_task_suite/feature_manifest.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"name": "hand_left_joints",
|
| 4 |
+
"start": 0,
|
| 5 |
+
"end": 441,
|
| 6 |
+
"dim": 441
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"name": "hand_right_joints",
|
| 10 |
+
"start": 441,
|
| 11 |
+
"end": 882,
|
| 12 |
+
"dim": 441
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"name": "body_joints",
|
| 16 |
+
"start": 882,
|
| 17 |
+
"end": 1974,
|
| 18 |
+
"dim": 1092
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"name": "body_contacts",
|
| 22 |
+
"start": 1974,
|
| 23 |
+
"end": 2121,
|
| 24 |
+
"dim": 147
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "camera_translation",
|
| 28 |
+
"start": 2121,
|
| 29 |
+
"end": 2142,
|
| 30 |
+
"dim": 21
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "camera_rotation_matrix",
|
| 34 |
+
"start": 2142,
|
| 35 |
+
"end": 2205,
|
| 36 |
+
"dim": 63
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"name": "imu_accel_gyro",
|
| 40 |
+
"start": 2205,
|
| 41 |
+
"end": 2247,
|
| 42 |
+
"dim": 42
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "depth_confidence",
|
| 46 |
+
"start": 2247,
|
| 47 |
+
"end": 3227,
|
| 48 |
+
"dim": 980
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "video_fisheye_cam0",
|
| 52 |
+
"start": 3227,
|
| 53 |
+
"end": 3913,
|
| 54 |
+
"dim": 686
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "video_fisheye_cam1",
|
| 58 |
+
"start": 3913,
|
| 59 |
+
"end": 4599,
|
| 60 |
+
"dim": 686
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "video_fisheye_cam2",
|
| 64 |
+
"start": 4599,
|
| 65 |
+
"end": 5285,
|
| 66 |
+
"dim": 686
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "video_fisheye_cam3",
|
| 70 |
+
"start": 5285,
|
| 71 |
+
"end": 5971,
|
| 72 |
+
"dim": 686
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "video_stereo_left",
|
| 76 |
+
"start": 5971,
|
| 77 |
+
"end": 6657,
|
| 78 |
+
"dim": 686
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "video_stereo_right",
|
| 82 |
+
"start": 6657,
|
| 83 |
+
"end": 7343,
|
| 84 |
+
"dim": 686
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "caption_objects_interaction_text",
|
| 88 |
+
"start": 7343,
|
| 89 |
+
"end": 8239,
|
| 90 |
+
"dim": 896
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "slam_point_cloud",
|
| 94 |
+
"start": 8239,
|
| 95 |
+
"end": 8261,
|
| 96 |
+
"dim": 22
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"name": "calibration",
|
| 100 |
+
"start": 8261,
|
| 101 |
+
"end": 8378,
|
| 102 |
+
"dim": 117
|
| 103 |
+
}
|
| 104 |
+
]
|
artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mse": 11.323140144348145,
|
| 3 |
+
"mae": 0.40246668457984924,
|
| 4 |
+
"r2": -1334.788993815828,
|
| 5 |
+
"task": "hand_trajectory_forecast",
|
| 6 |
+
"input": "all modalities at t -> future left/right hand 3D joints",
|
| 7 |
+
"split": "chronological",
|
| 8 |
+
"num_windows": 1159,
|
| 9 |
+
"num_train_windows": 811,
|
| 10 |
+
"num_test_windows": 348,
|
| 11 |
+
"forecast_frames": 10,
|
| 12 |
+
"mpjpe": 0.8222644925117493,
|
| 13 |
+
"final_frame_mpjpe": 1.0649521350860596,
|
| 14 |
+
"target_dim": 1260
|
| 15 |
+
}
|
artifacts/episode_task_suite/misalignment_detection/metrics.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.5028901734104047,
|
| 3 |
+
"precision": 0.5030864197530864,
|
| 4 |
+
"recall": 0.47109826589595377,
|
| 5 |
+
"f1": 0.4865671641791045,
|
| 6 |
+
"tp": 163,
|
| 7 |
+
"tn": 185,
|
| 8 |
+
"fp": 161,
|
| 9 |
+
"fn": 183,
|
| 10 |
+
"positive_rate_true": 0.5,
|
| 11 |
+
"positive_rate_pred": 0.4682080924855491,
|
| 12 |
+
"task": "misalignment_detection",
|
| 13 |
+
"input": "motion+visual pair -> aligned vs shifted by 8 windows",
|
| 14 |
+
"split": "chronological",
|
| 15 |
+
"num_samples": 2306,
|
| 16 |
+
"num_train_samples": 1614,
|
| 17 |
+
"num_test_samples": 692,
|
| 18 |
+
"train_final_accuracy": 0.5018587360594795
|
| 19 |
+
}
|
artifacts/episode_task_suite/misalignment_detection/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:375daf8e2d5e8e926970c457eff3c48ab402608c02cc564135f367b133609063
|
| 3 |
+
size 110186
|
artifacts/episode_task_suite/modality_reconstruction/metrics.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mse": 1359.1639404296875,
|
| 3 |
+
"mae": 0.31084805727005005,
|
| 4 |
+
"r2": -0.016022846771134747,
|
| 5 |
+
"task": "modality_reconstruction",
|
| 6 |
+
"input": "motion/IMU/camera",
|
| 7 |
+
"output": "depth/video feature vector",
|
| 8 |
+
"split": "chronological",
|
| 9 |
+
"num_train_windows": 813,
|
| 10 |
+
"num_test_windows": 348,
|
| 11 |
+
"target_dim": 5096
|
| 12 |
+
}
|
artifacts/episode_task_suite/next_action/metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.034482758620689655,
|
| 3 |
+
"balanced_accuracy": 0.04,
|
| 4 |
+
"macro_f1": 0.05925925925925927,
|
| 5 |
+
"weighted_f1": 0.05108556832694764,
|
| 6 |
+
"num_eval_windows": 348,
|
| 7 |
+
"num_classes": 18,
|
| 8 |
+
"task": "next_action",
|
| 9 |
+
"input": "all modalities at t -> action at t+20 frames",
|
| 10 |
+
"split": "chronological",
|
| 11 |
+
"num_windows": 1161,
|
| 12 |
+
"num_train_windows": 813,
|
| 13 |
+
"num_test_windows": 348,
|
| 14 |
+
"feature_dim": 8378,
|
| 15 |
+
"majority_baseline_accuracy": 0.0,
|
| 16 |
+
"train_final_accuracy": 1.0,
|
| 17 |
+
"train_final_loss": 0.017629079520702362,
|
| 18 |
+
"unseen_test_classes": [
|
| 19 |
+
"Place item on table",
|
| 20 |
+
"Pour coffee",
|
| 21 |
+
"Pour milk into coffee",
|
| 22 |
+
"Wait/Prepare for pouring"
|
| 23 |
+
]
|
| 24 |
+
}
|
artifacts/episode_task_suite/next_action/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fcfa0e624694a7b07fecac33d9385c54f5aeb1faf4517d11fbf6db3b973292d
|
| 3 |
+
size 620530
|
artifacts/episode_task_suite/object_relevance/metrics.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"micro_f1": 0.18393030009680542,
|
| 3 |
+
"macro_f1": 0.06427052187996415,
|
| 4 |
+
"exact_match": 0.005747126436781609,
|
| 5 |
+
"precision": 0.16360505166475317,
|
| 6 |
+
"recall": 0.21002210759027265,
|
| 7 |
+
"task": "object_relevance",
|
| 8 |
+
"input": "all non-caption modalities -> current relevant object set",
|
| 9 |
+
"split": "chronological",
|
| 10 |
+
"num_windows": 1161,
|
| 11 |
+
"num_train_windows": 813,
|
| 12 |
+
"num_test_windows": 348,
|
| 13 |
+
"num_objects": 34
|
| 14 |
+
}
|
artifacts/episode_task_suite/object_relevance/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aca088062b23a8fa8b05b261cf698c50c00b11be238eb4b9260f7609da70ff11
|
| 3 |
+
size 1002718
|
artifacts/episode_task_suite/temporal_order/metrics.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.46120689655172414,
|
| 3 |
+
"precision": 0.4720496894409938,
|
| 4 |
+
"recall": 0.6551724137931034,
|
| 5 |
+
"f1": 0.5487364620938628,
|
| 6 |
+
"tp": 228,
|
| 7 |
+
"tn": 93,
|
| 8 |
+
"fp": 255,
|
| 9 |
+
"fn": 120,
|
| 10 |
+
"positive_rate_true": 0.5,
|
| 11 |
+
"positive_rate_pred": 0.6939655172413793,
|
| 12 |
+
"task": "temporal_order",
|
| 13 |
+
"input": "two adjacent windows -> whether order is correct",
|
| 14 |
+
"split": "chronological",
|
| 15 |
+
"num_samples": 2320,
|
| 16 |
+
"num_train_samples": 1624,
|
| 17 |
+
"num_test_samples": 696,
|
| 18 |
+
"train_final_accuracy": 0.5104679802955665
|
| 19 |
+
}
|
artifacts/episode_task_suite/temporal_order/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04330ca7fe354ecb592f366d27764a538e2b51fd6d23f66d618ea86d33c34f4e
|
| 3 |
+
size 335170
|
artifacts/episode_task_suite/timeline_action/metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.029154518950437316,
|
| 3 |
+
"balanced_accuracy": 0.03125,
|
| 4 |
+
"macro_f1": 0.05,
|
| 5 |
+
"weighted_f1": 0.04664723032069971,
|
| 6 |
+
"num_eval_windows": 343,
|
| 7 |
+
"num_classes": 18,
|
| 8 |
+
"task": "timeline_action",
|
| 9 |
+
"input": "all modalities -> current action label",
|
| 10 |
+
"split": "chronological",
|
| 11 |
+
"num_windows": 1144,
|
| 12 |
+
"num_train_windows": 801,
|
| 13 |
+
"num_test_windows": 343,
|
| 14 |
+
"feature_dim": 8378,
|
| 15 |
+
"majority_baseline_accuracy": 0.0,
|
| 16 |
+
"train_final_accuracy": 1.0,
|
| 17 |
+
"train_final_loss": 0.01664665900170803,
|
| 18 |
+
"unseen_test_classes": [
|
| 19 |
+
"Place item on table",
|
| 20 |
+
"Pour coffee",
|
| 21 |
+
"Pour milk into coffee",
|
| 22 |
+
"Wait/Prepare for pouring"
|
| 23 |
+
]
|
| 24 |
+
}
|
artifacts/episode_task_suite/timeline_action/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3052fc9442607895eb6dc5ca81d5a1c28f4cdf9e1f9a4931e6ef78403283a7c
|
| 3 |
+
size 620781
|
artifacts/episode_task_suite/timeline_subtask/metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.05813953488372093,
|
| 3 |
+
"balanced_accuracy": 0.05376979652090881,
|
| 4 |
+
"macro_f1": 0.04954121121178666,
|
| 5 |
+
"weighted_f1": 0.06731304264454903,
|
| 6 |
+
"num_eval_windows": 344,
|
| 7 |
+
"num_classes": 14,
|
| 8 |
+
"task": "timeline_subtask",
|
| 9 |
+
"input": "all modalities -> current subtask label",
|
| 10 |
+
"split": "chronological",
|
| 11 |
+
"num_windows": 1147,
|
| 12 |
+
"num_train_windows": 803,
|
| 13 |
+
"num_test_windows": 344,
|
| 14 |
+
"feature_dim": 8378,
|
| 15 |
+
"majority_baseline_accuracy": 0.0,
|
| 16 |
+
"train_final_accuracy": 1.0,
|
| 17 |
+
"train_final_loss": 0.014040183275938034,
|
| 18 |
+
"unseen_test_classes": [
|
| 19 |
+
"Move bottle to coffee equipment",
|
| 20 |
+
"Pour coffee",
|
| 21 |
+
"Pour milk into coffee",
|
| 22 |
+
"Prepare for pouring"
|
| 23 |
+
]
|
| 24 |
+
}
|
artifacts/episode_task_suite/timeline_subtask/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39dace29541e90a947e902a7ba7afd39b7a2c1d3123ed513653d0704d45d2ad1
|
| 3 |
+
size 496518
|
artifacts/episode_task_suite/transition_detection/metrics.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.9252873563218391,
|
| 3 |
+
"balanced_accuracy": 0.6931475903614458,
|
| 4 |
+
"macro_f1": 0.6551829268292684,
|
| 5 |
+
"weighted_f1": 0.9323030557891787,
|
| 6 |
+
"num_eval_windows": 348,
|
| 7 |
+
"num_classes": 2,
|
| 8 |
+
"task": "transition_detection",
|
| 9 |
+
"input": "all modalities -> action boundary/steady",
|
| 10 |
+
"split": "chronological",
|
| 11 |
+
"num_windows": 1161,
|
| 12 |
+
"num_train_windows": 813,
|
| 13 |
+
"num_test_windows": 348,
|
| 14 |
+
"feature_dim": 8378,
|
| 15 |
+
"majority_baseline_accuracy": 0.9540229885057471,
|
| 16 |
+
"train_final_accuracy": 1.0,
|
| 17 |
+
"train_final_loss": 0.007071746978908777,
|
| 18 |
+
"unseen_test_classes": [],
|
| 19 |
+
"boundary_precision": 0.125,
|
| 20 |
+
"boundary_recall": 0.75,
|
| 21 |
+
"boundary_f1": 0.21428571428571427,
|
| 22 |
+
"matched_boundaries": 3,
|
| 23 |
+
"true_boundaries": 4,
|
| 24 |
+
"predicted_boundaries": 24,
|
| 25 |
+
"mean_abs_timing_error_frames": 2.6666666666666665
|
| 26 |
+
}
|
artifacts/episode_task_suite/transition_detection/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f179e3278c2b0e6563ed0bfe14a42faae28a5a0a0aa4a0b056113fc345aa4a27
|
| 3 |
+
size 122843
|
artifacts/min_action_model/metrics.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.9828178694158075,
|
| 3 |
+
"balanced_accuracy": 0.9643518518518519,
|
| 4 |
+
"macro_f1": 0.96884342657456,
|
| 5 |
+
"weighted_f1": 0.9824311468352843,
|
| 6 |
+
"num_eval_windows": 291,
|
| 7 |
+
"num_classes": 18,
|
| 8 |
+
"majority_baseline_accuracy": 0.13745704467353953,
|
| 9 |
+
"train_final_accuracy": 1.0,
|
| 10 |
+
"train_final_loss": 0.019042566418647766
|
| 11 |
+
}
|
artifacts/min_action_model/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b143a74aa94c882e08279adabfcf5806348ccb37c70c9192c8def206fda97895
|
| 3 |
+
size 163871
|
artifacts/min_all_modalities_action_model/available_modalities.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"modality": "depth_confidence",
|
| 4 |
+
"shape": [
|
| 5 |
+
5821,
|
| 6 |
+
140
|
| 7 |
+
]
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"modality": "video/fisheye_cam0",
|
| 11 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
|
| 12 |
+
"shape": [
|
| 13 |
+
5821,
|
| 14 |
+
98
|
| 15 |
+
],
|
| 16 |
+
"exists": true
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"modality": "video/fisheye_cam1",
|
| 20 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
|
| 21 |
+
"shape": [
|
| 22 |
+
5821,
|
| 23 |
+
98
|
| 24 |
+
],
|
| 25 |
+
"exists": true
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"modality": "video/fisheye_cam2",
|
| 29 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
|
| 30 |
+
"shape": [
|
| 31 |
+
5821,
|
| 32 |
+
98
|
| 33 |
+
],
|
| 34 |
+
"exists": true
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"modality": "video/fisheye_cam3",
|
| 38 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
|
| 39 |
+
"shape": [
|
| 40 |
+
5821,
|
| 41 |
+
98
|
| 42 |
+
],
|
| 43 |
+
"exists": true
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"modality": "video/stereo_left",
|
| 47 |
+
"path": "data/sample/xperience-10m-sample/stereo_left.mp4",
|
| 48 |
+
"shape": [
|
| 49 |
+
5821,
|
| 50 |
+
98
|
| 51 |
+
],
|
| 52 |
+
"exists": true
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"modality": "video/stereo_right",
|
| 56 |
+
"path": "data/sample/xperience-10m-sample/stereo_right.mp4",
|
| 57 |
+
"shape": [
|
| 58 |
+
5821,
|
| 59 |
+
98
|
| 60 |
+
],
|
| 61 |
+
"exists": true
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"modality": "caption_text",
|
| 65 |
+
"shape": [
|
| 66 |
+
5821,
|
| 67 |
+
128
|
| 68 |
+
],
|
| 69 |
+
"fields": "objects,interaction"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"modality": "slam_point_cloud_static",
|
| 73 |
+
"shape": [
|
| 74 |
+
22
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"modality": "calibration_static",
|
| 79 |
+
"shape": [
|
| 80 |
+
117
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
]
|
artifacts/min_all_modalities_action_model/feature_manifest.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"name": "hand_left_joints",
|
| 4 |
+
"start": 0,
|
| 5 |
+
"end": 441,
|
| 6 |
+
"dim": 441
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"name": "hand_right_joints",
|
| 10 |
+
"start": 441,
|
| 11 |
+
"end": 882,
|
| 12 |
+
"dim": 441
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"name": "body_joints",
|
| 16 |
+
"start": 882,
|
| 17 |
+
"end": 1974,
|
| 18 |
+
"dim": 1092
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"name": "body_contacts",
|
| 22 |
+
"start": 1974,
|
| 23 |
+
"end": 2121,
|
| 24 |
+
"dim": 147
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "camera_translation",
|
| 28 |
+
"start": 2121,
|
| 29 |
+
"end": 2142,
|
| 30 |
+
"dim": 21
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "camera_rotation_matrix",
|
| 34 |
+
"start": 2142,
|
| 35 |
+
"end": 2205,
|
| 36 |
+
"dim": 63
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"name": "imu_accel_gyro",
|
| 40 |
+
"start": 2205,
|
| 41 |
+
"end": 2247,
|
| 42 |
+
"dim": 42
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "depth_confidence",
|
| 46 |
+
"start": 2247,
|
| 47 |
+
"end": 3227,
|
| 48 |
+
"dim": 980
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "video_fisheye_cam0",
|
| 52 |
+
"start": 3227,
|
| 53 |
+
"end": 3913,
|
| 54 |
+
"dim": 686
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "video_fisheye_cam1",
|
| 58 |
+
"start": 3913,
|
| 59 |
+
"end": 4599,
|
| 60 |
+
"dim": 686
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "video_fisheye_cam2",
|
| 64 |
+
"start": 4599,
|
| 65 |
+
"end": 5285,
|
| 66 |
+
"dim": 686
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "video_fisheye_cam3",
|
| 70 |
+
"start": 5285,
|
| 71 |
+
"end": 5971,
|
| 72 |
+
"dim": 686
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "video_stereo_left",
|
| 76 |
+
"start": 5971,
|
| 77 |
+
"end": 6657,
|
| 78 |
+
"dim": 686
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "video_stereo_right",
|
| 82 |
+
"start": 6657,
|
| 83 |
+
"end": 7343,
|
| 84 |
+
"dim": 686
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "caption_objects_interaction_text",
|
| 88 |
+
"start": 7343,
|
| 89 |
+
"end": 8239,
|
| 90 |
+
"dim": 896
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "slam_point_cloud",
|
| 94 |
+
"start": 8239,
|
| 95 |
+
"end": 8261,
|
| 96 |
+
"dim": 22
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"name": "calibration",
|
| 100 |
+
"start": 8261,
|
| 101 |
+
"end": 8378,
|
| 102 |
+
"dim": 117
|
| 103 |
+
}
|
| 104 |
+
]
|
artifacts/min_all_modalities_action_model/metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.9828178694158075,
|
| 3 |
+
"balanced_accuracy": 0.9800925925925925,
|
| 4 |
+
"macro_f1": 0.9791023658779895,
|
| 5 |
+
"weighted_f1": 0.98276563540562,
|
| 6 |
+
"num_eval_windows": 291,
|
| 7 |
+
"num_classes": 18,
|
| 8 |
+
"majority_baseline_accuracy": 0.13745704467353953,
|
| 9 |
+
"train_final_accuracy": 1.0,
|
| 10 |
+
"train_final_loss": 0.014624637551605701,
|
| 11 |
+
"feature_dim": 8378,
|
| 12 |
+
"num_windows": 1144
|
| 13 |
+
}
|
artifacts/min_all_modalities_action_model/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:228cda0f036f86a7a1cb44e67d5c7112747bfc5cc27bf91c90516c6ba8322c81
|
| 3 |
+
size 621786
|
artifacts/min_all_modalities_subtask_model/available_modalities.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"modality": "depth_confidence",
|
| 4 |
+
"shape": [
|
| 5 |
+
5821,
|
| 6 |
+
140
|
| 7 |
+
]
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"modality": "video/fisheye_cam0",
|
| 11 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
|
| 12 |
+
"shape": [
|
| 13 |
+
5821,
|
| 14 |
+
98
|
| 15 |
+
],
|
| 16 |
+
"exists": true
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"modality": "video/fisheye_cam1",
|
| 20 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
|
| 21 |
+
"shape": [
|
| 22 |
+
5821,
|
| 23 |
+
98
|
| 24 |
+
],
|
| 25 |
+
"exists": true
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"modality": "video/fisheye_cam2",
|
| 29 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
|
| 30 |
+
"shape": [
|
| 31 |
+
5821,
|
| 32 |
+
98
|
| 33 |
+
],
|
| 34 |
+
"exists": true
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"modality": "video/fisheye_cam3",
|
| 38 |
+
"path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
|
| 39 |
+
"shape": [
|
| 40 |
+
5821,
|
| 41 |
+
98
|
| 42 |
+
],
|
| 43 |
+
"exists": true
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"modality": "video/stereo_left",
|
| 47 |
+
"path": "data/sample/xperience-10m-sample/stereo_left.mp4",
|
| 48 |
+
"shape": [
|
| 49 |
+
5821,
|
| 50 |
+
98
|
| 51 |
+
],
|
| 52 |
+
"exists": true
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"modality": "video/stereo_right",
|
| 56 |
+
"path": "data/sample/xperience-10m-sample/stereo_right.mp4",
|
| 57 |
+
"shape": [
|
| 58 |
+
5821,
|
| 59 |
+
98
|
| 60 |
+
],
|
| 61 |
+
"exists": true
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"modality": "caption_text",
|
| 65 |
+
"shape": [
|
| 66 |
+
5821,
|
| 67 |
+
128
|
| 68 |
+
],
|
| 69 |
+
"fields": "objects,interaction"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"modality": "slam_point_cloud_static",
|
| 73 |
+
"shape": [
|
| 74 |
+
22
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"modality": "calibration_static",
|
| 79 |
+
"shape": [
|
| 80 |
+
117
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
]
|
artifacts/min_all_modalities_subtask_model/feature_manifest.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"name": "hand_left_joints",
|
| 4 |
+
"start": 0,
|
| 5 |
+
"end": 441,
|
| 6 |
+
"dim": 441
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"name": "hand_right_joints",
|
| 10 |
+
"start": 441,
|
| 11 |
+
"end": 882,
|
| 12 |
+
"dim": 441
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"name": "body_joints",
|
| 16 |
+
"start": 882,
|
| 17 |
+
"end": 1974,
|
| 18 |
+
"dim": 1092
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"name": "body_contacts",
|
| 22 |
+
"start": 1974,
|
| 23 |
+
"end": 2121,
|
| 24 |
+
"dim": 147
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "camera_translation",
|
| 28 |
+
"start": 2121,
|
| 29 |
+
"end": 2142,
|
| 30 |
+
"dim": 21
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "camera_rotation_matrix",
|
| 34 |
+
"start": 2142,
|
| 35 |
+
"end": 2205,
|
| 36 |
+
"dim": 63
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"name": "imu_accel_gyro",
|
| 40 |
+
"start": 2205,
|
| 41 |
+
"end": 2247,
|
| 42 |
+
"dim": 42
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "depth_confidence",
|
| 46 |
+
"start": 2247,
|
| 47 |
+
"end": 3227,
|
| 48 |
+
"dim": 980
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "video_fisheye_cam0",
|
| 52 |
+
"start": 3227,
|
| 53 |
+
"end": 3913,
|
| 54 |
+
"dim": 686
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "video_fisheye_cam1",
|
| 58 |
+
"start": 3913,
|
| 59 |
+
"end": 4599,
|
| 60 |
+
"dim": 686
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "video_fisheye_cam2",
|
| 64 |
+
"start": 4599,
|
| 65 |
+
"end": 5285,
|
| 66 |
+
"dim": 686
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "video_fisheye_cam3",
|
| 70 |
+
"start": 5285,
|
| 71 |
+
"end": 5971,
|
| 72 |
+
"dim": 686
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "video_stereo_left",
|
| 76 |
+
"start": 5971,
|
| 77 |
+
"end": 6657,
|
| 78 |
+
"dim": 686
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "video_stereo_right",
|
| 82 |
+
"start": 6657,
|
| 83 |
+
"end": 7343,
|
| 84 |
+
"dim": 686
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "caption_objects_interaction_text",
|
| 88 |
+
"start": 7343,
|
| 89 |
+
"end": 8239,
|
| 90 |
+
"dim": 896
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "slam_point_cloud",
|
| 94 |
+
"start": 8239,
|
| 95 |
+
"end": 8261,
|
| 96 |
+
"dim": 22
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"name": "calibration",
|
| 100 |
+
"start": 8261,
|
| 101 |
+
"end": 8378,
|
| 102 |
+
"dim": 117
|
| 103 |
+
}
|
| 104 |
+
]
|
artifacts/min_all_modalities_subtask_model/metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.9827586206896551,
|
| 3 |
+
"balanced_accuracy": 0.9505102040816327,
|
| 4 |
+
"macro_f1": 0.9307645963773675,
|
| 5 |
+
"weighted_f1": 0.9837987833808578,
|
| 6 |
+
"num_eval_windows": 290,
|
| 7 |
+
"num_classes": 14,
|
| 8 |
+
"majority_baseline_accuracy": 0.14482758620689656,
|
| 9 |
+
"train_final_accuracy": 1.0,
|
| 10 |
+
"train_final_loss": 0.012823422439396381,
|
| 11 |
+
"feature_dim": 8378,
|
| 12 |
+
"num_windows": 1147
|
| 13 |
+
}
|
artifacts/min_all_modalities_subtask_model/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ec248d69f63d5acd00c83c024bbfe23cadf0ab0ba1b6c9ff3916d2b1d76ee94
|
| 3 |
+
size 497409
|
artifacts/min_subtask_model/metrics.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"accuracy": 0.9758620689655172,
|
| 3 |
+
"balanced_accuracy": 0.9783924095954172,
|
| 4 |
+
"macro_f1": 0.9528048001232955,
|
| 5 |
+
"weighted_f1": 0.9778836359351952,
|
| 6 |
+
"num_eval_windows": 290,
|
| 7 |
+
"num_classes": 14,
|
| 8 |
+
"majority_baseline_accuracy": 0.14482758620689656,
|
| 9 |
+
"train_final_accuracy": 1.0,
|
| 10 |
+
"train_final_loss": 0.02664567530155182
|
| 11 |
+
}
|
artifacts/min_subtask_model/model.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:226b91679611e731abf36ec55f1181ad2748b25e9e84c6f09e35b00dd43a863f
|
| 3 |
+
size 131612
|
assets/task_architectures.svg
ADDED
|
|
assets/task_suite_infographic.png
ADDED
|
Git LFS Details
|
notes/all_modalities_model.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# All-Modality Minimal Model
|
| 2 |
+
|
| 3 |
+
Script:
|
| 4 |
+
|
| 5 |
+
```text
|
| 6 |
+
scripts/train_all_modalities_model.py
|
| 7 |
+
```
|
| 8 |
+
|
| 9 |
+
This extends the first minimal model by using every major sample modality in a lightweight way.
|
| 10 |
+
|
| 11 |
+
## Modalities Used
|
| 12 |
+
|
| 13 |
+
Dynamic sensor/action modalities:
|
| 14 |
+
|
| 15 |
+
- `hand_mocap/left_joints_3d`
|
| 16 |
+
- `hand_mocap/right_joints_3d`
|
| 17 |
+
- `full_body_mocap/keypoints`
|
| 18 |
+
- `full_body_mocap/contacts`
|
| 19 |
+
- `slam/trans_xyz`
|
| 20 |
+
- `slam/quat_wxyz` converted by the toolkit into camera rotation matrices
|
| 21 |
+
- `imu/accel_xyz`
|
| 22 |
+
- `imu/gyro_xyz`
|
| 23 |
+
- `depth/depth`
|
| 24 |
+
- `depth/confidence`
|
| 25 |
+
- `fisheye_cam0.mp4`
|
| 26 |
+
- `fisheye_cam1.mp4`
|
| 27 |
+
- `fisheye_cam2.mp4`
|
| 28 |
+
- `fisheye_cam3.mp4`
|
| 29 |
+
- `stereo_left.mp4`
|
| 30 |
+
- `stereo_right.mp4`
|
| 31 |
+
|
| 32 |
+
Static/context modalities:
|
| 33 |
+
|
| 34 |
+
- `slam/point_cloud`
|
| 35 |
+
- `calibration/*`
|
| 36 |
+
- caption objects
|
| 37 |
+
- caption interaction text
|
| 38 |
+
|
| 39 |
+
By default, the script does **not** include `action_label`, `Sub Task`, or action-description text as input, because those are too close to the prediction target. You can force their inclusion with `--include-label-text`, but such a run should be treated as a leakage/debug run, not a fair action-recognition experiment.
|
| 40 |
+
|
| 41 |
+
## Feature Design
|
| 42 |
+
|
| 43 |
+
The model is still intentionally small:
|
| 44 |
+
|
| 45 |
+
```text
|
| 46 |
+
raw modality -> per-frame or static handcrafted features -> window temporal statistics -> softmax classifier
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
For each 20-frame window:
|
| 50 |
+
|
| 51 |
+
- Motion signals use mean/std/min/max/delta/velocity statistics.
|
| 52 |
+
- Depth uses global depth stats plus a small normalized depth grid and confidence grid.
|
| 53 |
+
- Each video stream uses color stats, color histograms, a small grayscale grid, and simple edge stats.
|
| 54 |
+
- Text uses a hashed bag-of-words vector from objects and interaction text.
|
| 55 |
+
- Point cloud and calibration are included as static episode-level features.
|
| 56 |
+
|
| 57 |
+
Current feature blocks:
|
| 58 |
+
|
| 59 |
+
```text
|
| 60 |
+
hand_left_joints: 441
|
| 61 |
+
hand_right_joints: 441
|
| 62 |
+
body_joints: 1092
|
| 63 |
+
body_contacts: 147
|
| 64 |
+
camera_translation: 21
|
| 65 |
+
camera_rotation_matrix: 63
|
| 66 |
+
imu_accel_gyro: 42
|
| 67 |
+
depth_confidence: 980
|
| 68 |
+
video_fisheye_cam0: 686
|
| 69 |
+
video_fisheye_cam1: 686
|
| 70 |
+
video_fisheye_cam2: 686
|
| 71 |
+
video_fisheye_cam3: 686
|
| 72 |
+
video_stereo_left: 686
|
| 73 |
+
video_stereo_right: 686
|
| 74 |
+
caption_objects_interaction_text: 896
|
| 75 |
+
slam_point_cloud: 22
|
| 76 |
+
calibration: 117
|
| 77 |
+
total: 8378
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Run Commands
|
| 81 |
+
|
| 82 |
+
Action prediction:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
cd /path/to/Ropedia
|
| 86 |
+
source .venv/bin/activate
|
| 87 |
+
python scripts/train_all_modalities_model.py
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Subtask prediction:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
python scripts/train_all_modalities_model.py --target subtask
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
The first run builds reusable caches in:
|
| 97 |
+
|
| 98 |
+
```text
|
| 99 |
+
outputs/feature_cache/
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
## Current Results
|
| 103 |
+
|
| 104 |
+
Action-label model:
|
| 105 |
+
|
| 106 |
+
```text
|
| 107 |
+
outputs/min_all_modalities_action_model/
|
| 108 |
+
accuracy: 0.9828
|
| 109 |
+
balanced_accuracy: 0.9801
|
| 110 |
+
macro_f1: 0.9791
|
| 111 |
+
weighted_f1: 0.9828
|
| 112 |
+
majority_baseline: 0.1375
|
| 113 |
+
classes: 18
|
| 114 |
+
feature_dim: 8378
|
| 115 |
+
test_windows: 291
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Subtask-label model:
|
| 119 |
+
|
| 120 |
+
```text
|
| 121 |
+
outputs/min_all_modalities_subtask_model/
|
| 122 |
+
accuracy: 0.9828
|
| 123 |
+
balanced_accuracy: 0.9505
|
| 124 |
+
macro_f1: 0.9308
|
| 125 |
+
weighted_f1: 0.9838
|
| 126 |
+
majority_baseline: 0.1448
|
| 127 |
+
classes: 14
|
| 128 |
+
feature_dim: 8378
|
| 129 |
+
test_windows: 290
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## How To Interpret This
|
| 133 |
+
|
| 134 |
+
This proves that the full sample can be converted into a complete supervised learning pipeline on this Mac.
|
| 135 |
+
|
| 136 |
+
It does **not** prove real generalization, because the public sample is a single episode and the train/test split draws random windows from that same episode. Neighboring windows are correlated.
|
| 137 |
+
|
| 138 |
+
For a serious embodied-AI experiment:
|
| 139 |
+
|
| 140 |
+
```text
|
| 141 |
+
many episodes
|
| 142 |
+
-> cache features per episode
|
| 143 |
+
-> split by episode or task instance
|
| 144 |
+
-> train on some episodes
|
| 145 |
+
-> test on unseen episodes
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
The next useful upgrade is not a bigger classifier. It is a better split and more episodes.
|
notes/episode_task_suite.md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Episode Task Suite
|
| 2 |
+
|
| 3 |
+
Script:
|
| 4 |
+
|
| 5 |
+
```text
|
| 6 |
+
scripts/episode_task_suite.py
|
| 7 |
+
```
|
| 8 |
+
|
| 9 |
+
This script turns the single public Ropedia sample episode into many end-to-end tasks. It is designed for learning, debugging, and task design. It is **not** a generalization benchmark because the data is still one episode.
|
| 10 |
+
|
| 11 |
+
Run:
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
cd /path/to/Ropedia
|
| 15 |
+
source .venv/bin/activate
|
| 16 |
+
python scripts/episode_task_suite.py
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
Output:
|
| 20 |
+
|
| 21 |
+
```text
|
| 22 |
+
outputs/episode_task_suite/
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
Shared setup:
|
| 26 |
+
|
| 27 |
+
```text
|
| 28 |
+
sample episode: 5821 frames
|
| 29 |
+
windows: 1161
|
| 30 |
+
window size: 20 frames
|
| 31 |
+
stride: 5 frames
|
| 32 |
+
feature dim: 8378
|
| 33 |
+
split: chronological, first 70% train and last 30% test
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Implemented Tasks
|
| 37 |
+
|
| 38 |
+
| Task | Input | Output | Main artifact |
|
| 39 |
+
|---|---|---|---|
|
| 40 |
+
| `timeline_action` | all modality window | current action label | `timeline_action/metrics.json` |
|
| 41 |
+
| `timeline_subtask` | all modality window | current subtask label | `timeline_subtask/metrics.json` |
|
| 42 |
+
| `transition_detection` | all modality window | steady vs action boundary | `transition_detection/metrics.json` |
|
| 43 |
+
| `next_action` | current all modality window | action 20 frames later | `next_action/metrics.json` |
|
| 44 |
+
| `hand_trajectory_forecast` | current all modality window | future 10-frame left/right hand joints | `hand_trajectory_forecast/predictions.npz` |
|
| 45 |
+
| `contact_prediction` | non-contact modalities | any body contact in window | `contact_prediction/metrics.json` |
|
| 46 |
+
| `object_relevance` | non-caption modalities | relevant object set | `object_relevance/predictions.csv` |
|
| 47 |
+
| `caption_grounding` | caption objects/interaction query + sensor candidates | matching time window | `caption_grounding/metrics.json` |
|
| 48 |
+
| `cross_modal_retrieval` | motion/IMU/camera query | matching depth/video window | `cross_modal_retrieval/metrics.json` |
|
| 49 |
+
| `modality_reconstruction` | motion/IMU/camera | depth/video feature vector | `modality_reconstruction/predictions.npz` |
|
| 50 |
+
| `temporal_order` | two adjacent windows | whether order is correct | `temporal_order/metrics.json` |
|
| 51 |
+
| `misalignment_detection` | motion+visual pair | aligned vs shifted | `misalignment_detection/metrics.json` |
|
| 52 |
+
|
| 53 |
+
## Minimal Model Architectures
|
| 54 |
+
|
| 55 |
+
All tasks share the same window builder unless a task explicitly removes a
|
| 56 |
+
feature block to avoid label leakage.
|
| 57 |
+
|
| 58 |
+
```text
|
| 59 |
+
raw sample episode
|
| 60 |
+
-> 20-frame sliding windows, stride 5
|
| 61 |
+
-> all-modality feature vector X_all, 8,378 dimensions
|
| 62 |
+
-> chronological split, first 70% train and last 30% test
|
| 63 |
+
-> train-only z-score scaler
|
| 64 |
+
-> task-specific minimal head
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
The task suite intentionally uses simple heads:
|
| 68 |
+
|
| 69 |
+
| Family | Formula | Tasks |
|
| 70 |
+
|---|---|---|
|
| 71 |
+
| Linear softmax | `softmax(z(X)W + b)`, cross-entropy, L2 | `timeline_action`, `timeline_subtask`, `transition_detection`, `next_action`, `contact_prediction`, `temporal_order`, `misalignment_detection` |
|
| 72 |
+
| Ridge regression/projection | dual ridge regression with L2=10 on z-scored X/Y | `hand_trajectory_forecast`, `caption_grounding`, `cross_modal_retrieval`, `modality_reconstruction` |
|
| 73 |
+
| Multi-label logistic | `sigmoid(z(X)W + b)`, weighted object heads | `object_relevance` |
|
| 74 |
+
|
| 75 |
+
Task-specific architecture details:
|
| 76 |
+
|
| 77 |
+
| Task | Input tensor/vector | Minimal head | Output target |
|
| 78 |
+
|---|---|---|---|
|
| 79 |
+
| `timeline_action` | `X_all`, 8,378d | class-weighted linear softmax | current action label |
|
| 80 |
+
| `timeline_subtask` | `X_all`, 8,378d | class-weighted linear softmax | current subtask label |
|
| 81 |
+
| `transition_detection` | `X_all`, 8,378d | class-weighted linear softmax | steady vs transition near action boundary |
|
| 82 |
+
| `next_action` | `X_all(t)`, 8,378d | class-weighted linear softmax | action at `t+20` frames |
|
| 83 |
+
| `hand_trajectory_forecast` | `X_all(t)`, 8,378d | ridge regression | future 10 frames of left/right hand joints, 1,260d |
|
| 84 |
+
| `contact_prediction` | all features except `body_contacts` and caption text, 7,335d | linear softmax on observed labels | any body contact in window |
|
| 85 |
+
| `object_relevance` | all features except caption text, 7,482d | multi-label logistic regression | 34-object multi-hot vector |
|
| 86 |
+
| `caption_grounding` | sensor features, 7,482d, projected into 896d text space | ridge projection plus cosine ranking | matching time window for a text query |
|
| 87 |
+
| `cross_modal_retrieval` | motion/IMU/camera, 2,247d, projected into 5,096d visual space | ridge projection plus cosine ranking | matching depth/video window |
|
| 88 |
+
| `modality_reconstruction` | motion/IMU/camera, 2,247d | ridge regression | depth/video feature vector, 5,096d |
|
| 89 |
+
| `temporal_order` | `[x_t, x_{t+1}, x_{t+1} - x_t]`, 25,134d | binary linear softmax | correct vs reversed order |
|
| 90 |
+
| `misalignment_detection` | motion plus visual pair, 7,343d | binary linear softmax | aligned vs shifted by 8 windows |
|
| 91 |
+
|
| 92 |
+
Diagram:
|
| 93 |
+
|
| 94 |
+
```text
|
| 95 |
+
docs/assets/task_architectures.svg
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
## Current Results
|
| 99 |
+
|
| 100 |
+
```text
|
| 101 |
+
timeline_action:
|
| 102 |
+
accuracy: 0.0292
|
| 103 |
+
macro_f1: 0.0500
|
| 104 |
+
note: future test region contains unseen action classes
|
| 105 |
+
|
| 106 |
+
timeline_subtask:
|
| 107 |
+
accuracy: 0.0581
|
| 108 |
+
macro_f1: 0.0495
|
| 109 |
+
note: future test region contains unseen subtask classes
|
| 110 |
+
|
| 111 |
+
transition_detection:
|
| 112 |
+
accuracy: 0.9253
|
| 113 |
+
macro_f1: 0.6552
|
| 114 |
+
boundary_f1: 0.2143
|
| 115 |
+
|
| 116 |
+
next_action:
|
| 117 |
+
accuracy: 0.0345
|
| 118 |
+
macro_f1: 0.0593
|
| 119 |
+
note: same unseen-future-class problem as timeline_action
|
| 120 |
+
|
| 121 |
+
hand_trajectory_forecast:
|
| 122 |
+
MPJPE: 0.8223
|
| 123 |
+
final-frame MPJPE: 1.0650
|
| 124 |
+
|
| 125 |
+
contact_prediction:
|
| 126 |
+
accuracy: 1.0000
|
| 127 |
+
note: degenerate on this sample because the binary contact label has only one class
|
| 128 |
+
|
| 129 |
+
object_relevance:
|
| 130 |
+
micro_f1: 0.1839
|
| 131 |
+
macro_f1: 0.0643
|
| 132 |
+
|
| 133 |
+
caption_grounding:
|
| 134 |
+
top1: 0.0029
|
| 135 |
+
top5: 0.0115
|
| 136 |
+
MRR: 0.0172
|
| 137 |
+
|
| 138 |
+
cross_modal_retrieval:
|
| 139 |
+
top1: 0.1494
|
| 140 |
+
top5: 0.3764
|
| 141 |
+
top10: 0.4741
|
| 142 |
+
MRR: 0.2634
|
| 143 |
+
|
| 144 |
+
modality_reconstruction:
|
| 145 |
+
R2: -0.0160
|
| 146 |
+
|
| 147 |
+
temporal_order:
|
| 148 |
+
accuracy: 0.4612
|
| 149 |
+
f1: 0.5487
|
| 150 |
+
|
| 151 |
+
misalignment_detection:
|
| 152 |
+
accuracy: 0.5029
|
| 153 |
+
f1: 0.4866
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
## How To Read These Results
|
| 157 |
+
|
| 158 |
+
Low scores are useful here. They show which tasks are not learnable from this one chronological sample with this minimal model.
|
| 159 |
+
|
| 160 |
+
The strongest signal is `cross_modal_retrieval`: motion/IMU/camera features can retrieve the matching depth/video window better than random. This suggests that the modalities are synchronized and contain shared temporal structure.
|
| 161 |
+
|
| 162 |
+
The supervised timeline tasks (`timeline_action`, `timeline_subtask`, `next_action`) score poorly mainly because of the split. The last 30% of a single ordered episode contains actions/subtasks not present in the first 70%, so a classifier trained on the first part cannot predict labels it never saw.
|
| 163 |
+
|
| 164 |
+
For serious research, keep the same task code but change the dataset unit:
|
| 165 |
+
|
| 166 |
+
```text
|
| 167 |
+
many episodes -> train episodes -> test unseen episodes
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
For single-episode learning, these tasks are best used as:
|
| 171 |
+
|
| 172 |
+
- data pipeline tests
|
| 173 |
+
- modality ablations
|
| 174 |
+
- label-alignment checks
|
| 175 |
+
- self-supervised retrieval experiments
|
| 176 |
+
- debugging templates before scaling to many episodes
|
notes/min_action_model.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal Action Model
|
| 2 |
+
|
| 3 |
+
This is the first modeling baseline for the Ropedia/Xperience sample.
|
| 4 |
+
|
| 5 |
+
The script is:
|
| 6 |
+
|
| 7 |
+
```text
|
| 8 |
+
scripts/train_min_action_model.py
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
It trains a small NumPy-only softmax classifier:
|
| 12 |
+
|
| 13 |
+
```text
|
| 14 |
+
annotation.hdf5
|
| 15 |
+
-> hand/body/IMU/camera/contact windows
|
| 16 |
+
-> action or subtask labels from captions
|
| 17 |
+
-> stratified train/test split
|
| 18 |
+
-> multinomial logistic regression
|
| 19 |
+
-> metrics and predictions
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Run:
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
cd /path/to/Ropedia
|
| 26 |
+
source .venv/bin/activate
|
| 27 |
+
python scripts/train_min_action_model.py
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
Default output:
|
| 31 |
+
|
| 32 |
+
```text
|
| 33 |
+
outputs/min_action_model/
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
Important artifacts:
|
| 37 |
+
|
| 38 |
+
- `metrics.json`: accuracy, balanced accuracy, macro-F1, weighted-F1, majority baseline.
|
| 39 |
+
- `per_class_metrics.csv`: precision/recall/F1 per action class.
|
| 40 |
+
- `confusion_matrix.csv`: true label vs predicted label matrix.
|
| 41 |
+
- `predictions.csv`: one row per test window.
|
| 42 |
+
- `feature_dataset.npz`: processed numeric features and labels.
|
| 43 |
+
- `model.npz`: fitted scaler and softmax weights.
|
| 44 |
+
|
| 45 |
+
This is a learning baseline, not a publishable benchmark. The public sample is only one episode, so stratified windows from one episode are correlated. For serious evaluation, use many episodes and split by held-out episodes or held-out task instances.
|
| 46 |
+
|
| 47 |
+
## Current Sample Results
|
| 48 |
+
|
| 49 |
+
Action-label model:
|
| 50 |
+
|
| 51 |
+
```text
|
| 52 |
+
outputs/min_action_model/
|
| 53 |
+
accuracy: 0.9828
|
| 54 |
+
balanced_accuracy: 0.9644
|
| 55 |
+
macro_f1: 0.9688
|
| 56 |
+
weighted_f1: 0.9824
|
| 57 |
+
majority_baseline: 0.1375
|
| 58 |
+
classes: 18
|
| 59 |
+
test_windows: 291
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
Subtask-label model:
|
| 63 |
+
|
| 64 |
+
```text
|
| 65 |
+
outputs/min_subtask_model/
|
| 66 |
+
accuracy: 0.9759
|
| 67 |
+
balanced_accuracy: 0.9784
|
| 68 |
+
macro_f1: 0.9528
|
| 69 |
+
weighted_f1: 0.9779
|
| 70 |
+
majority_baseline: 0.1448
|
| 71 |
+
classes: 14
|
| 72 |
+
test_windows: 290
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
Why the numbers are high:
|
| 76 |
+
|
| 77 |
+
- This is one public sample episode.
|
| 78 |
+
- Windows are stratified randomly, so train/test windows can be close in time.
|
| 79 |
+
- The result proves the pipeline works; it does not prove cross-episode generalization.
|
| 80 |
+
|
| 81 |
+
Next serious evaluation:
|
| 82 |
+
|
| 83 |
+
```text
|
| 84 |
+
many episodes -> split by episode -> train on some episodes -> test on unseen episodes
|
| 85 |
+
```
|
notes/reproducibility_audit.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reproducibility Audit
|
| 2 |
+
|
| 3 |
+
Audit date: 2026-05-30 Asia/Singapore.
|
| 4 |
+
|
| 5 |
+
Purpose: verify that the committed Ropedia Episode Task Suite artifacts are
|
| 6 |
+
real outputs from the scripts, not placeholder or fabricated metrics.
|
| 7 |
+
|
| 8 |
+
## Raw Inputs Checked
|
| 9 |
+
|
| 10 |
+
The audit used the local public sample episode:
|
| 11 |
+
|
| 12 |
+
```text
|
| 13 |
+
data/sample/xperience-10m-sample/
|
| 14 |
+
annotation.hdf5
|
| 15 |
+
fisheye_cam0.mp4
|
| 16 |
+
fisheye_cam1.mp4
|
| 17 |
+
fisheye_cam2.mp4
|
| 18 |
+
fisheye_cam3.mp4
|
| 19 |
+
stereo_left.mp4
|
| 20 |
+
stereo_right.mp4
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
`annotation.hdf5` contains 5,821 aligned frames with depth, hand mocap, body
|
| 24 |
+
mocap, IMU, SLAM, calibration, and caption metadata. The video feature cache was
|
| 25 |
+
rebuilt from all six video files during the audit.
|
| 26 |
+
|
| 27 |
+
## Commands Re-run
|
| 28 |
+
|
| 29 |
+
All audit outputs were written outside the repo:
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
AUDIT=/private/tmp/ropedia-audit
|
| 33 |
+
WORKSPACE=/path/to/Ropedia
|
| 34 |
+
ANN=$WORKSPACE/data/sample/xperience-10m-sample/annotation.hdf5
|
| 35 |
+
PY=$WORKSPACE/.venv/bin/python
|
| 36 |
+
|
| 37 |
+
$PY -B scripts/train_min_action_model.py \
|
| 38 |
+
--workspace $WORKSPACE \
|
| 39 |
+
--annotation $ANN \
|
| 40 |
+
--output-dir $AUDIT/min_action_model \
|
| 41 |
+
--target action
|
| 42 |
+
|
| 43 |
+
$PY -B scripts/train_min_action_model.py \
|
| 44 |
+
--workspace $WORKSPACE \
|
| 45 |
+
--annotation $ANN \
|
| 46 |
+
--output-dir $AUDIT/min_subtask_model \
|
| 47 |
+
--target subtask
|
| 48 |
+
|
| 49 |
+
$PY -B scripts/train_all_modalities_model.py \
|
| 50 |
+
--workspace $WORKSPACE \
|
| 51 |
+
--annotation $ANN \
|
| 52 |
+
--output-dir $AUDIT/min_all_modalities_action_model \
|
| 53 |
+
--cache-dir $AUDIT/cache \
|
| 54 |
+
--target action
|
| 55 |
+
|
| 56 |
+
$PY -B scripts/train_all_modalities_model.py \
|
| 57 |
+
--workspace $WORKSPACE \
|
| 58 |
+
--annotation $ANN \
|
| 59 |
+
--output-dir $AUDIT/min_all_modalities_subtask_model \
|
| 60 |
+
--cache-dir $AUDIT/cache \
|
| 61 |
+
--target subtask
|
| 62 |
+
|
| 63 |
+
$PY -B scripts/episode_task_suite.py \
|
| 64 |
+
--workspace $WORKSPACE \
|
| 65 |
+
--annotation $ANN \
|
| 66 |
+
--output-dir $AUDIT/episode_task_suite \
|
| 67 |
+
--cache-dir $AUDIT/cache
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Exact Match Checks
|
| 71 |
+
|
| 72 |
+
The regenerated files matched the committed files:
|
| 73 |
+
|
| 74 |
+
```text
|
| 75 |
+
min_action_model/metrics.json: MATCH
|
| 76 |
+
min_subtask_model/metrics.json: MATCH
|
| 77 |
+
min_all_modalities_action_model/metrics.json: MATCH
|
| 78 |
+
min_all_modalities_subtask_model/metrics.json: MATCH
|
| 79 |
+
episode_task_suite/summary_report.json: MATCH
|
| 80 |
+
episode_task_suite/feature_manifest.json: MATCH
|
| 81 |
+
episode_task_suite/available_modalities.json: MATCH
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
Every per-task `metrics.json` also matched:
|
| 85 |
+
|
| 86 |
+
```text
|
| 87 |
+
caption_grounding/metrics.json: MATCH
|
| 88 |
+
contact_prediction/metrics.json: MATCH
|
| 89 |
+
cross_modal_retrieval/metrics.json: MATCH
|
| 90 |
+
hand_trajectory_forecast/metrics.json: MATCH
|
| 91 |
+
misalignment_detection/metrics.json: MATCH
|
| 92 |
+
modality_reconstruction/metrics.json: MATCH
|
| 93 |
+
next_action/metrics.json: MATCH
|
| 94 |
+
object_relevance/metrics.json: MATCH
|
| 95 |
+
temporal_order/metrics.json: MATCH
|
| 96 |
+
timeline_action/metrics.json: MATCH
|
| 97 |
+
timeline_subtask/metrics.json: MATCH
|
| 98 |
+
transition_detection/metrics.json: MATCH
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Fresh Cache Evidence
|
| 102 |
+
|
| 103 |
+
The all-modality audit rebuilt a fresh feature cache:
|
| 104 |
+
|
| 105 |
+
```text
|
| 106 |
+
depth_n5821_grid8.npz: shape=(5821, 140), nonzero=809107
|
| 107 |
+
video_fisheye_cam0_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
|
| 108 |
+
video_fisheye_cam1_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570400
|
| 109 |
+
video_fisheye_cam2_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
|
| 110 |
+
video_fisheye_cam3_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=568723
|
| 111 |
+
video_stereo_left_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570249
|
| 112 |
+
video_stereo_right_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570430
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
This confirms the committed metrics are reproducible from the raw sample and
|
| 116 |
+
that the all-modality pipeline reads real depth/video files instead of using
|
| 117 |
+
empty placeholder features.
|
| 118 |
+
|
| 119 |
+
## Caveats
|
| 120 |
+
|
| 121 |
+
The scripts contain a zero-feature fallback if a video file is missing. That is
|
| 122 |
+
not the path used in this audit: all six videos existed and produced nonzero
|
| 123 |
+
features. The repo remains a single-episode learning and pipeline-validation
|
| 124 |
+
project, not evidence of cross-episode generalization.
|
scripts/episode_task_suite.py
ADDED
|
@@ -0,0 +1,776 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
End-to-end task suite for one Ropedia/Xperience episode.
|
| 4 |
+
|
| 5 |
+
The purpose is not to prove generalization from one sample episode. It is to
|
| 6 |
+
turn the episode into multiple meaningful supervised/self-supervised learning
|
| 7 |
+
problems and write reproducible artifacts for each one.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import csv
|
| 14 |
+
import json
|
| 15 |
+
import math
|
| 16 |
+
import sys
|
| 17 |
+
from collections import Counter, OrderedDict
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
from train_all_modalities_model import (
|
| 23 |
+
extract_all_window_features,
|
| 24 |
+
prepare_modalities,
|
| 25 |
+
)
|
| 26 |
+
from train_min_action_model import (
|
| 27 |
+
add_toolkit_to_path,
|
| 28 |
+
compute_metrics,
|
| 29 |
+
encode_labels,
|
| 30 |
+
fit_scaler,
|
| 31 |
+
frame_label,
|
| 32 |
+
majority_label,
|
| 33 |
+
predict,
|
| 34 |
+
portable_path,
|
| 35 |
+
softmax,
|
| 36 |
+
train_softmax_classifier,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Names of every task the suite can run, in the order used when "all" is
# requested. Kept as a list because it is interpolated into error messages.
TASKS = [
    "timeline_action",
    "timeline_subtask",
    "transition_detection",
    "next_action",
    "hand_trajectory_forecast",
    "contact_prediction",
    "object_relevance",
    "caption_grounding",
    "cross_modal_retrieval",
    "modality_reconstruction",
    "temporal_order",
    "misalignment_detection",
]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def parse_args() -> argparse.Namespace:
    """Build the command-line parser for the task suite and parse ``sys.argv``.

    Path defaults are derived from the parent of the directory that contains
    this script. Options are registered from a single ordered table so the
    ``--help`` listing keeps the same order as before.

    Returns:
        The parsed arguments as an ``argparse.Namespace``.
    """
    root = Path(__file__).resolve().parents[1]
    options = [
        ("--workspace", {"type": Path, "default": root}),
        ("--annotation", {"type": Path, "default": root / "data/sample/xperience-10m-sample/annotation.hdf5"}),
        ("--output-dir", {"type": Path, "default": root / "outputs/episode_task_suite"}),
        ("--cache-dir", {"type": Path, "default": root / "outputs/feature_cache"}),
        ("--window-frames", {"type": int, "default": 20}),
        ("--stride-frames", {"type": int, "default": 5}),
        ("--min-label-fraction", {"type": float, "default": 0.6}),
        ("--test-fraction", {"type": float, "default": 0.30}),
        ("--epochs", {"type": int, "default": 400}),
        ("--learning-rate", {"type": float, "default": 0.12}),
        ("--l2", {"type": float, "default": 2e-3}),
        ("--ridge-l2", {"type": float, "default": 10.0}),
        ("--seed", {"type": int, "default": 7}),
        ("--future-frames", {"type": int, "default": 20, "help": "Future offset for next-action prediction."}),
        ("--forecast-frames", {"type": int, "default": 10, "help": "Future hand trajectory length."}),
        ("--boundary-tolerance-frames", {"type": int, "default": 10}),
        ("--misalignment-shift-windows", {"type": int, "default": 8}),
        ("--tasks", {"default": "all", "help": "Comma-separated task list or 'all'."}),
        # Match train_all_modalities_model defaults used by prepare_modalities.
        ("--force-rebuild-cache", {"action": "store_true"}),
        ("--video-image-size", {"type": int, "default": 32}),
        ("--video-grid-size", {"type": int, "default": 8}),
        ("--video-hist-bins", {"type": int, "default": 8}),
        ("--depth-grid-size", {"type": int, "default": 8}),
        ("--text-hash-dim", {"type": int, "default": 128}),
        ("--include-label-text", {"action": "store_true"}),
        ("--no-class-weights", {"action": "store_true"}),
    ]

    cli = argparse.ArgumentParser(description="Run an end-to-end task suite on one Ropedia episode.")
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def selected_tasks(spec: str) -> list[str]:
    """Resolve the ``--tasks`` option into a list of task names.

    Args:
        spec: Either ``"all"`` (case-insensitive, surrounding whitespace
            ignored) or a comma-separated list of task names. Empty items are
            skipped.

    Returns:
        A new list of task names. For ``"all"`` this is a copy of ``TASKS``,
        so callers may mutate the result without altering the registry.

    Raises:
        ValueError: If any requested name is not in ``TASKS``.
    """
    if spec.strip().lower() == "all":
        # Return a copy: handing out the module-level list would let a caller
        # corrupt the registry by editing the result in place.
        return list(TASKS)
    chosen = [x.strip() for x in spec.split(",") if x.strip()]
    unknown = [x for x in chosen if x not in TASKS]
    if unknown:
        raise ValueError(f"Unknown tasks: {unknown}. Valid tasks: {TASKS}")
    return chosen
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def write_json(path: Path, data: dict | list) -> None:
    """Serialize ``data`` as 2-space-indented JSON into ``path`` (UTF-8).

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as a CSV file with a header line.

    Args:
        path: Destination file; missing parent directories are created.
        rows: One dict per output record, keyed by column name.
        fieldnames: Column names, in output order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=fieldnames)
        table.writeheader()
        for record in rows:
            table.writerow(record)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def write_confusion(path: Path, cm: np.ndarray, class_names: list[str]) -> None:
    """Write a confusion matrix to ``path`` as CSV.

    The header row is ``true\\pred`` followed by the class names. Each later
    row starts with a class name followed by row ``i`` of ``cm`` converted to
    ints, where ``i`` is that class's position in ``class_names``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["true\\pred", *class_names])
        for row_id, label in enumerate(class_names):
            counts = [int(value) for value in cm[row_id]]
            out.writerow([label, *counts])
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def chronological_split_indices(n: int, test_fraction: float) -> tuple[np.ndarray, np.ndarray]:
    """Split ``range(n)`` into a leading train part and a trailing test part.

    The cut point is ``round(n * (1 - test_fraction))``, clamped so that both
    parts contain at least one index.

    Returns:
        ``(train_indices, test_indices)`` as int64 arrays.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("Need at least two samples for train/test split.")
    cut = int(round(n * (1.0 - test_fraction)))
    # Keep at least one sample on each side of the cut.
    cut = min(cut, n - 1)
    cut = max(1, cut)
    train_part = np.arange(cut, dtype=np.int64)
    test_part = np.arange(cut, n, dtype=np.int64)
    return train_part, test_part
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def build_windows(args: argparse.Namespace, ann: dict, extras: dict):
    """Slide a fixed-length window over the episode and featurize each one.

    For every window the per-frame action and subtask labels are reduced with
    ``majority_label`` (using ``args.min_label_fraction``) and a feature vector
    is obtained from ``extract_all_window_features``. The first window is
    extracted with ``return_blocks=True`` so the layout of the feature vector
    can be recorded.

    Args:
        args: Uses ``window_frames``, ``stride_frames`` and
            ``min_label_fraction``.
        ann: Annotation dict; reads ``caption_frame_info_map`` and
            ``img_names`` here and is passed on to the feature extractor.
        extras: Passed through unchanged to ``extract_all_window_features``.

    Returns:
        ``(X, rows, feature_manifest)``: the stacked float32 feature matrix,
        one metadata dict per window, and a list of
        ``{"name", "start", "end", "dim"}`` entries describing feature blocks.

    Raises:
        ValueError: If the window or stride length is not positive, or if the
            episode is too short to hold a single window.
    """
    if args.window_frames <= 0 or args.stride_frames <= 0:
        raise ValueError("window_frames and stride_frames must be positive.")

    frame_info = ann["caption_frame_info_map"]
    n_frames = len(ann["img_names"])
    if n_frames < args.window_frames:
        # Fail here with a readable message rather than inside np.stack.
        raise ValueError(
            f"Episode has {n_frames} frames, fewer than one window of "
            f"{args.window_frames} frames; no windows can be built."
        )

    rows = []
    X = []
    feature_manifest = None

    for start in range(0, n_frames - args.window_frames + 1, args.stride_frames):
        end = start + args.window_frames
        action_labels = [frame_label(frame_info.get(i, {}), "action") for i in range(start, end)]
        subtask_labels = [frame_label(frame_info.get(i, {}), "subtask") for i in range(start, end)]
        action, action_frac = majority_label(action_labels, args.min_label_fraction)
        subtask, subtask_frac = majority_label(subtask_labels, args.min_label_fraction)

        if feature_manifest is None:
            # Only the first window asks for the block layout.
            vec, blocks = extract_all_window_features(ann, extras, start, end, return_blocks=True)
            offset = 0
            feature_manifest = []
            for name, dim in blocks:
                feature_manifest.append({"name": name, "start": offset, "end": offset + dim, "dim": dim})
                offset += dim
        else:
            vec = extract_all_window_features(ann, extras, start, end)

        X.append(vec)
        rows.append({
            "window_index": len(rows),
            "start_frame": start,
            "end_frame": end - 1,  # inclusive
            "center_frame": (start + end - 1) // 2,
            "action_label": action,
            "action_fraction": action_frac,
            "subtask_label": subtask,
            "subtask_fraction": subtask_frac,
        })

    return np.stack(X).astype(np.float32), rows, feature_manifest or []
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def block_indices(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> np.ndarray:
    """Collect the feature-column indices of the manifest blocks that pass a filter.

    A block matches a pattern when its name starts with that pattern (which
    also covers an exact match).

    Args:
        feature_manifest: Block descriptors with ``name``, ``start`` and
            ``end`` keys; ``end`` is exclusive.
        include: If non-empty, only blocks matching one of these are kept.
        exclude: Blocks matching any of these are dropped.

    Returns:
        An int64 array of column indices, in manifest order.
    """
    wanted = tuple(include or ())
    banned = tuple(exclude or ())
    columns = []
    for block in feature_manifest:
        name = block["name"]
        keep = not wanted or name.startswith(wanted)
        if keep and banned and name.startswith(banned):
            keep = False
        if keep:
            columns.extend(range(int(block["start"]), int(block["end"])))
    return np.asarray(columns, dtype=np.int64)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def label_array(rows: list[dict], key: str) -> np.ndarray:
    """Return ``row[key]`` for every row as an object array of strings.

    Missing or falsy values become the empty string.
    """
    values = []
    for row in rows:
        raw = row.get(key, "")
        values.append(str(raw or ""))
    return np.asarray(values, dtype=object)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def classification_task(
    out_dir: Path,
    X: np.ndarray,
    labels: np.ndarray,
    rows: list[dict],
    args: argparse.Namespace,
    task_name: str,
    input_description: str,
) -> dict:
    """Train and evaluate a softmax classifier for one window-level task.

    Windows with a falsy label are dropped. The remaining windows are split
    chronologically, features are standardized with statistics fitted on the
    training part only, and a softmax classifier is trained and scored on the
    held-out tail.

    Files written under ``out_dir``: ``metrics.json``,
    ``per_class_metrics.csv``, ``confusion_matrix.csv``, ``predictions.csv``
    and ``model.npz``.

    Args:
        out_dir: Output directory (created if missing).
        X: Feature matrix with one row per entry of ``rows``.
        labels: One label per window; falsy entries mark unlabeled windows.
        rows: Window metadata with ``window_index``, ``start_frame``,
            ``end_frame`` and ``center_frame`` keys.
        args: Uses ``test_fraction``, ``epochs``, ``learning_rate``, ``l2``,
            ``no_class_weights`` and ``seed``.
        task_name: Stored in the metrics under ``"task"``.
        input_description: Stored in the metrics under ``"input"``.

    Returns:
        The metrics dict that was written to ``metrics.json``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    valid = np.asarray([bool(x) for x in labels])
    valid_idx = np.flatnonzero(valid)
    Xv = X[valid_idx]
    labelv = labels[valid_idx]
    rowv = [rows[int(i)] for i in valid_idx]
    y, class_names = encode_labels(labelv)
    train_local, test_local = chronological_split_indices(len(y), args.test_fraction)

    # Classes that only appear in the test tail cannot be learned; report them.
    train_classes = set(int(x) for x in y[train_local])
    test_classes = set(int(x) for x in y[test_local])
    unseen_test_classes = sorted(class_names[i] for i in (test_classes - train_classes))

    # Scaler statistics come from the training part only.
    mean, std = fit_scaler(Xv[train_local])
    Xs = (Xv - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train_local],
        y[train_local],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )
    pred, probs = predict(Xs[test_local], W, b)
    metrics, per_class, cm = compute_metrics(y[test_local], pred, class_names)
    majority = Counter(y[train_local]).most_common(1)[0][0]
    metrics.update({
        "task": task_name,
        "input": input_description,
        "split": "chronological",
        "num_windows": int(len(y)),
        "num_train_windows": int(len(train_local)),
        "num_test_windows": int(len(test_local)),
        "num_classes": int(len(class_names)),
        "feature_dim": int(X.shape[1]),
        "majority_baseline_accuracy": float(np.mean(y[test_local] == majority)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
        "train_final_loss": float(history[-1]["loss"]),
        "unseen_test_classes": unseen_test_classes,
    })

    pred_rows = []
    # ``probs`` is indexed by position within the test split, so enumerate
    # supplies that position directly instead of searching for it per row.
    for test_pos, (local_pos, pred_id) in enumerate(zip(test_local, pred)):
        row = rowv[int(local_pos)]
        true_id = int(y[int(local_pos)])
        pred_rows.append({
            "window_index": row["window_index"],
            "start_frame": row["start_frame"],
            "end_frame": row["end_frame"],
            "center_frame": row["center_frame"],
            "true_label": class_names[true_id],
            "predicted_label": class_names[int(pred_id)],
            "confidence": float(probs[test_pos, int(pred_id)]),
            "correct": int(true_id == int(pred_id)),
        })

    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "per_class_metrics.csv", per_class, ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"])
    write_confusion(out_dir / "confusion_matrix.csv", cm, class_names)
    write_csv(out_dir / "predictions.csv", pred_rows, ["window_index", "start_frame", "end_frame", "center_frame", "true_label", "predicted_label", "confidence", "correct"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=np.asarray(class_names, dtype=object))
    return metrics
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def binary_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute confusion counts and derived scores for 0/1 labels.

    Both inputs are cast to int64 first. Precision, recall and F1 fall back to
    0.0 when their denominator is zero; accuracy divides by at least 1.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1``, the four
        confusion counts, and the positive rate of each input.
    """
    truth = y_true.astype(np.int64)
    guess = y_pred.astype(np.int64)

    def tally(true_value: int, pred_value: int) -> int:
        return int(np.sum((truth == true_value) & (guess == pred_value)))

    tp = tally(1, 1)
    tn = tally(0, 0)
    fp = tally(0, 1)
    fn = tally(1, 0)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "accuracy": float((tp + tn) / max(len(truth), 1)),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "positive_rate_true": float(np.mean(truth)) if len(truth) else 0.0,
        "positive_rate_pred": float(np.mean(guess)) if len(guess) else 0.0,
    }
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def boundary_f1(true_frames: list[int], pred_frames: list[int], tolerance: int) -> dict:
    """Score predicted boundary frames against true ones with greedy matching.

    Predictions are processed in order. Each one claims the nearest still
    unclaimed true boundary within ``tolerance`` frames (ties go to the lower
    list position); a true boundary can be claimed only once.

    Returns:
        Dict with boundary precision/recall/F1, the match and boundary counts,
        and the mean absolute timing error of matches (``None`` if no match).
    """
    claimed = set()
    timing_errors = []
    for predicted in pred_frames:
        best = None
        for position, actual in enumerate(true_frames):
            if position in claimed:
                continue
            gap = abs(predicted - actual)
            if gap > tolerance:
                continue
            candidate = (gap, position, actual)
            if best is None or candidate < best:
                best = candidate
        if best is None:
            continue
        claimed.add(best[1])
        timing_errors.append(best[0])

    matches = len(timing_errors)
    precision = matches / len(pred_frames) if pred_frames else 0.0
    recall = matches / len(true_frames) if true_frames else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return {
        "boundary_precision": precision,
        "boundary_recall": recall,
        "boundary_f1": f1,
        "matched_boundaries": matches,
        "true_boundaries": len(true_frames),
        "predicted_boundaries": len(pred_frames),
        "mean_abs_timing_error_frames": float(np.mean(timing_errors)) if timing_errors else None,
    }
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def task_transition_detection(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Classify windows as ``transition`` or ``steady`` and add boundary scores.

    A true boundary is a frame whose action label is non-empty and differs
    from the previous frame's non-empty label. A window is a ``transition``
    when its center frame lies within ``args.boundary_tolerance_frames`` of any
    true boundary. After the shared classifier has run, its
    ``predictions.csv`` is read back to score predicted transition centers
    against the true boundaries inside the test span.

    Returns:
        The classifier metrics extended with the ``boundary_f1`` results; the
        same dict is rewritten to ``metrics.json``. All true boundaries are
        also written to ``true_boundaries.csv``.
    """
    frame_info = ann["caption_frame_info_map"]
    total_frames = len(ann["img_names"])
    actions = [frame_label(frame_info.get(i, {}), "action") for i in range(total_frames)]

    true_boundaries = []
    for i in range(1, total_frames):
        previous, current = actions[i - 1], actions[i]
        if current and previous and current != previous:
            true_boundaries.append(i)

    tolerance = args.boundary_tolerance_frames
    names = []
    for row in rows:
        center = int(row["center_frame"])
        near_boundary = any(abs(center - b) <= tolerance for b in true_boundaries)
        names.append("transition" if near_boundary else "steady")
    labels = np.asarray(names, dtype=object)
    metrics = classification_task(out_dir, X, labels, rows, args, "transition_detection", "all modalities -> action boundary/steady")

    # Read the test-split predictions back from the file the classifier wrote.
    with (out_dir / "predictions.csv").open("r", encoding="utf-8") as fp:
        pred_rows = list(csv.DictReader(fp))

    centers = [int(r["center_frame"]) for r in pred_rows]
    pred_frames = [int(r["center_frame"]) for r in pred_rows if r["predicted_label"] == "transition"]
    test_start = min(centers, default=0)
    test_end = max(centers, default=0)
    # Only boundaries inside the span covered by test windows can be found.
    true_test = [b for b in true_boundaries if test_start <= b <= test_end]

    metrics.update(boundary_f1(true_test, pred_frames, tolerance))
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "true_boundaries.csv", [{"frame": x} for x in true_boundaries], ["frame"])
    return metrics
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def task_next_action(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Predict the action label ``args.future_frames`` frames after each window.

    The target for a window is the action label of the frame at
    ``end_frame + args.future_frames``. Windows whose target frame would fall
    past the end of the episode get an empty label, which
    ``classification_task`` treats as unlabeled and drops. Clamping to the last
    frame instead would give those windows a shorter horizon than the one
    reported in the task description.

    Returns:
        The metrics dict produced by ``classification_task``.
    """
    frame_info = ann["caption_frame_info_map"]
    last_frame = len(ann["img_names"]) - 1
    labels = []
    for row in rows:
        future_frame = int(row["end_frame"]) + args.future_frames
        if future_frame > last_frame:
            # No frame exists at the full horizon; leave the window unlabeled.
            labels.append("")
            continue
        labels.append(frame_label(frame_info.get(future_frame, {}), "action"))
    return classification_task(out_dir, X, np.asarray(labels, dtype=object), rows, args, "next_action", f"all modalities at t -> action at t+{args.future_frames} frames")
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def ridge_fit_predict(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray, l2: float):
    """Fit a ridge regressor on standardized data and predict the test targets.

    Inputs are scaled with ``fit_scaler`` statistics from the training rows and
    targets are standardized per column (a standard deviation below 1e-6 is
    replaced by 1). A constant column is appended to the inputs, and the
    system is solved through the sample-by-sample Gram matrix with ``l2`` added
    to its diagonal.

    Returns:
        ``(pred, model)``: float32 predictions in the original target units,
        and a dict with ``x_mean``, ``x_std``, ``y_mean``, ``y_std`` and ``W``.
    """
    x_mean, x_std = fit_scaler(X_train)
    y_mean = Y_train.mean(axis=0)
    raw_std = Y_train.std(axis=0)
    y_std = np.where(raw_std < 1e-6, 1.0, raw_std)

    train_scaled = (X_train - x_mean) / x_std
    test_scaled = (X_test - x_mean) / x_std
    targets = (Y_train - y_mean) / y_std

    # The appended constant column acts as the intercept term.
    train_aug = np.concatenate([train_scaled, np.ones((len(train_scaled), 1), dtype=np.float32)], axis=1)
    test_aug = np.concatenate([test_scaled, np.ones((len(test_scaled), 1), dtype=np.float32)], axis=1)

    gram = train_aug @ train_aug.T
    penalty = l2 * np.eye(gram.shape[0], dtype=np.float32)
    alpha = np.linalg.solve(gram + penalty, targets)
    W = train_aug.T @ alpha

    pred = (test_aug @ W) * y_std + y_mean
    model = {
        "x_mean": x_mean,
        "x_std": x_std,
        "y_mean": y_mean.astype(np.float32),
        "y_std": y_std.astype(np.float32),
        "W": W.astype(np.float32),
    }
    return pred.astype(np.float32), model
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def regression_metrics(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """Return MSE, MAE and R^2 computed over all entries of the two arrays.

    R^2 uses the per-column mean of ``Y_true`` as the baseline and is reported
    as 0.0 when the targets have no variance.
    """
    residual = Y_true - Y_pred
    squared = residual ** 2
    ss_res = float(np.sum(squared))
    ss_tot = float(np.sum((Y_true - Y_true.mean(axis=0)) ** 2))
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 0.0
    return {
        "mse": float(np.mean(squared)),
        "mae": float(np.mean(np.abs(residual))),
        "r2": r2,
    }
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
def task_hand_forecast(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Regress the hand joints of the frames that follow each window.

    The target of a window is the concatenation of the left and right hand
    joints over the ``args.forecast_frames`` frames after its last frame,
    flattened to one vector. When ``smplh_body_joints`` is present and covers
    those frames, the first body joint is subtracted from every hand joint.
    Windows without a complete future are skipped. A ridge regressor is fitted
    on the chronological training split.

    Writes ``metrics.json`` and ``predictions.npz`` under ``out_dir``.

    Returns:
        The metrics dict, including ``mpjpe`` and ``final_frame_mpjpe``.

    Raises:
        ValueError: If hand joints are missing from ``ann`` or no window has a
            complete future trajectory.
    """
    left = ann.get("hand_left_joints")
    right = ann.get("hand_right_joints")
    body = ann.get("smplh_body_joints")
    if left is None or right is None:
        raise ValueError("Hand joints not available.")

    valid_idx, Y = [], []
    n_frames = len(left)
    for i, row in enumerate(rows):
        future_start = int(row["end_frame"]) + 1
        future_end = future_start + args.forecast_frames
        if future_end > n_frames:
            continue
        hand = np.concatenate([left[future_start:future_end], right[future_start:future_end]], axis=1)
        if body is not None and future_end <= len(body):
            # Express the hand joints relative to the first body joint.
            root = body[future_start:future_end, :1, :]
            hand = hand - root
        valid_idx.append(i)
        Y.append(hand.reshape(-1))

    if not Y:
        # Fail with a readable message rather than inside np.stack.
        raise ValueError("No window has a complete future hand trajectory to forecast.")

    valid_idx = np.asarray(valid_idx, dtype=np.int64)
    Y = np.stack(Y).astype(np.float32)
    train, test = chronological_split_indices(len(valid_idx), args.test_fraction)
    pred, model = ridge_fit_predict(X[valid_idx[train]], Y[train], X[valid_idx[test]], args.ridge_l2)
    metrics = regression_metrics(Y[test], pred)
    # The joint count is inferred (-1) from the target width, so the reshape
    # does not depend on a fixed number of joints per hand.
    true_hand = Y[test].reshape(len(test), args.forecast_frames, -1, 3)
    pred_hand = pred.reshape(len(test), args.forecast_frames, -1, 3)
    mpjpe = np.linalg.norm(true_hand - pred_hand, axis=-1).mean()
    final_error = np.linalg.norm(true_hand[:, -1] - pred_hand[:, -1], axis=-1).mean()
    metrics.update({
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": int(len(valid_idx)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "forecast_frames": int(args.forecast_frames),
        "mpjpe": float(mpjpe),
        "final_frame_mpjpe": float(final_error),
        "target_dim": int(Y.shape[1]),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=Y[test], y_pred=pred, test_window_indices=valid_idx[test], **model)
    return metrics
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
def task_contact_prediction(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify each window as ``contact`` or ``no_contact``.

    A window is ``contact`` when any entry of ``ann["contacts"]`` within its
    frame range is positive. Feature blocks whose names start with
    ``body_contacts`` or ``caption_objects_interaction_text`` are removed from
    the classifier input.

    Returns:
        The metrics dict produced by ``classification_task``.

    Raises:
        ValueError: If ``ann`` has no ``contacts`` entry.
    """
    contacts = ann.get("contacts")
    if contacts is None:
        raise ValueError("Contacts not available.")

    labels = []
    for row in rows:
        first = int(row["start_frame"])
        stop = int(row["end_frame"]) + 1  # end_frame is inclusive
        touched = np.any(contacts[first:stop] > 0)
        labels.append("contact" if touched else "no_contact")

    keep = block_indices(manifest, exclude=["body_contacts", "caption_objects_interaction_text"])
    return classification_task(out_dir, X[:, keep], np.asarray(labels, dtype=object), rows, args, "contact_prediction", "all non-contact/non-caption-label modalities -> any body contact")
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
def extract_objects(info: dict) -> list[str]:
    """Return the cleaned object names stored under ``info["objects"]``.

    A list or tuple contributes one name per element; any other truthy value
    is treated as a single name. Each name is converted with ``str`` and
    stripped, and names that are empty after stripping are dropped. This
    includes a whitespace-only scalar, which would otherwise become an empty
    object name.

    Returns:
        The list of non-empty names, or ``[]`` when the entry is missing or
        falsy.
    """
    objects = info.get("objects")
    if isinstance(objects, (list, tuple)):
        candidates = objects
    elif objects:
        candidates = [objects]
    else:
        return []
    cleaned = (str(item).strip() for item in candidates)
    return [name for name in cleaned if name]
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Apply the logistic function elementwise, clipping inputs to [-40, 40]."""
    bounded = np.clip(z, -40, 40)
    return 1.0 / (1.0 + np.exp(-bounded))
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
def train_multilabel_logistic(X: np.ndarray, Y: np.ndarray, epochs: int, lr: float, l2: float, seed: int):
    """Train independent logistic outputs with full-batch gradient descent.

    Positive entries are up-weighted per label by the negative/positive count
    ratio, clipped to [1, 20]. Training metrics are recorded at the first and
    last epoch and at every ``epochs // 5`` epochs, using the probabilities
    computed before that epoch's parameter update.

    Args:
        X: Feature matrix with one row per sample.
        Y: 0/1 target matrix with one column per label.
        epochs: Number of full-batch updates.
        lr: Learning rate.
        l2: L2 penalty applied to the weights (not the bias).
        seed: Seed for the weight initialization.

    Returns:
        ``(W, b, history)``: float32 weights and bias, and a list of per-epoch
        metric dicts.
    """
    rng = np.random.default_rng(seed)
    n, d = X.shape
    c = Y.shape[1]
    W = rng.normal(0, 0.01, size=(d, c)).astype(np.float32)
    b = np.zeros(c, dtype=np.float32)
    counts = Y.sum(axis=0)
    pos_weight = (n - counts) / np.maximum(counts, 1.0)
    pos_weight = np.clip(pos_weight, 1.0, 20.0).astype(np.float32)
    # The weights depend only on Y, and the logging interval only on epochs,
    # so both are computed once rather than on every epoch.
    weights = np.where(Y > 0, pos_weight[None, :], 1.0)
    log_every = max(1, epochs // 5)
    history = []
    for epoch in range(1, epochs + 1):
        P = sigmoid(X @ W + b)
        diff = (P - Y) * weights / n
        W -= lr * (X.T @ diff + l2 * W)
        b -= lr * diff.sum(axis=0)
        if epoch == 1 or epoch == epochs or epoch % log_every == 0:
            pred = (P >= 0.5).astype(np.float32)
            history.append({"epoch": epoch, **multilabel_metrics(Y, pred)})
    return W.astype(np.float32), b.astype(np.float32), history
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
def multilabel_metrics(Y: np.ndarray, P: np.ndarray) -> dict:
    """Score 0/1 multi-label predictions ``P`` against targets ``Y``.

    Both arrays are cast to int64. Micro scores pool all entries, macro F1
    averages the per-column F1, and exact match is the fraction of rows
    predicted entirely correctly. Any ratio with a zero denominator is 0.0.

    Returns:
        Dict with ``micro_f1``, ``macro_f1``, ``exact_match``, ``precision``
        and ``recall`` (the last two are the micro-averaged values).
    """
    truth = Y.astype(np.int64)
    guess = P.astype(np.int64)
    hits = (truth == 1) & (guess == 1)
    false_alarms = (truth == 0) & (guess == 1)
    misses = (truth == 1) & (guess == 0)

    tp = int(np.sum(hits))
    fp = int(np.sum(false_alarms))
    fn = int(np.sum(misses))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    micro_f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    per_f1 = []
    for j in range(truth.shape[1]):
        tpj = np.sum(hits[:, j])
        fpj = np.sum(false_alarms[:, j])
        fnj = np.sum(misses[:, j])
        pj = tpj / (tpj + fpj) if tpj + fpj else 0.0
        rj = tpj / (tpj + fnj) if tpj + fnj else 0.0
        per_f1.append(2 * pj * rj / (pj + rj) if pj + rj else 0.0)

    rows_correct = np.all(truth == guess, axis=1)
    return {
        "micro_f1": float(micro_f1),
        "macro_f1": float(np.mean(per_f1)),
        "exact_match": float(np.mean(rows_correct)),
        "precision": precision,
        "recall": recall,
    }
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def task_object_relevance(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Predict, per window, the set of objects mentioned in its frame captions.

    The object vocabulary is built in order of first appearance. Targets are
    multi-hot rows over that vocabulary, and the classifier input excludes
    feature blocks whose names start with ``caption_objects_interaction_text``.
    If no label reaches probability 0.5 for a test window, its single most
    probable label is emitted instead.

    Writes ``metrics.json``, ``object_vocab.json``, ``predictions.csv`` and
    ``model.npz`` under ``out_dir``.

    Returns:
        The multi-label metrics dict extended with task metadata.

    Raises:
        ValueError: If no window has any object label.
    """
    frame_info = ann["caption_frame_info_map"]
    vocab = OrderedDict()
    per_window = []
    for row in rows:
        first, last = int(row["start_frame"]), int(row["end_frame"])
        # Distinct objects of the window, in order of first occurrence.
        mentioned = list(dict.fromkeys(
            obj
            for frame in range(first, last + 1)
            for obj in extract_objects(frame_info.get(frame, {}))
        ))
        for obj in mentioned:
            vocab.setdefault(obj, len(vocab))
        per_window.append(mentioned)
    if not vocab:
        raise ValueError("No object labels found.")

    Y = np.zeros((len(rows), len(vocab)), dtype=np.float32)
    for i, mentioned in enumerate(per_window):
        for obj in mentioned:
            Y[i, vocab[obj]] = 1.0

    keep = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    Xo = X[:, keep]
    train, test = chronological_split_indices(len(rows), args.test_fraction)
    mean, std = fit_scaler(Xo[train])
    Xs = (Xo - mean) / std
    W, b, history = train_multilabel_logistic(Xs[train], Y[train], args.epochs, 0.05, args.l2, args.seed)
    prob = sigmoid(Xs[test] @ W + b)
    pred = (prob >= 0.5).astype(np.float32)
    # Ensure at least one object is emitted per row.
    silent = np.flatnonzero(pred.sum(axis=1) == 0)
    if silent.size:
        pred[silent, prob[silent].argmax(axis=1)] = 1

    metrics = multilabel_metrics(Y[test], pred)
    metrics.update({
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": int(len(rows)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "num_objects": int(len(vocab)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    write_json(out_dir / "object_vocab.json", list(vocab.keys()))

    names = list(vocab.keys())
    rows_out = []
    # ``pred`` is indexed within the test split, ``Y`` and ``rows`` globally.
    for local_i, global_i in enumerate(test):
        window = rows[int(global_i)]
        true_objs = [names[j] for j in np.flatnonzero(Y[global_i] > 0)]
        pred_objs = [names[j] for j in np.flatnonzero(pred[local_i] > 0)]
        rows_out.append({
            "window_index": int(global_i),
            "start_frame": window["start_frame"],
            "end_frame": window["end_frame"],
            "true_objects": "|".join(true_objs),
            "predicted_objects": "|".join(pred_objs),
        })
    write_csv(out_dir / "predictions.csv", rows_out, ["window_index", "start_frame", "end_frame", "true_objects", "predicted_objects"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, object_vocab=np.asarray(names, dtype=object), history=np.asarray(history, dtype=object))
    return metrics
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
def normalize_rows(A: np.ndarray) -> np.ndarray:
    """Scale each row of ``A`` to unit Euclidean length.

    Row norms are floored at 1e-8 before the division, so an all-zero row
    comes back as zeros rather than NaN/inf.
    """
    row_lengths = np.linalg.norm(A, axis=1, keepdims=True)
    safe_lengths = np.maximum(row_lengths, 1e-8)
    return A / safe_lengths
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def retrieval_metrics(query: np.ndarray, candidates: np.ndarray, positive_indices: np.ndarray, topks=(1, 5, 10)) -> dict:
    """Rank candidates by cosine similarity and summarise where the positives land.

    Args:
        query: One query vector per row.
        candidates: One candidate vector per row.
        positive_indices: For query ``i``, the row of ``candidates`` that
            counts as the correct match.
        topks: Cut-offs for which a ``top{k}_accuracy`` entry is reported.

    Returns:
        A dict with ``mrr``, ``median_rank``, ``mean_rank``, ``num_queries``
        and one ``top{k}_accuracy`` per requested ``k``.  Ranks are 1-based.
    """
    # Rows are unit-normalised first, so the dot product is cosine similarity.
    similarity = normalize_rows(query) @ normalize_rows(candidates).T

    # Rank of the positive = its position in the descending similarity order.
    ranks = np.asarray([
        int(np.where(np.argsort(-similarity[row]) == target)[0][0]) + 1
        for row, target in enumerate(positive_indices)
    ])

    summary = {
        "mrr": float(np.mean(1.0 / ranks)),
        "median_rank": float(np.median(ranks)),
        "mean_rank": float(np.mean(ranks)),
        "num_queries": int(len(ranks)),
    }
    summary.update({f"top{k}_accuracy": float(np.mean(ranks <= k)) for k in topks})
    return summary
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
def task_caption_grounding(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Caption grounding: retrieve the time window that matches a caption query.

    ``ridge_fit_predict`` is fit on the training split to map the non-caption
    feature columns onto the caption-text columns.  The true caption features
    of each test window are then used as queries against the projected test
    windows; the window at the same position is the positive.

    Writes ``metrics.json`` and ``model.npz`` into ``out_dir`` and returns the
    metrics dictionary.
    """
    caption_block = "caption_objects_interaction_text"
    text_idx = block_indices(manifest, include=[caption_block])
    sensor_idx = block_indices(manifest, exclude=[caption_block])
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_rows, test_rows = X[train], X[test]
    pred_text, model = ridge_fit_predict(
        train_rows[:, sensor_idx],
        train_rows[:, text_idx],
        test_rows[:, sensor_idx],
        args.ridge_l2,
    )

    # Query is true text; candidates are sensor windows projected into text space.
    positives = np.arange(len(test))
    metrics = retrieval_metrics(test_rows[:, text_idx], pred_text, positives)
    metrics.update(
        task="caption_grounding",
        input="caption objects/interaction text query + candidate sensor windows",
        output="matching time window",
        split="chronological",
        num_train_windows=int(len(train)),
        num_test_windows=int(len(test)),
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
def task_cross_modal_retrieval(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Cross-modal retrieval: find the depth/video window matching a motion query.

    ``ridge_fit_predict`` is fit on the training split to map the
    motion/IMU/camera columns onto the depth/video columns.  The projected
    test windows are the queries and the true visual features of the test
    windows are the candidates; the window at the same position is the
    positive.

    Writes ``metrics.json`` and ``model.npz`` into ``out_dir`` and returns the
    metrics dictionary.
    """
    motion_blocks = ["hand_", "body_joints", "body_contacts", "camera_", "imu_"]
    visual_blocks = ["depth_confidence", "video_"]
    motion_idx = block_indices(manifest, include=motion_blocks)
    visual_idx = block_indices(manifest, include=visual_blocks)
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_rows, test_rows = X[train], X[test]
    pred_visual, model = ridge_fit_predict(
        train_rows[:, motion_idx],
        train_rows[:, visual_idx],
        test_rows[:, motion_idx],
        args.ridge_l2,
    )

    positives = np.arange(len(test))
    metrics = retrieval_metrics(pred_visual, test_rows[:, visual_idx], positives)
    metrics.update(
        task="cross_modal_retrieval",
        input="motion/IMU/camera query",
        output="matching depth/video window",
        split="chronological",
        num_train_windows=int(len(train)),
        num_test_windows=int(len(test)),
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
def task_modality_reconstruction(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Modality reconstruction: regress depth/video features from motion features.

    ``ridge_fit_predict`` is fit on the training split from the
    motion/IMU/camera columns to the depth/video columns, and the prediction
    for the test split is scored with ``regression_metrics``.

    Writes ``metrics.json`` and ``predictions.npz`` (true values, predictions
    and the fitted model arrays) into ``out_dir`` and returns the metrics
    dictionary.
    """
    motion_blocks = ["hand_", "body_joints", "body_contacts", "camera_", "imu_"]
    visual_blocks = ["depth_confidence", "video_"]
    motion_idx = block_indices(manifest, include=motion_blocks)
    visual_idx = block_indices(manifest, include=visual_blocks)
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_rows = X[train]
    pred, model = ridge_fit_predict(
        train_rows[:, motion_idx],
        train_rows[:, visual_idx],
        X[test][:, motion_idx],
        args.ridge_l2,
    )

    metrics = regression_metrics(X[test][:, visual_idx], pred)
    metrics.update(
        task="modality_reconstruction",
        input="motion/IMU/camera",
        output="depth/video feature vector",
        split="chronological",
        num_train_windows=int(len(train)),
        num_test_windows=int(len(test)),
        target_dim=int(len(visual_idx)),
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(
        out_dir / "predictions.npz",
        y_true=X[test][:, visual_idx],
        y_pred=pred,
        **model,
    )
    return metrics
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
def binary_classification_from_arrays(out_dir: Path, X: np.ndarray, y: np.ndarray, args: argparse.Namespace, task: str, input_desc: str) -> dict:
    """Train and evaluate a two-class softmax classifier on prepared arrays.

    The scaler is fit on the training split only and then applied to every
    row.  Training uses class weights and the epochs, learning rate, L2 and
    seed taken from ``args``.

    Args:
        out_dir: Directory that receives ``metrics.json``, ``predictions.csv``
            and ``model.npz``.
        X: Feature matrix, one sample per row.
        y: Binary labels aligned with the rows of ``X``.
        args: Parsed command-line options.
        task: Task name stored in the metrics.
        input_desc: Human-readable description of the input, stored in the
            metrics.

    Returns:
        The metrics dictionary that was written to ``metrics.json``.
    """
    train, test = chronological_split_indices(len(y), args.test_fraction)

    mean, std = fit_scaler(X[train])
    Xs = (X - mean) / std

    training_options = dict(
        n_classes=2,
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=True,
        seed=args.seed,
    )
    W, b, history = train_softmax_classifier(Xs[train], y[train].astype(np.int64), **training_options)

    pred, prob = predict(Xs[test], W, b)
    metrics = binary_metrics(y[test], pred)
    metrics.update(
        task=task,
        input=input_desc,
        split="chronological",
        num_samples=int(len(y)),
        num_train_samples=int(len(train)),
        num_test_samples=int(len(test)),
        train_final_accuracy=float(history[-1]["train_accuracy"]),
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)

    # One CSV row per test sample; column 1 of ``prob`` is reported as the positive class.
    pred_rows = [
        {
            "sample_index": int(sample),
            "true": int(y[sample]),
            "predicted": int(pred[position]),
            "prob_positive": float(prob[position, 1]),
        }
        for position, sample in enumerate(test)
    ]
    write_csv(out_dir / "predictions.csv", pred_rows, ["sample_index", "true", "predicted", "prob_positive"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b)
    return metrics
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
def task_temporal_order(out_dir: Path, X: np.ndarray, args: argparse.Namespace) -> dict:
    """Temporal order: decide whether two adjacent windows are in the right order.

    Every adjacent pair ``(earlier, later)`` yields two samples, stored one
    after the other:

    * ``[earlier, later, later - earlier]`` with label 1 (correct order)
    * ``[later, earlier, earlier - later]`` with label 0 (reversed)

    The samples are handed to ``binary_classification_from_arrays``, whose
    metrics dictionary is returned.
    """
    samples: list[np.ndarray] = []
    for earlier, later in zip(X[:-1], X[1:]):
        samples.append(np.concatenate([earlier, later, later - earlier]))
        samples.append(np.concatenate([later, earlier, earlier - later]))

    # Labels alternate 1, 0 to match the order in which samples were appended.
    labels = [1, 0] * (len(samples) // 2)

    return binary_classification_from_arrays(
        out_dir,
        np.stack(samples).astype(np.float32),
        np.asarray(labels, dtype=np.int64),
        args,
        "temporal_order",
        "two adjacent windows -> whether order is correct",
    )
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def task_misalignment(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Misalignment detection: tell aligned motion/visual pairs from shifted ones.

    For every window ``i`` that still has a partner ``shift`` windows later,
    two samples are produced, stored one after the other:

    * ``[motion_i, visual_i]`` with label 1 (aligned)
    * ``[motion_i, visual_{i+shift}]`` with label 0 (misaligned)

    The samples are handed to ``binary_classification_from_arrays``, whose
    metrics dictionary is returned.

    Raises:
        ValueError: If ``args.misalignment_shift_windows`` is not positive
            (a zero shift would make negatives identical to positives, a
            negative one would index past the end of ``X``), or if ``X`` has
            too few windows for the requested shift.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])

    shift = args.misalignment_shift_windows
    if shift <= 0:
        raise ValueError(f"misalignment_shift_windows must be a positive number of windows, got {shift}")
    limit = len(X) - shift
    if limit <= 0:
        raise ValueError(f"misalignment shift of {shift} windows needs more than {shift} windows, got {len(X)}")

    # assumes X is 2-D (windows x features), as main() reads X.shape[1] -- TODO confirm
    anchors = X[:limit]
    motion = anchors[:, motion_idx]
    aligned = np.concatenate([motion, anchors[:, visual_idx]], axis=1)
    shifted = np.concatenate([motion, X[shift:shift + limit][:, visual_idx]], axis=1)

    # Interleave rows as aligned_0, shifted_0, aligned_1, shifted_1, ...
    pairs = np.stack([aligned, shifted], axis=1).reshape(2 * limit, aligned.shape[1])
    labels = np.tile(np.array([1, 0], dtype=np.int64), limit)

    return binary_classification_from_arrays(
        out_dir,
        pairs.astype(np.float32),
        labels,
        args,
        "misalignment_detection",
        f"motion+visual pair -> aligned vs shifted by {shift} windows",
    )
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
def main() -> int:
    """Build the shared windows, run every selected task and write the suite report.

    Steps:
        1. Parse arguments and load the annotation file.
        2. Build the shared feature windows and write the shared artifacts
           (``available_modalities.json``, ``feature_manifest.json``,
           ``windows.csv``, ``shared_windows.npz``).
        3. Run each selected task into its own sub-directory of
           ``args.output_dir``.
        4. Write ``summary_report.json`` with the metrics of every task.

    A task that raises does not stop the suite: its error message is stored
    in the summary and in ``<task>/error.json`` and the next task runs.

    Returns:
        Always ``0``.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Imported only after add_toolkit_to_path() has run; presumably that call
    # is what makes ``data_loader`` importable.
    from data_loader import load_from_annotation_hdf5

    args.output_dir.mkdir(parents=True, exist_ok=True)
    tasks = selected_tasks(args.tasks)

    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)
    extras, available_modalities = prepare_modalities(args, ann)
    print("Building shared all-modality windows")
    X, rows, manifest = build_windows(args, ann, extras)

    write_json(args.output_dir / "available_modalities.json", available_modalities)
    write_json(args.output_dir / "feature_manifest.json", manifest)
    write_csv(args.output_dir / "windows.csv", rows, ["window_index", "start_frame", "end_frame", "center_frame", "action_label", "action_fraction", "subtask_label", "subtask_fraction"])
    np.savez_compressed(args.output_dir / "shared_windows.npz", X=X, starts=np.asarray([r["start_frame"] for r in rows]), ends=np.asarray([r["end_frame"] for r in rows]))

    summary = {
        "annotation": portable_path(args.annotation, args.workspace),
        "num_frames": int(len(ann["img_names"])),
        "num_windows": int(len(rows)),
        "feature_dim": int(X.shape[1]),
        "window_frames": int(args.window_frames),
        "stride_frames": int(args.stride_frames),
        "tasks": {},
    }

    # Task name -> callable taking the task's output directory and returning its metrics.
    runners = {
        "timeline_action": lambda out: classification_task(out, X, label_array(rows, "action_label"), rows, args, "timeline_action", "all modalities -> current action label"),
        "timeline_subtask": lambda out: classification_task(out, X, label_array(rows, "subtask_label"), rows, args, "timeline_subtask", "all modalities -> current subtask label"),
        "transition_detection": lambda out: task_transition_detection(out, X, rows, ann, args),
        "next_action": lambda out: task_next_action(out, X, rows, ann, args),
        "hand_trajectory_forecast": lambda out: task_hand_forecast(out, X, rows, ann, args),
        "contact_prediction": lambda out: task_contact_prediction(out, X, rows, ann, manifest, args),
        "object_relevance": lambda out: task_object_relevance(out, X, rows, ann, manifest, args),
        "caption_grounding": lambda out: task_caption_grounding(out, X, manifest, args),
        "cross_modal_retrieval": lambda out: task_cross_modal_retrieval(out, X, manifest, args),
        "modality_reconstruction": lambda out: task_modality_reconstruction(out, X, manifest, args),
        "temporal_order": lambda out: task_temporal_order(out, X, args),
        "misalignment_detection": lambda out: task_misalignment(out, X, manifest, args),
    }
    headline_keys = ("accuracy", "macro_f1", "f1", "mpjpe", "mrr", "r2", "micro_f1")

    print(f"Windows: {len(rows)}, feature_dim: {X.shape[1]}")
    for task in tasks:
        print(f"\nRunning task: {task}")
        out = args.output_dir / task
        try:
            runner = runners.get(task)
            if runner is None:
                raise ValueError(task)
            metrics = runner(out)
            summary["tasks"][task] = metrics
            key_metrics = {k: metrics[k] for k in headline_keys if k in metrics}
            print(f"  done: {key_metrics}")
        except Exception as exc:
            # Deliberate best-effort boundary: one failing task must not abort the suite.
            summary["tasks"][task] = {"error": str(exc)}
            # Tasks create their directory only after their computation succeeded,
            # so it may not exist yet when a task fails early.
            out.mkdir(parents=True, exist_ok=True)
            write_json(out / "error.json", {"task": task, "error": str(exc)})
            print(f"  error: {exc}")

    write_json(args.output_dir / "summary_report.json", summary)
    print(f"\nSuite artifacts written to: {args.output_dir}")
    return 0
|
| 773 |
+
|
| 774 |
+
|
| 775 |
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
scripts/generate_visualizations.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate static SVG visualizations and website data for the Ropedia task suite.
|
| 4 |
+
|
| 5 |
+
No plotting dependencies are required; this uses only the Python standard
|
| 6 |
+
library so the repo stays easy to run.
|
| 7 |
+
|
| 8 |
+
The polished GitHub Pages homepage in docs/index.html is hand-curated and is
|
| 9 |
+
not overwritten by this script. This script refreshes docs/assets/*.svg,
|
| 10 |
+
docs/assets/charts/*.svg, and docs/data/summary_metrics.json.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import html
|
| 16 |
+
import json
|
| 17 |
+
import textwrap
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Repository root: two levels above this file (the script sits in a sub-directory of the repo).
ROOT = Path(__file__).resolve().parents[1]
# Directory with the committed task results.
RESULTS = ROOT.joinpath("results")
# Site directory and the asset folders inside it.
DOCS = ROOT.joinpath("docs")
ASSETS = DOCS.joinpath("assets")
CHARTS = ASSETS.joinpath("charts")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def svg_bar_chart(path: Path, title: str, rows: list[tuple[str, float]], x_label: str = "score", max_value: float | None = None) -> None:
    """Write a horizontal bar chart as a standalone SVG file.

    Args:
        path: Output file; missing parent directories are created.
        title: Chart heading (HTML-escaped).
        rows: ``(label, value)`` pairs, drawn top to bottom, one bar each.
        x_label: Caption printed below the axis (HTML-escaped).
        max_value: Value at the right edge of the plot.  When omitted, the
            largest row value is used, but never less than 1.0.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    # Fixed layout, in SVG user units.
    width, row_h = 1100, 34
    top, left, right = 78, 310, 70
    height = top + row_h * len(rows) + 70
    plot_w = width - left - right

    if max_value is None:
        max_value = max([amount for _, amount in rows] + [1.0])
    max_value = max(max_value, 1e-9)  # keeps the divisions below well-defined

    palette = ["#2563eb", "#059669", "#ea580c", "#7b5d12", "#0891b2", "#dc2626"]

    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        f'<text x="32" y="42" font-family="Arial, sans-serif" font-size="26" font-weight="700" fill="#111827">{html.escape(title)}</text>',
        f'<text x="{left}" y="{height - 24}" font-family="Arial, sans-serif" font-size="13" fill="#6b7280">{html.escape(x_label)}</text>',
    ]

    # Six evenly spaced vertical grid lines, each with its axis value underneath.
    for step in range(6):
        grid_x = left + plot_w * step / 5
        grid_value = max_value * step / 5
        svg.extend([
            f'<line x1="{grid_x:.1f}" y1="{top - 18}" x2="{grid_x:.1f}" y2="{height - 50}" stroke="#e5e7eb" stroke-width="1"/>',
            f'<text x="{grid_x:.1f}" y="{height - 30}" text-anchor="middle" font-family="Arial, sans-serif" font-size="12" fill="#6b7280">{grid_value:.2f}</text>',
        ])

    # One label, one bar and one numeric annotation per row.
    for index, (name, amount) in enumerate(rows):
        row_y = top + index * row_h
        fraction = max(0.0, min(amount / max_value, 1.0))  # bar never leaves the plot area
        bar_len = fraction * plot_w
        fill = palette[index % len(palette)]
        svg.extend([
            f'<text x="{left - 14}" y="{row_y + 21}" text-anchor="end" font-family="Arial, sans-serif" font-size="14" fill="#111827">{html.escape(name)}</text>',
            f'<rect x="{left}" y="{row_y + 5}" width="{bar_len:.1f}" height="20" rx="4" fill="{fill}"/>',
            f'<text x="{left + bar_len + 8:.1f}" y="{row_y + 21}" font-family="Arial, sans-serif" font-size="13" fill="#374151">{amount:.4f}</text>',
        ])

    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def svg_feature_blocks(path: Path, feature_manifest: list[dict]) -> None:
    """Draw one bar per feature block, sized by the block's dimensionality.

    Args:
        path: Output SVG file, passed through to ``svg_bar_chart``.
        feature_manifest: Blocks with a ``name`` and a ``dim`` entry each.

    The axis ends 8% beyond the widest block.  An empty manifest no longer
    raises from ``max()`` on an empty sequence; the chart is drawn without
    bars on ``svg_bar_chart``'s default scale instead.
    """
    rows = [(block["name"], float(block["dim"])) for block in feature_manifest]
    widest = max((dim for _, dim in rows), default=None)
    axis_end = widest * 1.08 if widest is not None else None
    svg_bar_chart(path, "All-Modality Feature Blocks", rows, x_label="feature dimensions", max_value=axis_end)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def svg_pipeline_diagram(path: Path, summary: dict) -> None:
    """Write the seven-stage pipeline overview as a standalone SVG file.

    Args:
        path: Output file; missing parent directories are created.
        summary: Must contain a ``suite`` mapping with ``tasks``,
            ``num_frames``, ``window_frames``, ``stride_frames``,
            ``num_windows`` and ``feature_dim``; these numbers are printed
            inside the stage boxes.

    The drawing consists of connector arrows, one box per stage (title plus
    a few bullet lines) and a footer panel with three audit notes.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    suite = summary["suite"]
    task_count = len(suite["tasks"])
    width, height = 1400, 760

    # Each stage: (x, y, width, height, title, body lines, accent colour).
    stages = [
        (60, 110, 250, 132, "1. Raw public sample", [
            "annotation.hdf5",
            "6 video files",
            f"{suite['num_frames']:,} aligned frames",
        ], "#1f63e9"),
        (365, 110, 250, 132, "2. HOMIE loader", [
            "mocap, IMU, depth",
            "caption map",
            "SLAM and calibration",
        ], "#008b9a"),
        (670, 110, 250, 132, "3. Window builder", [
            f"{suite['window_frames']}-frame windows",
            f"{suite['stride_frames']}-frame stride",
            f"{suite['num_windows']:,} windows",
        ], "#0a7f55"),
        (975, 110, 300, 132, "4. Feature vector", [
            f"{suite['feature_dim']:,} dimensions",
            "17 named feature blocks",
            "stored manifest",
        ], "#b65b04"),
        (60, 380, 360, 168, "5. Baseline models", [
            "motion-only action/subtask",
            "all-modality action/subtask",
            "numpy softmax classifier",
            "metrics and predictions",
        ], "#1f63e9"),
        (520, 380, 360, 168, "6. Episode task suite", [
            f"{task_count} supervised/self-supervised tasks",
            "chronological split",
            "retrieval, forecast, alignment",
            "per-task artifacts",
        ], "#008b9a"),
        (980, 380, 300, 168, "7. Published artifacts", [
            "results/**/*.json/csv/npz",
            "docs/data/summary_metrics.json",
            "GitHub Pages dashboard",
            "reproducibility audit",
        ], "#0a7f55"),
    ]

    # Connector segments as (x1, y1, x2, y2).
    connectors = [
        (310, 176, 365, 176),
        (615, 176, 670, 176),
        (920, 176, 975, 176),
        (215, 242, 240, 380),
        (1095, 242, 700, 380),
        (420, 464, 520, 464),
        (880, 464, 980, 464),
    ]

    notes = [
        "Audit check: rerunning scripts to /private/tmp reproduced committed metrics exactly.",
        "Video/depth check: fresh cache read depth plus fisheye_cam0/1/2/3 and stereo_left/right from raw files.",
        "Scope check: this validates one public sample episode, not cross-episode generalization.",
    ]

    # The arrow-head marker definition sits right after the opening tag.
    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<defs><marker id="arrow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<rect x="0" y="0" width="1400" height="760" fill="#ffffff"/>',
        '<text x="60" y="58" font-family="Arial, sans-serif" font-size="32" font-weight="700" fill="#10141f">Verified Ropedia Episode Pipeline</text>',
        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from committed scripts and metrics; no conceptual placeholder stages.</text>',
    ]

    svg.extend(
        f'<line x1="{sx}" y1="{sy}" x2="{ex}" y2="{ey}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow)"/>'
        for sx, sy, ex, ey in connectors
    )

    for bx, by, bw, bh, heading, body, accent in stages:
        svg.append(f'<rect x="{bx}" y="{by}" width="{bw}" height="{bh}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>')
        svg.append(f'<rect x="{bx}" y="{by}" width="8" height="{bh}" rx="4" fill="{accent}"/>')
        svg.append(f'<text x="{bx + 24}" y="{by + 34}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(heading)}</text>')
        svg.extend(
            f'<text x="{bx + 24}" y="{by + 66 + row * 22}" font-family="Arial, sans-serif" font-size="14" fill="#394255">{html.escape(text)}</text>'
            for row, text in enumerate(body)
        )

    svg.append('<rect x="60" y="620" width="1220" height="96" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
    svg.extend(
        f'<text x="84" y="{650 + row * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(text)}</text>'
        for row, text in enumerate(notes)
    )

    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def feature_dim(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> int:
    """Sum the ``dim`` of every manifest block selected by name prefix.

    Args:
        feature_manifest: Blocks with a ``name`` and a ``dim`` entry each.
        include: When non-empty, only blocks whose name starts with one of
            these prefixes are counted.
        exclude: Blocks whose name starts with one of these prefixes are
            dropped, even if they matched ``include``.

    Returns:
        The total dimensionality of the selected blocks (0 if none match).
    """
    wanted = tuple(include or [])
    unwanted = tuple(exclude or [])

    def selected(name: str) -> bool:
        # An exact match is also a prefix match, so startswith() covers both cases.
        if wanted and not name.startswith(wanted):
            return False
        return not (unwanted and name.startswith(unwanted))

    return sum(int(block["dim"]) for block in feature_manifest if selected(block["name"]))
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def metric_text(task_name: str, metrics: dict) -> str:
    """Return the headline metric of a task as a short display string.

    Tasks with a dedicated headline metric use it (a missing key raises
    ``KeyError``).  Any other task falls back to ``macro_f1``, then
    ``accuracy``, and finally to a pointer at the summary report.
    """
    # task name -> (display label, key in ``metrics``)
    headline = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in headline:
        label, key = headline[task_name]
        return f"{label} {metrics[key]:.4f}"

    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return f"{label} {metrics[key]:.4f}"
    return "metric in summary_report.json"
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def draw_text_block(parts: list[str], x: int, y: int, lines: list[str], size: int = 13, color: str = "#394255", weight: str = "500", max_chars: int = 42, line_h: int = 18) -> int:
    """Append word-wrapped SVG ``<text>`` elements to ``parts``.

    Every entry of ``lines`` is wrapped to at most ``max_chars`` characters;
    an entry that wraps to nothing still takes up one (blank) row.  Rows are
    placed at ``x``, starting at ``y`` and moving down ``line_h`` per row.

    Returns:
        The y coordinate immediately below the last row written.
    """
    wrapped_rows = [
        fragment
        for entry in lines
        for fragment in (textwrap.wrap(entry, width=max_chars) or [""])
    ]

    baseline = y
    for fragment in wrapped_rows:
        parts.append(f'<text x="{x}" y="{baseline}" font-family="Arial, sans-serif" font-size="{size}" font-weight="{weight}" fill="{color}">{html.escape(fragment)}</text>')
        baseline += line_h
    return baseline
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def task_architecture_rows(summary: dict) -> list[dict]:
    """Describe the minimal architecture of each of the twelve suite tasks.

    Args:
        summary: Must contain ``suite`` (with ``feature_dim`` and per-task
            metrics under ``tasks``) and ``feature_manifest``.

    Returns:
        One dict per task, in display order, with the keys ``task``,
        ``family``, ``input``, ``head``, ``output`` and ``metric``.  Input
        dimensionalities are derived from the feature manifest.
    """
    suite = summary["suite"]
    tasks = suite["tasks"]
    manifest = summary["feature_manifest"]

    # Dimensionalities of the feature subsets the individual tasks consume.
    all_dim = int(suite["feature_dim"])
    no_contact_text_dim = feature_dim(manifest, exclude=["body_contacts", "caption_objects_interaction_text"])
    no_text_dim = feature_dim(manifest, exclude=["caption_objects_interaction_text"])
    sensor_dim = no_text_dim
    text_dim = feature_dim(manifest, include=["caption_objects_interaction_text"])
    motion_dim = feature_dim(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_dim = feature_dim(manifest, include=["depth_confidence", "video_"])
    pair_dim = all_dim * 3  # two windows plus their difference
    align_dim = motion_dim + visual_dim

    # Head descriptions shared by several tasks.
    softmax_head = "z-score -> linear softmax, class-weighted CE + L2"
    binary_head = "z-score -> binary linear softmax, CE + L2"
    ridge_head = "z-score X/Y -> dual ridge regression, L2=10"
    rank_head = "ridge projection, then cosine ranking"

    def describe(task: str, family: str, input_desc: str, head: str, output: str, metric: str | None = None) -> dict:
        # Unless given explicitly, the metric text is the task's standard headline metric.
        if metric is None:
            metric = metric_text(task, tasks[task])
        return {
            "task": task,
            "family": family,
            "input": input_desc,
            "head": head,
            "output": output,
            "metric": metric,
        }

    return [
        describe(
            "timeline_action",
            "softmax",
            f"X_all window, {all_dim:,}d",
            softmax_head,
            f"current action class, {tasks['timeline_action']['num_classes']} classes",
        ),
        describe(
            "timeline_subtask",
            "softmax",
            f"X_all window, {all_dim:,}d",
            softmax_head,
            f"current subtask class, {tasks['timeline_subtask']['num_classes']} classes",
        ),
        describe(
            "transition_detection",
            "softmax",
            f"X_all window, {all_dim:,}d",
            softmax_head,
            "steady vs transition near action boundary",
            metric=f"{metric_text('transition_detection', tasks['transition_detection'])}; boundary-F1 {tasks['transition_detection']['boundary_f1']:.4f}",
        ),
        describe(
            "next_action",
            "softmax",
            f"X_all at time t, {all_dim:,}d",
            softmax_head,
            f"action at t+{tasks['next_action'].get('future_frames', 20)} frames",
        ),
        describe(
            "hand_trajectory_forecast",
            "ridge",
            f"X_all at time t, {all_dim:,}d",
            ridge_head,
            f"future hand joints, {tasks['hand_trajectory_forecast']['target_dim']}d",
        ),
        describe(
            "contact_prediction",
            "softmax",
            f"X without contact/text leakage, {no_contact_text_dim:,}d",
            "z-score -> linear softmax on observed labels",
            "any body contact in window; degenerate one-class sample",
        ),
        describe(
            "object_relevance",
            "multilabel",
            f"X without caption text, {no_text_dim:,}d",
            "z-score -> sigmoid multi-label logistic, weighted",
            f"multi-hot object set, {tasks['object_relevance']['num_objects']} objects",
        ),
        describe(
            "caption_grounding",
            "ridge+rank",
            f"sensor {sensor_dim:,}d -> text space {text_dim:,}d",
            rank_head,
            "text query retrieves matching time window",
        ),
        describe(
            "cross_modal_retrieval",
            "ridge+rank",
            f"motion/IMU/camera {motion_dim:,}d -> visual {visual_dim:,}d",
            rank_head,
            "retrieve matching depth/video window",
        ),
        describe(
            "modality_reconstruction",
            "ridge",
            f"motion/IMU/camera {motion_dim:,}d",
            ridge_head,
            f"depth/video feature vector, {visual_dim:,}d",
        ),
        describe(
            "temporal_order",
            "softmax",
            f"concat[x_t, x_t+1, diff], {pair_dim:,}d",
            binary_head,
            "correct vs reversed adjacent windows",
        ),
        describe(
            "misalignment_detection",
            "softmax",
            f"concat[motion_t, visual_t/visual_t+8], {align_dim:,}d",
            binary_head,
            "aligned vs shifted by 8 windows",
        ),
    ]
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def svg_task_architectures(path: Path, summary: dict) -> None:
    """Write the task-architecture overview SVG to ``path``.

    The drawing is assembled top to bottom: title lines, four "setup" boxes
    joined by arrows, four model-family legend boxes, one card per row
    returned by ``task_architecture_rows(summary)`` (three cards per line),
    and a closing notes box. Canvas size and the notes position are fixed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    suite = summary["suite"]
    rows = task_architecture_rows(summary)
    palette = {
        "softmax": "#1f63e9",
        "ridge": "#0a7f55",
        "ridge+rank": "#008b9a",
        "multilabel": "#b65b04",
    }
    width, height = 1500, 1840
    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<defs><marker id="arrow2" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<text x="60" y="56" font-family="Arial, sans-serif" font-size="34" font-weight="700" fill="#10141f">Minimal Architectures for the 12 Ropedia Episode Tasks</text>',
        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from scripts/episode_task_suite.py semantics and committed summary metrics. These are minimal baselines, not deep foundation models.</text>',
    ]

    # Each box: (x, y, width, height, title, body lines, accent colour).
    boxes = [
        (60, 122, 310, 110, "Shared episode windows", [
            f"{suite['num_frames']:,} frames -> {suite['num_windows']:,} windows",
            f"{suite['window_frames']}-frame window, {suite['stride_frames']}-frame stride",
            "chronological 70/30 split",
        ], "#1f63e9"),
        (410, 122, 310, 110, "Feature vector", [
            f"X_all = {suite['feature_dim']:,} dimensions",
            "17 named modality blocks",
            "mean/std fit on train only",
        ], "#008b9a"),
        (760, 122, 320, 110, "Reusable heads", [
            "linear softmax classifier",
            "dual ridge regression/projection",
            "multi-label logistic + cosine rank",
        ], "#0a7f55"),
        (1120, 122, 320, 110, "Artifacts", [
            "metrics.json, predictions.csv/npz",
            "model.npz with scaler and weights",
            "summary_report.json source of numbers",
        ], "#b65b04"),
    ]

    # Arrows go in first: one from the right edge of each box to the left
    # edge of its successor, at a fixed offset below the box top.
    for left_box, right_box in zip(boxes, boxes[1:]):
        x1 = left_box[0] + left_box[2]
        x2 = right_box[0]
        y = left_box[1] + 55
        svg.append(f'<line x1="{x1 + 12}" y1="{y}" x2="{x2 - 14}" y2="{y}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow2)"/>')

    for x, y, w, h, title, lines, color in boxes:
        svg.extend([
            f'<rect x="{x}" y="{y}" width="{w}" height="{h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>',
            f'<rect x="{x}" y="{y}" width="8" height="{h}" rx="4" fill="{color}"/>',
            f'<text x="{x + 24}" y="{y + 31}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(title)}</text>',
        ])
        draw_text_block(svg, x + 24, y + 58, lines, size=13, color="#394255", max_chars=34, line_h=18)

    # Legend: (title, description, colour, x, y) for each model family.
    legend = [
        ("Softmax classifier", "logits = z(X)W + b; CE + L2; class weights for classifiers", "#1f63e9", 60, 270),
        ("Ridge regression/projection", "closed-form dual ridge on z(X), z(Y); used for forecast and reconstruction", "#0a7f55", 780, 270),
        ("Ridge + cosine ranking", "project one modality into another feature space, then rank candidates by cosine", "#008b9a", 60, 394),
        ("Multi-label logistic", "sigmoid heads for object vocabulary; threshold 0.5 with top-1 fallback", "#b65b04", 780, 394),
    ]
    for title, desc, color, x, y in legend:
        svg.extend([
            f'<rect x="{x}" y="{y}" width="660" height="100" rx="8" fill="#f8fafc" stroke="#dce2ec"/>',
            f'<text x="{x + 18}" y="{y + 33}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="{color}">{html.escape(title)}</text>',
        ])
        draw_text_block(svg, x + 18, y + 60, [desc], size=13, color="#394255", max_chars=76, line_h=18)

    # Task cards laid out on a three-column grid.
    card_w, card_h = 440, 248
    gap_x, gap_y = 30, 30
    start_x, start_y = 60, 540
    for idx, row in enumerate(rows):
        card_row, col = divmod(idx, 3)
        x = start_x + col * (card_w + gap_x)
        y = start_y + card_row * (card_h + gap_y)
        color = palette[row["family"]]
        svg.extend([
            f'<rect x="{x}" y="{y}" width="{card_w}" height="{card_h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>',
            f'<rect x="{x}" y="{y}" width="8" height="{card_h}" rx="4" fill="{color}"/>',
            f'<rect x="{x + 20}" y="{y + 18}" width="96" height="24" rx="6" fill="#f8fafc" stroke="{color}"/>',
            f'<text x="{x + 68}" y="{y + 35}" text-anchor="middle" font-family="Arial, sans-serif" font-size="11" font-weight="700" fill="{color}">{html.escape(row["family"])}</text>',
            f'<text x="{x + 20}" y="{y + 72}" font-family="Arial, sans-serif" font-size="20" font-weight="700" fill="#10141f">{html.escape(row["task"])}</text>',
        ])
        cursor = y + 104
        for label in ("input", "head", "output", "metric"):
            svg.append(f'<text x="{x + 20}" y="{cursor}" font-family="Arial, sans-serif" font-size="12" font-weight="700" fill="{color}">{label.upper()}</text>')
            # draw_text_block hands back the next y position; add a small gap
            # before the following labelled field.
            cursor = draw_text_block(svg, x + 92, cursor, [row[label]], size=13, color="#394255", max_chars=41, line_h=17) + 8

    notes = [
        "Interpretation: this suite tests whether each input/output contract is wired correctly before scaling to many episodes.",
        "Research-grade claims need held-out episode splits and stronger sequence/vision-language/robot-policy models.",
    ]
    svg.append('<rect x="60" y="1688" width="1380" height="72" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
    svg.extend(
        f'<text x="84" y="{1718 + i * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(line)}</text>'
        for i, line in enumerate(notes)
    )
    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def collect_summary() -> dict:
    """Load the committed metric files under ``RESULTS`` into one dict.

    Returns a mapping with three entries: "models" (metrics of the four
    minimal classifiers), "suite" (the episode task suite summary report)
    and "feature_manifest".
    """
    # Files are read in this order; the returned "models" dict is re-ordered
    # below so motion-only entries come first.
    sources = (
        ("all_modalities_action", "min_all_modalities_action_model/metrics.json"),
        ("all_modalities_subtask", "min_all_modalities_subtask_model/metrics.json"),
        ("motion_action", "min_action_model/metrics.json"),
        ("motion_subtask", "min_subtask_model/metrics.json"),
    )
    loaded = {key: read_json(RESULTS / relative) for key, relative in sources}
    suite = read_json(RESULTS / "episode_task_suite/summary_report.json")
    manifest = read_json(RESULTS / "episode_task_suite/feature_manifest.json")

    model_order = (
        "motion_action",
        "motion_subtask",
        "all_modalities_action",
        "all_modalities_subtask",
    )
    return {
        "models": {key: loaded[key] for key in model_order},
        "suite": suite,
        "feature_manifest": manifest,
    }
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
def generate_charts(summary: dict) -> None:
    """Render every diagram and chart derived from ``summary``.

    The two diagrams are written under ``ASSETS`` and the bar and feature
    charts under ``CHARTS``. ``summary`` must provide the "models", "suite"
    and "feature_manifest" entries read below.
    """
    CHARTS.mkdir(parents=True, exist_ok=True)
    svg_pipeline_diagram(ASSETS / "pipeline_diagram.svg", summary)
    svg_task_architectures(ASSETS / "task_architectures.svg", summary)

    macro_f1_bars = [
        (caption, summary["models"][model_key]["macro_f1"])
        for caption, model_key in (
            ("Motion-only action macro-F1", "motion_action"),
            ("All-modality action macro-F1", "all_modalities_action"),
            ("Motion-only subtask macro-F1", "motion_subtask"),
            ("All-modality subtask macro-F1", "all_modalities_subtask"),
        )
    ]
    svg_bar_chart(CHARTS / "model_macro_f1.svg", "Minimal Model Macro-F1 Comparison", macro_f1_bars, max_value=1.0)

    tasks = summary["suite"]["tasks"]
    # The first of these keys present in a task's metrics supplies its bar.
    score_keys = ("macro_f1", "f1", "micro_f1", "top5_accuracy", "r2")
    task_bars = []
    for task_name, metrics in tasks.items():
        raw = next((metrics[key] for key in score_keys if key in metrics), 0.0)
        # A present-but-None score counts as zero; negatives are clamped to zero.
        value = max(float(0.0 if raw is None else raw), 0.0)
        task_bars.append((task_name, value))
    svg_bar_chart(CHARTS / "episode_task_scores.svg", "Episode Task Suite: Main Scores", task_bars, max_value=1.0)
    svg_feature_blocks(CHARTS / "feature_blocks.svg", summary["feature_manifest"])

    retrieval = tasks["cross_modal_retrieval"]
    retrieval_bars = [
        (caption, retrieval[metric_key])
        for caption, metric_key in (
            ("top1", "top1_accuracy"),
            ("top5", "top5_accuracy"),
            ("top10", "top10_accuracy"),
            ("MRR", "mrr"),
        )
    ]
    svg_bar_chart(CHARTS / "cross_modal_retrieval.svg", "Cross-Modal Retrieval", retrieval_bars, max_value=1.0)
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
def write_summary_data(summary: dict) -> None:
    """Dump ``summary`` as indented JSON to ``DOCS/data/summary_metrics.json``."""
    DOCS.mkdir(parents=True, exist_ok=True)
    data_dir = DOCS / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(summary, indent=2)
    target = DOCS / "data/summary_metrics.json"
    target.write_text(payload, encoding="utf-8")
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
def main() -> int:
    """Collect the metrics, regenerate all charts and data, and report paths.

    Returns 0 so the value can be used directly as the process exit status.
    """
    summary = collect_summary()
    generate_charts(summary)
    write_summary_data(summary)

    report = (
        f"Wrote pipeline diagram: {ASSETS / 'pipeline_diagram.svg'}",
        f"Wrote task architectures diagram: {ASSETS / 'task_architectures.svg'}",
        f"Wrote charts: {CHARTS}",
        f"Wrote data: {DOCS / 'data/summary_metrics.json'}",
    )
    for message in report:
        print(message)
    return 0


# Script entry point: exit with whatever status main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/render_task_suite_infographic.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Render a ChatGPT-image-backed 12-task infographic.
|
| 4 |
+
|
| 5 |
+
The background bitmap is AI-generated. The task names, inputs, and metrics are
|
| 6 |
+
read from results/episode_task_suite/summary_report.json so the published image
|
| 7 |
+
does not rely on image-model text generation.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import html
|
| 14 |
+
import json
|
| 15 |
+
import subprocess
|
| 16 |
+
import tempfile
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Repository root: this script sits one directory below it (scripts/).
ROOT = Path(__file__).resolve().parents[1]
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"


# All four groups carry the same "top" value.
_GROUP_TOP = 374

# One entry per group: (name, accent colour, left px, width px, tasks),
# where tasks is a list of (task name, kind) pairs.
_GROUP_LAYOUT = (
    ("Label + State", "#008b9a", 94, 246, [
        ("timeline_action", "supervised"),
        ("timeline_subtask", "supervised"),
        ("next_action", "supervised"),
    ]),
    ("Prediction + Reconstruction", "#1f63e9", 472, 248, [
        ("hand_trajectory_forecast", "forecast"),
        ("modality_reconstruction", "forecast"),
        ("contact_prediction", "supervised"),
    ]),
    ("Grounding + Retrieval", "#b65b04", 848, 220, [
        ("caption_grounding", "retrieval"),
        ("cross_modal_retrieval", "retrieval"),
        ("object_relevance", "supervised"),
    ]),
    ("Temporal Diagnostics", "#b42318", 1202, 244, [
        ("transition_detection", "diagnostic"),
        ("temporal_order", "diagnostic"),
        ("misalignment_detection", "diagnostic"),
    ]),
)

GROUPS = [
    {
        "name": name,
        "color": color,
        "left": left,
        "top": _GROUP_TOP,
        "width": width,
        "tasks": list(tasks),
    }
    for name, color, left, width, tasks in _GROUP_LAYOUT
]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def load_summary() -> dict:
    """Read and parse the JSON summary report at ``SUMMARY_PATH``."""
    with SUMMARY_PATH.open(encoding="utf-8") as handle:
        return json.loads(handle.read())
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def fmt(value: float | None) -> str:
    """Format a metric value with four decimal places for display.

    Args:
        value: The metric, or anything ``float()`` accepts. ``None`` stands
            for a metric that is missing from the report.

    Returns:
        The value rendered as ``"0.1234"``, or ``"n/a"`` when it is ``None``.
    """
    # A missing metric should show up as a placeholder on the infographic
    # rather than abort rendering with a TypeError from float(None).
    if value is None:
        return "n/a"
    return f"{float(value):.4f}"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Pick the headline metric for a task as ``(label, formatted value)``.

    Tasks listed in the table below always use their dedicated metric key.
    Any other task uses "macro_f1" when present, otherwise "accuracy".

    Raises:
        KeyError: if a dedicated metric key is absent from ``metrics``, or if
            an unlisted task has neither "macro_f1" nor "accuracy".
    """
    # task name -> (display label, key in the metrics dict)
    dedicated = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in dedicated:
        label, key = dedicated[task_name]
        return label, fmt(metrics[key])

    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return label, fmt(metrics[key])
    raise KeyError(f"No main metric configured for {task_name}")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def short_io(task_name: str, metrics: dict) -> str:
    """Return the one-line "input -> output" caption shown on a task card.

    Known tasks use a hand-written caption. Any other task falls back to the
    "input" entry of ``metrics``, or an empty string when that is absent.
    """
    captions = {
        "timeline_action": "all modalities -> action label",
        "timeline_subtask": "all modalities -> subtask label",
        "transition_detection": "all modalities -> boundary / steady",
        "next_action": "window at t -> action at t+20",
        "hand_trajectory_forecast": "all modalities -> future hand joints",
        "contact_prediction": "non-contact modalities -> contact",
        "object_relevance": "non-caption modalities -> object set",
        "caption_grounding": "text query -> matching window",
        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
        "temporal_order": "two windows -> correct order?",
        "misalignment_detection": "motion + visual -> aligned / shifted",
    }
    fallback = metrics.get("input", "")
    if task_name in captions:
        return captions[task_name]
    return fallback
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
    """Build the HTML ``<section>`` for one task card.

    The card is positioned with the group's "left" and "width", the given
    ``top``, and the group's "color" as its accent. All text content is
    HTML-escaped. The result starts and ends with a newline.
    """
    label, value = metric_for(task_name, metrics)
    io = short_io(task_name, metrics)
    # Longer task names get a slightly smaller font.
    name_size = 18 if len(task_name) <= 22 else 17
    left, width, accent = group["left"], group["width"], group["color"]
    lines = (
        f'<section class="task" style="left:{left}px;top:{top}px;width:{width}px;--accent:{accent};">',
        f'<div class="kind">{html.escape(kind)}</div>',
        f'<div class="task-name" style="font-size:{name_size}px;">{html.escape(task_name)}</div>',
        f'<div class="io">{html.escape(io)}</div>',
        f'<div class="metric"><span>{html.escape(label)}</span><strong>{html.escape(value)}</strong></div>',
        "</section>",
    )
    return "\n" + "\n".join(lines) + "\n"
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def build_html(summary: dict, base_image: Path) -> str:
    """Compose the full overlay page for the task-suite infographic.

    ``summary`` supplies the per-task metrics ("tasks") and the headline
    counts shown as pills under the title. ``base_image`` is used as the
    page background via a ``file://`` URI. One header is emitted per entry
    in ``GROUPS`` and one card per task in each group. Card rows use the
    fixed ``row_tops`` below; the groups' own "top" entry is not read here.
    """
    tasks = summary["tasks"]
    task_count = len(tasks)
    row_tops = [374, 552, 730]
    header_lefts = [38, 417, 792, 1143]

    header_markup = []
    card_markup = []
    for group, header_left in zip(GROUPS, header_lefts):
        header_markup.append(
            f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>'
        )
        card_markup.extend(
            task_html(task_name, kind, tasks[task_name], row_tops[row_idx], group)
            for row_idx, (task_name, kind) in enumerate(group["tasks"])
        )

    stats = [
        f"{summary['num_frames']:,} frames",
        f"{summary['num_windows']:,} windows",
        f"{summary['feature_dim']:,} features",
        f"{task_count} tasks",
        "chronological split",
    ]
    stat_pills = [f"<span>{html.escape(item)}</span>" for item in stats]
    stat_html = "".join(stat_pills)
    base_uri = base_image.resolve().as_uri()
    headers_html = "".join(header_markup)
    cards_html = "".join(card_markup)
    # Doubled braces below are literal CSS braces inside the f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=1536, initial-scale=1">
<title>Ropedia 12-Task Episode Suite Infographic</title>
<style>
* {{ box-sizing: border-box; }}
html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }}
body {{
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
color: #10141f;
}}
.canvas {{
position: relative;
width: 1536px;
height: 1024px;
overflow: hidden;
background-image: url("{base_uri}");
background-size: 1536px 1024px;
background-repeat: no-repeat;
}}
.title {{
position: absolute;
left: 330px;
top: 42px;
width: 876px;
text-align: center;
}}
h1 {{
margin: 0;
font-size: 38px;
line-height: 1.05;
letter-spacing: 0;
font-weight: 820;
}}
.subtitle {{
margin-top: 8px;
color: #425067;
font-size: 15px;
line-height: 1.35;
font-weight: 520;
}}
.stats {{
margin-top: 12px;
display: flex;
justify-content: center;
gap: 8px;
}}
.stats span {{
display: inline-flex;
align-items: center;
height: 24px;
padding: 0 10px;
border: 1px solid #cdd8e8;
background: rgba(255, 255, 255, 0.82);
border-radius: 999px;
color: #253046;
font-size: 12px;
font-weight: 720;
}}
.modality {{
position: absolute;
top: 256px;
width: 180px;
text-align: center;
font-size: 12px;
color: #536074;
font-weight: 720;
text-transform: uppercase;
letter-spacing: 0;
}}
.group-title {{
position: absolute;
width: 322px;
text-align: center;
font-size: 18px;
line-height: 1;
font-weight: 830;
letter-spacing: 0;
}}
.task {{
position: absolute;
padding: 0;
}}
.kind {{
display: inline-flex;
align-items: center;
height: 22px;
padding: 0 8px;
border-radius: 6px;
border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff);
color: var(--accent);
background: rgba(255, 255, 255, 0.76);
text-transform: uppercase;
font-size: 10px;
line-height: 1;
font-weight: 840;
letter-spacing: 0;
}}
.task-name {{
margin-top: 7px;
color: #111827;
line-height: 1.05;
font-weight: 850;
letter-spacing: 0;
white-space: nowrap;
}}
.io {{
margin-top: 8px;
min-height: 36px;
color: #475569;
font-size: 13.5px;
line-height: 1.28;
font-weight: 570;
}}
.metric {{
display: inline-flex;
align-items: center;
gap: 9px;
margin-top: 8px;
height: 30px;
padding: 0 10px;
border-radius: 7px;
border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff);
background: rgba(255, 255, 255, 0.90);
box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07);
}}
.metric span {{
color: #64748b;
font-size: 12px;
font-weight: 760;
}}
.metric strong {{
color: var(--accent);
font-size: 16px;
line-height: 1;
font-weight: 860;
}}
.footer {{
position: absolute;
left: 360px;
top: 932px;
width: 816px;
text-align: center;
color: #536074;
font-size: 14px;
font-weight: 650;
}}
</style>
</head>
<body>
<main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
<div class="title">
<h1>Ropedia 12-Task Episode Suite</h1>
<div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div>
<div class="stats">{stat_html}</div>
</div>
<div class="modality" style="left:50px;">fisheye video</div>
<div class="modality" style="left:270px;">depth</div>
<div class="modality" style="left:530px;">3D / SLAM</div>
<div class="modality" style="left:770px;">IMU</div>
<div class="modality" style="left:1030px;">hands</div>
<div class="modality" style="left:1278px;">text / objects</div>
{headers_html}
{cards_html}
<div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div>
</main>
</body>
</html>
"""
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot ``html_path`` to ``output_path`` with the Playwright CLI.

    The output directory is created if needed. The command is run through
    ``npx`` and a non-zero exit status raises ``CalledProcessError``.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    command = [
        "npx",
        "--yes",
        "playwright",
        "screenshot",
        "--full-page",
        "--viewport-size=1536,1024",
        html_path.resolve().as_uri(),
        str(output_path),
    ]
    subprocess.run(command, check=True)
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def main() -> int:
    """Command-line entry point: build the overlay HTML and export the image.

    Options:
        --base-image  background bitmap (defaults to ``DEFAULT_BASE``)
        --output      exported image path (defaults to ``DEFAULT_OUTPUT``)
        --html        where to write the overlay HTML; a temporary ``.html``
                      file is created and left on disk when omitted
        --no-export   write the HTML only and skip the screenshot

    Returns 0 so the value can be used directly as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
    args = parser.parse_args()

    html_text = build_html(load_summary(), args.base_image)

    if args.html is not None:
        html_path = args.html
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")
    else:
        # delete=False keeps the file around so it can be screenshotted and
        # its path reported after the handle is closed.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)

    export_image = not args.no_export
    if export_image:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote overlay HTML: {html_path}")
    return 0


# Script entry point: exit with whatever status main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/train_all_modalities_model.py
ADDED
|
@@ -0,0 +1,582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
All-modality lightweight baseline for a Ropedia/Xperience episode.
|
| 4 |
+
|
| 5 |
+
This intentionally stays small enough for a MacBook:
|
| 6 |
+
- no deep video training
|
| 7 |
+
- no CUDA
|
| 8 |
+
- no PyTorch dependency
|
| 9 |
+
|
| 10 |
+
Each modality is compressed into window-level statistics, then the same
|
| 11 |
+
NumPy softmax classifier from train_min_action_model.py is used.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import csv
|
| 18 |
+
import hashlib
|
| 19 |
+
import json
|
| 20 |
+
import re
|
| 21 |
+
import sys
|
| 22 |
+
from collections import Counter, OrderedDict
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import cv2
|
| 26 |
+
import h5py
|
| 27 |
+
import numpy as np
|
| 28 |
+
|
| 29 |
+
from train_min_action_model import (
|
| 30 |
+
add_toolkit_to_path,
|
| 31 |
+
center_by_body_root,
|
| 32 |
+
compute_metrics,
|
| 33 |
+
encode_labels,
|
| 34 |
+
fit_scaler,
|
| 35 |
+
frame_label,
|
| 36 |
+
majority_label,
|
| 37 |
+
predict,
|
| 38 |
+
portable_path,
|
| 39 |
+
safe_window,
|
| 40 |
+
save_artifacts,
|
| 41 |
+
stratified_split,
|
| 42 |
+
temporal_stats,
|
| 43 |
+
train_softmax_classifier,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Stream name -> video file name, in a fixed order. Every file name is the
# stream name with an ".mp4" suffix.
VIDEO_FILES = OrderedDict(
    (stream, f"{stream}.mp4")
    for stream in (
        "fisheye_cam0",
        "fisheye_cam1",
        "fisheye_cam2",
        "fisheye_cam3",
        "stereo_left",
        "stereo_right",
    )
)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the all-modality baseline.

    Path options that are not given explicitly are derived from ``--workspace``:
    the annotation file, the output directory (named after ``--target``) and the
    feature cache directory. Without ``--workspace`` the workspace is the parent
    of the directory containing this script.

    Returns:
        The parsed namespace with ``annotation``, ``output_dir`` and
        ``cache_dir`` always populated.

    Raises:
        SystemExit: via ``parser.error`` when ``--window-frames`` or
            ``--stride-frames`` is not a positive integer.
    """
    workspace_default = Path(__file__).resolve().parents[1]
    annotation_relative = Path("data/sample/xperience-10m-sample/annotation.hdf5")

    parser = argparse.ArgumentParser(description="Train a lightweight all-modality Ropedia classifier.")
    parser.add_argument("--workspace", type=Path, default=workspace_default, help="Ropedia workspace root.")
    # Resolved after parsing so that it follows --workspace like the other paths.
    parser.add_argument("--annotation", type=Path, default=None, help="Path to annotation.hdf5.")
    parser.add_argument("--output-dir", type=Path, default=None, help="Output artifact directory.")
    parser.add_argument("--cache-dir", type=Path, default=None, help="Feature cache directory.")
    parser.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")
    parser.add_argument("--window-frames", type=int, default=20, help="Frames per training window.")
    parser.add_argument("--stride-frames", type=int, default=5, help="Stride between windows.")
    parser.add_argument("--min-label-fraction", type=float, default=0.6, help="Minimum majority-label fraction.")
    parser.add_argument("--test-fraction", type=float, default=0.25, help="Stratified test fraction.")
    parser.add_argument("--epochs", type=int, default=800, help="Training epochs.")
    parser.add_argument("--learning-rate", type=float, default=0.12, help="Softmax learning rate.")
    parser.add_argument("--l2", type=float, default=2e-3, help="L2 weight decay.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
    parser.add_argument("--force-rebuild-cache", action="store_true", help="Recompute cached depth/video features.")
    parser.add_argument("--video-image-size", type=int, default=32, help="Resize video frames before visual features.")
    parser.add_argument("--video-grid-size", type=int, default=8, help="Small grayscale grid per video frame.")
    parser.add_argument("--video-hist-bins", type=int, default=8, help="Color histogram bins per channel.")
    parser.add_argument("--depth-grid-size", type=int, default=8, help="Small depth/confidence grid per frame.")
    parser.add_argument("--text-hash-dim", type=int, default=128, help="Hashed bag-of-words dimension.")
    parser.add_argument(
        "--include-label-text",
        action="store_true",
        help="Also include action/subtask/action-description text as input. This leaks target semantics.",
    )
    args = parser.parse_args()

    # The windowing loop steps with range(..., stride); reject values that would
    # make it raise an unrelated error or silently produce no windows.
    if args.window_frames < 1:
        parser.error("--window-frames must be a positive integer.")
    if args.stride_frames < 1:
        parser.error("--stride-frames must be a positive integer.")

    if args.annotation is None:
        args.annotation = args.workspace / annotation_relative
    if args.output_dir is None:
        name = "min_all_modalities_action_model" if args.target == "action" else "min_all_modalities_subtask_model"
        args.output_dir = args.workspace / "outputs" / name
    if args.cache_dir is None:
        args.cache_dir = args.workspace / "outputs/feature_cache"
    return args
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def numeric_array(value) -> np.ndarray | None:
    """Flatten ``value`` into a finite float32 vector, or return ``None``.

    ``None`` is returned when ``value`` cannot be converted to a float32 array
    or converts to an empty one. NaN and +/-inf entries are replaced by 0.
    """
    try:
        converted = np.asarray(value, dtype=np.float32)
    except (TypeError, ValueError):
        return None
    if converted.size == 0:
        return None
    flat = converted.reshape(-1)
    cleaned = np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)
    return cleaned.astype(np.float32)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def calibration_features(calib_data: dict | None) -> np.ndarray:
    """Concatenate every numeric calibration entry into one float32 vector.

    Cameras and their keys are visited in sorted order so the layout is
    deterministic. Entries that are not dicts, or values that
    ``numeric_array`` cannot convert, are skipped. Returns an empty vector
    when nothing usable is found.
    """
    nothing = np.zeros(0, dtype=np.float32)
    if not calib_data:
        return nothing

    pieces: list[np.ndarray] = []
    for cam_id in sorted(calib_data):
        camera = calib_data.get(cam_id, {})
        if isinstance(camera, dict):
            converted = (numeric_array(camera.get(field)) for field in sorted(camera))
            pieces.extend(vec for vec in converted if vec is not None)

    if pieces:
        return np.concatenate(pieces).astype(np.float32)
    return nothing
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def point_cloud_features(points: np.ndarray | None) -> np.ndarray:
    """Summarize an (N, 3) point cloud as a 22-value float32 vector.

    Layout: per-axis mean, std, min, max, 10th/50th/90th percentiles
    (3 values each) followed by ``log1p(N)``. Non-finite coordinates are
    replaced by 0 before the statistics are taken. An empty vector is returned
    when ``points`` is ``None``, empty, or not shaped (N, 3).
    """
    if points is None:
        return np.zeros(0, dtype=np.float32)

    cloud = np.asarray(points, dtype=np.float32)
    well_formed = cloud.ndim == 2 and cloud.shape[1] == 3 and len(cloud) > 0
    if not well_formed:
        return np.zeros(0, dtype=np.float32)

    cloud = np.nan_to_num(cloud, nan=0.0, posinf=0.0, neginf=0.0)
    summary = [cloud.mean(axis=0), cloud.std(axis=0), cloud.min(axis=0), cloud.max(axis=0)]
    for quantile in (10, 50, 90):
        summary.append(np.percentile(cloud, quantile, axis=0))
    summary.append(np.asarray([np.log1p(len(cloud))], dtype=np.float32))
    return np.concatenate(summary).astype(np.float32)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def video_frame_features(frame: np.ndarray, image_size: int, grid_size: int, hist_bins: int) -> np.ndarray:
    """Compress one BGR frame into a small handcrafted float32 descriptor.

    Layout: RGB mean (3), RGB std (3), one normalized ``hist_bins`` histogram
    per channel, a ``grid_size`` x ``grid_size`` grayscale thumbnail, and four
    gradient statistics (mean |gx|, mean |gy|, std |gx|, std |gy|). All values
    are computed on the frame resized to ``image_size`` x ``image_size``.
    """
    thumb = cv2.resize(frame, (image_size, image_size), interpolation=cv2.INTER_AREA)

    # Color statistics on [0, 1]-scaled RGB.
    rgb = cv2.cvtColor(thumb, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    pixels = rgb.reshape(-1, 3)
    color_mean = pixels.mean(axis=0)
    color_std = pixels.std(axis=0)

    channel_hists = []
    for channel in range(3):
        counts, _ = np.histogram(rgb[:, :, channel], bins=hist_bins, range=(0.0, 1.0))
        counts = counts.astype(np.float32)
        # Guard against a zero total so the division is always defined.
        counts /= max(float(counts.sum()), 1.0)
        channel_hists.append(counts)

    # Coarse layout and edge strength on the grayscale thumbnail.
    gray = cv2.cvtColor(thumb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    layout = cv2.resize(gray, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1)
    grad_rows, grad_cols = np.gradient(gray)
    abs_cols = np.abs(grad_cols)
    abs_rows = np.abs(grad_rows)
    edge = np.asarray([abs_cols.mean(), abs_rows.mean(), abs_cols.std(), abs_rows.std()], dtype=np.float32)

    return np.concatenate([color_mean, color_std, *channel_hists, layout, edge]).astype(np.float32)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def read_video_feature_cache(
    path: Path,
    n_frames: int,
    cache_dir: Path,
    image_size: int,
    grid_size: int,
    hist_bins: int,
    force: bool,
) -> np.ndarray:
    """Return per-frame video features, computing and caching them on demand.

    Args:
        path: Video file to decode.
        n_frames: Number of rows in the returned matrix; frames are read
            sequentially from the start of the video.
        cache_dir: Directory holding the ``.npz`` cache (created if missing).
        image_size, grid_size, hist_bins: Forwarded to ``video_frame_features``.
        force: Recompute even when a cache file exists.

    Returns:
        A float32 array of shape ``(n_frames, feature_dim)``. Rows are all zero
        when the video is missing or cannot be opened. When decoding stops
        early, the last successfully extracted descriptor is repeated.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the key uses only the file stem and the feature settings, so
    # two episodes with identically named videos and equal frame counts share a
    # cache entry when they use the same cache_dir.
    cache_path = cache_dir / f"video_{path.stem}_n{n_frames}_img{image_size}_grid{grid_size}_hist{hist_bins}.npz"
    if cache_path.exists() and not force:
        # Close the archive handle once the array has been materialized.
        with np.load(cache_path) as cached:
            return cached["features"].astype(np.float32)

    dummy_dim = 6 + 3 * hist_bins + grid_size * grid_size + 4
    features = np.zeros((n_frames, dummy_dim), dtype=np.float32)

    # Placeholder zeros are deliberately NOT written to the cache: persisting
    # them would make later runs keep returning zeros after the video becomes
    # available, unless the caller remembered to force a rebuild.
    if not path.exists():
        return features

    cap = cv2.VideoCapture(str(path))
    try:
        if not cap.isOpened():
            return features

        last = np.zeros(dummy_dim, dtype=np.float32)
        for idx in range(n_frames):
            ok, frame = cap.read()
            if ok:
                last = video_frame_features(frame, image_size, grid_size, hist_bins)
            features[idx] = last
            if idx and idx % 1000 == 0:
                print(f" {path.name}: {idx}/{n_frames} frames")
    finally:
        # Release the decoder even if feature extraction raises.
        cap.release()

    np.savez_compressed(cache_path, features=features)
    return features
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def depth_frame_features(depth: np.ndarray, confidence: np.ndarray | None, depth_min: float, depth_max: float, grid_size: int) -> np.ndarray:
    """Summarize one depth frame (and optional confidence map) as float32 features.

    Layout: 8 depth statistics over finite, positive pixels (mean, std, min,
    max, 10th/50th/90th percentile, valid-pixel fraction), a
    ``grid_size`` x ``grid_size`` depth thumbnail normalized by
    ``depth_min``/``depth_max``, 4 confidence statistics (mean, std, min, max)
    and a confidence thumbnail of the same grid size. Confidence parts are all
    zero when ``confidence`` is ``None``.
    """
    depth_map = np.asarray(depth, dtype=np.float32)
    usable = np.isfinite(depth_map) & (depth_map > 0)

    d_stats = np.zeros(8, dtype=np.float32)
    if usable.any():
        samples = depth_map[usable]
        p10, p50, p90 = (np.percentile(samples, q) for q in (10, 50, 90))
        d_stats = np.asarray(
            [samples.mean(), samples.std(), samples.min(), samples.max(), p10, p50, p90, usable.mean()],
            dtype=np.float32,
        )

    # Floor the range so a degenerate min/max pair cannot divide by zero.
    span = max(depth_max - depth_min, 1e-6)
    normalized = np.clip((np.nan_to_num(depth_map, nan=0.0) - depth_min) / span, 0.0, 1.0)
    d_grid = cv2.resize(normalized, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1).astype(np.float32)

    if confidence is not None:
        conf = np.asarray(confidence, dtype=np.float32)
        # Values above 1 are treated as 0-255 and rescaled; this is decided
        # per frame from the frame's own maximum.
        scale = 255.0 if conf.max(initial=0) > 1.0 else 1.0
        conf = np.clip(conf / scale, 0.0, 1.0)
        c_stats = np.asarray([conf.mean(), conf.std(), conf.min(initial=0), conf.max(initial=0)], dtype=np.float32)
        c_grid = cv2.resize(conf, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1).astype(np.float32)
    else:
        c_stats = np.zeros(4, dtype=np.float32)
        c_grid = np.zeros(grid_size * grid_size, dtype=np.float32)

    return np.concatenate([d_stats, d_grid, c_stats, c_grid]).astype(np.float32)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def read_depth_feature_cache(annotation: Path, n_frames: int, cache_dir: Path, grid_size: int, force: bool) -> np.ndarray:
    """Return per-frame depth/confidence features, using an ``.npz`` cache.

    The cache file is keyed by ``n_frames`` and ``grid_size`` inside
    ``cache_dir`` (created if missing). On a miss, or when ``force`` is set,
    frames are read from the ``depth/depth`` dataset of ``annotation`` (with
    ``depth/confidence``, ``depth/depth_min`` and ``depth/depth_max`` when
    present) and the result is written back to the cache. Rows beyond the
    number of stored depth frames, or all rows when the dataset is absent,
    stay zero.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / f"depth_n{n_frames}_grid{grid_size}.npz"
    if not force and cache_path.exists():
        return np.load(cache_path)["features"].astype(np.float32)

    cells = grid_size * grid_size
    feature_dim = 8 + cells + 4 + cells
    features = np.zeros((n_frames, feature_dim), dtype=np.float32)

    with h5py.File(annotation, "r") as f:

        def scalar(key: str, fallback: float) -> float:
            # Stored bounds may be scalars or arrays; take the first element.
            if key in f:
                return float(np.asarray(f[key][()]).flat[0])
            return fallback

        if "depth/depth" not in f:
            np.savez_compressed(cache_path, features=features)
            return features

        depth_ds = f["depth/depth"]
        conf_ds = f["depth/confidence"] if "depth/confidence" in f else None
        depth_min = scalar("depth/depth_min", 0.0)
        depth_max = scalar("depth/depth_max", 4.0)
        limit = min(n_frames, depth_ds.shape[0])
        for idx in range(limit):
            frame_conf = None if conf_ds is None else conf_ds[idx]
            features[idx] = depth_frame_features(depth_ds[idx], frame_conf, depth_min, depth_max, grid_size)
            if idx and idx % 1000 == 0:
                print(f" depth: {idx}/{limit} frames")

    np.savez_compressed(cache_path, features=features)
    return features
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# Tokens are maximal runs of ASCII letters, digits and underscores.
TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")


def hashed_text(text: str, dim: int) -> np.ndarray:
    """Embed ``text`` as an L2-normalized signed hashed bag of words.

    Each lower-cased token is hashed with BLAKE2b; the first four digest bytes
    select one of ``dim`` buckets and the low bit of the fifth byte selects the
    sign added to that bucket. An all-zero vector is returned unchanged.
    """
    buckets = np.zeros(dim, dtype=np.float32)
    for match in TOKEN_RE.finditer(text.lower()):
        digest = hashlib.blake2b(match.group(0).encode("utf-8"), digest_size=8).digest()
        slot = int.from_bytes(digest[:4], "little") % dim
        if digest[4] & 1:
            buckets[slot] += 1.0
        else:
            buckets[slot] += -1.0
    length = np.linalg.norm(buckets)
    if length > 0:
        buckets /= length
    return buckets
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def text_for_frame(info: dict, include_label_text: bool) -> str:
    """Join the caption fields of one frame into a single space-separated string.

    ``objects`` (each element of a list, or the value itself when truthy) and a
    truthy ``interaction`` are always used. With ``include_label_text`` the
    truthy ``theme``, ``action_label`` and ``action_desc`` fields are appended
    as well.
    """
    objects = info.get("objects")
    if isinstance(objects, list):
        words = [str(item) for item in objects]
    elif objects:
        words = [str(objects)]
    else:
        words = []

    fields = ["interaction"]
    if include_label_text:
        fields += ["theme", "action_label", "action_desc"]
    words.extend(str(info[field]) for field in fields if info.get(field))
    return " ".join(words)
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def build_text_features(frame_info_map: dict, n_frames: int, dim: int, include_label_text: bool) -> np.ndarray:
    """Build an ``(n_frames, dim)`` float32 matrix of hashed caption text.

    Row ``i`` is ``hashed_text`` of the text assembled by ``text_for_frame``
    from ``frame_info_map[i]``; frames without an entry are treated as empty.
    """
    matrix = np.zeros((n_frames, dim), dtype=np.float32)
    for frame_idx in range(n_frames):
        caption = text_for_frame(frame_info_map.get(frame_idx, {}), include_label_text)
        matrix[frame_idx] = hashed_text(caption, dim)
    return matrix
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def prepare_modalities(args: argparse.Namespace, ann: dict) -> tuple[dict, list[dict]]:
    """Compute (or load from cache) every non-skeleton modality for an episode.

    Returns:
        ``extras``: a dict with keys ``video`` (name -> per-frame features),
        ``depth`` and ``text`` (per-frame feature matrices) and ``static``
        (name -> episode-level vector for the SLAM point cloud and calibration,
        when available).
        ``available``: one descriptive record per prepared modality, in the
        order depth, videos, caption text, point cloud, calibration.
    """
    data_root = args.annotation.parent
    n_frames = len(ann["img_names"])
    video_feats: OrderedDict = OrderedDict()
    static_feats: OrderedDict = OrderedDict()
    available: list[dict] = []

    print("Preparing all-modality feature caches")

    print(" depth/confidence")
    depth = read_depth_feature_cache(
        args.annotation,
        n_frames,
        args.cache_dir,
        args.depth_grid_size,
        args.force_rebuild_cache,
    )
    available.append({"modality": "depth_confidence", "shape": list(depth.shape)})

    print(" videos")
    for name, filename in VIDEO_FILES.items():
        video_path = data_root / filename
        feats = read_video_feature_cache(
            video_path,
            n_frames,
            args.cache_dir,
            args.video_image_size,
            args.video_grid_size,
            args.video_hist_bins,
            args.force_rebuild_cache,
        )
        video_feats[name] = feats
        available.append({
            "modality": f"video/{name}",
            "path": portable_path(video_path, args.workspace),
            "shape": list(feats.shape),
            "exists": video_path.exists(),
        })

    print(" caption objects/interaction text")
    text = build_text_features(
        ann["caption_frame_info_map"],
        n_frames,
        args.text_hash_dim,
        args.include_label_text,
    )
    text_fields = "objects,interaction"
    if args.include_label_text:
        text_fields += ",theme,action_label,action_desc"
    available.append({"modality": "caption_text", "shape": list(text.shape), "fields": text_fields})

    # Episode-level vectors are only registered when they are non-empty.
    pc = point_cloud_features(ann.get("slam_point_cloud"))
    if len(pc):
        static_feats["slam_point_cloud"] = pc
        available.append({"modality": "slam_point_cloud_static", "shape": [int(len(pc))]})

    calib = calibration_features(ann.get("calib_data"))
    if len(calib):
        static_feats["calibration"] = calib
        available.append({"modality": "calibration_static", "shape": [int(len(calib))]})

    extras: dict = {
        "video": video_feats,
        "depth": depth,
        "text": text,
        "static": static_feats,
    }
    return extras, available
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def extract_all_window_features(ann: dict, extras: dict, start: int, end: int, return_blocks: bool = False):
    """Build the feature vector for frames ``[start, end)`` from all modalities.

    Blocks are appended in a fixed order: left hand, right hand, body joints,
    contacts, camera translation, camera rotation, IMU, depth, each video,
    caption text, then the static vectors. Modalities that are missing (or
    whose window is unavailable) are skipped. Non-finite values are replaced
    by 0.

    Returns:
        The concatenated float32 vector, or ``(vector, [(name, length), ...])``
        when ``return_blocks`` is true.

    Raises:
        ValueError: if no modality produced any feature.
    """
    def window(key: str):
        return safe_window(ann.get(key), start, end)

    body = window("smplh_body_joints")
    left = window("hand_left_joints")
    right = window("hand_right_joints")
    contacts = window("contacts")
    cam_t = window("t_c2w_all")
    cam_R = window("R_c2w_all")

    blocks: list[tuple[str, np.ndarray]] = []

    def push(name: str, vec: np.ndarray | None) -> None:
        # Register a block unless it is missing or empty.
        if vec is None:
            return
        flat = np.asarray(vec, dtype=np.float32).reshape(-1)
        if flat.size:
            blocks.append((name, np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)))

    # Skeleton-derived modalities.
    for hand_name, joints in (("hand_left_joints", left), ("hand_right_joints", right)):
        if joints is not None:
            push(hand_name, temporal_stats(center_by_body_root(joints, body)))
    if body is not None:
        # Express joints relative to the first joint when the array is (T, J, C).
        root = body[:, :1, :] if body.ndim == 3 else 0.0
        push("body_joints", temporal_stats(body - root))
    if contacts is not None:
        push("body_contacts", temporal_stats(contacts))

    # Camera pose; translation is taken relative to the window's first frame.
    if cam_t is not None:
        push("camera_translation", temporal_stats(cam_t - cam_t[:1]))
    if cam_R is not None:
        push("camera_rotation_matrix", temporal_stats(cam_R))

    # IMU samples are selected through the per-frame keyframe indices.
    imu_accel = ann.get("imu_accel_xyz")
    imu_gyro = ann.get("imu_gyro_xyz")
    imu_keyframes = ann.get("imu_keyframe_indices")
    imu_present = imu_accel is not None and imu_gyro is not None and imu_keyframes is not None
    if imu_present and len(imu_keyframes) > end - 1:
        imu_start = int(max(0, imu_keyframes[start]))
        imu_end = int(min(len(imu_accel), max(imu_start + 1, imu_keyframes[end - 1] + 1)))
        imu = np.concatenate([imu_accel[imu_start:imu_end], imu_gyro[imu_start:imu_end]], axis=1)
        push("imu_accel_gyro", temporal_stats(imu))

    # Precomputed per-frame modalities.
    depth = extras.get("depth")
    if depth is not None:
        push("depth_confidence", temporal_stats(depth[start:end]))
    for video_name, feats in extras.get("video", {}).items():
        push(f"video_{video_name}", temporal_stats(feats[start:end]))
    text = extras.get("text")
    if text is not None:
        push("caption_objects_interaction_text", temporal_stats(text[start:end]))

    # Episode-level vectors are appended as-is.
    for static_name, vec in extras.get("static", {}).items():
        push(static_name, vec)

    if not blocks:
        raise ValueError("No usable modalities found.")
    full = np.concatenate([vec for _, vec in blocks]).astype(np.float32)
    if not return_blocks:
        return full
    return full, [(name, int(len(vec))) for name, vec in blocks]
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def build_feature_dataset(ann: dict, extras: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a window over the episode and collect labeled feature vectors.

    A window is kept only when ``majority_label`` returns a label for it. The
    feature manifest (name/start/end/dim per block) is taken from the first
    kept window.

    Returns:
        ``(X, y_labels, starts, ends, label_fracs, feature_manifest)`` where
        ``ends`` holds the inclusive last frame index of each window.

    Raises:
        ValueError: if the annotation has no ``caption_frame_info_map`` or no
            window received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")

    n_frames = len(ann["img_names"])
    vectors: list = []
    window_labels: list = []
    first_frames: list = []
    last_frames: list = []
    fractions: list = []
    feature_manifest = None

    for start in range(0, n_frames - window_frames + 1, stride_frames):
        end = start + window_frames
        per_frame = [frame_label(frame_info.get(i, {}), target) for i in range(start, end)]
        label, frac = majority_label(per_frame, min_label_fraction)
        if not label:
            continue

        if feature_manifest is not None:
            vec = extract_all_window_features(ann, extras, start, end)
        else:
            # The first kept window also reports its block layout.
            vec, blocks = extract_all_window_features(ann, extras, start, end, return_blocks=True)
            feature_manifest = []
            cursor = 0
            for name, length in blocks:
                feature_manifest.append({"name": name, "start": cursor, "end": cursor + length, "dim": length})
                cursor += length

        vectors.append(vec)
        window_labels.append(label)
        first_frames.append(start)
        last_frames.append(end - 1)
        fractions.append(frac)

    if not vectors:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")

    return (
        np.stack(vectors).astype(np.float32),
        np.asarray(window_labels, dtype=object),
        np.asarray(first_frames, dtype=np.int64),
        np.asarray(last_frames, dtype=np.int64),
        np.asarray(fractions, dtype=np.float32),
        feature_manifest or [],
    )
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
def write_extra_reports(output_dir: Path, feature_manifest: list[dict], available_modalities: list[dict], args: argparse.Namespace) -> None:
    """Write the feature manifest, modality list and model notes to ``output_dir``.

    Produces ``feature_manifest.json``, ``available_modalities.json``,
    ``feature_manifest.csv`` and ``README_model.txt``. The README states
    whether ``--include-label-text`` was used. ``output_dir`` must already
    exist.
    """
    json_reports = (
        ("feature_manifest.json", feature_manifest),
        ("available_modalities.json", available_modalities),
    )
    for filename, payload in json_reports:
        (output_dir / filename).write_text(json.dumps(payload, indent=2), encoding="utf-8")

    with (output_dir / "feature_manifest.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=["name", "start", "end", "dim"])
        writer.writeheader()
        for row in feature_manifest:
            writer.writerow(row)

    if args.include_label_text:
        label_note = "WARNING: --include-label-text was used, so language input leaks target semantics."
    else:
        label_note = "Label text was not included as input; only objects and interaction text were used."
    notes = [
        "This is an all-modality lightweight baseline.",
        "RGB/stereo/fisheye/depth/point-cloud/calibration/text are compressed into handcrafted features.",
        "It is not a deep multimodal model.",
        "Do not treat random windows from one episode as a final generalization benchmark.",
        label_note,
    ]
    (output_dir / "README_model.txt").write_text("\n".join(notes) + "\n", encoding="utf-8")
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def main() -> int:
    """Run the all-modality pipeline end to end and return the exit status.

    Steps: parse options, load the annotation through the HOMIE toolkit loader,
    prepare the cached modalities, build the windowed dataset, split and
    standardize it (scaler fitted on the training windows only), train the
    softmax classifier, evaluate on the test windows, and write all artifacts
    and reports to the output directory.

    Raises:
        FileNotFoundError: if the annotation file does not exist.
        ValueError: if the split leaves no test windows.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Importable only after the toolkit directory has been put on sys.path.
    from data_loader import load_from_annotation_hdf5

    if not args.annotation.exists():
        raise FileNotFoundError(f"annotation.hdf5 not found: {args.annotation}")

    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)

    extras, available_modalities = prepare_modalities(args, ann)

    print("Building all-modality windowed feature dataset")
    dataset = build_feature_dataset(
        ann,
        extras,
        target=args.target,
        window_frames=args.window_frames,
        stride_frames=args.stride_frames,
        min_label_fraction=args.min_label_fraction,
    )
    X, y_labels, starts, ends, label_fracs, feature_manifest = dataset

    y, class_names = encode_labels(y_labels)
    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
    if len(test_idx) == 0:
        raise ValueError("No test windows available. Lower --test-fraction or use more data.")

    mean, std = fit_scaler(X[train_idx])
    X_scaled = (X - mean) / std

    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
    print(f"Features: {X.shape[1]}, classes: {len(class_names)}")
    print("Feature blocks:")
    for block in feature_manifest:
        print(f" {block['dim']:5d} {block['name']}")
    for name, count in Counter(y_labels).most_common():
        print(f" {count:4d} windows {name}")

    print("Training softmax classifier")
    W, b, history = train_softmax_classifier(
        X_scaled[train_idx],
        y[train_idx],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )

    y_pred, probs = predict(X_scaled[test_idx], W, b)
    metrics, per_class_rows, cm = compute_metrics(y[test_idx], y_pred, class_names)

    # Reference point: always predicting the most frequent training class.
    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
    final_epoch = history[-1] if history else None
    metrics["majority_baseline_accuracy"] = float(np.mean(y[test_idx] == majority_class))
    metrics["train_final_accuracy"] = final_epoch["train_accuracy"] if history else float("nan")
    metrics["train_final_loss"] = final_epoch["loss"] if history else float("nan")
    metrics["feature_dim"] = int(X.shape[1])
    metrics["num_windows"] = int(len(y))

    save_artifacts(
        args.output_dir,
        X,
        y,
        y_labels,
        starts,
        ends,
        label_fracs,
        train_idx,
        test_idx,
        class_names,
        mean,
        std,
        W,
        b,
        history,
        metrics,
        per_class_rows,
        cm,
        y_pred,
        probs,
        args,
    )
    write_extra_reports(args.output_dir, feature_manifest, available_modalities, args)

    print("\nEvaluation")
    print(f" accuracy: {metrics['accuracy']:.4f}")
    print(f" balanced_accuracy: {metrics['balanced_accuracy']:.4f}")
    print(f" macro_f1: {metrics['macro_f1']:.4f}")
    print(f" weighted_f1: {metrics['weighted_f1']:.4f}")
    print(f" majority_baseline: {metrics['majority_baseline_accuracy']:.4f}")
    print(f"\nArtifacts written to: {args.output_dir}")
    return 0
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
if __name__ == "__main__":
|
| 582 |
+
raise SystemExit(main())
|
scripts/train_min_action_model.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Minimal end-to-end action-recognition pipeline for a Ropedia/Xperience episode.
|
| 4 |
+
|
| 5 |
+
Input:
|
| 6 |
+
annotation.hdf5
|
| 7 |
+
|
| 8 |
+
Features:
|
| 9 |
+
hand joints, body joints, contacts, camera trajectory, IMU summary statistics.
|
| 10 |
+
|
| 11 |
+
Target:
|
| 12 |
+
caption action_label by default. Use --target subtask for Sub Task labels.
|
| 13 |
+
|
| 14 |
+
Model:
|
| 15 |
+
Numpy-only multinomial logistic regression.
|
| 16 |
+
|
| 17 |
+
Outputs:
|
| 18 |
+
metrics.json, per_class_metrics.csv, confusion_matrix.csv, predictions.csv,
|
| 19 |
+
feature_dataset.npz, model.npz.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import csv
|
| 26 |
+
import json
|
| 27 |
+
import math
|
| 28 |
+
import sys
|
| 29 |
+
from collections import Counter, OrderedDict
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
|
| 32 |
+
import numpy as np
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the minimal action classifier.

    Default paths are resolved relative to the parent of the directory that
    contains this script.
    """
    workspace_default = Path(__file__).resolve().parents[1]

    parser = argparse.ArgumentParser(description="Train a minimal action classifier on Ropedia annotation.hdf5.")

    path_options = (
        ("--workspace", workspace_default, "Ropedia workspace root."),
        ("--annotation", workspace_default / "data/sample/xperience-10m-sample/annotation.hdf5", "Path to annotation.hdf5."),
        ("--output-dir", workspace_default / "outputs/min_action_model", "Output artifact directory."),
    )
    for flag, default, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default, help=help_text)

    parser.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")

    numeric_options = (
        ("--window-frames", int, 20, "Frames per training window."),
        ("--stride-frames", int, 5, "Stride between windows."),
        ("--min-label-fraction", float, 0.6, "Minimum majority-label fraction in a window."),
        ("--test-fraction", float, 0.25, "Stratified test fraction."),
        ("--epochs", int, 800, "Training epochs."),
        ("--learning-rate", float, 0.2, "Softmax learning rate."),
        ("--l2", float, 1e-3, "L2 weight decay."),
        ("--seed", int, 7, "Random seed."),
    )
    for flag, kind, default, help_text in numeric_options:
        parser.add_argument(flag, type=kind, default=default, help=help_text)

    parser.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
    return parser.parse_args()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def add_toolkit_to_path(workspace: Path) -> None:
    """Put ``<workspace>/HOMIE-toolkit`` at the front of ``sys.path``.

    Raises:
        FileNotFoundError: if that directory does not exist.
    """
    toolkit = workspace / "HOMIE-toolkit"
    if toolkit.exists():
        sys.path.insert(0, str(toolkit))
        return
    raise FileNotFoundError(f"HOMIE-toolkit not found: {toolkit}")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def portable_path(path: Path, workspace: Path | None = None) -> str:
|
| 65 |
+
roots = [workspace, Path.cwd()]
|
| 66 |
+
for root in roots:
|
| 67 |
+
if root is None:
|
| 68 |
+
continue
|
| 69 |
+
try:
|
| 70 |
+
return path.resolve().relative_to(Path(root).resolve()).as_posix()
|
| 71 |
+
except (FileNotFoundError, ValueError):
|
| 72 |
+
continue
|
| 73 |
+
return path.name
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def temporal_stats(arr: np.ndarray) -> np.ndarray:
    """Summarise an array shaped (T, ...) with seven per-channel statistics.

    All axes after the first are flattened into a single channel axis of
    size C, and NaN / +inf / -inf entries are replaced by 0 before any
    statistic is computed. A 0-d input is treated as one step of one
    channel; a 1-d input as T steps of one channel.

    Returns:
        A float32 vector of length 7 * C laid out as
        ``[mean, std, min, max, last - first, mean(diff), std(diff)]``.
        The two diff statistics are zero when T == 1.

    Raises:
        ValueError: If the time axis is empty.
    """
    series = np.asarray(arr, dtype=np.float32)
    if series.ndim == 0:
        series = series.reshape(1, 1)
    elif series.ndim == 1:
        series = series[:, None]
    # Collapse every non-time axis, then scrub non-finite values.
    series = np.nan_to_num(
        series.reshape(series.shape[0], -1), nan=0.0, posinf=0.0, neginf=0.0
    )
    n_steps, n_channels = series.shape
    if n_steps == 0:
        raise ValueError("temporal_stats received an empty time axis")

    if n_steps > 1:
        step = np.diff(series, axis=0)
        vel_mean, vel_std = step.mean(axis=0), step.std(axis=0)
    else:
        # A single frame has no frame-to-frame differences.
        vel_mean = np.zeros(n_channels, dtype=np.float32)
        vel_std = np.zeros(n_channels, dtype=np.float32)

    parts = [
        series.mean(axis=0),
        series.std(axis=0),
        series.min(axis=0),
        series.max(axis=0),
        series[-1] - series[0],
        vel_mean,
        vel_std,
    ]
    return np.concatenate(parts).astype(np.float32)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def safe_window(arr: np.ndarray | None, start: int, end: int) -> np.ndarray | None:
|
| 104 |
+
if arr is None:
|
| 105 |
+
return None
|
| 106 |
+
if start >= len(arr):
|
| 107 |
+
return None
|
| 108 |
+
return np.asarray(arr[start:min(end, len(arr))])
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def center_by_body_root(values: np.ndarray, body: np.ndarray | None) -> np.ndarray:
|
| 112 |
+
if body is None or len(body) != len(values) or body.ndim < 3 or body.shape[-1] != 3:
|
| 113 |
+
return values
|
| 114 |
+
root = body[:, :1, :]
|
| 115 |
+
return values - root
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def extract_window_features(ann: dict, start: int, end: int) -> np.ndarray:
    """Build one flat feature vector for frames ``[start, end)`` of *ann*.

    Each modality that is present contributes its ``temporal_stats`` in a
    fixed order: left hand, right hand, body (all three relative to the
    body root where possible), contacts, camera translation relative to
    the window's first frame, and finally IMU accel+gyro samples spanning
    the keyframe indices of the first and last frame.

    Raises:
        ValueError: If no modality produced any features.
    """
    body, left, right, contacts, cam_t = (
        safe_window(ann.get(key), start, end)
        for key in (
            "smplh_body_joints",
            "hand_left_joints",
            "hand_right_joints",
            "contacts",
            "t_c2w_all",
        )
    )

    pieces: list[np.ndarray] = []

    for hand in (left, right):
        if hand is not None:
            pieces.append(temporal_stats(center_by_body_root(hand, body)))
    if body is not None:
        # Only a 3-d body array has a root to subtract; otherwise leave as is.
        origin = body[:, :1, :] if body.ndim == 3 else 0.0
        pieces.append(temporal_stats(body - origin))
    if contacts is not None:
        pieces.append(temporal_stats(contacts))
    if cam_t is not None:
        pieces.append(temporal_stats(cam_t - cam_t[:1]))

    accel = ann.get("imu_accel_xyz")
    gyro = ann.get("imu_gyro_xyz")
    keyframes = ann.get("imu_keyframe_indices")
    has_imu = accel is not None and gyro is not None and keyframes is not None
    # IMU is used only when a keyframe index exists for the window's last frame.
    if has_imu and len(keyframes) > end - 1:
        lo = int(max(0, keyframes[start]))
        hi = int(min(len(accel), max(lo + 1, keyframes[end - 1] + 1)))
        pieces.append(temporal_stats(np.concatenate([accel[lo:hi], gyro[lo:hi]], axis=1)))

    if not pieces:
        raise ValueError("No usable numeric modalities found in annotation.")
    return np.concatenate(pieces).astype(np.float32)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def frame_label(info: dict, target: str) -> str:
    """Return the cleaned label of one frame, or ``""`` when it has none.

    Args:
        info: Per-frame annotation record. A missing/empty record yields "".
        target: ``"subtask"`` reads the ``"theme"`` key; any other value
            reads ``"action_label"``.

    Returns:
        The stripped label text. Missing, ``None``, blank and ``N/A``
        (case-insensitive) labels all map to the empty string.
    """
    if not info:
        return ""
    key = "theme" if target == "subtask" else "action_label"
    raw = info.get(key, "")
    if raw is None:
        # str(None) would otherwise create a bogus class literally named "None".
        return ""
    if isinstance(raw, bytes):
        # str(b"x") would yield "b'x'"; decode byte strings to their text.
        raw = raw.decode("utf-8", errors="replace")
    label = str(raw).strip()
    if not label or label.upper() == "N/A":
        return ""
    return label
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def majority_label(labels: list[str], min_fraction: float) -> tuple[str, float]:
    """Pick the most common non-empty label and report its share.

    Empty labels are discarded first, so the share is measured against
    the labelled entries only, not against ``len(labels)``.

    Returns:
        ``(label, share)``. The label is ``""`` when nothing is labelled
        (share 0.0) or when the share is below *min_fraction*.
    """
    present = list(filter(None, labels))
    if len(present) == 0:
        return "", 0.0
    ((winner, votes),) = Counter(present).most_common(1)
    share = votes / len(present)
    return ("" if share < min_fraction else winner), share
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def build_feature_dataset(ann: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a fixed-size window over the annotation and featurise labelled windows.

    A window is kept only when ``majority_label`` returns a non-empty
    label for its frames.

    Returns:
        A 5-tuple of arrays aligned by window: features (float32, stacked),
        label strings (object), first frame index (int64), last frame index
        inclusive (int64), and the majority-label fraction (float32).

    Raises:
        ValueError: If ``caption_frame_info_map`` is absent or no window
            received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")

    n_frames = len(ann["img_names"])
    last_start = n_frames - window_frames

    records = []
    for start in range(0, last_start + 1, stride_frames):
        end = start + window_frames
        window_labels = [frame_label(frame_info.get(i, {}), target) for i in range(start, end)]
        label, frac = majority_label(window_labels, min_label_fraction)
        if label:
            # The stored end index is inclusive, hence end - 1.
            records.append((extract_window_features(ann, start, end), label, start, end - 1, frac))

    if not records:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")

    feats, names, firsts, lasts, fracs = zip(*records)
    return (
        np.stack(feats).astype(np.float32),
        np.asarray(names, dtype=object),
        np.asarray(firsts, dtype=np.int64),
        np.asarray(lasts, dtype=np.int64),
        np.asarray(fracs, dtype=np.float32),
    )
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def encode_labels(y_labels: np.ndarray) -> tuple[np.ndarray, list[str]]:
    """Map label strings to integer ids numbered by first appearance.

    Returns:
        ``(ids, class_names)`` where ``ids`` is int64 and
        ``class_names[ids[i]] == y_labels[i]``.
    """
    index_of: dict = {}
    # setdefault assigns the next free id the first time a label is seen.
    codes = [index_of.setdefault(name, len(index_of)) for name in y_labels]
    return np.asarray(codes, dtype=np.int64), list(index_of)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def stratified_split(y: np.ndarray, test_fraction: float, seed: int) -> tuple[np.ndarray, np.ndarray]:
    """Split sample indices into train and test sets class by class.

    Each class with at least two samples sends
    ``round(count * test_fraction)`` of them to the test set, clamped so
    that both sides get at least one. Single-sample classes go to train
    only. Both index lists are shuffled before being returned.

    Returns:
        ``(train_idx, test_idx)`` as int64 arrays.
    """
    rng = np.random.default_rng(seed)
    held_in, held_out = [], []
    for cls in np.unique(y):
        members = np.flatnonzero(y == cls)
        rng.shuffle(members)
        size = len(members)
        if size >= 2:
            n_test = int(round(size * test_fraction))
            n_test = max(1, min(n_test, size - 1))
            held_out += members[:n_test].tolist()
            held_in += members[n_test:].tolist()
        else:
            held_in += members.tolist()
    rng.shuffle(held_in)
    rng.shuffle(held_out)
    return np.asarray(held_in, dtype=np.int64), np.asarray(held_out, dtype=np.int64)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def fit_scaler(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-column mean and standard deviation for standardisation.

    Columns whose standard deviation is below 1e-6 get a scale of 1.0 so
    that dividing by the result never blows up.

    Returns:
        ``(mean, std)``, both float32.
    """
    center, spread = X.mean(axis=0), X.std(axis=0)
    safe_spread = np.where(spread < 1e-6, 1.0, spread)
    return center.astype(np.float32), safe_spread.astype(np.float32)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def softmax(logits: np.ndarray) -> np.ndarray:
    """Row-wise softmax of a 2-d array of logits.

    The row maximum is subtracted before exponentiating so that large
    logits do not overflow.
    """
    weights = np.exp(logits - logits.max(axis=1, keepdims=True))
    return weights / weights.sum(axis=1, keepdims=True)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def train_softmax_classifier(
    X: np.ndarray,
    y: np.ndarray,
    n_classes: int,
    epochs: int,
    lr: float,
    l2: float,
    use_class_weights: bool,
    seed: int,
) -> tuple[np.ndarray, np.ndarray, list[dict]]:
    """Fit a linear softmax classifier with full-batch gradient descent.

    Args:
        X: Feature matrix, one row per sample.
        y: Integer class id per sample.
        n_classes: Number of output classes.
        epochs: Number of full-batch updates.
        lr: Step size.
        l2: L2 penalty coefficient applied to ``W`` (not to ``b``).
        use_class_weights: Weight samples by inverse class frequency.
        seed: Seed for the weight initialisation.

    Returns:
        ``(W, b, history)`` with float32 parameters. ``history`` holds
        ``{"epoch", "loss", "train_accuracy"}`` entries for the first
        epoch, the last epoch and roughly every tenth of the run.
    """
    rng = np.random.default_rng(seed)
    n, d = X.shape
    W = rng.normal(0.0, 0.01, size=(d, n_classes)).astype(np.float32)
    b = np.zeros(n_classes, dtype=np.float32)
    targets = np.eye(n_classes, dtype=np.float32)[y]

    if use_class_weights:
        counts = np.bincount(y, minlength=n_classes).astype(np.float32)
        sample_weights = (n / np.maximum(counts, 1.0) / n_classes)[y]
    else:
        sample_weights = np.ones(n, dtype=np.float32)
    # Rescale to mean 1 so the weighting does not change the effective step size.
    sample_weights = sample_weights / sample_weights.mean()

    history = []
    report_every = max(1, epochs // 10)
    row_ids = np.arange(n)
    for epoch in range(1, epochs + 1):
        probs = softmax(X @ W + b)
        residual = (probs - targets) * sample_weights[:, None] / n
        grad_W = X.T @ residual + l2 * W
        grad_b = residual.sum(axis=0)
        W -= lr * grad_W
        b -= lr * grad_b

        if epoch in (1, epochs) or epoch % report_every == 0:
            # NOTE: probs predate this epoch's update, while the L2 term
            # below already uses the updated W.
            p_true = np.clip(probs[row_ids, y], 1e-9, 1.0)
            loss = float(-(sample_weights * np.log(p_true)).mean() + 0.5 * l2 * float(np.sum(W * W)))
            acc = float(np.mean(np.argmax(probs, axis=1) == y))
            history.append({"epoch": epoch, "loss": loss, "train_accuracy": acc})
    return W.astype(np.float32), b.astype(np.float32), history
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def predict(X: np.ndarray, W: np.ndarray, b: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Score rows of *X* with the linear softmax model ``(W, b)``.

    Returns:
        ``(class_ids, probabilities)``: the argmax class per row and the
        full per-class probability matrix.
    """
    class_probs = softmax(X @ W + b)
    return class_probs.argmax(axis=1), class_probs
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_names: list[str]) -> tuple[dict, list[dict], np.ndarray]:
    """Compute summary metrics, per-class rows and the confusion matrix.

    Classes with zero support appear in the per-class rows but are left
    out of the macro F1 and balanced-accuracy averages.

    Returns:
        ``(metrics, rows, cm)`` where ``cm[t, p]`` counts samples of true
        class ``t`` predicted as ``p``.
    """
    n_classes = len(class_names)
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    for actual, guessed in zip(y_true, y_pred):
        cm[int(actual), int(guessed)] += 1

    row_totals = cm.sum(axis=1)  # samples per true class
    col_totals = cm.sum(axis=0)  # samples per predicted class
    support_total = int(cm.sum())

    rows = []
    seen_recalls, seen_f1s = [], []
    weighted_f1_sum = 0.0
    for i, name in enumerate(class_names):
        hits = int(cm[i, i])
        support = int(row_totals[i])
        pred_count = int(col_totals[i])
        precision = hits / pred_count if pred_count else 0.0
        recall = hits / support if support else 0.0
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom else 0.0
        if support:
            seen_recalls.append(recall)
            seen_f1s.append(f1)
            weighted_f1_sum += f1 * support
        rows.append(
            dict(
                class_id=i,
                class_name=name,
                support=support,
                predicted=pred_count,
                precision=precision,
                recall=recall,
                f1=f1,
            )
        )

    metrics = {
        "accuracy": float(np.mean(y_true == y_pred)) if len(y_true) else 0.0,
        "balanced_accuracy": float(np.mean(seen_recalls)) if seen_recalls else 0.0,
        "macro_f1": float(np.mean(seen_f1s)) if seen_f1s else 0.0,
        "weighted_f1": float(weighted_f1_sum / support_total) if support_total else 0.0,
        "num_eval_windows": int(len(y_true)),
        "num_classes": n_classes,
    }
    return metrics, rows, cm
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write *rows* to *path* as UTF-8 CSV with a header of *fieldnames*."""
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        for row in rows:
            out.writerow(row)
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def save_artifacts(
    output_dir: Path,
    X: np.ndarray,
    y: np.ndarray,
    y_labels: np.ndarray,
    starts: np.ndarray,
    ends: np.ndarray,
    label_fracs: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    class_names: list[str],
    mean: np.ndarray,
    std: np.ndarray,
    W: np.ndarray,
    b: np.ndarray,
    history: list[dict],
    metrics: dict,
    per_class_rows: list[dict],
    cm: np.ndarray,
    y_pred: np.ndarray,
    probs: np.ndarray,
    args: argparse.Namespace,
) -> None:
    """Write the dataset, model, metadata, metrics and predictions to disk.

    Creates *output_dir* if needed and writes ``feature_dataset.npz``,
    ``model.npz``, ``metadata.json``, ``metrics.json``,
    ``per_class_metrics.csv``, ``confusion_matrix.csv`` and
    ``predictions.csv`` into it.

    ``y_pred`` and ``probs`` are indexed by position within ``test_idx``;
    every other per-window array is indexed by the window index itself.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Store names as a fixed-width unicode array rather than dtype=object.
    # Object arrays are pickled inside the .npz and can only be read back
    # with allow_pickle=True; this also matches how `labels` is stored.
    class_name_array = np.asarray(class_names, dtype=str)

    np.savez_compressed(
        output_dir / "feature_dataset.npz",
        X=X,
        y=y,
        labels=y_labels.astype(str),
        start_frame=starts,
        end_frame=ends,
        label_fraction=label_fracs,
        train_idx=train_idx,
        test_idx=test_idx,
        class_names=class_name_array,
    )
    np.savez_compressed(
        output_dir / "model.npz",
        mean=mean,
        std=std,
        W=W,
        b=b,
        class_names=class_name_array,
    )

    metadata = {
        "annotation": portable_path(args.annotation, args.workspace),
        "target": args.target,
        "window_frames": args.window_frames,
        "stride_frames": args.stride_frames,
        "min_label_fraction": args.min_label_fraction,
        "test_fraction": args.test_fraction,
        "epochs": args.epochs,
        "learning_rate": args.learning_rate,
        "l2": args.l2,
        "class_weights": not args.no_class_weights,
        "num_windows": int(len(y)),
        "num_features": int(X.shape[1]),
        "num_train_windows": int(len(train_idx)),
        "num_test_windows": int(len(test_idx)),
        "classes": class_names,
        "history": history,
    }
    (output_dir / "metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")

    write_csv(
        output_dir / "per_class_metrics.csv",
        per_class_rows,
        ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"],
    )

    with (output_dir / "confusion_matrix.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(["true\\pred"] + class_names)
        for i, name in enumerate(class_names):
            writer.writerow([name] + [int(v) for v in cm[i]])

    pred_rows = []
    # k is the row in y_pred/probs; idx is the window index into the full arrays.
    for k, idx in enumerate(test_idx):
        idx = int(idx)
        pred_id = int(y_pred[k])
        true_id = int(y[idx])
        pred_rows.append({
            "window_index": idx,
            "start_frame": int(starts[idx]),
            "end_frame": int(ends[idx]),
            "true_label": class_names[true_id],
            "predicted_label": class_names[pred_id],
            "confidence": float(probs[k, pred_id]),
            "correct": int(pred_id == true_id),
            "label_fraction": float(label_fracs[idx]),
        })
    write_csv(
        output_dir / "predictions.csv",
        pred_rows,
        ["window_index", "start_frame", "end_frame", "true_label", "predicted_label", "confidence", "correct", "label_fraction"],
    )
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def main() -> int:
    """Run the full pipeline: load, featurise, split, train, evaluate, save.

    Returns:
        0 on success; failures surface as exceptions.

    Raises:
        FileNotFoundError: If the toolkit directory or annotation file is missing.
        ValueError: If the split leaves no test windows.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Imported here because the module only becomes importable once the
    # toolkit directory has been added to sys.path above.
    from data_loader import load_from_annotation_hdf5

    if not args.annotation.exists():
        raise FileNotFoundError(f"annotation.hdf5 not found: {args.annotation}")

    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=False)

    print("Building windowed feature dataset")
    X, y_labels, starts, ends, label_fracs = build_feature_dataset(
        ann,
        target=args.target,
        window_frames=args.window_frames,
        stride_frames=args.stride_frames,
        min_label_fraction=args.min_label_fraction,
    )
    y, class_names = encode_labels(y_labels)
    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
    if len(test_idx) == 0:
        raise ValueError("No test windows available. Lower --test-fraction or use more data.")

    # Scaling statistics come from the training rows only.
    mean, std = fit_scaler(X[train_idx])
    X_scaled = (X - mean) / std

    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
    print(f"Features: {X.shape[1]}, classes: {len(class_names)}")
    for name, count in Counter(y_labels).most_common():
        print(f" {count:4d} windows {name}")

    print("Training softmax classifier")
    W, b, history = train_softmax_classifier(
        X_scaled[train_idx],
        y[train_idx],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )

    y_pred, probs = predict(X_scaled[test_idx], W, b)
    metrics, per_class_rows, cm = compute_metrics(y[test_idx], y_pred, class_names)

    # Baseline: always predict the most frequent training class.
    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
    metrics["majority_baseline_accuracy"] = float(np.mean(y[test_idx] == majority_class))
    final = history[-1] if history else {"train_accuracy": math.nan, "loss": math.nan}
    metrics["train_final_accuracy"] = final["train_accuracy"]
    metrics["train_final_loss"] = final["loss"]

    save_artifacts(
        output_dir=args.output_dir,
        X=X,
        y=y,
        y_labels=y_labels,
        starts=starts,
        ends=ends,
        label_fracs=label_fracs,
        train_idx=train_idx,
        test_idx=test_idx,
        class_names=class_names,
        mean=mean,
        std=std,
        W=W,
        b=b,
        history=history,
        metrics=metrics,
        per_class_rows=per_class_rows,
        cm=cm,
        y_pred=y_pred,
        probs=probs,
        args=args,
    )

    print("\nEvaluation")
    for title, key in (
        (" accuracy: ", "accuracy"),
        (" balanced_accuracy: ", "balanced_accuracy"),
        (" macro_f1: ", "macro_f1"),
        (" weighted_f1: ", "weighted_f1"),
        (" majority_baseline: ", "majority_baseline_accuracy"),
    ):
        print(f"{title}{metrics[key]:.4f}")
    print(f"\nArtifacts written to: {args.output_dir}")
    return 0
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|