Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Ropedia Xperience-10M Unified 20-Task Provenance Bundle", | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-16T06:25:58+00:00", | |
| "suite_position": "unified_20_task_provenance", | |
| "legacy_path_note": "The tier2_task_suite file and directory names are retained for stable public links; this bundle is provenance inside the unified 20-task suite, not a separate public tier.", | |
| "unified_task_integration": { | |
| "total_task_count": 20, | |
| "legacy_provenance_row_count": 8, | |
| "shared_metrics": "docs/data/summary_metrics.json", | |
| "unified_protocol": "docs/data/evaluation_protocol.json" | |
| }, | |
| "dataset_scope": { | |
| "sample_episode_count": 1, | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "window_frames": 20, | |
| "stride_frames": 5, | |
| "future_horizon_windows": 20, | |
| "future_horizon_frames": 100, | |
| "future_horizon_seconds_at_20fps": 5.0, | |
| "transition_target_cap_frames": 200, | |
| "transition_target_cap_seconds_at_20fps": 10.0, | |
| "split_policy": "single_episode_chronological_70_30", | |
| "raw_hdf5_required_to_regenerate": true, | |
| "raw_data_redistributed": false | |
| }, | |
| "setup_alignment": { | |
| "same_window_unit_as_unified_suite": true, | |
| "same_feature_manifest_as_unified_suite": "results/episode_task_suite/feature_manifest.json", | |
| "same_shared_tensor_as_unified_suite": "results/episode_task_suite/shared_windows.npz", | |
| "minimal_baselines": "softmax, ridge regression/projection, and ridge multilabel heads", | |
| "neural_baselines": "compact one-hidden-layer/two-layer PyTorch MLP heads with the same chronological split", | |
| "leakage_policy": "Caption-derived text features are removed whenever the target is a label, object, relation, interaction phrase, or future semantic state." | |
| }, | |
| "source_files": [ | |
| "results/episode_task_suite/shared_windows.npz", | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/feature_manifest.json", | |
| "data/sample/xperience-10m-sample/annotation.hdf5" | |
| ], | |
| "task_specs": { | |
| "long_horizon_next_action": { | |
| "name": "Long-Horizon Next-Action Forecasting", | |
| "family": "classification", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Action label five seconds later.", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "meaning": "Tests whether the current state carries enough procedure context to forecast beyond the one-second core next-action task." | |
| }, | |
| "next_subtask_forecast": { | |
| "name": "Long-Horizon Next-Subtask Forecasting", | |
| "family": "classification", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Procedure subtask label five seconds later.", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "meaning": "Moves from immediate action anticipation to higher-level procedure-state prediction." | |
| }, | |
| "interaction_text_prediction": { | |
| "name": "Interaction Text Prediction", | |
| "family": "classification", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Raw annotation interaction phrase for the same window.", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "meaning": "Uses the raw caption JSON interaction field as a language target instead of only the hashed text feature." | |
| }, | |
| "action_object_relation": { | |
| "name": "Action-Object Relation Prediction", | |
| "family": "classification", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Joint action plus active object-set relation.", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "meaning": "Evaluates whether a model can bind what action is happening to which objects are involved." | |
| }, | |
| "object_set_forecast": { | |
| "name": "Future Object-Set Forecasting", | |
| "family": "multi_label", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Object set active five seconds later.", | |
| "metric_key": "micro_f1", | |
| "metric_name": "micro-F1", | |
| "metric_direction": "higher", | |
| "meaning": "Predicts which objects will become relevant soon, not only which objects are relevant now." | |
| }, | |
| "imu_to_hand_pose": { | |
| "name": "IMU-to-Hand Pose Reconstruction", | |
| "family": "regression", | |
| "input": "Current IMU acceleration/gyroscope feature block only.", | |
| "target": "Current left/right hand joint feature blocks.", | |
| "metric_key": "mae", | |
| "metric_name": "MAE", | |
| "metric_direction": "lower", | |
| "meaning": "A sensor-bridge probe for how much hand configuration can be recovered from inertial motion alone." | |
| }, | |
| "camera_view_sync_retrieval": { | |
| "name": "Camera-View Synchronization Retrieval", | |
| "family": "retrieval", | |
| "input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.", | |
| "target": "The synchronized held-out camera-3 window.", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "meaning": "Stress-tests multi-camera time alignment beyond the core cross-modal retrieval task." | |
| }, | |
| "time_to_transition": { | |
| "name": "Time-to-Next-Transition Regression", | |
| "family": "regression", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Frames until the next action-label boundary, capped at 200 frames.", | |
| "metric_key": "mae", | |
| "metric_name": "MAE frames", | |
| "metric_direction": "lower", | |
| "meaning": "Turns boundary detection into a continuous timing estimate for procedural control." | |
| } | |
| }, | |
| "tasks": { | |
| "long_horizon_next_action": { | |
| "minimal": { | |
| "accuracy": 0.055900621118012424, | |
| "balanced_accuracy": 0.072, | |
| "macro_f1": 0.07499999999999998, | |
| "weighted_f1": 0.058229813664596265, | |
| "num_eval_windows": 322, | |
| "num_classes": 18, | |
| "status": "pass", | |
| "task": "long_horizon_next_action", | |
| "task_display_name": "Long-Horizon Next-Action Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_softmax", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1073, | |
| "num_train_windows": 751, | |
| "num_test_windows": 322, | |
| "num_train_classes": 14, | |
| "majority_baseline_accuracy": 0.0, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.07499999999999998, | |
| "unseen_test_class_count": 4, | |
| "unseen_test_classes": [ | |
| "Place item on table", | |
| "Wait/Prepare for pouring", | |
| "Pour coffee", | |
| "Pour milk into coffee" | |
| ], | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 2.9943459033966064, | |
| "train_accuracy": 0.07190412782956059 | |
| }, | |
| { | |
| "epoch": 22, | |
| "loss": 0.022863121703267097, | |
| "train_accuracy": 0.9986684420772304 | |
| }, | |
| { | |
| "epoch": 44, | |
| "loss": 0.019138943403959274, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 66, | |
| "loss": 0.017911160364747047, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 88, | |
| "loss": 0.017209626734256744, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 110, | |
| "loss": 0.0167277492582798, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 132, | |
| "loss": 0.016360996291041374, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 154, | |
| "loss": 0.016062702983617783, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 176, | |
| "loss": 0.015808619558811188, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 198, | |
| "loss": 0.015584941953420639, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 220, | |
| "loss": 0.015383150428533554, | |
| "train_accuracy": 1.0 | |
| } | |
| ] | |
| }, | |
| "neural_mlp": { | |
| "accuracy": 0.055900621118012424, | |
| "balanced_accuracy": 0.072, | |
| "macro_f1": 0.06545454545454546, | |
| "weighted_f1": 0.05081874647092039, | |
| "num_eval_windows": 322, | |
| "num_classes": 18, | |
| "status": "pass", | |
| "task": "long_horizon_next_action", | |
| "task_display_name": "Long-Horizon Next-Action Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1073, | |
| "num_train_windows": 751, | |
| "num_test_windows": 322, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.06545454545454546, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 1.8488772948794612, | |
| "train_accuracy": 0.4420772303595206 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.05503799814170353, | |
| "train_accuracy": 0.9760319573901465 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.005950478469201434, | |
| "train_accuracy": 0.9973368841544608 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.004196559216643618, | |
| "train_accuracy": 0.9986684420772304 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.0011443984907922818, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.0011185314030400149, | |
| "train_accuracy": 1.0 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "next_subtask_forecast": { | |
| "minimal": { | |
| "accuracy": 0.02046783625730994, | |
| "balanced_accuracy": 0.029166666666666667, | |
| "macro_f1": 0.04545454545454545, | |
| "weighted_f1": 0.03189792663476874, | |
| "num_eval_windows": 342, | |
| "num_classes": 14, | |
| "status": "pass", | |
| "task": "next_subtask_forecast", | |
| "task_display_name": "Long-Horizon Next-Subtask Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_softmax", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1141, | |
| "num_train_windows": 799, | |
| "num_test_windows": 342, | |
| "num_train_classes": 11, | |
| "majority_baseline_accuracy": 0.0, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.04545454545454545, | |
| "unseen_test_class_count": 3, | |
| "unseen_test_classes": [ | |
| "Prepare for pouring", | |
| "Pour coffee", | |
| "Pour milk into coffee" | |
| ], | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 2.55131196975708, | |
| "train_accuracy": 0.1113892365456821 | |
| }, | |
| { | |
| "epoch": 22, | |
| "loss": 0.028098762035369873, | |
| "train_accuracy": 0.9949937421777222 | |
| }, | |
| { | |
| "epoch": 44, | |
| "loss": 0.021430641412734985, | |
| "train_accuracy": 0.9987484355444305 | |
| }, | |
| { | |
| "epoch": 66, | |
| "loss": 0.01899738796055317, | |
| "train_accuracy": 0.9987484355444305 | |
| }, | |
| { | |
| "epoch": 88, | |
| "loss": 0.017645347863435745, | |
| "train_accuracy": 0.9987484355444305 | |
| }, | |
| { | |
| "epoch": 110, | |
| "loss": 0.016760651022195816, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 132, | |
| "loss": 0.016124067828059196, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 154, | |
| "loss": 0.015635930001735687, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 176, | |
| "loss": 0.015243873000144958, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 198, | |
| "loss": 0.014917710795998573, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 220, | |
| "loss": 0.014638766646385193, | |
| "train_accuracy": 1.0 | |
| } | |
| ] | |
| }, | |
| "neural_mlp": { | |
| "accuracy": 0.02046783625730994, | |
| "balanced_accuracy": 0.029166666666666667, | |
| "macro_f1": 0.050724637681159424, | |
| "weighted_f1": 0.03559623696923468, | |
| "num_eval_windows": 342, | |
| "num_classes": 14, | |
| "status": "pass", | |
| "task": "next_subtask_forecast", | |
| "task_display_name": "Long-Horizon Next-Subtask Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1141, | |
| "num_train_windows": 799, | |
| "num_test_windows": 342, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.050724637681159424, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 1.578477246442038, | |
| "train_accuracy": 0.46307884856070086 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.043756316020823686, | |
| "train_accuracy": 0.9824780976220275 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.02675439281685182, | |
| "train_accuracy": 0.9949937421777222 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.013605056314243094, | |
| "train_accuracy": 0.9962453066332916 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.003073849640401996, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.0026577636194491153, | |
| "train_accuracy": 0.9987484355444305 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "interaction_text_prediction": { | |
| "minimal": { | |
| "accuracy": 0.017241379310344827, | |
| "balanced_accuracy": 0.03333333333333333, | |
| "macro_f1": 0.04444444444444444, | |
| "weighted_f1": 0.022988505747126436, | |
| "num_eval_windows": 58, | |
| "num_classes": 46, | |
| "status": "pass", | |
| "task": "interaction_text_prediction", | |
| "task_display_name": "Interaction Text Prediction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_softmax", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 192, | |
| "num_train_windows": 134, | |
| "num_test_windows": 58, | |
| "num_train_classes": 32, | |
| "majority_baseline_accuracy": 0.0, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.04444444444444444, | |
| "unseen_test_class_count": 14, | |
| "unseen_test_classes": [ | |
| "hand holding the white bottle over the workspace", | |
| "hand maintaining grip on the white bottle", | |
| "Hand placing the small bottle on the table surface", | |
| "Hands released from objects, resting near the brewing station", | |
| "Hands positioned near the coffee equipment, ready for the next step", | |
| "hands resting near the coffee brewing equipment on the table", | |
| "hands slightly adjusted in preparation for interacting with the equipment", | |
| "The right hand is gripping the handle of the coffee carafe to initiate pouring.", | |
| "The right hand is tilting the carafe to pour coffee into the mug.", | |
| "The right hand holds the empty carafe after completing the pour.", | |
| "The user is holding the milk pitcher over the coffee cup, initiating the pour.", | |
| "The user is carefully pouring the milk into the cup with coffee, controlling the flow.", | |
| "The milk continues to be poured into the coffee, creating a swirling motion in the cup.", | |
| "The right hand is tilting the milk pitcher to pour milk into the coffee mug, while the left hand holds the mug steady on the table." | |
| ], | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 3.447813034057617, | |
| "train_accuracy": 0.05223880597014925 | |
| }, | |
| { | |
| "epoch": 22, | |
| "loss": 0.02874920144677162, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 44, | |
| "loss": 0.02785160206258297, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 66, | |
| "loss": 0.02734168991446495, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 88, | |
| "loss": 0.026947205886244774, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 110, | |
| "loss": 0.02660428173840046, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 132, | |
| "loss": 0.02628966234624386, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 154, | |
| "loss": 0.025992820039391518, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 176, | |
| "loss": 0.0257082711905241, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 198, | |
| "loss": 0.025432869791984558, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 220, | |
| "loss": 0.025164704769849777, | |
| "train_accuracy": 1.0 | |
| } | |
| ] | |
| }, | |
| "neural_mlp": { | |
| "accuracy": 0.034482758620689655, | |
| "balanced_accuracy": 0.06666666666666667, | |
| "macro_f1": 0.0380952380952381, | |
| "weighted_f1": 0.01970443349753695, | |
| "num_eval_windows": 58, | |
| "num_classes": 46, | |
| "status": "pass", | |
| "task": "interaction_text_prediction", | |
| "task_display_name": "Interaction Text Prediction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 192, | |
| "num_train_windows": 134, | |
| "num_test_windows": 58, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.0380952380952381, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 3.8020725890771665, | |
| "train_accuracy": 0.04477611940298507 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.4838796658302421, | |
| "train_accuracy": 0.9029850746268657 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.05817107102875389, | |
| "train_accuracy": 0.9776119402985075 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.011369604450553211, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.006697736902913051, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.008224115385534936, | |
| "train_accuracy": 1.0 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "action_object_relation": { | |
| "minimal": { | |
| "accuracy": 0.0, | |
| "balanced_accuracy": 0.0, | |
| "macro_f1": 0.0, | |
| "weighted_f1": 0.0, | |
| "num_eval_windows": 53, | |
| "num_classes": 42, | |
| "status": "pass", | |
| "task": "action_object_relation", | |
| "task_display_name": "Action-Object Relation Prediction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_softmax", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 178, | |
| "num_train_windows": 125, | |
| "num_test_windows": 53, | |
| "num_train_classes": 32, | |
| "majority_baseline_accuracy": 0.0, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.0, | |
| "unseen_test_class_count": 10, | |
| "unseen_test_classes": [ | |
| "Close bottle cap :: coffee dripper | scale | white bottle", | |
| "Close bottle cap :: coffee equipment | small bottle | weighing scale | white mug", | |
| "Place item on table :: coffee equipment | small bottle | weighing scale | white mug", | |
| "Wait/Prepare for pouring :: coffee equipment | small bottle | weighing scale | white mug", | |
| "Wait/Prepare for pouring :: digital scale with dripper | glass carafe | metal pitcher | water bottle | white coffee cup", | |
| "Wait/Prepare for pouring :: carafe | coffee mug | scale", | |
| "Pour coffee :: carafe | coffee mug | scale", | |
| "Pour coffee :: bottle | coffee cup | digital scale | milk pitcher", | |
| "Pour coffee :: coffee mug | digital scale | milk bottle | stainless steel milk pitcher | table", | |
| "Pour milk into coffee :: coffee mug | digital scale | milk bottle | stainless steel milk pitcher | table" | |
| ], | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 3.422329902648926, | |
| "train_accuracy": 0.056 | |
| }, | |
| { | |
| "epoch": 22, | |
| "loss": 0.030762728303670883, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 44, | |
| "loss": 0.029601721093058586, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 66, | |
| "loss": 0.02893223613500595, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 88, | |
| "loss": 0.028430834412574768, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 110, | |
| "loss": 0.028011377900838852, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 132, | |
| "loss": 0.027639301493763924, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 154, | |
| "loss": 0.02729770354926586, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 176, | |
| "loss": 0.026977315545082092, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 198, | |
| "loss": 0.026672501116991043, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 220, | |
| "loss": 0.026379700750112534, | |
| "train_accuracy": 1.0 | |
| } | |
| ] | |
| }, | |
| "neural_mlp": { | |
| "accuracy": 0.0, | |
| "balanced_accuracy": 0.0, | |
| "macro_f1": 0.0, | |
| "weighted_f1": 0.0, | |
| "num_eval_windows": 53, | |
| "num_classes": 42, | |
| "status": "pass", | |
| "task": "action_object_relation", | |
| "task_display_name": "Action-Object Relation Prediction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 178, | |
| "num_train_windows": 125, | |
| "num_test_windows": 53, | |
| "primary_metric": "macro_f1", | |
| "primary_score": 0.0, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 3.753063440322876, | |
| "train_accuracy": 0.008 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.8229753971099854, | |
| "train_accuracy": 0.872 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.0829126164317131, | |
| "train_accuracy": 0.968 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.07906360924243927, | |
| "train_accuracy": 0.976 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.013344862498342991, | |
| "train_accuracy": 1.0 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.0362895242869854, | |
| "train_accuracy": 1.0 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "object_set_forecast": { | |
| "minimal": { | |
| "precision": 0.12015503875968993, | |
| "recall": 0.28703703703703703, | |
| "micro_f1": 0.16939890710382516, | |
| "macro_f1": 0.09796905529697701, | |
| "exact_match": 0.0, | |
| "status": "pass", | |
| "task": "object_set_forecast", | |
| "task_display_name": "Future Object-Set Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_ridge_multilabel", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 188, | |
| "num_train_windows": 132, | |
| "num_test_windows": 56, | |
| "num_objects": 23, | |
| "future_horizon_frames": 100, | |
| "primary_metric": "micro_f1", | |
| "primary_score": 0.16939890710382516, | |
| "unseen_test_objects": { | |
| "coffee equipment": 16, | |
| "small bottle": 16, | |
| "weighing scale": 16, | |
| "digital scale with dripper": 8, | |
| "metal pitcher": 8, | |
| "white coffee cup": 8, | |
| "carafe": 8, | |
| "coffee cup": 12, | |
| "milk pitcher": 12, | |
| "milk bottle": 4, | |
| "stainless steel milk pitcher": 4 | |
| } | |
| }, | |
| "neural_mlp": { | |
| "precision": 0.1590909090909091, | |
| "recall": 0.25925925925925924, | |
| "micro_f1": 0.19718309859154928, | |
| "macro_f1": 0.07845536106405672, | |
| "exact_match": 0.0, | |
| "status": "pass", | |
| "task": "object_set_forecast", | |
| "task_display_name": "Future Object-Set Forecasting", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp_multilabel", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 188, | |
| "num_train_windows": 132, | |
| "num_test_windows": 56, | |
| "num_objects": 23, | |
| "primary_metric": "micro_f1", | |
| "primary_score": 0.19718309859154928, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 1.118124373031385 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.4309653134057016 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.17918715264761087 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.08946222806292953 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.07499222908959244 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.0528871344797539 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "imu_to_hand_pose": { | |
| "minimal": { | |
| "mse": 0.005499584134668112, | |
| "mae": 0.042049407958984375, | |
| "r2": -0.35125992233237024, | |
| "num_test": 348, | |
| "status": "pass", | |
| "task": "imu_to_hand_pose", | |
| "task_display_name": "IMU-to-Hand Pose Reconstruction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_ridge_regression", | |
| "input": "Current IMU acceleration/gyroscope feature block only.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 882, | |
| "primary_metric": "mae", | |
| "primary_score": 0.042049407958984375 | |
| }, | |
| "neural_mlp": { | |
| "mse": 0.005374640692025423, | |
| "mae": 0.042562149465084076, | |
| "r2": -0.32056106903460324, | |
| "num_test": 348, | |
| "status": "pass", | |
| "task": "imu_to_hand_pose", | |
| "task_display_name": "IMU-to-Hand Pose Reconstruction", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp_regression", | |
| "input": "Current IMU acceleration/gyroscope feature block only.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 882, | |
| "primary_metric": "mae", | |
| "primary_score": 0.042562149465084076, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 0.9968642874690733 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.8155221368700523 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.6730313805489816 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.6062786274143984 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.5605393504451268 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.515976368574492 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "camera_view_sync_retrieval": { | |
| "minimal": { | |
| "mrr": 0.4943004846572876, | |
| "top1": 0.3448275862068966, | |
| "top5": 0.6724137931034483, | |
| "top10": 0.7614942528735632, | |
| "median_rank": 2.0, | |
| "num_test": 348, | |
| "status": "pass", | |
| "task": "camera_view_sync_retrieval", | |
| "task_display_name": "Camera-View Synchronization Retrieval", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_ridge_projection_cosine_retrieval", | |
| "input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.", | |
| "split": "single_episode_chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "query_dim": 686, | |
| "target_dim": 686, | |
| "primary_metric": "mrr", | |
| "primary_score": 0.4943004846572876 | |
| }, | |
| "neural_mlp": { | |
| "mrr": 0.24086658656597137, | |
| "top1": 0.12931034482758622, | |
| "top5": 0.3390804597701149, | |
| "top10": 0.46839080459770116, | |
| "median_rank": 12.0, | |
| "num_test": 348, | |
| "status": "pass", | |
| "task": "camera_view_sync_retrieval", | |
| "task_display_name": "Camera-View Synchronization Retrieval", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp_projection_cosine_retrieval", | |
| "input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.", | |
| "split": "single_episode_chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "query_dim": 686, | |
| "target_dim": 686, | |
| "primary_metric": "mrr", | |
| "primary_score": 0.24086658656597137, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 0.9819011160368409 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.5516944707979575 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.36679228487783105 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.2996834480967762 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.2610064353266912 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.23746319687014578 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| }, | |
| "time_to_transition": { | |
| "minimal": { | |
| "mse": 1345.12353515625, | |
| "mae": 10.53735637664795, | |
| "r2": -0.0899740955263848, | |
| "num_test": 348, | |
| "mae_frames": 10.53735637664795, | |
| "status": "pass", | |
| "task": "time_to_transition", | |
| "task_display_name": "Time-to-Next-Transition Regression", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "minimal_ridge_regression", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 1, | |
| "primary_metric": "mae", | |
| "primary_score": 10.53735637664795 | |
| }, | |
| "neural_mlp": { | |
| "mse": 1345.0997314453125, | |
| "mae": 10.55449390411377, | |
| "r2": -0.08995473993654857, | |
| "num_test": 348, | |
| "mae_frames": 10.55449390411377, | |
| "status": "pass", | |
| "task": "time_to_transition", | |
| "task_display_name": "Time-to-Next-Transition Regression", | |
| "suite_position": "unified_20_task_provenance", | |
| "model_family": "neural_mlp_regression", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "split": "single_episode_chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 1, | |
| "primary_metric": "mae", | |
| "primary_score": 10.55449390411377, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "loss": 0.1785692156992422 | |
| }, | |
| { | |
| "epoch": 5, | |
| "loss": 0.04815403889832608 | |
| }, | |
| { | |
| "epoch": 10, | |
| "loss": 0.010813283567347759 | |
| }, | |
| { | |
| "epoch": 15, | |
| "loss": 0.0039978047098556645 | |
| }, | |
| { | |
| "epoch": 20, | |
| "loss": 0.0023154149574845075 | |
| }, | |
| { | |
| "epoch": 25, | |
| "loss": 0.0012936348804051623 | |
| } | |
| ], | |
| "device": "cpu" | |
| } | |
| } | |
| } | |
| } | |