Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
Add files using upload-large-folder tool
Browse files
data/episode128_task_model_radar.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "128-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"description": "Selected 128-episode metadata/raw baselines plus verified Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano diagnostics. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 7,
|
|
@@ -192,7 +192,7 @@
|
|
| 192 |
"label": "Action Recognition",
|
| 193 |
"axis_label": "01 Action Recognition",
|
| 194 |
"short_label": "Action",
|
| 195 |
-
"
|
| 196 |
"metric_key": "macro_f1",
|
| 197 |
"metric_name": "macro-F1",
|
| 198 |
"metric_direction": "higher",
|
|
@@ -283,7 +283,7 @@
|
|
| 283 |
"label": "Procedure Step Recognition",
|
| 284 |
"axis_label": "02 Procedure Step Recognition",
|
| 285 |
"short_label": "Step",
|
| 286 |
-
"
|
| 287 |
"metric_key": "macro_f1",
|
| 288 |
"metric_name": "macro-F1",
|
| 289 |
"metric_direction": "higher",
|
|
@@ -374,7 +374,7 @@
|
|
| 374 |
"label": "Action Boundary Detection",
|
| 375 |
"axis_label": "03 Action Boundary Detection",
|
| 376 |
"short_label": "Boundary",
|
| 377 |
-
"
|
| 378 |
"metric_key": "macro_f1",
|
| 379 |
"metric_name": "macro-F1",
|
| 380 |
"metric_direction": "higher",
|
|
@@ -465,7 +465,7 @@
|
|
| 465 |
"label": "Next-Action Prediction",
|
| 466 |
"axis_label": "04 Next-Action Prediction",
|
| 467 |
"short_label": "Next act",
|
| 468 |
-
"
|
| 469 |
"metric_key": "macro_f1",
|
| 470 |
"metric_name": "macro-F1",
|
| 471 |
"metric_direction": "higher",
|
|
@@ -556,7 +556,7 @@
|
|
| 556 |
"label": "Hand Trajectory Forecasting",
|
| 557 |
"axis_label": "05 Hand Trajectory Forecasting",
|
| 558 |
"short_label": "Hand traj",
|
| 559 |
-
"
|
| 560 |
"metric_key": "mpjpe",
|
| 561 |
"metric_name": "MPJPE",
|
| 562 |
"metric_direction": "lower",
|
|
@@ -647,7 +647,7 @@
|
|
| 647 |
"label": "Contact State Prediction",
|
| 648 |
"axis_label": "06 Contact State Prediction",
|
| 649 |
"short_label": "Contact",
|
| 650 |
-
"
|
| 651 |
"metric_key": "macro_f1",
|
| 652 |
"metric_name": "macro-F1",
|
| 653 |
"metric_direction": "higher",
|
|
@@ -738,7 +738,7 @@
|
|
| 738 |
"label": "Object Relevance Prediction",
|
| 739 |
"axis_label": "07 Object Relevance Prediction",
|
| 740 |
"short_label": "Objects",
|
| 741 |
-
"
|
| 742 |
"metric_key": "micro_f1",
|
| 743 |
"metric_name": "micro-F1",
|
| 744 |
"metric_direction": "higher",
|
|
@@ -829,7 +829,7 @@
|
|
| 829 |
"label": "Language Grounding",
|
| 830 |
"axis_label": "08 Language Grounding",
|
| 831 |
"short_label": "Language",
|
| 832 |
-
"
|
| 833 |
"metric_key": "mrr",
|
| 834 |
"metric_name": "MRR",
|
| 835 |
"metric_direction": "higher",
|
|
@@ -920,7 +920,7 @@
|
|
| 920 |
"label": "Cross-Modal Retrieval",
|
| 921 |
"axis_label": "09 Cross-Modal Retrieval",
|
| 922 |
"short_label": "X-modal",
|
| 923 |
-
"
|
| 924 |
"metric_key": "mrr",
|
| 925 |
"metric_name": "MRR",
|
| 926 |
"metric_direction": "higher",
|
|
@@ -1011,7 +1011,7 @@
|
|
| 1011 |
"label": "Cross-Modal Reconstruction",
|
| 1012 |
"axis_label": "10 Cross-Modal Reconstruction",
|
| 1013 |
"short_label": "Recon",
|
| 1014 |
-
"
|
| 1015 |
"metric_key": "r2",
|
| 1016 |
"metric_name": "R2",
|
| 1017 |
"metric_direction": "higher",
|
|
@@ -1102,7 +1102,7 @@
|
|
| 1102 |
"label": "Temporal Order Verification",
|
| 1103 |
"axis_label": "11 Temporal Order Verification",
|
| 1104 |
"short_label": "Order",
|
| 1105 |
-
"
|
| 1106 |
"metric_key": "f1",
|
| 1107 |
"metric_name": "F1",
|
| 1108 |
"metric_direction": "higher",
|
|
@@ -1193,7 +1193,7 @@
|
|
| 1193 |
"label": "Multimodal Synchronization Detection",
|
| 1194 |
"axis_label": "12 Multimodal Synchronization Detection",
|
| 1195 |
"short_label": "Sync",
|
| 1196 |
-
"
|
| 1197 |
"metric_key": "f1",
|
| 1198 |
"metric_name": "F1",
|
| 1199 |
"metric_direction": "higher",
|
|
@@ -1284,7 +1284,7 @@
|
|
| 1284 |
"label": "Long-Horizon Next-Action Forecasting",
|
| 1285 |
"axis_label": "13 Long-Horizon Next-Action Forecasting",
|
| 1286 |
"short_label": "Long act",
|
| 1287 |
-
"
|
| 1288 |
"metric_key": "macro_f1",
|
| 1289 |
"metric_name": "macro-F1",
|
| 1290 |
"metric_direction": "higher",
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"label": "Long-Horizon Next-Subtask Forecasting",
|
| 1376 |
"axis_label": "14 Long-Horizon Next-Subtask Forecasting",
|
| 1377 |
"short_label": "Long step",
|
| 1378 |
-
"
|
| 1379 |
"metric_key": "macro_f1",
|
| 1380 |
"metric_name": "macro-F1",
|
| 1381 |
"metric_direction": "higher",
|
|
@@ -1466,7 +1466,7 @@
|
|
| 1466 |
"label": "Interaction Text Prediction",
|
| 1467 |
"axis_label": "15 Interaction Text Prediction",
|
| 1468 |
"short_label": "Interact txt",
|
| 1469 |
-
"
|
| 1470 |
"metric_key": "macro_f1",
|
| 1471 |
"metric_name": "macro-F1",
|
| 1472 |
"metric_direction": "higher",
|
|
@@ -1557,7 +1557,7 @@
|
|
| 1557 |
"label": "Action-Object Relation Prediction",
|
| 1558 |
"axis_label": "16 Action-Object Relation Prediction",
|
| 1559 |
"short_label": "Act+obj",
|
| 1560 |
-
"
|
| 1561 |
"metric_key": "macro_f1",
|
| 1562 |
"metric_name": "macro-F1",
|
| 1563 |
"metric_direction": "higher",
|
|
@@ -1648,7 +1648,7 @@
|
|
| 1648 |
"label": "Future Object-Set Forecasting",
|
| 1649 |
"axis_label": "17 Future Object-Set Forecasting",
|
| 1650 |
"short_label": "Future obj",
|
| 1651 |
-
"
|
| 1652 |
"metric_key": "micro_f1",
|
| 1653 |
"metric_name": "micro-F1",
|
| 1654 |
"metric_direction": "higher",
|
|
@@ -1739,7 +1739,7 @@
|
|
| 1739 |
"label": "IMU-to-Hand Pose Reconstruction",
|
| 1740 |
"axis_label": "18 IMU-to-Hand Pose Reconstruction",
|
| 1741 |
"short_label": "IMU->hand",
|
| 1742 |
-
"
|
| 1743 |
"metric_key": "mae",
|
| 1744 |
"metric_name": "MAE",
|
| 1745 |
"metric_direction": "lower",
|
|
@@ -1830,7 +1830,7 @@
|
|
| 1830 |
"label": "Camera-View Synchronization Retrieval",
|
| 1831 |
"axis_label": "19 Camera-View Synchronization Retrieval",
|
| 1832 |
"short_label": "Cam sync",
|
| 1833 |
-
"
|
| 1834 |
"metric_key": "mrr",
|
| 1835 |
"metric_name": "MRR",
|
| 1836 |
"metric_direction": "higher",
|
|
@@ -1921,7 +1921,7 @@
|
|
| 1921 |
"label": "Time-to-Next-Transition Regression",
|
| 1922 |
"axis_label": "20 Time-to-Next-Transition Regression",
|
| 1923 |
"short_label": "Time2bdry",
|
| 1924 |
-
"
|
| 1925 |
"metric_key": "mae",
|
| 1926 |
"metric_name": "MAE frames",
|
| 1927 |
"metric_direction": "lower",
|
|
|
|
| 1 |
{
|
| 2 |
"title": "128-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-21T15:20:34+00:00",
|
| 5 |
"description": "Selected 128-episode metadata/raw baselines plus verified Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano diagnostics. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 7,
|
|
|
|
| 192 |
"label": "Action Recognition",
|
| 193 |
"axis_label": "01 Action Recognition",
|
| 194 |
"short_label": "Action",
|
| 195 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 196 |
"metric_key": "macro_f1",
|
| 197 |
"metric_name": "macro-F1",
|
| 198 |
"metric_direction": "higher",
|
|
|
|
| 283 |
"label": "Procedure Step Recognition",
|
| 284 |
"axis_label": "02 Procedure Step Recognition",
|
| 285 |
"short_label": "Step",
|
| 286 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 287 |
"metric_key": "macro_f1",
|
| 288 |
"metric_name": "macro-F1",
|
| 289 |
"metric_direction": "higher",
|
|
|
|
| 374 |
"label": "Action Boundary Detection",
|
| 375 |
"axis_label": "03 Action Boundary Detection",
|
| 376 |
"short_label": "Boundary",
|
| 377 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 378 |
"metric_key": "macro_f1",
|
| 379 |
"metric_name": "macro-F1",
|
| 380 |
"metric_direction": "higher",
|
|
|
|
| 465 |
"label": "Next-Action Prediction",
|
| 466 |
"axis_label": "04 Next-Action Prediction",
|
| 467 |
"short_label": "Next act",
|
| 468 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 469 |
"metric_key": "macro_f1",
|
| 470 |
"metric_name": "macro-F1",
|
| 471 |
"metric_direction": "higher",
|
|
|
|
| 556 |
"label": "Hand Trajectory Forecasting",
|
| 557 |
"axis_label": "05 Hand Trajectory Forecasting",
|
| 558 |
"short_label": "Hand traj",
|
| 559 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 560 |
"metric_key": "mpjpe",
|
| 561 |
"metric_name": "MPJPE",
|
| 562 |
"metric_direction": "lower",
|
|
|
|
| 647 |
"label": "Contact State Prediction",
|
| 648 |
"axis_label": "06 Contact State Prediction",
|
| 649 |
"short_label": "Contact",
|
| 650 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 651 |
"metric_key": "macro_f1",
|
| 652 |
"metric_name": "macro-F1",
|
| 653 |
"metric_direction": "higher",
|
|
|
|
| 738 |
"label": "Object Relevance Prediction",
|
| 739 |
"axis_label": "07 Object Relevance Prediction",
|
| 740 |
"short_label": "Objects",
|
| 741 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 742 |
"metric_key": "micro_f1",
|
| 743 |
"metric_name": "micro-F1",
|
| 744 |
"metric_direction": "higher",
|
|
|
|
| 829 |
"label": "Language Grounding",
|
| 830 |
"axis_label": "08 Language Grounding",
|
| 831 |
"short_label": "Language",
|
| 832 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 833 |
"metric_key": "mrr",
|
| 834 |
"metric_name": "MRR",
|
| 835 |
"metric_direction": "higher",
|
|
|
|
| 920 |
"label": "Cross-Modal Retrieval",
|
| 921 |
"axis_label": "09 Cross-Modal Retrieval",
|
| 922 |
"short_label": "X-modal",
|
| 923 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 924 |
"metric_key": "mrr",
|
| 925 |
"metric_name": "MRR",
|
| 926 |
"metric_direction": "higher",
|
|
|
|
| 1011 |
"label": "Cross-Modal Reconstruction",
|
| 1012 |
"axis_label": "10 Cross-Modal Reconstruction",
|
| 1013 |
"short_label": "Recon",
|
| 1014 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 1015 |
"metric_key": "r2",
|
| 1016 |
"metric_name": "R2",
|
| 1017 |
"metric_direction": "higher",
|
|
|
|
| 1102 |
"label": "Temporal Order Verification",
|
| 1103 |
"axis_label": "11 Temporal Order Verification",
|
| 1104 |
"short_label": "Order",
|
| 1105 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 1106 |
"metric_key": "f1",
|
| 1107 |
"metric_name": "F1",
|
| 1108 |
"metric_direction": "higher",
|
|
|
|
| 1193 |
"label": "Multimodal Synchronization Detection",
|
| 1194 |
"axis_label": "12 Multimodal Synchronization Detection",
|
| 1195 |
"short_label": "Sync",
|
| 1196 |
+
"provenance_source": "walkthrough_backed_task_contract",
|
| 1197 |
"metric_key": "f1",
|
| 1198 |
"metric_name": "F1",
|
| 1199 |
"metric_direction": "higher",
|
|
|
|
| 1284 |
"label": "Long-Horizon Next-Action Forecasting",
|
| 1285 |
"axis_label": "13 Long-Horizon Next-Action Forecasting",
|
| 1286 |
"short_label": "Long act",
|
| 1287 |
+
"provenance_source": "historical_result_bundle",
|
| 1288 |
"metric_key": "macro_f1",
|
| 1289 |
"metric_name": "macro-F1",
|
| 1290 |
"metric_direction": "higher",
|
|
|
|
| 1375 |
"label": "Long-Horizon Next-Subtask Forecasting",
|
| 1376 |
"axis_label": "14 Long-Horizon Next-Subtask Forecasting",
|
| 1377 |
"short_label": "Long step",
|
| 1378 |
+
"provenance_source": "historical_result_bundle",
|
| 1379 |
"metric_key": "macro_f1",
|
| 1380 |
"metric_name": "macro-F1",
|
| 1381 |
"metric_direction": "higher",
|
|
|
|
| 1466 |
"label": "Interaction Text Prediction",
|
| 1467 |
"axis_label": "15 Interaction Text Prediction",
|
| 1468 |
"short_label": "Interact txt",
|
| 1469 |
+
"provenance_source": "historical_result_bundle",
|
| 1470 |
"metric_key": "macro_f1",
|
| 1471 |
"metric_name": "macro-F1",
|
| 1472 |
"metric_direction": "higher",
|
|
|
|
| 1557 |
"label": "Action-Object Relation Prediction",
|
| 1558 |
"axis_label": "16 Action-Object Relation Prediction",
|
| 1559 |
"short_label": "Act+obj",
|
| 1560 |
+
"provenance_source": "historical_result_bundle",
|
| 1561 |
"metric_key": "macro_f1",
|
| 1562 |
"metric_name": "macro-F1",
|
| 1563 |
"metric_direction": "higher",
|
|
|
|
| 1648 |
"label": "Future Object-Set Forecasting",
|
| 1649 |
"axis_label": "17 Future Object-Set Forecasting",
|
| 1650 |
"short_label": "Future obj",
|
| 1651 |
+
"provenance_source": "historical_result_bundle",
|
| 1652 |
"metric_key": "micro_f1",
|
| 1653 |
"metric_name": "micro-F1",
|
| 1654 |
"metric_direction": "higher",
|
|
|
|
| 1739 |
"label": "IMU-to-Hand Pose Reconstruction",
|
| 1740 |
"axis_label": "18 IMU-to-Hand Pose Reconstruction",
|
| 1741 |
"short_label": "IMU->hand",
|
| 1742 |
+
"provenance_source": "historical_result_bundle",
|
| 1743 |
"metric_key": "mae",
|
| 1744 |
"metric_name": "MAE",
|
| 1745 |
"metric_direction": "lower",
|
|
|
|
| 1830 |
"label": "Camera-View Synchronization Retrieval",
|
| 1831 |
"axis_label": "19 Camera-View Synchronization Retrieval",
|
| 1832 |
"short_label": "Cam sync",
|
| 1833 |
+
"provenance_source": "historical_result_bundle",
|
| 1834 |
"metric_key": "mrr",
|
| 1835 |
"metric_name": "MRR",
|
| 1836 |
"metric_direction": "higher",
|
|
|
|
| 1921 |
"label": "Time-to-Next-Transition Regression",
|
| 1922 |
"axis_label": "20 Time-to-Next-Transition Regression",
|
| 1923 |
"short_label": "Time2bdry",
|
| 1924 |
+
"provenance_source": "historical_result_bundle",
|
| 1925 |
"metric_key": "mae",
|
| 1926 |
"metric_name": "MAE frames",
|
| 1927 |
"metric_direction": "lower",
|