Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-03T15:11:33+00:00", | |
| "repo_id": "ropedia-ai/xperience-10m", | |
| "download_policy": "annotation.hdf5 only; no videos or visualization.rrd downloaded", | |
| "cache_note": "raw annotation files were cached outside the published repo", | |
| "probes": [ | |
| { | |
| "repo_filename": "9cecac72-8874-4b97-9541-18d4858f8e43/ep10/annotation.hdf5", | |
| "inspection": { | |
| "cache_note": "annotation file cached outside the published repo", | |
| "local_bytes": 6687192, | |
| "local_human": "6.38 MiB", | |
| "top_level_keys": [ | |
| "calibration", | |
| "caption", | |
| "depth", | |
| "full_body_mocap", | |
| "hand_mocap", | |
| "imu", | |
| "metadata", | |
| "slam", | |
| "video" | |
| ], | |
| "dataset_count": 65, | |
| "dataset_first_dim_histogram_top20": { | |
| "20": 27, | |
| "4": 14, | |
| "190": 3, | |
| "47": 1 | |
| }, | |
| "top_group_stats": { | |
| "calibration": { | |
| "dataset_count": 23, | |
| "max_first_dim": 4, | |
| "first_dim_values": { | |
| "4": 14 | |
| } | |
| }, | |
| "caption": { | |
| "dataset_count": 1, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "depth": { | |
| "dataset_count": 5, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| }, | |
| "full_body_mocap": { | |
| "dataset_count": 9, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 9 | |
| } | |
| }, | |
| "hand_mocap": { | |
| "dataset_count": 10, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 10 | |
| } | |
| }, | |
| "imu": { | |
| "dataset_count": 4, | |
| "max_first_dim": 190, | |
| "first_dim_values": { | |
| "190": 3, | |
| "20": 1 | |
| } | |
| }, | |
| "metadata": { | |
| "dataset_count": 6, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "slam": { | |
| "dataset_count": 4, | |
| "max_first_dim": 47, | |
| "first_dim_values": { | |
| "20": 3, | |
| "47": 1 | |
| } | |
| }, | |
| "video": { | |
| "dataset_count": 3, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| } | |
| }, | |
| "max_first_dim_dataset": { | |
| "path": "imu/accel_xyz", | |
| "shape": [ | |
| 190, | |
| 3 | |
| ], | |
| "dtype": "float64", | |
| "first_dim": 190, | |
| "storage_bytes": 4560, | |
| "storage_human": "4.45 KiB" | |
| }, | |
| "text_action_interaction_related_datasets": [ | |
| { | |
| "path": "caption", | |
| "shape": [], | |
| "dtype": "object", | |
| "first_dim": null, | |
| "storage_bytes": 16, | |
| "storage_human": "16.00 B", | |
| "sample_values": [ | |
| "{\"config\": {\"segment_sec\": 20, \"sample_fps\": 0.5, \"total_tokens\": 2047, \"Main Task\": \"Packing items into a plastic bin. The person is placing various items into a clear plastic storage container.\"}, \"segments\": [{\"segment_id\": 0, \"start_frame\": \"82777404821554\", \"end_frame\": \"frame_0000021\", \"Sub Task\": \"Packing items into a plastic bin\", \"Current Action\": [{\"label\": \"Arrange items in bin\", \"description\": \"The person adjusts the position of items inside the plastic storage container to ensure they are organized.\", \"start_frame\": 82777404821554, \"end_frame\": 82777404821554}], \"sampled_frames\": {\"Image 1\": 82777404821554}, \"objects\": {\"82777404821554\": [\"plastic storage bin\", \"hand\", \"cardboard box\"]}, \"interaction\": {\"82777404821554\": \"The hand is reaching into and organizing items inside the plastic storage bin.\"}, \"api_call_start\": \"2026-03-12T19:33:43.280472\", \"api_call_end\": \"2026-03-12T19:33:44.979810\", \"tokens_in\": 1842, \"tokens_out\": 205}], \"global_summary\": \"The video depicts the process of organizing and packing various personal items into a plastic storage container. It focuses on the practical task of tidying up or preparing belongings for storage.\"}" | |
| ] | |
| } | |
| ], | |
| "caption_json_summary": { | |
| "parse_status": "ok", | |
| "json_bytes": 1178, | |
| "top_keys": [ | |
| "config", | |
| "segments", | |
| "global_summary" | |
| ], | |
| "config": { | |
| "segment_sec": 20, | |
| "sample_fps": 0.5, | |
| "total_tokens": 2047, | |
| "Main Task": "Packing items into a plastic bin. The person is placing various items into a clear plastic storage container." | |
| }, | |
| "segment_count": 1, | |
| "current_action_count": 1, | |
| "unique_sub_task_count": 1, | |
| "unique_action_label_count": 1, | |
| "object_frame_count": 1, | |
| "interaction_frame_count": 1, | |
| "sampled_frame_count": 1, | |
| "unique_object_count": 3, | |
| "sub_tasks": [ | |
| "Packing items into a plastic bin" | |
| ], | |
| "action_labels": [ | |
| "Arrange items in bin" | |
| ], | |
| "objects": [ | |
| "cardboard box", | |
| "hand", | |
| "plastic storage bin" | |
| ], | |
| "global_summary_preview": "The video depicts the process of organizing and packing various personal items into a plastic storage container. It focuses on the practical task of tidying up or preparing belongings for storage." | |
| } | |
| } | |
| }, | |
| { | |
| "repo_filename": "cdc1ae12-a460-48ac-a892-7d314095c4b1/ep23/annotation.hdf5", | |
| "inspection": { | |
| "cache_note": "annotation file cached outside the published repo", | |
| "local_bytes": 6687256, | |
| "local_human": "6.38 MiB", | |
| "top_level_keys": [ | |
| "calibration", | |
| "caption", | |
| "depth", | |
| "full_body_mocap", | |
| "hand_mocap", | |
| "imu", | |
| "metadata", | |
| "slam", | |
| "video" | |
| ], | |
| "dataset_count": 65, | |
| "dataset_first_dim_histogram_top20": { | |
| "20": 27, | |
| "4": 14, | |
| "188": 3, | |
| "128": 1 | |
| }, | |
| "top_group_stats": { | |
| "calibration": { | |
| "dataset_count": 23, | |
| "max_first_dim": 4, | |
| "first_dim_values": { | |
| "4": 14 | |
| } | |
| }, | |
| "caption": { | |
| "dataset_count": 1, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "depth": { | |
| "dataset_count": 5, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| }, | |
| "full_body_mocap": { | |
| "dataset_count": 9, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 9 | |
| } | |
| }, | |
| "hand_mocap": { | |
| "dataset_count": 10, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 10 | |
| } | |
| }, | |
| "imu": { | |
| "dataset_count": 4, | |
| "max_first_dim": 188, | |
| "first_dim_values": { | |
| "188": 3, | |
| "20": 1 | |
| } | |
| }, | |
| "metadata": { | |
| "dataset_count": 6, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "slam": { | |
| "dataset_count": 4, | |
| "max_first_dim": 128, | |
| "first_dim_values": { | |
| "20": 3, | |
| "128": 1 | |
| } | |
| }, | |
| "video": { | |
| "dataset_count": 3, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| } | |
| }, | |
| "max_first_dim_dataset": { | |
| "path": "imu/accel_xyz", | |
| "shape": [ | |
| 188, | |
| 3 | |
| ], | |
| "dtype": "float64", | |
| "first_dim": 188, | |
| "storage_bytes": 4512, | |
| "storage_human": "4.41 KiB" | |
| }, | |
| "text_action_interaction_related_datasets": [ | |
| { | |
| "path": "caption", | |
| "shape": [], | |
| "dtype": "object", | |
| "first_dim": null, | |
| "storage_bytes": 16, | |
| "storage_human": "16.00 B", | |
| "sample_values": [ | |
| "{\"config\": {\"segment_sec\": 20, \"sample_fps\": 0.5, \"total_tokens\": 2035, \"Main Task\": \"Putting on socks. The person is standing in a bathroom and putting on their socks.\"}, \"segments\": [{\"segment_id\": 0, \"start_frame\": \"78968504788029\", \"end_frame\": \"78969405629613\", \"Sub Task\": \"Putting on socks\", \"Current Action\": [{\"label\": \"Pulling up sock\", \"description\": \"The person is manually adjusting and pulling up a sock on their foot.\", \"start_frame\": 78968504788029, \"end_frame\": 78968504788029}], \"sampled_frames\": {\"Image 1\": 78968504788029}, \"objects\": {\"78968504788029\": [\"sock\", \"feet\", \"bathroom floor\", \"toilet\"]}, \"interaction\": {\"78968504788029\": \"The person's hands are gripping and pulling on the fabric of the sock to adjust it over their foot.\"}, \"api_call_start\": \"2026-03-11T07:58:15.838321\", \"api_call_end\": \"2026-03-11T07:58:17.411279\", \"tokens_in\": 1839, \"tokens_out\": 196}], \"global_summary\": \"The video focuses on the simple, everyday task of putting on socks. It provides a brief look at this routine action as the central theme.\"}" | |
| ] | |
| } | |
| ], | |
| "caption_json_summary": { | |
| "parse_status": "ok", | |
| "json_bytes": 1051, | |
| "top_keys": [ | |
| "config", | |
| "segments", | |
| "global_summary" | |
| ], | |
| "config": { | |
| "segment_sec": 20, | |
| "sample_fps": 0.5, | |
| "total_tokens": 2035, | |
| "Main Task": "Putting on socks. The person is standing in a bathroom and putting on their socks." | |
| }, | |
| "segment_count": 1, | |
| "current_action_count": 1, | |
| "unique_sub_task_count": 1, | |
| "unique_action_label_count": 1, | |
| "object_frame_count": 1, | |
| "interaction_frame_count": 1, | |
| "sampled_frame_count": 1, | |
| "unique_object_count": 4, | |
| "sub_tasks": [ | |
| "Putting on socks" | |
| ], | |
| "action_labels": [ | |
| "Pulling up sock" | |
| ], | |
| "objects": [ | |
| "bathroom floor", | |
| "feet", | |
| "sock", | |
| "toilet" | |
| ], | |
| "global_summary_preview": "The video focuses on the simple, everyday task of putting on socks. It provides a brief look at this routine action as the central theme." | |
| } | |
| } | |
| }, | |
| { | |
| "repo_filename": "10282b64-a955-461e-9ef9-a1ddf8dc619a/ep5/annotation.hdf5", | |
| "inspection": { | |
| "cache_note": "annotation file cached outside the published repo", | |
| "local_bytes": 6706448, | |
| "local_human": "6.40 MiB", | |
| "top_level_keys": [ | |
| "calibration", | |
| "caption", | |
| "depth", | |
| "full_body_mocap", | |
| "hand_mocap", | |
| "imu", | |
| "metadata", | |
| "slam", | |
| "video" | |
| ], | |
| "dataset_count": 65, | |
| "dataset_first_dim_histogram_top20": { | |
| "20": 27, | |
| "4": 14, | |
| "190": 3, | |
| "837": 1 | |
| }, | |
| "top_group_stats": { | |
| "calibration": { | |
| "dataset_count": 23, | |
| "max_first_dim": 4, | |
| "first_dim_values": { | |
| "4": 14 | |
| } | |
| }, | |
| "caption": { | |
| "dataset_count": 1, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "depth": { | |
| "dataset_count": 5, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| }, | |
| "full_body_mocap": { | |
| "dataset_count": 9, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 9 | |
| } | |
| }, | |
| "hand_mocap": { | |
| "dataset_count": 10, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 10 | |
| } | |
| }, | |
| "imu": { | |
| "dataset_count": 4, | |
| "max_first_dim": 190, | |
| "first_dim_values": { | |
| "190": 3, | |
| "20": 1 | |
| } | |
| }, | |
| "metadata": { | |
| "dataset_count": 6, | |
| "max_first_dim": 0, | |
| "first_dim_values": {} | |
| }, | |
| "slam": { | |
| "dataset_count": 4, | |
| "max_first_dim": 837, | |
| "first_dim_values": { | |
| "20": 3, | |
| "837": 1 | |
| } | |
| }, | |
| "video": { | |
| "dataset_count": 3, | |
| "max_first_dim": 20, | |
| "first_dim_values": { | |
| "20": 2 | |
| } | |
| } | |
| }, | |
| "max_first_dim_dataset": { | |
| "path": "slam/point_cloud", | |
| "shape": [ | |
| 837, | |
| 3 | |
| ], | |
| "dtype": "float64", | |
| "first_dim": 837, | |
| "storage_bytes": 20088, | |
| "storage_human": "19.62 KiB" | |
| }, | |
| "text_action_interaction_related_datasets": [ | |
| { | |
| "path": "caption", | |
| "shape": [], | |
| "dtype": "object", | |
| "first_dim": null, | |
| "storage_bytes": 16, | |
| "storage_human": "16.00 B", | |
| "sample_values": [ | |
| "{\"config\": {\"segment_sec\": 20, \"sample_fps\": 0.5, \"total_tokens\": 2060, \"Main Task\": \"walking through a retail store. The video shows a first-person perspective of someone walking through a retail aisle lined with shelves of products, while other people are seated nearby.\"}, \"segments\": [{\"segment_id\": 0, \"start_frame\": \"78307554787048\", \"end_frame\": \"frame_0000021\", \"Sub Task\": \"walking through a retail store\", \"Current Action\": [{\"label\": \"Walk down retail aisle\", \"description\": \"The camera operator is walking along a retail aisle while observing merchandise on the shelves and other people in the store.\", \"start_frame\": \"78307554787048\", \"end_frame\": \"78307554787048\"}], \"sampled_frames\": {\"Image 1\": 78307554787048}, \"objects\": {\"78307554787048\": [\"retail shelf\", \"product packaging\", \"shopping bags\", \"person seated\"]}, \"interaction\": {\"78307554787048\": \"The individual is walking through the store environment, passing by shelved products and people.\"}, \"api_call_start\": \"2026-03-12T13:50:32.723710\", \"api_call_end\": \"2026-03-12T13:50:34.677914\", \"tokens_in\": 1856, \"tokens_out\": 204}], \"global_summary\": \"The video captures a casual, observational stroll through a retail store environment. It focuses on the experience of navigating the aisles and browsing the products on display.\"}" | |
| ] | |
| } | |
| ], | |
| "caption_json_summary": { | |
| "parse_status": "ok", | |
| "json_bytes": 1299, | |
| "top_keys": [ | |
| "config", | |
| "segments", | |
| "global_summary" | |
| ], | |
| "config": { | |
| "segment_sec": 20, | |
| "sample_fps": 0.5, | |
| "total_tokens": 2060, | |
| "Main Task": "walking through a retail store. The video shows a first-person perspective of someone walking through a retail aisle lined with shelves of products, while other people are seated nearby." | |
| }, | |
| "segment_count": 1, | |
| "current_action_count": 1, | |
| "unique_sub_task_count": 1, | |
| "unique_action_label_count": 1, | |
| "object_frame_count": 1, | |
| "interaction_frame_count": 1, | |
| "sampled_frame_count": 1, | |
| "unique_object_count": 4, | |
| "sub_tasks": [ | |
| "walking through a retail store" | |
| ], | |
| "action_labels": [ | |
| "Walk down retail aisle" | |
| ], | |
| "objects": [ | |
| "person seated", | |
| "product packaging", | |
| "retail shelf", | |
| "shopping bags" | |
| ], | |
| "global_summary_preview": "The video captures a casual, observational stroll through a retail store environment. It focuses on the experience of navigating the aisles and browsing the products on display." | |
| } | |
| } | |
| } | |
| ] | |
| } | |