Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Xperience-10M Official Dataset Card Alignment", | |
| "checked_at_utc": "2026-06-01T11:14:51+00:00", | |
| "source_urls": { | |
| "official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m", | |
| "official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m", | |
| "official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample", | |
| "ropedia_dataset_site": "https://ropedia.com/dataset", | |
| "ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m", | |
| "homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit" | |
| }, | |
| "hf_repo_metadata_observed": { | |
| "repo_id": "ropedia-ai/xperience-10m", | |
| "pretty_name": "Xperience-10M", | |
| "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90", | |
| "last_modified": "2026-04-21T05:03:45.000Z", | |
| "gated": "manual", | |
| "task_categories": [ | |
| "video-classification", | |
| "image-to-text", | |
| "depth-estimation", | |
| "robotics" | |
| ], | |
| "card_tags": [ | |
| "egocentric", | |
| "first-person", | |
| "multimodal", | |
| "3d", | |
| "4d", | |
| "embodied-ai", | |
| "robotics", | |
| "human-motion", | |
| "mocap", | |
| "imu", | |
| "audio", | |
| "depth", | |
| "captions", | |
| "video" | |
| ], | |
| "modalities": [ | |
| "3d", | |
| "audio", | |
| "video" | |
| ], | |
| "language": [ | |
| "en" | |
| ], | |
| "size_categories": [ | |
| "1M<n<10M" | |
| ], | |
| "license": "other", | |
| "access_note": "Access is gated and manually reviewed for approved non-commercial use; an external agreement-signing step may be required before approval.", | |
| "live_hf_page_observed": { | |
| "source": "Hugging Face dataset page/API public metadata", | |
| "total_file_size_display": "31.9 TB", | |
| "used_storage_bytes_observed": 31871115497224, | |
| "note": "This live HF-hosted file-size display is separate from the dataset card's statement that the full-scale data is about 1 PB." | |
| }, | |
| "api_file_listing_observed": { | |
| "scope": "public Hugging Face API metadata, not local data possession", | |
| "sibling_count": 85258, | |
| "session_folder_count": 803, | |
| "episode_folder_count": 12103, | |
| "annotation_hdf5_count": 12103, | |
| "mp4_count": 72612, | |
| "visualization_rrd_count": 541, | |
| "canonical_episode_file_counts": { | |
| "annotation.hdf5": 12103, | |
| "fisheye_cam0.mp4": 12102, | |
| "fisheye_cam1.mp4": 12102, | |
| "fisheye_cam2.mp4": 12102, | |
| "fisheye_cam3.mp4": 12102, | |
| "stereo_left.mp4": 12102, | |
| "stereo_right.mp4": 12102, | |
| "visualization.rrd": 541 | |
| } | |
| } | |
| }, | |
| "public_sample_card_observed": { | |
| "repo_id": "ropedia-ai/xperience-10m-sample", | |
| "pretty_name": "Xperience-10M-Sample", | |
| "license": "cc-by-nc-4.0", | |
| "tags": [ | |
| "sample", | |
| "xperience-10k" | |
| ], | |
| "size_categories": [ | |
| "n<1K" | |
| ], | |
| "card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.", | |
| "tooling": [ | |
| "HOMIE Toolkit", | |
| "Rerun 0.29.0 for visualization.rrd" | |
| ] | |
| }, | |
| "official_dataset_summary": { | |
| "description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.", | |
| "experience_units": "about 10 million", | |
| "recording_hours": "about 10,000", | |
| "storage_described_by_card": "about 1 PB" | |
| }, | |
| "official_scale_statistics": { | |
| "rgb_frames": "about 2.88 billion", | |
| "depth_frames": "about 720 million", | |
| "camera_pose_records": "about 576 million", | |
| "motion_capture_frames": "about 576 million", | |
| "imu_records": "about 7.2 billion", | |
| "caption_sentences": "about 16 million", | |
| "caption_words": "about 200 million", | |
| "vocabulary_words": "about 6,000", | |
| "object_annotations": "about 350,000", | |
| "trajectory_distance": "about 39,000 km" | |
| }, | |
| "official_modalities": [ | |
| "six RGB video streams: four fisheye views and two rectified stereo views", | |
| "audio embedded in the video streams", | |
| "stereo depth and confidence", | |
| "camera pose, SLAM trajectory, and point cloud", | |
| "two-hand motion capture", | |
| "full-body motion capture", | |
| "inertial accelerometer and gyroscope streams", | |
| "hierarchical language and caption annotations", | |
| "metadata and calibration records" | |
| ], | |
| "episode_layout": { | |
| "folder_pattern": "<session_uuid>/ep<episode_id>/", | |
| "required_for_valid_episode_in_this_repo": [ | |
| "annotation.hdf5" | |
| ], | |
| "preferred_for_full_omni_in_this_repo": [ | |
| "fisheye_cam0.mp4", | |
| "fisheye_cam1.mp4", | |
| "fisheye_cam2.mp4", | |
| "fisheye_cam3.mp4", | |
| "stereo_left.mp4", | |
| "stereo_right.mp4" | |
| ], | |
| "optional_or_excluded": [ | |
| "visualization.rrd" | |
| ], | |
| "training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles." | |
| }, | |
| "annotation_hdf5_groups": [ | |
| "calibration", | |
| "slam / camera pose", | |
| "depth", | |
| "hand_mocap", | |
| "full_body_mocap", | |
| "imu", | |
| "video timing", | |
| "metadata", | |
| "caption / language annotations" | |
| ], | |
| "official_intended_uses": [ | |
| "egocentric video and action understanding", | |
| "task and subtask recognition", | |
| "temporal action localization", | |
| "action-language grounding and action captioning", | |
| "human-object interaction analysis", | |
| "object grounding and caption/language grounding", | |
| "audio-visual learning and multimodal pretraining", | |
| "embodied reasoning and world-model learning", | |
| "robotics imitation learning", | |
| "depth estimation, odometry, SLAM, and scene reconstruction", | |
| "hand/body pose and human motion understanding", | |
| "sensor fusion" | |
| ], | |
| "current_repo_alignment": { | |
| "validated_episode_count": 1, | |
| "validated_frames": 5821, | |
| "validated_windows": 1161, | |
| "current_feature_dim": 8546, | |
| "raw_data_redistributed": false, | |
| "audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.", | |
| "implemented_task_count": 12, | |
| "neural_head_count": 12, | |
| "covered_by_current_tasks": [ | |
| "action/subtask recognition", | |
| "next-action prediction", | |
| "transition and temporal diagnostics", | |
| "hand trajectory forecasting", | |
| "contact prediction", | |
| "object relevance", | |
| "caption grounding", | |
| "cross-modal retrieval", | |
| "modality reconstruction", | |
| "misalignment detection" | |
| ], | |
| "not_yet_claimed": [ | |
| "large-scale audio-visual pretraining", | |
| "caption generation", | |
| "depth-pixel estimation", | |
| "SLAM estimation", | |
| "neural rendering", | |
| "policy learning", | |
| "cross-episode generalization", | |
| "real held-out multi-episode Qwen3-Omni model quality" | |
| ] | |
| }, | |
| "responsible_use_boundary": [ | |
| "No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.", | |
| "The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.", | |
| "The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.", | |
| "The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.", | |
| "Dataset use remains governed by the official Ropedia/Xperience-10M terms." | |
| ] | |
| } | |