{
"title": "Xperience-10M Official Dataset Card Alignment",
"checked_at_utc": "2026-06-01T11:14:51+00:00",
"source_urls": {
"official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
"official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
"official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
"ropedia_dataset_site": "https://ropedia.com/dataset",
"ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
"homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
},
"hf_repo_metadata_observed": {
"repo_id": "ropedia-ai/xperience-10m",
"pretty_name": "Xperience-10M",
"repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
"last_modified": "2026-04-21T05:03:45.000Z",
"gated": "manual",
"task_categories": [
"video-classification",
"image-to-text",
"depth-estimation",
"robotics"
],
"card_tags": [
"egocentric",
"first-person",
"multimodal",
"3d",
"4d",
"embodied-ai",
"robotics",
"human-motion",
"mocap",
"imu",
"audio",
"depth",
"captions",
"video"
],
"modalities": [
"3d",
"audio",
"video"
],
"language": [
"en"
],
"size_categories": [
"1M<n<10M"
],
"license": "other",
"access_note": "Gated access with manual review, granted for approved non-commercial use; an external agreement-signing step may be required before approval.",
"live_hf_page_observed": {
"source": "Hugging Face dataset page/API public metadata",
"total_file_size_display": "31.9 TB",
"used_storage_bytes_observed": 31871115497224,
"note": "This live HF-hosted file-size display is separate from the dataset card's statement that the full-scale data is about 1 PB."
},
"api_file_listing_observed": {
"scope": "public Hugging Face API metadata, not local data possession",
"sibling_count": 85258,
"session_folder_count": 803,
"episode_folder_count": 12103,
"annotation_hdf5_count": 12103,
"mp4_count": 72612,
"visualization_rrd_count": 541,
"canonical_episode_file_counts": {
"annotation.hdf5": 12103,
"fisheye_cam0.mp4": 12102,
"fisheye_cam1.mp4": 12102,
"fisheye_cam2.mp4": 12102,
"fisheye_cam3.mp4": 12102,
"stereo_left.mp4": 12102,
"stereo_right.mp4": 12102,
"visualization.rrd": 541
}
}
},
"public_sample_card_observed": {
"repo_id": "ropedia-ai/xperience-10m-sample",
"pretty_name": "Xperience-10M-Sample",
"license": "cc-by-nc-4.0",
"tags": [
"sample",
"xperience-10k"
],
"size_categories": [
"n<1K"
],
"card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
"tooling": [
"HOMIE Toolkit",
"Rerun 0.29.0 for visualization.rrd"
]
},
"official_dataset_summary": {
"description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
"experience_units": "about 10 million",
"recording_hours": "about 10,000",
"storage_described_by_card": "about 1 PB"
},
"official_scale_statistics": {
"rgb_frames": "about 2.88 billion",
"depth_frames": "about 720 million",
"camera_pose_records": "about 576 million",
"motion_capture_frames": "about 576 million",
"imu_records": "about 7.2 billion",
"caption_sentences": "about 16 million",
"caption_words": "about 200 million",
"vocabulary_words": "about 6,000",
"object_annotations": "about 350,000",
"trajectory_distance": "about 39,000 km"
},
"official_modalities": [
"six RGB video streams: four fisheye views and two rectified stereo views",
"audio embedded in the video streams",
"stereo depth and confidence",
"camera pose, SLAM trajectory, and point cloud",
"two-hand motion capture",
"full-body motion capture",
"inertial accelerometer and gyroscope streams",
"hierarchical language and caption annotations",
"metadata and calibration records"
],
"episode_layout": {
"folder_pattern": "<session_uuid>/ep<episode_id>/",
"required_for_valid_episode_in_this_repo": [
"annotation.hdf5"
],
"preferred_for_full_omni_in_this_repo": [
"fisheye_cam0.mp4",
"fisheye_cam1.mp4",
"fisheye_cam2.mp4",
"fisheye_cam3.mp4",
"stereo_left.mp4",
"stereo_right.mp4"
],
"optional_or_excluded": [
"visualization.rrd"
],
"training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
},
"annotation_hdf5_groups": [
"calibration",
"slam / camera pose",
"depth",
"hand_mocap",
"full_body_mocap",
"imu",
"video timing",
"metadata",
"caption / language annotations"
],
"official_intended_uses": [
"egocentric video and action understanding",
"task and subtask recognition",
"temporal action localization",
"action-language grounding and action captioning",
"human-object interaction analysis",
"object grounding and caption/language grounding",
"audio-visual learning and multimodal pretraining",
"embodied reasoning and world-model learning",
"robotics imitation learning",
"depth estimation, odometry, SLAM, and scene reconstruction",
"hand/body pose and human motion understanding",
"sensor fusion"
],
"current_repo_alignment": {
"validated_episode_count": 1,
"validated_frames": 5821,
"validated_windows": 1161,
"current_feature_dim": 8546,
"raw_data_redistributed": false,
"audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.",
"implemented_task_count": 12,
"neural_head_count": 12,
"covered_by_current_tasks": [
"action/subtask recognition",
"next-action prediction",
"transition and temporal diagnostics",
"hand trajectory forecasting",
"contact prediction",
"object relevance",
"caption grounding",
"cross-modal retrieval",
"modality reconstruction",
"misalignment detection"
],
"not_yet_claimed": [
"large-scale audio-visual pretraining",
"caption generation",
"depth-pixel estimation",
"SLAM estimation",
"neural rendering",
"policy learning",
"cross-episode generalization",
"Qwen3-Omni model quality on real held-out multi-episode data"
]
},
"responsible_use_boundary": [
"No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
"The public sample card lists cc-by-nc-4.0; the full gated dataset is governed by the official Ropedia/Xperience-10M access terms (Hugging Face license field: other).",
"The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
"The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
"Dataset use remains governed by the official Ropedia/Xperience-10M terms."
]
}