File size: 7,559 Bytes
94a5118
 
9d58132
94a5118
 
 
 
 
 
 
 
a8124a8
 
 
 
 
 
 
 
94a5118
 
a8124a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94a5118
 
 
 
 
 
 
a8124a8
 
 
 
 
 
 
 
cca436c
a8124a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94a5118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8124a8
 
94a5118
 
 
 
 
 
 
 
 
 
a8124a8
 
 
 
 
 
94a5118
 
 
 
 
 
 
 
 
 
 
a8124a8
 
 
 
 
 
 
 
94a5118
 
 
 
 
 
 
 
 
 
a8124a8
94a5118
1cd1f8d
94a5118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8124a8
94a5118
 
 
 
 
 
9371cfb
94a5118
 
a8124a8
 
 
 
 
 
 
94a5118
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
{
  "title": "Xperience-10M Official Dataset Card Alignment",
  "checked_at_utc": "2026-06-01T11:14:51+00:00",
  "source_urls": {
    "official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
    "official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
    "official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
    "ropedia_dataset_site": "https://ropedia.com/dataset",
    "ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
    "homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
  },
  "hf_repo_metadata_observed": {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "task_categories": [
      "video-classification",
      "image-to-text",
      "depth-estimation",
      "robotics"
    ],
    "card_tags": [
      "egocentric",
      "first-person",
      "multimodal",
      "3d",
      "4d",
      "embodied-ai",
      "robotics",
      "human-motion",
      "mocap",
      "imu",
      "audio",
      "depth",
      "captions",
      "video"
    ],
    "modalities": [
      "3d",
      "audio",
      "video"
    ],
    "language": [
      "en"
    ],
    "size_categories": [
      "1M<n<10M"
    ],
    "license": "other",
    "access_note": "Reviewed gated access for approved non-commercial use; an external agreement-signing step may be required before approval.",
    "live_hf_page_observed": {
      "source": "Hugging Face dataset page/API public metadata",
      "total_file_size_display": "31.9 TB",
      "used_storage_bytes_observed": 31871115497224,
      "note": "This live HF-hosted file-size display is separate from the dataset card's about-1PB full-scale data statement."
    },
    "api_file_listing_observed": {
      "scope": "public Hugging Face API metadata, not local data possession",
      "sibling_count": 85258,
      "session_folder_count": 803,
      "episode_folder_count": 12103,
      "annotation_hdf5_count": 12103,
      "mp4_count": 72612,
      "visualization_rrd_count": 541,
      "canonical_episode_file_counts": {
        "annotation.hdf5": 12103,
        "fisheye_cam0.mp4": 12102,
        "fisheye_cam1.mp4": 12102,
        "fisheye_cam2.mp4": 12102,
        "fisheye_cam3.mp4": 12102,
        "stereo_left.mp4": 12102,
        "stereo_right.mp4": 12102,
        "visualization.rrd": 541
      }
    }
  },
  "public_sample_card_observed": {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tags": [
      "sample",
      "xperience-10k"
    ],
    "size_categories": [
      "n<1K"
    ],
    "card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
    "tooling": [
      "HOMIE Toolkit",
      "Rerun 0.29.0 for visualization.rrd"
    ]
  },
  "official_dataset_summary": {
    "description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
    "experience_units": "about 10 million",
    "recording_hours": "about 10,000",
    "storage_described_by_card": "about 1 PB"
  },
  "official_scale_statistics": {
    "rgb_frames": "about 2.88 billion",
    "depth_frames": "about 720 million",
    "camera_pose_records": "about 576 million",
    "motion_capture_frames": "about 576 million",
    "imu_records": "about 7.2 billion",
    "caption_sentences": "about 16 million",
    "caption_words": "about 200 million",
    "vocabulary_words": "about 6,000",
    "object_annotations": "about 350,000",
    "trajectory_distance": "about 39,000 km"
  },
  "official_modalities": [
    "six RGB video streams: four fisheye views and two rectified stereo views",
    "audio embedded in the video streams",
    "stereo depth and confidence",
    "camera pose, SLAM trajectory, and point cloud",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial accelerometer and gyroscope streams",
    "hierarchical language and caption annotations",
    "metadata and calibration records"
  ],
  "episode_layout": {
    "folder_pattern": "<session_uuid>/ep<episode_id>/",
    "required_for_valid_episode_in_this_repo": [
      "annotation.hdf5"
    ],
    "preferred_for_full_omni_in_this_repo": [
      "fisheye_cam0.mp4",
      "fisheye_cam1.mp4",
      "fisheye_cam2.mp4",
      "fisheye_cam3.mp4",
      "stereo_left.mp4",
      "stereo_right.mp4"
    ],
    "optional_or_excluded": [
      "visualization.rrd"
    ],
    "training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
  },
  "annotation_hdf5_groups": [
    "calibration",
    "slam / camera pose",
    "depth",
    "hand_mocap",
    "full_body_mocap",
    "imu",
    "video timing",
    "metadata",
    "caption / language annotations"
  ],
  "official_intended_uses": [
    "egocentric video and action understanding",
    "task and subtask recognition",
    "temporal action localization",
    "action-language grounding and action captioning",
    "human-object interaction analysis",
    "object grounding and caption/language grounding",
    "audio-visual learning and multimodal pretraining",
    "embodied reasoning and world-model learning",
    "robotics imitation learning",
    "depth estimation, odometry, SLAM, and scene reconstruction",
    "hand/body pose and human motion understanding",
    "sensor fusion"
  ],
  "current_repo_alignment": {
    "validated_episode_count": 1,
    "validated_frames": 5821,
    "validated_windows": 1161,
    "current_feature_dim": 8546,
    "raw_data_redistributed": false,
    "audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.",
    "implemented_task_count": 12,
    "neural_head_count": 12,
    "covered_by_current_tasks": [
      "action/subtask recognition",
      "next-action prediction",
      "transition and temporal diagnostics",
      "hand trajectory forecasting",
      "contact prediction",
      "object relevance",
      "caption grounding",
      "cross-modal retrieval",
      "modality reconstruction",
      "misalignment detection"
    ],
    "not_yet_claimed": [
      "large-scale audio-visual pretraining",
      "caption generation",
      "depth-pixel estimation",
      "SLAM estimation",
      "neural rendering",
      "policy learning",
      "cross-episode generalization",
      "real held-out multi-episode Qwen3-Omni model quality"
    ]
  },
  "responsible_use_boundary": [
    "No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
    "The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.",
    "The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
    "The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
    "Dataset use remains governed by the official Ropedia/Xperience-10M terms."
  ]
}