File size: 11,210 Bytes
{
"title": "Ropedia Xperience-10M Raw Public Sample Files",
"status": "pass",
"source_checked_utc": "2026-06-15T08:12:55+00:00",
"dataset": {
"repo_id": "ropedia-ai/xperience-10m-sample",
"repo_url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
"repo_commit": "2d80b5af8085a18ffe702e1420968308007f6204",
"license": "cc-by-nc-4.0",
"episode_id": "xperience-10m-sample",
"hosting_policy": "Raw files are streamed or downloaded from the official public sample dataset. This task-suite repo mirrors derived artifacts and metadata only."
},
"browser_preview_policy": "The website includes compact 12 second H.264/AAC fast-start preview clips derived from the official public MP4 files so video and audio can play immediately in browsers. The complete raw MP4, HDF5, and RRD source links remain the authoritative files.",
"windowization": {
"num_frames": 5821,
"num_windows": 1161,
"window_frames": 20,
"stride_frames": 5,
"fps_observed": 20.00137419266181,
"feature_dim": 8546,
"audio_sample_rate": 16000,
"note": "The public task suite converts the raw episode into overlapping 20-frame windows with a 5-frame stride."
},
"sample_tree": [
"xperience-10m-sample/",
" annotation.hdf5",
" fisheye_cam0.mp4",
" fisheye_cam1.mp4",
" fisheye_cam2.mp4",
" fisheye_cam3.mp4",
" stereo_left.mp4",
" stereo_right.mp4",
" visualization.rrd"
],
"preview_assets": [
{
"source_file": "fisheye_cam0.mp4",
"path": "assets/raw-sample-preview/fisheye_cam0_preview.mp4",
"bytes": 506126,
"duration_sec": 12,
"contains_audio": true
},
{
"source_file": "fisheye_cam1.mp4",
"path": "assets/raw-sample-preview/fisheye_cam1_preview.mp4",
"bytes": 648798,
"duration_sec": 12,
"contains_audio": false
},
{
"source_file": "fisheye_cam2.mp4",
"path": "assets/raw-sample-preview/fisheye_cam2_preview.mp4",
"bytes": 659709,
"duration_sec": 12,
"contains_audio": false
},
{
"source_file": "fisheye_cam3.mp4",
"path": "assets/raw-sample-preview/fisheye_cam3_preview.mp4",
"bytes": 330306,
"duration_sec": 12,
"contains_audio": false
},
{
"source_file": "stereo_left.mp4",
"path": "assets/raw-sample-preview/stereo_left_preview.mp4",
"bytes": 594158,
"duration_sec": 12,
"contains_audio": false
},
{
"source_file": "stereo_right.mp4",
"path": "assets/raw-sample-preview/stereo_right_preview.mp4",
"bytes": 650983,
"duration_sec": 12,
"contains_audio": false
}
],
"files": [
{
"name": "annotation.hdf5",
"kind": "hdf5_annotation_container",
"role": "Primary synchronized annotation and sensor container.",
"bytes": 1931496028,
"content_type": "application/octet-stream",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/annotation.hdf5",
"browser_behavior": "download_or_open_external",
"contains": [
"caption segments and object/interactions",
"depth and confidence arrays",
"camera pose and SLAM point cloud",
"full-body and hand mocap",
"IMU accelerometer and gyroscope",
"calibration and metadata"
],
"task_suite_use": "Loaded by HOMIE and task-suite scripts to build aligned labels, sensor features, windows, object labels, and diagnostics.",
"reader_sections": [
{
"title": "Inside",
"text": "Caption JSON, object and interaction labels, depth/confidence arrays, camera pose and SLAM values, full-body and hand mocap, IMU, calibration, timestamps, and source metadata."
},
{
"title": "How it relates",
"text": "The six MP4 streams provide synchronized video/audio. annotation.hdf5 supplies the aligned labels and non-RGB sensor timelines used to build 20-frame windows, task targets, and derived public-safe features."
},
{
"title": "Open with",
"text": "Use h5py, HDFView, or HOMIE Toolkit readers. Inspect groups/datasets selectively instead of loading the whole 1.93 GB file into memory."
}
]
},
{
"name": "fisheye_cam0.mp4",
"kind": "video_with_audio",
"role": "Fisheye camera 0 stream and the public sample audio source.",
"bytes": 89842251,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/fisheye_cam0.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/fisheye_cam0_preview.mp4",
"bytes": 506126,
"duration_sec": 12,
"codec": "h264/aac",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video_and_audio",
"contains": [
"egocentric fisheye RGB frames",
"embedded audio stream"
],
"task_suite_use": "Video features feed visual tasks; the embedded audio stream feeds audio ablation and acoustic feature blocks."
},
{
"name": "fisheye_cam1.mp4",
"kind": "video",
"role": "Fisheye camera 1 stream.",
"bytes": 127085978,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/fisheye_cam1.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/fisheye_cam1_preview.mp4",
"bytes": 648798,
"duration_sec": 12,
"codec": "h264",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video",
"contains": [
"synchronized fisheye RGB frames"
],
"task_suite_use": "Used as one of the synchronized visual feature streams."
},
{
"name": "fisheye_cam2.mp4",
"kind": "video",
"role": "Fisheye camera 2 stream.",
"bytes": 113406041,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/fisheye_cam2.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/fisheye_cam2_preview.mp4",
"bytes": 659709,
"duration_sec": 12,
"codec": "h264",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video",
"contains": [
"synchronized fisheye RGB frames"
],
"task_suite_use": "Used as one of the synchronized visual feature streams."
},
{
"name": "fisheye_cam3.mp4",
"kind": "video",
"role": "Fisheye camera 3 stream.",
"bytes": 95024162,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/fisheye_cam3.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/fisheye_cam3_preview.mp4",
"bytes": 330306,
"duration_sec": 12,
"codec": "h264",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video",
"contains": [
"synchronized fisheye RGB frames"
],
"task_suite_use": "Used as one of the synchronized visual feature streams."
},
{
"name": "stereo_left.mp4",
"kind": "video",
"role": "Left stereo RGB stream.",
"bytes": 22674748,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/stereo_left.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/stereo_left_preview.mp4",
"bytes": 594158,
"duration_sec": 12,
"codec": "h264",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video",
"contains": [
"synchronized stereo-left RGB frames"
],
"task_suite_use": "Used for visual features and multi-view consistency retrieval targets."
},
{
"name": "stereo_right.mp4",
"kind": "video",
"role": "Right stereo RGB stream.",
"bytes": 25415328,
"content_type": "video/mp4",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/stereo_right.mp4",
"browser_preview": {
"path": "assets/raw-sample-preview/stereo_right_preview.mp4",
"bytes": 650983,
"duration_sec": 12,
"codec": "h264",
"derived_from": "official raw MP4"
},
"browser_behavior": "play_video",
"contains": [
"synchronized stereo-right RGB frames"
],
"task_suite_use": "Used as a paired stereo visual stream."
},
{
"name": "visualization.rrd",
"kind": "rerun_viewer_recording",
"role": "Optional Rerun visualization bundle for the sample episode.",
"bytes": 2702924036,
"content_type": "application/octet-stream",
"url": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample/resolve/main/visualization.rrd",
"browser_behavior": "download_or_open_with_rerun",
"contains": [
"prebuilt Rerun visualization state"
],
"task_suite_use": "Not used for training or metrics; useful for external visual inspection with Rerun 0.29.0.",
"reader_sections": [
{
"title": "Inside",
"text": "A prebuilt Rerun recording of the sample episode for visual inspection of the episode timeline and viewer state."
},
{
"title": "How it relates",
"text": "It is a convenience viewer artifact beside the raw MP4 and annotation files. It helps inspect alignment, but it is not an input to the published task scores."
},
{
"title": "Open with",
"text": "Download the .rrd file and open it in Rerun 0.29.0. The browser links the official source instead of embedding the 2.70 GB binary."
}
]
}
],
"hdf5_organization": [
{
"group": "calibration",
"description": "Camera intrinsics/extrinsics and static calibration values used to align sensor streams."
},
{
"group": "caption",
"description": "JSON text with task config, action segments, object mentions, interaction descriptions, and global summary."
},
{
"group": "depth",
"description": "Depth maps and confidence channels aligned to the episode timeline."
},
{
"group": "full_body_mocap",
"description": "Full-body joint and contact signals for human motion modeling."
},
{
"group": "hand_mocap",
"description": "Left and right hand joint trajectories used by hand-motion and forecast tasks."
},
{
"group": "imu",
"description": "Accelerometer and gyroscope streams sampled at higher frequency than the video frames."
},
{
"group": "metadata",
"description": "Episode metadata, frame indexing, and source bookkeeping."
},
{
"group": "slam",
"description": "Camera trajectory, pose, and sparse SLAM point-cloud information."
},
{
"group": "video",
"description": "Video stream metadata and per-frame alignment information."
}
]
}
|