ropedia-xperience-10m-task-baselines / metrics /xperience10m_dataset_card_alignment.json
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
1cd1f8d verified
Raw
History Blame
7.56 kB
{
"title": "Xperience-10M Official Dataset Card Alignment",
"checked_at_utc": "2026-06-01T11:14:51+00:00",
"source_urls": {
"official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
"official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
"official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
"ropedia_dataset_site": "https://ropedia.com/dataset",
"ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
"homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
},
"hf_repo_metadata_observed": {
"repo_id": "ropedia-ai/xperience-10m",
"pretty_name": "Xperience-10M",
"repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
"last_modified": "2026-04-21T05:03:45.000Z",
"gated": "manual",
"task_categories": [
"video-classification",
"image-to-text",
"depth-estimation",
"robotics"
],
"card_tags": [
"egocentric",
"first-person",
"multimodal",
"3d",
"4d",
"embodied-ai",
"robotics",
"human-motion",
"mocap",
"imu",
"audio",
"depth",
"captions",
"video"
],
"modalities": [
"3d",
"audio",
"video"
],
"language": [
"en"
],
"size_categories": [
"1M<n<10M"
],
"license": "other",
"access_note": "Reviewed gated access for approved non-commercial use; an external agreement-signing step may be required before approval.",
"live_hf_page_observed": {
"source": "Hugging Face dataset page/API public metadata",
"total_file_size_display": "31.9 TB",
"used_storage_bytes_observed": 31871115497224,
"note": "This live HF-hosted file-size display is separate from the dataset card's about-1PB full-scale data statement."
},
"api_file_listing_observed": {
"scope": "public Hugging Face API metadata, not local data possession",
"sibling_count": 85258,
"session_folder_count": 803,
"episode_folder_count": 12103,
"annotation_hdf5_count": 12103,
"mp4_count": 72612,
"visualization_rrd_count": 541,
"canonical_episode_file_counts": {
"annotation.hdf5": 12103,
"fisheye_cam0.mp4": 12102,
"fisheye_cam1.mp4": 12102,
"fisheye_cam2.mp4": 12102,
"fisheye_cam3.mp4": 12102,
"stereo_left.mp4": 12102,
"stereo_right.mp4": 12102,
"visualization.rrd": 541
}
}
},
"public_sample_card_observed": {
"repo_id": "ropedia-ai/xperience-10m-sample",
"pretty_name": "Xperience-10M-Sample",
"license": "cc-by-nc-4.0",
"tags": [
"sample",
"xperience-10k"
],
"size_categories": [
"n<1K"
],
"card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
"tooling": [
"HOMIE Toolkit",
"Rerun 0.29.0 for visualization.rrd"
]
},
"official_dataset_summary": {
"description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
"experience_units": "about 10 million",
"recording_hours": "about 10,000",
"storage_described_by_card": "about 1 PB"
},
"official_scale_statistics": {
"rgb_frames": "about 2.88 billion",
"depth_frames": "about 720 million",
"camera_pose_records": "about 576 million",
"motion_capture_frames": "about 576 million",
"imu_records": "about 7.2 billion",
"caption_sentences": "about 16 million",
"caption_words": "about 200 million",
"vocabulary_words": "about 6,000",
"object_annotations": "about 350,000",
"trajectory_distance": "about 39,000 km"
},
"official_modalities": [
"six RGB video streams: four fisheye views and two rectified stereo views",
"audio embedded in the video streams",
"stereo depth and confidence",
"camera pose, SLAM trajectory, and point cloud",
"two-hand motion capture",
"full-body motion capture",
"inertial accelerometer and gyroscope streams",
"hierarchical language and caption annotations",
"metadata and calibration records"
],
"episode_layout": {
"folder_pattern": "<session_uuid>/ep<episode_id>/",
"required_for_valid_episode_in_this_repo": [
"annotation.hdf5"
],
"preferred_for_full_omni_in_this_repo": [
"fisheye_cam0.mp4",
"fisheye_cam1.mp4",
"fisheye_cam2.mp4",
"fisheye_cam3.mp4",
"stereo_left.mp4",
"stereo_right.mp4"
],
"optional_or_excluded": [
"visualization.rrd"
],
"training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
},
"annotation_hdf5_groups": [
"calibration",
"slam / camera pose",
"depth",
"hand_mocap",
"full_body_mocap",
"imu",
"video timing",
"metadata",
"caption / language annotations"
],
"official_intended_uses": [
"egocentric video and action understanding",
"task and subtask recognition",
"temporal action localization",
"action-language grounding and action captioning",
"human-object interaction analysis",
"object grounding and caption/language grounding",
"audio-visual learning and multimodal pretraining",
"embodied reasoning and world-model learning",
"robotics imitation learning",
"depth estimation, odometry, SLAM, and scene reconstruction",
"hand/body pose and human motion understanding",
"sensor fusion"
],
"current_repo_alignment": {
"validated_episode_count": 1,
"validated_frames": 5821,
"validated_windows": 1161,
"current_feature_dim": 8546,
"raw_data_redistributed": false,
"audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.",
"implemented_task_count": 12,
"neural_head_count": 12,
"covered_by_current_tasks": [
"action/subtask recognition",
"next-action prediction",
"transition and temporal diagnostics",
"hand trajectory forecasting",
"contact prediction",
"object relevance",
"caption grounding",
"cross-modal retrieval",
"modality reconstruction",
"misalignment detection"
],
"not_yet_claimed": [
"large-scale audio-visual pretraining",
"caption generation",
"depth-pixel estimation",
"SLAM estimation",
"neural rendering",
"policy learning",
"cross-episode generalization",
"real held-out multi-episode Qwen3-Omni model quality"
]
},
"responsible_use_boundary": [
"No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
"The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.",
"The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
"The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
"Dataset use remains governed by the official Ropedia/Xperience-10M terms."
]
}