File size: 7,559 Bytes

{
  "title": "Xperience-10M Official Dataset Card Alignment",
  "checked_at_utc": "2026-06-01T11:14:51+00:00",
  "source_urls": {
    "official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
    "official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
    "official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
    "ropedia_dataset_site": "https://ropedia.com/dataset",
    "ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
    "homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
  },
  "hf_repo_metadata_observed": {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "task_categories": [
      "video-classification",
      "image-to-text",
      "depth-estimation",
      "robotics"
    ],
    "card_tags": [
      "egocentric",
      "first-person",
      "multimodal",
      "3d",
      "4d",
      "embodied-ai",
      "robotics",
      "human-motion",
      "mocap",
      "imu",
      "audio",
      "depth",
      "captions",
      "video"
    ],
    "modalities": [
      "3d",
      "audio",
      "video"
    ],
    "language": [
      "en"
    ],
    "size_categories": [
      "1M<n<10M"
    ],
    "license": "other",
    "access_note": "Gated access with manual review, granted for approved non-commercial use; an external agreement-signing step may be required before approval.",
    "live_hf_page_observed": {
      "source": "Hugging Face dataset page/API public metadata",
      "total_file_size_display": "31.9 TB",
      "used_storage_bytes_observed": 31871115497224,
      "note": "This live HF-hosted file-size display is separate from the dataset card's statement that the full-scale data is about 1 PB."
    },
    "api_file_listing_observed": {
      "scope": "public Hugging Face API metadata, not local data possession",
      "sibling_count": 85258,
      "session_folder_count": 803,
      "episode_folder_count": 12103,
      "annotation_hdf5_count": 12103,
      "mp4_count": 72612,
      "visualization_rrd_count": 541,
      "canonical_episode_file_counts": {
        "annotation.hdf5": 12103,
        "fisheye_cam0.mp4": 12102,
        "fisheye_cam1.mp4": 12102,
        "fisheye_cam2.mp4": 12102,
        "fisheye_cam3.mp4": 12102,
        "stereo_left.mp4": 12102,
        "stereo_right.mp4": 12102,
        "visualization.rrd": 541
      }
    }
  },
  "public_sample_card_observed": {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tags": [
      "sample",
      "xperience-10k"
    ],
    "size_categories": [
      "n<1K"
    ],
    "card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
    "tooling": [
      "HOMIE Toolkit",
      "Rerun 0.29.0 for visualization.rrd"
    ]
  },
  "official_dataset_summary": {
    "description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
    "experience_units": "about 10 million",
    "recording_hours": "about 10,000",
    "storage_described_by_card": "about 1 PB"
  },
  "official_scale_statistics": {
    "rgb_frames": "about 2.88 billion",
    "depth_frames": "about 720 million",
    "camera_pose_records": "about 576 million",
    "motion_capture_frames": "about 576 million",
    "imu_records": "about 7.2 billion",
    "caption_sentences": "about 16 million",
    "caption_words": "about 200 million",
    "vocabulary_words": "about 6,000",
    "object_annotations": "about 350,000",
    "trajectory_distance": "about 39,000 km"
  },
  "official_modalities": [
    "six RGB video streams: four fisheye views and two rectified stereo views",
    "audio embedded in the video streams",
    "stereo depth and confidence",
    "camera pose, SLAM trajectory, and point cloud",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial accelerometer and gyroscope streams",
    "hierarchical language and caption annotations",
    "metadata and calibration records"
  ],
  "episode_layout": {
    "folder_pattern": "<session_uuid>/ep<episode_id>/",
    "required_for_valid_episode_in_this_repo": [
      "annotation.hdf5"
    ],
    "preferred_for_full_omni_in_this_repo": [
      "fisheye_cam0.mp4",
      "fisheye_cam1.mp4",
      "fisheye_cam2.mp4",
      "fisheye_cam3.mp4",
      "stereo_left.mp4",
      "stereo_right.mp4"
    ],
    "optional_or_excluded": [
      "visualization.rrd"
    ],
    "training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
  },
  "annotation_hdf5_groups": [
    "calibration",
    "slam / camera pose",
    "depth",
    "hand_mocap",
    "full_body_mocap",
    "imu",
    "video timing",
    "metadata",
    "caption / language annotations"
  ],
  "official_intended_uses": [
    "egocentric video and action understanding",
    "task and subtask recognition",
    "temporal action localization",
    "action-language grounding and action captioning",
    "human-object interaction analysis",
    "object grounding and caption/language grounding",
    "audio-visual learning and multimodal pretraining",
    "embodied reasoning and world-model learning",
    "robotics imitation learning",
    "depth estimation, odometry, SLAM, and scene reconstruction",
    "hand/body pose and human motion understanding",
    "sensor fusion"
  ],
  "current_repo_alignment": {
    "validated_episode_count": 1,
    "validated_frames": 5821,
    "validated_windows": 1161,
    "current_feature_dim": 8546,
    "raw_data_redistributed": false,
    "audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.",
    "implemented_task_count": 12,
    "neural_head_count": 12,
    "covered_by_current_tasks": [
      "action/subtask recognition",
      "next-action prediction",
      "transition and temporal diagnostics",
      "hand trajectory forecasting",
      "contact prediction",
      "object relevance",
      "caption grounding",
      "cross-modal retrieval",
      "modality reconstruction",
      "misalignment detection"
    ],
    "not_yet_claimed": [
      "large-scale audio-visual pretraining",
      "caption generation",
      "depth-pixel estimation",
      "SLAM estimation",
      "neural rendering",
      "policy learning",
      "cross-episode generalization",
      "real held-out multi-episode Qwen3-Omni model quality"
    ]
  },
  "responsible_use_boundary": [
    "No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
    "The public sample card lists cc-by-nc-4.0; the full gated dataset is governed by the official Ropedia/Xperience-10M access terms, and its Hugging Face license field is \"other\".",
    "The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
    "The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
    "Dataset use remains governed by the official Ropedia/Xperience-10M terms."
  ]
}