{ "title": "Xperience-10M Official Dataset Card Alignment", "checked_at_utc": "2026-06-01T11:14:51+00:00", "source_urls": { "official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m", "official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m", "official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample", "ropedia_dataset_site": "https://ropedia.com/dataset", "ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m", "homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit" }, "hf_repo_metadata_observed": { "repo_id": "ropedia-ai/xperience-10m", "pretty_name": "Xperience-10M", "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90", "last_modified": "2026-04-21T05:03:45.000Z", "gated": "manual", "task_categories": [ "video-classification", "image-to-text", "depth-estimation", "robotics" ], "card_tags": [ "egocentric", "first-person", "multimodal", "3d", "4d", "embodied-ai", "robotics", "human-motion", "mocap", "imu", "audio", "depth", "captions", "video" ], "modalities": [ "3d", "audio", "video" ], "language": [ "en" ], "size_categories": [ "1M<n<10M" ] }, "official_file_layout": { "episode_path_pattern": "<session_id>/ep<episode_index>/", "required_for_valid_episode_in_this_repo": [ "annotation.hdf5" ], "preferred_for_full_omni_in_this_repo": [ "fisheye_cam0.mp4", "fisheye_cam1.mp4", "fisheye_cam2.mp4", "fisheye_cam3.mp4", "stereo_left.mp4", "stereo_right.mp4" ], "optional_or_excluded": [ "visualization.rrd" ], "training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles." 
}, "annotation_hdf5_groups": [ "calibration", "slam / camera pose", "depth", "hand_mocap", "full_body_mocap", "imu", "video timing", "metadata", "caption / language annotations" ], "official_intended_uses": [ "egocentric video and action understanding", "task and subtask recognition", "temporal action localization", "action-language grounding and action captioning", "human-object interaction analysis", "object grounding and caption/language grounding", "audio-visual learning and multimodal pretraining", "embodied reasoning and world-model learning", "robotics imitation learning", "depth estimation, odometry, SLAM, and scene reconstruction", "hand/body pose and human motion understanding", "sensor fusion" ], "current_repo_alignment": { "validated_episode_count": 1, "validated_frames": 5821, "validated_windows": 1161, "current_feature_dim": 8546, "raw_data_redistributed": false, "audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector as a real AAC audio block.", "implemented_task_count": 12, "neural_head_count": 12, "covered_by_current_tasks": [ "action/subtask recognition", "next-action prediction", "transition and temporal diagnostics", "hand trajectory forecasting", "contact prediction", "object relevance", "caption grounding", "cross-modal retrieval", "modality reconstruction", "misalignment detection" ], "not_yet_claimed": [ "large-scale audio-visual pretraining", "caption generation", "depth-pixel estimation", "SLAM estimation", "neural rendering", "policy learning", "cross-episode generalization", "real 32-episode Qwen3-Omni model quality" ] },
"responsible_use_boundary": [ "No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.", "The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.", "The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.", "The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.", "Dataset use remains governed by the official Ropedia/Xperience-10M terms." ] }