ropedia-xperience-10m-task-baselines / metrics /xperience10m_dataset_card_alignment.json

Publish Ropedia Xperience-10M task baseline cards

1cd1f8d verified 25 days ago

7.56 kB

	{
	"title": "Xperience-10M Official Dataset Card Alignment",
	"checked_at_utc": "2026-06-01T11:14:51+00:00",
	"source_urls": {
	"official_hf_dataset": "https://huggingface.co/datasets/ropedia-ai/xperience-10m",
	"official_hf_api": "https://huggingface.co/api/datasets/ropedia-ai/xperience-10m",
	"official_sample": "https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample",
	"ropedia_dataset_site": "https://ropedia.com/dataset",
	"ropedia_release_page": "https://ropedia.com/blog/20260316_xperience_10m",
	"homie_toolkit": "https://github.com/Ropedia/HOMIE-toolkit"
	},
	"hf_repo_metadata_observed": {
	"repo_id": "ropedia-ai/xperience-10m",
	"pretty_name": "Xperience-10M",
	"repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
	"last_modified": "2026-04-21T05:03:45.000Z",
	"gated": "manual",
	"task_categories": [
	"video-classification",
	"image-to-text",
	"depth-estimation",
	"robotics"
	],
	"card_tags": [
	"egocentric",
	"first-person",
	"multimodal",
	"3d",
	"4d",
	"embodied-ai",
	"robotics",
	"human-motion",
	"mocap",
	"imu",
	"audio",
	"depth",
	"captions",
	"video"
	],
	"modalities": [
	"3d",
	"audio",
	"video"
	],
	"language": [
	"en"
	],
	"size_categories": [
	"1M<n<10M"
	],
	"license": "other",
	"access_note": "Reviewed gated access for approved non-commercial use; an external agreement-signing step may be required before approval.",
	"live_hf_page_observed": {
	"source": "Hugging Face dataset page/API public metadata",
	"total_file_size_display": "31.9 TB",
	"used_storage_bytes_observed": 31871115497224,
	"note": "This live HF-hosted file-size display is separate from the dataset card's about-1PB full-scale data statement."
	},
	"api_file_listing_observed": {
	"scope": "public Hugging Face API metadata, not local data possession",
	"sibling_count": 85258,
	"session_folder_count": 803,
	"episode_folder_count": 12103,
	"annotation_hdf5_count": 12103,
	"mp4_count": 72612,
	"visualization_rrd_count": 541,
	"canonical_episode_file_counts": {
	"annotation.hdf5": 12103,
	"fisheye_cam0.mp4": 12102,
	"fisheye_cam1.mp4": 12102,
	"fisheye_cam2.mp4": 12102,
	"fisheye_cam3.mp4": 12102,
	"stereo_left.mp4": 12102,
	"stereo_right.mp4": 12102,
	"visualization.rrd": 541
	}
	}
	},
	"public_sample_card_observed": {
	"repo_id": "ropedia-ai/xperience-10m-sample",
	"pretty_name": "Xperience-10M-Sample",
	"license": "cc-by-nc-4.0",
	"tags": [
	"sample",
	"xperience-10k"
	],
	"size_categories": [
	"n<1K"
	],
	"card_summary": "A sample episode for Xperience-10M. The card says videos and annotations can be downloaded and inspected with HOMIE Toolkit, and the RRD file can be visualized with Rerun 0.29.0.",
	"tooling": [
	"HOMIE Toolkit",
	"Rerun 0.29.0 for visualization.rrd"
	]
	},
	"official_dataset_summary": {
	"description": "Large-scale egocentric multimodal human-experience data for embodied AI, robotics, world models, and spatial intelligence.",
	"experience_units": "about 10 million",
	"recording_hours": "about 10,000",
	"storage_described_by_card": "about 1 PB"
	},
	"official_scale_statistics": {
	"rgb_frames": "about 2.88 billion",
	"depth_frames": "about 720 million",
	"camera_pose_records": "about 576 million",
	"motion_capture_frames": "about 576 million",
	"imu_records": "about 7.2 billion",
	"caption_sentences": "about 16 million",
	"caption_words": "about 200 million",
	"vocabulary_words": "about 6,000",
	"object_annotations": "about 350,000",
	"trajectory_distance": "about 39,000 km"
	},
	"official_modalities": [
	"six RGB video streams: four fisheye views and two rectified stereo views",
	"audio embedded in the video streams",
	"stereo depth and confidence",
	"camera pose, SLAM trajectory, and point cloud",
	"two-hand motion capture",
	"full-body motion capture",
	"inertial accelerometer and gyroscope streams",
	"hierarchical language and caption annotations",
	"metadata and calibration records"
	],
	"episode_layout": {
	"folder_pattern": "<session_uuid>/ep<episode_id>/",
	"required_for_valid_episode_in_this_repo": [
	"annotation.hdf5"
	],
	"preferred_for_full_omni_in_this_repo": [
	"fisheye_cam0.mp4",
	"fisheye_cam1.mp4",
	"fisheye_cam2.mp4",
	"fisheye_cam3.mp4",
	"stereo_left.mp4",
	"stereo_right.mp4"
	],
	"optional_or_excluded": [
	"visualization.rrd"
	],
	"training_policy": "Use annotation.hdf5 plus available MP4 streams; keep visualization.rrd for optional human inspection only and exclude it from training/public bundles."
	},
	"annotation_hdf5_groups": [
	"calibration",
	"slam / camera pose",
	"depth",
	"hand_mocap",
	"full_body_mocap",
	"imu",
	"video timing",
	"metadata",
	"caption / language annotations"
	],
	"official_intended_uses": [
	"egocentric video and action understanding",
	"task and subtask recognition",
	"temporal action localization",
	"action-language grounding and action captioning",
	"human-object interaction analysis",
	"object grounding and caption/language grounding",
	"audio-visual learning and multimodal pretraining",
	"embodied reasoning and world-model learning",
	"robotics imitation learning",
	"depth estimation, odometry, SLAM, and scene reconstruction",
	"hand/body pose and human motion understanding",
	"sensor fusion"
	],
	"current_repo_alignment": {
	"validated_episode_count": 1,
	"validated_frames": 5821,
	"validated_windows": 1161,
	"current_feature_dim": 8546,
	"raw_data_redistributed": false,
	"audio_feature_status": "Audio is present in the sample MP4 streams and extracted into the current baseline feature vector.",
	"implemented_task_count": 12,
	"neural_head_count": 12,
	"covered_by_current_tasks": [
	"action/subtask recognition",
	"next-action prediction",
	"transition and temporal diagnostics",
	"hand trajectory forecasting",
	"contact prediction",
	"object relevance",
	"caption grounding",
	"cross-modal retrieval",
	"modality reconstruction",
	"misalignment detection"
	],
	"not_yet_claimed": [
	"large-scale audio-visual pretraining",
	"caption generation",
	"depth-pixel estimation",
	"SLAM estimation",
	"neural rendering",
	"policy learning",
	"cross-episode generalization",
	"real held-out multi-episode Qwen3-Omni model quality"
	]
	},
	"responsible_use_boundary": [
	"No raw MP4, raw annotation.hdf5, private gated data, raw visualization.rrd, or full Qwen weights are redistributed.",
	"The public sample card lists cc-by-nc-4.0; the full gated dataset uses the official Ropedia/Xperience-10M access terms and license field.",
	"The official card describes the open-source dataset as limited in diversity and showcase/production quality, so robust evaluation and downstream safeguards are still required.",
	"The project does not support identity recognition, re-identification, biometric profiling, surveillance, sensitive attribute inference, or safety-critical deployment.",
	"Dataset use remains governed by the official Ropedia/Xperience-10M terms."
	]
	}