Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Three Foundation Pipeline Tracks", | |
| "status": "pipeline_plan", | |
| "source_document": "THREE_FOUNDATION_PIPELINES.md", | |
| "reader_note": "These are supported pipeline directions with concrete data exports, training recipes, and evaluation gates.", | |
| "diagram_assets": { | |
| "status": "published_high_resolution_slide_diagrams", | |
| "asset_root": "docs/assets/foundation-pipelines", | |
| "source": "Clean direction-slide PNGs supplied for the three public direction figures, with original presentation photos retained as provenance", | |
| "source_slide_root": "docs/assets/foundation-pipelines/source-slides", | |
| "source_photo_root": "docs/assets/foundation-pipelines/source-photos", | |
| "provenance_file": "docs/assets/foundation-pipelines/prompts.md", | |
| "renderer_script": "scripts/render_foundation_pipeline_diagrams.py", | |
| "diagram_type": "direction_slide_diagram", | |
| "source_update": "2026-06-19: clean Spatial intelligence, Human-video world model, and Vision-language-action PNGs are committed as source-slide assets and published as 2560-pixel public images.", | |
| "note": "Images are slide-diagram communication assets for pipeline tracks. Technical readouts remain governed by the Markdown/JSON contracts and verified metrics." | |
| }, | |
| "shared_principles": [ | |
| "Use episode-level train/validation/test separation.", | |
| "Build manifest-first exporters before training.", | |
| "Keep target-side future labels and captions out of inputs unless the task explicitly queries them.", | |
| "Report task-specific metrics and saved predictions before updating public cards.", | |
| "Exclude raw private data and heavyweight base model weights from public packages." | |
| ], | |
| "tracks": [ | |
| { | |
| "id": "spatial_intelligence", | |
| "title": "Spatial intelligence models", | |
| "question": "Can the model recover and reason over space from video?", | |
| "core_inputs": [ | |
| "multiview RGB", | |
| "egocentric video", | |
| "depth", | |
| "camera pose", | |
| "calibration", | |
| "object cues", | |
| "language questions" | |
| ], | |
| "intermediate_artifacts": [ | |
| "synchronized camera window manifest", | |
| "pose and depth availability report", | |
| "scene and object memory records", | |
| "object permanence targets", | |
| "spatial relation targets", | |
| "spatial QA prompts" | |
| ], | |
| "outputs": [ | |
| "object count", | |
| "object persistence", | |
| "relative location", | |
| "3D geometry consistency", | |
| "multiview retrieval", | |
| "camera-motion-aware scene memory", | |
| "language answers grounded in the scene" | |
| ], | |
| "first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.", | |
| "current_maturity": "Ready as a pipeline and evaluation contract.", | |
| "next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.", | |
| "diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png", | |
| "website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png", | |
| "image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.", | |
| "one_sample_training_io": { | |
| "sample_basis": "Single public sample episode: 5,821 frames, 1,161 overlapping 20-frame windows, 5-frame stride, about 20 FPS.", | |
| "source_artifacts": [ | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/shared_windows.npz", | |
| "results/episode_task_suite/feature_manifest.json", | |
| "official sample annotation.hdf5", | |
| "official sample six MP4 camera streams" | |
| ], | |
| "input_builder": "Slice each 20-frame window, then join multiview RGB summaries with depth, camera pose, SLAM/calibration, object cues, contact cues, and optional language questions from the public annotation timeline.", | |
| "target_builder": "Create spatial targets such as camera-view match, object relevance, object-set memory, depth/pose reconstruction proxy, caption-grounded retrieval, and spatial QA answers.", | |
| "existing_task_hooks": [ | |
| "object_relevance", | |
| "modality_reconstruction", | |
| "caption_grounding", | |
| "object_set_forecast", | |
| "camera_view_sync_retrieval" | |
| ], | |
| "boundary": "This yields a one-episode spatial training-pair recipe and proxy tasks; the next spatial-intelligence readout is held-out depth, pose, and scene-memory metrics." | |
| }, | |
| "diagram_flow": [ | |
| { | |
| "stage": "inputs", | |
| "items": [ | |
| "multiview RGB plus egocentric video", | |
| "metric depth and confidence", | |
| "camera pose, calibration, SLAM", | |
| "object, contact, and language cues" | |
| ] | |
| }, | |
| { | |
| "stage": "tasks_targets", | |
| "items": [ | |
| "spatial QA and object count", | |
| "object permanence across windows", | |
| "relative location and retrieval", | |
| "pose-aware 3D consistency" | |
| ] | |
| }, | |
| { | |
| "stage": "train_models", | |
| "items": [ | |
| "export scene/object memory records", | |
| "train spatial-memory encoder", | |
| "add geometry-aware QA and retrieval heads", | |
| "keep episode-level split discipline" | |
| ] | |
| }, | |
| { | |
| "stage": "evaluate_gates", | |
| "items": [ | |
| "held-out episode spatial metrics", | |
| "count and relation accuracy", | |
| "retrieval rank and consistency", | |
| "saved predictions before public package update" | |
| ] | |
| } | |
| ], | |
| "next_readout_before_stronger_positioning": [ | |
| "held-out spatial QA", | |
| "pose consistency", | |
| "object-counting and scene-memory metrics" | |
| ] | |
| }, | |
| { | |
| "id": "human_video_world_models", | |
| "title": "Human-video world models", | |
| "question": "Can the model predict what happens next?", | |
| "core_inputs": [ | |
| "observed video windows", | |
| "audio", | |
| "sensor windows", | |
| "hand and body motion", | |
| "object and contact state", | |
| "action and subtask labels", | |
| "future windows (target side only, never model input)" | |
| ], | |
| "intermediate_artifacts": [ | |
| "observed and future window pairs", | |
| "future label targets", | |
| "action-conditioned target records", | |
| "visual or latent reconstruction targets", | |
| "temporal consistency metadata" | |
| ], | |
| "outputs": [ | |
| "next action", | |
| "next subtask", | |
| "future object set", | |
| "future state embedding", | |
| "camera-motion delta", | |
| "contact transition", | |
| "future-window quality metrics" | |
| ], | |
| "first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before presenting world-model quality.", | |
| "current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.", | |
| "next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.", | |
| "diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png", | |
| "website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png", | |
| "image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.", | |
| "one_sample_training_io": { | |
| "sample_basis": "Single public sample episode: current observed windows are paired with shifted future labels or future-window features from the same timeline.", | |
| "source_artifacts": [ | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/shared_windows.npz", | |
| "results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json", | |
| "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json" | |
| ], | |
| "input_builder": "Use the current 20-frame observed window at time t: RGB/audio/sensor summaries, hand/body motion, camera pose, current object/contact state, and current action/subtask context only.", | |
| "target_builder": "Shift the episode timeline forward to produce next-action, next-subtask, future object-set, contact-transition, time-to-transition, camera-motion delta, or latent/future-feature targets.", | |
| "existing_task_hooks": [ | |
| "next_action", | |
| "long_horizon_next_action", | |
| "next_subtask_forecast", | |
| "object_set_forecast", | |
| "time_to_transition", | |
| "ego_motion_forecast" | |
| ], | |
| "boundary": "Future labels and future windows must stay out of the input. Structured future probes show the pipeline, while visual world-model readouts need latent or visual future metrics." | |
| }, | |
| "diagram_flow": [ | |
| { | |
| "stage": "inputs", | |
| "items": [ | |
| "observed video/audio/sensor window", | |
| "hand/body motion and camera pose", | |
| "object/contact state", | |
| "action and subtask labels" | |
| ] | |
| }, | |
| { | |
| "stage": "tasks_targets", | |
| "items": [ | |
| "next action and next subtask", | |
| "future object set", | |
| "contact transition", | |
| "camera-motion delta or latent future" | |
| ] | |
| }, | |
| { | |
| "stage": "train_models", | |
| "items": [ | |
| "Qwen structured future probes", | |
| "Cosmos/dynamics branch separately", | |
| "latent rollout or reconstruction loss", | |
| "no target-side future leakage" | |
| ] | |
| }, | |
| { | |
| "stage": "evaluate_gates", | |
| "items": [ | |
| "held-out future-task metrics", | |
| "contact and object-set F1", | |
| "rollout or latent consistency", | |
| "per-episode breakdown and examples" | |
| ] | |
| } | |
| ], | |
| "next_readout_before_stronger_positioning": [ | |
| "latent or visual future metrics", | |
| "per-episode future-task breakdowns", | |
| "qualitative examples backed by saved targets" | |
| ] | |
| }, | |
| { | |
| "id": "vision_language_action", | |
| "title": "Vision-language-action models", | |
| "question": "Can the model turn what it sees and reads into action?", | |
| "core_inputs": [ | |
| "egocentric video", | |
| "language captions", | |
| "hand and body motion", | |
| "contacts", | |
| "objects", | |
| "procedure and subtask labels" | |
| ], | |
| "intermediate_artifacts": [ | |
| "action-token vocabulary", | |
| "action-chunk windows", | |
| "normalization stats", | |
| "retargeting report", | |
| "leakage audit", | |
| "action-space model card" | |
| ], | |
| "outputs": [ | |
| "next action", | |
| "action chunk", | |
| "object-conditioned action", | |
| "contact state", | |
| "subtask transition", | |
| "policy or VLA held-out metrics" | |
| ], | |
| "first_pipeline": "Define the action space, use the existing 20-task suite's next-action, contact, and object-conditioned tasks first, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.", | |
| "current_maturity": "Feasible but gated by action-target conversion.", | |
| "next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.", | |
| "diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png", | |
| "website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png", | |
| "image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.", | |
| "one_sample_training_io": { | |
| "sample_basis": "Single public sample episode: observation-language windows are paired with action-token proxies because robot-retargeted action chunks are not part of the public sample yet.", | |
| "source_artifacts": [ | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/shared_windows.npz", | |
| "results/episode_task_suite/task_walkthroughs/task_walkthroughs.json", | |
| "official sample annotation.hdf5" | |
| ], | |
| "input_builder": "Use egocentric/fisheye video windows, caption and object context, hand/body mocap, contact state, and current subtask text as the observation-language side of each training pair.", | |
| "target_builder": "Create action-token proxy targets: current or next action, object-conditioned action relation, contact state, interaction-text class, subtask transition, or hand-trajectory/action-chunk proxy.", | |
| "existing_task_hooks": [ | |
| "timeline_action", | |
| "next_action", | |
| "hand_trajectory_forecast", | |
| "contact_prediction", | |
| "interaction_text_prediction", | |
| "action_object_relation" | |
| ], | |
| "boundary": "This is a VLA/policy data-conversion recipe for the one-sample suite. Robot policy readouts require a later action-space converter, normalization, retargeting report, and held-out policy metrics." | |
| }, | |
| "diagram_flow": [ | |
| { | |
| "stage": "inputs", | |
| "items": [ | |
| "egocentric video and captions", | |
| "objects, contacts, and procedures", | |
| "hand/body motion windows", | |
| "subtask labels and language context" | |
| ] | |
| }, | |
| { | |
| "stage": "tasks_targets", | |
| "items": [ | |
| "action-token vocabulary", | |
| "next action and action chunks", | |
| "object-conditioned actions", | |
| "contact state and subtask transition" | |
| ] | |
| }, | |
| { | |
| "stage": "train_models", | |
| "items": [ | |
| "build action-space converter", | |
| "normalize and audit action chunks", | |
| "train VLA/policy-compatible head", | |
| "track leakage and retargeting reports" | |
| ] | |
| }, | |
| { | |
| "stage": "evaluate_gates", | |
| "items": [ | |
| "held-out action metrics", | |
| "chunk and next-action accuracy", | |
| "object/contact-conditioned scores", | |
| "policy card before robot-policy quality readout" | |
| ] | |
| } | |
| ], | |
| "next_readout_before_stronger_positioning": [ | |
| "action-space conversion", | |
| "normalized action chunks", | |
| "held-out policy metrics" | |
| ] | |
| } | |
| ] | |
| } | |