{ "title": "Three Foundation Pipeline Tracks", "status": "pipeline_plan", "source_document": "THREE_FOUNDATION_PIPELINES.md", "claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.", "diagram_assets": { "status": "published_high_resolution_slide_diagrams", "asset_root": "docs/assets/foundation-pipelines", "source": "Clean direction-slide PNGs supplied for the three public direction figures, with original presentation photos retained as provenance", "source_slide_root": "docs/assets/foundation-pipelines/source-slides", "source_photo_root": "docs/assets/foundation-pipelines/source-photos", "provenance_file": "docs/assets/foundation-pipelines/prompts.md", "renderer_script": "scripts/render_foundation_pipeline_diagrams.py", "diagram_type": "direction_slide_diagram", "source_update": "2026-06-19: clean Spatial intelligence, Human-video world model, and Vision-language-action PNGs are committed as source-slide assets and published as 2560-pixel public images.", "note": "Images are slide-diagram communication assets for pipeline tracks. Technical claims remain governed by the Markdown/JSON contracts and verified metrics." }, "shared_principles": [ "Use episode-level train/validation/test separation.", "Build manifest-first exporters before training.", "Keep target-side future labels and captions out of inputs unless the task explicitly queries them.", "Report task-specific metrics and saved predictions before updating public cards.", "Exclude raw private data and heavyweight base model weights from public packages." 
], "tracks": [ { "id": "spatial_intelligence", "title": "Spatial intelligence models", "question": "Can the model recover and reason over space from video?", "core_inputs": [ "multiview RGB", "egocentric video", "depth", "camera pose", "calibration", "object cues", "language questions" ], "intermediate_artifacts": [ "synchronized camera window manifest", "pose and depth availability report", "scene and object memory records", "object permanence targets", "spatial relation targets", "spatial QA prompts" ], "outputs": [ "object count", "object persistence", "relative location", "3D geometry consistency", "multiview retrieval", "camera-motion-aware scene memory", "language answers grounded in the scene" ], "first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.", "current_maturity": "Ready as a pipeline and evaluation contract.", "next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.", "diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png", "website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png", "image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.", "one_sample_training_io": { "sample_basis": "Single public sample episode: 5,821 frames, 1,161 overlapping 20-frame windows, 5-frame stride, about 20 FPS.", "source_artifacts": [ "results/episode_task_suite/windows.csv", "results/episode_task_suite/shared_windows.npz", "results/episode_task_suite/feature_manifest.json", "official sample annotation.hdf5", "official sample six MP4 camera streams" ], "input_builder": "Slice each 20-frame window, then join multiview RGB summaries with depth, camera pose, SLAM/calibration, object cues, contact cues, and optional language questions from the public annotation timeline.", "target_builder": 
"Create spatial targets such as camera-view match, object relevance, object-set memory, depth/pose reconstruction proxy, caption-grounded retrieval, and spatial QA answers.", "existing_task_hooks": [ "object_relevance", "modality_reconstruction", "caption_grounding", "object_set_forecast", "camera_view_sync_retrieval" ], "boundary": "This yields a one-episode spatial training-pair recipe and proxy tasks; full spatial-intelligence claims require held-out multi-episode depth/pose/scene-memory metrics." }, "diagram_flow": [ { "stage": "inputs", "items": [ "multiview RGB plus egocentric video", "metric depth and confidence", "camera pose, calibration, SLAM", "object, contact, and language cues" ] }, { "stage": "tasks_targets", "items": [ "spatial QA and object count", "object permanence across windows", "relative location and retrieval", "pose-aware 3D consistency" ] }, { "stage": "train_models", "items": [ "export scene/object memory records", "train spatial-memory encoder", "add geometry-aware QA and retrieval heads", "keep episode-level split discipline" ] }, { "stage": "evaluate_gates", "items": [ "held-out episode spatial metrics", "count and relation accuracy", "retrieval rank and consistency", "saved predictions before public claim" ] } ], "avoid_claiming_now": [ "full neural rendering", "full 3D reconstruction", "general spatial intelligence without artifact-level evidence" ] }, { "id": "human_video_world_models", "title": "Human-video world models", "question": "Can the model predict what happens next?", "core_inputs": [ "observed video windows", "audio", "sensor windows", "hand and body motion", "object and contact state", "current action and subtask labels", "future windows (target side only; never fed as model input)" ], "intermediate_artifacts": [ "observed and future window pairs", "future label targets", "action-conditioned target records", "visual or latent reconstruction targets", "temporal consistency metadata" ], "outputs": [ "next action", "next subtask", "future object set", "future state embedding", 
"camera-motion delta", "contact transition", "future-window quality metrics" ], "first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.", "current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.", "next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.", "diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png", "website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png", "image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.", "one_sample_training_io": { "sample_basis": "Single public sample episode: current observed windows are paired with shifted future labels or future-window features from the same timeline.", "source_artifacts": [ "results/episode_task_suite/windows.csv", "results/episode_task_suite/shared_windows.npz", "results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json", "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json" ], "input_builder": "Use the current 20-frame observed window at time t: RGB/audio/sensor summaries, hand/body motion, camera pose, current object/contact state, and current action/subtask context only.", "target_builder": "Shift the episode timeline forward to produce next-action, next-subtask, future object-set, contact-transition, time-to-transition, camera-motion delta, or latent/future-feature targets.", "existing_task_hooks": [ "next_action", "long_horizon_next_action", "next_subtask_forecast", "object_set_forecast", "time_to_transition", "ego_motion_forecast" ], "boundary": "Future labels and future windows must stay out of the input. 
Structured future probes are evidence for the pipeline, not a full visual world-model claim by themselves." }, "diagram_flow": [ { "stage": "inputs", "items": [ "observed video/audio/sensor window", "hand/body motion and camera pose", "object/contact state", "action and subtask labels" ] }, { "stage": "tasks_targets", "items": [ "next action and next subtask", "future object set", "contact transition", "camera-motion delta or latent future" ] }, { "stage": "train_models", "items": [ "Qwen structured future probes", "Cosmos/dynamics branch separately", "latent rollout or reconstruction loss", "no target-side future leakage" ] }, { "stage": "evaluate_gates", "items": [ "held-out future-task metrics", "contact and object-set F1", "rollout or latent consistency", "per-episode breakdown and examples" ] } ], "avoid_claiming_now": [ "strong world model from structured future-task scores alone", "visual future quality without visual or latent future metrics" ] }, { "id": "vision_language_action", "title": "Vision-language-action models", "question": "Can the model turn what it sees and reads into action?", "core_inputs": [ "egocentric video", "language captions", "hand and body motion", "contacts", "objects", "procedure and subtask labels" ], "intermediate_artifacts": [ "action-token vocabulary", "action-chunk windows", "normalization stats", "retargeting report", "leakage audit", "action-space model card" ], "outputs": [ "next action", "action chunk", "object-conditioned action", "contact state", "subtask transition", "policy or VLA held-out metrics" ], "first_pipeline": "Define the action space, use existing 20-task next-action/contact/object-conditioned tasks first, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.", "current_maturity": "Feasible but gated by action-target conversion.", "next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.", "diagram_image": 
"docs/assets/foundation-pipelines/vision-language-action-pipeline.png", "website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png", "image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.", "one_sample_training_io": { "sample_basis": "Single public sample episode: observation-language windows are paired with action-token proxies because robot retargeted action chunks are not part of the public sample yet.", "source_artifacts": [ "results/episode_task_suite/windows.csv", "results/episode_task_suite/shared_windows.npz", "results/episode_task_suite/task_walkthroughs/task_walkthroughs.json", "official sample annotation.hdf5" ], "input_builder": "Use egocentric/fisheye video windows, caption and object context, hand/body mocap, contact state, and current subtask text as the observation-language side of each training pair.", "target_builder": "Create action-token proxy targets: current or next action, object-conditioned action relation, contact state, interaction-text class, subtask transition, or hand-trajectory/action-chunk proxy.", "existing_task_hooks": [ "timeline_action", "next_action", "hand_trajectory_forecast", "contact_prediction", "interaction_text_prediction", "action_object_relation" ], "boundary": "This is a VLA/policy data-conversion recipe for the one-sample suite. Robot policy claims require a later action-space converter, normalization, retargeting report, and held-out policy metrics." 
}, "diagram_flow": [ { "stage": "inputs", "items": [ "egocentric video and captions", "objects, contacts, and procedures", "hand/body motion windows", "subtask labels and language context" ] }, { "stage": "tasks_targets", "items": [ "action-token vocabulary", "next action and action chunks", "object-conditioned actions", "contact state and subtask transition" ] }, { "stage": "train_models", "items": [ "build action-space converter", "normalize and audit action chunks", "train VLA/policy-compatible head", "track leakage and retargeting reports" ] }, { "stage": "evaluate_gates", "items": [ "held-out action metrics", "chunk and next-action accuracy", "object/contact-conditioned scores", "policy card before robot-policy claim" ] } ], "avoid_claiming_now": [ "robot policy quality", "policy generalization before action-space evidence exists" ] } ] }