| { |
| "title": "Three Foundation Pipeline Tracks", |
| "status": "pipeline_plan", |
| "source_document": "THREE_FOUNDATION_PIPELINES.md", |
| "claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.", |
| "diagram_assets": { |
| "status": "published_restored_presentation_photos", |
| "asset_root": "docs/assets/foundation-pipelines", |
| "source": "Original presentation photos supplied by the project owner, restored locally for public-resolution use", |
| "source_photo_root": "docs/assets/foundation-pipelines/source-photos", |
| "provenance_file": "docs/assets/foundation-pipelines/prompts.md", |
| "renderer_script": "scripts/render_foundation_pipeline_diagrams.py", |
| "diagram_type": "restored_direction_slide_photo", |
| "note": "Images are restored presentation photos used as communication assets for the pipeline tracks. Technical claims remain governed by the Markdown/JSON contracts and verified metrics." |
| }, |
| "shared_principles": [ |
| "Use episode-level train/validation/test separation.", |
| "Build manifest-first exporters before training.", |
| "Keep target-side future labels and captions out of inputs unless the task explicitly queries them.", |
| "Report task-specific metrics and saved predictions before updating public cards.", |
| "Exclude raw private data and heavyweight base model weights from public packages." |
| ], |
| "tracks": [ |
| { |
| "id": "spatial_intelligence", |
| "title": "Spatial intelligence models", |
| "question": "Can the model recover and reason over space from video?", |
| "core_inputs": [ |
| "multiview RGB", |
| "egocentric video", |
| "depth", |
| "camera pose", |
| "calibration", |
| "object cues", |
| "language questions" |
| ], |
| "intermediate_artifacts": [ |
| "synchronized camera window manifest", |
| "pose and depth availability report", |
| "scene and object memory records", |
| "object permanence targets", |
| "spatial relation targets", |
| "spatial QA prompts" |
| ], |
| "outputs": [ |
| "object count", |
| "object persistence", |
| "relative location", |
| "3D geometry consistency", |
| "multiview retrieval", |
| "camera-motion-aware scene memory", |
| "language answers grounded in the scene" |
| ], |
| "first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.", |
| "current_maturity": "Ready as a pipeline and evaluation contract.", |
| "next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.", |
| "diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png", |
| "website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png", |
| "image_alt": "Restored presentation photo showing the Spatial intelligence models direction slide for Xperience-10M.", |
| "diagram_flow": [ |
| { |
| "stage": "inputs", |
| "items": [ |
| "multiview RGB plus egocentric video", |
| "metric depth and confidence", |
| "camera pose, calibration, SLAM", |
| "object, contact, and language cues" |
| ] |
| }, |
| { |
| "stage": "tasks_targets", |
| "items": [ |
| "spatial QA and object count", |
| "object permanence across windows", |
| "relative location and retrieval", |
| "pose-aware 3D consistency" |
| ] |
| }, |
| { |
| "stage": "train_models", |
| "items": [ |
| "export scene/object memory records", |
| "train spatial-memory encoder", |
| "add geometry-aware QA and retrieval heads", |
| "keep episode-level split discipline" |
| ] |
| }, |
| { |
| "stage": "evaluate_gates", |
| "items": [ |
| "held-out episode spatial metrics", |
| "count and relation accuracy", |
| "retrieval rank and consistency", |
| "saved predictions before public claim" |
| ] |
| } |
| ], |
| "avoid_claiming_now": [ |
| "full neural rendering", |
| "full 3D reconstruction", |
| "general spatial intelligence without artifact-level evidence" |
| ] |
| }, |
| { |
| "id": "human_video_world_models", |
| "title": "Human-video world models", |
| "question": "Can the model predict what happens next?", |
| "core_inputs": [ |
| "observed video windows", |
| "audio", |
| "sensor windows", |
| "hand and body motion", |
| "object and contact state", |
| "action and subtask labels", |
| "future windows (supervision targets only)" |
| ], |
| "intermediate_artifacts": [ |
| "observed and future window pairs", |
| "future label targets", |
| "action-conditioned target records", |
| "visual or latent reconstruction targets", |
| "temporal consistency metadata" |
| ], |
| "outputs": [ |
| "next action", |
| "next subtask", |
| "future object set", |
| "future state embedding", |
| "camera-motion delta", |
| "contact transition", |
| "future-window quality metrics" |
| ], |
| "first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.", |
| "current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.", |
| "next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.", |
| "diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png", |
| "website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png", |
| "image_alt": "Restored presentation photo showing the Human-video world models direction slide for Xperience-10M.", |
| "diagram_flow": [ |
| { |
| "stage": "inputs", |
| "items": [ |
| "observed video/audio/sensor window", |
| "hand/body motion and camera pose", |
| "object/contact state", |
| "action and subtask labels" |
| ] |
| }, |
| { |
| "stage": "tasks_targets", |
| "items": [ |
| "next action and next subtask", |
| "future object set", |
| "contact transition", |
| "camera-motion delta or latent future" |
| ] |
| }, |
| { |
| "stage": "train_models", |
| "items": [ |
| "Qwen structured future probes", |
| "Cosmos/dynamics branch separately", |
| "latent rollout or reconstruction loss", |
| "no target-side future leakage" |
| ] |
| }, |
| { |
| "stage": "evaluate_gates", |
| "items": [ |
| "held-out future-task metrics", |
| "contact and object-set F1", |
| "rollout or latent consistency", |
| "per-episode breakdown and examples" |
| ] |
| } |
| ], |
| "avoid_claiming_now": [ |
| "strong world model from structured future-task scores alone", |
| "visual future quality without visual or latent future metrics" |
| ] |
| }, |
| { |
| "id": "vision_language_action", |
| "title": "Vision-language-action models", |
| "question": "Can the model turn what it sees and reads into action?", |
| "core_inputs": [ |
| "egocentric video", |
| "language captions", |
| "hand and body motion", |
| "contacts", |
| "objects", |
| "procedure and subtask labels" |
| ], |
| "intermediate_artifacts": [ |
| "action-token vocabulary", |
| "action-chunk windows", |
| "normalization stats", |
| "retargeting report", |
| "leakage audit", |
| "action-space model card" |
| ], |
| "outputs": [ |
| "next action", |
| "action chunk", |
| "object-conditioned action", |
| "contact state", |
| "subtask transition", |
| "policy or VLA held-out metrics" |
| ], |
| "first_pipeline": "Define the action space, use existing 20-task next-action/contact/object-conditioned tasks first, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.", |
| "current_maturity": "Feasible but gated by action-target conversion.", |
| "next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.", |
| "diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png", |
| "website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png", |
| "image_alt": "Restored presentation photo showing the Vision-language-action models direction slide for Xperience-10M.", |
| "diagram_flow": [ |
| { |
| "stage": "inputs", |
| "items": [ |
| "egocentric video and captions", |
| "objects, contacts, and procedures", |
| "hand/body motion windows", |
| "subtask labels and language context" |
| ] |
| }, |
| { |
| "stage": "tasks_targets", |
| "items": [ |
| "action-token vocabulary", |
| "next action and action chunks", |
| "object-conditioned actions", |
| "contact state and subtask transition" |
| ] |
| }, |
| { |
| "stage": "train_models", |
| "items": [ |
| "build action-space converter", |
| "normalize and audit action chunks", |
| "train VLA/policy-compatible head", |
| "track leakage and retargeting reports" |
| ] |
| }, |
| { |
| "stage": "evaluate_gates", |
| "items": [ |
| "held-out action metrics", |
| "chunk and next-action accuracy", |
| "object/contact-conditioned scores", |
| "policy card before robot-policy claim" |
| ] |
| } |
| ], |
| "avoid_claiming_now": [ |
| "robot policy quality", |
| "policy generalization before action-space evidence exists" |
| ] |
| } |
| ] |
| } |
|
|