{
  "title": "Three Foundation Pipeline Tracks",
  "status": "pipeline_plan",
  "source_document": "THREE_FOUNDATION_PIPELINES.md",
  "claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.",
  "diagram_assets": {
    "status": "published_high_resolution_slide_diagrams",
    "asset_root": "docs/assets/foundation-pipelines",
    "source": "Clean direction-slide PNGs supplied for the three public direction figures, with original presentation photos retained as provenance",
    "source_slide_root": "docs/assets/foundation-pipelines/source-slides",
    "source_photo_root": "docs/assets/foundation-pipelines/source-photos",
    "provenance_file": "docs/assets/foundation-pipelines/prompts.md",
    "renderer_script": "scripts/render_foundation_pipeline_diagrams.py",
    "diagram_type": "direction_slide_diagram",
    "source_update": "2026-06-19: clean Spatial intelligence, Human-video world model, and Vision-language-action PNGs are committed as source-slide assets and published as 2560-pixel public images.",
    "note": "Images are slide-diagram communication assets for pipeline tracks. Technical claims remain governed by the Markdown/JSON contracts and verified metrics."
  },
  "shared_principles": [
    "Use episode-level train/validation/test separation.",
    "Build manifest-first exporters before training.",
    "Keep target-side future labels and captions out of inputs unless the task explicitly queries them.",
    "Report task-specific metrics and saved predictions before updating public cards.",
    "Exclude raw private data and heavyweight base model weights from public packages."
  ],
  "tracks": [
    {
      "id": "spatial_intelligence",
      "title": "Spatial intelligence models",
      "question": "Can the model recover and reason over space from video?",
      "core_inputs": [
        "multiview RGB",
        "egocentric video",
        "depth",
        "camera pose",
        "calibration",
        "object cues",
        "language questions"
      ],
      "intermediate_artifacts": [
        "synchronized camera window manifest",
        "pose and depth availability report",
        "scene and object memory records",
        "object permanence targets",
        "spatial relation targets",
        "spatial QA prompts"
      ],
      "outputs": [
        "object count",
        "object persistence",
        "relative location",
        "3D geometry consistency",
        "multiview retrieval",
        "camera-motion-aware scene memory",
        "language answers grounded in the scene"
      ],
      "first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.",
      "current_maturity": "Ready as a pipeline and evaluation contract.",
      "next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.",
      "diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png",
      "website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png",
      "image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.",
      "diagram_flow": [
        {
          "stage": "inputs",
          "items": [
            "multiview RGB plus egocentric video",
            "metric depth and confidence",
            "camera pose, calibration, SLAM",
            "object, contact, and language cues"
          ]
        },
        {
          "stage": "tasks_targets",
          "items": [
            "spatial QA and object count",
            "object permanence across windows",
            "relative location and retrieval",
            "pose-aware 3D consistency"
          ]
        },
        {
          "stage": "train_models",
          "items": [
            "export scene/object memory records",
            "train spatial-memory encoder",
            "add geometry-aware QA and retrieval heads",
            "keep episode-level split discipline"
          ]
        },
        {
          "stage": "evaluate_gates",
          "items": [
            "held-out episode spatial metrics",
            "count and relation accuracy",
            "retrieval rank and consistency",
            "saved predictions before public claim"
          ]
        }
      ],
      "avoid_claiming_now": [
        "full neural rendering",
        "full 3D reconstruction",
        "general spatial intelligence without artifact-level evidence"
      ]
    },
    {
      "id": "human_video_world_models",
      "title": "Human-video world models",
      "question": "Can the model predict what happens next?",
      "core_inputs": [
        "observed video windows",
        "audio",
        "sensor windows",
        "hand and body motion",
        "object and contact state",
        "action and subtask labels",
        "future windows"
      ],
      "intermediate_artifacts": [
        "observed and future window pairs",
        "future label targets",
        "action-conditioned target records",
        "visual or latent reconstruction targets",
        "temporal consistency metadata"
      ],
      "outputs": [
        "next action",
        "next subtask",
        "future object set",
        "future state embedding",
        "camera-motion delta",
        "contact transition",
        "future-window quality metrics"
      ],
      "first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.",
      "current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.",
      "next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.",
      "diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png",
      "website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png",
      "image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.",
      "diagram_flow": [
        {
          "stage": "inputs",
          "items": [
            "observed video/audio/sensor window",
            "hand/body motion and camera pose",
            "object/contact state",
            "action and subtask labels"
          ]
        },
        {
          "stage": "tasks_targets",
          "items": [
            "next action and next subtask",
            "future object set",
            "contact transition",
            "camera-motion delta or latent future"
          ]
        },
        {
          "stage": "train_models",
          "items": [
            "Qwen structured future probes",
            "Cosmos/dynamics branch separately",
            "latent rollout or reconstruction loss",
            "no target-side future leakage"
          ]
        },
        {
          "stage": "evaluate_gates",
          "items": [
            "held-out future-task metrics",
            "contact and object-set F1",
            "rollout or latent consistency",
            "per-episode breakdown and examples"
          ]
        }
      ],
      "avoid_claiming_now": [
        "strong world model from structured future-task scores alone",
        "visual future quality without visual or latent future metrics"
      ]
    },
    {
      "id": "vision_language_action",
      "title": "Vision-language-action models",
      "question": "Can the model turn what it sees and reads into action?",
      "core_inputs": [
        "egocentric video",
        "language captions",
        "hand and body motion",
        "contacts",
        "objects",
        "procedure and subtask labels"
      ],
      "intermediate_artifacts": [
        "action-token vocabulary",
        "action-chunk windows",
        "normalization stats",
        "retargeting report",
        "leakage audit",
        "action-space model card"
      ],
      "outputs": [
        "next action",
        "action chunk",
        "object-conditioned action",
        "contact state",
        "subtask transition",
        "policy or VLA held-out metrics"
      ],
      "first_pipeline": "Define the action space, start with the existing next-action, contact, and object-conditioned tasks from the 20-task set, then add hand-trajectory or policy-compatible action chunks once conversion is traceable.",
      "current_maturity": "Feasible but gated by action-target conversion.",
      "next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.",
      "diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png",
      "website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png",
      "image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.",
      "diagram_flow": [
        {
          "stage": "inputs",
          "items": [
            "egocentric video and captions",
            "objects, contacts, and procedures",
            "hand/body motion windows",
            "subtask labels and language context"
          ]
        },
        {
          "stage": "tasks_targets",
          "items": [
            "action-token vocabulary",
            "next action and action chunks",
            "object-conditioned actions",
            "contact state and subtask transition"
          ]
        },
        {
          "stage": "train_models",
          "items": [
            "build action-space converter",
            "normalize and audit action chunks",
            "train VLA/policy-compatible head",
            "track leakage and retargeting reports"
          ]
        },
        {
          "stage": "evaluate_gates",
          "items": [
            "held-out action metrics",
            "chunk and next-action accuracy",
            "object/contact-conditioned scores",
            "policy card before robot-policy claim"
          ]
        }
      ],
      "avoid_claiming_now": [
        "robot policy quality",
        "policy generalization before action-space evidence exists"
      ]
    }
  ]
}