{
"title": "Three Foundation Pipeline Tracks",
"status": "pipeline_plan",
"source_document": "THREE_FOUNDATION_PIPELINES.md",
"claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.",
"diagram_assets": {
"status": "published_high_resolution_slide_diagrams",
"asset_root": "docs/assets/foundation-pipelines",
"source": "Clean direction-slide PNGs supplied for the three public direction figures, with original presentation photos retained as provenance",
"source_slide_root": "docs/assets/foundation-pipelines/source-slides",
"source_photo_root": "docs/assets/foundation-pipelines/source-photos",
"provenance_file": "docs/assets/foundation-pipelines/prompts.md",
"renderer_script": "scripts/render_foundation_pipeline_diagrams.py",
"diagram_type": "direction_slide_diagram",
"source_update": "2026-06-19: clean Spatial intelligence, Human-video world model, and Vision-language-action PNGs are committed as source-slide assets and published as 2560-pixel public images.",
"note": "Images are slide-diagram communication assets for pipeline tracks. Technical claims remain governed by the Markdown/JSON contracts and verified metrics."
},
"shared_principles": [
"Use episode-level train/validation/test separation.",
"Build manifest-first exporters before training.",
"Keep target-side future labels and captions out of inputs unless the task explicitly queries them.",
"Report task-specific metrics and saved predictions before updating public cards.",
"Exclude raw private data and heavyweight base model weights from public packages."
],
"tracks": [
{
"id": "spatial_intelligence",
"title": "Spatial intelligence models",
"question": "Can the model recover and reason over space from video?",
"core_inputs": [
"multiview RGB",
"egocentric video",
"depth",
"camera pose",
"calibration",
"object cues",
"language questions"
],
"intermediate_artifacts": [
"synchronized camera window manifest",
"pose and depth availability report",
"scene and object memory records",
"object permanence targets",
"spatial relation targets",
"spatial QA prompts"
],
"outputs": [
"object count",
"object persistence",
"relative location",
"3D geometry consistency",
"multiview retrieval",
"camera-motion-aware scene memory",
"language answers grounded in the scene"
],
"first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.",
"current_maturity": "Ready as a pipeline and evaluation contract.",
"next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.",
"diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png",
"website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png",
"image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.",
"diagram_flow": [
{
"stage": "inputs",
"items": [
"multiview RGB plus egocentric video",
"metric depth and confidence",
"camera pose, calibration, SLAM",
"object, contact, and language cues"
]
},
{
"stage": "tasks_targets",
"items": [
"spatial QA and object count",
"object permanence across windows",
"relative location and retrieval",
"pose-aware 3D consistency"
]
},
{
"stage": "train_models",
"items": [
"export scene/object memory records",
"train spatial-memory encoder",
"add geometry-aware QA and retrieval heads",
"keep episode-level split discipline"
]
},
{
"stage": "evaluate_gates",
"items": [
"held-out episode spatial metrics",
"count and relation accuracy",
"retrieval rank and consistency",
"saved predictions before public claim"
]
}
],
"avoid_claiming_now": [
"full neural rendering",
"full 3D reconstruction",
"general spatial intelligence without artifact-level evidence"
]
},
{
"id": "human_video_world_models",
"title": "Human-video world models",
"question": "Can the model predict what happens next?",
"core_inputs": [
"observed video windows",
"audio",
"sensor windows",
"hand and body motion",
"object and contact state",
"action and subtask labels",
"future windows (prediction targets only, never model inputs)"
],
"intermediate_artifacts": [
"observed and future window pairs",
"future label targets",
"action-conditioned target records",
"visual or latent reconstruction targets",
"temporal consistency metadata"
],
"outputs": [
"next action",
"next subtask",
"future object set",
"future state embedding",
"camera-motion delta",
"contact transition",
"future-window quality metrics"
],
"first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.",
"current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.",
"next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.",
"diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png",
"website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png",
"image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.",
"diagram_flow": [
{
"stage": "inputs",
"items": [
"observed video/audio/sensor window",
"hand/body motion and camera pose",
"object/contact state",
"action and subtask labels"
]
},
{
"stage": "tasks_targets",
"items": [
"next action and next subtask",
"future object set",
"contact transition",
"camera-motion delta or latent future"
]
},
{
"stage": "train_models",
"items": [
"Qwen structured future probes",
"Cosmos/dynamics branch separately",
"latent rollout or reconstruction loss",
"no target-side future leakage"
]
},
{
"stage": "evaluate_gates",
"items": [
"held-out future-task metrics",
"contact and object-set F1",
"rollout or latent consistency",
"per-episode breakdown and examples"
]
}
],
"avoid_claiming_now": [
"strong world model from structured future-task scores alone",
"visual future quality without visual or latent future metrics"
]
},
{
"id": "vision_language_action",
"title": "Vision-language-action models",
"question": "Can the model turn what it sees and reads into action?",
"core_inputs": [
"egocentric video",
"language captions",
"hand and body motion",
"contacts",
"objects",
"procedure and subtask labels"
],
"intermediate_artifacts": [
"action-token vocabulary",
"action-chunk windows",
"normalization stats",
"retargeting report",
"leakage audit",
"action-space model card"
],
"outputs": [
"next action",
"action chunk",
"object-conditioned action",
"contact state",
"subtask transition",
"policy or VLA held-out metrics"
],
"first_pipeline": "Define the action space, start with the next-action, contact, and object-conditioned tasks from the existing 20-task suite, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.",
"current_maturity": "Feasible but gated by action-target conversion.",
"next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.",
"diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png",
"website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png",
"image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.",
"diagram_flow": [
{
"stage": "inputs",
"items": [
"egocentric video and captions",
"objects, contacts, and procedures",
"hand/body motion windows",
"subtask labels and language context"
]
},
{
"stage": "tasks_targets",
"items": [
"action-token vocabulary",
"next action and action chunks",
"object-conditioned actions",
"contact state and subtask transition"
]
},
{
"stage": "train_models",
"items": [
"build action-space converter",
"normalize and audit action chunks",
"train VLA/policy-compatible head",
"track leakage and retargeting reports"
]
},
{
"stage": "evaluate_gates",
"items": [
"held-out action metrics",
"chunk and next-action accuracy",
"object/contact-conditioned scores",
"policy card before robot-policy claim"
]
}
],
"avoid_claiming_now": [
"robot policy quality",
"policy generalization before action-space evidence exists"
]
}
]
}