ropedia-xperience-10m-task-baselines / docs /data /three_foundation_pipelines.json

Refine reader-facing scope wording (2/4)

2600a90 verified 3 days ago

14.6 kB

	{
	"title": "Three Foundation Pipeline Tracks",
	"status": "pipeline_plan",
	"source_document": "THREE_FOUNDATION_PIPELINES.md",
	"reader_note": "These are supported pipeline directions with concrete data exports, training recipes, and evaluation gates.",
	"diagram_assets": {
	"status": "published_high_resolution_slide_diagrams",
	"asset_root": "docs/assets/foundation-pipelines",
	"source": "Clean direction-slide PNGs supplied for the three public direction figures, with original presentation photos retained as provenance",
	"source_slide_root": "docs/assets/foundation-pipelines/source-slides",
	"source_photo_root": "docs/assets/foundation-pipelines/source-photos",
	"provenance_file": "docs/assets/foundation-pipelines/prompts.md",
	"renderer_script": "scripts/render_foundation_pipeline_diagrams.py",
	"diagram_type": "direction_slide_diagram",
	"source_update": "2026-06-19: clean Spatial intelligence, Human-video world model, and Vision-language-action PNGs are committed as source-slide assets and published as 2560-pixel public images.",
	"note": "Images are slide-diagram communication assets for pipeline tracks. Technical readouts remain governed by the Markdown/JSON contracts and verified metrics."
	},
	"shared_principles": [
	"Use episode-level train/validation/test separation.",
	"Build manifest-first exporters before training.",
	"Keep target-side future labels and captions out of inputs unless the task explicitly queries them.",
	"Report task-specific metrics and saved predictions before updating public cards.",
	"Exclude raw private data and heavyweight base model weights from public packages."
	],
	"tracks": [
	{
	"id": "spatial_intelligence",
	"title": "Spatial intelligence models",
	"question": "Can the model recover and reason over space from video?",
	"core_inputs": [
	"multiview RGB",
	"egocentric video",
	"depth",
	"camera pose",
	"calibration",
	"object cues",
	"language questions"
	],
	"intermediate_artifacts": [
	"synchronized camera window manifest",
	"pose and depth availability report",
	"scene and object memory records",
	"object permanence targets",
	"spatial relation targets",
	"spatial QA prompts"
	],
	"outputs": [
	"object count",
	"object persistence",
	"relative location",
	"3D geometry consistency",
	"multiview retrieval",
	"camera-motion-aware scene memory",
	"language answers grounded in the scene"
	],
	"first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.",
	"current_maturity": "Ready as a pipeline and evaluation contract.",
	"next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.",
	"diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png",
	"website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png",
	"image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.",
	"one_sample_training_io": {
	"sample_basis": "Single public sample episode: 5,821 frames, 1,161 overlapping 20-frame windows, 5-frame stride, about 20 FPS.",
	"source_artifacts": [
	"results/episode_task_suite/windows.csv",
	"results/episode_task_suite/shared_windows.npz",
	"results/episode_task_suite/feature_manifest.json",
	"official sample annotation.hdf5",
	"official sample six MP4 camera streams"
	],
	"input_builder": "Slice each 20-frame window, then join multiview RGB summaries with depth, camera pose, SLAM/calibration, object cues, contact cues, and optional language questions from the public annotation timeline.",
	"target_builder": "Create spatial targets such as camera-view match, object relevance, object-set memory, depth/pose reconstruction proxy, caption-grounded retrieval, and spatial QA answers.",
	"existing_task_hooks": [
	"object_relevance",
	"modality_reconstruction",
	"caption_grounding",
	"object_set_forecast",
	"camera_view_sync_retrieval"
	],
	"boundary": "This yields a one-episode spatial training-pair recipe and proxy tasks; the next spatial-intelligence readout is held-out depth, pose, and scene-memory metrics."
	},
	"diagram_flow": [
	{
	"stage": "inputs",
	"items": [
	"multiview RGB plus egocentric video",
	"metric depth and confidence",
	"camera pose, calibration, SLAM",
	"object, contact, and language cues"
	]
	},
	{
	"stage": "tasks_targets",
	"items": [
	"spatial QA and object count",
	"object permanence across windows",
	"relative location and retrieval",
	"pose-aware 3D consistency"
	]
	},
	{
	"stage": "train_models",
	"items": [
	"export scene/object memory records",
	"train spatial-memory encoder",
	"add geometry-aware QA and retrieval heads",
	"keep episode-level split discipline"
	]
	},
	{
	"stage": "evaluate_gates",
	"items": [
	"held-out episode spatial metrics",
	"count and relation accuracy",
	"retrieval rank and consistency",
	"saved predictions before public package update"
	]
	}
	],
	"next_readout_before_stronger_positioning": [
	"held-out spatial QA",
	"pose consistency",
	"object-counting and scene-memory metrics"
	]
	},
	{
	"id": "human_video_world_models",
	"title": "Human-video world models",
	"question": "Can the model predict what happens next?",
	"core_inputs": [
	"observed video windows",
	"audio",
	"sensor windows",
	"hand and body motion",
	"object and contact state",
	"action and subtask labels",
	"future windows (target side only)"
	],
	"intermediate_artifacts": [
	"observed and future window pairs",
	"future label targets",
	"action-conditioned target records",
	"visual or latent reconstruction targets",
	"temporal consistency metadata"
	],
	"outputs": [
	"next action",
	"next subtask",
	"future object set",
	"future state embedding",
	"camera-motion delta",
	"contact transition",
	"future-window quality metrics"
	],
	"first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before presenting world-model quality.",
	"current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.",
	"next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.",
	"diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png",
	"website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png",
	"image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.",
	"one_sample_training_io": {
	"sample_basis": "Single public sample episode: current observed windows are paired with shifted future labels or future-window features from the same timeline.",
	"source_artifacts": [
	"results/episode_task_suite/windows.csv",
	"results/episode_task_suite/shared_windows.npz",
	"results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json",
	"results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json"
	],
	"input_builder": "Use the current 20-frame observed window at time t: RGB/audio/sensor summaries, hand/body motion, camera pose, current object/contact state, and current action/subtask context only.",
	"target_builder": "Shift the episode timeline forward to produce next-action, next-subtask, future object-set, contact-transition, time-to-transition, camera-motion delta, or latent/future-feature targets.",
	"existing_task_hooks": [
	"next_action",
	"long_horizon_next_action",
	"next_subtask_forecast",
	"object_set_forecast",
	"time_to_transition",
	"ego_motion_forecast"
	],
	"boundary": "Future labels and future windows must stay out of the input. Structured future probes show the pipeline, while visual world-model readouts need latent or visual future metrics."
	},
	"diagram_flow": [
	{
	"stage": "inputs",
	"items": [
	"observed video/audio/sensor window",
	"hand/body motion and camera pose",
	"object/contact state",
	"action and subtask labels"
	]
	},
	{
	"stage": "tasks_targets",
	"items": [
	"next action and next subtask",
	"future object set",
	"contact transition",
	"camera-motion delta or latent future"
	]
	},
	{
	"stage": "train_models",
	"items": [
	"Qwen structured future probes",
	"Cosmos/dynamics branch kept separate",
	"latent rollout or reconstruction loss",
	"no target-side future leakage"
	]
	},
	{
	"stage": "evaluate_gates",
	"items": [
	"held-out future-task metrics",
	"contact and object-set F1",
	"rollout or latent consistency",
	"per-episode breakdown and examples"
	]
	}
	],
	"next_readout_before_stronger_positioning": [
	"latent or visual future metrics",
	"per-episode future-task breakdowns",
	"qualitative examples backed by saved targets"
	]
	},
	{
	"id": "vision_language_action",
	"title": "Vision-language-action models",
	"question": "Can the model turn what it sees and reads into action?",
	"core_inputs": [
	"egocentric video",
	"language captions",
	"hand and body motion",
	"contacts",
	"objects",
	"procedure and subtask labels"
	],
	"intermediate_artifacts": [
	"action-token vocabulary",
	"action-chunk windows",
	"normalization stats",
	"retargeting report",
	"leakage audit",
	"action-space model card"
	],
	"outputs": [
	"next action",
	"action chunk",
	"object-conditioned action",
	"contact state",
	"subtask transition",
	"policy or VLA held-out metrics"
	],
	"first_pipeline": "Define the action space, start with the next-action, contact, and object-conditioned tasks from the existing 20-task suite, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.",
	"current_maturity": "Feasible but gated by action-target conversion.",
	"next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.",
	"diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png",
	"website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png",
	"image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.",
	"one_sample_training_io": {
	"sample_basis": "Single public sample episode: observation-language windows are paired with action-token proxies because robot-retargeted action chunks are not part of the public sample yet.",
	"source_artifacts": [
	"results/episode_task_suite/windows.csv",
	"results/episode_task_suite/shared_windows.npz",
	"results/episode_task_suite/task_walkthroughs/task_walkthroughs.json",
	"official sample annotation.hdf5"
	],
	"input_builder": "Use egocentric/fisheye video windows, caption and object context, hand/body mocap, contact state, and current subtask text as the observation-language side of each training pair.",
	"target_builder": "Create action-token proxy targets: current or next action, object-conditioned action relation, contact state, interaction-text class, subtask transition, or hand-trajectory/action-chunk proxy.",
	"existing_task_hooks": [
	"timeline_action",
	"next_action",
	"hand_trajectory_forecast",
	"contact_prediction",
	"interaction_text_prediction",
	"action_object_relation"
	],
	"boundary": "This is a VLA/policy data-conversion recipe for the one-sample suite. Robot policy readouts require a later action-space converter, normalization, retargeting report, and held-out policy metrics."
	},
	"diagram_flow": [
	{
	"stage": "inputs",
	"items": [
	"egocentric video and captions",
	"objects, contacts, and procedures",
	"hand/body motion windows",
	"subtask labels and language context"
	]
	},
	{
	"stage": "tasks_targets",
	"items": [
	"action-token vocabulary",
	"next action and action chunks",
	"object-conditioned actions",
	"contact state and subtask transition"
	]
	},
	{
	"stage": "train_models",
	"items": [
	"build action-space converter",
	"normalize and audit action chunks",
	"train VLA/policy-compatible head",
	"track leakage and retargeting reports"
	]
	},
	{
	"stage": "evaluate_gates",
	"items": [
	"held-out action metrics",
	"chunk and next-action accuracy",
	"object/contact-conditioned scores",
	"policy card before robot-policy quality readout"
	]
	}
	],
	"next_readout_before_stronger_positioning": [
	"action-space conversion",
	"normalized action chunks",
	"held-out policy metrics"
	]
	}
	]
	}