ropedia-xperience-10m-task-baselines / docs /data /research_roadmap_interactive.json
cy0307's picture
Add files using upload-large-folder tool
176f74a verified
Raw
History Blame
191 kB
{
"additional_development_directions": {
"directions": [
{
"data_signals": [
"language annotations",
"object labels",
"scene context",
"video thumbnails",
"motion statistics",
"missing-modality flags"
],
"evaluation": "Coverage by session, activity, object, and modality; duplicate checks; train/val/test leakage checks; reproducible selection report.",
"first_build": "Episode atlas, category tags, balance report, and split builder across activities, objects, scenes, people, sessions, and missing modalities.",
"id": "episode_taxonomy_data_engine",
"name": "Episode Taxonomy and Data Engine",
"why_it_matters": "Fine-tuning quality depends on selecting representative episodes instead of sampling randomly from a large corpus."
},
{
"data_signals": [
"episode manifests",
"window manifests",
"task labels",
"prediction files",
"metric files"
],
"evaluation": "Versioned splits, deterministic metric scripts, task-specific confidence intervals, and model-card reporting templates.",
"first_build": "Fixed train/val/test manifests, task cards, leakage checks, metric scripts, and small reference baselines.",
"id": "standardized_benchmark_protocol",
"name": "Standardized Benchmark Protocol",
"why_it_matters": "Future model results become comparable across Qwen, Cosmos-style world models, policy models, and smaller task heads."
},
{
"data_signals": [
"video",
"audio",
"depth",
"pose/SLAM",
"mocap",
"IMU",
"language"
],
"evaluation": "Cross-modal retrieval, missing-modality reconstruction, transfer to the 20 task heads, and held-out episode generalization.",
"first_build": "Contrastive and masked-prediction objectives over synchronized multimodal windows.",
"id": "multimodal_representation_learning",
"name": "Multimodal Representation Learning",
"why_it_matters": "Xperience-10M can train reusable encoders before committing to expensive large-model fine-tuning or pretraining."
},
{
"data_signals": [
"action labels",
"subtask labels",
"language annotations",
"hand trajectories",
"contact states",
"object labels"
],
"evaluation": "Step boundary accuracy, transition prediction, next-step prediction, graph consistency, and long-horizon task replay.",
"first_build": "Step segmentation, transition graph, precondition/effect labels, and temporal skill graph extraction.",
"id": "skill_procedure_graph_mining",
"name": "Skill and Procedure Graph Mining",
"why_it_matters": "It connects egocentric perception to task structure, planning, and long-horizon embodied reasoning."
},
{
"data_signals": [
"hand mocap",
"body mocap",
"contacts",
"objects",
"egocentric video",
"language"
],
"evaluation": "Contact F1, object micro-F1, affordance accuracy, future interaction prediction, and per-object error analysis.",
"first_build": "Contact, hand-object state, reachable object, likely tool use, and next-affordance prediction tasks.",
"id": "human_object_affordance_modeling",
"name": "Human-Object Interaction and Affordance Modeling",
"why_it_matters": "The dataset supports modeling which actions the scene affords, not only which action label is currently visible."
},
{
"data_signals": [
"depth",
"pose/SLAM",
"multiview video",
"camera calibration",
"objects",
"motion traces"
],
"evaluation": "Map consistency, object permanence, spatial retrieval, future-state prediction, and novel-view or view-consistency probes.",
"first_build": "Persistent scene/object map prototypes built from depth, pose/SLAM, multiview video, and object cues.",
"id": "scene_object_memory",
"name": "3D/4D Scene and Object Memory",
"why_it_matters": "It moves beyond frame-level recognition toward world-state tracking, object permanence, and spatial reasoning."
},
{
"data_signals": [
"timestamps",
"file manifests",
"camera streams",
"audio streams",
"depth streams",
"calibration",
"annotation coverage"
],
"evaluation": "QA pass rate, drift estimates, missing-view tables, corruption reports, and exclusion or degraded-mode manifests.",
"first_build": "Per-episode QA for timestamp drift, stream availability, calibration consistency, corrupted files, and missing modalities.",
"id": "data_quality_sync_diagnostics",
"name": "Data Quality, Synchronization, and Missing-Modality Diagnostics",
"why_it_matters": "Large multimodal training fails quietly without strong data-quality gates, so QA should be a first-class artifact."
},
{
"data_signals": [
"mocap",
"hand trajectories",
"contacts",
"object states",
"egocentric video",
"language instructions"
],
"evaluation": "Retargeting validity, action prediction, contact consistency, imitation rollout quality, and sim-to-real assumption checks.",
"first_build": "Action-token conversion, robot-compatible targets, imitation-learning examples, and simulation transfer probes.",
"id": "policy_retargeting_simulation_transfer",
"name": "Policy, Retargeting, and Simulation Transfer",
"why_it_matters": "It creates a bridge from human egocentric experience to robot policies while keeping action-space assumptions explicit."
}
],
"practical_order": [
"Build the episode taxonomy and data-quality diagnostics first.",
"Lock the benchmark protocol and split manifests before reporting model scores.",
"Add representation-learning and skill-graph objectives once enough episodes are staged.",
"Add affordance, 3D/4D memory, and policy-retargeting branches after labels and action targets are measurable."
],
"public_boundary": "These are proposed development tracks. They are not reported as completed held-out benchmark results.",
"source_document": "ADDITIONAL_DEVELOPMENT_DIRECTIONS.md",
"status": "planned_research_directions",
"summary": "Concrete Xperience-10M project directions beyond the current minimal baselines, Qwen3-Omni LoRA plan, Cosmos/world-model track, and long-term native pretraining goal.",
"title": "Additional Development Directions"
},
"baseline_summary": {
"baseline_heads": "minimal and neural MLP heads",
"current_use": "task design, data-contract validation, case studies, and baseline comparison",
"split": "chronological single-episode split for public-sample diagnostics",
"task_count": 20
},
"directions": [
{
"code": "A",
"counts": {
"diagnostic": 0,
"direct": 3,
"proxy": 3,
"total_links": 6
},
"current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
"current_status": "partially implemented",
"extension_tasks": [
{
"current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior.",
"family": "classification",
"id": "body_motion_intensity",
"metric_name": "macro-F1",
"name": "Body and Hand Motion Intensity"
}
],
"focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
"id": "human_motion",
"name": "Human Modeling & Motion Understanding",
"next_steps": [
"Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
"Train sequence models over multi-episode motion trajectories instead of isolated windows.",
"Evaluate affordance prediction on held-out objects and held-out episodes."
],
"preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
"task_ids": [
"timeline_action",
"hand_trajectory_forecast",
"contact_prediction",
"object_relevance",
"interaction_text_prediction",
"imu_to_hand_pose"
],
"tasks": [
{
"architecture_family": "multiclass classifier",
"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
"current_limit": "Chronological single-episode split creates unseen future action classes.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Action Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_action",
"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.05,
"name": "macro-F1",
"neural_mlp": 0.0148
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current action class",
"primary_direction": "C",
"process_short": "window features -> action label builder -> classifier",
"research_name": "Egocentric Action Recognition",
"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
},
{
"architecture_family": "continuous regressor",
"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Hand Trajectory Forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "hand_trajectory_forecast",
"input": "The current all-modality window vector at time t.",
"input_short": "current multimodal window",
"metric": {
"better_baseline": "neural_mlp",
"direction": "lower",
"key": "mpjpe",
"minimal": 0.8647,
"name": "MPJPE",
"neural_mlp": 0.1079
},
"modalities": [
"motion_capture",
"video",
"depth",
"pose_slam",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "future hand-joint trajectory",
"primary_direction": "A",
"process_short": "current features -> future mocap target -> regression head",
"research_name": "3D Hand Motion Forecasting",
"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
},
{
"architecture_family": "binary classifier",
"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
"current_limit": "The public sample is degenerate for this target because one class dominates.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Contact State Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "contact_prediction",
"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
"input_short": "non-contact, non-caption features",
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 1.0,
"name": "macro-F1",
"neural_mlp": 1.0
},
"modalities": [
"motion_capture",
"video",
"depth",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "contact or no contact",
"primary_direction": "A",
"process_short": "feature filter -> contact target -> binary classifier",
"research_name": "Human-Object Contact Prediction",
"why": "Targets physical interaction state, a core affordance and manipulation signal."
},
{
"architecture_family": "multi-label classifier",
"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
"current_limit": "Object labels are language-derived and sparse in one episode.",
"direction_roles": {
"A": "proxy",
"C": "direct",
"D": "proxy"
},
"display_name": "Object Relevance Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
"label": "Neural predictions"
}
],
"family": "supervised",
"id": "object_relevance",
"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
"input_short": "non-caption multimodal features",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1803,
"name": "micro-F1",
"neural_mlp": 0.1679
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "relevant object set",
"primary_direction": "C",
"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
"research_name": "Object-Centric Interaction Recognition",
"why": "Connects egocentric activity to manipulated objects and early object-centric state."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Interaction text prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "interaction_text_prediction",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0444,
"name": "macro-F1",
"neural_mlp": 0.0381
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Interaction text prediction",
"why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.",
"direction_roles": {
"A": "direct",
"B": "proxy"
},
"display_name": "IMU-to-hand pose reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "imu_to_hand_pose",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 0.042,
"name": "MAE",
"neural_mlp": 0.0426
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "A",
"process_short": null,
"research_name": "IMU-to-hand pose reconstruction",
"why": "Measures human-motion reconstruction from wearable and motion cues."
}
]
},
{
"code": "B",
"counts": {
"diagnostic": 1,
"direct": 1,
"proxy": 3,
"total_links": 5
},
"current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
"current_status": "proxy tasks only",
"extension_tasks": [
{
"current_limit": "This checks the calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis.",
"family": "retrieval",
"id": "multi_view_consistency_retrieval",
"metric_name": "MRR",
"name": "Multi-View Consistency Retrieval"
}
],
"focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
"id": "reconstruction_rendering",
"name": "3D/4D Reconstruction & Neural Rendering",
"next_steps": [
"Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
"Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
"Evaluate novel-view synthesis and temporal consistency across held-out views/time."
],
"preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
"task_ids": [
"cross_modal_retrieval",
"modality_reconstruction",
"misalignment_detection",
"imu_to_hand_pose",
"camera_view_sync_retrieval"
],
"tasks": [
{
"architecture_family": "two-tower retrieval head",
"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
"direction_roles": {
"B": "proxy",
"C": "diagnostic",
"D": "proxy"
},
"display_name": "Cross-Modal Retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "cross_modal_retrieval",
"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
"input_short": "motion/IMU/pose query; depth/video candidates",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.2693,
"name": "MRR",
"neural_mlp": 0.13
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked visual windows",
"primary_direction": "C",
"process_short": "modality split -> projection -> nearest-neighbor ranker",
"research_name": "Multimodal Representation Retrieval",
"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
},
{
"architecture_family": "feature regressor",
"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
"direction_roles": {
"B": "proxy",
"D": "proxy"
},
"display_name": "Cross-Modal Reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "modality_reconstruction",
"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
"input_short": "motion, IMU, and camera/pose features",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "r2",
"minimal": -0.0153,
"name": "R2",
"neural_mlp": -0.0102
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "reconstructed depth/video vector",
"primary_direction": "B",
"process_short": "source-target split -> scaler -> regression head",
"research_name": "Modality Feature Reconstruction",
"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
},
{
"architecture_family": "pairwise classifier",
"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
"direction_roles": {
"B": "diagnostic",
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Multimodal Synchronization Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "misalignment_detection",
"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
"input_short": "motion-side and visual/depth-side feature groups",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.5052,
"name": "F1",
"neural_mlp": 0.7153
},
"modalities": [
"motion_capture",
"inertial",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "aligned or shifted",
"primary_direction": "C",
"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
"research_name": "Cross-Modal Misalignment Detection",
"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.",
"direction_roles": {
"A": "direct",
"B": "proxy"
},
"display_name": "IMU-to-hand pose reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "imu_to_hand_pose",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 0.042,
"name": "MAE",
"neural_mlp": 0.0426
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "A",
"process_short": null,
"research_name": "IMU-to-hand pose reconstruction",
"why": "Measures human-motion reconstruction from wearable and motion cues."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.",
"direction_roles": {
"B": "direct",
"D": "proxy"
},
"display_name": "Camera-view synchronization retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "retrieval",
"id": "camera_view_sync_retrieval",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.4943,
"name": "MRR",
"neural_mlp": 0.2409
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "B",
"process_short": null,
"research_name": "Camera-view synchronization retrieval",
"why": "Tests whether synchronized multi-view structure is recoverable across camera streams."
}
]
},
{
"code": "C",
"counts": {
"diagnostic": 4,
"direct": 10,
"proxy": 3,
"total_links": 17
},
"current_readout": "The unified 20-task suite directly targets egocentric action, task state, interaction, grounding, forecasting, and alignment.",
"current_status": "strongest implemented track",
"extension_tasks": [
{
"current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks.",
"family": "regression",
"id": "action_phase_progress",
"metric_name": "MAE",
"name": "Action Phase Progress Estimation"
}
],
"focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
"id": "egocentric_interaction",
"name": "Egocentric Vision & Interaction",
"next_steps": [
"Move from single-episode chronological splits to held-out-episode splits.",
"Use audio together with stronger multimodal backbones for action, intent, and grounding.",
"Evaluate long-horizon task success prediction and action-conditioned generation."
],
"preferred_background": "Video understanding, action recognition, or egocentric vision.",
"task_ids": [
"timeline_action",
"timeline_subtask",
"transition_detection",
"next_action",
"hand_trajectory_forecast",
"contact_prediction",
"object_relevance",
"caption_grounding",
"cross_modal_retrieval",
"temporal_order",
"misalignment_detection",
"long_horizon_next_action",
"next_subtask_forecast",
"interaction_text_prediction",
"action_object_relation",
"object_set_forecast",
"time_to_transition"
],
"tasks": [
{
"architecture_family": "multiclass classifier",
"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
"current_limit": "Chronological single-episode split creates unseen future action classes.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Action Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_action",
"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.05,
"name": "macro-F1",
"neural_mlp": 0.0148
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current action class",
"primary_direction": "C",
"process_short": "window features -> action label builder -> classifier",
"research_name": "Egocentric Action Recognition",
"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
},
{
"architecture_family": "multiclass classifier",
"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Procedure Step Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_subtask",
"input": "The same all-modality window vector used by action recognition.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0506,
"name": "macro-F1",
"neural_mlp": 0.0281
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current procedure step",
"primary_direction": "C",
"process_short": "window features -> subtask label builder -> classifier",
"research_name": "Temporal Subtask Recognition",
"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
},
{
"architecture_family": "binary classifier",
"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
"direction_roles": {
"C": "direct",
"D": "diagnostic"
},
"display_name": "Action Boundary Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "diagnostic",
"id": "transition_detection",
"input": "One all-modality window vector plus labels derived from action-change timestamps.",
"input_short": "current window with boundary target",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.6118,
"name": "macro-F1",
"neural_mlp": 0.5862
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "boundary or steady",
"primary_direction": "C",
"process_short": "action changes -> boundary labels -> binary classifier",
"research_name": "Temporal Action Segmentation",
"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
},
{
"architecture_family": "future-label classifier",
"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
"current_limit": "Unseen future labels dominate the single-episode chronological test.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Next-Action Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "next_action",
"input": "The current all-modality window vector at time t.",
"input_short": "current window at time t",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0593,
"name": "macro-F1",
"neural_mlp": 0.0419
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "action at t+20 frames",
"primary_direction": "C",
"process_short": "current features -> future label shift -> classifier",
"research_name": "Short-Horizon Intention Prediction",
"why": "Tests action intention/task-flow prediction from egocentric context."
},
{
"architecture_family": "continuous regressor",
"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Hand Trajectory Forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "hand_trajectory_forecast",
"input": "The current all-modality window vector at time t.",
"input_short": "current multimodal window",
"metric": {
"better_baseline": "neural_mlp",
"direction": "lower",
"key": "mpjpe",
"minimal": 0.8647,
"name": "MPJPE",
"neural_mlp": 0.1079
},
"modalities": [
"motion_capture",
"video",
"depth",
"pose_slam",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "future hand-joint trajectory",
"primary_direction": "A",
"process_short": "current features -> future mocap target -> regression head",
"research_name": "3D Hand Motion Forecasting",
"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
},
{
"architecture_family": "binary classifier",
"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
"current_limit": "The public sample is degenerate for this target because one class dominates.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Contact State Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "contact_prediction",
"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
"input_short": "non-contact, non-caption features",
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 1.0,
"name": "macro-F1",
"neural_mlp": 1.0
},
"modalities": [
"motion_capture",
"video",
"depth",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "contact or no contact",
"primary_direction": "A",
"process_short": "feature filter -> contact target -> binary classifier",
"research_name": "Human-Object Contact Prediction",
"why": "Targets physical interaction state, a core affordance and manipulation signal."
},
{
"architecture_family": "multi-label classifier",
"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
"current_limit": "Object labels are language-derived and sparse in one episode.",
"direction_roles": {
"A": "proxy",
"C": "direct",
"D": "proxy"
},
"display_name": "Object Relevance Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
"label": "Neural predictions"
}
],
"family": "supervised",
"id": "object_relevance",
"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
"input_short": "non-caption multimodal features",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1803,
"name": "micro-F1",
"neural_mlp": 0.1679
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "relevant object set",
"primary_direction": "C",
"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
"research_name": "Object-Centric Interaction Recognition",
"why": "Connects egocentric activity to manipulated objects and early object-centric state."
},
{
"architecture_family": "retrieval ranker",
"case_study": "A query like \"Pour milk into coffee\" should rank the windows from the actual pouring moment higher than unrelated windows.",
"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Language Grounding",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "caption_grounding",
"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
"input_short": "text-like query and candidate windows",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "mrr",
"minimal": 0.016,
"name": "MRR",
"neural_mlp": 0.0168
},
"modalities": [
"language",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked matching moments",
"primary_direction": "C",
"process_short": "query features -> candidate index -> cosine ranker",
"research_name": "Language-to-Moment Grounding",
"why": "Grounds language annotation into egocentric sensor time and task state."
},
{
"architecture_family": "two-tower retrieval head",
"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
"direction_roles": {
"B": "proxy",
"C": "diagnostic",
"D": "proxy"
},
"display_name": "Cross-Modal Retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "cross_modal_retrieval",
"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
"input_short": "motion/IMU/pose query; depth/video candidates",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.2693,
"name": "MRR",
"neural_mlp": 0.13
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked visual windows",
"primary_direction": "C",
"process_short": "modality split -> projection -> nearest-neighbor ranker",
"research_name": "Multimodal Representation Retrieval",
"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
},
{
"architecture_family": "pairwise classifier",
"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish the correct order (A then B) from the reversed order (B then A).",
"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Temporal Order Verification",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "temporal_order",
"input": "A pair of adjacent window vectors, plus their difference vector.",
"input_short": "two adjacent windows plus difference vector",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.54,
"name": "F1",
"neural_mlp": 0.852
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "correct or reversed",
"primary_direction": "C",
"process_short": "pair builder -> feature combiner -> binary classifier",
"research_name": "Temporal Order Verification",
"why": "Checks whether features encode local time direction and task progression."
},
{
"architecture_family": "pairwise classifier",
"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
"direction_roles": {
"B": "diagnostic",
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Multimodal Synchronization Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "misalignment_detection",
"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
"input_short": "motion-side and visual/depth-side feature groups",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.5052,
"name": "F1",
"neural_mlp": 0.7153
},
"modalities": [
"motion_capture",
"inertial",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "aligned or shifted",
"primary_direction": "C",
"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
"research_name": "Cross-Modal Misalignment Detection",
"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-action forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "long_horizon_next_action",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.075,
"name": "macro-F1",
"neural_mlp": 0.0655
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-action forecasting",
"why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Subtask labels are constrained to the available annotation vocabulary.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-subtask forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "next_subtask_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0455,
"name": "macro-F1",
"neural_mlp": 0.0507
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-subtask forecasting",
"why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Interaction text prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "interaction_text_prediction",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0444,
"name": "macro-F1",
"neural_mlp": 0.0381
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Interaction text prediction",
"why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Relation labels are derived from the public-sample annotation scope.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Action-object relation prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "action_object_relation",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0,
"name": "macro-F1",
"neural_mlp": 0.0
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Action-object relation prediction",
"why": "Tests whether action recognition and object state are connected as a relational interaction representation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "This is a set-level proxy, not a persistent 3D scene graph.",
"direction_roles": {
"C": "proxy",
"D": "direct"
},
"display_name": "Future object-set forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "multi-label",
"id": "object_set_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1694,
"name": "micro-F1",
"neural_mlp": 0.1972
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "D",
"process_short": null,
"research_name": "Future object-set forecasting",
"why": "Asks whether the current scene state supports predicting which objects will matter later."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Regression is local to the annotated public sample timeline.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Time-to-next-transition regression",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "time_to_transition",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 10.5374,
"name": "MAE frames",
"neural_mlp": 10.5545
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Time-to-next-transition regression",
"why": "Measures temporal boundary awareness as a continuous timing target."
}
]
},
{
"code": "D",
"counts": {
"diagnostic": 4,
"direct": 1,
"proxy": 10,
"total_links": 15
},
"current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
"current_status": "early proxy tasks",
"extension_tasks": [
{
"current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model.",
"family": "forecast",
"id": "ego_motion_forecast",
"metric_name": "MAE",
"name": "Short-Horizon Ego-Motion Forecasting"
}
],
"focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
"id": "world_modeling",
"name": "Scene Reconstruction & World Modeling",
"next_steps": [
"Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
"Add map consistency, object permanence, and spatial relation prediction tasks.",
"Train held-out-episode world models that predict future observations and task state."
],
"preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
"task_ids": [
"timeline_subtask",
"transition_detection",
"next_action",
"object_relevance",
"caption_grounding",
"cross_modal_retrieval",
"modality_reconstruction",
"temporal_order",
"misalignment_detection",
"long_horizon_next_action",
"next_subtask_forecast",
"action_object_relation",
"object_set_forecast",
"camera_view_sync_retrieval",
"time_to_transition"
],
"tasks": [
{
"architecture_family": "multiclass classifier",
"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Procedure Step Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_subtask",
"input": "The same all-modality window vector used by action recognition.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0506,
"name": "macro-F1",
"neural_mlp": 0.0281
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current procedure step",
"primary_direction": "C",
"process_short": "window features -> subtask label builder -> classifier",
"research_name": "Temporal Subtask Recognition",
"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
},
{
"architecture_family": "binary classifier",
"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
"direction_roles": {
"C": "direct",
"D": "diagnostic"
},
"display_name": "Action Boundary Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "diagnostic",
"id": "transition_detection",
"input": "One all-modality window vector plus labels derived from action-change timestamps.",
"input_short": "current window with boundary target",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.6118,
"name": "macro-F1",
"neural_mlp": 0.5862
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "boundary or steady",
"primary_direction": "C",
"process_short": "action changes -> boundary labels -> binary classifier",
"research_name": "Temporal Action Segmentation",
"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
},
{
"architecture_family": "future-label classifier",
"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
"current_limit": "Unseen future labels dominate the single-episode chronological test.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Next-Action Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "next_action",
"input": "The current all-modality window vector at time t.",
"input_short": "current window at time t",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0593,
"name": "macro-F1",
"neural_mlp": 0.0419
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "action at t+20 frames",
"primary_direction": "C",
"process_short": "current features -> future label shift -> classifier",
"research_name": "Short-Horizon Intention Prediction",
"why": "Tests action intention/task-flow prediction from egocentric context."
},
{
"architecture_family": "multi-label classifier",
"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
"current_limit": "Object labels are language-derived and sparse in one episode.",
"direction_roles": {
"A": "proxy",
"C": "direct",
"D": "proxy"
},
"display_name": "Object Relevance Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
"label": "Neural predictions"
}
],
"family": "supervised",
"id": "object_relevance",
"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
"input_short": "non-caption multimodal features",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1803,
"name": "micro-F1",
"neural_mlp": 0.1679
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "relevant object set",
"primary_direction": "C",
"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
"research_name": "Object-Centric Interaction Recognition",
"why": "Connects egocentric activity to manipulated objects and early object-centric state."
},
{
"architecture_family": "retrieval ranker",
"case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.",
"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Language Grounding",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "caption_grounding",
"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
"input_short": "text-like query and candidate windows",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "mrr",
"minimal": 0.016,
"name": "MRR",
"neural_mlp": 0.0168
},
"modalities": [
"language",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked matching moments",
"primary_direction": "C",
"process_short": "query features -> candidate index -> cosine ranker",
"research_name": "Language-to-Moment Grounding",
"why": "Grounds language annotation into egocentric sensor time and task state."
},
{
"architecture_family": "two-tower retrieval head",
"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
"direction_roles": {
"B": "proxy",
"C": "diagnostic",
"D": "proxy"
},
"display_name": "Cross-Modal Retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "cross_modal_retrieval",
"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
"input_short": "motion/IMU/pose query; depth/video candidates",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.2693,
"name": "MRR",
"neural_mlp": 0.13
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked visual windows",
"primary_direction": "C",
"process_short": "modality split -> projection -> nearest-neighbor ranker",
"research_name": "Multimodal Representation Retrieval",
"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
},
{
"architecture_family": "feature regressor",
"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
"direction_roles": {
"B": "proxy",
"D": "proxy"
},
"display_name": "Cross-Modal Reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "modality_reconstruction",
"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
"input_short": "motion, IMU, and camera/pose features",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "r2",
"minimal": -0.0153,
"name": "R2",
"neural_mlp": -0.0102
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "reconstructed depth/video vector",
"primary_direction": "B",
"process_short": "source-target split -> scaler -> regression head",
"research_name": "Modality Feature Reconstruction",
"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
},
{
"architecture_family": "pairwise classifier",
"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Temporal Order Verification",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "temporal_order",
"input": "A pair of adjacent window vectors, plus their difference vector.",
"input_short": "two adjacent windows plus difference vector",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.54,
"name": "F1",
"neural_mlp": 0.852
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "correct or reversed",
"primary_direction": "C",
"process_short": "pair builder -> feature combiner -> binary classifier",
"research_name": "Temporal Order Verification",
"why": "Checks whether features encode local time direction and task progression."
},
{
"architecture_family": "pairwise classifier",
"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
"direction_roles": {
"B": "diagnostic",
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Multimodal Synchronization Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "misalignment_detection",
"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
"input_short": "motion-side and visual/depth-side feature groups",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.5052,
"name": "F1",
"neural_mlp": 0.7153
},
"modalities": [
"motion_capture",
"inertial",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "aligned or shifted",
"primary_direction": "C",
"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
"research_name": "Cross-Modal Misalignment Detection",
"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-action forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "long_horizon_next_action",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.075,
"name": "macro-F1",
"neural_mlp": 0.0655
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-action forecasting",
"why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Subtask labels are constrained to the available annotation vocabulary.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-subtask forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "next_subtask_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0455,
"name": "macro-F1",
"neural_mlp": 0.0507
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-subtask forecasting",
"why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Relation labels are derived from the public-sample annotation scope.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Action-object relation prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "action_object_relation",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0,
"name": "macro-F1",
"neural_mlp": 0.0
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Action-object relation prediction",
"why": "Tests whether action recognition and object state are connected as a relational interaction representation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "This is a set-level proxy, not a persistent 3D scene graph.",
"direction_roles": {
"C": "proxy",
"D": "direct"
},
"display_name": "Future object-set forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "multi-label",
"id": "object_set_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1694,
"name": "micro-F1",
"neural_mlp": 0.1972
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "D",
"process_short": null,
"research_name": "Future object-set forecasting",
"why": "Asks whether the current scene state supports predicting which objects will matter later."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.",
"direction_roles": {
"B": "direct",
"D": "proxy"
},
"display_name": "Camera-view synchronization retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "retrieval",
"id": "camera_view_sync_retrieval",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.4943,
"name": "MRR",
"neural_mlp": 0.2409
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "B",
"process_short": null,
"research_name": "Camera-view synchronization retrieval",
"why": "Tests whether synchronized multi-view structure is recoverable across camera streams."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Regression is local to the annotated public sample timeline.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Time-to-next-transition regression",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "time_to_transition",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 10.5374,
"name": "MAE frames",
"neural_mlp": 10.5545
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Time-to-next-transition regression",
"why": "Measures temporal boundary awareness as a continuous timing target."
}
]
}
],
"foundation_model_plan": {
"decision": {
"external_reasoning_reference": "Gemini Robotics",
"first_policy_branch_candidates": [
"OpenVLA / OpenVLA-OFT",
"openpi pi0/pi0.5",
"NVIDIA GR00T"
],
"first_world_model_branch": "Cosmos 3",
"immediate_trainable_backbone": "Qwen3-Omni",
"long_term_native_pretraining_goal": "Xperience Embodied Foundation Model"
},
"evaluation_additions": [
{
"metrics": [
"JSON validity",
"macro-F1",
"accuracy",
"micro-F1"
],
"model_families": [
"Qwen3-Omni",
"Gemini Robotics reference"
],
"target": "structured_task_prediction"
},
{
"metrics": [
"retrieval rank",
"temporal consistency",
"feature reconstruction",
"qualitative visual inspection"
],
"model_families": [
"Cosmos 3"
],
"target": "future_state_prediction"
},
{
"metrics": [
"transition accuracy",
"contact accuracy",
"next-action accuracy"
],
"model_families": [
"Cosmos 3",
"OpenVLA",
"openpi",
"GR00T"
],
"target": "action_conditioned_dynamics"
},
{
"metrics": [
"held-out episode metrics",
"held-out session metrics",
"leakage checks"
],
"model_families": [
"all trainable branches"
],
"target": "cross_episode_generalization"
}
],
"execution_order": [
{
"action": "Stage at least 32 valid Xperience-10M episodes with held-out episode split.",
"name": "Data gate",
"step": 1
},
{
"action": "Run Qwen3-Omni action/subtask error analysis and targeted reruns to improve the verified diagnostic baseline.",
"name": "First held-out baseline",
"step": 2
},
{
"action": "Run 3-8 episode dry runs for any next backbone before scaling beyond the selected split.",
"name": "Model-selection dry run",
"step": 3
},
{
"action": "Promote Cosmos 3 beyond the current Nano compatibility and Super forward-dynamics runs only when loss metrics, preprocessing, and storage justify the added compute.",
"name": "World-model track",
"step": 4
},
{
"action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable.",
"name": "Policy branch",
"step": 5
},
{
"action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples.",
"name": "Publishing threshold",
"step": 6
},
{
"action": "Start a from-scratch Xperience Embodied Foundation Model only after smaller scaling stages, full-corpus storage, multi-node compute, and held-out evaluation protocols are in place.",
"name": "Xperience-native pretraining",
"step": 7
}
],
"model_families": [
{
"best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
"category": "omni_instruction_model",
"current_decision": "keep_as_first_pilot",
"entry_condition": "Selected episodes prepared with held-out episode split.",
"family": "Qwen3-Omni",
"openness": "open_weights_available_from_official_hf_repo",
"priority": 1,
"public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct",
"xperience10m_fit": [
"RGB/fisheye video, embedded audio, and language prompts can enter directly.",
"Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
"Matches current task outputs: labels, structured JSON, captions, and short decisions."
]
},
{
"best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
"category": "world_foundation_model",
"current_decision": "implemented_as_nano_future_window_and_super_forward_dynamics_branches",
"entry_condition": "Use separate metrics for Nano future-window retrieval and Super forward-dynamics MSE; do not compare them directly to Qwen JSON-task accuracy.",
"family": "Cosmos 3",
"openness": "track_official_nvidia_release_and_available_weights",
"priority": 2,
"public_source": "https://www.nvidia.com/en-us/ai/cosmos/",
"xperience10m_fit": [
"Uses video streams as visual state.",
"Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
"Better aligned with prediction/generation objectives than simple label classification."
]
},
{
"best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
"category": "humanoid_policy_foundation_model",
"current_decision": "track_as_humanoid_policy_branch",
"entry_condition": "Retargeting artifact and action-space definition exist.",
"family": "NVIDIA GR00T",
"openness": "track_official_nvidia_release_and_tooling",
"priority": 3,
"public_source": "https://developer.nvidia.com/isaac/gr00t",
"xperience10m_fit": [
"Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
"Egocentric video plus human motion can support affordance and interaction tasks."
]
},
{
"best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
"category": "vision_language_action_policy",
"current_decision": "candidate_after_action_space_design",
"entry_condition": "Window-to-action-token conversion is implemented and checked.",
"family": "OpenVLA / OpenVLA-OFT",
"openness": "open_project_and_weights",
"priority": 4,
"public_source": "https://openvla.github.io/",
"xperience10m_fit": [
"Good candidate when each window is expressed as visual observation, instruction/context, and action token.",
"Requires an explicit action target; current human egocentric labels are not robot controls by default."
]
},
{
"best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
"category": "robot_policy_model",
"current_decision": "candidate_policy_branch",
"entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
"family": "openpi pi0/pi0.5",
"openness": "open_source_policy_training_stack",
"priority": 5,
"public_source": "https://github.com/Physical-Intelligence/openpi",
"xperience10m_fit": [
"Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
"Better for policy branch than for current structured task JSON outputs."
]
},
{
"best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
"category": "closed_embodied_reasoning_reference",
"current_decision": "external_reference_only",
"entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
"family": "Gemini Robotics",
"openness": "closed_or_limited_access",
"priority": 6,
"public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/",
"xperience10m_fit": [
"Can help reason over egocentric scenes and task descriptions.",
"Not a local fine-tune target for this repo."
]
},
{
"best_role": "Cheaper policy baselines for observation-to-action experiments.",
"category": "lightweight_robot_policy_baselines",
"current_decision": "optional_baseline_after_data_staging",
"entry_condition": "Action labels and baseline protocol exist.",
"family": "Octo / SmolVLA-style lightweight policies",
"openness": "open_projects",
"priority": 7,
"public_source": "https://github.com/huggingface/lerobot",
"xperience10m_fit": [
"Useful after action target design.",
"Less directly omni-modal than Qwen3-Omni or Cosmos 3."
]
},
{
"best_role": "Domain model over synchronized embodied experience.",
"category": "xperience_native_pretraining_goal",
"current_decision": "future_goal_after_scaling_evidence",
"entry_condition": "Full-corpus data path, PB-scale storage, multi-node compute, and positive smaller-run scaling evidence.",
"family": "Xperience Embodied Foundation Model",
"openness": "future project-specific model if full-corpus access and compute exist",
"priority": 8,
"public_source": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md",
"xperience10m_fit": [
"Uses the full aligned modality stack rather than treating sensors as auxiliary metadata.",
"Targets temporal embodied representation learning across perception, motion, geometry, audio, and language.",
"Can become the shared pretraining backbone for Qwen-style instruction tasks, Cosmos-style world modeling, and policy/action branches."
]
}
],
"source_links": [
{
"label": "Qwen3-Omni official HF model",
"url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
},
{
"label": "NVIDIA Cosmos",
"url": "https://www.nvidia.com/en-us/ai/cosmos/"
},
{
"label": "NVIDIA Isaac GR00T",
"url": "https://developer.nvidia.com/isaac/gr00t"
},
{
"label": "OpenVLA",
"url": "https://openvla.github.io/"
},
{
"label": "openpi",
"url": "https://github.com/Physical-Intelligence/openpi"
},
{
"label": "Gemini Robotics",
"url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
},
{
"label": "Octo",
"url": "https://octo-models.github.io/"
},
{
"label": "LeRobot / SmolVLA",
"url": "https://github.com/huggingface/lerobot"
},
{
"label": "Xperience Embodied Foundation Model pretraining plan",
"url": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md"
}
],
"status": "planning_artifact"
},
"generated_at_utc": "2026-06-21T19:51:24+00:00",
"omni_plan": {
"adapter": "LoRA rank 16, alpha 32, dropout 0.05",
"backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"evaluation": [
"JSON validity",
"action macro-F1",
"subtask accuracy",
"transition accuracy",
"next-action accuracy",
"contact accuracy",
"object micro-F1",
"held-out episode count"
],
"first_pilot": "32 held-out-episode pilot after valid episodes are prepared",
"training_unit": "episode-level split, window-level supervised examples"
},
"phases": [
{
"completion_evidence": [
"PROJECT_STATUS.md",
"EVALUATION_PROTOCOL.md",
"RESEARCH_TAKEAWAYS.md",
"docs/data/summary_metrics.json",
"results/episode_task_suite/summary_report.json"
],
"deliverables": [
"1161 aligned windows",
"12 task contracts",
"minimal baseline heads",
"neural MLP heads",
"modality atlas",
"task walkthroughs",
"derived figures"
],
"entry_condition": "One public Xperience-10M sample episode is available.",
"id": "public_sample_task_lab",
"name": "Public-Sample Task Lab",
"reader_takeaway": "The public sample supports task design, feature contracts, walkthroughs, and baseline comparisons.",
"stage": "now",
"status": "implemented"
},
{
"completion_evidence": [
"results/omni_finetune/DATA_ACCESS_STATUS.md",
"results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
"results/omni_finetune/source_discovery.json"
],
"deliverables": [
"128 selected episodes",
"episode manifest",
"missing-view manifest",
"held-out episode split",
"source-discovery report"
],
"entry_condition": "Gated dataset availability and enough storage for selected episodes.",
"id": "multi_episode_data_staging",
"name": "Multi-Episode Data Preparation",
"reader_takeaway": "The first selected split is available for Qwen3-Omni diagnostics, with train/test separation at the episode level.",
"stage": "future",
"status": "implemented_for_first_pilot"
},
{
"completion_evidence": [
"docs/data/omni_finetune_verified_result.json",
"docs/data/qwen3_v5_v6_comparison.json",
"results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md",
"results/omni_finetune/verified_public/",
"dataset_manifest.json",
"training_metadata.json",
"progress.jsonl",
"metrics.json",
"predictions.jsonl",
"RUN_REPORT.md"
],
"deliverables": [
"dataset JSONL/media manifests",
"LoRA adapter checkpoint",
"progress logs",
"validation monitoring",
"held-out predictions",
"metrics",
"confusion matrices",
"run report",
"v5/v6 comparison",
"public LoRA adapter repo"
],
"entry_condition": "Selected episodes are prepared locally with no train/test episode leakage.",
"id": "qwen3_omni_lora_diagnostic_pilot",
"name": "Qwen3-Omni LoRA Latest Diagnostic Branch",
"reader_takeaway": "The final omni-model diagnostic result establishes the full held-out training/validation/evaluation loop and meets the strict-JSON target, but weak action/subtask metrics make it a diagnostic baseline.",
"stage": "future",
"status": "verified_latest_branch"
},
{
"completion_evidence": [
"results/omni_finetune/multi_episode_128_task_baselines/BASELINE_ALIGNMENT_REPORT.md",
"results/omni_finetune/multi_episode_128_task_baselines/summary_report.json",
"scripts/omni/run_128_task_baselines.py"
],
"deliverables": [
"same 12 task ids",
"simple metadata/text baselines",
"neural MLP baselines for JSON-supported labels",
"explicit unsupported markers for raw-feature-only tasks"
],
"entry_condition": "Derived Qwen JSONL export for the selected 96/16/16 split.",
"id": "multi_episode_128_same_split_baselines",
"name": "128-Episode Same-Split Simple/NN Baselines",
"reader_takeaway": "The simple and neural baseline framing is now aligned to the selected 128-episode setup; trajectory, retrieval, reconstruction, and misalignment variants still need raw 128 feature blocks for exact feature-level reproduction.",
"stage": "future",
"status": "verified_companion_result"
},
{
"completion_evidence": [
"TASK_SUITE_ENHANCEMENT_128.md",
"docs/data/task_suite_enhancement_128.json",
"results/omni_finetune/task_suite_enhancement_128_v1_20260608/enhancement_plan.json",
"scripts/omni/build_task_suite_enhancement_128.py"
],
"deliverables": [
"dense-window and multiscale export estimates",
"hierarchical action/subtask target contract",
"raw-feature shard priorities for unsupported tasks",
"Qwen v5 and Cosmos continuation run cards",
"publication-ready enhancement artifacts"
],
"entry_condition": "Same selected 96/16/16 split and current public 3,808-window export.",
"id": "task_suite_enhancement_128",
"name": "128-Episode Task Suite Enhancement Pack",
"reader_takeaway": "The current 128-episode setup still has headroom: use multiscale_20s10_40s20_80s40, hierarchical labels, label-normalized scoring, and raw-feature shards before adding more episodes.",
"stage": "future",
"status": "current"
},
{
"completion_evidence": [
"error-analysis tables",
"held-out metrics by failure type",
"verified public-safe package"
],
"deliverables": [
"same 96/16/16 episode split",
"action/subtask confusion analysis",
"unseen-label analysis",
"object/action family breakdowns",
"held-out test evaluation",
"comparison to the final verified Qwen baseline"
],
"entry_condition": "The final diagnostic package meets strict JSON validity but has weak action/subtask held-out quality.",
"id": "qwen3_omni_structured_output_error_analysis",
"name": "Action/Subtask Error-Analysis Pass",
"reader_takeaway": "The next pass should improve action/subtask quality before larger model-quality claims.",
"stage": "future",
"status": "active_next_step"
},
{
"completion_evidence": [
"FOUNDATION_MODEL_PLAN.md",
"docs/data/foundation_model_plan.json",
"research_roadmap_interactive.json"
],
"deliverables": [
"backbone registry",
"Cosmos 3 world-model track plan",
"Cosmos3-Super Forward-Dynamics LoRA verified package",
"Qwen3-Omni LoRA baseline plan",
"OpenVLA/openpi/GR00T policy-branch candidates",
"model-specific evaluation additions"
],
"entry_condition": "The selected episodes are prepared or a 3-8 episode dry run is available for preprocessing checks.",
"id": "foundation_model_selection_matrix",
"name": "Foundation-Model Selection Matrix",
"reader_takeaway": "Qwen3-Omni remains the structured JSON held-out pilot; Cosmos 3 is the first world-model track. Cosmos3-Super now has a verified forward-dynamics LoRA over camera-pose proxy targets, while VLA/policy models wait for robot-compatible action targets.",
"stage": "future",
"status": "current"
},
{
"completion_evidence": [
"held-out metrics by session",
"held-out metrics by task",
"held-out metrics by modality",
"ablation tables",
"qualitative error analysis"
],
"deliverables": [
"split-by-session metrics",
"modality ablations",
"calibration/object/language error analysis",
"missing-view sensitivity analysis"
],
"entry_condition": "The selected-episode pilot trains and evaluates cleanly.",
"id": "robustness_run_64_128_episode",
"name": "64-128 Episode Robustness Run",
"reader_takeaway": "The robustness run tests whether the pilot conclusions survive broader sessions and missing modalities.",
"stage": "future",
"status": "partially_implemented"
},
{
"completion_evidence": [
"task-specific held-out evaluations",
"verified Cosmos3-Super forward-dynamics LoRA package",
"qualitative inspection",
"updated model cards"
],
"deliverables": [
"Cosmos 3 future-window and action-conditioned world-model probes",
"OpenVLA/openpi/GR00T action-policy baseline",
"audio/video/depth/pose/mocap conditioning checks",
"affordance and object-interaction tasks",
"synthetic-data usefulness test"
],
"entry_condition": "Enough multi-episode data, compute budget, and model-specific action/world-state targets.",
"id": "foundation_world_model_extensions",
"name": "Cosmos 3 and Policy-Model Extensions",
"reader_takeaway": "The Cosmos3 track now includes Nano future-window compatibility and Super forward-dynamics LoRA; the long-term direction remains richer multimodal representation learning with model tracks chosen by task fit rather than by a single default backbone.",
"stage": "future",
"status": "planned"
},
{
"completion_evidence": [
"pretraining metadata",
"checkpoint inventory",
"scaling curves",
"held-out evaluation reports",
"qualitative retrieval or future-state examples",
"safety and data-boundary report"
],
"deliverables": [
"full-corpus episode and split manifests",
"pretraining shard and provenance manifests",
"0.3B-1B and 1B-3B scaling pilots",
"3B-7B Xperience-native domain model target",
"held-out episode/session/activity/object evaluations",
"missing-modality robustness report",
"model card and data-boundary report"
],
"entry_condition": "Full-corpus access, PB-scale storage path, high-throughput data loading, multi-node compute, and positive scaling evidence from smaller multi-episode runs.",
"id": "xperience_embodied_foundation_pretraining",
"name": "Xperience Embodied Foundation Model Pretraining",
"reader_takeaway": "The final research direction is a domain-specific embodied foundation model trained directly on Xperience-10M, after smaller pilots justify the cost and infrastructure.",
"stage": "future",
"status": "future"
}
],
"scale_up": {
"access_status": "The gated Xperience-10M dataset is available for selected multi-episode pilot preparation.",
"candidate_scan_top_level_sessions": 802,
"estimated_bytes": 298188841943,
"exclude": [
"visualization.rrd"
],
"selection_strategy": "stratified_round_robin_by_top_level_session",
"status": "verified_full_128_episode_diagnostic_result",
"target_episodes": 128,
"valid_candidates": 12102
},
"scope": {
"feature_blocks": 18,
"feature_dim": 8546,
"num_frames": 5821,
"num_windows": 1161,
"sample_episode_count": 1,
"stride_frames": 5,
"warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes.",
"window_frames": 20
},
"source_files": [
"docs/data/research_directions.json",
"docs/data/task_walkthroughs.json",
"docs/data/research_roadmap.json",
"docs/data/foundation_model_plan.json",
"docs/data/three_foundation_pipelines.json",
"docs/data/additional_development_directions.json",
"docs/data/summary_metrics.json",
"docs/data/research_direction_extensions.json",
"results/episode_task_suite/summary_report.json",
"results/episode_task_suite/feature_manifest.json"
],
"tasks": [
{
"architecture_family": "multiclass classifier",
"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
"current_limit": "Chronological single-episode split creates unseen future action classes.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Action Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_action",
"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.05,
"name": "macro-F1",
"neural_mlp": 0.0148
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current action class",
"primary_direction": "C",
"process_short": "window features -> action label builder -> classifier",
"research_name": "Egocentric Action Recognition",
"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
},
{
"architecture_family": "multiclass classifier",
"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Procedure Step Recognition",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "timeline_subtask",
"input": "The same all-modality window vector used by action recognition.",
"input_short": "20-frame multimodal window",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0506,
"name": "macro-F1",
"neural_mlp": 0.0281
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "current procedure step",
"primary_direction": "C",
"process_short": "window features -> subtask label builder -> classifier",
"research_name": "Temporal Subtask Recognition",
"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
},
{
"architecture_family": "binary classifier",
"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
"direction_roles": {
"C": "direct",
"D": "diagnostic"
},
"display_name": "Action Boundary Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "diagnostic",
"id": "transition_detection",
"input": "One all-modality window vector plus labels derived from action-change timestamps.",
"input_short": "current window with boundary target",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.6118,
"name": "macro-F1",
"neural_mlp": 0.5862
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial",
"language"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "boundary or steady",
"primary_direction": "C",
"process_short": "action changes -> boundary labels -> binary classifier",
"research_name": "Temporal Action Segmentation",
"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
},
{
"architecture_family": "future-label classifier",
"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
"current_limit": "Unseen future labels dominate the single-episode chronological test.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Next-Action Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "next_action",
"input": "The current all-modality window vector at time t.",
"input_short": "current window at time t",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0593,
"name": "macro-F1",
"neural_mlp": 0.0419
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "action at t+20 frames",
"primary_direction": "C",
"process_short": "current features -> future label shift -> classifier",
"research_name": "Short-Horizon Intention Prediction",
"why": "Tests action intention/task-flow prediction from egocentric context."
},
{
"architecture_family": "continuous regressor",
"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Hand Trajectory Forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "hand_trajectory_forecast",
"input": "The current all-modality window vector at time t.",
"input_short": "current multimodal window",
"metric": {
"better_baseline": "neural_mlp",
"direction": "lower",
"key": "mpjpe",
"minimal": 0.8647,
"name": "MPJPE",
"neural_mlp": 0.1079
},
"modalities": [
"motion_capture",
"video",
"depth",
"pose_slam",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "future hand-joint trajectory",
"primary_direction": "A",
"process_short": "current features -> future mocap target -> regression head",
"research_name": "3D Hand Motion Forecasting",
"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
},
{
"architecture_family": "binary classifier",
"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
"current_limit": "The public sample is degenerate for this target because one class dominates.",
"direction_roles": {
"A": "direct",
"C": "proxy"
},
"display_name": "Contact State Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
"label": "Neural predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
"label": "Confusion matrix"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
"label": "Neural confusion matrix"
}
],
"family": "supervised",
"id": "contact_prediction",
"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
"input_short": "non-contact, non-caption features",
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 1.0,
"name": "macro-F1",
"neural_mlp": 1.0
},
"modalities": [
"motion_capture",
"video",
"depth",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "contact or no contact",
"primary_direction": "A",
"process_short": "feature filter -> contact target -> binary classifier",
"research_name": "Human-Object Contact Prediction",
"why": "Targets physical interaction state, a core affordance and manipulation signal."
},
{
"architecture_family": "multi-label classifier",
"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
"current_limit": "Object labels are language-derived and sparse in one episode.",
"direction_roles": {
"A": "proxy",
"C": "direct",
"D": "proxy"
},
"display_name": "Object Relevance Prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
"label": "Neural predictions"
}
],
"family": "supervised",
"id": "object_relevance",
"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
"input_short": "non-caption multimodal features",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1803,
"name": "micro-F1",
"neural_mlp": 0.1679
},
"modalities": [
"video",
"depth",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "relevant object set",
"primary_direction": "C",
"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
"research_name": "Object-Centric Interaction Recognition",
"why": "Connects egocentric activity to manipulated objects and early object-centric state."
},
{
"architecture_family": "retrieval ranker",
"case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.",
"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Language Grounding",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "caption_grounding",
"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
"input_short": "text-like query and candidate windows",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "mrr",
"minimal": 0.016,
"name": "MRR",
"neural_mlp": 0.0168
},
"modalities": [
"language",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked matching moments",
"primary_direction": "C",
"process_short": "query features -> candidate index -> cosine ranker",
"research_name": "Language-to-Moment Grounding",
"why": "Grounds language annotation into egocentric sensor time and task state."
},
{
"architecture_family": "two-tower retrieval head",
"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
"direction_roles": {
"B": "proxy",
"C": "diagnostic",
"D": "proxy"
},
"display_name": "Cross-Modal Retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
"label": "Neural metrics"
}
],
"family": "retrieval",
"id": "cross_modal_retrieval",
"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
"input_short": "motion/IMU/pose query; depth/video candidates",
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.2693,
"name": "MRR",
"neural_mlp": 0.13
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "ranked visual windows",
"primary_direction": "C",
"process_short": "modality split -> projection -> nearest-neighbor ranker",
"research_name": "Multimodal Representation Retrieval",
"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
},
{
"architecture_family": "feature regressor",
"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
"direction_roles": {
"B": "proxy",
"D": "proxy"
},
"display_name": "Cross-Modal Reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
"label": "Neural metrics"
}
],
"family": "forecast",
"id": "modality_reconstruction",
"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
"input_short": "motion, IMU, and camera/pose features",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "r2",
"minimal": -0.0153,
"name": "R2",
"neural_mlp": -0.0102
},
"modalities": [
"motion_capture",
"inertial",
"pose_slam",
"depth",
"video"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "reconstructed depth/video vector",
"primary_direction": "B",
"process_short": "source-target split -> scaler -> regression head",
"research_name": "Modality Feature Reconstruction",
"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
},
{
"architecture_family": "pairwise classifier",
"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Temporal Order Verification",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "temporal_order",
"input": "A pair of adjacent window vectors, plus their difference vector.",
"input_short": "two adjacent windows plus difference vector",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.54,
"name": "F1",
"neural_mlp": 0.852
},
"modalities": [
"video",
"pose_slam",
"motion_capture",
"inertial"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "correct or reversed",
"primary_direction": "C",
"process_short": "pair builder -> feature combiner -> binary classifier",
"research_name": "Temporal Order Verification",
"why": "Checks whether features encode local time direction and task progression."
},
{
"architecture_family": "pairwise classifier",
"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
"direction_roles": {
"B": "diagnostic",
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Multimodal Synchronization Detection",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
"label": "Minimal metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
"label": "Neural metrics"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
"label": "Minimal predictions"
},
{
"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
"label": "Neural predictions"
}
],
"family": "diagnostic",
"id": "misalignment_detection",
"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
"input_short": "motion-side and visual/depth-side feature groups",
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "f1",
"minimal": 0.5052,
"name": "F1",
"neural_mlp": 0.7153
},
"modalities": [
"motion_capture",
"inertial",
"video",
"depth",
"pose_slam"
],
"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
"output_short": "aligned or shifted",
"primary_direction": "C",
"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
"research_name": "Cross-Modal Misalignment Detection",
"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-action forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "long_horizon_next_action",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.075,
"name": "macro-F1",
"neural_mlp": 0.0655
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-action forecasting",
"why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Subtask labels are constrained to the available annotation vocabulary.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Long-horizon next-subtask forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "next_subtask_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0455,
"name": "macro-F1",
"neural_mlp": 0.0507
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Long-horizon next-subtask forecasting",
"why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.",
"direction_roles": {
"A": "proxy",
"C": "direct"
},
"display_name": "Interaction text prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "interaction_text_prediction",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0444,
"name": "macro-F1",
"neural_mlp": 0.0381
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Interaction text prediction",
"why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Relation labels are derived from the public-sample annotation scope.",
"direction_roles": {
"C": "direct",
"D": "proxy"
},
"display_name": "Action-object relation prediction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "classification",
"id": "action_object_relation",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "tie",
"direction": "higher",
"key": "macro_f1",
"minimal": 0.0,
"name": "macro-F1",
"neural_mlp": 0.0
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Action-object relation prediction",
"why": "Tests whether action recognition and object state are connected as a relational interaction representation."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "This is a set-level proxy, not a persistent 3D scene graph.",
"direction_roles": {
"C": "proxy",
"D": "direct"
},
"display_name": "Future object-set forecasting",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "multi-label",
"id": "object_set_forecast",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "neural_mlp",
"direction": "higher",
"key": "micro_f1",
"minimal": 0.1694,
"name": "micro-F1",
"neural_mlp": 0.1972
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "D",
"process_short": null,
"research_name": "Future object-set forecasting",
"why": "Asks whether the current scene state supports predicting which objects will matter later."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.",
"direction_roles": {
"A": "direct",
"B": "proxy"
},
"display_name": "IMU-to-hand pose reconstruction",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "imu_to_hand_pose",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 0.042,
"name": "MAE",
"neural_mlp": 0.0426
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "A",
"process_short": null,
"research_name": "IMU-to-hand pose reconstruction",
"why": "Measures human-motion reconstruction from wearable and motion cues."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.",
"direction_roles": {
"B": "direct",
"D": "proxy"
},
"display_name": "Camera-view synchronization retrieval",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "retrieval",
"id": "camera_view_sync_retrieval",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "higher",
"key": "mrr",
"minimal": 0.4943,
"name": "MRR",
"neural_mlp": 0.2409
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "B",
"process_short": null,
"research_name": "Camera-view synchronization retrieval",
"why": "Tests whether synchronized multi-view structure is recoverable across camera streams."
},
{
"architecture_family": null,
"case_study": null,
"current_limit": "Regression is local to the annotated public sample timeline.",
"direction_roles": {
"C": "diagnostic",
"D": "diagnostic"
},
"display_name": "Time-to-next-transition regression",
"evidence_links": [
{
"href": "data/task_walkthroughs.json",
"label": "Task walkthrough"
},
{
"href": "single_episode_explorer.html",
"label": "Single-episode explorer"
}
],
"family": "regression",
"id": "time_to_transition",
"input": null,
"input_short": null,
"metric": {
"better_baseline": "minimal",
"direction": "lower",
"key": "mae",
"minimal": 10.5374,
"name": "MAE frames",
"neural_mlp": 10.5545
},
"modalities": [],
"module_summary": null,
"output_short": null,
"primary_direction": "C",
"process_short": null,
"research_name": "Time-to-next-transition regression",
"why": "Measures temporal boundary awareness as a continuous timing target."
}
],
"three_foundation_pipelines": {
"claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.",
"source_document": "THREE_FOUNDATION_PIPELINES.md",
"status": "pipeline_plan",
"title": "Three Foundation Pipeline Tracks",
"tracks": [
{
"avoid_claiming_now": [
"full neural rendering",
"full 3D reconstruction",
"general spatial intelligence without artifact-level evidence"
],
"core_inputs": [
"multiview RGB",
"egocentric video",
"depth",
"camera pose",
"calibration",
"object cues",
"language questions"
],
"current_maturity": "Ready as a pipeline and evaluation contract.",
"diagram_flow": [
{
"items": [
"multiview RGB plus egocentric video",
"metric depth and confidence",
"camera pose, calibration, SLAM",
"object, contact, and language cues"
],
"stage": "inputs"
},
{
"items": [
"spatial QA and object count",
"object permanence across windows",
"relative location and retrieval",
"pose-aware 3D consistency"
],
"stage": "tasks_targets"
},
{
"items": [
"export scene/object memory records",
"train spatial-memory encoder",
"add geometry-aware QA and retrieval heads",
"keep episode-level split discipline"
],
"stage": "train_models"
},
{
"items": [
"held-out episode spatial metrics",
"count and relation accuracy",
"retrieval rank and consistency",
"saved predictions before public claim"
],
"stage": "evaluate_gates"
}
],
"diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png",
"first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.",
"id": "spatial_intelligence",
"image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.",
"intermediate_artifacts": [
"synchronized camera window manifest",
"pose and depth availability report",
"scene and object memory records",
"object permanence targets",
"spatial relation targets",
"spatial QA prompts"
],
"next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.",
"one_sample_training_io": {
"boundary": "This yields a one-episode spatial training-pair recipe and proxy tasks; full spatial-intelligence claims require held-out multi-episode depth/pose/scene-memory metrics.",
"existing_task_hooks": [
"object_relevance",
"modality_reconstruction",
"caption_grounding",
"object_set_forecast",
"camera_view_sync_retrieval"
],
"input_builder": "Slice each 20-frame window, then join multiview RGB summaries with depth, camera pose, SLAM/calibration, object cues, contact cues, and optional language questions from the public annotation timeline.",
"sample_basis": "Single public sample episode: 5,821 frames, 1,161 overlapping 20-frame windows, 5-frame stride, about 20 FPS.",
"source_artifacts": [
"results/episode_task_suite/windows.csv",
"results/episode_task_suite/shared_windows.npz",
"results/episode_task_suite/feature_manifest.json",
"official sample annotation.hdf5",
"official sample six MP4 camera streams"
],
"target_builder": "Create spatial targets such as camera-view match, object relevance, object-set memory, depth/pose reconstruction proxy, caption-grounded retrieval, and spatial QA answers."
},
"outputs": [
"object count",
"object persistence",
"relative location",
"3D geometry consistency",
"multiview retrieval",
"camera-motion-aware scene memory",
"language answers grounded in the scene"
],
"question": "Can the model recover and reason over space from video?",
"title": "Spatial intelligence models",
"website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png"
},
{
"avoid_claiming_now": [
"strong world model from structured future-task scores alone",
"visual future quality without visual or latent future metrics"
],
"core_inputs": [
"observed video windows",
"audio",
"sensor windows",
"hand and body motion",
"object and contact state",
"action and subtask labels",
"future windows"
],
"current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.",
"diagram_flow": [
{
"items": [
"observed video/audio/sensor window",
"hand/body motion and camera pose",
"object/contact state",
"action and subtask labels"
],
"stage": "inputs"
},
{
"items": [
"next action and next subtask",
"future object set",
"contact transition",
"camera-motion delta or latent future"
],
"stage": "tasks_targets"
},
{
"items": [
"Qwen structured future probes",
"Cosmos/dynamics branch separately",
"latent rollout or reconstruction loss",
"no target-side future leakage"
],
"stage": "train_models"
},
{
"items": [
"held-out future-task metrics",
"contact and object-set F1",
"rollout or latent consistency",
"per-episode breakdown and examples"
],
"stage": "evaluate_gates"
}
],
"diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png",
"first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.",
"id": "human_video_world_models",
"image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.",
"intermediate_artifacts": [
"observed and future window pairs",
"future label targets",
"action-conditioned target records",
"visual or latent reconstruction targets",
"temporal consistency metadata"
],
"next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.",
"one_sample_training_io": {
"boundary": "Future labels and future windows must stay out of the input. Structured future probes are evidence for the pipeline, not a full visual world-model claim by themselves.",
"existing_task_hooks": [
"next_action",
"long_horizon_next_action",
"next_subtask_forecast",
"object_set_forecast",
"time_to_transition",
"ego_motion_forecast"
],
"input_builder": "Use the current 20-frame observed window at time t: RGB/audio/sensor summaries, hand/body motion, camera pose, current object/contact state, and current action/subtask context only.",
"sample_basis": "Single public sample episode: current observed windows are paired with shifted future labels or future-window features from the same timeline.",
"source_artifacts": [
"results/episode_task_suite/windows.csv",
"results/episode_task_suite/shared_windows.npz",
"results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json",
"results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json"
],
"target_builder": "Shift the episode timeline forward to produce next-action, next-subtask, future object-set, contact-transition, time-to-transition, camera-motion delta, or latent/future-feature targets."
},
"outputs": [
"next action",
"next subtask",
"future object set",
"future state embedding",
"camera-motion delta",
"contact transition",
"future-window quality metrics"
],
"question": "Can the model predict what happens next?",
"title": "Human-video world models",
"website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png"
},
{
"avoid_claiming_now": [
"robot policy quality",
"policy generalization before action-space evidence exists"
],
"core_inputs": [
"egocentric video",
"language captions",
"hand and body motion",
"contacts",
"objects",
"procedure and subtask labels"
],
"current_maturity": "Feasible but gated by action-target conversion.",
"diagram_flow": [
{
"items": [
"egocentric video and captions",
"objects, contacts, and procedures",
"hand/body motion windows",
"subtask labels and language context"
],
"stage": "inputs"
},
{
"items": [
"action-token vocabulary",
"next action and action chunks",
"object-conditioned actions",
"contact state and subtask transition"
],
"stage": "tasks_targets"
},
{
"items": [
"build action-space converter",
"normalize and audit action chunks",
"train VLA/policy-compatible head",
"track leakage and retargeting reports"
],
"stage": "train_models"
},
{
"items": [
"held-out action metrics",
"chunk and next-action accuracy",
"object/contact-conditioned scores",
"policy card before robot-policy claim"
],
"stage": "evaluate_gates"
}
],
"diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png",
"first_pipeline": "Define the action space, use existing 20-task next-action/contact/object-conditioned tasks first, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.",
"id": "vision_language_action",
"image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.",
"intermediate_artifacts": [
"action-token vocabulary",
"action-chunk windows",
"normalization stats",
"retargeting report",
"leakage audit",
"action-space model card"
],
"next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.",
"one_sample_training_io": {
"boundary": "This is a VLA/policy data-conversion recipe for the one-sample suite. Robot policy claims require a later action-space converter, normalization, retargeting report, and held-out policy metrics.",
"existing_task_hooks": [
"timeline_action",
"next_action",
"hand_trajectory_forecast",
"contact_prediction",
"interaction_text_prediction",
"action_object_relation"
],
"input_builder": "Use egocentric/fisheye video windows, caption and object context, hand/body mocap, contact state, and current subtask text as the observation-language side of each training pair.",
"sample_basis": "Single public sample episode: observation-language windows are paired with action-token proxies because robot retargeted action chunks are not part of the public sample yet.",
"source_artifacts": [
"results/episode_task_suite/windows.csv",
"results/episode_task_suite/shared_windows.npz",
"results/episode_task_suite/task_walkthroughs/task_walkthroughs.json",
"official sample annotation.hdf5"
],
"target_builder": "Create action-token proxy targets: current or next action, object-conditioned action relation, contact state, interaction-text class, subtask transition, or hand-trajectory/action-chunk proxy."
},
"outputs": [
"next action",
"action chunk",
"object-conditioned action",
"contact state",
"subtask transition",
"policy or VLA held-out metrics"
],
"question": "Can the model turn what it sees and reads into action?",
"title": "Vision-language-action models",
"website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png"
}
]
},
"title": "Interactive Research Roadmap"
}