| { |
| "additional_development_directions": { |
| "directions": [ |
| { |
| "data_signals": [ |
| "language annotations", |
| "object labels", |
| "scene context", |
| "video thumbnails", |
| "motion statistics", |
| "missing-modality flags" |
| ], |
| "evaluation": "Coverage by session, activity, object, and modality; duplicate checks; train/val/test leakage checks; reproducible selection report.", |
| "first_build": "Episode atlas, category tags, balance report, and split builder across activities, objects, scenes, people, sessions, and missing modalities.", |
| "id": "episode_taxonomy_data_engine", |
| "name": "Episode Taxonomy and Data Engine", |
| "why_it_matters": "Fine-tuning quality depends on selecting representative episodes instead of sampling randomly from a large corpus." |
| }, |
| { |
| "data_signals": [ |
| "episode manifests", |
| "window manifests", |
| "task labels", |
| "prediction files", |
| "metric files" |
| ], |
| "evaluation": "Versioned splits, deterministic metric scripts, task-specific confidence intervals, and model-card reporting templates.", |
| "first_build": "Fixed train/val/test manifests, task cards, leakage checks, metric scripts, and small reference baselines.", |
| "id": "standardized_benchmark_protocol", |
| "name": "Standardized Benchmark Protocol", |
| "why_it_matters": "Future model results become comparable across Qwen, Cosmos-style world models, policy models, and smaller task heads." |
| }, |
| { |
| "data_signals": [ |
| "video", |
| "audio", |
| "depth", |
| "pose/SLAM", |
| "mocap", |
| "IMU", |
| "language" |
| ], |
| "evaluation": "Cross-modal retrieval, missing-modality reconstruction, transfer to the 20 task heads, and held-out episode generalization.", |
| "first_build": "Contrastive and masked-prediction objectives over synchronized multimodal windows.", |
| "id": "multimodal_representation_learning", |
| "name": "Multimodal Representation Learning", |
| "why_it_matters": "Xperience-10M can train reusable encoders before committing to expensive large-model fine-tuning or pretraining." |
| }, |
| { |
| "data_signals": [ |
| "action labels", |
| "subtask labels", |
| "language annotations", |
| "hand trajectories", |
| "contact states", |
| "object labels" |
| ], |
| "evaluation": "Step boundary accuracy, transition prediction, next-step prediction, graph consistency, and long-horizon task replay.", |
| "first_build": "Step segmentation, transition graph, precondition/effect labels, and temporal skill graph extraction.", |
| "id": "skill_procedure_graph_mining", |
| "name": "Skill and Procedure Graph Mining", |
| "why_it_matters": "It connects egocentric perception to task structure, planning, and long-horizon embodied reasoning." |
| }, |
| { |
| "data_signals": [ |
| "hand mocap", |
| "body mocap", |
| "contacts", |
| "objects", |
| "egocentric video", |
| "language" |
| ], |
| "evaluation": "Contact F1, object micro-F1, affordance accuracy, future interaction prediction, and per-object error analysis.", |
| "first_build": "Contact, hand-object state, reachable object, likely tool use, and next-affordance prediction tasks.", |
| "id": "human_object_affordance_modeling", |
| "name": "Human-Object Interaction and Affordance Modeling", |
| "why_it_matters": "The dataset can model what actions the scene affords, not only what action label is currently visible." |
| }, |
| { |
| "data_signals": [ |
| "depth", |
| "pose/SLAM", |
| "multiview video", |
| "camera calibration", |
| "objects", |
| "motion traces" |
| ], |
| "evaluation": "Map consistency, object permanence, spatial retrieval, future-state prediction, and novel-view or view-consistency probes.", |
| "first_build": "Persistent scene/object map prototypes built from depth, pose/SLAM, multiview video, and object cues.", |
| "id": "scene_object_memory", |
| "name": "3D/4D Scene and Object Memory", |
| "why_it_matters": "It moves beyond frame-level recognition toward world-state tracking, object permanence, and spatial reasoning." |
| }, |
| { |
| "data_signals": [ |
| "timestamps", |
| "file manifests", |
| "camera streams", |
| "audio streams", |
| "depth streams", |
| "calibration", |
| "annotation coverage" |
| ], |
| "evaluation": "QA pass rate, drift estimates, missing-view tables, corruption reports, and exclusion or degraded-mode manifests.", |
| "first_build": "Per-episode QA for timestamp drift, stream availability, calibration consistency, corrupted files, and missing modalities.", |
| "id": "data_quality_sync_diagnostics", |
| "name": "Data Quality, Synchronization, and Missing-Modality Diagnostics", |
| "why_it_matters": "Large multimodal training fails quietly without strong data-quality gates, so QA should be a first-class artifact." |
| }, |
| { |
| "data_signals": [ |
| "mocap", |
| "hand trajectories", |
| "contacts", |
| "object states", |
| "egocentric video", |
| "language instructions" |
| ], |
| "evaluation": "Retargeting validity, action prediction, contact consistency, imitation rollout quality, and sim-to-real assumption checks.", |
| "first_build": "Action-token conversion, robot-compatible targets, imitation-learning examples, and simulation transfer probes.", |
| "id": "policy_retargeting_simulation_transfer", |
| "name": "Policy, Retargeting, and Simulation Transfer", |
| "why_it_matters": "It creates a bridge from human egocentric experience to robot policies while keeping action-space assumptions explicit." |
| } |
| ], |
| "practical_order": [ |
| "Build the episode taxonomy and data-quality diagnostics first.", |
| "Lock the benchmark protocol and split manifests before reporting model scores.", |
| "Add representation-learning and skill-graph objectives once enough episodes are staged.", |
| "Add affordance, 3D/4D memory, and policy-retargeting branches after labels and action targets are measurable." |
| ], |
| "public_boundary": "These are proposed development tracks. They are not reported as completed held-out benchmark results.", |
| "source_document": "ADDITIONAL_DEVELOPMENT_DIRECTIONS.md", |
| "status": "planned_research_directions", |
| "summary": "Concrete Xperience-10M project directions beyond the current minimal baselines, Qwen3-Omni LoRA plan, Cosmos/world-model track, and long-term native pretraining goal.", |
| "title": "Additional Development Directions" |
| }, |
| "baseline_summary": { |
| "baseline_heads": "minimal and neural MLP heads", |
| "current_use": "task design, data-contract validation, case studies, and baseline comparison", |
| "split": "chronological single-episode split for public-sample diagnostics", |
| "task_count": 20 |
| }, |
| "directions": [ |
| { |
| "code": "A", |
| "counts": { |
| "diagnostic": 0, |
| "direct": 3, |
| "proxy": 3, |
| "total_links": 6 |
| }, |
| "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.", |
| "current_status": "partially implemented", |
| "extension_tasks": [ |
| { |
| "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior.", |
| "family": "classification", |
| "id": "body_motion_intensity", |
| "metric_name": "macro-F1", |
| "name": "Body and Hand Motion Intensity" |
| } |
| ], |
| "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.", |
| "id": "human_motion", |
| "name": "Human Modeling & Motion Understanding", |
| "next_steps": [ |
| "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.", |
| "Train sequence models over multi-episode motion trajectories instead of isolated windows.", |
| "Evaluate affordance prediction on held-out objects and held-out episodes." |
| ], |
| "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.", |
| "task_ids": [ |
| "timeline_action", |
| "hand_trajectory_forecast", |
| "contact_prediction", |
| "object_relevance", |
| "interaction_text_prediction", |
| "imu_to_hand_pose" |
| ], |
| "tasks": [ |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", |
| "current_limit": "Chronological single-episode split creates unseen future action classes.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Action Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_action", |
| "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.05, |
| "name": "macro-F1", |
| "neural_mlp": 0.0148 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current action class", |
| "primary_direction": "C", |
| "process_short": "window features -> action label builder -> classifier", |
| "research_name": "Egocentric Action Recognition", |
| "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." |
| }, |
| { |
| "architecture_family": "continuous regressor", |
| "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", |
| "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Hand Trajectory Forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "hand_trajectory_forecast", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current multimodal window", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "lower", |
| "key": "mpjpe", |
| "minimal": 0.8647, |
| "name": "MPJPE", |
| "neural_mlp": 0.1079 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "pose_slam", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "future hand-joint trajectory", |
| "primary_direction": "A", |
| "process_short": "current features -> future mocap target -> regression head", |
| "research_name": "3D Hand Motion Forecasting", |
| "why": "Directly predicts human hand motion and supports hand-object interaction modeling." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", |
| "current_limit": "The public sample is degenerate for this target because one class dominates.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Contact State Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "contact_prediction", |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", |
| "input_short": "non-contact, non-caption features", |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 1.0, |
| "name": "macro-F1", |
| "neural_mlp": 1.0 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "contact or no contact", |
| "primary_direction": "A", |
| "process_short": "feature filter -> contact target -> binary classifier", |
| "research_name": "Human-Object Contact Prediction", |
| "why": "Targets physical interaction state, a core affordance and manipulation signal." |
| }, |
| { |
| "architecture_family": "multi-label classifier", |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", |
| "current_limit": "Object labels are language-derived and sparse in one episode.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Object Relevance Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "supervised", |
| "id": "object_relevance", |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", |
| "input_short": "non-caption multimodal features", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1803, |
| "name": "micro-F1", |
| "neural_mlp": 0.1679 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "relevant object set", |
| "primary_direction": "C", |
| "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", |
| "research_name": "Object-Centric Interaction Recognition", |
| "why": "Connects egocentric activity to manipulated objects and early object-centric state." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Interaction text prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "interaction_text_prediction", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0444, |
| "name": "macro-F1", |
| "neural_mlp": 0.0381 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Interaction text prediction", |
| "why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.", |
| "direction_roles": { |
| "A": "direct", |
| "B": "proxy" |
| }, |
| "display_name": "IMU-to-hand pose reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "imu_to_hand_pose", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 0.042, |
| "name": "MAE", |
| "neural_mlp": 0.0426 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "A", |
| "process_short": null, |
| "research_name": "IMU-to-hand pose reconstruction", |
| "why": "Measures human-motion reconstruction from wearable and motion cues." |
| } |
| ] |
| }, |
| { |
| "code": "B", |
| "counts": { |
| "diagnostic": 1, |
| "direct": 1, |
| "proxy": 3, |
| "total_links": 5 |
| }, |
| "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.", |
| "current_status": "proxy tasks only", |
| "extension_tasks": [ |
| { |
| "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis.", |
| "family": "retrieval", |
| "id": "multi_view_consistency_retrieval", |
| "metric_name": "MRR", |
| "name": "Multi-View Consistency Retrieval" |
| } |
| ], |
| "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.", |
| "id": "reconstruction_rendering", |
| "name": "3D/4D Reconstruction & Neural Rendering", |
| "next_steps": [ |
| "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.", |
| "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.", |
| "Evaluate novel-view synthesis and temporal consistency across held-out views/time." |
| ], |
| "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.", |
| "task_ids": [ |
| "cross_modal_retrieval", |
| "modality_reconstruction", |
| "misalignment_detection", |
| "imu_to_hand_pose", |
| "camera_view_sync_retrieval" |
| ], |
| "tasks": [ |
| { |
| "architecture_family": "two-tower retrieval head", |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", |
| "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "C": "diagnostic", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "cross_modal_retrieval", |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", |
| "input_short": "motion/IMU/pose query; depth/video candidates", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.2693, |
| "name": "MRR", |
| "neural_mlp": 0.13 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked visual windows", |
| "primary_direction": "C", |
| "process_short": "modality split -> projection -> nearest-neighbor ranker", |
| "research_name": "Multimodal Representation Retrieval", |
| "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." |
| }, |
| { |
| "architecture_family": "feature regressor", |
| "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", |
| "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "modality_reconstruction", |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", |
| "input_short": "motion, IMU, and camera/pose features", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "r2", |
| "minimal": -0.0153, |
| "name": "R2", |
| "neural_mlp": -0.0102 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "reconstructed depth/video vector", |
| "primary_direction": "B", |
| "process_short": "source-target split -> scaler -> regression head", |
| "research_name": "Modality Feature Reconstruction", |
| "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", |
| "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", |
| "direction_roles": { |
| "B": "diagnostic", |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Multimodal Synchronization Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "misalignment_detection", |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", |
| "input_short": "motion-side and visual/depth-side feature groups", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.5052, |
| "name": "F1", |
| "neural_mlp": 0.7153 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "aligned or shifted", |
| "primary_direction": "C", |
| "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", |
| "research_name": "Cross-Modal Misalignment Detection", |
| "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.", |
| "direction_roles": { |
| "A": "direct", |
| "B": "proxy" |
| }, |
| "display_name": "IMU-to-hand pose reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "imu_to_hand_pose", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 0.042, |
| "name": "MAE", |
| "neural_mlp": 0.0426 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "A", |
| "process_short": null, |
| "research_name": "IMU-to-hand pose reconstruction", |
| "why": "Measures human-motion reconstruction from wearable and motion cues." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.", |
| "direction_roles": { |
| "B": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Camera-view synchronization retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "retrieval", |
| "id": "camera_view_sync_retrieval", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.4943, |
| "name": "MRR", |
| "neural_mlp": 0.2409 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "B", |
| "process_short": null, |
| "research_name": "Camera-view synchronization retrieval", |
| "why": "Tests whether synchronized multi-view structure is recoverable across camera streams." |
| } |
| ] |
| }, |
| { |
| "code": "C", |
| "counts": { |
| "diagnostic": 4, |
| "direct": 10, |
| "proxy": 3, |
| "total_links": 17 |
| }, |
| "current_readout": "The unified 20-task suite directly targets egocentric action, task state, interaction, grounding, forecasting, and alignment.", |
| "current_status": "strongest implemented track", |
| "extension_tasks": [ |
| { |
| "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks.", |
| "family": "regression", |
| "id": "action_phase_progress", |
| "metric_name": "MAE", |
| "name": "Action Phase Progress Estimation" |
| } |
| ], |
| "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.", |
| "id": "egocentric_interaction", |
| "name": "Egocentric Vision & Interaction", |
| "next_steps": [ |
| "Move from single-episode chronological splits to held-out-episode splits.", |
| "Use audio together with stronger multimodal backbones for action, intent, and grounding.", |
| "Evaluate long-horizon task success prediction and action-conditioned generation." |
| ], |
| "preferred_background": "Video understanding, action recognition, or egocentric vision.", |
| "task_ids": [ |
| "timeline_action", |
| "timeline_subtask", |
| "transition_detection", |
| "next_action", |
| "hand_trajectory_forecast", |
| "contact_prediction", |
| "object_relevance", |
| "caption_grounding", |
| "cross_modal_retrieval", |
| "temporal_order", |
| "misalignment_detection", |
| "long_horizon_next_action", |
| "next_subtask_forecast", |
| "interaction_text_prediction", |
| "action_object_relation", |
| "object_set_forecast", |
| "time_to_transition" |
| ], |
| "tasks": [ |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "In the coffee-making sample, if the 20-frame window falls within a pouring moment, the task asks the model to output an action label such as 'Pour coffee' or 'Pour milk into coffee'.", |
| "current_limit": "Chronological single-episode split creates unseen future action classes.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Action Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_action", |
| "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.05, |
| "name": "macro-F1", |
| "neural_mlp": 0.0148 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current action class", |
| "primary_direction": "C", |
| "process_short": "window features -> action label builder -> classifier", |
| "research_name": "Egocentric Action Recognition", |
| "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." |
| }, |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", |
| "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Procedure Step Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_subtask", |
| "input": "The same all-modality window vector used by action recognition.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0506, |
| "name": "macro-F1", |
| "neural_mlp": 0.0281 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current procedure step", |
| "primary_direction": "C", |
| "process_short": "window features -> subtask label builder -> classifier", |
| "research_name": "Temporal Subtask Recognition", |
| "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", |
| "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "diagnostic" |
| }, |
| "display_name": "Action Boundary Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "transition_detection", |
| "input": "One all-modality window vector plus labels derived from action-change timestamps.", |
| "input_short": "current window with boundary target", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.6118, |
| "name": "macro-F1", |
| "neural_mlp": 0.5862 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "boundary or steady", |
| "primary_direction": "C", |
| "process_short": "action changes -> boundary labels -> binary classifier", |
| "research_name": "Temporal Action Segmentation", |
| "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." |
| }, |
| { |
| "architecture_family": "future-label classifier", |
| "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", |
| "current_limit": "Unseen future labels dominate the single-episode chronological test.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Next-Action Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "next_action", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current window at time t", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0593, |
| "name": "macro-F1", |
| "neural_mlp": 0.0419 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "action at t+20 frames", |
| "primary_direction": "C", |
| "process_short": "current features -> future label shift -> classifier", |
| "research_name": "Short-Horizon Intention Prediction", |
| "why": "Tests action intention/task-flow prediction from egocentric context." |
| }, |
| { |
| "architecture_family": "continuous regressor", |
| "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", |
| "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Hand Trajectory Forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "hand_trajectory_forecast", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current multimodal window", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "lower", |
| "key": "mpjpe", |
| "minimal": 0.8647, |
| "name": "MPJPE", |
| "neural_mlp": 0.1079 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "pose_slam", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "future hand-joint trajectory", |
| "primary_direction": "A", |
| "process_short": "current features -> future mocap target -> regression head", |
| "research_name": "3D Hand Motion Forecasting", |
| "why": "Directly predicts human hand motion and supports hand-object interaction modeling." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", |
| "current_limit": "The public sample is degenerate for this target because one class dominates.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Contact State Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "contact_prediction", |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", |
| "input_short": "non-contact, non-caption features", |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 1.0, |
| "name": "macro-F1", |
| "neural_mlp": 1.0 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "contact or no contact", |
| "primary_direction": "A", |
| "process_short": "feature filter -> contact target -> binary classifier", |
| "research_name": "Human-Object Contact Prediction", |
| "why": "Targets physical interaction state, a core affordance and manipulation signal." |
| }, |
| { |
| "architecture_family": "multi-label classifier", |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", |
| "current_limit": "Object labels are language-derived and sparse in one episode.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Object Relevance Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "supervised", |
| "id": "object_relevance", |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", |
| "input_short": "non-caption multimodal features", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1803, |
| "name": "micro-F1", |
| "neural_mlp": 0.1679 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "relevant object set", |
| "primary_direction": "C", |
| "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", |
| "research_name": "Object-Centric Interaction Recognition", |
| "why": "Connects egocentric activity to manipulated objects and early object-centric state." |
| }, |
| { |
| "architecture_family": "retrieval ranker", |
| "case_study": "A query like 'Pour milk into coffee' should rank the windows from the actual pouring moment higher than unrelated windows.", |
| "current_limit": "Bag-of-objects language features are too weak for rich grounding.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Language Grounding", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "caption_grounding", |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", |
| "input_short": "text-like query and candidate windows", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.016, |
| "name": "MRR", |
| "neural_mlp": 0.0168 |
| }, |
| "modalities": [ |
| "language", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked matching moments", |
| "primary_direction": "C", |
| "process_short": "query features -> candidate index -> cosine ranker", |
| "research_name": "Language-to-Moment Grounding", |
| "why": "Grounds language annotation into egocentric sensor time and task state." |
| }, |
| { |
| "architecture_family": "two-tower retrieval head", |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", |
| "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "C": "diagnostic", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "cross_modal_retrieval", |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", |
| "input_short": "motion/IMU/pose query; depth/video candidates", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.2693, |
| "name": "MRR", |
| "neural_mlp": 0.13 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked visual windows", |
| "primary_direction": "C", |
| "process_short": "modality split -> projection -> nearest-neighbor ranker", |
| "research_name": "Multimodal Representation Retrieval", |
| "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish the correct order (A then B) from the reversed order (B then A).", |
| "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Temporal Order Verification", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "temporal_order", |
| "input": "A pair of adjacent window vectors, plus their difference vector.", |
| "input_short": "two adjacent windows plus difference vector", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.54, |
| "name": "F1", |
| "neural_mlp": 0.852 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "correct or reversed", |
| "primary_direction": "C", |
| "process_short": "pair builder -> feature combiner -> binary classifier", |
| "research_name": "Temporal Order Verification", |
| "why": "Checks whether features encode local time direction and task progression." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", |
| "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", |
| "direction_roles": { |
| "B": "diagnostic", |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Multimodal Synchronization Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "misalignment_detection", |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", |
| "input_short": "motion-side and visual/depth-side feature groups", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.5052, |
| "name": "F1", |
| "neural_mlp": 0.7153 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "aligned or shifted", |
| "primary_direction": "C", |
| "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", |
| "research_name": "Cross-Modal Misalignment Detection", |
| "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-action forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "long_horizon_next_action", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.075, |
| "name": "macro-F1", |
| "neural_mlp": 0.0655 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-action forecasting", |
| "why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Subtask labels are constrained to the available annotation vocabulary.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-subtask forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "next_subtask_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0455, |
| "name": "macro-F1", |
| "neural_mlp": 0.0507 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-subtask forecasting", |
| "why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Interaction text prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "interaction_text_prediction", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0444, |
| "name": "macro-F1", |
| "neural_mlp": 0.0381 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Interaction text prediction", |
| "why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Relation labels are derived from the public-sample annotation scope.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Action-object relation prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "action_object_relation", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0, |
| "name": "macro-F1", |
| "neural_mlp": 0.0 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Action-object relation prediction", |
| "why": "Tests whether action recognition and object state are connected as a relational interaction representation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "This is a set-level proxy, not a persistent 3D scene graph.", |
| "direction_roles": { |
| "C": "proxy", |
| "D": "direct" |
| }, |
| "display_name": "Future object-set forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "multi-label", |
| "id": "object_set_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1694, |
| "name": "micro-F1", |
| "neural_mlp": 0.1972 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "D", |
| "process_short": null, |
| "research_name": "Future object-set forecasting", |
| "why": "Asks whether the current scene state supports predicting which objects will matter later." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Regression is local to the annotated public sample timeline.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Time-to-next-transition regression", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "time_to_transition", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 10.5374, |
| "name": "MAE frames", |
| "neural_mlp": 10.5545 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Time-to-next-transition regression", |
| "why": "Measures temporal boundary awareness as a continuous timing target." |
| } |
| ] |
| }, |
| { |
| "code": "D", |
| "counts": { |
| "diagnostic": 4, |
| "direct": 1, |
| "proxy": 10, |
| "total_links": 15 |
| }, |
| "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.", |
| "current_status": "early proxy tasks", |
| "extension_tasks": [ |
| { |
| "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model.", |
| "family": "forecast", |
| "id": "ego_motion_forecast", |
| "metric_name": "MAE", |
| "name": "Short-Horizon Ego-Motion Forecasting" |
| } |
| ], |
| "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.", |
| "id": "world_modeling", |
| "name": "Scene Reconstruction & World Modeling", |
| "next_steps": [ |
| "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.", |
| "Add map consistency, object permanence, and spatial relation prediction tasks.", |
| "Train held-out-episode world models that predict future observations and task state." |
| ], |
| "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.", |
| "task_ids": [ |
| "timeline_subtask", |
| "transition_detection", |
| "next_action", |
| "object_relevance", |
| "caption_grounding", |
| "cross_modal_retrieval", |
| "modality_reconstruction", |
| "temporal_order", |
| "misalignment_detection", |
| "long_horizon_next_action", |
| "next_subtask_forecast", |
| "action_object_relation", |
| "object_set_forecast", |
| "camera_view_sync_retrieval", |
| "time_to_transition" |
| ], |
| "tasks": [ |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", |
| "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Procedure Step Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_subtask", |
| "input": "The same all-modality window vector used by action recognition.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0506, |
| "name": "macro-F1", |
| "neural_mlp": 0.0281 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current procedure step", |
| "primary_direction": "C", |
| "process_short": "window features -> subtask label builder -> classifier", |
| "research_name": "Temporal Subtask Recognition", |
| "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", |
| "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "diagnostic" |
| }, |
| "display_name": "Action Boundary Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "transition_detection", |
| "input": "One all-modality window vector plus labels derived from action-change timestamps.", |
| "input_short": "current window with boundary target", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.6118, |
| "name": "macro-F1", |
| "neural_mlp": 0.5862 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "boundary or steady", |
| "primary_direction": "C", |
| "process_short": "action changes -> boundary labels -> binary classifier", |
| "research_name": "Temporal Action Segmentation", |
| "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." |
| }, |
| { |
| "architecture_family": "future-label classifier", |
| "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", |
| "current_limit": "Unseen future labels dominate the single-episode chronological test.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Next-Action Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "next_action", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current window at time t", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0593, |
| "name": "macro-F1", |
| "neural_mlp": 0.0419 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "action at t+20 frames", |
| "primary_direction": "C", |
| "process_short": "current features -> future label shift -> classifier", |
| "research_name": "Short-Horizon Intention Prediction", |
| "why": "Tests action intention/task-flow prediction from egocentric context." |
| }, |
| { |
| "architecture_family": "multi-label classifier", |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", |
| "current_limit": "Object labels are language-derived and sparse in one episode.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Object Relevance Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "supervised", |
| "id": "object_relevance", |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", |
| "input_short": "non-caption multimodal features", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1803, |
| "name": "micro-F1", |
| "neural_mlp": 0.1679 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "relevant object set", |
| "primary_direction": "C", |
| "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", |
| "research_name": "Object-Centric Interaction Recognition", |
| "why": "Connects egocentric activity to manipulated objects and early object-centric state." |
| }, |
| { |
| "architecture_family": "retrieval ranker", |
| "case_study": "A query like \"Pour milk into coffee\" should rank the windows from the actual pouring moment higher than unrelated windows.", |
| "current_limit": "Bag-of-objects language features are too weak for rich grounding.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Language Grounding", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "caption_grounding", |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", |
| "input_short": "text-like query and candidate windows", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.016, |
| "name": "MRR", |
| "neural_mlp": 0.0168 |
| }, |
| "modalities": [ |
| "language", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked matching moments", |
| "primary_direction": "C", |
| "process_short": "query features -> candidate index -> cosine ranker", |
| "research_name": "Language-to-Moment Grounding", |
| "why": "Grounds language annotation into egocentric sensor time and task state." |
| }, |
| { |
| "architecture_family": "two-tower retrieval head", |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", |
| "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "C": "diagnostic", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "cross_modal_retrieval", |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", |
| "input_short": "motion/IMU/pose query; depth/video candidates", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.2693, |
| "name": "MRR", |
| "neural_mlp": 0.13 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked visual windows", |
| "primary_direction": "C", |
| "process_short": "modality split -> projection -> nearest-neighbor ranker", |
| "research_name": "Multimodal Representation Retrieval", |
| "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." |
| }, |
| { |
| "architecture_family": "feature regressor", |
| "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", |
| "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "modality_reconstruction", |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", |
| "input_short": "motion, IMU, and camera/pose features", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "r2", |
| "minimal": -0.0153, |
| "name": "R2", |
| "neural_mlp": -0.0102 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "reconstructed depth/video vector", |
| "primary_direction": "B", |
| "process_short": "source-target split -> scaler -> regression head", |
| "research_name": "Modality Feature Reconstruction", |
| "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", |
| "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Temporal Order Verification", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "temporal_order", |
| "input": "A pair of adjacent window vectors, plus their difference vector.", |
| "input_short": "two adjacent windows plus difference vector", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.54, |
| "name": "F1", |
| "neural_mlp": 0.852 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "correct or reversed", |
| "primary_direction": "C", |
| "process_short": "pair builder -> feature combiner -> binary classifier", |
| "research_name": "Temporal Order Verification", |
| "why": "Checks whether features encode local time direction and task progression." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", |
| "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", |
| "direction_roles": { |
| "B": "diagnostic", |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Multimodal Synchronization Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "misalignment_detection", |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", |
| "input_short": "motion-side and visual/depth-side feature groups", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.5052, |
| "name": "F1", |
| "neural_mlp": 0.7153 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "aligned or shifted", |
| "primary_direction": "C", |
| "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", |
| "research_name": "Cross-Modal Misalignment Detection", |
| "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-action forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "long_horizon_next_action", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.075, |
| "name": "macro-F1", |
| "neural_mlp": 0.0655 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-action forecasting", |
| "why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Subtask labels are constrained to the available annotation vocabulary.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-subtask forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "next_subtask_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0455, |
| "name": "macro-F1", |
| "neural_mlp": 0.0507 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-subtask forecasting", |
| "why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Relation labels are derived from the public-sample annotation scope.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Action-object relation prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "action_object_relation", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0, |
| "name": "macro-F1", |
| "neural_mlp": 0.0 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Action-object relation prediction", |
| "why": "Tests whether action recognition and object state are connected as a relational interaction representation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "This is a set-level proxy, not a persistent 3D scene graph.", |
| "direction_roles": { |
| "C": "proxy", |
| "D": "direct" |
| }, |
| "display_name": "Future object-set forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "multi-label", |
| "id": "object_set_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1694, |
| "name": "micro-F1", |
| "neural_mlp": 0.1972 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "D", |
| "process_short": null, |
| "research_name": "Future object-set forecasting", |
| "why": "Asks whether the current scene state supports predicting which objects will matter later." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.", |
| "direction_roles": { |
| "B": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Camera-view synchronization retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "retrieval", |
| "id": "camera_view_sync_retrieval", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.4943, |
| "name": "MRR", |
| "neural_mlp": 0.2409 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "B", |
| "process_short": null, |
| "research_name": "Camera-view synchronization retrieval", |
| "why": "Tests whether synchronized multi-view structure is recoverable across camera streams." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Regression is local to the annotated public sample timeline.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Time-to-next-transition regression", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "time_to_transition", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 10.5374, |
| "name": "MAE frames", |
| "neural_mlp": 10.5545 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Time-to-next-transition regression", |
| "why": "Measures temporal boundary awareness as a continuous timing target." |
| } |
| ] |
| } |
| ], |
| "foundation_model_plan": { |
| "decision": { |
| "external_reasoning_reference": "Gemini Robotics", |
| "first_policy_branch_candidates": [ |
| "OpenVLA / OpenVLA-OFT", |
| "openpi pi0/pi0.5", |
| "NVIDIA GR00T" |
| ], |
| "first_world_model_branch": "Cosmos 3", |
| "immediate_trainable_backbone": "Qwen3-Omni", |
| "long_term_native_pretraining_goal": "Xperience Embodied Foundation Model" |
| }, |
| "evaluation_additions": [ |
| { |
| "metrics": [ |
| "JSON validity", |
| "macro-F1", |
| "accuracy", |
| "micro-F1" |
| ], |
| "model_families": [ |
| "Qwen3-Omni", |
| "Gemini Robotics reference" |
| ], |
| "target": "structured_task_prediction" |
| }, |
| { |
| "metrics": [ |
| "retrieval rank", |
| "temporal consistency", |
| "feature reconstruction", |
| "qualitative visual inspection" |
| ], |
| "model_families": [ |
| "Cosmos 3" |
| ], |
| "target": "future_state_prediction" |
| }, |
| { |
| "metrics": [ |
| "transition accuracy", |
| "contact accuracy", |
| "next-action accuracy" |
| ], |
| "model_families": [ |
| "Cosmos 3", |
| "OpenVLA", |
| "openpi", |
| "GR00T" |
| ], |
| "target": "action_conditioned_dynamics" |
| }, |
| { |
| "metrics": [ |
| "held-out episode metrics", |
| "held-out session metrics", |
| "leakage checks" |
| ], |
| "model_families": [ |
| "all trainable branches" |
| ], |
| "target": "cross_episode_generalization" |
| } |
| ], |
| "execution_order": [ |
| { |
| "action": "Stage at least 32 valid Xperience-10M episodes with held-out episode split.", |
| "name": "Data gate", |
| "step": 1 |
| }, |
| { |
| "action": "Run Qwen3-Omni action/subtask error analysis and targeted reruns to improve the verified diagnostic baseline.", |
| "name": "First held-out baseline", |
| "step": 2 |
| }, |
| { |
| "action": "Run 3-8 episode dry runs for any next backbone before scaling beyond the selected split.", |
| "name": "Model-selection dry run", |
| "step": 3 |
| }, |
| { |
| "action": "Promote Cosmos 3 beyond the current Nano compatibility and Super forward-dynamics runs only when loss metrics, preprocessing, and storage justify the added compute.", |
| "name": "World-model track", |
| "step": 4 |
| }, |
| { |
| "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable.", |
| "name": "Policy branch", |
| "step": 5 |
| }, |
| { |
| "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples.", |
| "name": "Publishing threshold", |
| "step": 6 |
| }, |
| { |
| "action": "Start a from-scratch Xperience Embodied Foundation Model only after smaller scaling stages, full-corpus storage, multi-node compute, and held-out evaluation protocols are in place.", |
| "name": "Xperience-native pretraining", |
| "step": 7 |
| } |
| ], |
| "model_families": [ |
| { |
| "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.", |
| "category": "omni_instruction_model", |
| "current_decision": "keep_as_first_pilot", |
| "entry_condition": "Selected episodes prepared with held-out episode split.", |
| "family": "Qwen3-Omni", |
| "openness": "open_weights_available_from_official_hf_repo", |
| "priority": 1, |
| "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct", |
| "xperience10m_fit": [ |
| "RGB/fisheye video, embedded audio, and language prompts can enter directly.", |
| "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.", |
| "Matches current task outputs: labels, structured JSON, captions, and short decisions." |
| ] |
| }, |
| { |
| "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.", |
| "category": "world_foundation_model", |
| "current_decision": "implemented_as_nano_future_window_and_super_forward_dynamics_branches", |
| "entry_condition": "Use separate metrics for Nano future-window retrieval and Super forward-dynamics MSE; do not compare them directly to Qwen JSON-task accuracy.", |
| "family": "Cosmos 3", |
| "openness": "track_official_nvidia_release_and_available_weights", |
| "priority": 2, |
| "public_source": "https://www.nvidia.com/en-us/ai/cosmos/", |
| "xperience10m_fit": [ |
| "Uses video streams as visual state.", |
| "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.", |
| "Better aligned with prediction/generation objectives than simple label classification." |
| ] |
| }, |
| { |
| "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.", |
| "category": "humanoid_policy_foundation_model", |
| "current_decision": "track_as_humanoid_policy_branch", |
| "entry_condition": "Retargeting artifact and action-space definition exist.", |
| "family": "NVIDIA GR00T", |
| "openness": "track_official_nvidia_release_and_tooling", |
| "priority": 3, |
| "public_source": "https://developer.nvidia.com/isaac/gr00t", |
| "xperience10m_fit": [ |
| "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.", |
| "Egocentric video plus human motion can support affordance and interaction tasks." |
| ] |
| }, |
| { |
| "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.", |
| "category": "vision_language_action_policy", |
| "current_decision": "candidate_after_action_space_design", |
| "entry_condition": "Window-to-action-token conversion is implemented and checked.", |
| "family": "OpenVLA / OpenVLA-OFT", |
| "openness": "open_project_and_weights", |
| "priority": 4, |
| "public_source": "https://openvla.github.io/", |
| "xperience10m_fit": [ |
| "Good candidate when each window is expressed as visual observation, instruction/context, and action token.", |
| "Requires an explicit action target; current human egocentric labels are not robot controls by default." |
| ] |
| }, |
| { |
| "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.", |
| "category": "robot_policy_model", |
| "current_decision": "candidate_policy_branch", |
| "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.", |
| "family": "openpi pi0/pi0.5", |
| "openness": "open_source_policy_training_stack", |
| "priority": 5, |
| "public_source": "https://github.com/Physical-Intelligence/openpi", |
| "xperience10m_fit": [ |
| "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.", |
| "Better for policy branch than for current structured task JSON outputs." |
| ] |
| }, |
| { |
| "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.", |
| "category": "closed_embodied_reasoning_reference", |
| "current_decision": "external_reference_only", |
| "entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.", |
| "family": "Gemini Robotics", |
| "openness": "closed_or_limited_access", |
| "priority": 6, |
| "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/", |
| "xperience10m_fit": [ |
| "Can help reason over egocentric scenes and task descriptions.", |
| "Not a local fine-tune target for this repo." |
| ] |
| }, |
| { |
| "best_role": "Cheaper policy baselines for observation-to-action experiments.", |
| "category": "lightweight_robot_policy_baselines", |
| "current_decision": "optional_baseline_after_data_staging", |
| "entry_condition": "Action labels and baseline protocol exist.", |
| "family": "Octo / SmolVLA-style lightweight policies", |
| "openness": "open_projects", |
| "priority": 7, |
| "public_source": "https://github.com/huggingface/lerobot", |
| "xperience10m_fit": [ |
| "Useful after action target design.", |
| "Less directly omni-modal than Qwen3-Omni or Cosmos 3." |
| ] |
| }, |
| { |
| "best_role": "Domain model over synchronized embodied experience.", |
| "category": "xperience_native_pretraining_goal", |
| "current_decision": "future_goal_after_scaling_evidence", |
| "entry_condition": "Full-corpus data path, PB-scale storage, multi-node compute, and positive smaller-run scaling evidence.", |
| "family": "Xperience Embodied Foundation Model", |
| "openness": "future project-specific model if full-corpus access and compute exist", |
| "priority": 8, |
| "public_source": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md", |
| "xperience10m_fit": [ |
| "Uses the full aligned modality stack rather than treating sensors as auxiliary metadata.", |
| "Targets temporal embodied representation learning across perception, motion, geometry, audio, and language.", |
| "Can become the shared pretraining backbone for Qwen-style instruction tasks, Cosmos-style world modeling, and policy/action branches." |
| ] |
| } |
| ], |
| "source_links": [ |
| { |
| "label": "Qwen3-Omni official HF model", |
| "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" |
| }, |
| { |
| "label": "NVIDIA Cosmos", |
| "url": "https://www.nvidia.com/en-us/ai/cosmos/" |
| }, |
| { |
| "label": "NVIDIA Isaac GR00T", |
| "url": "https://developer.nvidia.com/isaac/gr00t" |
| }, |
| { |
| "label": "OpenVLA", |
| "url": "https://openvla.github.io/" |
| }, |
| { |
| "label": "openpi", |
| "url": "https://github.com/Physical-Intelligence/openpi" |
| }, |
| { |
| "label": "Gemini Robotics", |
| "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" |
| }, |
| { |
| "label": "Octo", |
| "url": "https://octo-models.github.io/" |
| }, |
| { |
| "label": "LeRobot / SmolVLA", |
| "url": "https://github.com/huggingface/lerobot" |
| }, |
| { |
| "label": "Xperience Embodied Foundation Model pretraining plan", |
| "url": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md" |
| } |
| ], |
| "status": "planning_artifact" |
| }, |
| "generated_at_utc": "2026-06-21T19:51:24+00:00", |
| "omni_plan": { |
| "adapter": "LoRA rank 16, alpha 32, dropout 0.05", |
| "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", |
| "evaluation": [ |
| "JSON validity", |
| "action macro-F1", |
| "subtask accuracy", |
| "transition accuracy", |
| "next-action accuracy", |
| "contact accuracy", |
| "object micro-F1", |
| "held-out episode count" |
| ], |
| "first_pilot": "32 held-out-episode pilot after valid episodes are prepared", |
| "training_unit": "episode-level split, window-level supervised examples" |
| }, |
| "phases": [ |
| { |
| "completion_evidence": [ |
| "PROJECT_STATUS.md", |
| "EVALUATION_PROTOCOL.md", |
| "RESEARCH_TAKEAWAYS.md", |
| "docs/data/summary_metrics.json", |
| "results/episode_task_suite/summary_report.json" |
| ], |
| "deliverables": [ |
| "1161 aligned windows", |
| "12 task contracts", |
| "minimal baseline heads", |
| "neural MLP heads", |
| "modality atlas", |
| "task walkthroughs", |
| "derived figures" |
| ], |
| "entry_condition": "One public Xperience-10M sample episode is available.", |
| "id": "public_sample_task_lab", |
| "name": "Public-Sample Task Lab", |
| "reader_takeaway": "The public sample supports task design, feature contracts, walkthroughs, and baseline comparisons.", |
| "stage": "now", |
| "status": "implemented" |
| }, |
| { |
| "completion_evidence": [ |
| "results/omni_finetune/DATA_ACCESS_STATUS.md", |
| "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md", |
| "results/omni_finetune/source_discovery.json" |
| ], |
| "deliverables": [ |
| "128 selected episodes", |
| "episode manifest", |
| "missing-view manifest", |
| "held-out episode split", |
| "source-discovery report" |
| ], |
| "entry_condition": "Gated dataset availability and enough storage for selected episodes.", |
| "id": "multi_episode_data_staging", |
| "name": "Multi-Episode Data Preparation", |
| "reader_takeaway": "The first selected split is available for Qwen3-Omni diagnostics, with train/test separation at the episode level.", |
| "stage": "future", |
| "status": "implemented_for_first_pilot" |
| }, |
| { |
| "completion_evidence": [ |
| "docs/data/omni_finetune_verified_result.json", |
| "docs/data/qwen3_v5_v6_comparison.json", |
| "results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md", |
| "results/omni_finetune/verified_public/", |
| "dataset_manifest.json", |
| "training_metadata.json", |
| "progress.jsonl", |
| "metrics.json", |
| "predictions.jsonl", |
| "RUN_REPORT.md" |
| ], |
| "deliverables": [ |
| "dataset JSONL/media manifests", |
| "LoRA adapter checkpoint", |
| "progress logs", |
| "validation monitoring", |
| "held-out predictions", |
| "metrics", |
| "confusion matrices", |
| "run report", |
| "v5/v6 comparison", |
| "public LoRA adapter repo" |
| ], |
| "entry_condition": "Selected episodes are prepared locally with no train/test episode leakage.", |
| "id": "qwen3_omni_lora_diagnostic_pilot", |
| "name": "Qwen3-Omni LoRA Latest Diagnostic Branch", |
| "reader_takeaway": "The final omni-model diagnostic result establishes the full held-out training/validation/evaluation loop and meets the strict-JSON target, but weak action/subtask metrics make it a diagnostic baseline.", |
| "stage": "future", |
| "status": "verified_latest_branch" |
| }, |
| { |
| "completion_evidence": [ |
| "results/omni_finetune/multi_episode_128_task_baselines/BASELINE_ALIGNMENT_REPORT.md", |
| "results/omni_finetune/multi_episode_128_task_baselines/summary_report.json", |
| "scripts/omni/run_128_task_baselines.py" |
| ], |
| "deliverables": [ |
| "same 12 task ids", |
| "simple metadata/text baselines", |
| "neural MLP baselines for JSON-supported labels", |
| "explicit unsupported markers for raw-feature-only tasks" |
| ], |
| "entry_condition": "Derived Qwen JSONL export for the selected 96/16/16 split.", |
| "id": "multi_episode_128_same_split_baselines", |
| "name": "128-Episode Same-Split Simple/NN Baselines", |
| "reader_takeaway": "The simple and neural baseline framing is now aligned to the selected 128-episode setup; trajectory, retrieval, reconstruction, and misalignment variants still need raw 128-episode feature blocks for exact feature-level reproduction.", |
| "stage": "future", |
| "status": "verified_companion_result" |
| }, |
| { |
| "completion_evidence": [ |
| "TASK_SUITE_ENHANCEMENT_128.md", |
| "docs/data/task_suite_enhancement_128.json", |
| "results/omni_finetune/task_suite_enhancement_128_v1_20260608/enhancement_plan.json", |
| "scripts/omni/build_task_suite_enhancement_128.py" |
| ], |
| "deliverables": [ |
| "dense-window and multiscale export estimates", |
| "hierarchical action/subtask target contract", |
| "raw-feature shard priorities for unsupported tasks", |
| "Qwen v5 and Cosmos continuation run cards", |
| "publication-ready enhancement artifacts" |
| ], |
| "entry_condition": "Same selected 96/16/16 split and current public 3,808-window export.", |
| "id": "task_suite_enhancement_128", |
| "name": "128-Episode Task Suite Enhancement Pack", |
| "reader_takeaway": "The current 128-episode setup still has headroom: use multiscale_20s10_40s20_80s40, hierarchical labels, label-normalized scoring, and raw-feature shards before adding more episodes.", |
| "stage": "future", |
| "status": "current" |
| }, |
| { |
| "completion_evidence": [ |
| "error-analysis tables", |
| "held-out metrics by failure type", |
| "verified public-safe package" |
| ], |
| "deliverables": [ |
| "same 96/16/16 episode split", |
| "action/subtask confusion analysis", |
| "unseen-label analysis", |
| "object/action family breakdowns", |
| "held-out test evaluation", |
| "comparison to the final verified Qwen baseline" |
| ], |
| "entry_condition": "The final diagnostic package meets strict JSON validity but has weak action/subtask held-out quality.", |
| "id": "qwen3_omni_structured_output_error_analysis", |
| "name": "Action/Subtask Error-Analysis Pass", |
| "reader_takeaway": "The next pass should improve action/subtask quality before larger model-quality claims.", |
| "stage": "future", |
| "status": "active_next_step" |
| }, |
| { |
| "completion_evidence": [ |
| "FOUNDATION_MODEL_PLAN.md", |
| "docs/data/foundation_model_plan.json", |
| "research_roadmap_interactive.json" |
| ], |
| "deliverables": [ |
| "backbone registry", |
| "Cosmos 3 world-model track plan", |
| "Cosmos3-Super Forward-Dynamics LoRA verified package", |
| "Qwen3-Omni LoRA baseline plan", |
| "OpenVLA/openpi/GR00T policy-branch candidates", |
| "model-specific evaluation additions" |
| ], |
| "entry_condition": "The selected episodes are prepared or a 3-8 episode dry run is available for preprocessing checks.", |
| "id": "foundation_model_selection_matrix", |
| "name": "Foundation-Model Selection Matrix", |
| "reader_takeaway": "Qwen3-Omni remains the structured JSON held-out pilot; Cosmos 3 is the first world-model track. Cosmos3-Super now has a verified forward-dynamics LoRA over camera-pose proxy targets, while VLA/policy models wait for robot-compatible action targets.", |
| "stage": "future", |
| "status": "current" |
| }, |
| { |
| "completion_evidence": [ |
| "held-out metrics by session", |
| "held-out metrics by task", |
| "held-out metrics by modality", |
| "ablation tables", |
| "qualitative error analysis" |
| ], |
| "deliverables": [ |
| "split-by-session metrics", |
| "modality ablations", |
| "calibration/object/language error analysis", |
| "missing-view sensitivity analysis" |
| ], |
| "entry_condition": "The selected-episode pilot trains and evaluates cleanly.", |
| "id": "robustness_run_64_128_episode", |
| "name": "64-128 Episode Robustness Run", |
| "reader_takeaway": "The robustness run tests whether the pilot conclusions survive broader sessions and missing modalities.", |
| "stage": "future", |
| "status": "partially_implemented" |
| }, |
| { |
| "completion_evidence": [ |
| "task-specific held-out evaluations", |
| "verified Cosmos3-Super forward-dynamics LoRA package", |
| "qualitative inspection", |
| "updated model cards" |
| ], |
| "deliverables": [ |
| "Cosmos 3 future-window and action-conditioned world-model probes", |
| "OpenVLA/openpi/GR00T action-policy baseline", |
| "audio/video/depth/pose/mocap conditioning checks", |
| "affordance and object-interaction tasks", |
| "synthetic-data usefulness test" |
| ], |
| "entry_condition": "Enough multi-episode data, compute budget, and model-specific action/world-state targets.", |
| "id": "foundation_world_model_extensions", |
| "name": "Cosmos 3 and Policy-Model Extensions", |
| "reader_takeaway": "The Cosmos 3 track now includes Nano future-window compatibility and Super forward-dynamics LoRA; the long-term direction remains richer multimodal representation learning with model tracks chosen by task fit rather than by a single default backbone.", |
| "stage": "future", |
| "status": "planned" |
| }, |
| { |
| "completion_evidence": [ |
| "pretraining metadata", |
| "checkpoint inventory", |
| "scaling curves", |
| "held-out evaluation reports", |
| "qualitative retrieval or future-state examples", |
| "safety and data-boundary report" |
| ], |
| "deliverables": [ |
| "full-corpus episode and split manifests", |
| "pretraining shard and provenance manifests", |
| "0.3B-1B and 1B-3B scaling pilots", |
| "3B-7B Xperience-native domain model target", |
| "held-out episode/session/activity/object evaluations", |
| "missing-modality robustness report", |
| "model card and data-boundary report" |
| ], |
| "entry_condition": "Full-corpus access, PB-scale storage path, high-throughput data loading, multi-node compute, and positive scaling evidence from smaller multi-episode runs.", |
| "id": "xperience_embodied_foundation_pretraining", |
| "name": "Xperience Embodied Foundation Model Pretraining", |
| "reader_takeaway": "The final research direction is a domain-specific embodied foundation model trained directly on Xperience-10M, after smaller pilots justify the cost and infrastructure.", |
| "stage": "future", |
| "status": "future" |
| } |
| ], |
| "scale_up": { |
| "access_status": "The gated Xperience-10M dataset is available for selected multi-episode pilot preparation.", |
| "candidate_scan_top_level_sessions": 802, |
| "estimated_bytes": 298188841943, |
| "exclude": [ |
| "visualization.rrd" |
| ], |
| "selection_strategy": "stratified_round_robin_by_top_level_session", |
| "status": "verified_full_128_episode_diagnostic_result", |
| "target_episodes": 128, |
| "valid_candidates": 12102 |
| }, |
| "scope": { |
| "feature_blocks": 18, |
| "feature_dim": 8546, |
| "num_frames": 5821, |
| "num_windows": 1161, |
| "sample_episode_count": 1, |
| "stride_frames": 5, |
| "warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes.", |
| "window_frames": 20 |
| }, |
| "source_files": [ |
| "docs/data/research_directions.json", |
| "docs/data/task_walkthroughs.json", |
| "docs/data/research_roadmap.json", |
| "docs/data/foundation_model_plan.json", |
| "docs/data/three_foundation_pipelines.json", |
| "docs/data/additional_development_directions.json", |
| "docs/data/summary_metrics.json", |
| "docs/data/research_direction_extensions.json", |
| "results/episode_task_suite/summary_report.json", |
| "results/episode_task_suite/feature_manifest.json" |
| ], |
| "tasks": [ |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.", |
| "current_limit": "Chronological single-episode split creates unseen future action classes.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Action Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_action", |
| "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.05, |
| "name": "macro-F1", |
| "neural_mlp": 0.0148 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current action class", |
| "primary_direction": "C", |
| "process_short": "window features -> action label builder -> classifier", |
| "research_name": "Egocentric Action Recognition", |
| "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout." |
| }, |
| { |
| "architecture_family": "multiclass classifier", |
| "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", |
| "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Procedure Step Recognition", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "timeline_subtask", |
| "input": "The same all-modality window vector used by action recognition.", |
| "input_short": "20-frame multimodal window", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0506, |
| "name": "macro-F1", |
| "neural_mlp": 0.0281 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "current procedure step", |
| "primary_direction": "C", |
| "process_short": "window features -> subtask label builder -> classifier", |
| "research_name": "Temporal Subtask Recognition", |
| "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", |
| "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "diagnostic" |
| }, |
| "display_name": "Action Boundary Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "transition_detection", |
| "input": "One all-modality window vector plus labels derived from action-change timestamps.", |
| "input_short": "current window with boundary target", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.6118, |
| "name": "macro-F1", |
| "neural_mlp": 0.5862 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial", |
| "language" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "boundary or steady", |
| "primary_direction": "C", |
| "process_short": "action changes -> boundary labels -> binary classifier", |
| "research_name": "Temporal Action Segmentation", |
| "why": "Localizes egocentric task boundaries and diagnoses temporal state changes." |
| }, |
| { |
| "architecture_family": "future-label classifier", |
| "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", |
| "current_limit": "Unseen future labels dominate the single-episode chronological test.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Next-Action Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "next_action", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current window at time t", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0593, |
| "name": "macro-F1", |
| "neural_mlp": 0.0419 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "action at t+20 frames", |
| "primary_direction": "C", |
| "process_short": "current features -> future label shift -> classifier", |
| "research_name": "Short-Horizon Intention Prediction", |
| "why": "Tests action intention/task-flow prediction from egocentric context." |
| }, |
| { |
| "architecture_family": "continuous regressor", |
| "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", |
| "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Hand Trajectory Forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "hand_trajectory_forecast", |
| "input": "The current all-modality window vector at time t.", |
| "input_short": "current multimodal window", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "lower", |
| "key": "mpjpe", |
| "minimal": 0.8647, |
| "name": "MPJPE", |
| "neural_mlp": 0.1079 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "pose_slam", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "future hand-joint trajectory", |
| "primary_direction": "A", |
| "process_short": "current features -> future mocap target -> regression head", |
| "research_name": "3D Hand Motion Forecasting", |
| "why": "Directly predicts human hand motion and supports hand-object interaction modeling." |
| }, |
| { |
| "architecture_family": "binary classifier", |
| "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", |
| "current_limit": "The public sample is degenerate for this target because one class dominates.", |
| "direction_roles": { |
| "A": "direct", |
| "C": "proxy" |
| }, |
| "display_name": "Contact State Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv", |
| "label": "Neural predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv", |
| "label": "Confusion matrix" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv", |
| "label": "Neural confusion matrix" |
| } |
| ], |
| "family": "supervised", |
| "id": "contact_prediction", |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", |
| "input_short": "non-contact, non-caption features", |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 1.0, |
| "name": "macro-F1", |
| "neural_mlp": 1.0 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "video", |
| "depth", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "contact or no contact", |
| "primary_direction": "A", |
| "process_short": "feature filter -> contact target -> binary classifier", |
| "research_name": "Human-Object Contact Prediction", |
| "why": "Targets physical interaction state, a core affordance and manipulation signal." |
| }, |
| { |
| "architecture_family": "multi-label classifier", |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", |
| "current_limit": "Object labels are language-derived and sparse in one episode.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Object Relevance Prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "supervised", |
| "id": "object_relevance", |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", |
| "input_short": "non-caption multimodal features", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1803, |
| "name": "micro-F1", |
| "neural_mlp": 0.1679 |
| }, |
| "modalities": [ |
| "video", |
| "depth", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "relevant object set", |
| "primary_direction": "C", |
| "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads", |
| "research_name": "Object-Centric Interaction Recognition", |
| "why": "Connects egocentric activity to manipulated objects and early object-centric state." |
| }, |
| { |
| "architecture_family": "retrieval ranker", |
| "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.", |
| "current_limit": "Bag-of-objects language features are too weak for rich grounding.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Language Grounding", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "caption_grounding", |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", |
| "input_short": "text-like query and candidate windows", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.016, |
| "name": "MRR", |
| "neural_mlp": 0.0168 |
| }, |
| "modalities": [ |
| "language", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked matching moments", |
| "primary_direction": "C", |
| "process_short": "query features -> candidate index -> cosine ranker", |
| "research_name": "Language-to-Moment Grounding", |
| "why": "Grounds language annotation into egocentric sensor time and task state." |
| }, |
| { |
| "architecture_family": "two-tower retrieval head", |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", |
| "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "C": "diagnostic", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "retrieval", |
| "id": "cross_modal_retrieval", |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", |
| "input_short": "motion/IMU/pose query; depth/video candidates", |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.2693, |
| "name": "MRR", |
| "neural_mlp": 0.13 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "ranked visual windows", |
| "primary_direction": "C", |
| "process_short": "modality split -> projection -> nearest-neighbor ranker", |
| "research_name": "Multimodal Representation Retrieval", |
| "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling." |
| }, |
| { |
| "architecture_family": "feature regressor", |
| "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", |
| "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", |
| "direction_roles": { |
| "B": "proxy", |
| "D": "proxy" |
| }, |
| "display_name": "Cross-Modal Reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", |
| "label": "Neural metrics" |
| } |
| ], |
| "family": "forecast", |
| "id": "modality_reconstruction", |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", |
| "input_short": "motion, IMU, and camera/pose features", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "r2", |
| "minimal": -0.0153, |
| "name": "R2", |
| "neural_mlp": -0.0102 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "pose_slam", |
| "depth", |
| "video" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "reconstructed depth/video vector", |
| "primary_direction": "B", |
| "process_short": "source-target split -> scaler -> regression head", |
| "research_name": "Modality Feature Reconstruction", |
| "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", |
| "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Temporal Order Verification", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "temporal_order", |
| "input": "A pair of adjacent window vectors, plus their difference vector.", |
| "input_short": "two adjacent windows plus difference vector", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.54, |
| "name": "F1", |
| "neural_mlp": 0.852 |
| }, |
| "modalities": [ |
| "video", |
| "pose_slam", |
| "motion_capture", |
| "inertial" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "correct or reversed", |
| "primary_direction": "C", |
| "process_short": "pair builder -> feature combiner -> binary classifier", |
| "research_name": "Temporal Order Verification", |
| "why": "Checks whether features encode local time direction and task progression." |
| }, |
| { |
| "architecture_family": "pairwise classifier", |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", |
| "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", |
| "direction_roles": { |
| "B": "diagnostic", |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Multimodal Synchronization Detection", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json", |
| "label": "Minimal metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", |
| "label": "Neural metrics" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv", |
| "label": "Minimal predictions" |
| }, |
| { |
| "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv", |
| "label": "Neural predictions" |
| } |
| ], |
| "family": "diagnostic", |
| "id": "misalignment_detection", |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", |
| "input_short": "motion-side and visual/depth-side feature groups", |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "f1", |
| "minimal": 0.5052, |
| "name": "F1", |
| "neural_mlp": 0.7153 |
| }, |
| "modalities": [ |
| "motion_capture", |
| "inertial", |
| "video", |
| "depth", |
| "pose_slam" |
| ], |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files", |
| "output_short": "aligned or shifted", |
| "primary_direction": "C", |
| "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier", |
| "research_name": "Cross-Modal Misalignment Detection", |
| "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-action forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "long_horizon_next_action", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.075, |
| "name": "macro-F1", |
| "neural_mlp": 0.0655 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-action forecasting", |
| "why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Subtask labels are constrained to the available annotation vocabulary.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Long-horizon next-subtask forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "next_subtask_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0455, |
| "name": "macro-F1", |
| "neural_mlp": 0.0507 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Long-horizon next-subtask forecasting", |
| "why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.", |
| "direction_roles": { |
| "A": "proxy", |
| "C": "direct" |
| }, |
| "display_name": "Interaction text prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "interaction_text_prediction", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0444, |
| "name": "macro-F1", |
| "neural_mlp": 0.0381 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Interaction text prediction", |
| "why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Relation labels are derived from the public-sample annotation scope.", |
| "direction_roles": { |
| "C": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Action-object relation prediction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "classification", |
| "id": "action_object_relation", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "tie", |
| "direction": "higher", |
| "key": "macro_f1", |
| "minimal": 0.0, |
| "name": "macro-F1", |
| "neural_mlp": 0.0 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Action-object relation prediction", |
| "why": "Tests whether action recognition and object state are connected as a relational interaction representation." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "This is a set-level proxy, not a persistent 3D scene graph.", |
| "direction_roles": { |
| "C": "proxy", |
| "D": "direct" |
| }, |
| "display_name": "Future object-set forecasting", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "multi-label", |
| "id": "object_set_forecast", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "neural_mlp", |
| "direction": "higher", |
| "key": "micro_f1", |
| "minimal": 0.1694, |
| "name": "micro-F1", |
| "neural_mlp": 0.1972 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "D", |
| "process_short": null, |
| "research_name": "Future object-set forecasting", |
| "why": "Asks whether the current scene state supports predicting which objects will matter later." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.", |
| "direction_roles": { |
| "A": "direct", |
| "B": "proxy" |
| }, |
| "display_name": "IMU-to-hand pose reconstruction", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "imu_to_hand_pose", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 0.042, |
| "name": "MAE", |
| "neural_mlp": 0.0426 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "A", |
| "process_short": null, |
| "research_name": "IMU-to-hand pose reconstruction", |
| "why": "Measures human-motion reconstruction from wearable and motion cues." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.", |
| "direction_roles": { |
| "B": "direct", |
| "D": "proxy" |
| }, |
| "display_name": "Camera-view synchronization retrieval", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "retrieval", |
| "id": "camera_view_sync_retrieval", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "higher", |
| "key": "mrr", |
| "minimal": 0.4943, |
| "name": "MRR", |
| "neural_mlp": 0.2409 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "B", |
| "process_short": null, |
| "research_name": "Camera-view synchronization retrieval", |
| "why": "Tests whether synchronized multi-view structure is recoverable across camera streams." |
| }, |
| { |
| "architecture_family": null, |
| "case_study": null, |
| "current_limit": "Regression is local to the annotated public sample timeline.", |
| "direction_roles": { |
| "C": "diagnostic", |
| "D": "diagnostic" |
| }, |
| "display_name": "Time-to-next-transition regression", |
| "evidence_links": [ |
| { |
| "href": "data/task_walkthroughs.json", |
| "label": "Task walkthrough" |
| }, |
| { |
| "href": "single_episode_explorer.html", |
| "label": "Single-episode explorer" |
| } |
| ], |
| "family": "regression", |
| "id": "time_to_transition", |
| "input": null, |
| "input_short": null, |
| "metric": { |
| "better_baseline": "minimal", |
| "direction": "lower", |
| "key": "mae", |
| "minimal": 10.5374, |
| "name": "MAE frames", |
| "neural_mlp": 10.5545 |
| }, |
| "modalities": [], |
| "module_summary": null, |
| "output_short": null, |
| "primary_direction": "C", |
| "process_short": null, |
| "research_name": "Time-to-next-transition regression", |
| "why": "Measures temporal boundary awareness as a continuous timing target." |
| } |
| ], |
| "three_foundation_pipelines": { |
| "claim_boundary": "These are supported pipeline directions, not three completed model-quality claims.", |
| "source_document": "THREE_FOUNDATION_PIPELINES.md", |
| "status": "pipeline_plan", |
| "title": "Three Foundation Pipeline Tracks", |
| "tracks": [ |
| { |
| "avoid_claiming_now": [ |
| "full neural rendering", |
| "full 3D reconstruction", |
| "general spatial intelligence without artifact-level evidence" |
| ], |
| "core_inputs": [ |
| "multiview RGB", |
| "egocentric video", |
| "depth", |
| "camera pose", |
| "calibration", |
| "object cues", |
| "language questions" |
| ], |
| "current_maturity": "Ready as a pipeline and evaluation contract.", |
| "diagram_flow": [ |
| { |
| "items": [ |
| "multiview RGB plus egocentric video", |
| "metric depth and confidence", |
| "camera pose, calibration, SLAM", |
| "object, contact, and language cues" |
| ], |
| "stage": "inputs" |
| }, |
| { |
| "items": [ |
| "spatial QA and object count", |
| "object permanence across windows", |
| "relative location and retrieval", |
| "pose-aware 3D consistency" |
| ], |
| "stage": "tasks_targets" |
| }, |
| { |
| "items": [ |
| "export scene/object memory records", |
| "train spatial-memory encoder", |
| "add geometry-aware QA and retrieval heads", |
| "keep episode-level split discipline" |
| ], |
| "stage": "train_models" |
| }, |
| { |
| "items": [ |
| "held-out episode spatial metrics", |
| "count and relation accuracy", |
| "retrieval rank and consistency", |
| "saved predictions before public claim" |
| ], |
| "stage": "evaluate_gates" |
| } |
| ], |
| "diagram_image": "docs/assets/foundation-pipelines/spatial-intelligence-pipeline.png", |
| "first_pipeline": "Build a spatial-memory exporter, start with metric depth and pose consistency tasks, then evaluate spatial QA, object permanence, counting, retrieval, and pose-aware consistency.", |
| "id": "spatial_intelligence", |
| "image_alt": "High-resolution slide diagram showing the Spatial intelligence models direction for Xperience-10M.", |
| "intermediate_artifacts": [ |
| "synchronized camera window manifest", |
| "pose and depth availability report", |
| "scene and object memory records", |
| "object permanence targets", |
| "spatial relation targets", |
| "spatial QA prompts" |
| ], |
| "next_gate": "Raw depth and pose artifacts plus held-out multi-episode spatial metrics.", |
| "one_sample_training_io": { |
| "boundary": "This yields a one-episode spatial training-pair recipe and proxy tasks; full spatial-intelligence claims require held-out multi-episode depth/pose/scene-memory metrics.", |
| "existing_task_hooks": [ |
| "object_relevance", |
| "modality_reconstruction", |
| "caption_grounding", |
| "object_set_forecast", |
| "camera_view_sync_retrieval" |
| ], |
| "input_builder": "Slice each 20-frame window, then join multiview RGB summaries with depth, camera pose, SLAM/calibration, object cues, contact cues, and optional language questions from the public annotation timeline.", |
| "sample_basis": "Single public sample episode: 5,821 frames, 1,161 overlapping 20-frame windows, 5-frame stride, about 20 FPS.", |
| "source_artifacts": [ |
| "results/episode_task_suite/windows.csv", |
| "results/episode_task_suite/shared_windows.npz", |
| "results/episode_task_suite/feature_manifest.json", |
| "official sample annotation.hdf5", |
| "official sample six MP4 camera streams" |
| ], |
| "target_builder": "Create spatial targets such as camera-view match, object relevance, object-set memory, depth/pose reconstruction proxy, caption-grounded retrieval, and spatial QA answers." |
| }, |
| "outputs": [ |
| "object count", |
| "object persistence", |
| "relative location", |
| "3D geometry consistency", |
| "multiview retrieval", |
| "camera-motion-aware scene memory", |
| "language answers grounded in the scene" |
| ], |
| "question": "Can the model recover and reason over space from video?", |
| "title": "Spatial intelligence models", |
| "website_image": "assets/foundation-pipelines/spatial-intelligence-pipeline.png" |
| }, |
| { |
| "avoid_claiming_now": [ |
| "strong world model from structured future-task scores alone", |
| "visual future quality without visual or latent future metrics" |
| ], |
| "core_inputs": [ |
| "observed video windows", |
| "audio", |
| "sensor windows", |
| "hand and body motion", |
| "object and contact state", |
| "action and subtask labels", |
| "future windows" |
| ], |
| "current_maturity": "Partially evidenced by current future-task probes and Cosmos-style branch artifacts.", |
| "diagram_flow": [ |
| { |
| "items": [ |
| "observed video/audio/sensor window", |
| "hand/body motion and camera pose", |
| "object/contact state", |
| "action and subtask labels" |
| ], |
| "stage": "inputs" |
| }, |
| { |
| "items": [ |
| "next action and next subtask", |
| "future object set", |
| "contact transition", |
| "camera-motion delta or latent future" |
| ], |
| "stage": "tasks_targets" |
| }, |
| { |
| "items": [ |
| "Qwen structured future probes", |
| "Cosmos/dynamics branch separately", |
| "latent rollout or reconstruction loss", |
| "no target-side future leakage" |
| ], |
| "stage": "train_models" |
| }, |
| { |
| "items": [ |
| "held-out future-task metrics", |
| "contact and object-set F1", |
| "rollout or latent consistency", |
| "per-episode breakdown and examples" |
| ], |
| "stage": "evaluate_gates" |
| } |
| ], |
| "diagram_image": "docs/assets/foundation-pipelines/human-video-world-model-pipeline.png", |
| "first_pipeline": "Keep Qwen-style structured future probes for task interpretability, keep Cosmos-style dynamics branches separate, and add latent or feature-reconstruction metrics before claiming world-model quality.", |
| "id": "human_video_world_models", |
| "image_alt": "High-resolution slide diagram showing the Human-video world models direction for Xperience-10M.", |
| "intermediate_artifacts": [ |
| "observed and future window pairs", |
| "future label targets", |
| "action-conditioned target records", |
| "visual or latent reconstruction targets", |
| "temporal consistency metadata" |
| ], |
| "next_gate": "Stronger future-state metrics, qualitative future examples, and held-out episode breakdowns.", |
| "one_sample_training_io": { |
| "boundary": "Future labels and future windows must stay out of the input. Structured future probes are evidence for the pipeline, not a full visual world-model claim by themselves.", |
| "existing_task_hooks": [ |
| "next_action", |
| "long_horizon_next_action", |
| "next_subtask_forecast", |
| "object_set_forecast", |
| "time_to_transition", |
| "ego_motion_forecast" |
| ], |
| "input_builder": "Use the current 20-frame observed window at time t: RGB/audio/sensor summaries, hand/body motion, camera pose, current object/contact state, and current action/subtask context only.", |
| "sample_basis": "Single public sample episode: current observed windows are paired with shifted future labels or future-window features from the same timeline.", |
| "source_artifacts": [ |
| "results/episode_task_suite/windows.csv", |
| "results/episode_task_suite/shared_windows.npz", |
| "results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json", |
| "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json" |
| ], |
| "target_builder": "Shift the episode timeline forward to produce next-action, next-subtask, future object-set, contact-transition, time-to-transition, camera-motion delta, or latent/future-feature targets." |
| }, |
| "outputs": [ |
| "next action", |
| "next subtask", |
| "future object set", |
| "future state embedding", |
| "camera-motion delta", |
| "contact transition", |
| "future-window quality metrics" |
| ], |
| "question": "Can the model predict what happens next?", |
| "title": "Human-video world models", |
| "website_image": "assets/foundation-pipelines/human-video-world-model-pipeline.png" |
| }, |
| { |
| "avoid_claiming_now": [ |
| "robot policy quality", |
| "policy generalization before action-space evidence exists" |
| ], |
| "core_inputs": [ |
| "egocentric video", |
| "language captions", |
| "hand and body motion", |
| "contacts", |
| "objects", |
| "procedure and subtask labels" |
| ], |
| "current_maturity": "Feasible but gated by action-target conversion.", |
| "diagram_flow": [ |
| { |
| "items": [ |
| "egocentric video and captions", |
| "objects, contacts, and procedures", |
| "hand/body motion windows", |
| "subtask labels and language context" |
| ], |
| "stage": "inputs" |
| }, |
| { |
| "items": [ |
| "action-token vocabulary", |
| "next action and action chunks", |
| "object-conditioned actions", |
| "contact state and subtask transition" |
| ], |
| "stage": "tasks_targets" |
| }, |
| { |
| "items": [ |
| "build action-space converter", |
| "normalize and audit action chunks", |
| "train VLA/policy-compatible head", |
| "track leakage and retargeting reports" |
| ], |
| "stage": "train_models" |
| }, |
| { |
| "items": [ |
| "held-out action metrics", |
| "chunk and next-action accuracy", |
| "object/contact-conditioned scores", |
| "policy card before robot-policy claim" |
| ], |
| "stage": "evaluate_gates" |
| } |
| ], |
| "diagram_image": "docs/assets/foundation-pipelines/vision-language-action-pipeline.png", |
| "first_pipeline": "Define the action space, start with the next-action, contact, and object-conditioned tasks from the existing 20-task suite, then add hand-trajectory or policy-compatible action chunks after conversion is traceable.", |
| "id": "vision_language_action", |
| "image_alt": "High-resolution slide diagram showing the Vision-language-action models direction for Xperience-10M.", |
| "intermediate_artifacts": [ |
| "action-token vocabulary", |
| "action-chunk windows", |
| "normalization stats", |
| "retargeting report", |
| "leakage audit", |
| "action-space model card" |
| ], |
| "next_gate": "Traceable action tokens, normalization, retargeting metadata, and held-out policy metrics.", |
| "one_sample_training_io": { |
| "boundary": "This is a VLA/policy data-conversion recipe for the one-sample suite. Robot policy claims require a later action-space converter, normalization, retargeting report, and held-out policy metrics.", |
| "existing_task_hooks": [ |
| "timeline_action", |
| "next_action", |
| "hand_trajectory_forecast", |
| "contact_prediction", |
| "interaction_text_prediction", |
| "action_object_relation" |
| ], |
| "input_builder": "Use egocentric/fisheye video windows, caption and object context, hand/body mocap, contact state, and current subtask text as the observation-language side of each training pair.", |
| "sample_basis": "Single public sample episode: observation-language windows are paired with action-token proxies because robot-retargeted action chunks are not part of the public sample yet.", |
| "source_artifacts": [ |
| "results/episode_task_suite/windows.csv", |
| "results/episode_task_suite/shared_windows.npz", |
| "results/episode_task_suite/task_walkthroughs/task_walkthroughs.json", |
| "official sample annotation.hdf5" |
| ], |
| "target_builder": "Create action-token proxy targets: current or next action, object-conditioned action relation, contact state, interaction-text class, subtask transition, or hand-trajectory/action-chunk proxy." |
| }, |
| "outputs": [ |
| "next action", |
| "action chunk", |
| "object-conditioned action", |
| "contact state", |
| "subtask transition", |
| "policy or VLA held-out metrics" |
| ], |
| "question": "Can the model turn what it sees and reads into action?", |
| "title": "Vision-language-action models", |
| "website_image": "assets/foundation-pipelines/vision-language-action-pipeline.png" |
| } |
| ] |
| }, |
| "title": "Interactive Research Roadmap" |
| } |
|
|