ropedia-xperience-10m-task-baselines / docs /data /research_roadmap_interactive.json

Add files using upload-large-folder tool

8b4c4fe verified 3 days ago

145 kB

	{
	"additional_development_directions": {
	"directions": [
	{
	"data_signals": [
	"language annotations",
	"object labels",
	"scene context",
	"video thumbnails",
	"motion statistics",
	"missing-modality flags"
	],
	"evaluation": "Coverage by session, activity, object, and modality; duplicate checks; train/val/test leakage checks; reproducible selection report.",
	"first_build": "Episode atlas, category tags, balance report, and split builder across activities, objects, scenes, people, sessions, and missing modalities.",
	"id": "episode_taxonomy_data_engine",
	"name": "Episode Taxonomy and Data Engine",
	"why_it_matters": "Fine-tuning quality depends on selecting representative episodes instead of sampling randomly from a large corpus."
	},
	{
	"data_signals": [
	"episode manifests",
	"window manifests",
	"task labels",
	"prediction files",
	"metric files"
	],
	"evaluation": "Versioned splits, deterministic metric scripts, task-specific confidence intervals, and model-card reporting templates.",
	"first_build": "Fixed train/val/test manifests, task cards, leakage checks, metric scripts, and small reference baselines.",
	"id": "standardized_benchmark_protocol",
	"name": "Standardized Benchmark Protocol",
	"why_it_matters": "Future model results become comparable across Qwen, Cosmos-style world models, policy models, and smaller task heads."
	},
	{
	"data_signals": [
	"video",
	"audio",
	"depth",
	"pose/SLAM",
	"mocap",
	"IMU",
	"language"
	],
	"evaluation": "Cross-modal retrieval, missing-modality reconstruction, transfer to the 12 task heads, and held-out episode generalization.",
	"first_build": "Contrastive and masked-prediction objectives over synchronized multimodal windows.",
	"id": "multimodal_representation_learning",
	"name": "Multimodal Representation Learning",
	"why_it_matters": "Xperience-10M can train reusable encoders before committing to expensive large-model fine-tuning or pretraining."
	},
	{
	"data_signals": [
	"action labels",
	"subtask labels",
	"language annotations",
	"hand trajectories",
	"contact states",
	"object labels"
	],
	"evaluation": "Step boundary accuracy, transition prediction, next-step prediction, graph consistency, and long-horizon task replay.",
	"first_build": "Step segmentation, transition graph, precondition/effect labels, and temporal skill graph extraction.",
	"id": "skill_procedure_graph_mining",
	"name": "Skill and Procedure Graph Mining",
	"why_it_matters": "It connects egocentric perception to task structure, planning, and long-horizon embodied reasoning."
	},
	{
	"data_signals": [
	"hand mocap",
	"body mocap",
	"contacts",
	"objects",
	"egocentric video",
	"language"
	],
	"evaluation": "Contact F1, object micro-F1, affordance accuracy, future interaction prediction, and per-object error analysis.",
	"first_build": "Contact, hand-object state, reachable object, likely tool use, and next-affordance prediction tasks.",
	"id": "human_object_affordance_modeling",
	"name": "Human-Object Interaction and Affordance Modeling",
	"why_it_matters": "The dataset can model what actions the scene affords, not only what action label is currently visible."
	},
	{
	"data_signals": [
	"depth",
	"pose/SLAM",
	"multiview video",
	"camera calibration",
	"objects",
	"motion traces"
	],
	"evaluation": "Map consistency, object permanence, spatial retrieval, future-state prediction, and novel-view or view-consistency probes.",
	"first_build": "Persistent scene/object map prototypes built from depth, pose/SLAM, multiview video, and object cues.",
	"id": "scene_object_memory",
	"name": "3D/4D Scene and Object Memory",
	"why_it_matters": "It moves beyond frame-level recognition toward world-state tracking, object permanence, and spatial reasoning."
	},
	{
	"data_signals": [
	"timestamps",
	"file manifests",
	"camera streams",
	"audio streams",
	"depth streams",
	"calibration",
	"annotation coverage"
	],
	"evaluation": "QA pass rate, drift estimates, missing-view tables, corruption reports, and exclusion or degraded-mode manifests.",
	"first_build": "Per-episode QA for timestamp drift, stream availability, calibration consistency, corrupted files, and missing modalities.",
	"id": "data_quality_sync_diagnostics",
	"name": "Data Quality, Synchronization, and Missing-Modality Diagnostics",
	"why_it_matters": "Large multimodal training fails quietly without strong data-quality gates, so QA should be a first-class artifact."
	},
	{
	"data_signals": [
	"mocap",
	"hand trajectories",
	"contacts",
	"object states",
	"egocentric video",
	"language instructions"
	],
	"evaluation": "Retargeting validity, action prediction, contact consistency, imitation rollout quality, and sim-to-real assumption checks.",
	"first_build": "Action-token conversion, robot-compatible targets, imitation-learning examples, and simulation transfer probes.",
	"id": "policy_retargeting_simulation_transfer",
	"name": "Policy, Retargeting, and Simulation Transfer",
	"why_it_matters": "It creates a bridge from human egocentric experience to robot policies while keeping action-space assumptions explicit."
	}
	],
	"practical_order": [
	"Build the episode taxonomy and data-quality diagnostics first.",
	"Lock the benchmark protocol and split manifests before reporting model scores.",
	"Add representation-learning and skill-graph objectives once enough episodes are staged.",
	"Add affordance, 3D/4D memory, and policy-retargeting branches after labels and action targets are measurable."
	],
	"public_boundary": "These are proposed development tracks. They are not reported as completed held-out benchmark results.",
	"source_document": "ADDITIONAL_DEVELOPMENT_DIRECTIONS.md",
	"status": "planned_research_directions",
	"summary": "Concrete Xperience-10M project directions beyond the current minimal baselines, Qwen3-Omni LoRA plan, Cosmos/world-model branch, and long-term native pretraining goal.",
	"title": "Additional Development Directions"
	},
	"baseline_summary": {
	"baseline_heads": "minimal and neural MLP heads",
	"current_use": "task design, data-contract validation, case studies, and baseline comparison",
	"split": "chronological single-episode split for public-sample diagnostics",
	"task_count": 12
	},
	"directions": [
	{
	"code": "A",
	"counts": {
	"diagnostic": 0,
	"direct": 2,
	"proxy": 2,
	"total_links": 4
	},
	"current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
	"current_status": "partially implemented",
	"extension_tasks": [
	{
	"current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior.",
	"family": "classification",
	"id": "body_motion_intensity",
	"metric_name": "macro-F1",
	"name": "Body and Hand Motion Intensity"
	}
	],
	"focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
	"id": "human_motion",
	"name": "Human Modeling & Motion Understanding",
	"next_steps": [
	"Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
	"Train sequence models over multi-episode motion trajectories instead of isolated windows.",
	"Evaluate affordance prediction on held-out objects and held-out episodes."
	],
	"preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
	"task_ids": [
	"timeline_action",
	"hand_trajectory_forecast",
	"contact_prediction",
	"object_relevance"
	],
	"tasks": [
	{
	"architecture_family": "multiclass classifier",
	"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
	"current_limit": "Chronological single-episode split creates unseen future action classes.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct"
	},
	"display_name": "Action Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_action",
	"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.05,
	"name": "macro-F1",
	"neural_mlp": 0.0148
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current action class",
	"primary_direction": "C",
	"process_short": "window features -> action label builder -> classifier",
	"research_name": "Egocentric Action Recognition",
	"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
	},
	{
	"architecture_family": "continuous regressor",
	"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
	"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Hand Trajectory Forecasting",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "hand_trajectory_forecast",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current multimodal window",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "lower",
	"key": "mpjpe",
	"minimal": 0.8647,
	"name": "MPJPE",
	"neural_mlp": 0.1079
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"pose_slam",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "future hand-joint trajectory",
	"primary_direction": "A",
	"process_short": "current features -> future mocap target -> regression head",
	"research_name": "3D Hand Motion Forecasting",
	"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
	"current_limit": "The public sample is degenerate for this target because one class dominates.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Contact State Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "contact_prediction",
	"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
	"input_short": "non-contact, non-caption features",
	"metric": {
	"better_baseline": "tie",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 1.0,
	"name": "macro-F1",
	"neural_mlp": 1.0
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "contact or no contact",
	"primary_direction": "A",
	"process_short": "feature filter -> contact target -> binary classifier",
	"research_name": "Human-Object Contact Prediction",
	"why": "Targets physical interaction state, a core affordance and manipulation signal."
	},
	{
	"architecture_family": "multi-label classifier",
	"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
	"current_limit": "Object labels are language-derived and sparse in one episode.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Object Relevance Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "supervised",
	"id": "object_relevance",
	"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
	"input_short": "non-caption multimodal features",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "micro_f1",
	"minimal": 0.1803,
	"name": "micro-F1",
	"neural_mlp": 0.1679
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "relevant object set",
	"primary_direction": "C",
	"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
	"research_name": "Object-Centric Interaction Recognition",
	"why": "Connects egocentric activity to manipulated objects and early object-centric state."
	}
	]
	},
	{
	"code": "B",
	"counts": {
	"diagnostic": 1,
	"direct": 0,
	"proxy": 2,
	"total_links": 3
	},
	"current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
	"current_status": "proxy tasks only",
	"extension_tasks": [
	{
	"current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis.",
	"family": "retrieval",
	"id": "multi_view_consistency_retrieval",
	"metric_name": "MRR",
	"name": "Multi-View Consistency Retrieval"
	}
	],
	"focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
	"id": "reconstruction_rendering",
	"name": "3D/4D Reconstruction & Neural Rendering",
	"next_steps": [
	"Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
	"Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
	"Evaluate novel-view synthesis and temporal consistency across held-out views/time."
	],
	"preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
	"task_ids": [
	"cross_modal_retrieval",
	"modality_reconstruction",
	"misalignment_detection"
	],
	"tasks": [
	{
	"architecture_family": "two-tower retrieval head",
	"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
	"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"C": "diagnostic",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Retrieval",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "cross_modal_retrieval",
	"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
	"input_short": "motion/IMU/pose query; depth/video candidates",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.2693,
	"name": "MRR",
	"neural_mlp": 0.13
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked visual windows",
	"primary_direction": "C",
	"process_short": "modality split -> projection -> nearest-neighbor ranker",
	"research_name": "Multimodal Representation Retrieval",
	"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
	},
	{
	"architecture_family": "feature regressor",
	"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
	"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Reconstruction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "modality_reconstruction",
	"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
	"input_short": "motion, IMU, and camera/pose features",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "r2",
	"minimal": -0.0153,
	"name": "R2",
	"neural_mlp": -0.0102
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "reconstructed depth/video vector",
	"primary_direction": "B",
	"process_short": "source-target split -> scaler -> regression head",
	"research_name": "Modality Feature Reconstruction",
	"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
	"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	"direction_roles": {
	"B": "diagnostic",
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Multimodal Synchronization Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "misalignment_detection",
	"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
	"input_short": "motion-side and visual/depth-side feature groups",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.5052,
	"name": "F1",
	"neural_mlp": 0.7153
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "aligned or shifted",
	"primary_direction": "C",
	"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
	"research_name": "Cross-Modal Misalignment Detection",
	"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
	}
	]
	},
	{
	"code": "C",
	"counts": {
	"diagnostic": 3,
	"direct": 6,
	"proxy": 2,
	"total_links": 11
	},
	"current_readout": "Six of the 12 tasks directly target egocentric action, task state, interaction, grounding, and alignment; most of the others link to this direction as proxy or diagnostic probes.",
	"current_status": "strongest implemented track",
	"extension_tasks": [
	{
	"current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks.",
	"family": "regression",
	"id": "action_phase_progress",
	"metric_name": "MAE",
	"name": "Action Phase Progress Estimation"
	}
	],
	"focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
	"id": "egocentric_interaction",
	"name": "Egocentric Vision & Interaction",
	"next_steps": [
	"Move from single-episode chronological splits to held-out-episode splits.",
	"Use audio together with stronger multimodal backbones for action, intent, and grounding.",
	"Evaluate long-horizon task success prediction and action-conditioned generation."
	],
	"preferred_background": "Video understanding, action recognition, or egocentric vision.",
	"task_ids": [
	"timeline_action",
	"timeline_subtask",
	"transition_detection",
	"next_action",
	"hand_trajectory_forecast",
	"contact_prediction",
	"object_relevance",
	"caption_grounding",
	"cross_modal_retrieval",
	"temporal_order",
	"misalignment_detection"
	],
	"tasks": [
	{
	"architecture_family": "multiclass classifier",
	"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
	"current_limit": "Chronological single-episode split creates unseen future action classes.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct"
	},
	"display_name": "Action Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_action",
	"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.05,
	"name": "macro-F1",
	"neural_mlp": 0.0148
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current action class",
	"primary_direction": "C",
	"process_short": "window features -> action label builder -> classifier",
	"research_name": "Egocentric Action Recognition",
	"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
	},
	{
	"architecture_family": "multiclass classifier",
	"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
	"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Procedure Step Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_subtask",
	"input": "The same all-modality window vector used by action recognition.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0506,
	"name": "macro-F1",
	"neural_mlp": 0.0281
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current procedure step",
	"primary_direction": "C",
	"process_short": "window features -> subtask label builder -> classifier",
	"research_name": "Temporal Subtask Recognition",
	"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
	"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
	"direction_roles": {
	"C": "direct",
	"D": "diagnostic"
	},
	"display_name": "Action Boundary Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "diagnostic",
	"id": "transition_detection",
	"input": "One all-modality window vector plus labels derived from action-change timestamps.",
	"input_short": "current window with boundary target",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.6118,
	"name": "macro-F1",
	"neural_mlp": 0.5862
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "boundary or steady",
	"primary_direction": "C",
	"process_short": "action changes -> boundary labels -> binary classifier",
	"research_name": "Temporal Action Segmentation",
	"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
	},
	{
	"architecture_family": "future-label classifier",
	"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
	"current_limit": "Unseen future labels dominate the single-episode chronological test.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Next-Action Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "next_action",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current window at time t",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0593,
	"name": "macro-F1",
	"neural_mlp": 0.0419
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "action at t+20 frames",
	"primary_direction": "C",
	"process_short": "current features -> future label shift -> classifier",
	"research_name": "Short-Horizon Intention Prediction",
	"why": "Tests action intention/task-flow prediction from egocentric context."
	},
	{
	"architecture_family": "continuous regressor",
	"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
	"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Hand Trajectory Forecasting",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "hand_trajectory_forecast",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current multimodal window",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "lower",
	"key": "mpjpe",
	"minimal": 0.8647,
	"name": "MPJPE",
	"neural_mlp": 0.1079
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"pose_slam",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "future hand-joint trajectory",
	"primary_direction": "A",
	"process_short": "current features -> future mocap target -> regression head",
	"research_name": "3D Hand Motion Forecasting",
	"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
	"current_limit": "The public sample is degenerate for this target because one class dominates.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Contact State Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "contact_prediction",
	"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
	"input_short": "non-contact, non-caption features",
	"metric": {
	"better_baseline": "tie",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 1.0,
	"name": "macro-F1",
	"neural_mlp": 1.0
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "contact or no contact",
	"primary_direction": "A",
	"process_short": "feature filter -> contact target -> binary classifier",
	"research_name": "Human-Object Contact Prediction",
	"why": "Targets physical interaction state, a core affordance and manipulation signal."
	},
	{
	"architecture_family": "multi-label classifier",
	"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
	"current_limit": "Object labels are language-derived and sparse in one episode.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Object Relevance Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "supervised",
	"id": "object_relevance",
	"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
	"input_short": "non-caption multimodal features",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "micro_f1",
	"minimal": 0.1803,
	"name": "micro-F1",
	"neural_mlp": 0.1679
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "relevant object set",
	"primary_direction": "C",
	"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
	"research_name": "Object-Centric Interaction Recognition",
	"why": "Connects egocentric activity to manipulated objects and early object-centric state."
	},
	{
	"architecture_family": "retrieval ranker",
	"case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.",
	"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Language Grounding",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "caption_grounding",
	"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
	"input_short": "text-like query and candidate windows",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.016,
	"name": "MRR",
	"neural_mlp": 0.0168
	},
	"modalities": [
	"language",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked matching moments",
	"primary_direction": "C",
	"process_short": "query features -> candidate index -> cosine ranker",
	"research_name": "Language-to-Moment Grounding",
	"why": "Grounds language annotation into egocentric sensor time and task state."
	},
	{
	"architecture_family": "two-tower retrieval head",
	"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
	"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"C": "diagnostic",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Retrieval",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "cross_modal_retrieval",
	"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
	"input_short": "motion/IMU/pose query; depth/video candidates",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.2693,
	"name": "MRR",
	"neural_mlp": 0.13
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked visual windows",
	"primary_direction": "C",
	"process_short": "modality split -> projection -> nearest-neighbor ranker",
	"research_name": "Multimodal Representation Retrieval",
	"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
	"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
	"direction_roles": {
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Temporal Order Verification",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "temporal_order",
	"input": "A pair of adjacent window vectors, plus their difference vector.",
	"input_short": "two adjacent windows plus difference vector",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.54,
	"name": "F1",
	"neural_mlp": 0.852
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "correct or reversed",
	"primary_direction": "C",
	"process_short": "pair builder -> feature combiner -> binary classifier",
	"research_name": "Temporal Order Verification",
	"why": "Checks whether features encode local time direction and task progression."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
	"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	"direction_roles": {
	"B": "diagnostic",
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Multimodal Synchronization Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "misalignment_detection",
	"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
	"input_short": "motion-side and visual/depth-side feature groups",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.5052,
	"name": "F1",
	"neural_mlp": 0.7153
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "aligned or shifted",
	"primary_direction": "C",
	"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
	"research_name": "Cross-Modal Misalignment Detection",
	"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
	}
	]
	},
	{
	"code": "D",
	"counts": {
	"diagnostic": 3,
	"direct": 0,
	"proxy": 6,
	"total_links": 9
	},
	"current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
	"current_status": "early proxy tasks",
	"extension_tasks": [
	{
	"current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model.",
	"family": "forecast",
	"id": "ego_motion_forecast",
	"metric_name": "MAE",
	"name": "Short-Horizon Ego-Motion Forecasting"
	}
	],
	"focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
	"id": "world_modeling",
	"name": "Scene Reconstruction & World Modeling",
	"next_steps": [
	"Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
	"Add map consistency, object permanence, and spatial relation prediction tasks.",
	"Train held-out-episode world models that predict future observations and task state."
	],
	"preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
	"task_ids": [
	"timeline_subtask",
	"transition_detection",
	"next_action",
	"object_relevance",
	"caption_grounding",
	"cross_modal_retrieval",
	"modality_reconstruction",
	"temporal_order",
	"misalignment_detection"
	],
	"tasks": [
	{
	"architecture_family": "multiclass classifier",
	"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
	"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Procedure Step Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_subtask",
	"input": "The same all-modality window vector used by action recognition.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0506,
	"name": "macro-F1",
	"neural_mlp": 0.0281
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current procedure step",
	"primary_direction": "C",
	"process_short": "window features -> subtask label builder -> classifier",
	"research_name": "Temporal Subtask Recognition",
	"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
	"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
	"direction_roles": {
	"C": "direct",
	"D": "diagnostic"
	},
	"display_name": "Action Boundary Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "diagnostic",
	"id": "transition_detection",
	"input": "One all-modality window vector plus labels derived from action-change timestamps.",
	"input_short": "current window with boundary target",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.6118,
	"name": "macro-F1",
	"neural_mlp": 0.5862
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "boundary or steady",
	"primary_direction": "C",
	"process_short": "action changes -> boundary labels -> binary classifier",
	"research_name": "Temporal Action Segmentation",
	"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
	},
	{
	"architecture_family": "future-label classifier",
	"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
	"current_limit": "Unseen future labels dominate the single-episode chronological test.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Next-Action Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "next_action",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current window at time t",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0593,
	"name": "macro-F1",
	"neural_mlp": 0.0419
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "action at t+20 frames",
	"primary_direction": "C",
	"process_short": "current features -> future label shift -> classifier",
	"research_name": "Short-Horizon Intention Prediction",
	"why": "Tests action intention/task-flow prediction from egocentric context."
	},
	{
	"architecture_family": "multi-label classifier",
	"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
	"current_limit": "Object labels are language-derived and sparse in one episode.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Object Relevance Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "supervised",
	"id": "object_relevance",
	"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
	"input_short": "non-caption multimodal features",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "micro_f1",
	"minimal": 0.1803,
	"name": "micro-F1",
	"neural_mlp": 0.1679
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "relevant object set",
	"primary_direction": "C",
	"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
	"research_name": "Object-Centric Interaction Recognition",
	"why": "Connects egocentric activity to manipulated objects and early object-centric state."
	},
	{
	"architecture_family": "retrieval ranker",
	"case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.",
	"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Language Grounding",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "caption_grounding",
	"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
	"input_short": "text-like query and candidate windows",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.016,
	"name": "MRR",
	"neural_mlp": 0.0168
	},
	"modalities": [
	"language",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked matching moments",
	"primary_direction": "C",
	"process_short": "query features -> candidate index -> cosine ranker",
	"research_name": "Language-to-Moment Grounding",
	"why": "Grounds language annotation into egocentric sensor time and task state."
	},
	{
	"architecture_family": "two-tower retrieval head",
	"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
	"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"C": "diagnostic",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Retrieval",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "cross_modal_retrieval",
	"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
	"input_short": "motion/IMU/pose query; depth/video candidates",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.2693,
	"name": "MRR",
	"neural_mlp": 0.13
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked visual windows",
	"primary_direction": "C",
	"process_short": "modality split -> projection -> nearest-neighbor ranker",
	"research_name": "Multimodal Representation Retrieval",
	"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
	},
	{
	"architecture_family": "feature regressor",
	"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
	"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Reconstruction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "modality_reconstruction",
	"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
	"input_short": "motion, IMU, and camera/pose features",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "r2",
	"minimal": -0.0153,
	"name": "R2",
	"neural_mlp": -0.0102
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "reconstructed depth/video vector",
	"primary_direction": "B",
	"process_short": "source-target split -> scaler -> regression head",
	"research_name": "Modality Feature Reconstruction",
	"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
	"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
	"direction_roles": {
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Temporal Order Verification",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "temporal_order",
	"input": "A pair of adjacent window vectors, plus their difference vector.",
	"input_short": "two adjacent windows plus difference vector",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.54,
	"name": "F1",
	"neural_mlp": 0.852
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "correct or reversed",
	"primary_direction": "C",
	"process_short": "pair builder -> feature combiner -> binary classifier",
	"research_name": "Temporal Order Verification",
	"why": "Checks whether features encode local time direction and task progression."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
	"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	"direction_roles": {
	"B": "diagnostic",
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Multimodal Synchronization Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "misalignment_detection",
	"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
	"input_short": "motion-side and visual/depth-side feature groups",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.5052,
	"name": "F1",
	"neural_mlp": 0.7153
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "aligned or shifted",
	"primary_direction": "C",
	"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
	"research_name": "Cross-Modal Misalignment Detection",
	"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
	}
	]
	}
	],
	"foundation_model_plan": {
	"decision": {
	"external_reasoning_reference": "Gemini Robotics",
	"first_policy_branch_candidates": [
	"OpenVLA / OpenVLA-OFT",
	"openpi pi0/pi0.5",
	"NVIDIA GR00T"
	],
	"first_world_model_branch": "Cosmos 3",
	"immediate_trainable_backbone": "Qwen3-Omni",
	"long_term_native_pretraining_goal": "Xperience Embodied Foundation Model"
	},
	"evaluation_additions": [
	{
	"metrics": [
	"JSON validity",
	"macro-F1",
	"accuracy",
	"micro-F1"
	],
	"model_families": [
	"Qwen3-Omni",
	"Gemini Robotics reference"
	],
	"target": "structured_task_prediction"
	},
	{
	"metrics": [
	"retrieval rank",
	"temporal consistency",
	"feature reconstruction",
	"qualitative visual inspection"
	],
	"model_families": [
	"Cosmos 3"
	],
	"target": "future_state_prediction"
	},
	{
	"metrics": [
	"transition accuracy",
	"contact accuracy",
	"next-action accuracy"
	],
	"model_families": [
	"Cosmos 3",
	"OpenVLA",
	"openpi",
	"GR00T"
	],
	"target": "action_conditioned_dynamics"
	},
	{
	"metrics": [
	"held-out episode metrics",
	"held-out session metrics",
	"leakage checks"
	],
	"model_families": [
	"all trainable branches"
	],
	"target": "cross_episode_generalization"
	}
	],
	"execution_order": [
	{
	"action": "Stage at least 32 valid Xperience-10M episodes with held-out episode split.",
	"name": "Data gate",
	"step": 1
	},
	{
	"action": "Run Qwen3-Omni action/subtask error analysis and targeted reruns to improve the verified diagnostic baseline.",
	"name": "First held-out baseline",
	"step": 2
	},
	{
	"action": "Run 3-8 episode dry runs for any next backbone before scaling beyond the selected split.",
	"name": "Model-selection dry run",
	"step": 3
	},
	{
	"action": "Promote Cosmos 3 beyond the current Nano compatibility and Super forward-dynamics runs only when loss metrics, preprocessing, and storage justify the added compute.",
	"name": "World-model branch",
	"step": 4
	},
	{
	"action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable.",
	"name": "Policy branch",
	"step": 5
	},
	{
	"action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples.",
	"name": "Publishing threshold",
	"step": 6
	},
	{
	"action": "Start a from-scratch Xperience Embodied Foundation Model only after smaller scaling stages, full-corpus storage, multi-node compute, and held-out evaluation protocols are in place.",
	"name": "Xperience-native pretraining",
	"step": 7
	}
	],
	"model_families": [
	{
	"best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
	"category": "omni_instruction_model",
	"current_decision": "keep_as_first_pilot",
	"entry_condition": "Selected episodes prepared with held-out episode split.",
	"family": "Qwen3-Omni",
	"openness": "open_weights_available_from_official_hf_repo",
	"priority": 1,
	"public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct",
	"xperience10m_fit": [
	"RGB/fisheye video, embedded audio, and language prompts can enter directly.",
	"Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
	"Matches current task outputs: labels, structured JSON, captions, and short decisions."
	]
	},
	{
	"best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
	"category": "world_foundation_model",
	"current_decision": "implemented_as_nano_future_window_and_super_forward_dynamics_branches",
	"entry_condition": "Use separate metrics for Nano future-window retrieval and Super forward-dynamics MSE; do not compare them directly to Qwen JSON-task accuracy.",
	"family": "Cosmos 3",
	"openness": "track_official_nvidia_release_and_available_weights",
	"priority": 2,
	"public_source": "https://www.nvidia.com/en-us/ai/cosmos/",
	"xperience10m_fit": [
	"Uses video streams as visual state.",
	"Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
	"Better aligned with prediction/generation objectives than simple label classification."
	]
	},
	{
	"best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
	"category": "humanoid_policy_foundation_model",
	"current_decision": "track_as_humanoid_policy_branch",
	"entry_condition": "Retargeting artifact and action-space definition exist.",
	"family": "NVIDIA GR00T",
	"openness": "track_official_nvidia_release_and_tooling",
	"priority": 3,
	"public_source": "https://developer.nvidia.com/isaac/gr00t",
	"xperience10m_fit": [
	"Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
	"Egocentric video plus human motion can support affordance and interaction tasks."
	]
	},
	{
	"best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
	"category": "vision_language_action_policy",
	"current_decision": "candidate_after_action_space_design",
	"entry_condition": "Window-to-action-token conversion is implemented and checked.",
	"family": "OpenVLA / OpenVLA-OFT",
	"openness": "open_project_and_weights",
	"priority": 4,
	"public_source": "https://openvla.github.io/",
	"xperience10m_fit": [
	"Good candidate when each window is expressed as visual observation, instruction/context, and action token.",
	"Requires an explicit action target; current human egocentric labels are not robot controls by default."
	]
	},
	{
	"best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
	"category": "robot_policy_model",
	"current_decision": "candidate_policy_branch",
	"entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
	"family": "openpi pi0/pi0.5",
	"openness": "open_source_policy_training_stack",
	"priority": 5,
	"public_source": "https://github.com/Physical-Intelligence/openpi",
	"xperience10m_fit": [
	"Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
	"Better for policy branch than for current structured task JSON outputs."
	]
	},
	{
	"best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
	"category": "closed_embodied_reasoning_reference",
	"current_decision": "external_reference_only",
	"entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
	"family": "Gemini Robotics",
	"openness": "closed_or_limited_access",
	"priority": 6,
	"public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/",
	"xperience10m_fit": [
	"Can help reason over egocentric scenes and task descriptions.",
	"Not a local fine-tune target for this repo."
	]
	},
	{
	"best_role": "Cheaper policy baselines for observation-to-action experiments.",
	"category": "lightweight_robot_policy_baselines",
	"current_decision": "optional_baseline_after_data_staging",
	"entry_condition": "Action labels and baseline protocol exist.",
	"family": "Octo / SmolVLA-style lightweight policies",
	"openness": "open_projects",
	"priority": 7,
	"public_source": "https://github.com/huggingface/lerobot",
	"xperience10m_fit": [
	"Useful after action target design.",
	"Less directly omni-modal than Qwen3-Omni or Cosmos 3."
	]
	},
	{
	"best_role": "Domain model over synchronized embodied experience.",
	"category": "xperience_native_pretraining_goal",
	"current_decision": "future_goal_after_scaling_evidence",
	"entry_condition": "Full-corpus data path, PB-scale storage, multi-node compute, and positive smaller-run scaling evidence.",
	"family": "Xperience Embodied Foundation Model",
	"openness": "future project-specific model if full-corpus access and compute exist",
	"priority": 8,
	"public_source": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md",
	"xperience10m_fit": [
	"Uses the full aligned modality stack rather than treating sensors as auxiliary metadata.",
	"Targets temporal embodied representation learning across perception, motion, geometry, audio, and language.",
	"Can become the shared pretraining backbone for Qwen-style instruction tasks, Cosmos-style world modeling, and policy/action branches."
	]
	}
	],
	"source_links": [
	{
	"label": "Qwen3-Omni official HF model",
	"url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
	},
	{
	"label": "NVIDIA Cosmos",
	"url": "https://www.nvidia.com/en-us/ai/cosmos/"
	},
	{
	"label": "NVIDIA Isaac GR00T",
	"url": "https://developer.nvidia.com/isaac/gr00t"
	},
	{
	"label": "OpenVLA",
	"url": "https://openvla.github.io/"
	},
	{
	"label": "openpi",
	"url": "https://github.com/Physical-Intelligence/openpi"
	},
	{
	"label": "Gemini Robotics",
	"url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
	},
	{
	"label": "Octo",
	"url": "https://octo-models.github.io/"
	},
	{
	"label": "LeRobot / SmolVLA",
	"url": "https://github.com/huggingface/lerobot"
	},
	{
	"label": "Xperience Embodied Foundation Model pretraining plan",
	"url": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md"
	}
	],
	"status": "planning_artifact"
	},
	"generated_at_utc": "2026-06-13T17:41:13+00:00",
	"omni_plan": {
	"adapter": "LoRA rank 16, alpha 32, dropout 0.05",
	"backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
	"evaluation": [
	"JSON validity",
	"action macro-F1",
	"subtask accuracy",
	"transition accuracy",
	"next-action accuracy",
	"contact accuracy",
	"object micro-F1",
	"held-out episode count"
	],
	"first_pilot": "32 held-out-episode pilot after valid episodes are prepared",
	"training_unit": "episode-level split, window-level supervised examples"
	},
	"phases": [
	{
	"completion_evidence": [
	"PROJECT_STATUS.md",
	"EVALUATION_PROTOCOL.md",
	"RESEARCH_TAKEAWAYS.md",
	"docs/data/summary_metrics.json",
	"results/episode_task_suite/summary_report.json"
	],
	"deliverables": [
	"1161 aligned windows",
	"12 task contracts",
	"minimal baseline heads",
	"neural MLP heads",
	"modality atlas",
	"task walkthroughs",
	"derived figures"
	],
	"entry_condition": "One public Xperience-10M sample episode is available.",
	"id": "public_sample_task_lab",
	"name": "Public-Sample Task Lab",
	"reader_takeaway": "The public sample supports task design, feature contracts, walkthroughs, and baseline comparisons.",
	"stage": "now",
	"status": "implemented"
	},
	{
	"completion_evidence": [
	"results/omni_finetune/DATA_ACCESS_STATUS.md",
	"results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
	"results/omni_finetune/source_discovery.json"
	],
	"deliverables": [
	"128 selected episodes",
	"episode manifest",
	"missing-view manifest",
	"held-out episode split",
	"source-discovery report"
	],
	"entry_condition": "Gated dataset availability and enough storage for selected episodes.",
	"id": "multi_episode_data_staging",
	"name": "Multi-Episode Data Preparation",
	"reader_takeaway": "The first selected split is available for Qwen3-Omni diagnostics, with train/test separation at the episode level.",
	"stage": "future",
	"status": "implemented_for_first_pilot"
	},
	{
	"completion_evidence": [
	"docs/data/omni_finetune_verified_result.json",
	"docs/data/qwen3_v5_v6_comparison.json",
	"results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md",
	"results/omni_finetune/verified_public/",
	"dataset_manifest.json",
	"training_metadata.json",
	"progress.jsonl",
	"metrics.json",
	"predictions.jsonl",
	"RUN_REPORT.md"
	],
	"deliverables": [
	"dataset JSONL/media manifests",
	"LoRA adapter checkpoint",
	"progress logs",
	"validation monitoring",
	"held-out predictions",
	"metrics",
	"confusion matrices",
	"run report",
	"v5/v6 comparison",
	"public LoRA adapter repo"
	],
	"entry_condition": "Selected episodes are prepared locally with no train/test episode leakage.",
	"id": "qwen3_omni_lora_diagnostic_pilot",
	"name": "Qwen3-Omni LoRA Latest Diagnostic Branch",
	"reader_takeaway": "The final omni-model diagnostic result establishes the full held-out training/validation/evaluation loop and meets the strict-JSON target, but weak action/subtask metrics make it a diagnostic baseline.",
	"stage": "future",
	"status": "verified_latest_branch"
	},
	{
	"completion_evidence": [
	"results/omni_finetune/multi_episode_128_task_baselines/BASELINE_ALIGNMENT_REPORT.md",
	"results/omni_finetune/multi_episode_128_task_baselines/summary_report.json",
	"scripts/omni/run_128_task_baselines.py"
	],
	"deliverables": [
	"same 12 task ids",
	"simple metadata/text baselines",
	"neural MLP baselines for JSON-supported labels",
	"explicit unsupported markers for raw-feature-only tasks"
	],
	"entry_condition": "Derived Qwen JSONL export for the selected 96/16/16 split.",
	"id": "multi_episode_128_same_split_baselines",
	"name": "128-Episode Same-Split Simple/NN Baselines",
	"reader_takeaway": "The simple and neural baseline framing is now aligned to the selected 128-episode setup; trajectory, retrieval, reconstruction, and misalignment variants still need raw 128 feature blocks for exact feature-level reproduction.",
	"stage": "future",
	"status": "verified_companion_result"
	},
	{
	"completion_evidence": [
	"TASK_SUITE_ENHANCEMENT_128.md",
	"docs/data/task_suite_enhancement_128.json",
	"results/omni_finetune/task_suite_enhancement_128_v1_20260608/enhancement_plan.json",
	"scripts/omni/build_task_suite_enhancement_128.py"
	],
	"deliverables": [
	"dense-window and multiscale export estimates",
	"hierarchical action/subtask target contract",
	"raw-feature shard priorities for unsupported tasks",
	"Qwen v5 and Cosmos continuation run cards",
	"publication-ready enhancement artifacts"
	],
	"entry_condition": "Same selected 96/16/16 split and current public 3,808-window export.",
	"id": "task_suite_enhancement_128",
	"name": "128-Episode Task Suite Enhancement Pack",
	"reader_takeaway": "The current 128-episode setup still has headroom: use multiscale_20s10_40s20_80s40, hierarchical labels, label-normalized scoring, and raw-feature shards before adding more episodes.",
	"stage": "future",
	"status": "current"
	},
	{
	"completion_evidence": [
	"error-analysis tables",
	"held-out metrics by failure type",
	"verified public-safe package"
	],
	"deliverables": [
	"same 96/16/16 episode split",
	"action/subtask confusion analysis",
	"unseen-label analysis",
	"object/action family breakdowns",
	"held-out test evaluation",
	"comparison to the final verified Qwen baseline"
	],
	"entry_condition": "The final diagnostic package meets strict JSON validity but has weak action/subtask held-out quality.",
	"id": "qwen3_omni_structured_output_error_analysis",
	"name": "Action/Subtask Error-Analysis Pass",
	"reader_takeaway": "The next pass should improve action/subtask quality before larger model-quality claims.",
	"stage": "future",
	"status": "active_next_step"
	},
	{
	"completion_evidence": [
	"FOUNDATION_MODEL_PLAN.md",
	"docs/data/foundation_model_plan.json",
	"research_roadmap_interactive.json"
	],
	"deliverables": [
	"backbone registry",
	"Cosmos 3 world-model branch plan",
	"Cosmos3-Super Forward-Dynamics LoRA verified package",
	"Qwen3-Omni LoRA baseline plan",
	"OpenVLA/openpi/GR00T policy-branch candidates",
	"model-specific evaluation additions"
	],
	"entry_condition": "The selected episodes are prepared or a 3-8 episode dry run is available for preprocessing checks.",
	"id": "foundation_model_selection_matrix",
	"name": "Foundation-Model Selection Matrix",
	"reader_takeaway": "Qwen3-Omni remains the structured JSON held-out pilot; Cosmos 3 is the first world-model branch. Cosmos3-Super now has a verified forward-dynamics LoRA over camera-pose proxy targets, while VLA/policy models wait for robot-compatible action targets.",
	"stage": "future",
	"status": "current"
	},
	{
	"completion_evidence": [
	"held-out metrics by session",
	"held-out metrics by task",
	"held-out metrics by modality",
	"ablation tables",
	"qualitative error analysis"
	],
	"deliverables": [
	"split-by-session metrics",
	"modality ablations",
	"calibration/object/language error analysis",
	"missing-view sensitivity analysis"
	],
	"entry_condition": "The selected-episode pilot trains and evaluates cleanly.",
	"id": "robustness_run_64_128_episode",
	"name": "64-128 Episode Robustness Run",
	"reader_takeaway": "The robustness run tests whether the pilot conclusions survive broader sessions and missing modalities.",
	"stage": "future",
	"status": "partially_implemented"
	},
	{
	"completion_evidence": [
	"task-specific held-out evaluations",
	"verified Cosmos3-Super forward-dynamics LoRA package",
	"qualitative inspection",
	"updated model cards"
	],
	"deliverables": [
	"Cosmos 3 future-window and action-conditioned world-model probes",
	"OpenVLA/openpi/GR00T action-policy baseline",
	"audio/video/depth/pose/mocap conditioning checks",
	"affordance and object-interaction tasks",
	"synthetic-data usefulness test"
	],
	"entry_condition": "Enough multi-episode data, compute budget, and model-specific action/world-state targets.",
	"id": "foundation_world_model_extensions",
	"name": "Cosmos 3 and Policy-Model Extensions",
	"reader_takeaway": "The Cosmos branch now includes Nano future-window compatibility and Super forward-dynamics LoRA; the long-term direction remains richer multimodal representation learning with model branches chosen by task fit rather than by a single default backbone.",
	"stage": "future",
	"status": "planned"
	},
	{
	"completion_evidence": [
	"pretraining metadata",
	"checkpoint inventory",
	"scaling curves",
	"held-out evaluation reports",
	"qualitative retrieval or future-state examples",
	"safety and data-boundary report"
	],
	"deliverables": [
	"full-corpus episode and split manifests",
	"pretraining shard and provenance manifests",
	"0.3B-1B and 1B-3B scaling pilots",
	"3B-7B Xperience-native domain model target",
	"held-out episode/session/activity/object evaluations",
	"missing-modality robustness report",
	"model card and data-boundary report"
	],
	"entry_condition": "Full-corpus access, PB-scale storage path, high-throughput data loading, multi-node compute, and positive scaling evidence from smaller multi-episode runs.",
	"id": "xperience_embodied_foundation_pretraining",
	"name": "Xperience Embodied Foundation Model Pretraining",
	"reader_takeaway": "The final research direction is a domain-specific embodied foundation model trained directly on Xperience-10M, after smaller pilots justify the cost and infrastructure.",
	"stage": "future",
	"status": "future"
	}
	],
	"scale_up": {
	"access_status": "The gated Xperience-10M dataset is available for selected multi-episode pilot preparation.",
	"candidate_scan_top_level_sessions": 802,
	"estimated_bytes": 298188841943,
	"exclude": [
	"visualization.rrd"
	],
	"selection_strategy": "stratified_round_robin_by_top_level_session",
	"status": "verified_full_128_episode_diagnostic_result",
	"target_episodes": 128,
	"valid_candidates": 12102
	},
	"scope": {
	"feature_blocks": 18,
	"feature_dim": 8546,
	"num_frames": 5821,
	"num_windows": 1161,
	"sample_episode_count": 1,
	"stride_frames": 5,
	"warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes.",
	"window_frames": 20
	},
	"source_files": [
	"docs/data/research_directions.json",
	"docs/data/task_walkthroughs.json",
	"docs/data/research_roadmap.json",
	"docs/data/foundation_model_plan.json",
	"docs/data/additional_development_directions.json",
	"docs/data/summary_metrics.json",
	"docs/data/research_direction_extensions.json",
	"results/episode_task_suite/summary_report.json",
	"results/episode_task_suite/feature_manifest.json"
	],
	"tasks": [
	{
	"architecture_family": "multiclass classifier",
	"case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
	"current_limit": "Chronological single-episode split creates unseen future action classes.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct"
	},
	"display_name": "Action Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_action",
	"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.05,
	"name": "macro-F1",
	"neural_mlp": 0.0148
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current action class",
	"primary_direction": "C",
	"process_short": "window features -> action label builder -> classifier",
	"research_name": "Egocentric Action Recognition",
	"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
	},
	{
	"architecture_family": "multiclass classifier",
	"case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
	"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Procedure Step Recognition",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "timeline_subtask",
	"input": "The same all-modality window vector used by action recognition.",
	"input_short": "20-frame multimodal window",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0506,
	"name": "macro-F1",
	"neural_mlp": 0.0281
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "current procedure step",
	"primary_direction": "C",
	"process_short": "window features -> subtask label builder -> classifier",
	"research_name": "Temporal Subtask Recognition",
	"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
	"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
	"direction_roles": {
	"C": "direct",
	"D": "diagnostic"
	},
	"display_name": "Action Boundary Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "diagnostic",
	"id": "transition_detection",
	"input": "One all-modality window vector plus labels derived from action-change timestamps.",
	"input_short": "current window with boundary target",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.6118,
	"name": "macro-F1",
	"neural_mlp": 0.5862
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial",
	"language"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "boundary or steady",
	"primary_direction": "C",
	"process_short": "action changes -> boundary labels -> binary classifier",
	"research_name": "Temporal Action Segmentation",
	"why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
	},
	{
	"architecture_family": "future-label classifier",
	"case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
	"current_limit": "Unseen future labels dominate the single-episode chronological test.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Next-Action Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "next_action",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current window at time t",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 0.0593,
	"name": "macro-F1",
	"neural_mlp": 0.0419
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "action at t+20 frames",
	"primary_direction": "C",
	"process_short": "current features -> future label shift -> classifier",
	"research_name": "Short-Horizon Intention Prediction",
	"why": "Tests action intention/task-flow prediction from egocentric context."
	},
	{
	"architecture_family": "continuous regressor",
	"case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
	"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Hand Trajectory Forecasting",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "hand_trajectory_forecast",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current multimodal window",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "lower",
	"key": "mpjpe",
	"minimal": 0.8647,
	"name": "MPJPE",
	"neural_mlp": 0.1079
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"pose_slam",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "future hand-joint trajectory",
	"primary_direction": "A",
	"process_short": "current features -> future mocap target -> regression head",
	"research_name": "3D Hand Motion Forecasting",
	"why": "Directly predicts human hand motion and supports hand-object interaction modeling."
	},
	{
	"architecture_family": "binary classifier",
	"case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
	"current_limit": "The public sample is degenerate for this target because one class dominates.",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"display_name": "Contact State Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
	"label": "Neural predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
	"label": "Confusion matrix"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
	"label": "Neural confusion matrix"
	}
	],
	"family": "supervised",
	"id": "contact_prediction",
	"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
	"input_short": "non-contact, non-caption features",
	"metric": {
	"better_baseline": "tie",
	"direction": "higher",
	"key": "macro_f1",
	"minimal": 1.0,
	"name": "macro-F1",
	"neural_mlp": 1.0
	},
	"modalities": [
	"motion_capture",
	"video",
	"depth",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "contact or no contact",
	"primary_direction": "A",
	"process_short": "feature filter -> contact target -> binary classifier",
	"research_name": "Human-Object Contact Prediction",
	"why": "Targets physical interaction state, a core affordance and manipulation signal."
	},
	{
	"architecture_family": "multi-label classifier",
	"case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
	"current_limit": "Object labels are language-derived and sparse in one episode.",
	"direction_roles": {
	"A": "proxy",
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Object Relevance Prediction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "supervised",
	"id": "object_relevance",
	"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
	"input_short": "non-caption multimodal features",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "micro_f1",
	"minimal": 0.1803,
	"name": "micro-F1",
	"neural_mlp": 0.1679
	},
	"modalities": [
	"video",
	"depth",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "relevant object set",
	"primary_direction": "C",
	"process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
	"research_name": "Object-Centric Interaction Recognition",
	"why": "Connects egocentric activity to manipulated objects and early object-centric state."
	},
	{
	"architecture_family": "retrieval ranker",
	"case_study": "A query like 'Pour milk into coffee' should rank the windows from the actual pouring moment higher than unrelated windows.",
	"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"display_name": "Language Grounding",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "caption_grounding",
	"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
	"input_short": "text-like query and candidate windows",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.016,
	"name": "MRR",
	"neural_mlp": 0.0168
	},
	"modalities": [
	"language",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked matching moments",
	"primary_direction": "C",
	"process_short": "query features -> candidate index -> cosine ranker",
	"research_name": "Language-to-Moment Grounding",
	"why": "Grounds language annotation into egocentric sensor time and task state."
	},
	{
	"architecture_family": "two-tower retrieval head",
	"case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
	"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"C": "diagnostic",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Retrieval",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "retrieval",
	"id": "cross_modal_retrieval",
	"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
	"input_short": "motion/IMU/pose query; depth/video candidates",
	"metric": {
	"better_baseline": "minimal",
	"direction": "higher",
	"key": "mrr",
	"minimal": 0.2693,
	"name": "MRR",
	"neural_mlp": 0.13
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "ranked visual windows",
	"primary_direction": "C",
	"process_short": "modality split -> projection -> nearest-neighbor ranker",
	"research_name": "Multimodal Representation Retrieval",
	"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
	},
	{
	"architecture_family": "feature regressor",
	"case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
	"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
	"direction_roles": {
	"B": "proxy",
	"D": "proxy"
	},
	"display_name": "Cross-Modal Reconstruction",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
	"label": "Neural metrics"
	}
	],
	"family": "forecast",
	"id": "modality_reconstruction",
	"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
	"input_short": "motion, IMU, and camera/pose features",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "r2",
	"minimal": -0.0153,
	"name": "R2",
	"neural_mlp": -0.0102
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"pose_slam",
	"depth",
	"video"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "reconstructed depth/video vector",
	"primary_direction": "B",
	"process_short": "source-target split -> scaler -> regression head",
	"research_name": "Modality Feature Reconstruction",
	"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "If window A shows reaching and window B shows pouring, the model should distinguish the order 'A then B' from the reversed order 'B then A'.",
	"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
	"direction_roles": {
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Temporal Order Verification",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "temporal_order",
	"input": "A pair of adjacent window vectors, plus their difference vector.",
	"input_short": "two adjacent windows plus difference vector",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.54,
	"name": "F1",
	"neural_mlp": 0.852
	},
	"modalities": [
	"video",
	"pose_slam",
	"motion_capture",
	"inertial"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "correct or reversed",
	"primary_direction": "C",
	"process_short": "pair builder -> feature combiner -> binary classifier",
	"research_name": "Temporal Order Verification",
	"why": "Checks whether features encode local time direction and task progression."
	},
	{
	"architecture_family": "pairwise classifier",
	"case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
	"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	"direction_roles": {
	"B": "diagnostic",
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"display_name": "Multimodal Synchronization Detection",
	"evidence_links": [
	{
	"href": "data/task_walkthroughs.json",
	"label": "Task walkthrough"
	},
	{
	"href": "single_episode_explorer.html",
	"label": "Single-episode explorer"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
	"label": "Minimal metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
	"label": "Neural metrics"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
	"label": "Minimal predictions"
	},
	{
	"href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
	"label": "Neural predictions"
	}
	],
	"family": "diagnostic",
	"id": "misalignment_detection",
	"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
	"input_short": "motion-side and visual/depth-side feature groups",
	"metric": {
	"better_baseline": "neural_mlp",
	"direction": "higher",
	"key": "f1",
	"minimal": 0.5052,
	"name": "F1",
	"neural_mlp": 0.7153
	},
	"modalities": [
	"motion_capture",
	"inertial",
	"video",
	"depth",
	"pose_slam"
	],
	"module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
	"output_short": "aligned or shifted",
	"primary_direction": "C",
	"process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
	"research_name": "Cross-Modal Misalignment Detection",
	"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
	}
	],
	"title": "Interactive Research Roadmap"
	}