	{
	"source": "docs/data/task_suite_20.json plus results/episode_task_suite/summary_report.json",
	"dataset_scope": {
	"sample_episode_count": 1,
	"num_frames": 5821,
	"num_windows": 1161,
	"feature_dim": 8546,
	"warning": "Single public sample episode; this supports pipeline/task evidence, while cross-episode generalization requires held-out episodes."
	},
	"baselines": {
	"minimal": "Interpretable softmax, logistic, ridge, and retrieval heads over the 8,546-d window feature vector.",
	"neural_mlp": "Small PyTorch MLP classifiers/regressors using the same features, splits, and task contracts."
	},
	"task_count": 20,
	"directions": {
	"A": {
	"id": "human_motion",
	"name": "Human Modeling & Motion Understanding",
	"focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
	"preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
	"current_status": "partially implemented",
	"current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
	"next_steps": [
	"Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
	"Train sequence models over multi-episode motion trajectories instead of isolated windows.",
	"Evaluate affordance prediction on held-out objects and held-out episodes."
	],
	"tasks": [
	"timeline_action",
	"hand_trajectory_forecast",
	"contact_prediction",
	"object_relevance",
	"interaction_text_prediction",
	"imu_to_hand_pose"
	],
	"task_display_names": [
	"Action Recognition",
	"Hand Trajectory Forecasting",
	"Contact State Prediction",
	"Object Relevance Prediction",
	"Interaction Text Prediction",
	"IMU-to-Hand Pose Reconstruction"
	],
	"counts": {
	"direct": 3,
	"proxy": 3,
	"diagnostic": 0,
	"total_links": 6
	}
	},
	"B": {
	"id": "reconstruction_rendering",
	"name": "3D/4D Reconstruction & Neural Rendering",
	"focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
	"preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
	"current_status": "proxy tasks only",
	"current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
	"next_steps": [
	"Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
	"Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
	"Evaluate novel-view synthesis and temporal consistency across held-out views/time."
	],
	"tasks": [
	"cross_modal_retrieval",
	"modality_reconstruction",
	"misalignment_detection",
	"imu_to_hand_pose",
	"camera_view_sync_retrieval"
	],
	"task_display_names": [
	"Cross-Modal Retrieval",
	"Cross-Modal Reconstruction",
	"Multimodal Synchronization Detection",
	"IMU-to-Hand Pose Reconstruction",
	"Camera-View Synchronization Retrieval"
	],
	"counts": {
	"direct": 1,
	"proxy": 3,
	"diagnostic": 1,
	"total_links": 5
	}
	},
	"C": {
	"id": "egocentric_interaction",
	"name": "Egocentric Vision & Interaction",
	"focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
	"preferred_background": "Video understanding, action recognition, or egocentric vision.",
	"current_status": "strongest implemented track",
	"current_readout": "The unified 20-task suite directly targets egocentric action, task state, interaction, grounding, forecasting, and alignment.",
	"next_steps": [
	"Move from single-episode chronological splits to held-out-episode splits.",
	"Use audio together with stronger multimodal backbones for action, intent, and grounding.",
	"Evaluate long-horizon task success prediction and action-conditioned generation."
	],
	"tasks": [
	"timeline_action",
	"timeline_subtask",
	"transition_detection",
	"next_action",
	"hand_trajectory_forecast",
	"contact_prediction",
	"object_relevance",
	"caption_grounding",
	"cross_modal_retrieval",
	"temporal_order",
	"misalignment_detection",
	"long_horizon_next_action",
	"next_subtask_forecast",
	"interaction_text_prediction",
	"action_object_relation",
	"object_set_forecast",
	"time_to_transition"
	],
	"task_display_names": [
	"Action Recognition",
	"Procedure Step Recognition",
	"Action Boundary Detection",
	"Next-Action Prediction",
	"Hand Trajectory Forecasting",
	"Contact State Prediction",
	"Object Relevance Prediction",
	"Language Grounding",
	"Cross-Modal Retrieval",
	"Temporal Order Verification",
	"Multimodal Synchronization Detection",
	"Long-Horizon Next-Action Forecasting",
	"Long-Horizon Next-Subtask Forecasting",
	"Interaction Text Prediction",
	"Action-Object Relation Prediction",
	"Future Object-Set Forecasting",
	"Time-to-Next-Transition Regression"
	],
	"counts": {
	"direct": 10,
	"proxy": 3,
	"diagnostic": 4,
	"total_links": 17
	}
	},
	"D": {
	"id": "world_modeling",
	"name": "Scene Reconstruction & World Modeling",
	"focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
	"preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
	"current_status": "early proxy tasks",
	"current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
	"next_steps": [
	"Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
	"Add map consistency, object permanence, and spatial relation prediction tasks.",
	"Train held-out-episode world models that predict future observations and task state."
	],
	"tasks": [
	"timeline_subtask",
	"transition_detection",
	"next_action",
	"object_relevance",
	"caption_grounding",
	"cross_modal_retrieval",
	"modality_reconstruction",
	"temporal_order",
	"misalignment_detection",
	"long_horizon_next_action",
	"next_subtask_forecast",
	"action_object_relation",
	"object_set_forecast",
	"camera_view_sync_retrieval",
	"time_to_transition"
	],
	"task_display_names": [
	"Procedure Step Recognition",
	"Action Boundary Detection",
	"Next-Action Prediction",
	"Object Relevance Prediction",
	"Language Grounding",
	"Cross-Modal Retrieval",
	"Cross-Modal Reconstruction",
	"Temporal Order Verification",
	"Multimodal Synchronization Detection",
	"Long-Horizon Next-Action Forecasting",
	"Long-Horizon Next-Subtask Forecasting",
	"Action-Object Relation Prediction",
	"Future Object-Set Forecasting",
	"Camera-View Synchronization Retrieval",
	"Time-to-Next-Transition Regression"
	],
	"counts": {
	"direct": 1,
	"proxy": 10,
	"diagnostic": 4,
	"total_links": 15
	}
	}
	},
	"tasks": {
	"timeline_action": {
	"name": "Timeline action recognition",
	"family": "supervised",
	"input": "all featurized modalities",
	"output": "current action label",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"A": "proxy"
	},
	"why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout.",
	"current_limit": "Chronological single-episode split creates unseen future action classes.",
	"display_name": "Action Recognition",
	"artifact_id": "timeline_action",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.05,
	"neural_mlp": 0.014814814814814814,
	"better_baseline": "minimal"
	}
	},
	"timeline_subtask": {
	"name": "Timeline subtask recognition",
	"family": "supervised",
	"input": "all featurized modalities",
	"output": "current subtask label",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Segments egocentric task state and provides a first proxy for symbolic world/task state.",
	"current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
	"display_name": "Procedure Step Recognition",
	"artifact_id": "timeline_subtask",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.05056355513846935,
	"neural_mlp": 0.02810810810810811,
	"better_baseline": "minimal"
	}
	},
	"transition_detection": {
	"name": "Action transition detection",
	"family": "diagnostic",
	"input": "all featurized modalities",
	"output": "boundary vs steady state",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "diagnostic"
	},
	"why": "Localizes egocentric task boundaries and diagnoses temporal state changes.",
	"current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
	"display_name": "Action Boundary Detection",
	"artifact_id": "transition_detection",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.6118237590630229,
	"neural_mlp": 0.5862068965517241,
	"better_baseline": "minimal"
	}
	},
	"next_action": {
	"name": "Short-horizon next action",
	"family": "supervised",
	"input": "current multimodal window",
	"output": "action 20 frames later",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Tests action intention/task-flow prediction from egocentric context.",
	"current_limit": "Unseen future labels dominate the single-episode chronological test.",
	"display_name": "Next-Action Prediction",
	"artifact_id": "next_action",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.05925925925925927,
	"neural_mlp": 0.04186046511627907,
	"better_baseline": "minimal"
	}
	},
	"hand_trajectory_forecast": {
	"name": "Hand trajectory forecasting",
	"family": "forecast",
	"input": "current multimodal window",
	"output": "future left/right hand 3D joints",
	"primary_direction": "A",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"why": "Directly predicts human hand motion and supports hand-object interaction modeling.",
	"current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
	"display_name": "Hand Trajectory Forecasting",
	"artifact_id": "hand_trajectory_forecast",
	"metric": {
	"key": "mpjpe",
	"name": "MPJPE",
	"direction": "lower",
	"minimal": 0.8646570444107056,
	"neural_mlp": 0.10785018652677536,
	"better_baseline": "neural_mlp"
	}
	},
	"contact_prediction": {
	"name": "Body/object contact prediction",
	"family": "supervised",
	"input": "non-contact/non-caption features",
	"output": "binary contact label",
	"primary_direction": "A",
	"direction_roles": {
	"A": "direct",
	"C": "proxy"
	},
	"why": "Targets physical interaction state, a core affordance and manipulation signal.",
	"current_limit": "The public sample is degenerate for this target because one class dominates.",
	"display_name": "Contact State Prediction",
	"artifact_id": "contact_prediction",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 1.0,
	"neural_mlp": 1.0,
	"better_baseline": "tie"
	}
	},
	"object_relevance": {
	"name": "Relevant object set prediction",
	"family": "supervised",
	"input": "non-caption feature blocks",
	"output": "multi-label object set",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"A": "proxy",
	"D": "proxy"
	},
	"why": "Connects egocentric activity to manipulated objects and early object-centric state.",
	"current_limit": "Object labels are language-derived and sparse in one episode.",
	"display_name": "Object Relevance Prediction",
	"artifact_id": "object_relevance",
	"metric": {
	"key": "micro_f1",
	"name": "micro-F1",
	"direction": "higher",
	"minimal": 0.18034382095361662,
	"neural_mlp": 0.1679279279279279,
	"better_baseline": "minimal"
	}
	},
	"caption_grounding": {
	"name": "Caption-to-window grounding",
	"family": "retrieval",
	"input": "caption objects/interaction query and candidate sensor windows",
	"output": "matching time window",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Grounds language annotation into egocentric sensor time and task state.",
	"current_limit": "Bag-of-objects language features are too weak for rich grounding.",
	"display_name": "Language Grounding",
	"artifact_id": "caption_grounding",
	"metric": {
	"key": "mrr",
	"name": "MRR",
	"direction": "higher",
	"minimal": 0.016023479050338015,
	"neural_mlp": 0.01684125567132316,
	"better_baseline": "neural_mlp"
	}
	},
	"cross_modal_retrieval": {
	"name": "Cross-modal retrieval",
	"family": "retrieval",
	"input": "motion/IMU/camera query",
	"output": "matching depth/video window",
	"primary_direction": "C",
	"direction_roles": {
	"C": "diagnostic",
	"B": "proxy",
	"D": "proxy"
	},
	"why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling.",
	"current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	"display_name": "Cross-Modal Retrieval",
	"artifact_id": "cross_modal_retrieval",
	"metric": {
	"key": "mrr",
	"name": "MRR",
	"direction": "higher",
	"minimal": 0.26925966892956127,
	"neural_mlp": 0.1299971898648288,
	"better_baseline": "minimal"
	}
	},
	"modality_reconstruction": {
	"name": "Modality reconstruction",
	"family": "forecast",
	"input": "motion/IMU/camera",
	"output": "depth/video feature vector",
	"primary_direction": "B",
	"direction_roles": {
	"B": "proxy",
	"D": "proxy"
	},
	"why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective.",
	"current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
	"display_name": "Cross-Modal Reconstruction",
	"artifact_id": "modality_reconstruction",
	"metric": {
	"key": "r2",
	"name": "R2",
	"direction": "higher",
	"minimal": -0.015271898913936655,
	"neural_mlp": -0.010171410134180991,
	"better_baseline": "neural_mlp"
	}
	},
	"temporal_order": {
	"name": "Temporal order verification",
	"family": "diagnostic",
	"input": "two adjacent windows",
	"output": "correct vs reversed order",
	"primary_direction": "C",
	"direction_roles": {
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"why": "Checks whether features encode local time direction and task progression.",
	"current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
	"display_name": "Temporal Order Verification",
	"artifact_id": "temporal_order",
	"metric": {
	"key": "f1",
	"name": "F1",
	"direction": "higher",
	"minimal": 0.5399515738498789,
	"neural_mlp": 0.8520179372197308,
	"better_baseline": "neural_mlp"
	}
	},
	"misalignment_detection": {
	"name": "Cross-modal misalignment detection",
	"family": "diagnostic",
	"input": "motion plus visual/depth pair",
	"output": "aligned vs shifted",
	"primary_direction": "C",
	"direction_roles": {
	"C": "diagnostic",
	"B": "diagnostic",
	"D": "diagnostic"
	},
	"why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models.",
	"current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	"display_name": "Multimodal Synchronization Detection",
	"artifact_id": "misalignment_detection",
	"metric": {
	"key": "f1",
	"name": "F1",
	"direction": "higher",
	"minimal": 0.5051698670605613,
	"neural_mlp": 0.7152682255845944,
	"better_baseline": "neural_mlp"
	}
	},
	"long_horizon_next_action": {
	"name": "Long-horizon next-action forecasting",
	"family": "classification",
	"input": "current and historical windows",
	"output": "future action label",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal.",
	"current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.",
	"display_name": "Long-Horizon Next-Action Forecasting",
	"artifact_id": "long_horizon_next_action",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.07499999999999998,
	"neural_mlp": 0.06545454545454546,
	"better_baseline": "minimal"
	}
	},
	"next_subtask_forecast": {
	"name": "Long-horizon next-subtask forecasting",
	"family": "classification",
	"input": "current and historical windows",
	"output": "future procedure-step label",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state.",
	"current_limit": "Subtask labels are constrained to the available annotation vocabulary.",
	"display_name": "Long-Horizon Next-Subtask Forecasting",
	"artifact_id": "next_subtask_forecast",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.04545454545454545,
	"neural_mlp": 0.050724637681159424,
	"better_baseline": "neural_mlp"
	}
	},
	"interaction_text_prediction": {
	"name": "Interaction text prediction",
	"family": "classification",
	"input": "window features without target text leakage",
	"output": "natural-language interaction class",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"A": "proxy"
	},
	"why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation.",
	"current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.",
	"display_name": "Interaction Text Prediction",
	"artifact_id": "interaction_text_prediction",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.04444444444444444,
	"neural_mlp": 0.0380952380952381,
	"better_baseline": "minimal"
	}
	},
	"action_object_relation": {
	"name": "Action-object relation prediction",
	"family": "classification",
	"input": "window features with target-side relation leakage excluded",
	"output": "action-object relation class",
	"primary_direction": "C",
	"direction_roles": {
	"C": "direct",
	"D": "proxy"
	},
	"why": "Tests whether action recognition and object state are connected as a relational interaction representation.",
	"current_limit": "Relation labels are derived from the public-sample annotation scope.",
	"display_name": "Action-Object Relation Prediction",
	"artifact_id": "action_object_relation",
	"metric": {
	"key": "macro_f1",
	"name": "macro-F1",
	"direction": "higher",
	"minimal": 0.0,
	"neural_mlp": 0.0,
	"better_baseline": "tie"
	}
	},
	"object_set_forecast": {
	"name": "Future object-set forecasting",
	"family": "multi-label",
	"input": "current and historical windows",
	"output": "future object set",
	"primary_direction": "D",
	"direction_roles": {
	"D": "direct",
	"C": "proxy"
	},
	"why": "Asks whether the current scene state supports predicting which objects will matter later.",
	"current_limit": "This is a set-level proxy, not a persistent 3D scene graph.",
	"display_name": "Future Object-Set Forecasting",
	"artifact_id": "object_set_forecast",
	"metric": {
	"key": "micro_f1",
	"name": "micro-F1",
	"direction": "higher",
	"minimal": 0.16939890710382516,
	"neural_mlp": 0.19718309859154928,
	"better_baseline": "neural_mlp"
	}
	},
	"imu_to_hand_pose": {
	"name": "IMU-to-hand pose reconstruction",
	"family": "regression",
	"input": "IMU and motion context",
	"output": "hand pose target",
	"primary_direction": "A",
	"direction_roles": {
	"A": "direct",
	"B": "proxy"
	},
	"why": "Measures human-motion reconstruction from wearable and motion cues.",
	"current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.",
	"display_name": "IMU-to-Hand Pose Reconstruction",
	"artifact_id": "imu_to_hand_pose",
	"metric": {
	"key": "mae",
	"name": "MAE",
	"direction": "lower",
	"minimal": 0.042049407958984375,
	"neural_mlp": 0.042562149465084076,
	"better_baseline": "minimal"
	}
	},
	"camera_view_sync_retrieval": {
	"name": "Camera-view synchronization retrieval",
	"family": "retrieval",
	"input": "one camera-view/window query",
	"output": "matching synchronized view",
	"primary_direction": "B",
	"direction_roles": {
	"B": "direct",
	"D": "proxy"
	},
	"why": "Tests whether synchronized multi-view structure is recoverable across camera streams.",
	"current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.",
	"display_name": "Camera-View Synchronization Retrieval",
	"artifact_id": "camera_view_sync_retrieval",
	"metric": {
	"key": "mrr",
	"name": "MRR",
	"direction": "higher",
	"minimal": 0.4943004846572876,
	"neural_mlp": 0.24086658656597137,
	"better_baseline": "minimal"
	}
	},
	"time_to_transition": {
	"name": "Time-to-next-transition regression",
	"family": "regression",
	"input": "current temporal window state",
	"output": "frames/time until the next transition",
	"primary_direction": "C",
	"direction_roles": {
	"C": "diagnostic",
	"D": "diagnostic"
	},
	"why": "Measures temporal boundary awareness as a continuous timing target.",
	"current_limit": "Regression is local to the annotated public sample timeline.",
	"display_name": "Time-to-Next-Transition Regression",
	"artifact_id": "time_to_transition",
	"metric": {
	"key": "mae",
	"name": "MAE frames",
	"direction": "lower",
	"minimal": 10.53735637664795,
	"neural_mlp": 10.55449390411377,
	"better_baseline": "minimal"
	}
	}
	}
	}