{
  "source": "results/episode_task_suite/summary_report.json",
  "dataset_scope": {
    "sample_episode_count": 1,
    "num_frames": 5821,
    "num_windows": 1161,
    "feature_dim": 8546,
    "warning": "Single public sample episode; this supports pipeline/task evidence, while cross-episode generalization requires held-out episodes."
  },
  "baselines": {
    "minimal": "Interpretable softmax, logistic, ridge, and retrieval heads over the 8,546-d window feature vector.",
    "neural_mlp": "Small PyTorch MLP classifiers/regressors using the same features, splits, and task contracts."
  },
  "directions": {
    "A": {
      "id": "human_motion",
      "name": "Human Modeling & Motion Understanding",
      "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
      "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
      "current_status": "partially implemented",
      "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
      "next_steps": [
        "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
        "Train sequence models over multi-episode motion trajectories instead of isolated windows.",
        "Evaluate affordance prediction on held-out objects and held-out episodes."
      ],
      "tasks": [
        "timeline_action",
        "hand_trajectory_forecast",
        "contact_prediction",
        "object_relevance"
      ],
      "counts": {
        "direct": 2,
        "proxy": 2,
        "diagnostic": 0,
        "total_links": 4
      }
    },
    "B": {
      "id": "reconstruction_rendering",
      "name": "3D/4D Reconstruction & Neural Rendering",
      "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
      "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
      "current_status": "proxy tasks only",
      "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
      "next_steps": [
        "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
        "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
        "Evaluate novel-view synthesis and temporal consistency across held-out views/time."
      ],
      "tasks": [
        "cross_modal_retrieval",
        "modality_reconstruction",
        "misalignment_detection"
      ],
      "counts": {
        "direct": 0,
        "proxy": 2,
        "diagnostic": 1,
        "total_links": 3
      }
    },
    "C": {
      "id": "egocentric_interaction",
      "name": "Egocentric Vision & Interaction",
      "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
      "preferred_background": "Video understanding, action recognition, or egocentric vision.",
      "current_status": "strongest implemented track",
      "current_readout": "11 of the 12 tasks link to this direction (6 direct, 2 proxy, 3 diagnostic), covering egocentric action, task state, interaction, grounding, and alignment.",
      "next_steps": [
        "Move from single-episode chronological splits to held-out-episode splits.",
        "Use the extracted AAC audio block with stronger multimodal backbones for action, intent, and grounding.",
        "Evaluate long-horizon task success prediction and action-conditioned generation."
      ],
      "tasks": [
        "timeline_action",
        "timeline_subtask",
        "transition_detection",
        "next_action",
        "hand_trajectory_forecast",
        "contact_prediction",
        "object_relevance",
        "caption_grounding",
        "cross_modal_retrieval",
        "temporal_order",
        "misalignment_detection"
      ],
      "counts": {
        "direct": 6,
        "proxy": 2,
        "diagnostic": 3,
        "total_links": 11
      }
    },
    "D": {
      "id": "world_modeling",
      "name": "Scene Reconstruction & World Modeling",
      "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
      "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
      "current_status": "early proxy tasks",
      "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
      "next_steps": [
        "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
        "Add map consistency, object permanence, and spatial relation prediction tasks.",
        "Train held-out-episode world models that predict future observations and task state."
      ],
      "tasks": [
        "timeline_subtask",
        "transition_detection",
        "next_action",
        "object_relevance",
        "caption_grounding",
        "cross_modal_retrieval",
        "modality_reconstruction",
        "temporal_order",
        "misalignment_detection"
      ],
      "counts": {
        "direct": 0,
        "proxy": 6,
        "diagnostic": 3,
        "total_links": 9
      }
    }
  },
  "tasks": {
    "timeline_action": {
      "name": "Timeline action recognition",
      "family": "supervised",
      "input": "all featurized modalities",
      "output": "current action label",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "A": "proxy"
      },
      "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout.",
      "current_limit": "Chronological single-episode split creates unseen future action classes.",
      "metric": {
        "key": "macro_f1",
        "name": "macro-F1",
        "direction": "higher",
        "minimal": 0.05,
        "neural_mlp": 0.014814814814814814,
        "better_baseline": "minimal"
      }
    },
    "timeline_subtask": {
      "name": "Timeline subtask recognition",
      "family": "supervised",
      "input": "all featurized modalities",
      "output": "current subtask label",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state.",
      "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
      "metric": {
        "key": "macro_f1",
        "name": "macro-F1",
        "direction": "higher",
        "minimal": 0.05056355513846935,
        "neural_mlp": 0.02810810810810811,
        "better_baseline": "minimal"
      }
    },
    "transition_detection": {
      "name": "Action transition detection",
      "family": "diagnostic",
      "input": "all featurized modalities",
      "output": "boundary vs steady state",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "D": "diagnostic"
      },
      "why": "Localizes egocentric task boundaries and diagnoses temporal state changes.",
      "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
      "metric": {
        "key": "macro_f1",
        "name": "macro-F1",
        "direction": "higher",
        "minimal": 0.6118237590630229,
        "neural_mlp": 0.5862068965517241,
        "better_baseline": "minimal"
      }
    },
    "next_action": {
      "name": "Short-horizon next action",
      "family": "supervised",
      "input": "current multimodal window",
      "output": "action 20 frames later",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "why": "Tests action intention/task-flow prediction from egocentric context.",
      "current_limit": "Unseen future labels dominate the single-episode chronological test.",
      "metric": {
        "key": "macro_f1",
        "name": "macro-F1",
        "direction": "higher",
        "minimal": 0.05925925925925927,
        "neural_mlp": 0.04186046511627907,
        "better_baseline": "minimal"
      }
    },
    "hand_trajectory_forecast": {
      "name": "Hand trajectory forecasting",
      "family": "forecast",
      "input": "current multimodal window",
      "output": "future left/right hand 3D joints",
      "primary_direction": "A",
      "direction_roles": {
        "A": "direct",
        "C": "proxy"
      },
      "why": "Directly predicts human hand motion and supports hand-object interaction modeling.",
      "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
      "metric": {
        "key": "mpjpe",
        "name": "MPJPE",
        "direction": "lower",
        "minimal": 0.8646570444107056,
        "neural_mlp": 0.10785018652677536,
        "better_baseline": "neural_mlp"
      }
    },
    "contact_prediction": {
      "name": "Body/object contact prediction",
      "family": "supervised",
      "input": "non-contact/non-caption features",
      "output": "binary contact label",
      "primary_direction": "A",
      "direction_roles": {
        "A": "direct",
        "C": "proxy"
      },
      "why": "Targets physical interaction state, a core affordance and manipulation signal.",
      "current_limit": "The public sample is degenerate for this target because one class dominates, so the 1.0 macro-F1 reached by both baselines is not evidence of a solved task.",
      "metric": {
        "key": "macro_f1",
        "name": "macro-F1",
        "direction": "higher",
        "minimal": 1.0,
        "neural_mlp": 1.0,
        "better_baseline": "tie"
      }
    },
    "object_relevance": {
      "name": "Relevant object set prediction",
      "family": "supervised",
      "input": "non-caption feature blocks",
      "output": "multi-label object set",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "A": "proxy",
        "D": "proxy"
      },
      "why": "Connects egocentric activity to manipulated objects and early object-centric state.",
      "current_limit": "Object labels are language-derived and sparse in one episode.",
      "metric": {
        "key": "micro_f1",
        "name": "micro-F1",
        "direction": "higher",
        "minimal": 0.18034382095361662,
        "neural_mlp": 0.1679279279279279,
        "better_baseline": "minimal"
      }
    },
    "caption_grounding": {
      "name": "Caption-to-window grounding",
      "family": "retrieval",
      "input": "caption objects/interaction query and candidate sensor windows",
      "output": "matching time window",
      "primary_direction": "C",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "why": "Grounds language annotation into egocentric sensor time and task state.",
      "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
      "metric": {
        "key": "mrr",
        "name": "MRR",
        "direction": "higher",
        "minimal": 0.016023479050338015,
        "neural_mlp": 0.01684125567132316,
        "better_baseline": "neural_mlp"
      }
    },
    "cross_modal_retrieval": {
      "name": "Cross-modal retrieval",
      "family": "retrieval",
      "input": "motion/IMU/camera query",
      "output": "matching depth/video window",
      "primary_direction": "C",
      "direction_roles": {
        "C": "diagnostic",
        "B": "proxy",
        "D": "proxy"
      },
      "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling.",
      "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
      "metric": {
        "key": "mrr",
        "name": "MRR",
        "direction": "higher",
        "minimal": 0.26925966892956127,
        "neural_mlp": 0.1299971898648288,
        "better_baseline": "minimal"
      }
    },
    "modality_reconstruction": {
      "name": "Modality reconstruction",
      "family": "forecast",
      "input": "motion/IMU/camera",
      "output": "depth/video feature vector",
      "primary_direction": "B",
      "direction_roles": {
        "B": "proxy",
        "D": "proxy"
      },
      "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective.",
      "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
      "metric": {
        "key": "r2",
        "name": "R2",
        "direction": "higher",
        "minimal": -0.015271898913936655,
        "neural_mlp": -0.010171410134180991,
        "better_baseline": "neural_mlp"
      }
    },
    "temporal_order": {
      "name": "Temporal order verification",
      "family": "diagnostic",
      "input": "two adjacent windows",
      "output": "correct vs reversed order",
      "primary_direction": "C",
      "direction_roles": {
        "C": "diagnostic",
        "D": "diagnostic"
      },
      "why": "Checks whether features encode local time direction and task progression.",
      "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
      "metric": {
        "key": "f1",
        "name": "F1",
        "direction": "higher",
        "minimal": 0.5399515738498789,
        "neural_mlp": 0.8520179372197308,
        "better_baseline": "neural_mlp"
      }
    },
    "misalignment_detection": {
      "name": "Cross-modal misalignment detection",
      "family": "diagnostic",
      "input": "motion plus visual/depth pair",
      "output": "aligned vs shifted",
      "primary_direction": "C",
      "direction_roles": {
        "C": "diagnostic",
        "B": "diagnostic",
        "D": "diagnostic"
      },
      "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models.",
      "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
      "metric": {
        "key": "f1",
        "name": "F1",
        "direction": "higher",
        "minimal": 0.5051698670605613,
        "neural_mlp": 0.7152682255845944,
        "better_baseline": "neural_mlp"
      }
    }
  }
}