{
  "source": {
    "shared_windows": "results/episode_task_suite/shared_windows.npz",
    "windows_csv": "results/episode_task_suite/windows.csv",
    "feature_manifest": "results/episode_task_suite/feature_manifest.json"
  },
  "dataset_scope": {
    "sample_episode_count": 1,
    "num_windows": 1161,
    "feature_dim": 8546,
    "first_start_frame": 0,
    "last_end_frame": 5819,
    "warning": "Single public sample episode; these extension probes validate task design and pipeline mechanics, not cross-episode generalization."
  },
  "baselines": {
    "minimal": "Ridge classifiers/regressors/projections plus cosine retrieval on the committed feature tensor.",
    "neural_mlp": "Small one-hidden-layer PyTorch MLP heads using the same inputs, targets, chronological split, and evaluator."
  },
  "run_config": {
    "train_fraction": 0.7,
    "ridge_l2": 10.0,
    "seed": 7,
    "future_windows": 4,
    "neural_epochs": 25,
    "neural_hidden_dim": 128,
    "neural_batch_size": 128,
    "skip_neural": false
  },
  "task_specs": {
    "body_motion_intensity": {
      "direction": "A",
      "direction_name": "Human Modeling & Motion Understanding",
      "name": "Body and Hand Motion Intensity",
      "family": "classification",
      "case_study": "A window with a fast reach or pour should be classified as high motion; a steady holding window should be low motion.",
      "input": "Current non-mocap feature blocks: video, AAC audio, depth, camera pose/rotation, IMU, SLAM, calibration, and language context.",
      "middle_process": "Compute the target from hand/body joint changes between neighboring windows, hide the mocap blocks from the input, then classify high versus low motion using the train-set median as the threshold.",
      "output": "Binary label: high_motion or low_motion.",
      "minimal_baseline": "Ridge classifier on standardized non-mocap features.",
      "neural_baseline": "One-hidden-layer MLP binary classifier on the same input features.",
      "metric_name": "macro-F1",
      "metric_key": "macro_f1",
      "metric_direction": "higher",
      "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior."
    },
    "multi_view_consistency_retrieval": {
      "direction": "B",
      "direction_name": "3D/4D Reconstruction & Neural Rendering",
      "name": "Multi-View Consistency Retrieval",
      "family": "retrieval",
      "case_study": "Given the fisheye camera features for a pouring moment, retrieve the synchronized stereo-left view from the same time window.",
      "input": "Query side: fisheye_cam0 video feature block. Candidate side: stereo_left video feature block from held-out windows.",
      "middle_process": "Learn a projection from one camera-view feature space into another, then rank held-out candidate windows by cosine similarity.",
      "output": "Ranked candidate windows; the correct synchronized view should rank near the top.",
      "minimal_baseline": "Ridge projection followed by cosine nearest-neighbor retrieval.",
      "neural_baseline": "One-hidden-layer MLP projection followed by the same cosine retrieval evaluator.",
      "metric_name": "MRR",
      "metric_key": "mrr",
      "metric_direction": "higher",
      "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis."
    },
    "action_phase_progress": {
      "direction": "C",
      "direction_name": "Egocentric Vision & Interaction",
      "name": "Action Phase Progress Estimation",
      "family": "regression",
      "case_study": "Inside a Pour coffee action segment, estimate whether the current window is near the beginning, middle, or end of that action.",
      "input": "Current non-caption multimodal feature vector, so the label text cannot be copied directly from the language block.",
      "middle_process": "Convert contiguous action-label runs into a normalized 0-to-1 progress target, train on earlier windows, and regress progress for later windows.",
      "output": "A scalar progress value between 0.0 and 1.0 for the current action segment.",
      "minimal_baseline": "Ridge regressor on standardized non-caption features.",
      "neural_baseline": "One-hidden-layer MLP regressor on the same input features.",
      "metric_name": "MAE",
      "metric_key": "mae",
      "metric_direction": "lower",
      "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks."
    },
    "ego_motion_forecast": {
      "direction": "D",
      "direction_name": "Scene Reconstruction & World Modeling",
      "name": "Short-Horizon Ego-Motion Forecasting",
      "family": "forecast",
      "case_study": "From the current sensors, predict how the camera translation will change over the next 20 frames while the wearer moves through the scene.",
      "input": "Current multimodal features excluding the camera-translation block and caption text.",
      "middle_process": "Build a future target from camera-translation difference at a four-window horizon, then regress that future ego-motion delta from current sensors.",
      "output": "A future camera-translation delta vector.",
      "minimal_baseline": "Ridge regressor with a 20-frame forecast horizon.",
      "neural_baseline": "One-hidden-layer MLP regressor with the same horizon and split.",
      "metric_name": "MAE",
      "metric_key": "mae",
      "metric_direction": "lower",
      "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model."
    }
  },
  "tasks": {
    "body_motion_intensity": {
      "train_windows": 812,
      "test_windows": 348,
      "target_threshold_train_median": 0.476467490196228,
      "input_dim": 6425,
      "target_source": "hand/body joint delta between neighboring windows",
      "minimal": {
        "accuracy": 0.7787356321839081,
        "macro_f1": 0.7685510688836106,
        "positive_rate_true": 0.35919540229885055,
        "positive_rate_pred": 0.43103448275862066,
        "num_test": 348
      },
      "neural_mlp": {
        "accuracy": 0.8218390804597702,
        "macro_f1": 0.8163807189542484,
        "positive_rate_true": 0.35919540229885055,
        "positive_rate_pred": 0.46839080459770116,
        "num_test": 348
      },
      "neural_training": {
        "available": true,
        "epochs": 25,
        "hidden_dim": 128,
        "loss_history": [
          0.3781587006570083,
          0.22267521227815468,
          0.13476210898660088,
          0.1000808995639162,
          0.074504286399469,
          0.06342194511972625,
          0.052560133978797885,
          0.04292357993757196,
          0.030875398993051698,
          0.03208484900702396,
          0.031622758848601815,
          0.02632193020522007,
          0.022023197674086968,
          0.017444461822656576,
          0.017830463406157317,
          0.01671520966848386,
          0.012931180691227244,
          0.009671396886691304,
          0.008911670790067668,
          0.006801604596081332,
          0.006320740412828958,
          0.0066360303526514855,
          0.006593080356790514,
          0.0066198104168999515,
          0.005764562139984936
        ]
      }
    },
    "multi_view_consistency_retrieval": {
      "train_windows": 813,
      "test_windows": 348,
      "query_block": "video_fisheye_cam0",
      "target_block": "video_stereo_left",
      "query_dim": 686,
      "target_dim": 686,
      "minimal": {
        "mrr": 0.552907407283783,
        "top1": 0.41954022988505746,
        "top5": 0.7068965517241379,
        "top10": 0.8304597701149425,
        "median_rank": 2.0,
        "num_test": 348
      },
      "neural_mlp": {
        "mrr": 0.3451290726661682,
        "top1": 0.22988505747126436,
        "top5": 0.4540229885057471,
        "top10": 0.5862068965517241,
        "median_rank": 7.0,
        "num_test": 348
      },
      "neural_training": {
        "available": true,
        "epochs": 25,
        "hidden_dim": 128,
        "loss_history": [
          0.9802072261532294,
          0.8302588426172367,
          0.7033629322843798,
          0.6110119350428658,
          0.5481111645551918,
          0.4936590702771261,
          0.45608002786853363,
          0.42634973314855373,
          0.39841723533720697,
          0.37698091557398344,
          0.3569651228108823,
          0.34343410762620297,
          0.330490266008424,
          0.31780097566700805,
          0.30736573823117097,
          0.29689836681695586,
          0.2889042267573569,
          0.28041428760115655,
          0.27232901398252884,
          0.26450822736064444,
          0.2603846002292223,
          0.25524227731708815,
          0.2491118810053681,
          0.24406919165584287,
          0.23761634876880552
        ]
      }
    },
    "action_phase_progress": {
      "train_windows": 813,
      "test_windows": 348,
      "input_dim": 7650,
      "target_source": "normalized position inside contiguous action-label runs",
      "minimal": {
        "mse": 0.1694207489490509,
        "mae": 0.3267306983470917,
        "r2": -1.0234981195481887,
        "num_test": 348
      },
      "neural_mlp": {
        "mse": 0.13695015013217926,
        "mae": 0.29772356152534485,
        "r2": -0.6356814781977305,
        "num_test": 348
      },
      "neural_training": {
        "available": true,
        "epochs": 25,
        "hidden_dim": 128,
        "loss_history": [
          3.0329475983482443,
          1.2599444914274696,
          0.85070915826279,
          0.619871592133101,
          0.48542586764432993,
          0.42507788102711846,
          0.29655895087991696,
          0.2454112539798718,
          0.20303582259869662,
          0.15765487147909835,
          0.13586207317946727,
          0.13174593402817386,
          0.1372238758305342,
          0.13300886880354396,
          0.124804903293741,
          0.10118342695017729,
          0.09366046138758442,
          0.093825314583447,
          0.09046100699520346,
          0.08759752604171066,
          0.07746588366351298,
          0.07865790530654337,
          0.06554747357744573,
          0.06426462710251961,
          0.05965157469956784
        ]
      }
    },
    "ego_motion_forecast": {
      "train_windows": 810,
      "test_windows": 347,
      "forecast_horizon_windows": 4,
      "forecast_horizon_frames": 20,
      "input_dim": 7629,
      "target_dim": 21,
      "target_source": "future minus current camera_translation feature block",
      "minimal": {
        "mse": 3.318504571914673,
        "mae": 0.1699875444173813,
        "r2": -5674.148995961734,
        "num_test": 347
      },
      "neural_mlp": {
        "mse": 0.7483499646186829,
        "mae": 0.0953596979379654,
        "r2": -1278.7925303160894,
        "num_test": 347
      },
      "neural_training": {
        "available": true,
        "epochs": 25,
        "hidden_dim": 128,
        "loss_history": [
          1.0473333364651527,
          0.6150051871935527,
          0.4443125699773247,
          0.35273332441294636,
          0.2848470503901258,
          0.23567287907188322,
          0.19179441708105582,
          0.17041770444240098,
          0.1639460261589215,
          0.13933086925082735,
          0.12862536557662635,
          0.11986567676067353,
          0.10877050499857208,
          0.1035334379768666,
          0.09534551799297333,
          0.0938962385242368,
          0.08685412046350079,
          0.08199865805146135,
          0.0804690555565887,
          0.08263200140661664,
          0.07935356992630312,
          0.08362679818162212,
          0.07105950446408472,
          0.07413670999216444,
          0.07001579207034758
        ]
      }
    }
  }
}