{ "source": { "shared_windows": "results/episode_task_suite/shared_windows.npz", "windows_csv": "results/episode_task_suite/windows.csv", "feature_manifest": "results/episode_task_suite/feature_manifest.json" }, "dataset_scope": { "sample_episode_count": 1, "num_windows": 1161, "feature_dim": 8546, "first_start_frame": 0, "last_end_frame": 5819, "warning": "Single public sample episode; these extension probes validate task design and pipeline mechanics, not cross-episode generalization." }, "baselines": { "minimal": "Ridge classifiers/regressors/projections plus cosine retrieval on the committed feature tensor.", "neural_mlp": "Small one-hidden-layer PyTorch MLP heads using the same inputs, targets, chronological split, and evaluator." }, "run_config": { "train_fraction": 0.7, "ridge_l2": 10.0, "seed": 7, "future_windows": 4, "neural_epochs": 25, "neural_hidden_dim": 128, "neural_batch_size": 128, "skip_neural": false }, "task_specs": { "body_motion_intensity": { "direction": "A", "direction_name": "Human Modeling & Motion Understanding", "name": "Body and Hand Motion Intensity", "family": "classification", "case_study": "A window with a fast reach or pour should be classified as high motion; a steady holding window should be low motion.", "input": "Current non-mocap feature blocks: video, AAC audio, depth, camera pose/rotation, IMU, SLAM, calibration, and language context.", "middle_process": "Compute the target from hand/body joint changes between neighboring windows, hide the mocap blocks from the input, then classify high versus low motion using the train-set median as the threshold.", "output": "Binary label: high_motion or low_motion.", "minimal_baseline": "Ridge classifier on standardized non-mocap features.", "neural_baseline": "One-hidden-layer MLP binary classifier on the same input features.", "metric_name": "macro-F1", "metric_key": "macro_f1", "metric_direction": "higher", "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior." },
"multi_view_consistency_retrieval": { "direction": "B", "direction_name": "3D/4D Reconstruction & Neural Rendering", "name": "Multi-View Consistency Retrieval", "family": "retrieval", "case_study": "Given the fisheye camera features for a pouring moment, retrieve the synchronized stereo-left view from the same time window.", "input": "Query side: fisheye_cam0 video feature block. Candidate side: stereo_left video feature block from held-out windows.", "middle_process": "Learn a projection from one camera-view feature space into another, then rank held-out candidate windows by cosine similarity.", "output": "Ranked candidate windows; the correct synchronized view should rank near the top.", "minimal_baseline": "Ridge projection followed by cosine nearest-neighbor retrieval.", "neural_baseline": "One-hidden-layer MLP projection followed by the same cosine retrieval evaluator.", "metric_name": "MRR", "metric_key": "mrr", "metric_direction": "higher", "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis." 
}, "action_phase_progress": { "direction": "C", "direction_name": "Egocentric Vision & Interaction", "name": "Action Phase Progress Estimation", "family": "regression", "case_study": "Inside a Pour coffee action segment, estimate whether the current window is near the beginning, middle, or end of that action.", "input": "Current non-caption multimodal feature vector, so the label text cannot be copied directly from the language block.", "middle_process": "Convert contiguous action-label runs into a normalized 0-to-1 progress target, train on earlier windows, and regress progress for later windows.", "output": "A scalar progress value between 0.0 and 1.0 for the current action segment.", "minimal_baseline": "Ridge regressor on standardized non-caption features.", "neural_baseline": "One-hidden-layer MLP regressor on the same input features.", "metric_name": "MAE", "metric_key": "mae", "metric_direction": "lower", "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks." 
}, "ego_motion_forecast": { "direction": "D", "direction_name": "Scene Reconstruction & World Modeling", "name": "Short-Horizon Ego-Motion Forecasting", "family": "forecast", "case_study": "From the current sensors, predict how the camera translation will change over the next 20 frames while the wearer moves through the scene.", "input": "Current multimodal features excluding the camera-translation block and caption text.", "middle_process": "Build a future target from camera-translation difference at a four-window horizon, then regress that future ego-motion delta from current sensors.", "output": "A future camera-translation delta vector.", "minimal_baseline": "Ridge regressor with a 20-frame forecast horizon.", "neural_baseline": "One-hidden-layer MLP regressor with the same horizon and split.", "metric_name": "MAE", "metric_key": "mae", "metric_direction": "lower", "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model." 
} }, "tasks": { "body_motion_intensity": { "train_windows": 812, "test_windows": 348, "target_threshold_train_median": 0.476467490196228, "input_dim": 6425, "target_source": "hand/body joint delta between neighboring windows", "minimal": { "accuracy": 0.7787356321839081, "macro_f1": 0.7685510688836106, "positive_rate_true": 0.35919540229885055, "positive_rate_pred": 0.43103448275862066, "num_test": 348 }, "neural_mlp": { "accuracy": 0.8218390804597702, "macro_f1": 0.8163807189542484, "positive_rate_true": 0.35919540229885055, "positive_rate_pred": 0.46839080459770116, "num_test": 348 }, "neural_training": { "available": true, "epochs": 25, "hidden_dim": 128, "loss_history": [ 0.3781587006570083, 0.22267521227815468, 0.13476210898660088, 0.1000808995639162, 0.074504286399469, 0.06342194511972625, 0.052560133978797885, 0.04292357993757196, 0.030875398993051698, 0.03208484900702396, 0.031622758848601815, 0.02632193020522007, 0.022023197674086968, 0.017444461822656576, 0.017830463406157317, 0.01671520966848386, 0.012931180691227244, 0.009671396886691304, 0.008911670790067668, 0.006801604596081332, 0.006320740412828958, 0.0066360303526514855, 0.006593080356790514, 0.0066198104168999515, 0.005764562139984936 ] } }, "multi_view_consistency_retrieval": { "train_windows": 813, "test_windows": 348, "query_block": "video_fisheye_cam0", "target_block": "video_stereo_left", "query_dim": 686, "target_dim": 686, "minimal": { "mrr": 0.552907407283783, "top1": 0.41954022988505746, "top5": 0.7068965517241379, "top10": 0.8304597701149425, "median_rank": 2.0, "num_test": 348 }, "neural_mlp": { "mrr": 0.3451290726661682, "top1": 0.22988505747126436, "top5": 0.4540229885057471, "top10": 0.5862068965517241, "median_rank": 7.0, "num_test": 348 }, "neural_training": { "available": true, "epochs": 25, "hidden_dim": 128, "loss_history": [ 0.9802072261532294, 0.8302588426172367, 0.7033629322843798, 0.6110119350428658, 0.5481111645551918, 0.4936590702771261, 0.45608002786853363, 
0.42634973314855373, 0.39841723533720697, 0.37698091557398344, 0.3569651228108823, 0.34343410762620297, 0.330490266008424, 0.31780097566700805, 0.30736573823117097, 0.29689836681695586, 0.2889042267573569, 0.28041428760115655, 0.27232901398252884, 0.26450822736064444, 0.2603846002292223, 0.25524227731708815, 0.2491118810053681, 0.24406919165584287, 0.23761634876880552 ] } }, "action_phase_progress": { "train_windows": 813, "test_windows": 348, "input_dim": 7650, "target_source": "normalized position inside contiguous action-label runs", "minimal": { "mse": 0.1694207489490509, "mae": 0.3267306983470917, "r2": -1.0234981195481887, "num_test": 348 }, "neural_mlp": { "mse": 0.13695015013217926, "mae": 0.29772356152534485, "r2": -0.6356814781977305, "num_test": 348 }, "neural_training": { "available": true, "epochs": 25, "hidden_dim": 128, "loss_history": [ 3.0329475983482443, 1.2599444914274696, 0.85070915826279, 0.619871592133101, 0.48542586764432993, 0.42507788102711846, 0.29655895087991696, 0.2454112539798718, 0.20303582259869662, 0.15765487147909835, 0.13586207317946727, 0.13174593402817386, 0.1372238758305342, 0.13300886880354396, 0.124804903293741, 0.10118342695017729, 0.09366046138758442, 0.093825314583447, 0.09046100699520346, 0.08759752604171066, 0.07746588366351298, 0.07865790530654337, 0.06554747357744573, 0.06426462710251961, 0.05965157469956784 ] } }, "ego_motion_forecast": { "train_windows": 810, "test_windows": 347, "forecast_horizon_windows": 4, "forecast_horizon_frames": 20, "input_dim": 7629, "target_dim": 21, "target_source": "future minus current camera_translation feature block", "minimal": { "mse": 3.318504571914673, "mae": 0.1699875444173813, "r2": -5674.148995961734, "num_test": 347 }, "neural_mlp": { "mse": 0.7483499646186829, "mae": 0.0953596979379654, "r2": -1278.7925303160894, "num_test": 347 }, "neural_training": { "available": true, "epochs": 25, "hidden_dim": 128, "loss_history": [ 1.0473333364651527, 0.6150051871935527, 
0.4443125699773247, 0.35273332441294636, 0.2848470503901258, 0.23567287907188322, 0.19179441708105582, 0.17041770444240098, 0.1639460261589215, 0.13933086925082735, 0.12862536557662635, 0.11986567676067353, 0.10877050499857208, 0.1035334379768666, 0.09534551799297333, 0.0938962385242368, 0.08685412046350079, 0.08199865805146135, 0.0804690555565887, 0.08263200140661664, 0.07935356992630312, 0.08362679818162212, 0.07105950446408472, 0.07413670999216444, 0.07001579207034758 ] } } } }