ropedia-xperience-10m-task-baselines / metrics /research_direction_extensions.json
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
e647650 verified
Raw
History Blame
11.9 kB
{
"source": {
"shared_windows": "results/episode_task_suite/shared_windows.npz",
"windows_csv": "results/episode_task_suite/windows.csv",
"feature_manifest": "results/episode_task_suite/feature_manifest.json"
},
"dataset_scope": {
"sample_episode_count": 1,
"num_windows": 1161,
"feature_dim": 8378,
"first_start_frame": 0,
"last_end_frame": 5819,
"warning": "Single public sample episode; these extension probes validate task design and pipeline mechanics, not cross-episode generalization."
},
"baselines": {
"minimal": "Ridge classifiers/regressors/projections plus cosine retrieval on the committed feature tensor.",
"neural_mlp": "Small one-hidden-layer PyTorch MLP heads using the same inputs, targets, chronological split, and evaluator."
},
"run_config": {
"train_fraction": 0.7,
"ridge_l2": 10.0,
"seed": 7,
"future_windows": 4,
"neural_epochs": 25,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"skip_neural": false
},
"task_specs": {
"body_motion_intensity": {
"direction": "A",
"direction_name": "Human Modeling & Motion Understanding",
"name": "Body/hand motion intensity",
"family": "classification",
"case_study": "A window with a fast reach or pour should be classified as high motion; a steady holding window should be low motion.",
"input": "Current non-mocap feature blocks: video, depth, camera pose/rotation, IMU, SLAM, calibration, and language context.",
"middle_process": "Compute the target from hand/body joint changes between neighboring windows, hide the mocap blocks from the input, then classify high versus low motion using the train-set median as the threshold.",
"output": "Binary label: high_motion or low_motion.",
"minimal_baseline": "Ridge classifier on standardized non-mocap features.",
"neural_baseline": "One-hidden-layer MLP binary classifier on the same input features.",
"metric_name": "macro-F1",
"metric_key": "macro_f1",
"metric_direction": "higher",
"current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior."
},
"multi_view_consistency_retrieval": {
"direction": "B",
"direction_name": "3D/4D Reconstruction & Neural Rendering",
"name": "Multi-view consistency retrieval",
"family": "retrieval",
"case_study": "Given the fisheye camera features for a pouring moment, retrieve the synchronized stereo-left view from the same time window.",
"input": "Query side: fisheye_cam0 video feature block. Candidate side: stereo_left video feature block from held-out windows.",
"middle_process": "Learn a projection from one camera-view feature space into another, then rank held-out candidate windows by cosine similarity.",
"output": "Ranked candidate windows; the correct synchronized view should rank near the top.",
"minimal_baseline": "Ridge projection followed by cosine nearest-neighbor retrieval.",
"neural_baseline": "One-hidden-layer MLP projection followed by the same cosine retrieval evaluator.",
"metric_name": "MRR",
"metric_key": "mrr",
"metric_direction": "higher",
"current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis."
},
"action_phase_progress": {
"direction": "C",
"direction_name": "Egocentric Vision & Interaction",
"name": "Action phase progress",
"family": "regression",
"case_study": "Inside a Pour coffee action segment, estimate whether the current window is near the beginning, middle, or end of that action.",
"input": "Current non-caption multimodal feature vector, so the label text cannot be copied directly from the language block.",
"middle_process": "Convert contiguous action-label runs into a normalized 0-to-1 progress target, train on earlier windows, and regress progress for later windows.",
"output": "A scalar progress value between 0.0 and 1.0 for the current action segment.",
"minimal_baseline": "Ridge regressor on standardized non-caption features.",
"neural_baseline": "One-hidden-layer MLP regressor on the same input features.",
"metric_name": "MAE",
"metric_key": "mae",
"metric_direction": "lower",
"current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks."
},
"ego_motion_forecast": {
"direction": "D",
"direction_name": "Scene Reconstruction & World Modeling",
"name": "Short-horizon ego-motion forecast",
"family": "forecast",
"case_study": "From the current sensors, predict how the camera translation will change over the next 20 frames while the wearer moves through the scene.",
"input": "Current multimodal features excluding the camera-translation block and caption text.",
"middle_process": "Build a future target from camera-translation difference at a four-window horizon, then regress that future ego-motion delta from current sensors.",
"output": "A future camera-translation delta vector.",
"minimal_baseline": "Ridge regressor with a 20-frame forecast horizon.",
"neural_baseline": "One-hidden-layer MLP regressor with the same horizon and split.",
"metric_name": "MAE",
"metric_key": "mae",
"metric_direction": "lower",
"current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model."
}
},
"tasks": {
"body_motion_intensity": {
"train_windows": 812,
"test_windows": 348,
"target_threshold_train_median": 0.476467490196228,
"input_dim": 6257,
"target_source": "hand/body joint delta between neighboring windows",
"minimal": {
"accuracy": 0.7931034482758621,
"macro_f1": 0.7827413984461709,
"positive_rate_true": 0.35919540229885055,
"positive_rate_pred": 0.4224137931034483,
"num_test": 348
},
"neural_mlp": {
"accuracy": 0.8045977011494253,
"macro_f1": 0.7986111111111112,
"positive_rate_true": 0.35919540229885055,
"positive_rate_pred": 0.46839080459770116,
"num_test": 348
},
"neural_training": {
"available": true,
"epochs": 25,
"hidden_dim": 128,
"loss_history": [
0.37396696033736165,
0.2156323000715284,
0.136186269006412,
0.10160321393623728,
0.07785259978157547,
0.06464119376660568,
0.05246563551240954,
0.04177419132933828,
0.03559323893905861,
0.03164790260600926,
0.029612853728565088,
0.02614598715301658,
0.022772305264261557,
0.02064551915881669,
0.021218053400149487,
0.015741589412625347,
0.018090384899927623,
0.012638344971940215,
0.009894669190131752,
0.010873594465826092,
0.008848077248268086,
0.007805832091654683,
0.0068195829434054235,
0.005149830504334325,
0.005361259917228533
]
}
},
"multi_view_consistency_retrieval": {
"train_windows": 813,
"test_windows": 348,
"query_block": "video_fisheye_cam0",
"target_block": "video_stereo_left",
"query_dim": 686,
"target_dim": 686,
"minimal": {
"mrr": 0.5533982515335083,
"top1": 0.41954022988505746,
"top5": 0.7068965517241379,
"top10": 0.8304597701149425,
"median_rank": 2.0,
"num_test": 348
},
"neural_mlp": {
"mrr": 0.34691643714904785,
"top1": 0.23275862068965517,
"top5": 0.46264367816091956,
"top10": 0.5890804597701149,
"median_rank": 7.0,
"num_test": 348
},
"neural_training": {
"available": true,
"epochs": 25,
"hidden_dim": 128,
"loss_history": [
0.9800718805740094,
0.8296866191855803,
0.7029420470986126,
0.6089339927846948,
0.5426930648814268,
0.49323386093200644,
0.45315542230600214,
0.4240272395578551,
0.3964498403400806,
0.37567753094588696,
0.3599070675332021,
0.3417643405048023,
0.32952829051721577,
0.31516501450450657,
0.3070896395824639,
0.29752101269888553,
0.287490411878072,
0.2791558311654193,
0.2707079971921693,
0.2669465311998811,
0.2603630047442728,
0.2501040017656148,
0.24714160980920216,
0.24146720613060843,
0.23866056472173036
]
}
},
"action_phase_progress": {
"train_windows": 813,
"test_windows": 348,
"input_dim": 7482,
"target_source": "normalized position inside contiguous action-label runs",
"minimal": {
"mse": 0.18191061913967133,
"mae": 0.3415532410144806,
"r2": -1.1726725562963778,
"num_test": 348
},
"neural_mlp": {
"mse": 0.14496584236621857,
"mae": 0.3038153648376465,
"r2": -0.73141785516685,
"num_test": 348
},
"neural_training": {
"available": true,
"epochs": 25,
"hidden_dim": 128,
"loss_history": [
3.4407916824169558,
1.4953648904739418,
1.0026900049065313,
0.6300096198788135,
0.5448755314355993,
0.4602517489650886,
0.3603706451506339,
0.2813053481179996,
0.2830014601991332,
0.23328637721207163,
0.18929187855414065,
0.16413429575579339,
0.1399544254586852,
0.14506905856754213,
0.12422180419551784,
0.12750434244984044,
0.1191924728022437,
0.10936984800141383,
0.10926014599178403,
0.1082181931824995,
0.11665978281773617,
0.11045804545558247,
0.12143501237969558,
0.0925682960639434,
0.0797398226910705
]
}
},
"ego_motion_forecast": {
"train_windows": 810,
"test_windows": 347,
"forecast_horizon_windows": 4,
"forecast_horizon_frames": 20,
"input_dim": 7461,
"target_dim": 21,
"target_source": "future minus current camera_translation feature block",
"minimal": {
"mse": 4.988531589508057,
"mae": 0.19889304041862488,
"r2": -8530.149735441328,
"num_test": 347
},
"neural_mlp": {
"mse": 0.8441281914710999,
"mae": 0.09888631105422974,
"r2": -1442.5879263394172,
"num_test": 347
},
"neural_training": {
"available": true,
"epochs": 25,
"hidden_dim": 128,
"loss_history": [
1.042560530886238,
0.6099034593429095,
0.4637497854085616,
0.35177371987590084,
0.2794519517892673,
0.2182157137143759,
0.19129328433378243,
0.16159257454636655,
0.15545119175940383,
0.13092747231324514,
0.12344975640744339,
0.11533710492981805,
0.10952432874912098,
0.1020829943963039,
0.09445952103461748,
0.10036175438651332,
0.09325228254368276,
0.08845738531262787,
0.07980410636023239,
0.07802200188607346,
0.07408671476590781,
0.07979858985837594,
0.0699865288388582,
0.07628073431091544,
0.06997921005075361
]
}
}
}
}