cy0307 commited on
Commit
f67c53b
·
verified ·
1 Parent(s): 39b6985

Add 128-episode raw-feature baseline results

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/input_report.json +89 -0
  2. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu0_tasks01_05.log +9 -0
  3. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu0_tasks01_05_rerun.log +9 -0
  4. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu1_tasks06_10.log +9 -0
  5. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu2_tasks11_15.log +9 -0
  6. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu3_task16_rerun_4096cap.log +5 -0
  7. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu3_tasks16_20.log +9 -0
  8. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/metrics_summary.csv +3 -0
  9. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/metrics_summary_all.csv +41 -0
  10. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/action_object_relation/metrics.json +62 -0
  11. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/action_object_relation/predictions.csv +0 -0
  12. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json +13 -0
  13. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/caption_grounding/metrics.json +52 -0
  14. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/contact_prediction/metrics.json +62 -0
  15. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/contact_prediction/predictions.csv +0 -0
  16. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json +52 -0
  17. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json +52 -0
  18. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json +52 -0
  19. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/interaction_text_prediction/metrics.json +13 -0
  20. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/long_horizon_next_action/metrics.json +62 -0
  21. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/long_horizon_next_action/predictions.csv +0 -0
  22. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/misalignment_detection/metrics.json +62 -0
  23. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/misalignment_detection/predictions.csv +0 -0
  24. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/modality_reconstruction/metrics.json +52 -0
  25. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_action/metrics.json +62 -0
  26. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_action/predictions.csv +0 -0
  27. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_subtask_forecast/metrics.json +62 -0
  28. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_subtask_forecast/predictions.csv +0 -0
  29. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/object_relevance/metrics.json +51 -0
  30. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/object_set_forecast/metrics.json +51 -0
  31. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/temporal_order/metrics.json +62 -0
  32. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/temporal_order/predictions.csv +0 -0
  33. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/time_to_transition/metrics.json +52 -0
  34. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_action/metrics.json +62 -0
  35. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_action/predictions.csv +0 -0
  36. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_subtask/metrics.json +62 -0
  37. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_subtask/predictions.csv +0 -0
  38. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/transition_detection/metrics.json +62 -0
  39. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/transition_detection/predictions.csv +0 -0
  40. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/run_summary.json +43 -0
  41. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/run_summary_all.json +409 -0
  42. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/action_object_relation/metrics.json +38 -0
  43. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/action_object_relation/predictions.csv +0 -0
  44. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/caption_grounding/metrics.json +25 -0
  45. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/contact_prediction/metrics.json +38 -0
  46. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/contact_prediction/predictions.csv +0 -0
  47. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/cross_modal_retrieval/metrics.json +25 -0
  48. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/hand_trajectory_forecast/metrics.json +25 -0
  49. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/imu_to_hand_pose/metrics.json +25 -0
  50. results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/long_horizon_next_action/metrics.json +38 -0
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/input_report.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "load_report": {
3
+ "resolved_npz_files": 357,
4
+ "loaded_feature_rows": 34269,
5
+ "input_rows": 34269,
6
+ "dropped_rows": 0,
7
+ "missing_path_examples": [],
8
+ "feature_dim": 4430
9
+ },
10
+ "split_counts": {
11
+ "train": 25629,
12
+ "val": 4608,
13
+ "test": 4032
14
+ },
15
+ "feature_manifest": [
16
+ {
17
+ "name": "hand_left_joints",
18
+ "start": 0,
19
+ "end": 441,
20
+ "dim": 441
21
+ },
22
+ {
23
+ "name": "hand_right_joints",
24
+ "start": 441,
25
+ "end": 882,
26
+ "dim": 441
27
+ },
28
+ {
29
+ "name": "body_joints",
30
+ "start": 882,
31
+ "end": 1974,
32
+ "dim": 1092
33
+ },
34
+ {
35
+ "name": "body_contacts",
36
+ "start": 1974,
37
+ "end": 2121,
38
+ "dim": 147
39
+ },
40
+ {
41
+ "name": "camera_translation",
42
+ "start": 2121,
43
+ "end": 2142,
44
+ "dim": 21
45
+ },
46
+ {
47
+ "name": "camera_rotation_matrix",
48
+ "start": 2142,
49
+ "end": 2205,
50
+ "dim": 63
51
+ },
52
+ {
53
+ "name": "imu_accel_gyro",
54
+ "start": 2205,
55
+ "end": 2247,
56
+ "dim": 42
57
+ },
58
+ {
59
+ "name": "depth_confidence",
60
+ "start": 2247,
61
+ "end": 3227,
62
+ "dim": 980
63
+ },
64
+ {
65
+ "name": "audio_fisheye_cam0_aac",
66
+ "start": 3227,
67
+ "end": 3395,
68
+ "dim": 168
69
+ },
70
+ {
71
+ "name": "caption_objects_interaction_text",
72
+ "start": 3395,
73
+ "end": 4291,
74
+ "dim": 896
75
+ },
76
+ {
77
+ "name": "slam_point_cloud",
78
+ "start": 4291,
79
+ "end": 4313,
80
+ "dim": 22
81
+ },
82
+ {
83
+ "name": "calibration",
84
+ "start": 4313,
85
+ "end": 4430,
86
+ "dim": 117
87
+ }
88
+ ]
89
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu0_tasks01_05.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running timeline_action
5
+ [raw20] running timeline_subtask
6
+ [raw20] running transition_detection
7
+ [raw20] running next_action
8
+ [raw20] running hand_trajectory_forecast
9
+ [raw20] done; wrote 10 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu0_tasks01_05_rerun.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running timeline_action
5
+ [raw20] running timeline_subtask
6
+ [raw20] running transition_detection
7
+ [raw20] running next_action
8
+ [raw20] running hand_trajectory_forecast
9
+ [raw20] done; wrote 10 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu1_tasks06_10.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running contact_prediction
5
+ [raw20] running object_relevance
6
+ [raw20] running caption_grounding
7
+ [raw20] running cross_modal_retrieval
8
+ [raw20] running modality_reconstruction
9
+ [raw20] done; wrote 10 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu2_tasks11_15.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running temporal_order
5
+ [raw20] running misalignment_detection
6
+ [raw20] running long_horizon_next_action
7
+ [raw20] running next_subtask_forecast
8
+ [raw20] running interaction_text_prediction
9
+ [raw20] done; wrote 10 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu3_task16_rerun_4096cap.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running action_object_relation
5
+ [raw20] done; wrote 2 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/logs/gpu3_tasks16_20.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [raw20] loading rows from results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl
2
+ [raw20] loading feature matrix for 34269 rows
3
+ [raw20] loaded 34269 x 4430 features from 357 NPZ files
4
+ [raw20] running action_object_relation
5
+ [raw20] running object_set_forecast
6
+ [raw20] running imu_to_hand_pose
7
+ [raw20] running camera_view_sync_retrieval
8
+ [raw20] running time_to_transition
9
+ [raw20] done; wrote 10 result records to results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/metrics_summary.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ task,task_display_name,model_family,status,primary_metric,primary_score,metric_direction,reason,error
2
+ action_object_relation,Action Object Relation,simple_raw128_centroid,pass,macro_f1,0.0,higher,,
3
+ action_object_relation,Action Object Relation,neural_mlp_raw128,pass,macro_f1,0.0,higher,,
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/metrics_summary_all.csv ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,model_family,status,primary_metric,primary_score,metric_direction,reason,error
2
+ action_object_relation,neural_mlp_raw128,pass,macro_f1,0.0,higher,,
3
+ action_object_relation,simple_raw128_centroid,pass,macro_f1,0.0,higher,,
4
+ camera_view_sync_retrieval,neural_mlp_raw128,unsupported,mrr,,higher,"128-episode NPZ manifest has camera pose plus audio/depth/caption features, but no two explicit video-view feature blocks for camera-view synchronization",
5
+ camera_view_sync_retrieval,simple_raw128_ridge,unsupported,mrr,,higher,"128-episode NPZ manifest has camera pose plus audio/depth/caption features, but no two explicit video-view feature blocks for camera-view synchronization",
6
+ caption_grounding,neural_mlp_raw128,pass,mrr,0.0063402121886610985,higher,,
7
+ caption_grounding,simple_raw128_ridge,pass,mrr,0.011150892823934555,higher,,
8
+ contact_prediction,neural_mlp_raw128,pass,macro_f1,1.0,higher,,
9
+ contact_prediction,simple_raw128_centroid,pass,macro_f1,0.886990707397193,higher,,
10
+ cross_modal_retrieval,neural_mlp_raw128,pass,mrr,0.002535284962505102,higher,,
11
+ cross_modal_retrieval,simple_raw128_ridge,pass,mrr,0.003459817497059703,higher,,
12
+ hand_trajectory_forecast,neural_mlp_raw128,pass,mae,0.18475216627120972,lower,,
13
+ hand_trajectory_forecast,simple_raw128_ridge,pass,mae,0.2729249894618988,lower,,
14
+ imu_to_hand_pose,neural_mlp_raw128,pass,mae,0.252998411655426,lower,,
15
+ imu_to_hand_pose,simple_raw128_ridge,pass,mae,0.22941437363624573,lower,,
16
+ interaction_text_prediction,neural_mlp_raw128,unsupported,macro_f1,,higher,raw 128-episode annotation.hdf5 interaction text is not present in the JSONL export; only hashed caption_objects_interaction_text features are available,
17
+ interaction_text_prediction,simple_raw128_centroid,unsupported,macro_f1,,higher,raw 128-episode annotation.hdf5 interaction text is not present in the JSONL export; only hashed caption_objects_interaction_text features are available,
18
+ long_horizon_next_action,neural_mlp_raw128,pass,macro_f1,0.001063859887389299,higher,,
19
+ long_horizon_next_action,simple_raw128_centroid,pass,macro_f1,0.0024280172369056294,higher,,
20
+ misalignment_detection,neural_mlp_raw128,pass,macro_f1,0.8272709077974252,higher,,
21
+ misalignment_detection,simple_raw128_centroid,pass,macro_f1,0.4958867673901769,higher,,
22
+ modality_reconstruction,neural_mlp_raw128,pass,r2,-1.3974418160502369,higher,,
23
+ modality_reconstruction,simple_raw128_ridge,pass,r2,-1.3450960391924882,higher,,
24
+ next_action,neural_mlp_raw128,pass,macro_f1,0.0018477984371755407,higher,,
25
+ next_action,simple_raw128_centroid,pass,macro_f1,0.003285273363482094,higher,,
26
+ next_subtask_forecast,neural_mlp_raw128,pass,macro_f1,0.0,higher,,
27
+ next_subtask_forecast,simple_raw128_centroid,pass,macro_f1,0.0,higher,,
28
+ object_relevance,neural_mlp_raw128_multilabel,pass,micro_f1,0.1765890386972509,higher,,
29
+ object_relevance,simple_raw128_ridge_multilabel,pass,micro_f1,0.0655376369662084,higher,,
30
+ object_set_forecast,neural_mlp_raw128_multilabel,pass,micro_f1,0.17523098630012288,higher,,
31
+ object_set_forecast,simple_raw128_ridge_multilabel,pass,micro_f1,0.06469493412657774,higher,,
32
+ temporal_order,neural_mlp_raw128,pass,macro_f1,0.8030047098504103,higher,,
33
+ temporal_order,simple_raw128_centroid,pass,macro_f1,0.49824413370686593,higher,,
34
+ time_to_transition,neural_mlp_raw128,pass,mae,42.374061584472656,lower,,
35
+ time_to_transition,simple_raw128_ridge,pass,mae,52.32759094238281,lower,,
36
+ timeline_action,neural_mlp_raw128,pass,macro_f1,0.0014955083181204041,higher,,
37
+ timeline_action,simple_raw128_centroid,pass,macro_f1,0.002915061325704321,higher,,
38
+ timeline_subtask,neural_mlp_raw128,pass,macro_f1,7.35632183908046e-05,higher,,
39
+ timeline_subtask,simple_raw128_centroid,pass,macro_f1,0.0,higher,,
40
+ transition_detection,neural_mlp_raw128,pass,macro_f1,0.4902206914147213,higher,,
41
+ transition_detection,simple_raw128_centroid,pass,macro_f1,0.4203613574238283,higher,,
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/action_object_relation/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "action_object_relation",
3
+ "task_display_name": "Action Object Relation",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 7.840090205664913,
16
+ "train_accuracy": 0.02232423101067169
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 1.9252043012845315,
21
+ "train_accuracy": 0.4901129943502825
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.8283010613446855,
26
+ "train_accuracy": 0.6963669177652229
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.5116730567724991,
31
+ "train_accuracy": 0.7887633396107973
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.37476183882840236,
36
+ "train_accuracy": 0.8343926553672316
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.2903947299819882,
41
+ "train_accuracy": 0.8655053358443189
42
+ }
43
+ ],
44
+ "num_train_windows": 25488,
45
+ "num_test_windows": 4014,
46
+ "num_classes": 4149,
47
+ "num_train_classes": 3058,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.0,
54
+ "balanced_accuracy": 0.0,
55
+ "macro_f1": 0.0,
56
+ "weighted_f1": 0.0,
57
+ "num_eval_windows": 4014,
58
+ "num_classes": 4149
59
+ }
60
+ },
61
+ "primary_score": 0.0
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/action_object_relation/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "camera_view_sync_retrieval",
3
+ "task_display_name": "Camera View Sync Retrieval",
4
+ "task_family": "retrieval",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "not run",
8
+ "primary_metric": "mrr",
9
+ "metric_direction": "higher",
10
+ "status": "unsupported",
11
+ "reason": "128-episode NPZ manifest has camera pose plus audio/depth/caption features, but no two explicit video-view feature blocks for camera-view synchronization",
12
+ "primary_score": null
13
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/caption_grounding/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "caption_grounding",
3
+ "task_display_name": "Language Grounding",
4
+ "task_family": "retrieval",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "non-caption sensor blocks projected to hashed caption/object/interaction block",
8
+ "primary_metric": "mrr",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.9730807784066104
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.8797651895419402
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.8487889279395084
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.8318103914064764
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.821267495579444
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.8125992868655396
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "input_dim": 3534,
41
+ "fit_input_dim": 2048,
42
+ "target_dim": 896,
43
+ "splits": {
44
+ "test": {
45
+ "mrr": 0.0063402121886610985,
46
+ "top1": 0.002232142857142857,
47
+ "median_rank": 1392.0,
48
+ "num_queries": 4032
49
+ }
50
+ },
51
+ "primary_score": 0.0063402121886610985
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/contact_prediction/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "contact_prediction",
3
+ "task_display_name": "Contact State Prediction",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.02249114251404696,
16
+ "train_accuracy": 0.9850950095594834
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 3.892005114592888e-06,
21
+ "train_accuracy": 1.0
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 1.6485579969316457e-06,
26
+ "train_accuracy": 1.0
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 7.494956065371638e-07,
31
+ "train_accuracy": 1.0
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 3.961833819914944e-07,
36
+ "train_accuracy": 1.0
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 3.9328009280915035e-07,
41
+ "train_accuracy": 1.0
42
+ }
43
+ ],
44
+ "num_train_windows": 25629,
45
+ "num_test_windows": 4032,
46
+ "num_classes": 2,
47
+ "num_train_classes": 2,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 1.0,
54
+ "balanced_accuracy": 1.0,
55
+ "macro_f1": 1.0,
56
+ "weighted_f1": 1.0,
57
+ "num_eval_windows": 4032,
58
+ "num_classes": 2
59
+ }
60
+ },
61
+ "primary_score": 1.0
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/contact_prediction/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "cross_modal_retrieval",
3
+ "task_display_name": "Cross-Modal Retrieval",
4
+ "task_family": "retrieval",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "all non-depth sensor blocks projected to depth-confidence block",
8
+ "primary_metric": "mrr",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.7975420301166781
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.5641444217256827
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.5163868686951831
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.4953940257414378
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.4823577042322097
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.4730072832515932
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "input_dim": 3450,
41
+ "fit_input_dim": 2048,
42
+ "target_dim": 980,
43
+ "splits": {
44
+ "test": {
45
+ "mrr": 0.002535284962505102,
46
+ "top1": 0.0,
47
+ "median_rank": 1893.5,
48
+ "num_queries": 4032
49
+ }
50
+ },
51
+ "primary_score": 0.002535284962505102
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "hand_trajectory_forecast",
3
+ "task_display_name": "Hand Trajectory Forecasting",
4
+ "task_family": "regression",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-hand/non-caption features; target hand joint feature block +20 frames",
8
+ "primary_metric": "mae",
9
+ "metric_direction": "lower",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.8031348615485111
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.5183512075500258
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.3657062302656374
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.28521906561420884
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.24974514583392887
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.22191785270443581
36
+ }
37
+ ],
38
+ "num_train_windows": 25502,
39
+ "num_test_windows": 4015,
40
+ "input_dim": 2652,
41
+ "fit_input_dim": 2048,
42
+ "target_dim": 882,
43
+ "splits": {
44
+ "test": {
45
+ "mae": 0.18475216627120972,
46
+ "rmse": 0.43915748596191406,
47
+ "r2": 0.11917128475110383,
48
+ "mean_l2": 9.090903282165527
49
+ }
50
+ },
51
+ "primary_score": 0.18475216627120972
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "imu_to_hand_pose",
3
+ "task_display_name": "Imu To Hand Pose",
4
+ "task_family": "regression",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "IMU acceleration/gyroscope block reconstructs hand-joint blocks",
8
+ "primary_metric": "mae",
9
+ "metric_direction": "lower",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.9585941261004479
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.9127171490970956
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.8793233014191721
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.844667680290311
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.8032877514339916
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.7622054215548092
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "input_dim": 42,
41
+ "fit_input_dim": 42,
42
+ "target_dim": 882,
43
+ "splits": {
44
+ "test": {
45
+ "mae": 0.252998411655426,
46
+ "rmse": 0.5090259909629822,
47
+ "r2": -0.1798296121579115,
48
+ "mean_l2": 12.296762466430664
49
+ }
50
+ },
51
+ "primary_score": 0.252998411655426
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/interaction_text_prediction/metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "interaction_text_prediction",
3
+ "task_display_name": "Interaction Text Prediction",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "not run",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "unsupported",
11
+ "reason": "raw 128-episode annotation.hdf5 interaction text is not present in the JSONL export; only hashed caption_objects_interaction_text features are available",
12
+ "primary_score": null
13
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/long_horizon_next_action/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "long_horizon_next_action",
3
+ "task_display_name": "Long Horizon Next Action",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-caption features; target action +100 frames",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 5.801561136460293,
16
+ "train_accuracy": 0.07798787298547949
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 1.1626708901668443,
21
+ "train_accuracy": 0.513602999840434
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.6149468233715847,
26
+ "train_accuracy": 0.6583692356789532
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.4115334245525937,
31
+ "train_accuracy": 0.7293362055209829
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.339562276861248,
36
+ "train_accuracy": 0.7671134514121589
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.25582631674416995,
41
+ "train_accuracy": 0.8034546034785384
42
+ }
43
+ ],
44
+ "num_train_windows": 25068,
45
+ "num_test_windows": 3951,
46
+ "num_classes": 1211,
47
+ "num_train_classes": 887,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.0020248038471273096,
54
+ "balanced_accuracy": 0.0019148400100781054,
55
+ "macro_f1": 0.001063859887389299,
56
+ "weighted_f1": 0.0011975577833811789,
57
+ "num_eval_windows": 3951,
58
+ "num_classes": 1211
59
+ }
60
+ },
61
+ "primary_score": 0.001063859887389299
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/long_horizon_next_action/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/misalignment_detection/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "misalignment_detection",
3
+ "task_display_name": "Multimodal Synchronization Detection",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "motion/camera/IMU query paired with aligned or shifted depth/audio target",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.584179866367518,
16
+ "train_accuracy": 0.6658104908295541
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.37274408434943146,
21
+ "train_accuracy": 0.8252598627443111
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.2720396854143376,
26
+ "train_accuracy": 0.8785969418469318
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.21012097329801926,
31
+ "train_accuracy": 0.9071116105470161
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.1713544537477405,
36
+ "train_accuracy": 0.925793634867761
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.14908742030340427,
41
+ "train_accuracy": 0.9357466789741943
42
+ }
43
+ ],
44
+ "num_train_windows": 49834,
45
+ "num_test_windows": 7840,
46
+ "num_classes": 2,
47
+ "num_train_classes": 2,
48
+ "input_dim": 3395,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.8274234693877551,
54
+ "balanced_accuracy": 0.8274234693877551,
55
+ "macro_f1": 0.8272709077974252,
56
+ "weighted_f1": 0.8272709077974253,
57
+ "num_eval_windows": 7840,
58
+ "num_classes": 2
59
+ }
60
+ },
61
+ "primary_score": 0.8272709077974252
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/misalignment_detection/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/modality_reconstruction/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "modality_reconstruction",
3
+ "task_display_name": "Cross-Modal Reconstruction",
4
+ "task_family": "regression",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "all non-depth sensor blocks reconstruct depth-confidence block",
8
+ "primary_metric": "r2",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.795406650627551
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.5652745503729759
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.5144153572181445
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.4929477720702684
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.4814860376392508
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.4724465353307799
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "input_dim": 3450,
41
+ "fit_input_dim": 2048,
42
+ "target_dim": 980,
43
+ "splits": {
44
+ "test": {
45
+ "mae": 4963.66650390625,
46
+ "rmse": 381740.15625,
47
+ "r2": -1.3974418160502369,
48
+ "mean_l2": 3501567.0
49
+ }
50
+ },
51
+ "primary_score": -1.3974418160502369
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_action/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "next_action",
3
+ "task_display_name": "Next-Action Prediction",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 5.596322411365963,
16
+ "train_accuracy": 0.08123609973077374
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 1.0855037269383347,
21
+ "train_accuracy": 0.5211674275235085
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.5761551915229453,
26
+ "train_accuracy": 0.655780561083148
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.37495767347169556,
31
+ "train_accuracy": 0.7235943657575403
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.28612062079771855,
36
+ "train_accuracy": 0.758008505989309
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.2286090604913533,
41
+ "train_accuracy": 0.7849311327012369
42
+ }
43
+ ],
44
+ "num_train_windows": 25629,
45
+ "num_test_windows": 4032,
46
+ "num_classes": 1217,
47
+ "num_train_classes": 891,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.004464285714285714,
54
+ "balanced_accuracy": 0.0034805255007437285,
55
+ "macro_f1": 0.0018477984371755407,
56
+ "weighted_f1": 0.003505490015635165,
57
+ "num_eval_windows": 4032,
58
+ "num_classes": 1217
59
+ }
60
+ },
61
+ "primary_score": 0.0018477984371755407
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_action/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_subtask_forecast/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "next_subtask_forecast",
3
+ "task_display_name": "Next Subtask Forecast",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-caption features; target subtask +100 frames",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 4.954612842526032,
16
+ "train_accuracy": 0.13140258496888463
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.9000010563651207,
21
+ "train_accuracy": 0.6606430509015477
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.46146437387365996,
26
+ "train_accuracy": 0.7953566299664911
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.28807071359682185,
31
+ "train_accuracy": 0.8596218286261369
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.21170043317832615,
36
+ "train_accuracy": 0.8893011010052657
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.17162281766032153,
41
+ "train_accuracy": 0.9070927078346896
42
+ }
43
+ ],
44
+ "num_train_windows": 25068,
45
+ "num_test_windows": 3951,
46
+ "num_classes": 891,
47
+ "num_train_classes": 651,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.0,
54
+ "balanced_accuracy": 0.0,
55
+ "macro_f1": 0.0,
56
+ "weighted_f1": 0.0,
57
+ "num_eval_windows": 3951,
58
+ "num_classes": 891
59
+ }
60
+ },
61
+ "primary_score": 0.0
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/next_subtask_forecast/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/object_relevance/metrics.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "object_relevance",
3
+ "task_display_name": "Object Relevance Prediction",
4
+ "task_family": "multi_label",
5
+ "model_family": "neural_mlp_raw128_multilabel",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "micro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.3179367709060545
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.08152506840292292
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.05606942784121547
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.04551570554540027
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.03935748256850226
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.034496653577433256
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "num_labels": 256,
41
+ "input_dim": 3534,
42
+ "fit_input_dim": 2048,
43
+ "splits": {
44
+ "test": {
45
+ "micro_f1": 0.1765890386972509,
46
+ "macro_f1": 0.026473024044082846,
47
+ "exact_match": 0.010168650793650794
48
+ }
49
+ },
50
+ "primary_score": 0.1765890386972509
51
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/object_set_forecast/metrics.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "object_set_forecast",
3
+ "task_display_name": "Object Set Forecast",
4
+ "task_family": "multi_label",
5
+ "model_family": "neural_mlp_raw128_multilabel",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-caption features; target object set +100 frames",
8
+ "primary_metric": "micro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.3173181395106896
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.08031858284117332
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.056561457963628554
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.04601203178259028
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.03955884521596166
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.03473486830971544
36
+ }
37
+ ],
38
+ "num_train_windows": 25068,
39
+ "num_test_windows": 3951,
40
+ "num_labels": 256,
41
+ "input_dim": 3534,
42
+ "fit_input_dim": 2048,
43
+ "splits": {
44
+ "test": {
45
+ "micro_f1": 0.17523098630012288,
46
+ "macro_f1": 0.021405026097435987,
47
+ "exact_match": 0.0030372057706909645
48
+ }
49
+ },
50
+ "primary_score": 0.17523098630012288
51
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/temporal_order/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "temporal_order",
3
+ "task_display_name": "Temporal Order Verification",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "concatenated adjacent sensor-window pairs",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.6823423368630226,
16
+ "train_accuracy": 0.534788566953798
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.47532859007268563,
21
+ "train_accuracy": 0.7526820673453406
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.3663804764708412,
26
+ "train_accuracy": 0.8226507439310885
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.30436070449387737,
31
+ "train_accuracy": 0.85561863743148
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.25795707907075516,
36
+ "train_accuracy": 0.8806382145653876
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.2276858064469268,
41
+ "train_accuracy": 0.8944988253719656
42
+ }
43
+ ],
44
+ "num_train_windows": 51080,
45
+ "num_test_windows": 8036,
46
+ "num_classes": 2,
47
+ "num_train_classes": 2,
48
+ "input_dim": 7068,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.8030114484818317,
54
+ "balanced_accuracy": 0.8030114484818318,
55
+ "macro_f1": 0.8030047098504103,
56
+ "weighted_f1": 0.8030047098504102,
57
+ "num_eval_windows": 8036,
58
+ "num_classes": 2
59
+ }
60
+ },
61
+ "primary_score": 0.8030047098504103
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/temporal_order/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/time_to_transition/metrics.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "time_to_transition",
3
+ "task_display_name": "Time To Transition",
4
+ "task_family": "regression",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "non-caption sensor features regress frames to next action boundary capped at 200",
8
+ "primary_metric": "mae",
9
+ "metric_direction": "lower",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.8273973769419527
16
+ },
17
+ {
18
+ "epoch": 5,
19
+ "loss": 0.4202246279718631
20
+ },
21
+ {
22
+ "epoch": 10,
23
+ "loss": 0.26081196071136065
24
+ },
25
+ {
26
+ "epoch": 15,
27
+ "loss": 0.1965682344275895
28
+ },
29
+ {
30
+ "epoch": 20,
31
+ "loss": 0.1588804939971577
32
+ },
33
+ {
34
+ "epoch": 25,
35
+ "loss": 0.13691731317311034
36
+ }
37
+ ],
38
+ "num_train_windows": 25629,
39
+ "num_test_windows": 4032,
40
+ "input_dim": 3534,
41
+ "fit_input_dim": 2048,
42
+ "target_dim": 1,
43
+ "splits": {
44
+ "test": {
45
+ "mae": 42.374061584472656,
46
+ "rmse": 55.66938400268555,
47
+ "r2": -0.23432442537520948,
48
+ "mean_l2": 42.374061584472656
49
+ }
50
+ },
51
+ "primary_score": 42.374061584472656
52
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_action/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "timeline_action",
3
+ "task_display_name": "Action Recognition",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 5.34494523035712,
16
+ "train_accuracy": 0.08946896094268211
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.9199241166383841,
21
+ "train_accuracy": 0.5943267392407039
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.44217930797357696,
26
+ "train_accuracy": 0.7336610870498264
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.28310169599762225,
31
+ "train_accuracy": 0.8029966054079363
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.19902630149213452,
36
+ "train_accuracy": 0.842209996488353
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.18476505181807562,
41
+ "train_accuracy": 0.855359163447657
42
+ }
43
+ ],
44
+ "num_train_windows": 25629,
45
+ "num_test_windows": 4032,
46
+ "num_classes": 1222,
47
+ "num_train_classes": 896,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.001984126984126984,
54
+ "balanced_accuracy": 0.0015447515447515447,
55
+ "macro_f1": 0.0014955083181204041,
56
+ "weighted_f1": 0.0019879946780531578,
57
+ "num_eval_windows": 4032,
58
+ "num_classes": 1222
59
+ }
60
+ },
61
+ "primary_score": 0.0014955083181204041
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_action/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_subtask/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "timeline_subtask",
3
+ "task_display_name": "Procedure Step Recognition",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 4.781946432307218,
16
+ "train_accuracy": 0.14272893987280033
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.8307986326160719,
21
+ "train_accuracy": 0.6892972804245191
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.3857223062508468,
26
+ "train_accuracy": 0.8305435249131843
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.25177908692243084,
31
+ "train_accuracy": 0.8828280463537399
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.1772773926213155,
36
+ "train_accuracy": 0.9092044168715128
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.1421989650903465,
41
+ "train_accuracy": 0.9217683093370791
42
+ }
43
+ ],
44
+ "num_train_windows": 25629,
45
+ "num_test_windows": 4032,
46
+ "num_classes": 892,
47
+ "num_train_classes": 652,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.000496031746031746,
54
+ "balanced_accuracy": 0.0013333333333333333,
55
+ "macro_f1": 7.35632183908046e-05,
56
+ "weighted_f1": 2.7367268746579092e-05,
57
+ "num_eval_windows": 4032,
58
+ "num_classes": 892
59
+ }
60
+ },
61
+ "primary_score": 7.35632183908046e-05
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/timeline_subtask/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/transition_detection/metrics.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "transition_detection",
3
+ "task_display_name": "Action Boundary Detection",
4
+ "task_family": "classification",
5
+ "model_family": "neural_mlp_raw128",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "device": "cuda",
12
+ "history": [
13
+ {
14
+ "epoch": 1,
15
+ "loss": 0.5537091893220953,
16
+ "train_accuracy": 0.7428303874517148
17
+ },
18
+ {
19
+ "epoch": 5,
20
+ "loss": 0.19651318897678038,
21
+ "train_accuracy": 0.9144718873151508
22
+ },
23
+ {
24
+ "epoch": 10,
25
+ "loss": 0.09844583694868982,
26
+ "train_accuracy": 0.9596940965312731
27
+ },
28
+ {
29
+ "epoch": 15,
30
+ "loss": 0.10910748333434252,
31
+ "train_accuracy": 0.9566506691638378
32
+ },
33
+ {
34
+ "epoch": 20,
35
+ "loss": 0.07368280102906076,
36
+ "train_accuracy": 0.9692925982285692
37
+ },
38
+ {
39
+ "epoch": 25,
40
+ "loss": 0.030949957263201364,
41
+ "train_accuracy": 0.9879043271294237
42
+ }
43
+ ],
44
+ "num_train_windows": 25629,
45
+ "num_test_windows": 4032,
46
+ "num_classes": 2,
47
+ "num_train_classes": 2,
48
+ "input_dim": 3534,
49
+ "fit_input_dim": 2048,
50
+ "selected_column_count": 2048,
51
+ "splits": {
52
+ "test": {
53
+ "accuracy": 0.9446924603174603,
54
+ "balanced_accuracy": 0.49010925819436457,
55
+ "macro_f1": 0.4902206914147213,
56
+ "weighted_f1": 0.9627185273267364,
57
+ "num_eval_windows": 4032,
58
+ "num_classes": 2
59
+ }
60
+ },
61
+ "primary_score": 0.4902206914147213
62
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/neural_mlp_raw128/transition_detection/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/run_summary.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_jsonl": "results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset/dataset.jsonl",
3
+ "feature_manifest_json": "results/omni_finetune/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_dataset_dense_20f_stride10/dataset_manifest.json",
4
+ "tasks_requested": [
5
+ "action_object_relation"
6
+ ],
7
+ "load_report": {
8
+ "resolved_npz_files": 357,
9
+ "loaded_feature_rows": 34269,
10
+ "input_rows": 34269,
11
+ "dropped_rows": 0,
12
+ "missing_path_examples": [],
13
+ "feature_dim": 4430
14
+ },
15
+ "num_result_records": 2,
16
+ "status_counts": {
17
+ "pass": 2
18
+ },
19
+ "results": [
20
+ {
21
+ "task": "action_object_relation",
22
+ "task_display_name": "Action Object Relation",
23
+ "model_family": "simple_raw128_centroid",
24
+ "status": "pass",
25
+ "primary_metric": "macro_f1",
26
+ "primary_score": 0.0,
27
+ "metric_direction": "higher",
28
+ "reason": null,
29
+ "error": null
30
+ },
31
+ {
32
+ "task": "action_object_relation",
33
+ "task_display_name": "Action Object Relation",
34
+ "model_family": "neural_mlp_raw128",
35
+ "status": "pass",
36
+ "primary_metric": "macro_f1",
37
+ "primary_score": 0.0,
38
+ "metric_direction": "higher",
39
+ "reason": null,
40
+ "error": null
41
+ }
42
+ ]
43
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/run_summary_all.json ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_records": 40,
3
+ "status_counts": {
4
+ "pass": 36,
5
+ "unsupported": 4
6
+ },
7
+ "records": [
8
+ {
9
+ "task": "action_object_relation",
10
+ "model_family": "neural_mlp_raw128",
11
+ "status": "pass",
12
+ "primary_metric": "macro_f1",
13
+ "primary_score": 0.0,
14
+ "metric_direction": "higher",
15
+ "reason": null,
16
+ "error": null
17
+ },
18
+ {
19
+ "task": "action_object_relation",
20
+ "model_family": "simple_raw128_centroid",
21
+ "status": "pass",
22
+ "primary_metric": "macro_f1",
23
+ "primary_score": 0.0,
24
+ "metric_direction": "higher",
25
+ "reason": null,
26
+ "error": null
27
+ },
28
+ {
29
+ "task": "camera_view_sync_retrieval",
30
+ "model_family": "neural_mlp_raw128",
31
+ "status": "unsupported",
32
+ "primary_metric": "mrr",
33
+ "primary_score": null,
34
+ "metric_direction": "higher",
35
+ "reason": "128-episode NPZ manifest has camera pose plus audio/depth/caption features, but no two explicit video-view feature blocks for camera-view synchronization",
36
+ "error": null
37
+ },
38
+ {
39
+ "task": "camera_view_sync_retrieval",
40
+ "model_family": "simple_raw128_ridge",
41
+ "status": "unsupported",
42
+ "primary_metric": "mrr",
43
+ "primary_score": null,
44
+ "metric_direction": "higher",
45
+ "reason": "128-episode NPZ manifest has camera pose plus audio/depth/caption features, but no two explicit video-view feature blocks for camera-view synchronization",
46
+ "error": null
47
+ },
48
+ {
49
+ "task": "caption_grounding",
50
+ "model_family": "neural_mlp_raw128",
51
+ "status": "pass",
52
+ "primary_metric": "mrr",
53
+ "primary_score": 0.0063402121886610985,
54
+ "metric_direction": "higher",
55
+ "reason": null,
56
+ "error": null
57
+ },
58
+ {
59
+ "task": "caption_grounding",
60
+ "model_family": "simple_raw128_ridge",
61
+ "status": "pass",
62
+ "primary_metric": "mrr",
63
+ "primary_score": 0.011150892823934555,
64
+ "metric_direction": "higher",
65
+ "reason": null,
66
+ "error": null
67
+ },
68
+ {
69
+ "task": "contact_prediction",
70
+ "model_family": "neural_mlp_raw128",
71
+ "status": "pass",
72
+ "primary_metric": "macro_f1",
73
+ "primary_score": 1.0,
74
+ "metric_direction": "higher",
75
+ "reason": null,
76
+ "error": null
77
+ },
78
+ {
79
+ "task": "contact_prediction",
80
+ "model_family": "simple_raw128_centroid",
81
+ "status": "pass",
82
+ "primary_metric": "macro_f1",
83
+ "primary_score": 0.886990707397193,
84
+ "metric_direction": "higher",
85
+ "reason": null,
86
+ "error": null
87
+ },
88
+ {
89
+ "task": "cross_modal_retrieval",
90
+ "model_family": "neural_mlp_raw128",
91
+ "status": "pass",
92
+ "primary_metric": "mrr",
93
+ "primary_score": 0.002535284962505102,
94
+ "metric_direction": "higher",
95
+ "reason": null,
96
+ "error": null
97
+ },
98
+ {
99
+ "task": "cross_modal_retrieval",
100
+ "model_family": "simple_raw128_ridge",
101
+ "status": "pass",
102
+ "primary_metric": "mrr",
103
+ "primary_score": 0.003459817497059703,
104
+ "metric_direction": "higher",
105
+ "reason": null,
106
+ "error": null
107
+ },
108
+ {
109
+ "task": "hand_trajectory_forecast",
110
+ "model_family": "neural_mlp_raw128",
111
+ "status": "pass",
112
+ "primary_metric": "mae",
113
+ "primary_score": 0.18475216627120972,
114
+ "metric_direction": "lower",
115
+ "reason": null,
116
+ "error": null
117
+ },
118
+ {
119
+ "task": "hand_trajectory_forecast",
120
+ "model_family": "simple_raw128_ridge",
121
+ "status": "pass",
122
+ "primary_metric": "mae",
123
+ "primary_score": 0.2729249894618988,
124
+ "metric_direction": "lower",
125
+ "reason": null,
126
+ "error": null
127
+ },
128
+ {
129
+ "task": "imu_to_hand_pose",
130
+ "model_family": "neural_mlp_raw128",
131
+ "status": "pass",
132
+ "primary_metric": "mae",
133
+ "primary_score": 0.252998411655426,
134
+ "metric_direction": "lower",
135
+ "reason": null,
136
+ "error": null
137
+ },
138
+ {
139
+ "task": "imu_to_hand_pose",
140
+ "model_family": "simple_raw128_ridge",
141
+ "status": "pass",
142
+ "primary_metric": "mae",
143
+ "primary_score": 0.22941437363624573,
144
+ "metric_direction": "lower",
145
+ "reason": null,
146
+ "error": null
147
+ },
148
+ {
149
+ "task": "interaction_text_prediction",
150
+ "model_family": "neural_mlp_raw128",
151
+ "status": "unsupported",
152
+ "primary_metric": "macro_f1",
153
+ "primary_score": null,
154
+ "metric_direction": "higher",
155
+ "reason": "raw 128-episode annotation.hdf5 interaction text is not present in the JSONL export; only hashed caption_objects_interaction_text features are available",
156
+ "error": null
157
+ },
158
+ {
159
+ "task": "interaction_text_prediction",
160
+ "model_family": "simple_raw128_centroid",
161
+ "status": "unsupported",
162
+ "primary_metric": "macro_f1",
163
+ "primary_score": null,
164
+ "metric_direction": "higher",
165
+ "reason": "raw 128-episode annotation.hdf5 interaction text is not present in the JSONL export; only hashed caption_objects_interaction_text features are available",
166
+ "error": null
167
+ },
168
+ {
169
+ "task": "long_horizon_next_action",
170
+ "model_family": "neural_mlp_raw128",
171
+ "status": "pass",
172
+ "primary_metric": "macro_f1",
173
+ "primary_score": 0.001063859887389299,
174
+ "metric_direction": "higher",
175
+ "reason": null,
176
+ "error": null
177
+ },
178
+ {
179
+ "task": "long_horizon_next_action",
180
+ "model_family": "simple_raw128_centroid",
181
+ "status": "pass",
182
+ "primary_metric": "macro_f1",
183
+ "primary_score": 0.0024280172369056294,
184
+ "metric_direction": "higher",
185
+ "reason": null,
186
+ "error": null
187
+ },
188
+ {
189
+ "task": "misalignment_detection",
190
+ "model_family": "neural_mlp_raw128",
191
+ "status": "pass",
192
+ "primary_metric": "macro_f1",
193
+ "primary_score": 0.8272709077974252,
194
+ "metric_direction": "higher",
195
+ "reason": null,
196
+ "error": null
197
+ },
198
+ {
199
+ "task": "misalignment_detection",
200
+ "model_family": "simple_raw128_centroid",
201
+ "status": "pass",
202
+ "primary_metric": "macro_f1",
203
+ "primary_score": 0.4958867673901769,
204
+ "metric_direction": "higher",
205
+ "reason": null,
206
+ "error": null
207
+ },
208
+ {
209
+ "task": "modality_reconstruction",
210
+ "model_family": "neural_mlp_raw128",
211
+ "status": "pass",
212
+ "primary_metric": "r2",
213
+ "primary_score": -1.3974418160502369,
214
+ "metric_direction": "higher",
215
+ "reason": null,
216
+ "error": null
217
+ },
218
+ {
219
+ "task": "modality_reconstruction",
220
+ "model_family": "simple_raw128_ridge",
221
+ "status": "pass",
222
+ "primary_metric": "r2",
223
+ "primary_score": -1.3450960391924882,
224
+ "metric_direction": "higher",
225
+ "reason": null,
226
+ "error": null
227
+ },
228
+ {
229
+ "task": "next_action",
230
+ "model_family": "neural_mlp_raw128",
231
+ "status": "pass",
232
+ "primary_metric": "macro_f1",
233
+ "primary_score": 0.0018477984371755407,
234
+ "metric_direction": "higher",
235
+ "reason": null,
236
+ "error": null
237
+ },
238
+ {
239
+ "task": "next_action",
240
+ "model_family": "simple_raw128_centroid",
241
+ "status": "pass",
242
+ "primary_metric": "macro_f1",
243
+ "primary_score": 0.003285273363482094,
244
+ "metric_direction": "higher",
245
+ "reason": null,
246
+ "error": null
247
+ },
248
+ {
249
+ "task": "next_subtask_forecast",
250
+ "model_family": "neural_mlp_raw128",
251
+ "status": "pass",
252
+ "primary_metric": "macro_f1",
253
+ "primary_score": 0.0,
254
+ "metric_direction": "higher",
255
+ "reason": null,
256
+ "error": null
257
+ },
258
+ {
259
+ "task": "next_subtask_forecast",
260
+ "model_family": "simple_raw128_centroid",
261
+ "status": "pass",
262
+ "primary_metric": "macro_f1",
263
+ "primary_score": 0.0,
264
+ "metric_direction": "higher",
265
+ "reason": null,
266
+ "error": null
267
+ },
268
+ {
269
+ "task": "object_relevance",
270
+ "model_family": "neural_mlp_raw128_multilabel",
271
+ "status": "pass",
272
+ "primary_metric": "micro_f1",
273
+ "primary_score": 0.1765890386972509,
274
+ "metric_direction": "higher",
275
+ "reason": null,
276
+ "error": null
277
+ },
278
+ {
279
+ "task": "object_relevance",
280
+ "model_family": "simple_raw128_ridge_multilabel",
281
+ "status": "pass",
282
+ "primary_metric": "micro_f1",
283
+ "primary_score": 0.0655376369662084,
284
+ "metric_direction": "higher",
285
+ "reason": null,
286
+ "error": null
287
+ },
288
+ {
289
+ "task": "object_set_forecast",
290
+ "model_family": "neural_mlp_raw128_multilabel",
291
+ "status": "pass",
292
+ "primary_metric": "micro_f1",
293
+ "primary_score": 0.17523098630012288,
294
+ "metric_direction": "higher",
295
+ "reason": null,
296
+ "error": null
297
+ },
298
+ {
299
+ "task": "object_set_forecast",
300
+ "model_family": "simple_raw128_ridge_multilabel",
301
+ "status": "pass",
302
+ "primary_metric": "micro_f1",
303
+ "primary_score": 0.06469493412657774,
304
+ "metric_direction": "higher",
305
+ "reason": null,
306
+ "error": null
307
+ },
308
+ {
309
+ "task": "temporal_order",
310
+ "model_family": "neural_mlp_raw128",
311
+ "status": "pass",
312
+ "primary_metric": "macro_f1",
313
+ "primary_score": 0.8030047098504103,
314
+ "metric_direction": "higher",
315
+ "reason": null,
316
+ "error": null
317
+ },
318
+ {
319
+ "task": "temporal_order",
320
+ "model_family": "simple_raw128_centroid",
321
+ "status": "pass",
322
+ "primary_metric": "macro_f1",
323
+ "primary_score": 0.49824413370686593,
324
+ "metric_direction": "higher",
325
+ "reason": null,
326
+ "error": null
327
+ },
328
+ {
329
+ "task": "time_to_transition",
330
+ "model_family": "neural_mlp_raw128",
331
+ "status": "pass",
332
+ "primary_metric": "mae",
333
+ "primary_score": 42.374061584472656,
334
+ "metric_direction": "lower",
335
+ "reason": null,
336
+ "error": null
337
+ },
338
+ {
339
+ "task": "time_to_transition",
340
+ "model_family": "simple_raw128_ridge",
341
+ "status": "pass",
342
+ "primary_metric": "mae",
343
+ "primary_score": 52.32759094238281,
344
+ "metric_direction": "lower",
345
+ "reason": null,
346
+ "error": null
347
+ },
348
+ {
349
+ "task": "timeline_action",
350
+ "model_family": "neural_mlp_raw128",
351
+ "status": "pass",
352
+ "primary_metric": "macro_f1",
353
+ "primary_score": 0.0014955083181204041,
354
+ "metric_direction": "higher",
355
+ "reason": null,
356
+ "error": null
357
+ },
358
+ {
359
+ "task": "timeline_action",
360
+ "model_family": "simple_raw128_centroid",
361
+ "status": "pass",
362
+ "primary_metric": "macro_f1",
363
+ "primary_score": 0.002915061325704321,
364
+ "metric_direction": "higher",
365
+ "reason": null,
366
+ "error": null
367
+ },
368
+ {
369
+ "task": "timeline_subtask",
370
+ "model_family": "neural_mlp_raw128",
371
+ "status": "pass",
372
+ "primary_metric": "macro_f1",
373
+ "primary_score": 7.35632183908046e-05,
374
+ "metric_direction": "higher",
375
+ "reason": null,
376
+ "error": null
377
+ },
378
+ {
379
+ "task": "timeline_subtask",
380
+ "model_family": "simple_raw128_centroid",
381
+ "status": "pass",
382
+ "primary_metric": "macro_f1",
383
+ "primary_score": 0.0,
384
+ "metric_direction": "higher",
385
+ "reason": null,
386
+ "error": null
387
+ },
388
+ {
389
+ "task": "transition_detection",
390
+ "model_family": "neural_mlp_raw128",
391
+ "status": "pass",
392
+ "primary_metric": "macro_f1",
393
+ "primary_score": 0.4902206914147213,
394
+ "metric_direction": "higher",
395
+ "reason": null,
396
+ "error": null
397
+ },
398
+ {
399
+ "task": "transition_detection",
400
+ "model_family": "simple_raw128_centroid",
401
+ "status": "pass",
402
+ "primary_metric": "macro_f1",
403
+ "primary_score": 0.4203613574238283,
404
+ "metric_direction": "higher",
405
+ "reason": null,
406
+ "error": null
407
+ }
408
+ ]
409
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/action_object_relation/metrics.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "action_object_relation",
3
+ "task_display_name": "Action Object Relation",
4
+ "task_family": "classification",
5
+ "model_family": "simple_raw128_centroid",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "num_train_windows": 25488,
12
+ "num_val_windows": 4569,
13
+ "num_test_windows": 4014,
14
+ "num_classes": 4149,
15
+ "num_train_classes": 3058,
16
+ "input_dim": 3534,
17
+ "fit_input_dim": 2048,
18
+ "selected_column_count": 2048,
19
+ "splits": {
20
+ "val": {
21
+ "accuracy": 0.0,
22
+ "balanced_accuracy": 0.0,
23
+ "macro_f1": 0.0,
24
+ "weighted_f1": 0.0,
25
+ "num_eval_windows": 4569,
26
+ "num_classes": 4149
27
+ },
28
+ "test": {
29
+ "accuracy": 0.0,
30
+ "balanced_accuracy": 0.0,
31
+ "macro_f1": 0.0,
32
+ "weighted_f1": 0.0,
33
+ "num_eval_windows": 4014,
34
+ "num_classes": 4149
35
+ }
36
+ },
37
+ "primary_score": 0.0
38
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/action_object_relation/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/caption_grounding/metrics.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "caption_grounding",
3
+ "task_display_name": "Language Grounding",
4
+ "task_family": "retrieval",
5
+ "model_family": "simple_raw128_ridge",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "non-caption sensor blocks projected to hashed caption/object/interaction block",
8
+ "primary_metric": "mrr",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "num_train_windows": 25629,
12
+ "num_test_windows": 4032,
13
+ "input_dim": 3534,
14
+ "fit_input_dim": 2048,
15
+ "target_dim": 896,
16
+ "splits": {
17
+ "test": {
18
+ "mrr": 0.011150892823934555,
19
+ "top1": 0.003720238095238095,
20
+ "median_rank": 786.0,
21
+ "num_queries": 4032
22
+ }
23
+ },
24
+ "primary_score": 0.011150892823934555
25
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/contact_prediction/metrics.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "contact_prediction",
3
+ "task_display_name": "Contact State Prediction",
4
+ "task_family": "classification",
5
+ "model_family": "simple_raw128_centroid",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "sensor features excluding hashed caption text",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "num_train_windows": 25629,
12
+ "num_val_windows": 4608,
13
+ "num_test_windows": 4032,
14
+ "num_classes": 2,
15
+ "num_train_classes": 2,
16
+ "input_dim": 3534,
17
+ "fit_input_dim": 2048,
18
+ "selected_column_count": 2048,
19
+ "splits": {
20
+ "val": {
21
+ "accuracy": 0.8932291666666666,
22
+ "balanced_accuracy": 0.9390938351077,
23
+ "macro_f1": 0.8166515974696689,
24
+ "weighted_f1": 0.9058806656743642,
25
+ "num_eval_windows": 4608,
26
+ "num_classes": 2
27
+ },
28
+ "test": {
29
+ "accuracy": 0.9109623015873016,
30
+ "balanced_accuracy": 0.9425416133162612,
31
+ "macro_f1": 0.886990707397193,
32
+ "weighted_f1": 0.9155965516219,
33
+ "num_eval_windows": 4032,
34
+ "num_classes": 2
35
+ }
36
+ },
37
+ "primary_score": 0.886990707397193
38
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/contact_prediction/predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/cross_modal_retrieval/metrics.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "cross_modal_retrieval",
3
+ "task_display_name": "Cross-Modal Retrieval",
4
+ "task_family": "retrieval",
5
+ "model_family": "simple_raw128_ridge",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "all non-depth sensor blocks projected to depth-confidence block",
8
+ "primary_metric": "mrr",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "num_train_windows": 25629,
12
+ "num_test_windows": 4032,
13
+ "input_dim": 3450,
14
+ "fit_input_dim": 2048,
15
+ "target_dim": 980,
16
+ "splits": {
17
+ "test": {
18
+ "mrr": 0.003459817497059703,
19
+ "top1": 0.000744047619047619,
20
+ "median_rank": 1797.5,
21
+ "num_queries": 4032
22
+ }
23
+ },
24
+ "primary_score": 0.003459817497059703
25
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/hand_trajectory_forecast/metrics.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "hand_trajectory_forecast",
3
+ "task_display_name": "Hand Trajectory Forecasting",
4
+ "task_family": "regression",
5
+ "model_family": "simple_raw128_ridge",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-hand/non-caption features; target hand joint feature block +20 frames",
8
+ "primary_metric": "mae",
9
+ "metric_direction": "lower",
10
+ "status": "pass",
11
+ "num_train_windows": 25502,
12
+ "num_test_windows": 4015,
13
+ "input_dim": 2652,
14
+ "fit_input_dim": 2048,
15
+ "target_dim": 882,
16
+ "splits": {
17
+ "test": {
18
+ "mae": 0.2729249894618988,
19
+ "rmse": 0.5156853199005127,
20
+ "r2": -0.21456409310612812,
21
+ "mean_l2": 13.391268730163574
22
+ }
23
+ },
24
+ "primary_score": 0.2729249894618988
25
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/imu_to_hand_pose/metrics.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "imu_to_hand_pose",
3
+ "task_display_name": "Imu To Hand Pose",
4
+ "task_family": "regression",
5
+ "model_family": "simple_raw128_ridge",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "IMU acceleration/gyroscope block reconstructs hand-joint blocks",
8
+ "primary_metric": "mae",
9
+ "metric_direction": "lower",
10
+ "status": "pass",
11
+ "num_train_windows": 25629,
12
+ "num_test_windows": 4032,
13
+ "input_dim": 42,
14
+ "fit_input_dim": 42,
15
+ "target_dim": 882,
16
+ "splits": {
17
+ "test": {
18
+ "mae": 0.22941437363624573,
19
+ "rmse": 0.4729202091693878,
20
+ "r2": -0.018392341461985984,
21
+ "mean_l2": 11.224305152893066
22
+ }
23
+ },
24
+ "primary_score": 0.22941437363624573
25
+ }
results/omni_finetune/a100_128_raw20_task_baselines_20260616T073954Z/simple_raw128/long_horizon_next_action/metrics.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "long_horizon_next_action",
3
+ "task_display_name": "Long Horizon Next Action",
4
+ "task_family": "classification",
5
+ "model_family": "simple_raw128_centroid",
6
+ "source": "128_episode_raw_sensor_features",
7
+ "input_features": "current non-caption features; target action +100 frames",
8
+ "primary_metric": "macro_f1",
9
+ "metric_direction": "higher",
10
+ "status": "pass",
11
+ "num_train_windows": 25068,
12
+ "num_val_windows": 4496,
13
+ "num_test_windows": 3951,
14
+ "num_classes": 1211,
15
+ "num_train_classes": 887,
16
+ "input_dim": 3534,
17
+ "fit_input_dim": 2048,
18
+ "selected_column_count": 2048,
19
+ "splits": {
20
+ "val": {
21
+ "accuracy": 0.002224199288256228,
22
+ "balanced_accuracy": 0.003450987577971705,
23
+ "macro_f1": 0.0038119991336365605,
24
+ "weighted_f1": 0.0024887619404382845,
25
+ "num_eval_windows": 4496,
26
+ "num_classes": 1211
27
+ },
28
+ "test": {
29
+ "accuracy": 0.0030372057706909645,
30
+ "balanced_accuracy": 0.003899407470836042,
31
+ "macro_f1": 0.0024280172369056294,
32
+ "weighted_f1": 0.0025638705344299727,
33
+ "num_eval_windows": 3951,
34
+ "num_classes": 1211
35
+ }
36
+ },
37
+ "primary_score": 0.0024280172369056294
38
+ }