ropedia-xperience-10m-task-baselines / metrics /audio_ablation_summary.json
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
45c1706 verified
Raw
History Blame
9.7 kB
{
"description": "Measured audio contribution variants over the single public Xperience-10M sample episode.",
"scope": "single public sample episode; chronological split; ridge heads over fixed feature contracts",
"raw_audio_metadata": {
"source": "local_public_sample/fisheye_cam0.mp4",
"exists": true,
"has_audio": true,
"sample_rate": 16000,
"fps": 20.00137419266181,
"num_samples": 4656994,
"num_windows": 1161,
"feature_dim": 588,
"mel_bands": 64,
"fft_size": 512,
"hop_length": 160,
"feature_description": "Per-window raw waveform STFT log-mel statistics plus delta and waveform envelope statistics."
},
"num_tasks": 12,
"variants": {
"all_handcrafted_audio": "All Current Features",
"all_except_audio": "All Except Audio",
"handcrafted_audio_only": "Audio Only",
"raw_logmel_audio_only": "Raw Log-Mel Audio Only",
"replace_handcrafted_with_raw": "Audio Representation Replacement",
"all_plus_raw_logmel": "All Current Features + Raw Log-Mel"
},
"task_summaries": [
{
"task": "timeline_action",
"task_display": "Current Action Recognition",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.00905456968081885,
"all_except_audio": 0.008771929824561405,
"handcrafted_audio_delta": 0.0002826398562574446,
"raw_logmel_audio_only": 0.0,
"replace_handcrafted_with_raw": 0.0013495276653171392,
"raw_replacement_delta_vs_no_audio": -0.007422402159244265,
"raw_replacement_delta_vs_handcrafted": -0.00770504201550171,
"all_plus_raw_logmel": 0.002734107997265892,
"all_plus_raw_delta_vs_handcrafted": -0.006320461683552957
},
{
"task": "timeline_subtask",
"task_display": "Current Subtask Recognition",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.011256354393609296,
"all_except_audio": 0.0111731843575419,
"handcrafted_audio_delta": 8.317003606739606e-05,
"raw_logmel_audio_only": 0.0016722408026755855,
"replace_handcrafted_with_raw": 0.0008257638315441783,
"raw_replacement_delta_vs_no_audio": -0.01034742052599772,
"raw_replacement_delta_vs_handcrafted": -0.010430590562065117,
"all_plus_raw_logmel": 0.0017889087656529517,
"all_plus_raw_delta_vs_handcrafted": -0.009467445627956345
},
{
"task": "transition_detection",
"task_display": "Action Transition Detection",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.46213292117465227,
"all_except_audio": 0.46870229007633585,
"handcrafted_audio_delta": -0.006569368901683581,
"raw_logmel_audio_only": 0.4637904468412942,
"replace_handcrafted_with_raw": 0.4792100707180375,
"raw_replacement_delta_vs_no_audio": 0.010507780641701658,
"raw_replacement_delta_vs_handcrafted": 0.01707714954338524,
"all_plus_raw_logmel": 0.4816233470132239,
"all_plus_raw_delta_vs_handcrafted": 0.019490425838571634
},
{
"task": "next_action",
"task_display": "Next-Action Prediction",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.01058201058201058,
"all_except_audio": 0.010709504685408301,
"handcrafted_audio_delta": -0.0001274941033977215,
"raw_logmel_audio_only": 0.0017301038062283738,
"replace_handcrafted_with_raw": 0.006006006006006006,
"raw_replacement_delta_vs_no_audio": -0.004703498679402295,
"raw_replacement_delta_vs_handcrafted": -0.004576004576004574,
"all_plus_raw_logmel": 0.0058479532163742695,
"all_plus_raw_delta_vs_handcrafted": -0.00473405736563631
},
{
"task": "hand_trajectory_forecast",
"task_display": "Future Hand Motion Forecasting",
"primary_metric": "mae",
"higher_is_better": false,
"all_handcrafted_audio": 4.466395378112793,
"all_except_audio": 4.303755283355713,
"handcrafted_audio_delta": -0.16264009475708008,
"raw_logmel_audio_only": 3.1172122955322266,
"replace_handcrafted_with_raw": 4.305870532989502,
"raw_replacement_delta_vs_no_audio": -0.0021152496337890625,
"raw_replacement_delta_vs_handcrafted": 0.16052484512329102,
"all_plus_raw_logmel": 4.1367621421813965,
"all_plus_raw_delta_vs_handcrafted": 0.3296332359313965
},
{
"task": "contact_prediction",
"task_display": "Contact State Prediction",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 1.0,
"all_except_audio": 1.0,
"handcrafted_audio_delta": 0.0,
"raw_logmel_audio_only": 1.0,
"replace_handcrafted_with_raw": 1.0,
"raw_replacement_delta_vs_no_audio": 0.0,
"raw_replacement_delta_vs_handcrafted": 0.0,
"all_plus_raw_logmel": 1.0,
"all_plus_raw_delta_vs_handcrafted": 0.0
},
{
"task": "object_relevance",
"task_display": "Relevant Object Prediction",
"primary_metric": "micro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.15813953488372093,
"all_except_audio": 0.14793328498912256,
"handcrafted_audio_delta": 0.010206249894598368,
"raw_logmel_audio_only": 0.15894868585732164,
"replace_handcrafted_with_raw": 0.17871759890859482,
"raw_replacement_delta_vs_no_audio": 0.030784313919472256,
"raw_replacement_delta_vs_handcrafted": 0.020578064024873888,
"all_plus_raw_logmel": 0.18262653898768813,
"all_plus_raw_delta_vs_handcrafted": 0.024487004103967203
},
{
"task": "caption_grounding",
"task_display": "Language-to-Time Grounding",
"primary_metric": "mrr",
"higher_is_better": true,
"all_handcrafted_audio": 0.03208567947149277,
"all_except_audio": 0.027228528633713722,
"handcrafted_audio_delta": 0.004857150837779045,
"raw_logmel_audio_only": 0.014815197326242924,
"replace_handcrafted_with_raw": 0.02484782598912716,
"raw_replacement_delta_vs_no_audio": -0.002380702644586563,
"raw_replacement_delta_vs_handcrafted": -0.007237853482365608,
"all_plus_raw_logmel": 0.02719014883041382,
"all_plus_raw_delta_vs_handcrafted": -0.004895530641078949
},
{
"task": "cross_modal_retrieval",
"task_display": "Cross-Modal Window Retrieval",
"primary_metric": "mrr",
"higher_is_better": true,
"all_handcrafted_audio": 0.3751238286495209,
"all_except_audio": 0.38921058177948,
"handcrafted_audio_delta": -0.014086753129959106,
"raw_logmel_audio_only": 0.01806792803108692,
"replace_handcrafted_with_raw": 0.32749155163764954,
"raw_replacement_delta_vs_no_audio": -0.061719030141830444,
"raw_replacement_delta_vs_handcrafted": -0.04763227701187134,
"all_plus_raw_logmel": 0.31795138120651245,
"all_plus_raw_delta_vs_handcrafted": -0.05717244744300842
},
{
"task": "modality_reconstruction",
"task_display": "Sensor-to-Visual Reconstruction",
"primary_metric": "mae",
"higher_is_better": false,
"all_handcrafted_audio": 9.79421329498291,
"all_except_audio": 10.446661949157715,
"handcrafted_audio_delta": 0.6524486541748047,
"raw_logmel_audio_only": 2.6225292682647705,
"replace_handcrafted_with_raw": 8.830678939819336,
"raw_replacement_delta_vs_no_audio": 1.615983009338379,
"raw_replacement_delta_vs_handcrafted": 0.9635343551635742,
"all_plus_raw_logmel": 8.392388343811035,
"all_plus_raw_delta_vs_handcrafted": 1.401824951171875
},
{
"task": "temporal_order",
"task_display": "Temporal Order Verification",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.5172413793103449,
"all_except_audio": 0.4942528735632184,
"handcrafted_audio_delta": 0.022988505747126464,
"raw_logmel_audio_only": 0.5028735632183908,
"replace_handcrafted_with_raw": 0.5301714439065678,
"raw_replacement_delta_vs_no_audio": 0.03591857034334939,
"raw_replacement_delta_vs_handcrafted": 0.012930064596222923,
"all_plus_raw_logmel": 0.5330450130569861,
"all_plus_raw_delta_vs_handcrafted": 0.015803633746641288
},
{
"task": "misalignment_detection",
"task_display": "Cross-Modal Misalignment Detection",
"primary_metric": "macro_f1",
"higher_is_better": true,
"all_handcrafted_audio": 0.41734045375379186,
"all_except_audio": 0.42258557365378524,
"handcrafted_audio_delta": -0.005245119899993378,
"raw_logmel_audio_only": 0.47823544277887897,
"replace_handcrafted_with_raw": 0.44378951880827355,
"raw_replacement_delta_vs_no_audio": 0.021203945154488313,
"raw_replacement_delta_vs_handcrafted": 0.02644906505448169,
"all_plus_raw_logmel": 0.4373795761078998,
"all_plus_raw_delta_vs_handcrafted": 0.02003912235410793
}
],
"aggregate": {
"mean_handcrafted_audio_delta": 0.041849794979543296,
"tasks_where_handcrafted_audio_improves": 6,
"mean_raw_replacement_delta_vs_handcrafted": 0.09362598132150173,
"tasks_where_raw_replacement_improves_over_handcrafted": 6
},
"provenance": {
"suite_dir": "results/episode_task_suite",
"shared_windows": "results/episode_task_suite/shared_windows.npz",
"feature_manifest": "results/episode_task_suite/feature_manifest.json",
"audio_source": "local_public_sample/fisheye_cam0.mp4",
"annotation_source": "local_public_sample/annotation.hdf5",
"homie_toolkit_available": true
}
}