| task,task_display,metric,current_audio,no_audio,current_audio_delta,raw_audio_only,replace_with_raw,raw_replacement_delta_vs_current,all_plus_raw,all_plus_raw_delta_vs_current |
| timeline_action,Current Action Recognition,macro_f1,0.00905456968081885,0.008771929824561405,0.0002826398562574446,0.0,0.0013495276653171392,-0.00770504201550171,0.002734107997265892,-0.006320461683552957 |
| timeline_subtask,Current Subtask Recognition,macro_f1,0.011256354393609296,0.0111731843575419,8.317003606739606e-05,0.0016722408026755855,0.0008257638315441783,-0.010430590562065117,0.0017889087656529517,-0.009467445627956345 |
| transition_detection,Action Transition Detection,macro_f1,0.46213292117465227,0.46870229007633585,-0.006569368901683581,0.4637904468412942,0.4792100707180375,0.01707714954338524,0.4816233470132239,0.019490425838571634 |
| next_action,Next-Action Prediction,macro_f1,0.01058201058201058,0.010709504685408301,-0.0001274941033977215,0.0017301038062283738,0.006006006006006006,-0.004576004576004574,0.0058479532163742695,-0.00473405736563631 |
| hand_trajectory_forecast,Future Hand Motion Forecasting,mae,4.466395378112793,4.303755283355713,-0.16264009475708008,3.1172122955322266,4.305870532989502,0.16052484512329102,4.1367621421813965,0.3296332359313965 |
| contact_prediction,Contact State Prediction,macro_f1,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0 |
| object_relevance,Relevant Object Prediction,micro_f1,0.15813953488372093,0.14793328498912256,0.010206249894598368,0.15894868585732164,0.17871759890859482,0.020578064024873888,0.18262653898768813,0.024487004103967203 |
| caption_grounding,Language-to-Time Grounding,mrr,0.03208567947149277,0.027228528633713722,0.004857150837779045,0.014815197326242924,0.02484782598912716,-0.007237853482365608,0.02719014883041382,-0.004895530641078949 |
| cross_modal_retrieval,Cross-Modal Window Retrieval,mrr,0.3751238286495209,0.38921058177948,-0.014086753129959106,0.01806792803108692,0.32749155163764954,-0.04763227701187134,0.31795138120651245,-0.05717244744300842 |
| modality_reconstruction,Sensor-to-Visual Reconstruction,mae,9.79421329498291,10.446661949157715,0.6524486541748047,2.6225292682647705,8.830678939819336,0.9635343551635742,8.392388343811035,1.401824951171875 |
| temporal_order,Temporal Order Verification,macro_f1,0.5172413793103449,0.4942528735632184,0.022988505747126464,0.5028735632183908,0.5301714439065678,0.012930064596222923,0.5330450130569861,0.015803633746641288 |
| misalignment_detection,Cross-Modal Misalignment Detection,macro_f1,0.41734045375379186,0.42258557365378524,-0.005245119899993378,0.47823544277887897,0.44378951880827355,0.02644906505448169,0.4373795761078998,0.02003912235410793 |
|
|