| { |
| "description": "Measured audio ablation and raw log-mel audio upgrade over the single public Xperience-10M sample episode.", |
| "scope": "single public sample episode; chronological split; ridge heads over fixed feature contracts", |
| "raw_audio_metadata": { |
| "source": "local_public_sample/fisheye_cam0.mp4", |
| "exists": true, |
| "has_audio": true, |
| "sample_rate": 16000, |
| "fps": 20.00137419266181, |
| "num_samples": 4656994, |
| "num_windows": 1161, |
| "feature_dim": 588, |
| "mel_bands": 64, |
| "fft_size": 512, |
| "hop_length": 160, |
| "feature_description": "Per-window raw waveform STFT log-mel statistics plus delta and waveform envelope statistics." |
| }, |
| "num_tasks": 12, |
| "variants": { |
| "all_handcrafted_audio": "All Current Features", |
| "all_except_audio": "All Except Audio", |
| "handcrafted_audio_only": "Handcrafted AAC Audio Only", |
| "raw_logmel_audio_only": "Raw Log-Mel Audio Only", |
| "replace_handcrafted_with_raw": "Replace AAC Block With Raw Log-Mel", |
| "all_plus_raw_logmel": "All Current Features + Raw Log-Mel" |
| }, |
| "task_summaries": [ |
| { |
| "task": "timeline_action", |
| "task_display": "Current Action Recognition", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.00905456968081885, |
| "all_except_audio": 0.008771929824561405, |
| "handcrafted_audio_delta": 0.0002826398562574446, |
| "raw_logmel_audio_only": 0.0, |
| "replace_handcrafted_with_raw": 0.0013495276653171392, |
| "raw_replacement_delta_vs_no_audio": -0.007422402159244265, |
| "raw_replacement_delta_vs_handcrafted": -0.00770504201550171, |
| "all_plus_raw_logmel": 0.002734107997265892, |
| "all_plus_raw_delta_vs_handcrafted": -0.006320461683552957 |
| }, |
| { |
| "task": "timeline_subtask", |
| "task_display": "Current Subtask Recognition", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.011256354393609296, |
| "all_except_audio": 0.0111731843575419, |
| "handcrafted_audio_delta": 8.317003606739606e-05, |
| "raw_logmel_audio_only": 0.0016722408026755855, |
| "replace_handcrafted_with_raw": 0.0008257638315441783, |
| "raw_replacement_delta_vs_no_audio": -0.01034742052599772, |
| "raw_replacement_delta_vs_handcrafted": -0.010430590562065117, |
| "all_plus_raw_logmel": 0.0017889087656529517, |
| "all_plus_raw_delta_vs_handcrafted": -0.009467445627956345 |
| }, |
| { |
| "task": "transition_detection", |
| "task_display": "Action Transition Detection", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.46213292117465227, |
| "all_except_audio": 0.46870229007633585, |
| "handcrafted_audio_delta": -0.006569368901683581, |
| "raw_logmel_audio_only": 0.4637904468412942, |
| "replace_handcrafted_with_raw": 0.4792100707180375, |
| "raw_replacement_delta_vs_no_audio": 0.010507780641701658, |
| "raw_replacement_delta_vs_handcrafted": 0.01707714954338524, |
| "all_plus_raw_logmel": 0.4816233470132239, |
| "all_plus_raw_delta_vs_handcrafted": 0.019490425838571634 |
| }, |
| { |
| "task": "next_action", |
| "task_display": "Next-Action Prediction", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.01058201058201058, |
| "all_except_audio": 0.010709504685408301, |
| "handcrafted_audio_delta": -0.0001274941033977215, |
| "raw_logmel_audio_only": 0.0017301038062283738, |
| "replace_handcrafted_with_raw": 0.006006006006006006, |
| "raw_replacement_delta_vs_no_audio": -0.004703498679402295, |
| "raw_replacement_delta_vs_handcrafted": -0.004576004576004574, |
| "all_plus_raw_logmel": 0.0058479532163742695, |
| "all_plus_raw_delta_vs_handcrafted": -0.00473405736563631 |
| }, |
| { |
| "task": "hand_trajectory_forecast", |
| "task_display": "Future Hand Motion Forecasting", |
| "primary_metric": "mae", |
| "higher_is_better": false, |
| "all_handcrafted_audio": 4.466395378112793, |
| "all_except_audio": 4.303755283355713, |
| "handcrafted_audio_delta": -0.16264009475708008, |
| "raw_logmel_audio_only": 3.1172122955322266, |
| "replace_handcrafted_with_raw": 4.305870532989502, |
| "raw_replacement_delta_vs_no_audio": -0.0021152496337890625, |
| "raw_replacement_delta_vs_handcrafted": 0.16052484512329102, |
| "all_plus_raw_logmel": 4.1367621421813965, |
| "all_plus_raw_delta_vs_handcrafted": 0.3296332359313965 |
| }, |
| { |
| "task": "contact_prediction", |
| "task_display": "Contact State Prediction", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 1.0, |
| "all_except_audio": 1.0, |
| "handcrafted_audio_delta": 0.0, |
| "raw_logmel_audio_only": 1.0, |
| "replace_handcrafted_with_raw": 1.0, |
| "raw_replacement_delta_vs_no_audio": 0.0, |
| "raw_replacement_delta_vs_handcrafted": 0.0, |
| "all_plus_raw_logmel": 1.0, |
| "all_plus_raw_delta_vs_handcrafted": 0.0 |
| }, |
| { |
| "task": "object_relevance", |
| "task_display": "Relevant Object Prediction", |
| "primary_metric": "micro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.15813953488372093, |
| "all_except_audio": 0.14793328498912256, |
| "handcrafted_audio_delta": 0.010206249894598368, |
| "raw_logmel_audio_only": 0.15894868585732164, |
| "replace_handcrafted_with_raw": 0.17871759890859482, |
| "raw_replacement_delta_vs_no_audio": 0.030784313919472256, |
| "raw_replacement_delta_vs_handcrafted": 0.020578064024873888, |
| "all_plus_raw_logmel": 0.18262653898768813, |
| "all_plus_raw_delta_vs_handcrafted": 0.024487004103967203 |
| }, |
| { |
| "task": "caption_grounding", |
| "task_display": "Language-to-Time Grounding", |
| "primary_metric": "mrr", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.03208567947149277, |
| "all_except_audio": 0.027228528633713722, |
| "handcrafted_audio_delta": 0.004857150837779045, |
| "raw_logmel_audio_only": 0.014815197326242924, |
| "replace_handcrafted_with_raw": 0.02484782598912716, |
| "raw_replacement_delta_vs_no_audio": -0.002380702644586563, |
| "raw_replacement_delta_vs_handcrafted": -0.007237853482365608, |
| "all_plus_raw_logmel": 0.02719014883041382, |
| "all_plus_raw_delta_vs_handcrafted": -0.004895530641078949 |
| }, |
| { |
| "task": "cross_modal_retrieval", |
| "task_display": "Cross-Modal Window Retrieval", |
| "primary_metric": "mrr", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.3751238286495209, |
| "all_except_audio": 0.38921058177948, |
| "handcrafted_audio_delta": -0.014086753129959106, |
| "raw_logmel_audio_only": 0.01806792803108692, |
| "replace_handcrafted_with_raw": 0.32749155163764954, |
| "raw_replacement_delta_vs_no_audio": -0.061719030141830444, |
| "raw_replacement_delta_vs_handcrafted": -0.04763227701187134, |
| "all_plus_raw_logmel": 0.31795138120651245, |
| "all_plus_raw_delta_vs_handcrafted": -0.05717244744300842 |
| }, |
| { |
| "task": "modality_reconstruction", |
| "task_display": "Sensor-to-Visual Reconstruction", |
| "primary_metric": "mae", |
| "higher_is_better": false, |
| "all_handcrafted_audio": 9.79421329498291, |
| "all_except_audio": 10.446661949157715, |
| "handcrafted_audio_delta": 0.6524486541748047, |
| "raw_logmel_audio_only": 2.6225292682647705, |
| "replace_handcrafted_with_raw": 8.830678939819336, |
| "raw_replacement_delta_vs_no_audio": 1.615983009338379, |
| "raw_replacement_delta_vs_handcrafted": 0.9635343551635742, |
| "all_plus_raw_logmel": 8.392388343811035, |
| "all_plus_raw_delta_vs_handcrafted": 1.401824951171875 |
| }, |
| { |
| "task": "temporal_order", |
| "task_display": "Temporal Order Verification", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.5172413793103449, |
| "all_except_audio": 0.4942528735632184, |
| "handcrafted_audio_delta": 0.022988505747126464, |
| "raw_logmel_audio_only": 0.5028735632183908, |
| "replace_handcrafted_with_raw": 0.5301714439065678, |
| "raw_replacement_delta_vs_no_audio": 0.03591857034334939, |
| "raw_replacement_delta_vs_handcrafted": 0.012930064596222923, |
| "all_plus_raw_logmel": 0.5330450130569861, |
| "all_plus_raw_delta_vs_handcrafted": 0.015803633746641288 |
| }, |
| { |
| "task": "misalignment_detection", |
| "task_display": "Cross-Modal Misalignment Detection", |
| "primary_metric": "macro_f1", |
| "higher_is_better": true, |
| "all_handcrafted_audio": 0.41734045375379186, |
| "all_except_audio": 0.42258557365378524, |
| "handcrafted_audio_delta": -0.005245119899993378, |
| "raw_logmel_audio_only": 0.47823544277887897, |
| "replace_handcrafted_with_raw": 0.44378951880827355, |
| "raw_replacement_delta_vs_no_audio": 0.021203945154488313, |
| "raw_replacement_delta_vs_handcrafted": 0.02644906505448169, |
| "all_plus_raw_logmel": 0.4373795761078998, |
| "all_plus_raw_delta_vs_handcrafted": 0.02003912235410793 |
| } |
| ], |
| "aggregate": { |
| "mean_handcrafted_audio_delta": 0.041849794979543296, |
| "tasks_where_handcrafted_audio_improves": 6, |
| "mean_raw_replacement_delta_vs_handcrafted": 0.09362598132150173, |
| "tasks_where_raw_replacement_improves_over_handcrafted": 6 |
| }, |
| "provenance": { |
| "suite_dir": "results/episode_task_suite", |
| "shared_windows": "results/episode_task_suite/shared_windows.npz", |
| "feature_manifest": "results/episode_task_suite/feature_manifest.json", |
| "audio_source": "local_public_sample/fisheye_cam0.mp4", |
| "annotation_source": "local_public_sample/annotation.hdf5", |
| "homie_toolkit_available": true |
| } |
| } |