Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "description": "Measured audio contribution variants over the single public Xperience-10M sample episode.", | |
| "scope": "single public sample episode; chronological split; ridge heads over fixed feature contracts", | |
| "raw_audio_metadata": { | |
| "source": "local_public_sample/fisheye_cam0.mp4", | |
| "exists": true, | |
| "has_audio": true, | |
| "sample_rate": 16000, | |
| "fps": 20.00137419266181, | |
| "num_samples": 4656994, | |
| "num_windows": 1161, | |
| "feature_dim": 588, | |
| "mel_bands": 64, | |
| "fft_size": 512, | |
| "hop_length": 160, | |
| "feature_description": "Per-window raw waveform STFT log-mel statistics plus delta and waveform envelope statistics." | |
| }, | |
| "num_tasks": 12, | |
| "variants": { | |
| "all_handcrafted_audio": "All Current Features", | |
| "all_except_audio": "All Except Audio", | |
| "handcrafted_audio_only": "Audio Only", | |
| "raw_logmel_audio_only": "Raw Log-Mel Audio Only", | |
| "replace_handcrafted_with_raw": "Audio Representation Replacement", | |
| "all_plus_raw_logmel": "All Current Features + Raw Log-Mel" | |
| }, | |
| "task_summaries": [ | |
| { | |
| "task": "timeline_action", | |
| "task_display": "Current Action Recognition", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.00905456968081885, | |
| "all_except_audio": 0.008771929824561405, | |
| "handcrafted_audio_delta": 0.0002826398562574446, | |
| "raw_logmel_audio_only": 0.0, | |
| "replace_handcrafted_with_raw": 0.0013495276653171392, | |
| "raw_replacement_delta_vs_no_audio": -0.007422402159244265, | |
| "raw_replacement_delta_vs_handcrafted": -0.00770504201550171, | |
| "all_plus_raw_logmel": 0.002734107997265892, | |
| "all_plus_raw_delta_vs_handcrafted": -0.006320461683552957, | |
| "task_display_name": "Action Recognition" | |
| }, | |
| { | |
| "task": "timeline_subtask", | |
| "task_display": "Current Subtask Recognition", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.011256354393609296, | |
| "all_except_audio": 0.0111731843575419, | |
| "handcrafted_audio_delta": 8.317003606739606e-05, | |
| "raw_logmel_audio_only": 0.0016722408026755855, | |
| "replace_handcrafted_with_raw": 0.0008257638315441783, | |
| "raw_replacement_delta_vs_no_audio": -0.01034742052599772, | |
| "raw_replacement_delta_vs_handcrafted": -0.010430590562065117, | |
| "all_plus_raw_logmel": 0.0017889087656529517, | |
| "all_plus_raw_delta_vs_handcrafted": -0.009467445627956345, | |
| "task_display_name": "Procedure Step Recognition" | |
| }, | |
| { | |
| "task": "transition_detection", | |
| "task_display": "Action Transition Detection", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.46213292117465227, | |
| "all_except_audio": 0.46870229007633585, | |
| "handcrafted_audio_delta": -0.006569368901683581, | |
| "raw_logmel_audio_only": 0.4637904468412942, | |
| "replace_handcrafted_with_raw": 0.4792100707180375, | |
| "raw_replacement_delta_vs_no_audio": 0.010507780641701658, | |
| "raw_replacement_delta_vs_handcrafted": 0.01707714954338524, | |
| "all_plus_raw_logmel": 0.4816233470132239, | |
| "all_plus_raw_delta_vs_handcrafted": 0.019490425838571634, | |
| "task_display_name": "Action Boundary Detection" | |
| }, | |
| { | |
| "task": "next_action", | |
| "task_display": "Next-Action Prediction", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.01058201058201058, | |
| "all_except_audio": 0.010709504685408301, | |
| "handcrafted_audio_delta": -0.0001274941033977215, | |
| "raw_logmel_audio_only": 0.0017301038062283738, | |
| "replace_handcrafted_with_raw": 0.006006006006006006, | |
| "raw_replacement_delta_vs_no_audio": -0.004703498679402295, | |
| "raw_replacement_delta_vs_handcrafted": -0.004576004576004574, | |
| "all_plus_raw_logmel": 0.0058479532163742695, | |
| "all_plus_raw_delta_vs_handcrafted": -0.00473405736563631, | |
| "task_display_name": "Next-Action Prediction" | |
| }, | |
| { | |
| "task": "hand_trajectory_forecast", | |
| "task_display": "Future Hand Motion Forecasting", | |
| "primary_metric": "mae", | |
| "higher_is_better": false, | |
| "all_handcrafted_audio": 4.466395378112793, | |
| "all_except_audio": 4.303755283355713, | |
| "handcrafted_audio_delta": -0.16264009475708008, | |
| "raw_logmel_audio_only": 3.1172122955322266, | |
| "replace_handcrafted_with_raw": 4.305870532989502, | |
| "raw_replacement_delta_vs_no_audio": -0.0021152496337890625, | |
| "raw_replacement_delta_vs_handcrafted": 0.16052484512329102, | |
| "all_plus_raw_logmel": 4.1367621421813965, | |
| "all_plus_raw_delta_vs_handcrafted": 0.3296332359313965, | |
| "task_display_name": "Hand Trajectory Forecasting" | |
| }, | |
| { | |
| "task": "contact_prediction", | |
| "task_display": "Contact State Prediction", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 1.0, | |
| "all_except_audio": 1.0, | |
| "handcrafted_audio_delta": 0.0, | |
| "raw_logmel_audio_only": 1.0, | |
| "replace_handcrafted_with_raw": 1.0, | |
| "raw_replacement_delta_vs_no_audio": 0.0, | |
| "raw_replacement_delta_vs_handcrafted": 0.0, | |
| "all_plus_raw_logmel": 1.0, | |
| "all_plus_raw_delta_vs_handcrafted": 0.0, | |
| "task_display_name": "Contact State Prediction" | |
| }, | |
| { | |
| "task": "object_relevance", | |
| "task_display": "Relevant Object Prediction", | |
| "primary_metric": "micro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.15813953488372093, | |
| "all_except_audio": 0.14793328498912256, | |
| "handcrafted_audio_delta": 0.010206249894598368, | |
| "raw_logmel_audio_only": 0.15894868585732164, | |
| "replace_handcrafted_with_raw": 0.17871759890859482, | |
| "raw_replacement_delta_vs_no_audio": 0.030784313919472256, | |
| "raw_replacement_delta_vs_handcrafted": 0.020578064024873888, | |
| "all_plus_raw_logmel": 0.18262653898768813, | |
| "all_plus_raw_delta_vs_handcrafted": 0.024487004103967203, | |
| "task_display_name": "Object Relevance Prediction" | |
| }, | |
| { | |
| "task": "caption_grounding", | |
| "task_display": "Language-to-Time Grounding", | |
| "primary_metric": "mrr", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.03208567947149277, | |
| "all_except_audio": 0.027228528633713722, | |
| "handcrafted_audio_delta": 0.004857150837779045, | |
| "raw_logmel_audio_only": 0.014815197326242924, | |
| "replace_handcrafted_with_raw": 0.02484782598912716, | |
| "raw_replacement_delta_vs_no_audio": -0.002380702644586563, | |
| "raw_replacement_delta_vs_handcrafted": -0.007237853482365608, | |
| "all_plus_raw_logmel": 0.02719014883041382, | |
| "all_plus_raw_delta_vs_handcrafted": -0.004895530641078949, | |
| "task_display_name": "Language Grounding" | |
| }, | |
| { | |
| "task": "cross_modal_retrieval", | |
| "task_display": "Cross-Modal Window Retrieval", | |
| "primary_metric": "mrr", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.3751238286495209, | |
| "all_except_audio": 0.38921058177948, | |
| "handcrafted_audio_delta": -0.014086753129959106, | |
| "raw_logmel_audio_only": 0.01806792803108692, | |
| "replace_handcrafted_with_raw": 0.32749155163764954, | |
| "raw_replacement_delta_vs_no_audio": -0.061719030141830444, | |
| "raw_replacement_delta_vs_handcrafted": -0.04763227701187134, | |
| "all_plus_raw_logmel": 0.31795138120651245, | |
| "all_plus_raw_delta_vs_handcrafted": -0.05717244744300842, | |
| "task_display_name": "Cross-Modal Retrieval" | |
| }, | |
| { | |
| "task": "modality_reconstruction", | |
| "task_display": "Sensor-to-Visual Reconstruction", | |
| "primary_metric": "mae", | |
| "higher_is_better": false, | |
| "all_handcrafted_audio": 9.79421329498291, | |
| "all_except_audio": 10.446661949157715, | |
| "handcrafted_audio_delta": 0.6524486541748047, | |
| "raw_logmel_audio_only": 2.6225292682647705, | |
| "replace_handcrafted_with_raw": 8.830678939819336, | |
| "raw_replacement_delta_vs_no_audio": 1.615983009338379, | |
| "raw_replacement_delta_vs_handcrafted": 0.9635343551635742, | |
| "all_plus_raw_logmel": 8.392388343811035, | |
| "all_plus_raw_delta_vs_handcrafted": 1.401824951171875, | |
| "task_display_name": "Cross-Modal Reconstruction" | |
| }, | |
| { | |
| "task": "temporal_order", | |
| "task_display": "Temporal Order Verification", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.5172413793103449, | |
| "all_except_audio": 0.4942528735632184, | |
| "handcrafted_audio_delta": 0.022988505747126464, | |
| "raw_logmel_audio_only": 0.5028735632183908, | |
| "replace_handcrafted_with_raw": 0.5301714439065678, | |
| "raw_replacement_delta_vs_no_audio": 0.03591857034334939, | |
| "raw_replacement_delta_vs_handcrafted": 0.012930064596222923, | |
| "all_plus_raw_logmel": 0.5330450130569861, | |
| "all_plus_raw_delta_vs_handcrafted": 0.015803633746641288, | |
| "task_display_name": "Temporal Order Verification" | |
| }, | |
| { | |
| "task": "misalignment_detection", | |
| "task_display": "Cross-Modal Misalignment Detection", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "all_handcrafted_audio": 0.41734045375379186, | |
| "all_except_audio": 0.42258557365378524, | |
| "handcrafted_audio_delta": -0.005245119899993378, | |
| "raw_logmel_audio_only": 0.47823544277887897, | |
| "replace_handcrafted_with_raw": 0.44378951880827355, | |
| "raw_replacement_delta_vs_no_audio": 0.021203945154488313, | |
| "raw_replacement_delta_vs_handcrafted": 0.02644906505448169, | |
| "all_plus_raw_logmel": 0.4373795761078998, | |
| "all_plus_raw_delta_vs_handcrafted": 0.02003912235410793, | |
| "task_display_name": "Multimodal Synchronization Detection" | |
| } | |
| ], | |
| "aggregate": { | |
| "mean_handcrafted_audio_delta": 0.041849794979543296, | |
| "tasks_where_handcrafted_audio_improves": 6, | |
| "mean_raw_replacement_delta_vs_handcrafted": 0.09362598132150173, | |
| "tasks_where_raw_replacement_improves_over_handcrafted": 6 | |
| }, | |
| "provenance": { | |
| "suite_dir": "results/episode_task_suite", | |
| "shared_windows": "results/episode_task_suite/shared_windows.npz", | |
| "feature_manifest": "results/episode_task_suite/feature_manifest.json", | |
| "audio_source": "local_public_sample/fisheye_cam0.mp4", | |
| "annotation_source": "local_public_sample/annotation.hdf5", | |
| "homie_toolkit_available": true | |
| } | |
| } | |