Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Ropedia Xperience-10M Research Takeaways", | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-06T23:26:13+00:00", | |
| "source_files": [ | |
| "docs/data/summary_metrics.json", | |
| "results/episode_task_suite/summary_report.json", | |
| "results/episode_task_suite/neural_mlp/*/metrics.json", | |
| "docs/data/audio_ablation_summary.json", | |
| "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md" | |
| ], | |
| "scope": { | |
| "validated_episode_count": 1, | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "audio_featurized": true, | |
| "raw_data_redistributed": false | |
| }, | |
| "takeaways": [ | |
| { | |
| "id": "episode_to_benchmark", | |
| "title": "One episode can become a real benchmark contract", | |
| "readout": "The public sample is converted into 5,821 frames, 1,161 aligned 20-frame windows, and an 8,546-dimensional feature contract.", | |
| "evidence": [ | |
| { | |
| "label": "frames", | |
| "value": 5821 | |
| }, | |
| { | |
| "label": "windows", | |
| "value": 1161 | |
| }, | |
| { | |
| "label": "feature_dim", | |
| "value": 8546 | |
| } | |
| ], | |
| "source": "docs/data/summary_metrics.json", | |
| "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage." | |
| }, | |
| { | |
| "id": "chronological_split_exposes_class_shift", | |
| "title": "Chronological splits expose action-class shift", | |
| "readout": "Earlier all-feature action classifiers reach high macro-F1 on their local split, but the action/subtask heads in the 12-task chronological suite score far lower because later held-out windows include unseen labels.", | |
| "evidence": [ | |
| { | |
| "label": "all_feature_action_macro_f1", | |
| "value": 0.9828810433408773 | |
| }, | |
| { | |
| "label": "suite_action_macro_f1", | |
| "value": 0.05 | |
| }, | |
| { | |
| "label": "suite_subtask_macro_f1", | |
| "value": 0.05056355513846935 | |
| }, | |
| { | |
| "label": "unseen_action_test_classes", | |
| "value": 4 | |
| } | |
| ], | |
| "source": "results/episode_task_suite/summary_report.json", | |
| "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes." | |
| }, | |
| { | |
| "id": "neural_heads_help_dynamics", | |
| "title": "Small neural heads help dynamic and temporal probes", | |
| "readout": "The MLP heads substantially improve hand trajectory forecasting, temporal-order verification, and motion/visual misalignment detection.", | |
| "evidence": [ | |
| { | |
| "label": "hand_mpjpe_minimal", | |
| "value": 0.8646570444107056 | |
| }, | |
| { | |
| "label": "hand_mpjpe_neural", | |
| "value": 0.10785018652677536 | |
| }, | |
| { | |
| "label": "hand_mpjpe_relative_improvement", | |
| "value": 0.8752682497367739 | |
| }, | |
| { | |
| "label": "temporal_order_f1_minimal", | |
| "value": 0.5399515738498789 | |
| }, | |
| { | |
| "label": "temporal_order_f1_neural", | |
| "value": 0.8520179372197308 | |
| }, | |
| { | |
| "label": "misalignment_f1_minimal", | |
| "value": 0.5051698670605613 | |
| }, | |
| { | |
| "label": "misalignment_f1_neural", | |
| "value": 0.7152682255845944 | |
| } | |
| ], | |
| "source": "results/episode_task_suite/neural_mlp/*/metrics.json", | |
| "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing." | |
| }, | |
| { | |
| "id": "retrieval_and_reconstruction_remain_open", | |
| "title": "Retrieval and reconstruction remain the harder multimodal problems", | |
| "readout": "Ridge/cosine retrieval remains stronger than the neural projection on this sample, and cross-modal reconstruction still has negative R2.", | |
| "evidence": [ | |
| { | |
| "label": "retrieval_mrr_minimal", | |
| "value": 0.26925966892956127 | |
| }, | |
| { | |
| "label": "retrieval_mrr_neural", | |
| "value": 0.1299971898648288 | |
| }, | |
| { | |
| "label": "retrieval_top5_minimal", | |
| "value": 0.367816091954023 | |
| }, | |
| { | |
| "label": "reconstruction_r2_minimal", | |
| "value": -0.015271898913936655 | |
| }, | |
| { | |
| "label": "reconstruction_r2_neural", | |
| "value": -0.010171410134180991 | |
| } | |
| ], | |
| "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", | |
| "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants." | |
| }, | |
| { | |
| "id": "audio_contribution_is_task_specific", | |
| "title": "Audio helps some tasks and hurts others on the public sample", | |
| "readout": "Audio improves the primary metric on 6 of 12 tasks, while raw log-mel replacement improves over the current handcrafted block on 6 of 12 tasks. The largest current-audio gain appears in feature reconstruction, not in action classification.", | |
| "evidence": [ | |
| { | |
| "label": "tasks_where_current_audio_improves", | |
| "value": 6 | |
| }, | |
| { | |
| "label": "mean_current_audio_delta", | |
| "value": 0.041849794979543296 | |
| }, | |
| { | |
| "label": "tasks_where_raw_replacement_improves", | |
| "value": 6 | |
| }, | |
| { | |
| "label": "mean_raw_replacement_delta_vs_current", | |
| "value": 0.09362598132150173 | |
| }, | |
| { | |
| "label": "reconstruction_current_audio_delta", | |
| "value": 0.6524486541748047 | |
| }, | |
| { | |
| "label": "object_relevance_current_audio_delta", | |
| "value": 0.010206249894598368 | |
| } | |
| ], | |
| "source": "results/audio_ablation/audio_ablation_summary.json", | |
| "current_scope": "This is a single-episode ablation over fixed ridge heads. It validates that audio is wired into the task suite and shows where it changes metrics; it does not prove cross-episode audio generalization." | |
| }, | |
| { | |
| "id": "scale_requires_episodes", | |
| "title": "The next scientific unit is held-out episodes, not more adjacent windows", | |
| "readout": "The selected Qwen3-Omni path now has a verified two-epoch held-out diagnostic result. It shows that the cross-episode train/validation/eval loop works end to end, and it meets the strict-JSON target, while weak action/subtask metrics remain the next modeling problem.", | |
| "evidence": [ | |
| { | |
| "label": "selected_episodes", | |
| "value": 128 | |
| }, | |
| { | |
| "label": "held_out_test_windows", | |
| "value": null | |
| }, | |
| { | |
| "label": "json_validity_rate", | |
| "value": null | |
| }, | |
| { | |
| "label": "action_macro_f1", | |
| "value": null | |
| } | |
| ], | |
| "source": "docs/data/omni_finetune_verified_result.json", | |
| "current_scope": "The selected-episode Qwen3-Omni diagnostic pilot is verified on the 96/16/16 split and now meets the 98% target for JSON validity; action/subtask quality remains weak, so current results are diagnostic baselines, not strong model-quality claims." | |
| } | |
| ] | |
| } | |