Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Task Method 20-Result Matrix", | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-18T06:01:21+00:00", | |
| "task_count": 20, | |
| "method_count": 9, | |
| "method_task_record_count": 180, | |
| "scored_method_task_count": 120, | |
| "series": [ | |
| { | |
| "id": "minimal", | |
| "label": "Minimal", | |
| "short_label": "Min", | |
| "color": "#ccffa0", | |
| "kind": "full_20_task_baseline", | |
| "scope": "1 public sample episode", | |
| "stroke_dasharray": null, | |
| "method_detail": "Single-episode simple heads over the public sample split.", | |
| "plotted_as": "filled polygon", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "scored": 20 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "neural_mlp", | |
| "label": "Neural MLP", | |
| "short_label": "NN", | |
| "color": "#67e8d1", | |
| "kind": "full_20_task_baseline", | |
| "scope": "1 public sample episode", | |
| "stroke_dasharray": null, | |
| "method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.", | |
| "plotted_as": "filled polygon", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "scored": 20 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "metadata128_simple", | |
| "label": "128ep Metadata Simple", | |
| "short_label": "128-S", | |
| "color": "#ffd166", | |
| "kind": "partial_128_episode_metadata_baseline", | |
| "scope": "128 selected episodes, JSONL metadata/text only", | |
| "stroke_dasharray": "9 6", | |
| "method_detail": "128-episode JSONL metadata/text simple baselines.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 8, | |
| "covered_task_count": 8, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 12, | |
| "unsupported_task_count": 12, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "not_supported_by_metadata_only_package": 8, | |
| "scored": 8, | |
| "unsupported_without_required_target": 4 | |
| }, | |
| "coverage_fraction": 0.4, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "metadata128_neural_mlp", | |
| "label": "128ep Metadata NN", | |
| "short_label": "128-NN", | |
| "color": "#f472b6", | |
| "kind": "partial_128_episode_metadata_baseline", | |
| "scope": "128 selected episodes, JSONL metadata/text only", | |
| "stroke_dasharray": "3 6", | |
| "method_detail": "128-episode JSONL metadata/text MLP baselines.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 6, | |
| "covered_task_count": 6, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 14, | |
| "unsupported_task_count": 14, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "not_supported_by_metadata_only_package": 14, | |
| "scored": 6 | |
| }, | |
| "coverage_fraction": 0.3, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "raw128_simple", | |
| "label": "128ep Raw Simple", | |
| "short_label": "128-RS", | |
| "color": "#f59e0b", | |
| "kind": "complete_128_episode_raw_feature_baseline", | |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", | |
| "stroke_dasharray": "8 4", | |
| "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 2, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "proxy_scored": 2, | |
| "scored": 18 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "raw128_neural_mlp", | |
| "label": "128ep Raw NN", | |
| "short_label": "128-RN", | |
| "color": "#22d3ee", | |
| "kind": "complete_128_episode_raw_feature_baseline", | |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", | |
| "stroke_dasharray": "2 5", | |
| "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 2, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "proxy_scored": 2, | |
| "scored": 18 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "qwen3_omni_v6_lora", | |
| "label": "Qwen3-Omni v6 LoRA", | |
| "short_label": "Qwen3", | |
| "color": "#9bb8ff", | |
| "kind": "partial_128_episode_foundation_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "7 7", | |
| "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future-task probes scored from task-specific JSON.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 14, | |
| "covered_task_count": 14, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 6, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 6, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 6, | |
| "scored": 14 | |
| }, | |
| "coverage_fraction": 0.7, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "cosmos3_super_reasoner", | |
| "label": "Cosmos3-Super Reasoner", | |
| "short_label": "C3-S", | |
| "color": "#ff9c7a", | |
| "kind": "partial_128_episode_foundation_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "4 7", | |
| "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 16 scored from existing verified action/object JSON.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 7, | |
| "covered_task_count": 7, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 13, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 13, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 13, | |
| "scored": 7 | |
| }, | |
| "coverage_fraction": 0.35, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "cosmos3_nano_future_window", | |
| "label": "Cosmos3-Nano Future Window", | |
| "short_label": "C3-N", | |
| "color": "#d9c7ff", | |
| "kind": "partial_128_episode_world_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "2 7", | |
| "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 5, | |
| "covered_task_count": 5, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 15, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 15, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 15, | |
| "scored": 5 | |
| }, | |
| "coverage_fraction": 0.25, | |
| "result_record_fraction": 1.0 | |
| } | |
| ], | |
| "records": [ | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.05, | |
| "raw_text": "0.0500", | |
| "normalized_score": 0.05, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/timeline_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.014814814814814814, | |
| "raw_text": "0.0148", | |
| "normalized_score": 0.014814814814814814, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.008252821966746326, | |
| "raw_text": "0.0083", | |
| "normalized_score": 0.008252821966746326, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004175793689174209, | |
| "raw_text": "0.0042", | |
| "normalized_score": 0.004175793689174209, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002915061325704321, | |
| "raw_text": "0.0029", | |
| "normalized_score": 0.002915061325704321, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0014955083181204041, | |
| "raw_text": "0.0015", | |
| "normalized_score": 0.0014955083181204041, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0028830723979596335, | |
| "raw_text": "0.0029", | |
| "normalized_score": 0.0028830723979596335, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0008284021201089245, | |
| "raw_text": "0.0008", | |
| "normalized_score": 0.0008284021201089245, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "normalized_score": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.05056355513846935, | |
| "raw_text": "0.0506", | |
| "normalized_score": 0.05056355513846935, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/timeline_subtask/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.02810810810810811, | |
| "raw_text": "0.0281", | |
| "normalized_score": 0.02810810810810811, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.00019512195121951218, | |
| "raw_text": "0.0002", | |
| "normalized_score": 0.00019512195121951218, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 7.207207207207208e-05, | |
| "raw_text": "0.0001", | |
| "normalized_score": 7.207207207207208e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 7.35632183908046e-05, | |
| "raw_text": "0.0001", | |
| "normalized_score": 7.35632183908046e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0037313432835820895, | |
| "raw_text": "0.0037", | |
| "normalized_score": 0.0037313432835820895, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.6118237590630229, | |
| "raw_text": "0.6118", | |
| "normalized_score": 0.6118237590630229, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/transition_detection/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5862068965517241, | |
| "raw_text": "0.5862", | |
| "normalized_score": 0.5862068965517241, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.29652162550029315, | |
| "raw_text": "0.2965", | |
| "normalized_score": 0.29652162550029315, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4841733292368365, | |
| "raw_text": "0.4842", | |
| "normalized_score": 0.4841733292368365, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4203613574238283, | |
| "raw_text": "0.4204", | |
| "normalized_score": 0.4203613574238283, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4902206914147213, | |
| "raw_text": "0.4902", | |
| "normalized_score": 0.4902206914147213, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9898313492063492, | |
| "raw_text": "0.9898", | |
| "normalized_score": 0.9898313492063492, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.36830357142857145, | |
| "raw_text": "0.3683", | |
| "normalized_score": 0.36830357142857145, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9682539682539683, | |
| "raw_text": "0.9683", | |
| "normalized_score": 0.9682539682539683, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.05925925925925927, | |
| "raw_text": "0.0593", | |
| "normalized_score": 0.05925925925925927, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/next_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.04186046511627907, | |
| "raw_text": "0.0419", | |
| "normalized_score": 0.04186046511627907, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/next_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.006514774539765508, | |
| "raw_text": "0.0065", | |
| "normalized_score": 0.006514774539765508, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004910507980164745, | |
| "raw_text": "0.0049", | |
| "normalized_score": 0.004910507980164745, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.003285273363482094, | |
| "raw_text": "0.0033", | |
| "normalized_score": 0.003285273363482094, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0018477984371755407, | |
| "raw_text": "0.0018", | |
| "normalized_score": 0.0018477984371755407, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.04305335446381405, | |
| "raw_text": "0.0431", | |
| "normalized_score": 0.04305335446381405, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.013392857142857142, | |
| "raw_text": "0.0134", | |
| "normalized_score": 0.013392857142857142, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "normalized_score": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8646570444107056, | |
| "raw_text": "0.8647", | |
| "normalized_score": 0.12473175026322614, | |
| "metric_key": "mpjpe", | |
| "source": "results/episode_task_suite/hand_trajectory_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.10785018652677536, | |
| "raw_text": "0.1079", | |
| "normalized_score": 1.0, | |
| "metric_key": "mpjpe", | |
| "source": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "requires future hand-joint trajectories from raw sensor feature NPZ blocks, which are not in the public 128 package" | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.2729249894618988, | |
| "raw_text": "0.2729", | |
| "normalized_score": 0.39516420515180267, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.18475216627120972, | |
| "raw_text": "0.1848", | |
| "normalized_score": 0.5837560051580399, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 1.0, | |
| "raw_text": "1.000", | |
| "normalized_score": 1.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/contact_prediction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 1.0, | |
| "raw_text": "1.000", | |
| "normalized_score": 1.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4381481308057444, | |
| "raw_text": "0.4381", | |
| "normalized_score": 0.4381481308057444, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5682695682695682, | |
| "raw_text": "0.5683", | |
| "normalized_score": 0.5682695682695682, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.886990707397193, | |
| "raw_text": "0.8870", | |
| "normalized_score": 0.886990707397193, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 1.0, | |
| "raw_text": "1.000", | |
| "normalized_score": 1.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8177083333333334, | |
| "raw_text": "0.8177", | |
| "normalized_score": 0.8177083333333334, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.32142857142857145, | |
| "raw_text": "0.3214", | |
| "normalized_score": 0.32142857142857145, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.7433862433862434, | |
| "raw_text": "0.7434", | |
| "normalized_score": 0.7433862433862434, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.18034382095361662, | |
| "raw_text": "0.1803", | |
| "normalized_score": 0.18034382095361662, | |
| "metric_key": "micro_f1", | |
| "source": "results/episode_task_suite/object_relevance/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1679279279279279, | |
| "raw_text": "0.1679", | |
| "normalized_score": 0.1679279279279279, | |
| "metric_key": "micro_f1", | |
| "source": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17764578833693304, | |
| "raw_text": "0.1776", | |
| "normalized_score": 0.17764578833693304, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.18662723837686876, | |
| "raw_text": "0.1866", | |
| "normalized_score": 0.18662723837686876, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0655376369662084, | |
| "raw_text": "0.0655", | |
| "normalized_score": 0.0655376369662084, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1765890386972509, | |
| "raw_text": "0.1766", | |
| "normalized_score": 0.1765890386972509, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.3064982378331287, | |
| "raw_text": "0.3065", | |
| "normalized_score": 0.3064982378331287, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.13704276146316333, | |
| "raw_text": "0.1370", | |
| "normalized_score": 0.13704276146316333, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.016023479050338015, | |
| "raw_text": "0.0160", | |
| "normalized_score": 0.016023479050338015, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/caption_grounding/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.01684125567132316, | |
| "raw_text": "0.0168", | |
| "normalized_score": 0.01684125567132316, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002332374220713973, | |
| "raw_text": "0.0023", | |
| "normalized_score": 0.002332374220713973, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.011138836853206158, | |
| "raw_text": "0.0111", | |
| "normalized_score": 0.011138836853206158, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0063402121886610985, | |
| "raw_text": "0.0063", | |
| "normalized_score": 0.0063402121886610985, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8764467592592605, | |
| "raw_text": "0.8764", | |
| "normalized_score": 0.8764467592592605, | |
| "metric_key": "caption_grounding_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.26925966892956127, | |
| "raw_text": "0.2693", | |
| "normalized_score": 0.26925966892956127, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1299971898648288, | |
| "raw_text": "0.1300", | |
| "normalized_score": 0.1299971898648288, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "requires paired motion/IMU/camera/audio/depth feature blocks, which are not in the public 128 package" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.003459817497059703, | |
| "raw_text": "0.0035", | |
| "normalized_score": 0.003459817497059703, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002535284962505102, | |
| "raw_text": "0.0025", | |
| "normalized_score": 0.002535284962505102, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.022138720585222767, | |
| "raw_text": "0.0221", | |
| "normalized_score": 0.022138720585222767, | |
| "metric_key": "future_retrieval_mrr", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -0.015271898913936655, | |
| "raw_text": "-0.0153", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/episode_task_suite/modality_reconstruction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -0.010171410134180991, | |
| "raw_text": "-0.0102", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "requires source and target modality feature blocks such as depth/video vectors, which are not in the public 128 package" | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -1.3450960391924882, | |
| "raw_text": "-1.345", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -1.3974418160502369, | |
| "raw_text": "-1.397", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5399515738498789, | |
| "raw_text": "0.5400", | |
| "normalized_score": 0.5399515738498789, | |
| "metric_key": "f1", | |
| "source": "results/episode_task_suite/temporal_order/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8520179372197308, | |
| "raw_text": "0.8520", | |
| "normalized_score": 0.8520179372197308, | |
| "metric_key": "f1", | |
| "source": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4198864140782312, | |
| "raw_text": "0.4199", | |
| "normalized_score": 0.4198864140782312, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.49824413370686593, | |
| "raw_text": "0.4982", | |
| "normalized_score": 0.49824413370686593, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8030047098504103, | |
| "raw_text": "0.8030", | |
| "normalized_score": 0.8030047098504103, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.40984631701404173, | |
| "raw_text": "0.4098", | |
| "normalized_score": 0.40984631701404173, | |
| "metric_key": "temporal_order_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5051698670605613, | |
| "raw_text": "0.5052", | |
| "normalized_score": 0.5051698670605613, | |
| "metric_key": "f1", | |
| "source": "results/episode_task_suite/misalignment_detection/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.7152682255845944, | |
| "raw_text": "0.7153", | |
| "normalized_score": 0.7152682255845944, | |
| "metric_key": "f1", | |
| "source": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "requires deliberately shifted cross-modal feature pairs, which cannot be reconstructed from the public JSONL labels alone" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4958867673901769, | |
| "raw_text": "0.4959", | |
| "normalized_score": 0.4958867673901769, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8272709077974252, | |
| "raw_text": "0.8273", | |
| "normalized_score": 0.8272709077974252, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.3344936184319576, | |
| "raw_text": "0.3345", | |
| "normalized_score": 0.3344936184319576, | |
| "metric_key": "misalignment_detection_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.07499999999999998, | |
| "raw_text": "0.0750", | |
| "normalized_score": 0.07499999999999998, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/long_horizon_next_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.06545454545454546, | |
| "raw_text": "0.0655", | |
| "normalized_score": 0.06545454545454546, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/long_horizon_next_action/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0024280172369056294, | |
| "raw_text": "0.0024", | |
| "normalized_score": 0.0024280172369056294, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.001063859887389299, | |
| "raw_text": "0.0011", | |
| "normalized_score": 0.001063859887389299, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0023356666867101906, | |
| "raw_text": "0.0023", | |
| "normalized_score": 0.0023356666867101906, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.04545454545454545, | |
| "raw_text": "0.0455", | |
| "normalized_score": 0.04545454545454545, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/next_subtask_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.050724637681159424, | |
| "raw_text": "0.0507", | |
| "normalized_score": 0.050724637681159424, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/next_subtask_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004206715978529301, | |
| "raw_text": "0.0042", | |
| "normalized_score": 0.004206715978529301, | |
| "metric_key": "next_subtask_forecast_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.04444444444444444, | |
| "raw_text": "0.0444", | |
| "normalized_score": 0.04444444444444444, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/interaction_text_prediction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0380952380952381, | |
| "raw_text": "0.0381", | |
| "normalized_score": 0.0380952380952381, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/interaction_text_prediction/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.012611998261547169, | |
| "raw_text": "0.0126", | |
| "normalized_score": 0.012611998261547169, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.009791421280985521, | |
| "raw_text": "0.0098", | |
| "normalized_score": 0.009791421280985521, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/action_object_relation/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/action_object_relation/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0002220083079671497, | |
| "raw_text": "0.0002", | |
| "normalized_score": 0.0002220083079671497, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.16939890710382516, | |
| "raw_text": "0.1694", | |
| "normalized_score": 0.16939890710382516, | |
| "metric_key": "micro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/object_set_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.19718309859154928, | |
| "raw_text": "0.1972", | |
| "normalized_score": 0.19718309859154928, | |
| "metric_key": "micro_f1", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/object_set_forecast/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.06469493412657774, | |
| "raw_text": "0.0647", | |
| "normalized_score": 0.06469493412657774, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17523098630012288, | |
| "raw_text": "0.1752", | |
| "normalized_score": 0.17523098630012288, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1659483964851402, | |
| "raw_text": "0.1659", | |
| "normalized_score": 0.1659483964851402, | |
| "metric_key": "object_set_forecast_micro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.042049407958984375, | |
| "raw_text": "0.0420", | |
| "normalized_score": 1.0, | |
| "metric_key": "mae", | |
| "source": "results/episode_task_suite/tier2_task_suite/imu_to_hand_pose/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.042562149465084076, | |
| "raw_text": "0.0426", | |
| "normalized_score": 0.9879531106266066, | |
| "metric_key": "mae", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/imu_to_hand_pose/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.22941437363624573, | |
| "raw_text": "0.2294", | |
| "normalized_score": 0.1832902066792771, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.252998411655426, | |
| "raw_text": "0.2530", | |
| "normalized_score": 0.1662042369509182, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4943004846572876, | |
| "raw_text": "0.4943", | |
| "normalized_score": 0.4943004846572876, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/tier2_task_suite/camera_view_sync_retrieval/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.24086658656597137, | |
| "raw_text": "0.2409", | |
| "normalized_score": 0.24086658656597137, | |
| "metric_key": "mrr", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/camera_view_sync_retrieval/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.0026625150348991156, | |
| "raw_text": "0.0027", | |
| "normalized_score": 0.0026625150348991156, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.0025448438245803118, | |
| "raw_text": "0.0025", | |
| "normalized_score": 0.0025448438245803118, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "minimal", | |
| "method": "Minimal", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 10.53735637664795, | |
| "raw_text": "10.54", | |
| "normalized_score": 1.0, | |
| "metric_key": "mae", | |
| "source": "results/episode_task_suite/tier2_task_suite/time_to_transition/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "neural_mlp", | |
| "method": "Neural MLP", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 10.55449390411377, | |
| "raw_text": "10.55", | |
| "normalized_score": 0.9983762814568361, | |
| "metric_key": "mae", | |
| "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/time_to_transition/metrics.json", | |
| "scope": "single_episode_public_sample", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Metadata Simple", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Metadata NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_metadata_baseline", | |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 52.32759475708008, | |
| "raw_text": "52.33", | |
| "normalized_score": 0.20137284019197565, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 42.374061584472656, | |
| "raw_text": "42.37", | |
| "normalized_score": 0.24867468405504953, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 134.0687422166874, | |
| "raw_text": "134.07", | |
| "normalized_score": 0.07859666766782253, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| } | |
| ] | |
| } | |