{ "title": "Task Method 20-Result Matrix", "status": "pass", "generated_at_utc": "2026-06-18T09:06:25+00:00", "task_count": 20, "method_count": 9, "method_task_record_count": 180, "scored_method_task_count": 120, "series": [ { "id": "minimal", "label": "Minimal", "short_label": "Min", "color": "#ccffa0", "kind": "full_20_task_baseline", "scope": "1 public sample episode", "stroke_dasharray": null, "method_detail": "Single-episode simple heads over the public sample split.", "plotted_as": "filled polygon", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 0, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "scored": 20 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "neural_mlp", "label": "Neural MLP", "short_label": "NN", "color": "#67e8d1", "kind": "full_20_task_baseline", "scope": "1 public sample episode", "stroke_dasharray": null, "method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.", "plotted_as": "filled polygon", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 0, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "scored": 20 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "metadata128_simple", "label": "128ep Metadata Simple", "short_label": "128-S", "color": "#ffd166", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "9 6", "method_detail": "128-episode JSONL metadata/text simple baselines.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 8, "covered_task_count": 8, "proxy_scored_task_count": 0, "scoreless_task_count": 12, "unsupported_task_count": 12, "not_evaluated_task_count": 0, "status_counts": { "not_supported_by_metadata_only_package": 8, "scored": 8, "unsupported_without_required_target": 4 }, "coverage_fraction": 0.4, "result_record_fraction": 1.0 }, { "id": "metadata128_neural_mlp", "label": "128ep Metadata NN", "short_label": "128-NN", "color": "#f472b6", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "3 6", "method_detail": "128-episode JSONL metadata/text MLP baselines.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 6, "covered_task_count": 6, "proxy_scored_task_count": 0, "scoreless_task_count": 14, "unsupported_task_count": 14, "not_evaluated_task_count": 0, "status_counts": { "not_supported_by_metadata_only_package": 14, "scored": 6 }, "coverage_fraction": 0.3, "result_record_fraction": 1.0 }, { "id": "raw128_simple", "label": "128ep Raw Simple", "short_label": "128-RS", "color": "#f59e0b", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "8 4", "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 2, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "proxy_scored": 2, "scored": 18 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "raw128_neural_mlp", "label": "128ep Raw NN", "short_label": "128-RN", "color": "#22d3ee", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "2 5", "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 2, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "proxy_scored": 2, "scored": 18 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "qwen3_omni_v6_lora", "label": "Qwen3-Omni v6 LoRA", "short_label": "Qwen3", "color": "#9bb8ff", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "7 7", "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future-task probes scored from task-specific JSON.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 14, "covered_task_count": 14, "proxy_scored_task_count": 0, "scoreless_task_count": 6, "unsupported_task_count": 0, "not_evaluated_task_count": 6, "status_counts": { "not_evaluated_in_verified_package": 6, "scored": 14 }, "coverage_fraction": 0.7, "result_record_fraction": 1.0 }, { "id": "cosmos3_super_reasoner", "label": "Cosmos3-Super Reasoner", "short_label": "C3-S", "color": "#ff9c7a", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "4 7", "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 16 scored from existing verified action/object JSON.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 7, "covered_task_count": 7, "proxy_scored_task_count": 0, "scoreless_task_count": 13, "unsupported_task_count": 0, "not_evaluated_task_count": 13, "status_counts": { "not_evaluated_in_verified_package": 13, "scored": 7 }, "coverage_fraction": 0.35, "result_record_fraction": 1.0 }, { "id": "cosmos3_nano_future_window", "label": "Cosmos3-Nano Future Window", "short_label": "C3-N", "color": "#d9c7ff", "kind": "partial_128_episode_world_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "2 7", "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 5, "covered_task_count": 5, "proxy_scored_task_count": 0, "scoreless_task_count": 15, "unsupported_task_count": 0, "not_evaluated_task_count": 15, "status_counts": { "not_evaluated_in_verified_package": 15, "scored": 5 }, "coverage_fraction": 0.25, "result_record_fraction": 1.0 } ], "records": [ { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.05, "raw_text": "0.0500", "normalized_score": 0.05, "metric_key": "macro_f1", "source": "results/episode_task_suite/timeline_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.014814814814814814, "raw_text": "0.0148", "normalized_score": 0.014814814814814814, "metric_key": "macro_f1", "source": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.008252821966746326, "raw_text": "0.0083", "normalized_score": 0.008252821966746326, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004175793689174209, "raw_text": "0.0042", "normalized_score": 0.004175793689174209, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002915061325704321, "raw_text": "0.0029", "normalized_score": 0.002915061325704321, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0014955083181204041, "raw_text": "0.0015", "normalized_score": 0.0014955083181204041, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0028830723979596335, "raw_text": "0.0029", "normalized_score": 0.0028830723979596335, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0008284021201089245, "raw_text": "0.0008", "normalized_score": 0.0008284021201089245, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.007936507936507936, "raw_text": "0.0079", "normalized_score": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.05056355513846935, "raw_text": "0.0506", "normalized_score": 0.05056355513846935, "metric_key": "macro_f1", "source": "results/episode_task_suite/timeline_subtask/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.02810810810810811, "raw_text": "0.0281", "normalized_score": 0.02810810810810811, "metric_key": "macro_f1", "source": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.00019512195121951218, "raw_text": "0.0002", "normalized_score": 0.00019512195121951218, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 7.207207207207208e-05, "raw_text": "0.0001", "normalized_score": 7.207207207207208e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 7.35632183908046e-05, "raw_text": "0.0001", "normalized_score": 7.35632183908046e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0037313432835820895, "raw_text": "0.0037", "normalized_score": 0.0037313432835820895, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.6118237590630229, "raw_text": "0.6118", "normalized_score": 0.6118237590630229, "metric_key": "macro_f1", "source": "results/episode_task_suite/transition_detection/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.5862068965517241, "raw_text": "0.5862", "normalized_score": 0.5862068965517241, "metric_key": "macro_f1", "source": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.29652162550029315, "raw_text": "0.2965", "normalized_score": 0.29652162550029315, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4841733292368365, "raw_text": "0.4842", "normalized_score": 0.4841733292368365, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4203613574238283, "raw_text": "0.4204", "normalized_score": 0.4203613574238283, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4902206914147213, "raw_text": "0.4902", "normalized_score": 0.4902206914147213, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.9898313492063492, "raw_text": "0.9898", "normalized_score": 0.9898313492063492, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.36830357142857145, "raw_text": "0.3683", "normalized_score": 0.36830357142857145, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.9682539682539683, "raw_text": "0.9683", "normalized_score": 0.9682539682539683, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.05925925925925927, "raw_text": "0.0593", "normalized_score": 0.05925925925925927, "metric_key": "macro_f1", "source": "results/episode_task_suite/next_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.04186046511627907, "raw_text": "0.0419", "normalized_score": 0.04186046511627907, "metric_key": "macro_f1", "source": "results/episode_task_suite/neural_mlp/next_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.006514774539765508, "raw_text": "0.0065", "normalized_score": 0.006514774539765508, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004910507980164745, "raw_text": "0.0049", "normalized_score": 0.004910507980164745, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.003285273363482094, "raw_text": "0.0033", "normalized_score": 0.003285273363482094, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0018477984371755407, "raw_text": "0.0018", "normalized_score": 0.0018477984371755407, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.04305335446381405, "raw_text": "0.0431", "normalized_score": 0.04305335446381405, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.013392857142857142, "raw_text": "0.0134", "normalized_score": 0.013392857142857142, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.007936507936507936, "raw_text": "0.0079", "normalized_score": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8646570444107056, "raw_text": "0.8647", "normalized_score": 0.12473175026322614, "metric_key": "mpjpe", "source": "results/episode_task_suite/hand_trajectory_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.10785018652677536, "raw_text": "0.1079", "normalized_score": 1.0, "metric_key": "mpjpe", "source": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires future hand-joint trajectories from raw sensor feature NPZ blocks, which are not in the public 128 package" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.2729249894618988, "raw_text": "0.2729", "normalized_score": 0.39516420515180267, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.18475216627120972, "raw_text": "0.1848", "normalized_score": 0.5837560051580399, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 1.0, "raw_text": "1.000", "normalized_score": 1.0, "metric_key": "macro_f1", "source": "results/episode_task_suite/contact_prediction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 1.0, "raw_text": "1.000", "normalized_score": 1.0, "metric_key": "macro_f1", "source": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4381481308057444, "raw_text": "0.4381", "normalized_score": 0.4381481308057444, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.5682695682695682, "raw_text": "0.5683", "normalized_score": 0.5682695682695682, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.886990707397193, "raw_text": "0.8870", "normalized_score": 0.886990707397193, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 1.0, "raw_text": "1.000", "normalized_score": 1.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8177083333333334, "raw_text": "0.8177", "normalized_score": 0.8177083333333334, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.32142857142857145, "raw_text": "0.3214", "normalized_score": 0.32142857142857145, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.7433862433862434, "raw_text": "0.7434", "normalized_score": 0.7433862433862434, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.18034382095361662, "raw_text": "0.1803", "normalized_score": 0.18034382095361662, "metric_key": "micro_f1", "source": "results/episode_task_suite/object_relevance/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1679279279279279, "raw_text": "0.1679", "normalized_score": 0.1679279279279279, "metric_key": "micro_f1", "source": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.17764578833693304, "raw_text": "0.1776", "normalized_score": 0.17764578833693304, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.18662723837686876, "raw_text": "0.1866", "normalized_score": 0.18662723837686876, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0655376369662084, "raw_text": "0.0655", "normalized_score": 0.0655376369662084, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1765890386972509, "raw_text": "0.1766", "normalized_score": 0.1765890386972509, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.3064982378331287, "raw_text": "0.3065", "normalized_score": 0.3064982378331287, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.13704276146316333, "raw_text": "0.1370", "normalized_score": 0.13704276146316333, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.016023479050338015, "raw_text": "0.0160", "normalized_score": 0.016023479050338015, "metric_key": "mrr", "source": "results/episode_task_suite/caption_grounding/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.01684125567132316, "raw_text": "0.0168", "normalized_score": 0.01684125567132316, "metric_key": "mrr", "source": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002332374220713973, "raw_text": "0.0023", "normalized_score": 0.002332374220713973, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.011138836853206158, "raw_text": "0.0111", "normalized_score": 0.011138836853206158, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0063402121886610985, "raw_text": "0.0063", "normalized_score": 0.0063402121886610985, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8764467592592605, "raw_text": "0.8764", "normalized_score": 0.8764467592592605, "metric_key": "caption_grounding_mrr", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.26925966892956127, "raw_text": "0.2693", "normalized_score": 0.26925966892956127, "metric_key": "mrr", "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1299971898648288, "raw_text": "0.1300", "normalized_score": 0.1299971898648288, "metric_key": "mrr", "source": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires paired motion/IMU/camera/audio/depth feature blocks, which are not in the public 128 package" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.003459817497059703, "raw_text": "0.0035", "normalized_score": 0.003459817497059703, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002535284962505102, "raw_text": "0.0025", "normalized_score": 0.002535284962505102, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.022138720585222767, "raw_text": "0.0221", "normalized_score": 0.022138720585222767, "metric_key": "future_retrieval_mrr", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -0.015271898913936655, "raw_text": "-0.0153", "normalized_score": 0.0, "metric_key": "r2", "source": "results/episode_task_suite/modality_reconstruction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -0.010171410134180991, "raw_text": "-0.0102", "normalized_score": 0.0, "metric_key": "r2", "source": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires source and target modality feature blocks such as depth/video vectors, which are not in the public 128 package" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -1.3450960391924882, "raw_text": "-1.345", "normalized_score": 0.0, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -1.3974418160502369, "raw_text": "-1.397", "normalized_score": 0.0, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.5399515738498789, "raw_text": "0.5400", "normalized_score": 0.5399515738498789, "metric_key": "f1", "source": "results/episode_task_suite/temporal_order/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8520179372197308, "raw_text": "0.8520", "normalized_score": 0.8520179372197308, "metric_key": "f1", "source": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4198864140782312, "raw_text": "0.4199", "normalized_score": 0.4198864140782312, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.49824413370686593, "raw_text": "0.4982", "normalized_score": 0.49824413370686593, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8030047098504103, "raw_text": "0.8030", "normalized_score": 0.8030047098504103, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.40984631701404173, "raw_text": "0.4098", "normalized_score": 0.40984631701404173, "metric_key": "temporal_order_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.5051698670605613, "raw_text": "0.5052", "normalized_score": 0.5051698670605613, "metric_key": "f1", "source": "results/episode_task_suite/misalignment_detection/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.7152682255845944, "raw_text": "0.7153", "normalized_score": 0.7152682255845944, "metric_key": "f1", "source": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires deliberately shifted cross-modal feature pairs, which cannot be reconstructed from the public JSONL labels alone" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4958867673901769, "raw_text": "0.4959", "normalized_score": 0.4958867673901769, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8272709077974252, "raw_text": "0.8273", "normalized_score": 0.8272709077974252, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.3344936184319576, "raw_text": "0.3345", "normalized_score": 0.3344936184319576, "metric_key": "misalignment_detection_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.07499999999999998, "raw_text": "0.0750", "normalized_score": 0.07499999999999998, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/long_horizon_next_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.06545454545454546, "raw_text": "0.0655", "normalized_score": 0.06545454545454546, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/long_horizon_next_action/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0024280172369056294, "raw_text": "0.0024", "normalized_score": 0.0024280172369056294, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.001063859887389299, "raw_text": "0.0011", "normalized_score": 0.001063859887389299, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0023356666867101906, "raw_text": "0.0023", "normalized_score": 0.0023356666867101906, "metric_key": "long_horizon_next_action_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.04545454545454545, "raw_text": "0.0455", "normalized_score": 0.04545454545454545, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/next_subtask_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.050724637681159424, "raw_text": "0.0507", "normalized_score": 0.050724637681159424, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/next_subtask_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004206715978529301, "raw_text": "0.0042", "normalized_score": 0.004206715978529301, "metric_key": "next_subtask_forecast_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.04444444444444444, "raw_text": "0.0444", "normalized_score": 0.04444444444444444, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/interaction_text_prediction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0380952380952381, "raw_text": "0.0381", "normalized_score": 0.0380952380952381, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/interaction_text_prediction/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.012611998261547169, "raw_text": "0.0126", "normalized_score": 0.012611998261547169, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.009791421280985521, "raw_text": "0.0098", "normalized_score": 0.009791421280985521, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/action_object_relation/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/action_object_relation/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0002220083079671497, "raw_text": "0.0002", "normalized_score": 0.0002220083079671497, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.16939890710382516, "raw_text": "0.1694", "normalized_score": 0.16939890710382516, "metric_key": "micro_f1", "source": "results/episode_task_suite/tier2_task_suite/object_set_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.19718309859154928, "raw_text": "0.1972", "normalized_score": 0.19718309859154928, "metric_key": "micro_f1", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/object_set_forecast/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.06469493412657774, "raw_text": "0.0647", "normalized_score": 0.06469493412657774, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.17523098630012288, "raw_text": "0.1752", "normalized_score": 0.17523098630012288, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1659483964851402, "raw_text": "0.1659", "normalized_score": 0.1659483964851402, "metric_key": "object_set_forecast_micro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.042049407958984375, "raw_text": "0.0420", "normalized_score": 1.0, "metric_key": "mae", "source": "results/episode_task_suite/tier2_task_suite/imu_to_hand_pose/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.042562149465084076, "raw_text": "0.0426", "normalized_score": 0.9879531106266066, "metric_key": "mae", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/imu_to_hand_pose/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.22941437363624573, "raw_text": "0.2294", "normalized_score": 0.1832902066792771, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.252998411655426, "raw_text": "0.2530", "normalized_score": 0.1662042369509182, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4943004846572876, "raw_text": "0.4943", "normalized_score": 0.4943004846572876, "metric_key": "mrr", "source": "results/episode_task_suite/tier2_task_suite/camera_view_sync_retrieval/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.24086658656597137, "raw_text": "0.2409", "normalized_score": 0.24086658656597137, "metric_key": "mrr", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/camera_view_sync_retrieval/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.0026625150348991156, "raw_text": "0.0027", "normalized_score": 0.0026625150348991156, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.0025448438245803118, "raw_text": "0.0025", "normalized_score": 0.0025448438245803118, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "minimal", "method": "Minimal", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 10.53735637664795, "raw_text": "10.54", "normalized_score": 1.0, "metric_key": "mae", "source": "results/episode_task_suite/tier2_task_suite/time_to_transition/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "neural_mlp", "method": "Neural MLP", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 10.55449390411377, "raw_text": "10.55", "normalized_score": 0.9983762814568361, "metric_key": "mae", "source": "results/episode_task_suite/tier2_task_suite/neural_mlp/time_to_transition/metrics.json", "scope": "single_episode_public_sample", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 52.32759475708008, "raw_text": "52.33", "normalized_score": 0.20137284019197565, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 42.374061584472656, "raw_text": "42.37", "normalized_score": 0.24867468405504953, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 134.0687422166874, "raw_text": "134.07", "normalized_score": 0.07859666766782253, "metric_key": "time_to_transition_mae", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" } ] }