{ "title": "128-Episode 20-Task Radar", "status": "pass", "generated_at_utc": "2026-06-18T06:01:21+00:00", "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.", "task_count": 20, "method_count": 7, "method_task_record_count": 140, "scored_method_task_count": 80, "normalization_policy": { "higher_is_better": "bounded metrics are plotted directly on 0-1 axes after clipping to [0, 1]", "lower_is_better": "lower-error metrics are converted to best_observed_value / raw_value within the same task", "raw_values": "raw metric values, metric keys, and sources are retained in this JSON; the SVG is an overview, not a replacement for the metric table", "result_record_policy": "every method has 20 task records; records without a numeric score carry explicit unsupported/not-evaluated status and reason fields", "foundation_model_overlay": "Qwen3/Cosmos points are plotted only on task-aligned axes. Scoreless records mean the public result does not evaluate that task contract.", "metadata_128_overlay": "128-episode metadata baselines have 20 records, but numeric scores only where the public JSONL contains enough task labels without raw feature blocks.", "raw_128_overlay": "128-episode raw-feature baselines use staged sensor NPZ features. Eighteen axes use direct task targets; interaction text and camera-view sync are completed with documented compact proxies because raw interaction strings and paired video-view embeddings are absent from the 128 export." }, "source_unified_radar": "docs/data/unified_task_model_radar.json", "source_result_matrix": "docs/data/task_method_20_result_matrix.json", "series": [ { "id": "metadata128_simple", "label": "128ep Metadata Simple", "short_label": "128-S", "color": "#ffd166", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "9 6", "method_detail": "128-episode JSONL metadata/text simple baselines.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 8, "covered_task_count": 8, "proxy_scored_task_count": 0, "scoreless_task_count": 12, "unsupported_task_count": 12, "not_evaluated_task_count": 0, "status_counts": { "not_supported_by_metadata_only_package": 8, "scored": 8, "unsupported_without_required_target": 4 }, "coverage_fraction": 0.4, "result_record_fraction": 1.0 }, { "id": "metadata128_neural_mlp", "label": "128ep Metadata NN", "short_label": "128-NN", "color": "#f472b6", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "3 6", "method_detail": "128-episode JSONL metadata/text MLP baselines.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 6, "covered_task_count": 6, "proxy_scored_task_count": 0, "scoreless_task_count": 14, "unsupported_task_count": 14, "not_evaluated_task_count": 0, "status_counts": { "not_supported_by_metadata_only_package": 14, "scored": 6 }, "coverage_fraction": 0.3, "result_record_fraction": 1.0 }, { "id": "raw128_simple", "label": "128ep Raw Simple", "short_label": "128-RS", "color": "#f59e0b", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "8 4", "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 2, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "proxy_scored": 2, "scored": 18 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "raw128_neural_mlp", "label": "128ep Raw NN", "short_label": "128-RN", "color": "#22d3ee", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "2 5", "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 20, "covered_task_count": 20, "proxy_scored_task_count": 2, "scoreless_task_count": 0, "unsupported_task_count": 0, "not_evaluated_task_count": 0, "status_counts": { "proxy_scored": 2, "scored": 18 }, "coverage_fraction": 1.0, "result_record_fraction": 1.0 }, { "id": "qwen3_omni_v6_lora", "label": "Qwen3-Omni v6 LoRA", "short_label": "Qwen3", "color": "#9bb8ff", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "7 7", "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future-task probes scored from task-specific JSON.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 14, "covered_task_count": 14, "proxy_scored_task_count": 0, "scoreless_task_count": 6, "unsupported_task_count": 0, "not_evaluated_task_count": 6, "status_counts": { "not_evaluated_in_verified_package": 6, "scored": 14 }, "coverage_fraction": 0.7, "result_record_fraction": 1.0 }, { "id": "cosmos3_super_reasoner", "label": "Cosmos3-Super Reasoner", "short_label": "C3-S", "color": "#ff9c7a", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "4 7", "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 16 scored from existing verified action/object JSON.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 7, "covered_task_count": 7, "proxy_scored_task_count": 0, "scoreless_task_count": 13, "unsupported_task_count": 0, "not_evaluated_task_count": 13, "status_counts": { "not_evaluated_in_verified_package": 13, "scored": 7 }, "coverage_fraction": 0.35, "result_record_fraction": 1.0 }, { "id": "cosmos3_nano_future_window", "label": "Cosmos3-Nano Future Window", "short_label": "C3-N", "color": "#d9c7ff", "kind": "partial_128_episode_world_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "2 7", "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics.", "plotted_as": "colored point overlay", "result_record_count": 20, "scored_task_count": 5, "covered_task_count": 5, "proxy_scored_task_count": 0, "scoreless_task_count": 15, "unsupported_task_count": 0, "not_evaluated_task_count": 15, "status_counts": { "not_evaluated_in_verified_package": 15, "scored": 5 }, "coverage_fraction": 0.25, "result_record_fraction": 1.0 } ], "tasks": [ { "task_number": 1, "task_id": "timeline_action", "label": "Action Recognition", "axis_label": "01 Action Recognition", "short_label": "Action", "origin": "original_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.008252821966746326, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.008252821966746326, "raw_text": "0.0083", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 0.004175793689174209, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.004175793689174209, "raw_text": "0.0042", "status_label": "scored" }, "raw128_simple": { "raw": 0.002915061325704321, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.002915061325704321, "raw_text": "0.0029", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.0014955083181204041, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0014955083181204041, "raw_text": "0.0015", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.0028830723979596335, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0028830723979596335, "raw_text": "0.0029", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.0008284021201089245, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0008284021201089245, "raw_text": "0.0008", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.007936507936507936, "raw_text": "0.0079", "status_label": "scored" } } }, { "task_number": 2, "task_id": "timeline_subtask", "label": "Procedure Step Recognition", "axis_label": "02 Procedure Step Recognition", "short_label": "Step", "origin": "original_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.00019512195121951218, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.00019512195121951218, "raw_text": "0.0002", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 7.207207207207208e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 7.207207207207208e-05, "raw_text": "0.0001", "status_label": "scored" }, "raw128_simple": { "raw": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 7.35632183908046e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 7.35632183908046e-05, "raw_text": "0.0001", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.0037313432835820895, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0037313432835820895, "raw_text": "0.0037", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.0, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 3, "task_id": "transition_detection", "label": "Action Boundary Detection", "axis_label": "03 Action Boundary Detection", "short_label": "Boundary", "origin": "original_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.29652162550029315, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.29652162550029315, "raw_text": "0.2965", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 0.4841733292368365, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.4841733292368365, "raw_text": "0.4842", "status_label": "scored" }, "raw128_simple": { "raw": 0.4203613574238283, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.4203613574238283, "raw_text": "0.4204", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.4902206914147213, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.4902206914147213, "raw_text": "0.4902", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.9898313492063492, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.9898313492063492, "raw_text": "0.9898", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.36830357142857145, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.36830357142857145, "raw_text": "0.3683", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": 0.9682539682539683, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.9682539682539683, "raw_text": "0.9683", "status_label": "scored" } } }, { "task_number": 4, "task_id": "next_action", "label": "Next-Action Prediction", "axis_label": "04 Next-Action Prediction", "short_label": "Next act", "origin": "original_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.006514774539765508, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.006514774539765508, "raw_text": "0.0065", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 0.004910507980164745, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.004910507980164745, "raw_text": "0.0049", "status_label": "scored" }, "raw128_simple": { "raw": 0.003285273363482094, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.003285273363482094, "raw_text": "0.0033", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.0018477984371755407, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0018477984371755407, "raw_text": "0.0018", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.04305335446381405, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.04305335446381405, "raw_text": "0.0431", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.013392857142857142, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.013392857142857142, "raw_text": "0.0134", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.007936507936507936, "raw_text": "0.0079", "status_label": "scored" } } }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "label": "Hand Trajectory Forecasting", "axis_label": "05 Hand Trajectory Forecasting", "short_label": "Hand traj", "origin": "original_public_sample_tasks", "metric_key": "mpjpe", "metric_name": "MPJPE", "metric_direction": "lower", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "mpjpe", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "unsupported_without_required_target", "reason": "requires future hand-joint trajectories from raw sensor feature NPZ blocks, which are not in the public 128 package", "normalized_score": null, "raw_text": "n/a", "status_label": "unsupported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.2729249894618988, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.39516420515180267, "raw_text": "0.2729", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.18475216627120972, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.5837560051580399, "raw_text": "0.1848", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 6, "task_id": "contact_prediction", "label": "Contact State Prediction", "axis_label": "06 Contact State Prediction", "short_label": "Contact", "origin": "original_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.4381481308057444, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.4381481308057444, "raw_text": "0.4381", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 0.5682695682695682, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.5682695682695682, "raw_text": "0.5683", "status_label": "scored" }, "raw128_simple": { "raw": 0.886990707397193, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.886990707397193, "raw_text": "0.8870", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 1.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 1.0, "raw_text": "1.000", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.8177083333333334, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.8177083333333334, "raw_text": "0.8177", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.32142857142857145, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.32142857142857145, "raw_text": "0.3214", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": 0.7433862433862434, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.7433862433862434, "raw_text": "0.7434", "status_label": "scored" } } }, { "task_number": 7, "task_id": "object_relevance", "label": "Object Relevance Prediction", "axis_label": "07 Object Relevance Prediction", "short_label": "Objects", "origin": "original_public_sample_tasks", "metric_key": "micro_f1", "metric_name": "micro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.17764578833693304, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.17764578833693304, "raw_text": "0.1776", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": 0.18662723837686876, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.18662723837686876, "raw_text": "0.1866", "status_label": "scored" }, "raw128_simple": { "raw": 0.0655376369662084, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0655376369662084, "raw_text": "0.0655", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.1765890386972509, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.1765890386972509, "raw_text": "0.1766", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.3064982378331287, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.3064982378331287, "raw_text": "0.3065", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.13704276146316333, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.13704276146316333, "raw_text": "0.1370", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 8, "task_id": "caption_grounding", "label": "Language Grounding", "axis_label": "08 Language Grounding", "short_label": "Language", "origin": "original_public_sample_tasks", "metric_key": "mrr", "metric_name": "MRR", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.002332374220713973, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.002332374220713973, "raw_text": "0.0023", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.011138836853206158, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.011138836853206158, "raw_text": "0.0111", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.0063402121886610985, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0063402121886610985, "raw_text": "0.0063", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.8764467592592605, "metric_key": "caption_grounding_mrr", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.8764467592592605, "raw_text": "0.8764", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 9, "task_id": "cross_modal_retrieval", "label": "Cross-Modal Retrieval", "axis_label": "09 Cross-Modal Retrieval", "short_label": "X-modal", "origin": "original_public_sample_tasks", "metric_key": "mrr", "metric_name": "MRR", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "unsupported_without_required_target", "reason": "requires paired motion/IMU/camera/audio/depth feature blocks, which are not in the public 128 package", "normalized_score": null, "raw_text": "n/a", "status_label": "unsupported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.003459817497059703, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.003459817497059703, "raw_text": "0.0035", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.002535284962505102, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.002535284962505102, "raw_text": "0.0025", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": 0.022138720585222767, "metric_key": "future_retrieval_mrr", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.022138720585222767, "raw_text": "0.0221", "status_label": "scored" } } }, { "task_number": 10, "task_id": "modality_reconstruction", "label": "Cross-Modal Reconstruction", "axis_label": "10 Cross-Modal Reconstruction", "short_label": "Recon", "origin": "original_public_sample_tasks", "metric_key": "r2", "metric_name": "R2", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "r2", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "unsupported_without_required_target", "reason": "requires source and target modality feature blocks such as depth/video vectors, which are not in the public 128 package", "normalized_score": null, "raw_text": "n/a", "status_label": "unsupported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": -1.3450960391924882, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "-1.345", "status_label": "scored" }, "raw128_neural_mlp": { "raw": -1.3974418160502369, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "-1.397", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 11, "task_id": "temporal_order", "label": "Temporal Order Verification", "axis_label": "11 Temporal Order Verification", "short_label": "Order", "origin": "original_public_sample_tasks", "metric_key": "f1", "metric_name": "F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": 0.4198864140782312, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "scored", "reason": null, "normalized_score": 0.4198864140782312, "raw_text": "0.4199", "status_label": "scored" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.49824413370686593, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.49824413370686593, "raw_text": "0.4982", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.8030047098504103, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.8030047098504103, "raw_text": "0.8030", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.40984631701404173, "metric_key": "temporal_order_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.40984631701404173, "raw_text": "0.4098", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 12, "task_id": "misalignment_detection", "label": "Multimodal Synchronization Detection", "axis_label": "12 Multimodal Synchronization Detection", "short_label": "Sync", "origin": "original_public_sample_tasks", "metric_key": "f1", "metric_name": "F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "status": "unsupported_without_required_target", "reason": "requires deliberately shifted cross-modal feature pairs, which cannot be reconstructed from the public JSONL labels alone", "normalized_score": null, "raw_text": "n/a", "status_label": "unsupported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.4958867673901769, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.4958867673901769, "raw_text": "0.4959", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.8272709077974252, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.8272709077974252, "raw_text": "0.8273", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.3344936184319576, "metric_key": "misalignment_detection_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.3344936184319576, "raw_text": "0.3345", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 13, "task_id": "long_horizon_next_action", "label": "Long-Horizon Next-Action Forecasting", "axis_label": "13 Long-Horizon Next-Action Forecasting", "short_label": "Long act", "origin": "additional_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.0024280172369056294, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0024280172369056294, "raw_text": "0.0024", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.001063859887389299, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.001063859887389299, "raw_text": "0.0011", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.0023356666867101906, "metric_key": "long_horizon_next_action_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0023356666867101906, "raw_text": "0.0023", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 14, "task_id": "next_subtask_forecast", "label": "Long-Horizon Next-Subtask Forecasting", "axis_label": "14 Long-Horizon Next-Subtask Forecasting", "short_label": "Long step", "origin": "additional_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.004206715978529301, "metric_key": "next_subtask_forecast_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.004206715978529301, "raw_text": "0.0042", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 15, "task_id": "interaction_text_prediction", "label": "Interaction Text Prediction", "axis_label": "15 Interaction Text Prediction", "short_label": "Interact txt", "origin": "additional_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": true, "values": { "metadata128_simple": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.012611998261547169, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "proxy_scored", "reason": "documented compact proxy completion for this raw128 task axis", "normalized_score": 0.012611998261547169, "raw_text": "0.0126", "status_label": "proxy scored" }, "raw128_neural_mlp": { "raw": 0.009791421280985521, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "proxy_scored", "reason": "documented compact proxy completion for this raw128 task axis", "normalized_score": 0.009791421280985521, "raw_text": "0.0098", "status_label": "proxy scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 16, "task_id": "action_object_relation", "label": "Action-Object Relation Prediction", "axis_label": "16 Action-Object Relation Prediction", "short_label": "Act+obj", "origin": "additional_public_sample_tasks", "metric_key": "macro_f1", "metric_name": "macro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.0002220083079671497, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0002220083079671497, "raw_text": "0.0002", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": 0.0, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.0, "raw_text": "0.0000", "status_label": "scored" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 17, "task_id": "object_set_forecast", "label": "Future Object-Set Forecasting", "axis_label": "17 Future Object-Set Forecasting", "short_label": "Future obj", "origin": "additional_public_sample_tasks", "metric_key": "micro_f1", "metric_name": "micro-F1", "metric_direction": "higher", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.06469493412657774, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.06469493412657774, "raw_text": "0.0647", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.17523098630012288, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.17523098630012288, "raw_text": "0.1752", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 0.1659483964851402, "metric_key": "object_set_forecast_micro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.1659483964851402, "raw_text": "0.1659", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 18, "task_id": "imu_to_hand_pose", "label": "IMU-to-Hand Pose Reconstruction", "axis_label": "18 IMU-to-Hand Pose Reconstruction", "short_label": "IMU->hand", "origin": "additional_public_sample_tasks", "metric_key": "mae", "metric_name": "MAE", "metric_direction": "lower", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.22941437363624573, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.1832902066792771, "raw_text": "0.2294", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 0.252998411655426, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.1662042369509182, "raw_text": "0.2530", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "label": "Camera-View Synchronization Retrieval", "axis_label": "19 Camera-View Synchronization Retrieval", "short_label": "Cam sync", "origin": "additional_public_sample_tasks", "metric_key": "mrr", "metric_name": "MRR", "metric_direction": "higher", "raw128_proxy_axis": true, "values": { "metadata128_simple": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 0.0026625150348991156, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "proxy_scored", "reason": "documented compact proxy completion for this raw128 task axis", "normalized_score": 0.0026625150348991156, "raw_text": "0.0027", "status_label": "proxy scored" }, "raw128_neural_mlp": { "raw": 0.0025448438245803118, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "proxy_scored", "reason": "documented compact proxy completion for this raw128 task axis", "normalized_score": 0.0025448438245803118, "raw_text": "0.0025", "status_label": "proxy scored" }, "qwen3_omni_v6_lora": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } }, { "task_number": 20, "task_id": "time_to_transition", "label": "Time-to-Next-Transition Regression", "axis_label": "20 Time-to-Next-Transition Regression", "short_label": "Time2bdry", "origin": "additional_public_sample_tasks", "metric_key": "mae", "metric_name": "MAE frames", "metric_direction": "lower", "raw128_proxy_axis": false, "values": { "metadata128_simple": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "metadata128_neural_mlp": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "status": "not_supported_by_metadata_only_package", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", "normalized_score": null, "raw_text": "n/a", "status_label": "not supported" }, "raw128_simple": { "raw": 52.32759475708008, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.20137284019197565, "raw_text": "52.33", "status_label": "scored" }, "raw128_neural_mlp": { "raw": 42.374061584472656, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "scored", "reason": null, "normalized_score": 0.24867468405504953, "raw_text": "42.37", "status_label": "scored" }, "qwen3_omni_v6_lora": { "raw": 134.0687422166874, "metric_key": "time_to_transition_mae", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "status": "scored", "reason": null, "normalized_score": 0.07859666766782253, "raw_text": "134.07", "status_label": "scored" }, "cosmos3_super_reasoner": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" }, "cosmos3_nano_future_window": { "raw": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "status": "not_evaluated_in_verified_package", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", "normalized_score": null, "raw_text": "n/a", "status_label": "not evaluated" } } } ], "task_method_result_matrix": [ { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.008252821966746326, "raw_text": "0.0083", "normalized_score": 0.008252821966746326, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004175793689174209, "raw_text": "0.0042", "normalized_score": 0.004175793689174209, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002915061325704321, "raw_text": "0.0029", "normalized_score": 0.002915061325704321, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0014955083181204041, "raw_text": "0.0015", "normalized_score": 0.0014955083181204041, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0028830723979596335, "raw_text": "0.0029", "normalized_score": 0.0028830723979596335, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0008284021201089245, "raw_text": "0.0008", "normalized_score": 0.0008284021201089245, "metric_key": "action_macro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 1, "task_id": "timeline_action", "task_label": "Action Recognition", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.007936507936507936, "raw_text": "0.0079", "normalized_score": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.00019512195121951218, "raw_text": "0.0002", "normalized_score": 0.00019512195121951218, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 7.207207207207208e-05, "raw_text": "0.0001", "normalized_score": 7.207207207207208e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 7.35632183908046e-05, "raw_text": "0.0001", "normalized_score": 7.35632183908046e-05, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0037313432835820895, "raw_text": "0.0037", "normalized_score": 0.0037313432835820895, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "subtask_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 2, "task_id": "timeline_subtask", "task_label": "Procedure Step Recognition", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.29652162550029315, "raw_text": "0.2965", "normalized_score": 0.29652162550029315, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4841733292368365, "raw_text": "0.4842", "normalized_score": 0.4841733292368365, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4203613574238283, "raw_text": "0.4204", "normalized_score": 0.4203613574238283, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4902206914147213, "raw_text": "0.4902", "normalized_score": 0.4902206914147213, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.9898313492063492, "raw_text": "0.9898", "normalized_score": 0.9898313492063492, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.36830357142857145, "raw_text": "0.3683", "normalized_score": 0.36830357142857145, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 3, "task_id": "transition_detection", "task_label": "Action Boundary Detection", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.9682539682539683, "raw_text": "0.9683", "normalized_score": 0.9682539682539683, "metric_key": "transition_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.006514774539765508, "raw_text": "0.0065", "normalized_score": 0.006514774539765508, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004910507980164745, "raw_text": "0.0049", "normalized_score": 0.004910507980164745, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.003285273363482094, "raw_text": "0.0033", "normalized_score": 0.003285273363482094, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0018477984371755407, "raw_text": "0.0018", "normalized_score": 0.0018477984371755407, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.04305335446381405, "raw_text": "0.0431", "normalized_score": 0.04305335446381405, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.013392857142857142, "raw_text": "0.0134", "normalized_score": 0.013392857142857142, "metric_key": "next_action_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 4, "task_id": "next_action", "task_label": "Next-Action Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.007936507936507936, "raw_text": "0.0079", "normalized_score": 0.007936507936507936, "metric_key": "action_accuracy_from_retrieved_future", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires future hand-joint trajectories from raw sensor feature NPZ blocks, which are not in the public 128 package" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.2729249894618988, "raw_text": "0.2729", "normalized_score": 0.39516420515180267, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.18475216627120972, "raw_text": "0.1848", "normalized_score": 0.5837560051580399, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 5, "task_id": "hand_trajectory_forecast", "task_label": "Hand Trajectory Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mpjpe", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4381481308057444, "raw_text": "0.4381", "normalized_score": 0.4381481308057444, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.5682695682695682, "raw_text": "0.5683", "normalized_score": 0.5682695682695682, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.886990707397193, "raw_text": "0.8870", "normalized_score": 0.886990707397193, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 1.0, "raw_text": "1.000", "normalized_score": 1.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8177083333333334, "raw_text": "0.8177", "normalized_score": 0.8177083333333334, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.32142857142857145, "raw_text": "0.3214", "normalized_score": 0.32142857142857145, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 6, "task_id": "contact_prediction", "task_label": "Contact State Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.7433862433862434, "raw_text": "0.7434", "normalized_score": 0.7433862433862434, "metric_key": "contact_accuracy", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.17764578833693304, "raw_text": "0.1776", "normalized_score": 0.17764578833693304, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.18662723837686876, "raw_text": "0.1866", "normalized_score": 0.18662723837686876, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0655376369662084, "raw_text": "0.0655", "normalized_score": 0.0655376369662084, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1765890386972509, "raw_text": "0.1766", "normalized_score": 0.1765890386972509, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.3064982378331287, "raw_text": "0.3065", "normalized_score": 0.3064982378331287, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.13704276146316333, "raw_text": "0.1370", "normalized_score": 0.13704276146316333, "metric_key": "object_micro_f1", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 7, "task_id": "object_relevance", "task_label": "Object Relevance Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002332374220713973, "raw_text": "0.0023", "normalized_score": 0.002332374220713973, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.011138836853206158, "raw_text": "0.0111", "normalized_score": 0.011138836853206158, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0063402121886610985, "raw_text": "0.0063", "normalized_score": 0.0063402121886610985, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8764467592592605, "raw_text": "0.8764", "normalized_score": 0.8764467592592605, "metric_key": "caption_grounding_mrr", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 8, "task_id": "caption_grounding", "task_label": "Language Grounding", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires paired motion/IMU/camera/audio/depth feature blocks, which are not in the public 128 package" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.003459817497059703, "raw_text": "0.0035", "normalized_score": 0.003459817497059703, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.002535284962505102, "raw_text": "0.0025", "normalized_score": 0.002535284962505102, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 9, "task_id": "cross_modal_retrieval", "task_label": "Cross-Modal Retrieval", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.022138720585222767, "raw_text": "0.0221", "normalized_score": 0.022138720585222767, "metric_key": "future_retrieval_mrr", "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires source and target modality feature blocks such as depth/video vectors, which are not in the public 128 package" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -1.3450960391924882, "raw_text": "-1.345", "normalized_score": 0.0, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": -1.3974418160502369, "raw_text": "-1.397", "normalized_score": 0.0, "metric_key": "r2", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 10, "task_id": "modality_reconstruction", "task_label": "Cross-Modal Reconstruction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "r2", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4198864140782312, "raw_text": "0.4199", "normalized_score": 0.4198864140782312, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.49824413370686593, "raw_text": "0.4982", "normalized_score": 0.49824413370686593, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8030047098504103, "raw_text": "0.8030", "normalized_score": 0.8030047098504103, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.40984631701404173, "raw_text": "0.4098", "normalized_score": 0.40984631701404173, "metric_key": "temporal_order_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 11, "task_id": "temporal_order", "task_label": "Temporal Order Verification", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "unsupported_without_required_target", "status_label": "unsupported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", "scope": "multi_episode_128_metadata_baseline", "reason": "requires deliberately shifted cross-modal feature pairs, which cannot be reconstructed from the public JSONL labels alone" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.4958867673901769, "raw_text": "0.4959", "normalized_score": 0.4958867673901769, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.8272709077974252, "raw_text": "0.8273", "normalized_score": 0.8272709077974252, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.3344936184319576, "raw_text": "0.3345", "normalized_score": 0.3344936184319576, "metric_key": "misalignment_detection_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 12, "task_id": "misalignment_detection", "task_label": "Multimodal Synchronization Detection", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0024280172369056294, "raw_text": "0.0024", "normalized_score": 0.0024280172369056294, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.001063859887389299, "raw_text": "0.0011", "normalized_score": 0.001063859887389299, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0023356666867101906, "raw_text": "0.0023", "normalized_score": 0.0023356666867101906, "metric_key": "long_horizon_next_action_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 13, "task_id": "long_horizon_next_action", "task_label": "Long-Horizon Next-Action Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.004206715978529301, "raw_text": "0.0042", "normalized_score": 0.004206715978529301, "metric_key": "next_subtask_forecast_macro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 14, "task_id": "next_subtask_forecast", "task_label": "Long-Horizon Next-Subtask Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.012611998261547169, "raw_text": "0.0126", "normalized_score": 0.012611998261547169, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.009791421280985521, "raw_text": "0.0098", "normalized_score": 0.009791421280985521, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 15, "task_id": "interaction_text_prediction", "task_label": "Interaction Text Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "macro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0002220083079671497, "raw_text": "0.0002", "normalized_score": 0.0002220083079671497, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.0, "raw_text": "0.0000", "normalized_score": 0.0, "metric_key": "action_object_relation_macro_f1", "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 16, "task_id": "action_object_relation", "task_label": "Action-Object Relation Prediction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "macro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.06469493412657774, "raw_text": "0.0647", "normalized_score": 0.06469493412657774, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.17523098630012288, "raw_text": "0.1752", "normalized_score": 0.17523098630012288, "metric_key": "micro_f1", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.1659483964851402, "raw_text": "0.1659", "normalized_score": 0.1659483964851402, "metric_key": "object_set_forecast_micro_f1", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 17, "task_id": "object_set_forecast", "task_label": "Future Object-Set Forecasting", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "micro_f1", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.22941437363624573, "raw_text": "0.2294", "normalized_score": 0.1832902066792771, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 0.252998411655426, "raw_text": "0.2530", "normalized_score": 0.1662042369509182, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 18, "task_id": "imu_to_hand_pose", "task_label": "IMU-to-Hand Pose Reconstruction", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.0026625150348991156, "raw_text": "0.0027", "normalized_score": 0.0026625150348991156, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "proxy_scored", "status_label": "proxy scored", "scored": true, "proxy_scored": true, "raw": 0.0025448438245803118, "raw_text": "0.0025", "normalized_score": 0.0025448438245803118, "metric_key": "mrr", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": "documented compact proxy completion for this raw128 task axis" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 19, "task_id": "camera_view_sync_retrieval", "task_label": "Camera-View Synchronization Retrieval", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mrr", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "metadata128_simple", "method": "128ep Metadata Simple", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "metadata128_neural_mlp", "method": "128ep Metadata NN", "status": "not_supported_by_metadata_only_package", "status_label": "not supported", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_metadata_baseline", "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "raw128_simple", "method": "128ep Raw Simple", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 52.32759475708008, "raw_text": "52.33", "normalized_score": 0.20137284019197565, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "raw128_neural_mlp", "method": "128ep Raw NN", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 42.374061584472656, "raw_text": "42.37", "normalized_score": 0.24867468405504953, "metric_key": "mae", "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", "scope": "multi_episode_128_raw_sensor_feature_baseline", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "qwen3_omni_v6_lora", "method": "Qwen3-Omni v6 LoRA", "status": "scored", "status_label": "scored", "scored": true, "proxy_scored": false, "raw": 134.0687422166874, "raw_text": "134.07", "normalized_score": 0.07859666766782253, "metric_key": "time_to_transition_mae", "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", "scope": "multi_episode_128_partial_model_overlay", "reason": null }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "cosmos3_super_reasoner", "method": "Cosmos3-Super Reasoner", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" }, { "task_number": 20, "task_id": "time_to_transition", "task_label": "Time-to-Next-Transition Regression", "series_id": "cosmos3_nano_future_window", "method": "Cosmos3-Nano Future Window", "status": "not_evaluated_in_verified_package", "status_label": "not evaluated", "scored": false, "proxy_scored": false, "raw": null, "raw_text": "n/a", "normalized_score": null, "metric_key": "mae", "source": null, "scope": "multi_episode_128_partial_model_overlay", "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" } ] }