{ "generated_at_utc": "2026-06-18T22:52:18+00:00", "methods": { "cosmos3_nano_future_window": { "label": "Cosmos3-Nano Future Window", "reason": null, "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/future_predictions.jsonl", "status": "scored", "tasks": { "action_object_relation": { "action_object_relation_accuracy": 0.013297872340425532, "action_object_relation_macro_f1": 0.002794157670325683, "scored_rows": 376, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_nano_future_window/metrics.json" }, "long_horizon_next_action": { "horizon_windows": 5, "long_horizon_next_action_accuracy": 0.007936507936507936, "long_horizon_next_action_macro_f1": 0.0024906600249066007, "scored_rows": 378, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_nano_future_window/metrics.json" }, "modality_reconstruction": { "feature_reconstruction_error": 3479.218317102503, "feature_reconstruction_quality": 0.0002873382957286892, "num_samples": 378, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/modality_reconstruction/cosmos3_nano_future_window/metrics.json", "source_verified_metrics_json": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json" }, "next_subtask_forecast": { "next_subtask_forecast_accuracy": 0.015873015873015872, "next_subtask_forecast_macro_f1": 0.006614876224708678, "scored_rows": 378, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/next_subtask_forecast/cosmos3_nano_future_window/metrics.json" }, "object_set_forecast": { "object_set_forecast_micro_f1": 0.01781970649895178, "object_set_forecast_precision": 0.02225130890052356, "object_set_forecast_recall": 0.01486013986013986, "scored_rows": 378, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/object_set_forecast/cosmos3_nano_future_window/metrics.json" }, "time_to_transition": { "scored_rows": 378, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_nano_future_window/metrics.json", "time_to_transition_mae": 33.80952380952381, "within_20_frames": 0.6666666666666666 } }, "unsupported_tasks": {} }, "cosmos3_super_reasoner": { "label": "Cosmos3-Super Reasoner", "reason": null, "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/predictions.jsonl", "status": "scored", "tasks": { "action_object_relation": { "action_object_relation_accuracy": 0.0, "action_object_relation_macro_f1": 0.0, "scored_rows": 446, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", "valid_pred_relation_rate": 0.49327354260089684 }, "caption_grounding": { "caption_grounding_center_hit_rate": 0.3236607142857143, "caption_grounding_iou": 0.30639899644580487, "missing_pred_evidence_window_count": 219, "scored_rows": 448, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/caption_grounding/cosmos3_super_reasoner/metrics.json" }, "long_horizon_next_action": { "long_horizon_next_action_accuracy": 0.03794642857142857, "long_horizon_next_action_macro_f1": 0.008807588075880758, "scored_rows": 448, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_super_reasoner/metrics.json" }, "time_to_transition": { "scored_rows": 448, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_super_reasoner/metrics.json", "time_to_transition_mae": 52.94642857142857, "within_20_frames": 0.6473214285714286 } }, "unsupported_tasks": {} }, "qwen3_omni_v6_lora": { "label": "Qwen3-Omni v6 LoRA", "reason": null, "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/predictions.jsonl", "status": "scored", "tasks": { "action_object_relation": { "action_object_relation_accuracy": 0.000996512207274539, "action_object_relation_macro_f1": 0.0002220083079671497, "scored_rows": 4014, "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", "valid_pred_relation_rate": 0.9990034877927254 } }, "unsupported_tasks": {} } }, "scope": "Task-specific scoring from existing verified held-out model outputs. No new model inference, training, or target backfilling is performed.", "scored_method_task_count_added": 11, "status": "pass", "task_ids_added_to_matrix": [ "action_object_relation", "caption_grounding", "long_horizon_next_action", "modality_reconstruction", "next_subtask_forecast", "object_set_forecast", "time_to_transition" ], "title": "Existing Model-Output Task Probes" }