cy0307's picture
Add files using upload-large-folder tool
094eb82 verified
Raw
History Blame
5.95 kB
{
"generated_at_utc": "2026-06-18T22:52:18+00:00",
"methods": {
"cosmos3_nano_future_window": {
"label": "Cosmos3-Nano Future Window",
"reason": null,
"source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/future_predictions.jsonl",
"status": "scored",
"tasks": {
"action_object_relation": {
"action_object_relation_accuracy": 0.013297872340425532,
"action_object_relation_macro_f1": 0.002794157670325683,
"scored_rows": 376,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_nano_future_window/metrics.json"
},
"long_horizon_next_action": {
"horizon_windows": 5,
"long_horizon_next_action_accuracy": 0.007936507936507936,
"long_horizon_next_action_macro_f1": 0.0024906600249066007,
"scored_rows": 378,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_nano_future_window/metrics.json"
},
"modality_reconstruction": {
"feature_reconstruction_error": 3479.218317102503,
"feature_reconstruction_quality": 0.0002873382957286892,
"num_samples": 378,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/modality_reconstruction/cosmos3_nano_future_window/metrics.json",
"source_verified_metrics_json": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json"
},
"next_subtask_forecast": {
"next_subtask_forecast_accuracy": 0.015873015873015872,
"next_subtask_forecast_macro_f1": 0.006614876224708678,
"scored_rows": 378,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/next_subtask_forecast/cosmos3_nano_future_window/metrics.json"
},
"object_set_forecast": {
"object_set_forecast_micro_f1": 0.01781970649895178,
"object_set_forecast_precision": 0.02225130890052356,
"object_set_forecast_recall": 0.01486013986013986,
"scored_rows": 378,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/object_set_forecast/cosmos3_nano_future_window/metrics.json"
},
"time_to_transition": {
"scored_rows": 378,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_nano_future_window/metrics.json",
"time_to_transition_mae": 33.80952380952381,
"within_20_frames": 0.6666666666666666
}
},
"unsupported_tasks": {}
},
"cosmos3_super_reasoner": {
"label": "Cosmos3-Super Reasoner",
"reason": null,
"source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/predictions.jsonl",
"status": "scored",
"tasks": {
"action_object_relation": {
"action_object_relation_accuracy": 0.0,
"action_object_relation_macro_f1": 0.0,
"scored_rows": 446,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json",
"valid_pred_relation_rate": 0.49327354260089684
},
"caption_grounding": {
"caption_grounding_center_hit_rate": 0.3236607142857143,
"caption_grounding_iou": 0.30639899644580487,
"missing_pred_evidence_window_count": 219,
"scored_rows": 448,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/caption_grounding/cosmos3_super_reasoner/metrics.json"
},
"long_horizon_next_action": {
"long_horizon_next_action_accuracy": 0.03794642857142857,
"long_horizon_next_action_macro_f1": 0.008807588075880758,
"scored_rows": 448,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_super_reasoner/metrics.json"
},
"time_to_transition": {
"scored_rows": 448,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_super_reasoner/metrics.json",
"time_to_transition_mae": 52.94642857142857,
"within_20_frames": 0.6473214285714286
}
},
"unsupported_tasks": {}
},
"qwen3_omni_v6_lora": {
"label": "Qwen3-Omni v6 LoRA",
"reason": null,
"source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/predictions.jsonl",
"status": "scored",
"tasks": {
"action_object_relation": {
"action_object_relation_accuracy": 0.000996512207274539,
"action_object_relation_macro_f1": 0.0002220083079671497,
"scored_rows": 4014,
"source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json",
"valid_pred_relation_rate": 0.9990034877927254
}
},
"unsupported_tasks": {}
}
},
"scope": "Task-specific scoring from existing verified held-out model outputs. No new model inference, training, or target backfilling is performed.",
"scored_method_task_count_added": 11,
"status": "pass",
"task_ids_added_to_matrix": [
"action_object_relation",
"caption_grounding",
"long_horizon_next_action",
"modality_reconstruction",
"next_subtask_forecast",
"object_set_forecast",
"time_to_transition"
],
"title": "Existing Model-Output Task Probes"
}