File size: 5,951 Bytes
8ca5135
094eb82
8ca5135
 
 
942c6d8
8ca5135
942c6d8
 
094eb82
 
 
 
 
 
942c6d8
 
 
 
 
 
7606bed
 
 
 
 
 
 
c975eb1
8508633
 
 
 
 
 
 
 
 
 
 
 
 
c975eb1
 
 
 
 
942c6d8
 
094eb82
8ca5135
 
 
942c6d8
8ca5135
 
942c6d8
 
 
 
 
 
 
417a659
094eb82
 
 
 
 
 
 
a8277a7
 
 
 
 
 
417a659
 
 
 
 
942c6d8
 
 
8ca5135
 
 
942c6d8
8ca5135
 
942c6d8
 
 
 
 
 
 
 
 
 
8ca5135
 
 
094eb82
8ca5135
942c6d8
 
094eb82
417a659
7606bed
8508633
 
417a659
942c6d8
8ca5135
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
{
  "generated_at_utc": "2026-06-18T22:52:18+00:00",
  "methods": {
    "cosmos3_nano_future_window": {
      "label": "Cosmos3-Nano Future Window",
      "reason": null,
      "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/future_predictions.jsonl",
      "status": "scored",
      "tasks": {
        "action_object_relation": {
          "action_object_relation_accuracy": 0.013297872340425532,
          "action_object_relation_macro_f1": 0.002794157670325683,
          "scored_rows": 376,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_nano_future_window/metrics.json"
        },
        "long_horizon_next_action": {
          "horizon_windows": 5,
          "long_horizon_next_action_accuracy": 0.007936507936507936,
          "long_horizon_next_action_macro_f1": 0.0024906600249066007,
          "scored_rows": 378,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_nano_future_window/metrics.json"
        },
        "modality_reconstruction": {
          "feature_reconstruction_error": 3479.218317102503,
          "feature_reconstruction_quality": 0.0002873382957286892,
          "num_samples": 378,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/modality_reconstruction/cosmos3_nano_future_window/metrics.json",
          "source_verified_metrics_json": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json"
        },
        "next_subtask_forecast": {
          "next_subtask_forecast_accuracy": 0.015873015873015872,
          "next_subtask_forecast_macro_f1": 0.006614876224708678,
          "scored_rows": 378,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/next_subtask_forecast/cosmos3_nano_future_window/metrics.json"
        },
        "object_set_forecast": {
          "object_set_forecast_micro_f1": 0.01781970649895178,
          "object_set_forecast_precision": 0.02225130890052356,
          "object_set_forecast_recall": 0.01486013986013986,
          "scored_rows": 378,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/object_set_forecast/cosmos3_nano_future_window/metrics.json"
        },
        "time_to_transition": {
          "scored_rows": 378,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_nano_future_window/metrics.json",
          "time_to_transition_mae": 33.80952380952381,
          "within_20_frames": 0.6666666666666666
        }
      },
      "unsupported_tasks": {}
    },
    "cosmos3_super_reasoner": {
      "label": "Cosmos3-Super Reasoner",
      "reason": null,
      "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/predictions.jsonl",
      "status": "scored",
      "tasks": {
        "action_object_relation": {
          "action_object_relation_accuracy": 0.0,
          "action_object_relation_macro_f1": 0.0,
          "scored_rows": 446,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json",
          "valid_pred_relation_rate": 0.49327354260089684
        },
        "caption_grounding": {
          "caption_grounding_center_hit_rate": 0.3236607142857143,
          "caption_grounding_iou": 0.30639899644580487,
          "missing_pred_evidence_window_count": 219,
          "scored_rows": 448,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/caption_grounding/cosmos3_super_reasoner/metrics.json"
        },
        "long_horizon_next_action": {
          "long_horizon_next_action_accuracy": 0.03794642857142857,
          "long_horizon_next_action_macro_f1": 0.008807588075880758,
          "scored_rows": 448,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_super_reasoner/metrics.json"
        },
        "time_to_transition": {
          "scored_rows": 448,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_super_reasoner/metrics.json",
          "time_to_transition_mae": 52.94642857142857,
          "within_20_frames": 0.6473214285714286
        }
      },
      "unsupported_tasks": {}
    },
    "qwen3_omni_v6_lora": {
      "label": "Qwen3-Omni v6 LoRA",
      "reason": null,
      "source_prediction_jsonl": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/predictions.jsonl",
      "status": "scored",
      "tasks": {
        "action_object_relation": {
          "action_object_relation_accuracy": 0.000996512207274539,
          "action_object_relation_macro_f1": 0.0002220083079671497,
          "scored_rows": 4014,
          "source_metrics_json": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json",
          "valid_pred_relation_rate": 0.9990034877927254
        }
      },
      "unsupported_tasks": {}
    }
  },
  "scope": "Task-specific scoring from existing verified held-out model outputs. No new model inference, training, or target backfilling is performed.",
  "scored_method_task_count_added": 11,
  "status": "pass",
  "task_ids_added_to_matrix": [
    "action_object_relation",
    "caption_grounding",
    "long_horizon_next_action",
    "modality_reconstruction",
    "next_subtask_forecast",
    "object_set_forecast",
    "time_to_transition"
  ],
  "title": "Existing Model-Output Task Probes"
}