File size: 8,500 Bytes
13d3eec
d272538
13d3eec
 
 
 
f52ad36
13d3eec
 
 
 
f52ad36
13d3eec
 
 
 
f52ad36
13d3eec
 
 
 
 
 
 
 
 
3a3e7ac
 
13d3eec
3a3e7ac
13d3eec
 
 
 
 
 
 
 
79ed47f
 
13d3eec
79ed47f
13d3eec
 
 
69865f3
 
84ea166
13d3eec
69865f3
84ea166
 
13d3eec
84ea166
d73afa7
13d3eec
 
 
69865f3
 
84ea166
13d3eec
69865f3
84ea166
 
13d3eec
84ea166
 
13d3eec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17c38d5
 
13d3eec
17c38d5
13d3eec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a3e7ac
 
 
 
13d3eec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84ea166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13d3eec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84ea166
3a3e7ac
 
13d3eec
 
 
 
 
 
 
f52ad36
13d3eec
f52ad36
13d3eec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
{
  "generated_at_utc": "2026-06-20T20:38:59+00:00",
  "immediate_actions": [
    {
      "artifact": "docs/data/task_method_20_gap_audit.json",
      "id": "gap_audit",
      "purpose": "Verify the 180/180 scored result records and keep proxy flags reproducible."
    },
    {
      "artifact": "scripts/omni/score_model_output_probes.py",
      "id": "model_output_probe",
      "purpose": "Rescore verified model-output probes when new held-out artifacts arrive without fabricating unsupported cells."
    },
    {
      "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
      "id": "guarded_gpu_launcher",
      "purpose": "Launch future replacement scoring runs only after enough private GPU capacity is idle."
    }
  ],
  "methods": {
    "cosmos3_nano_future_window": {
      "kind": "partial_128_episode_world_model_overlay",
      "label": "Cosmos3-Nano Future Window",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "cosmos3_super_reasoner": {
      "kind": "partial_128_episode_foundation_model_overlay",
      "label": "Cosmos3-Super Reasoner",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "metadata128_neural_mlp": {
      "kind": "partial_128_episode_aligned_baseline",
      "label": "128ep Aligned NN",
      "proxy_scored_task_count": 1,
      "result_record_count": 20,
      "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 1,
        "scored": 19
      }
    },
    "metadata128_simple": {
      "kind": "partial_128_episode_aligned_baseline",
      "label": "128ep Aligned Simple",
      "proxy_scored_task_count": 1,
      "result_record_count": 20,
      "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 1,
        "scored": 19
      }
    },
    "minimal": {
      "kind": "full_20_task_baseline",
      "label": "Minimal",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "1 public sample episode",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "neural_mlp": {
      "kind": "full_20_task_baseline",
      "label": "Neural MLP",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "1 public sample episode",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "qwen3_omni_v6_lora": {
      "kind": "partial_128_episode_foundation_model_overlay",
      "label": "Qwen3-Omni v6 LoRA",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "raw128_neural_mlp": {
      "kind": "complete_128_episode_raw_feature_baseline",
      "label": "128ep Raw NN",
      "proxy_scored_task_count": 2,
      "result_record_count": 20,
      "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 2,
        "scored": 18
      }
    },
    "raw128_simple": {
      "kind": "complete_128_episode_raw_feature_baseline",
      "label": "128ep Raw Simple",
      "proxy_scored_task_count": 2,
      "result_record_count": 20,
      "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 2,
        "scored": 18
      }
    }
  },
  "missing_by_method": {},
  "missing_by_status": {},
  "missing_by_task": {},
  "missing_records": [],
  "proxy_records": [
    {
      "method": "128ep Raw Simple",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "128ep Raw NN",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "128ep Aligned Simple",
      "metric_key": "mrr",
      "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
      "series_id": "metadata128_simple",
      "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "128ep Aligned NN",
      "metric_key": "mrr",
      "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
      "series_id": "metadata128_neural_mlp",
      "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "128ep Raw Simple",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "128ep Raw NN",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    }
  ],
  "score_summary": {
    "method_count": 9,
    "method_task_record_count": 180,
    "proxy_scored_method_task_count": 6,
    "scored_method_task_count": 180,
    "scoreless_method_task_count": 0,
    "task_count": 20
  },
  "source_matrix": "docs/data/task_method_20_result_matrix.json",
  "status": "pass",
  "target_policy": {
    "numeric_score_gate": "A method-task cell is numeric only when a runner or verified package emits that exact task target and metric.",
    "proxy_policy": "Proxy scores are allowed only when the matrix marks them as proxy_scored and keeps the reason/source attached.",
    "scoreless_cell_policy": "If future unsupported or not-evaluated cells appear, they must stay explicit in the public matrix instead of being hidden or backfilled with proxy model claims. The current release has zero scoreless cells."
  },
  "title": "Task Method 20-Result Completion Audit"
}