File size: 17,416 Bytes
9a8a5a7
f590137
389c0f8
9a8a5a7
 
 
 
 
 
 
 
 
3f09cb0
9a8a5a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f09cb0
9a8a5a7
3f09cb0
 
 
 
9a8a5a7
3f09cb0
9a8a5a7
3f09cb0
9a8a5a7
 
 
 
 
 
 
 
32cee9a
9a8a5a7
 
389c0f8
9a8a5a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f09cb0
389c0f8
3f09cb0
 
 
 
 
9a8a5a7
389c0f8
9a8a5a7
3f09cb0
9a8a5a7
 
 
32cee9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a8a5a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32cee9a
9a8a5a7
 
f590137
3f09cb0
 
 
 
 
 
389c0f8
3f09cb0
 
 
 
 
 
 
 
 
 
 
32cee9a
 
 
 
 
 
 
 
 
 
 
 
3f09cb0
9a8a5a7
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
{
  "generated_at_utc": "2026-06-21T11:49:06+00:00",
  "interpretation_rule": "Use the 1-episode line for task construction and reproducibility claims. Use the 128-episode line for same-split metadata/raw baselines, Qwen3-Omni v6 LoRA diagnostics, and Cosmos3 diagnostics.",
  "lines": [
    {
      "artifact_entry_points": [
        "docs/data/single_episode_task_model_radar.json",
        "docs/data/two_evidence_line_result_summary.json",
        "results/episode_task_suite/summary_report.json",
        "results/episode_task_suite/feature_manifest.json",
        "docs/single_episode_explorer.html"
      ],
      "claim_boundary": "Supports task construction, file inspection, local reproducibility, and controlled single-episode baseline claims.",
      "data_unit": "One public Xperience-10M sample episode",
      "direct_scored_method_task_count": 40,
      "id": "single_public_sample_episode",
      "label": "1 sample episode",
      "method_count": 2,
      "method_task_record_count": 40,
      "methods": [
        {
          "direct_scored_task_count": 20,
          "id": "minimal",
          "label": "Minimal",
          "method_detail": "Single-episode simple heads over the public sample split.",
          "proxy_scored_task_count": 0,
          "result_record_count": 20,
          "scope": "1 public sample episode",
          "scored_task_count": 20,
          "status_counts": {
            "scored": 20
          }
        },
        {
          "direct_scored_task_count": 20,
          "id": "neural_mlp",
          "label": "Neural MLP",
          "method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.",
          "proxy_scored_task_count": 0,
          "result_record_count": 20,
          "scope": "1 public sample episode",
          "scored_task_count": 20,
          "status_counts": {
            "scored": 20
          }
        }
      ],
      "not_for": "Do not use this line as evidence of multi-episode generalization.",
      "primary_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
      "primary_visuals": [
        "docs/assets/charts/two_evidence_line_map.svg",
        "docs/assets/charts/single_episode_task_model_radar.svg"
      ],
      "proxy_scored_method_task_count": 0,
      "result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
      "scored_method_task_count": 40,
      "short_label": "Line 1",
      "task_count": 20
    },
    {
      "artifact_entry_points": [
        "docs/data/episode128_task_model_radar.json",
        "docs/data/two_evidence_line_result_summary.json",
        "docs/data/xperience10m_128_episode_feature_index.json",
        "docs/data/omni_model_comparison.json",
        "docs/data/qwen3_omni_run_lineage.json",
        "docs/data/task_method_20_gap_audit.json"
      ],
      "claim_boundary": "Supports same-split metadata/raw baseline comparison, Qwen3-Omni v6 diagnostics, Cosmos3 diagnostics, and scale-up planning on public-safe processed artifacts.",
      "data_unit": "Selected held-out 96/16/16 split with public-safe processed features linked to official gated episode paths",
      "direct_scored_method_task_count": 134,
      "id": "selected_128_episode_surface",
      "label": "128 selected episodes",
      "method_count": 7,
      "method_task_record_count": 140,
      "methods": [
        {
          "direct_scored_task_count": 19,
          "id": "metadata128_simple",
          "label": "128ep Aligned Simple",
          "method_detail": "128-episode aligned simple baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
          "proxy_scored_task_count": 1,
          "result_record_count": 20,
          "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
          "scored_task_count": 20,
          "status_counts": {
            "proxy_scored": 1,
            "scored": 19
          }
        },
        {
          "direct_scored_task_count": 19,
          "id": "metadata128_neural_mlp",
          "label": "128ep Aligned NN",
          "method_detail": "128-episode aligned MLP baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
          "proxy_scored_task_count": 1,
          "result_record_count": 20,
          "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
          "scored_task_count": 20,
          "status_counts": {
            "proxy_scored": 1,
            "scored": 19
          }
        },
        {
          "direct_scored_task_count": 18,
          "id": "raw128_simple",
          "label": "128ep Raw Simple",
          "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.",
          "proxy_scored_task_count": 2,
          "result_record_count": 20,
          "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
          "scored_task_count": 20,
          "status_counts": {
            "proxy_scored": 2,
            "scored": 18
          }
        },
        {
          "direct_scored_task_count": 18,
          "id": "raw128_neural_mlp",
          "label": "128ep Raw NN",
          "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.",
          "proxy_scored_task_count": 2,
          "result_record_count": 20,
          "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
          "scored_task_count": 20,
          "status_counts": {
            "proxy_scored": 2,
            "scored": 18
          }
        },
        {
          "direct_scored_task_count": 20,
          "id": "qwen3_omni_v6_lora",
          "label": "Qwen3-Omni v6 LoRA",
          "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future/retrieval/sensor-target probes scored from task-specific JSON.",
          "proxy_scored_task_count": 0,
          "result_record_count": 20,
          "scope": "128 selected episodes, held-out test",
          "scored_task_count": 20,
          "status_counts": {
            "scored": 20
          }
        },
        {
          "direct_scored_task_count": 20,
          "id": "cosmos3_super_reasoner",
          "label": "Cosmos3-Super Reasoner",
          "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus tasks 5/8/9/10/11/12/13/14/16/17/18/19/20 probes where public metrics exist.",
          "proxy_scored_task_count": 0,
          "result_record_count": 20,
          "scope": "128 selected episodes, held-out test",
          "scored_task_count": 20,
          "status_counts": {
            "scored": 20
          }
        },
        {
          "direct_scored_task_count": 20,
          "id": "cosmos3_nano_future_window",
          "label": "Cosmos3-Nano Future Window",
          "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics, plus model-output probes for tasks 2/5/7/8/10/11/12/13/14/15/16/17/18/19 and a derived task-20 boundary timing probe scored from held-out future-window artifacts.",
          "proxy_scored_task_count": 0,
          "result_record_count": 20,
          "scope": "128 selected episodes, held-out test",
          "scored_task_count": 20,
          "status_counts": {
            "scored": 20
          }
        }
      ],
      "not_for": "Do not read compact-proxy cells as direct raw-target measurements.",
      "primary_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
      "primary_visuals": [
        "docs/assets/charts/two_evidence_line_map.svg",
        "docs/assets/charts/episode128_task_model_radar.svg",
        "docs/assets/charts/unified_task_model_radar.svg"
      ],
      "proxy_scored_method_task_count": 6,
      "result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
      "scored_method_task_count": 140,
      "short_label": "Line 2",
      "task_count": 20
    }
  ],
  "method_blocks": [
    {
      "block": "Task-head baselines",
      "direct_scored_method_task_count": 40,
      "evidence_type": "Direct target metrics on the public sample windows.",
      "line_id": "single_public_sample_episode",
      "line_label": "1 sample episode",
      "method_ids": [
        "minimal",
        "neural_mlp"
      ],
      "method_task_record_count": 40,
      "methods": [
        "Minimal",
        "Neural MLP"
      ],
      "proxy_scored_method_task_count": 0,
      "read_as": "Task construction, local reproducibility, and Minimal-vs-Neural behavior.",
      "scored_method_task_count": 40
    },
    {
      "block": "Aligned baseline heads",
      "direct_scored_method_task_count": 74,
      "evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "method_ids": [
        "metadata128_simple",
        "metadata128_neural_mlp",
        "raw128_simple",
        "raw128_neural_mlp"
      ],
      "method_task_record_count": 80,
      "methods": [
        "128ep Aligned Simple",
        "128ep Aligned NN",
        "128ep Raw Simple",
        "128ep Raw NN"
      ],
      "proxy_scored_method_task_count": 6,
      "read_as": "Same-split metadata/raw-feature baseline comparison.",
      "scored_method_task_count": 80
    },
    {
      "block": "Qwen3-Omni series",
      "direct_scored_method_task_count": 20,
      "evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "method_ids": [
        "qwen3_omni_v6_lora"
      ],
      "method_task_record_count": 20,
      "methods": [
        "Qwen3-Omni v6 LoRA"
      ],
      "proxy_scored_method_task_count": 0,
      "read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface.",
      "scored_method_task_count": 20
    },
    {
      "block": "Cosmos3 series",
      "direct_scored_method_task_count": 40,
      "evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "method_ids": [
        "cosmos3_super_reasoner",
        "cosmos3_nano_future_window"
      ],
      "method_task_record_count": 40,
      "methods": [
        "Cosmos3-Super Reasoner",
        "Cosmos3-Nano Future Window"
      ],
      "proxy_scored_method_task_count": 0,
      "read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface.",
      "scored_method_task_count": 40
    }
  ],
  "proxy_records": [
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Raw Simple",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Raw NN",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Aligned Simple",
      "metric_key": "mrr",
      "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
      "series_id": "metadata128_simple",
      "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Aligned NN",
      "metric_key": "mrr",
      "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
      "series_id": "metadata128_neural_mlp",
      "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Raw Simple",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "line_id": "selected_128_episode_surface",
      "method": "128ep Raw NN",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    }
  ],
  "reader_policy": {
    "proxy_policy": "Proxy-scored cells stay numeric only when the source artifact and reason are attached; they should not be read as direct raw-target measurements.",
    "selected_128_episode_surface": "Use for held-out comparison, metadata/raw-feature baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, Cosmos3-Nano Future Window, and scale-up decisions.",
    "single_public_sample_episode": "Use for task construction, raw-file inspection, local reproducibility, and controlled Minimal-vs-Neural baseline behavior."
  },
  "reader_summary": "The suite has two public evidence lines. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Do not mix the two when reading scores.",
  "reading_order": [
    {
      "reason": "Line 1 answers task-lab and reproducibility questions; line 2 answers selected-128 comparison questions.",
      "step": "Choose the evidence line"
    },
    {
      "reason": "Use the 1-episode radar for Minimal-vs-Neural behavior and the 128-episode radar for metadata/raw baselines, Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano.",
      "step": "Open the matching radar"
    },
    {
      "reason": "Every numeric score is tied to a method, task, metric key, source artifact, and proxy flag.",
      "step": "Inspect the matrix row"
    },
    {
      "reason": "The six compact-proxy cells are numeric but are not direct raw-target measurements.",
      "step": "Check proxy cells before interpreting totals"
    }
  ],
  "related_model_artifacts": [
    {
      "name": "Qwen3-Omni v1-v6 run lineage",
      "repo": "docs/data/qwen3_omni_run_lineage.json",
      "role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence."
    },
    {
      "name": "Cosmos3-Super Forward-Dynamics LoRA",
      "repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep",
      "role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row."
    }
  ],
  "score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
  "source_lines": "docs/data/two_evidence_lines.json",
  "source_matrix": "docs/data/task_method_20_result_matrix.json",
  "status": "pass",
  "summary": {
    "direct_scored_method_task_count": 174,
    "line_count": 2,
    "method_count": 9,
    "method_task_record_count": 180,
    "proxy_scored_method_task_count": 6,
    "scored_method_task_count": 180,
    "task_count": 20
  },
  "title": "Two Evidence-Line Result Summary"
}