{
  "status": "current",
  "updated_utc": "2026-06-21T00:00:00Z",
  "interpretation_rule": "Read the 1-episode line as the inspectable task lab. Read the 128-episode line as the selected comparison surface for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.",
  "reader_summary": "The suite has two public reading lanes. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Compare scores within the same lane first.",
  "score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
  "lines": [
    {
      "id": "single_public_sample_episode",
      "label": "1 sample episode",
      "short_label": "Line 1",
      "data_unit": "One public Xperience-10M sample episode",
      "result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
      "best_read_as": "Inspect the raw sample, understand file organization, reproduce the 20 task targets, and compare Minimal vs Neural MLP behavior inside one episode.",
      "read_separately_from": "The selected-128 comparison rows and broader held-out model behavior.",
      "frames": 5821,
      "windows": 1161,
      "window_definition": "20-frame aligned windows with 5-frame stride",
      "feature_dimensions": 8546,
      "methods": [
        "Minimal heads",
        "Neural MLP heads"
      ],
      "task_axes": 20,
      "method_task_records": 40,
      "scored_records": 40,
      "direct_scored_records": 40,
      "proxy_scored_records": 0,
      "best_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
      "primary_visuals": [
        "docs/assets/charts/two_evidence_line_map.svg",
        "docs/assets/charts/single_episode_task_model_radar.svg"
      ],
      "primary_artifacts": [
        "docs/data/single_episode_task_model_radar.json",
        "docs/data/two_evidence_line_result_summary.json",
        "results/episode_task_suite/summary_report.json",
        "results/episode_task_suite/feature_manifest.json",
        "docs/single_episode_explorer.html"
      ]
    },
    {
      "id": "selected_128_episode_surface",
      "label": "128 selected episodes",
      "short_label": "Line 2",
      "data_unit": "Selected held-out 96/16/16 split with public-safe processed features linked to official gated episode paths",
      "result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
      "best_read_as": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano while keeping the 6 compact-proxy cells visible.",
      "read_separately_from": "Direct raw-target interpretation for the proxy-marked cells.",
      "episodes": 128,
      "split": {
        "train": 96,
        "validation": 16,
        "test": 16
      },
      "exported_windows": 34269,
      "methods": [
        "Metadata simple",
        "Metadata NN",
        "Raw-feature simple",
        "Raw-feature NN",
        "Qwen3-Omni",
        "Cosmos3-Super",
        "Cosmos3-Nano"
      ],
      "task_axes": 20,
      "method_task_records": 140,
      "scored_records": 140,
      "direct_scored_records": 134,
      "proxy_scored_records": 6,
      "proxy_policy": "Proxy flags remain visible where the public export lacks a direct raw target.",
      "best_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
      "primary_visuals": [
        "docs/assets/charts/two_evidence_line_map.svg",
        "docs/assets/charts/episode128_task_model_radar.svg",
        "docs/assets/charts/unified_task_model_radar.svg"
      ],
      "primary_artifacts": [
        "docs/data/episode128_task_model_radar.json",
        "docs/data/two_evidence_line_result_summary.json",
        "docs/data/xperience10m_128_episode_feature_index.json",
        "docs/data/omni_model_comparison.json",
        "docs/data/qwen3_omni_run_lineage.json",
        "docs/data/task_method_20_gap_audit.json"
      ]
    }
  ],
  "method_blocks": [
    {
      "line_id": "single_public_sample_episode",
      "line_label": "1 sample episode",
      "block": "Task-head baselines",
      "methods": [
        "Minimal",
        "Neural MLP"
      ],
      "scored_records": 40,
      "direct_scored_records": 40,
      "proxy_scored_records": 0,
      "evidence_type": "Direct target metrics on the public sample windows.",
      "read_as": "Task-lab reproducibility and simple-vs-neural behavior."
    },
    {
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "block": "Aligned baseline heads",
      "methods": [
        "Metadata simple",
        "Metadata NN",
        "Raw-feature simple",
        "Raw-feature NN"
      ],
      "scored_records": 80,
      "direct_scored_records": 74,
      "proxy_scored_records": 6,
      "evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
      "read_as": "Same-split metadata/raw-feature baseline comparison."
    },
    {
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "block": "Qwen3-Omni series",
      "methods": [
        "Qwen3-Omni v6 LoRA"
      ],
      "scored_records": 20,
      "direct_scored_records": 20,
      "proxy_scored_records": 0,
      "evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
      "read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface."
    },
    {
      "line_id": "selected_128_episode_surface",
      "line_label": "128 selected episodes",
      "block": "Cosmos3 series",
      "methods": [
        "Cosmos3-Super Reasoner",
        "Cosmos3-Nano Future Window"
      ],
      "scored_records": 40,
      "direct_scored_records": 40,
      "proxy_scored_records": 0,
      "evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
      "read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface."
    }
  ],
  "related_model_artifacts": [
    {
      "name": "Qwen3-Omni v1-v6 run lineage",
      "role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence.",
      "repo": "docs/data/qwen3_omni_run_lineage.json"
    },
    {
      "name": "Cosmos3-Super Forward-Dynamics LoRA",
      "role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row.",
      "repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep"
    }
  ],
  "combined_public_matrix": {
    "task_axes": 20,
    "methods": 9,
    "method_task_records": 180,
    "scored_records": 180,
    "direct_scored_records": 174,
    "proxy_scored_records": 6,
    "summary_artifact": "docs/data/two_evidence_line_result_summary.json",
    "artifact": "docs/data/task_method_20_result_matrix.json"
  }
}