cy0307 committed on
Commit
3d29d20
·
verified ·
1 Parent(s): 0bc6a41

Refine reader-facing public wording (2/6)

Browse files
data/evidence_contract.json CHANGED
@@ -1,170 +1,169 @@
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
- "claims": [
5
  {
6
  "id": "project_status",
7
- "claim": "A first-pass reader has a compact current-state summary.",
8
  "status": "verified",
9
  "evidence": [
10
  "PROJECT_STATUS.md",
11
  "docs/data/project_status.json"
12
  ],
13
- "boundary": "summarizes existing evidence and current limitations"
 
14
  },
15
  {
16
  "id": "research_roadmap",
17
- "claim": "The research roadmap is explicit.",
18
  "status": "current",
19
  "evidence": [
20
  "RESEARCH_ROADMAP.md",
21
  "docs/data/research_roadmap.json"
22
  ],
23
- "boundary": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
 
24
  },
25
- {
26
- "id": "official_dataset_card_alignment",
27
- "claim": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
28
  "status": "verified",
29
  "evidence": [
30
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
31
  "docs/data/xperience10m_dataset_card_alignment.json",
32
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
33
  ],
34
- "boundary": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
- },
36
- {
37
- "id": "source_alignment",
38
- "claim": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
39
- "status": "verified",
40
- "evidence": [
41
- "SOURCE_ALIGNMENT_AUDIT.md",
42
- "docs/data/source_alignment_audit.json",
43
- "scripts/validate_source_alignment.py"
44
- ],
45
- "boundary": "offline committed-fact check; does not fetch private gated data"
46
- },
47
- {
48
- "id": "aligned_windows",
49
- "claim": "The public Xperience-10M sample has been converted into aligned model windows.",
50
  "status": "verified",
51
  "evidence": [
52
  "results/episode_task_suite/windows.csv",
53
  "results/episode_task_suite/shared_windows.npz",
54
  "results/episode_task_suite/summary_report.json"
55
  ],
56
- "boundary": "5,821 frames, 1,161 windows, one public sample episode"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  },
58
- {
59
- "id": "feature_contract",
60
- "claim": "The current feature contract is explicit and inspectable.",
61
- "status": "verified",
62
- "evidence": [
63
- "results/episode_task_suite/feature_manifest.json",
64
- "results/episode_task_suite/available_modalities.json"
65
- ],
66
- "boundary": "8,546-dimensional aligned multimodal window representation"
67
- },
68
- {
69
- "id": "evaluation_protocol",
70
- "claim": "The task evaluation protocol is explicit and generated from committed metrics.",
71
- "status": "verified",
72
- "evidence": [
73
- "EVALUATION_PROTOCOL.md",
74
- "docs/data/evaluation_protocol.json",
75
- "scripts/build_evaluation_protocol.py"
76
- ],
77
- "boundary": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
- },
79
  {
80
  "id": "modality_atlas",
81
- "claim": "The public sample modalities are inspectable without raw data redistribution.",
82
  "status": "verified",
83
  "evidence": [
84
  "docs/data/modality_atlas.json",
85
  "docs/assets/modalities/",
86
  "docs/index.html"
87
  ],
88
- "boundary": "derived thumbnails for presentation; raw data remains excluded"
 
89
  },
90
  {
91
  "id": "task_surface_integrity",
92
- "claim": "Public task cards stay readable for non-expert readers.",
93
  "status": "verified",
94
  "evidence": [
95
  "docs/data/task_surface_integrity.json",
96
  "scripts/validate_task_surface.py",
97
  "docs/index.html"
98
  ],
99
- "boundary": "presentation integrity for the public task surface"
 
100
  },
101
  {
102
  "id": "figure_index",
103
- "claim": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
104
  "status": "verified",
105
  "evidence": [
106
  "FIGURE_INDEX.md",
107
  "docs/data/figure_index.json",
108
  "scripts/build_figure_index.py"
109
  ],
110
- "boundary": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
 
111
  },
112
  {
113
  "id": "brand_assets",
114
- "claim": "A project logo is consistently applied across public surfaces.",
115
  "status": "verified",
116
  "evidence": [
117
  "docs/assets/brand/",
118
  "docs/data/brand_assets.json",
119
  "scripts/build_brand_assets.py"
120
  ],
121
- "boundary": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
 
122
  },
123
  {
124
  "id": "twelve_tasks",
125
- "claim": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
126
  "status": "verified",
127
  "evidence": [
128
  "scripts/episode_task_suite.py",
129
  "results/episode_task_suite/*/metrics.json",
130
  "results/episode_task_suite/*/predictions.*"
131
  ],
132
- "boundary": "chronological single-episode split, not cross-episode generalization"
 
133
  },
134
  {
135
  "id": "minimal_vs_neural",
136
- "claim": "Minimal and neural heads use the same task contracts.",
137
  "status": "verified",
138
  "evidence": [
139
  "scripts/neural_task_models.py",
140
  "results/episode_task_suite/neural_mlp/",
141
  "docs/assets/task_architectures.png"
142
  ],
143
- "boundary": "small heads only; not a foundation model"
 
144
  },
145
  {
146
  "id": "research_directions",
147
- "claim": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
148
  "status": "verified",
149
  "evidence": [
150
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
151
  "docs/data/research_directions.json"
152
  ],
153
- "boundary": "some directions remain proxy-only"
 
154
  },
155
  {
156
  "id": "direction_extensions",
157
- "claim": "Four extra direction probes are coded and evaluated.",
158
  "status": "verified",
159
  "evidence": [
160
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
161
  "docs/data/research_direction_extensions.json"
162
  ],
163
- "boundary": "single-episode probes, not full research-direction solutions"
 
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
167
- "claim": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
168
  "status": "verified_diagnostic",
169
  "evidence": [
170
  "docs/data/omni_finetune_verified_result.json",
@@ -172,94 +171,94 @@
172
  "scripts/omni/package_verified_omni_result.py",
173
  "scripts/omni/audit_verified_omni_package.py"
174
  ],
175
- "boundary": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
 
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
179
- "claim": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
180
  "status": "active_next_step",
181
  "evidence": [
182
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
183
  "docs/data/omni_finetune_verified_result.json",
184
  "FOUNDATION_MODEL_PLAN.md"
185
  ],
186
- "boundary": "stronger model quality requires output-format improvements and action/subtask error analysis"
 
187
  },
188
  {
189
  "id": "scale_up_status_check",
190
- "claim": "Older pilot path strings are tracked as setup-file provenance.",
191
  "status": "verified",
192
  "evidence": [
193
  "scripts/validate_scope_claims.py",
194
  "docs/data/scope_claims_audit.json"
195
  ],
196
- "boundary": "run/path identifiers stay separate from completed held-out-episode results"
 
197
  },
198
  {
199
  "id": "mirror_parity",
200
- "claim": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
201
  "status": "verified",
202
  "evidence": [
203
  "scripts/validate_mirror_parity.py",
204
  "docs/data/mirror_parity.json"
205
  ],
206
- "boundary": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
 
207
  },
208
  {
209
  "id": "publication_package",
210
- "claim": "The public GitHub and Hugging Face bundles contain the intended release files.",
211
  "status": "verified",
212
  "evidence": [
213
  "scripts/validate_publication_package.py",
214
  "docs/data/publication_audit.json"
215
  ],
216
- "boundary": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
 
217
  },
218
  {
219
  "id": "website_integrity",
220
- "claim": "The public website has checked local references.",
221
  "status": "verified",
222
  "evidence": [
223
  "scripts/validate_website_integrity.py",
224
  "docs/data/website_integrity.json"
225
  ],
226
- "boundary": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
 
227
  },
228
  {
229
  "id": "rendered_site_check",
230
- "claim": "The rendered website walkthrough has a browser-level interaction check.",
231
  "status": "verified",
232
  "evidence": [
233
  "RENDERED_SITE_CHECK.md",
234
  "scripts/build_rendered_site_check.py",
235
  "docs/data/rendered_site_check.json"
236
  ],
237
- "boundary": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
 
238
  },
239
  {
240
  "id": "quality_gates",
241
- "claim": "The release gate is explicit.",
242
  "status": "verified",
243
  "evidence": [
244
  "QUALITY_GATES.md",
245
  "scripts/build_quality_gates.py",
246
  "docs/data/quality_gates.json"
247
  ],
248
- "boundary": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
 
249
  },
250
  {
251
  "id": "live_publication_status",
252
- "claim": "The live public mirrors are checked after upload.",
253
  "status": "verified",
254
  "evidence": [
255
  "scripts/verify_live_publication.py",
256
  "docs/data/live_publication_status.json"
257
  ],
258
- "boundary": "fetches public GitHub/HF URLs; it does not validate private training state"
 
259
  },
260
  {
261
  "id": "citation_metadata",
262
- "claim": "The project is externally citable and machine-readable.",
263
  "status": "verified",
264
  "evidence": [
265
  "CITATION.cff",
@@ -267,11 +266,11 @@
267
  "docs/data/project_manifest.json",
268
  "LICENSE"
269
  ],
270
- "boundary": "code license does not override original Xperience-10M dataset terms"
 
271
  },
272
  {
273
  "id": "project_path",
274
- "claim": "A first-time reader has an explicit project path.",
275
  "status": "verified",
276
  "evidence": [
277
  "docs/data/project_packet.json",
@@ -280,29 +279,30 @@
280
  "README.md",
281
  "docs/index.html"
282
  ],
283
- "boundary": "guides inspection across data, tasks, results, and scale-up status"
 
284
  },
285
  {
286
  "id": "artifact_index",
287
- "claim": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
288
  "status": "verified",
289
  "evidence": [
290
  "ARTIFACT_GUIDE.md",
291
  "scripts/build_artifact_index.py",
292
  "docs/data/artifact_index.json"
293
  ],
294
- "boundary": "selective source-of-truth catalog, not a complete inventory of every output file"
 
295
  },
296
  {
297
  "id": "reproducibility_contract",
298
- "claim": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
299
  "status": "verified",
300
  "evidence": [
301
  "REPRODUCIBILITY.md",
302
  "docs/data/reproducibility_matrix.json",
303
  "notes/reproducibility_audit.md"
304
  ],
305
- "boundary": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
 
306
  }
307
  ]
308
  }
 
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
+ "readouts": [
5
  {
6
  "id": "project_status",
 
7
  "status": "verified",
8
  "evidence": [
9
  "PROJECT_STATUS.md",
10
  "docs/data/project_status.json"
11
  ],
12
+ "readout": "A first-pass reader has a compact current-state summary.",
13
+ "scope_note": "summarizes existing evidence and current limitations"
14
  },
15
  {
16
  "id": "research_roadmap",
 
17
  "status": "current",
18
  "evidence": [
19
  "RESEARCH_ROADMAP.md",
20
  "docs/data/research_roadmap.json"
21
  ],
22
+ "readout": "The research roadmap is explicit.",
23
+ "scope_note": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
24
  },
25
+ {
26
+ "id": "official_dataset_card_alignment",
 
27
  "status": "verified",
28
  "evidence": [
29
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
30
  "docs/data/xperience10m_dataset_card_alignment.json",
31
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
32
  ],
33
+ "readout": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
34
+ "scope_note": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
+ },
36
+ {
37
+ "id": "source_alignment",
38
+ "status": "verified",
39
+ "evidence": [
40
+ "SOURCE_ALIGNMENT_AUDIT.md",
41
+ "docs/data/source_alignment_audit.json",
42
+ "scripts/validate_source_alignment.py"
43
+ ],
44
+ "readout": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
45
+ "scope_note": "offline committed-fact check; does not fetch private gated data"
46
+ },
47
+ {
48
+ "id": "aligned_windows",
49
  "status": "verified",
50
  "evidence": [
51
  "results/episode_task_suite/windows.csv",
52
  "results/episode_task_suite/shared_windows.npz",
53
  "results/episode_task_suite/summary_report.json"
54
  ],
55
+ "readout": "The public Xperience-10M sample has been converted into aligned model windows.",
56
+ "scope_note": "5,821 frames, 1,161 windows, one public sample episode"
57
+ },
58
+ {
59
+ "id": "feature_contract",
60
+ "status": "verified",
61
+ "evidence": [
62
+ "results/episode_task_suite/feature_manifest.json",
63
+ "results/episode_task_suite/available_modalities.json"
64
+ ],
65
+ "readout": "The current feature contract is explicit and inspectable.",
66
+ "scope_note": "8,546-dimensional aligned multimodal window representation"
67
+ },
68
+ {
69
+ "id": "evaluation_protocol",
70
+ "status": "verified",
71
+ "evidence": [
72
+ "EVALUATION_PROTOCOL.md",
73
+ "docs/data/evaluation_protocol.json",
74
+ "scripts/build_evaluation_protocol.py"
75
+ ],
76
+ "readout": "The task evaluation protocol is explicit and generated from committed metrics.",
77
+ "scope_note": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "id": "modality_atlas",
 
81
  "status": "verified",
82
  "evidence": [
83
  "docs/data/modality_atlas.json",
84
  "docs/assets/modalities/",
85
  "docs/index.html"
86
  ],
87
+ "readout": "The public sample modalities are inspectable without raw data redistribution.",
88
+ "scope_note": "derived thumbnails for presentation; raw data remains excluded"
89
  },
90
  {
91
  "id": "task_surface_integrity",
 
92
  "status": "verified",
93
  "evidence": [
94
  "docs/data/task_surface_integrity.json",
95
  "scripts/validate_task_surface.py",
96
  "docs/index.html"
97
  ],
98
+ "readout": "Public task cards stay readable for non-expert readers.",
99
+ "scope_note": "presentation integrity for the public task surface"
100
  },
101
  {
102
  "id": "figure_index",
 
103
  "status": "verified",
104
  "evidence": [
105
  "FIGURE_INDEX.md",
106
  "docs/data/figure_index.json",
107
  "scripts/build_figure_index.py"
108
  ],
109
+ "readout": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
110
+ "scope_note": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
111
  },
112
  {
113
  "id": "brand_assets",
 
114
  "status": "verified",
115
  "evidence": [
116
  "docs/assets/brand/",
117
  "docs/data/brand_assets.json",
118
  "scripts/build_brand_assets.py"
119
  ],
120
+ "readout": "A project logo is consistently applied across public surfaces.",
121
+ "scope_note": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
122
  },
123
  {
124
  "id": "twelve_tasks",
 
125
  "status": "verified",
126
  "evidence": [
127
  "scripts/episode_task_suite.py",
128
  "results/episode_task_suite/*/metrics.json",
129
  "results/episode_task_suite/*/predictions.*"
130
  ],
131
+ "readout": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
132
+ "scope_note": "chronological single-episode split, not cross-episode generalization"
133
  },
134
  {
135
  "id": "minimal_vs_neural",
 
136
  "status": "verified",
137
  "evidence": [
138
  "scripts/neural_task_models.py",
139
  "results/episode_task_suite/neural_mlp/",
140
  "docs/assets/task_architectures.png"
141
  ],
142
+ "readout": "Minimal and neural heads use the same task contracts.",
143
+ "scope_note": "small heads only; not a foundation model"
144
  },
145
  {
146
  "id": "research_directions",
 
147
  "status": "verified",
148
  "evidence": [
149
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
150
  "docs/data/research_directions.json"
151
  ],
152
+ "readout": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
153
+ "scope_note": "some directions remain proxy-only"
154
  },
155
  {
156
  "id": "direction_extensions",
 
157
  "status": "verified",
158
  "evidence": [
159
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
160
  "docs/data/research_direction_extensions.json"
161
  ],
162
+ "readout": "Four extra direction probes are coded and evaluated.",
163
+ "scope_note": "single-episode probes, not full research-direction solutions"
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
 
167
  "status": "verified_diagnostic",
168
  "evidence": [
169
  "docs/data/omni_finetune_verified_result.json",
 
171
  "scripts/omni/package_verified_omni_result.py",
172
  "scripts/omni/audit_verified_omni_package.py"
173
  ],
174
+ "readout": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
175
+ "scope_note": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
 
179
  "status": "active_next_step",
180
  "evidence": [
181
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
182
  "docs/data/omni_finetune_verified_result.json",
183
  "FOUNDATION_MODEL_PLAN.md"
184
  ],
185
+ "readout": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
186
+ "scope_note": "stronger model quality requires output-format improvements and action/subtask error analysis"
187
  },
188
  {
189
  "id": "scale_up_status_check",
 
190
  "status": "verified",
191
  "evidence": [
192
  "scripts/validate_scope_claims.py",
193
  "docs/data/scope_claims_audit.json"
194
  ],
195
+ "readout": "Older pilot path strings are tracked as setup-file provenance.",
196
+ "scope_note": "run/path identifiers stay separate from completed held-out-episode results"
197
  },
198
  {
199
  "id": "mirror_parity",
 
200
  "status": "verified",
201
  "evidence": [
202
  "scripts/validate_mirror_parity.py",
203
  "docs/data/mirror_parity.json"
204
  ],
205
+ "readout": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
206
+ "scope_note": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
207
  },
208
  {
209
  "id": "publication_package",
 
210
  "status": "verified",
211
  "evidence": [
212
  "scripts/validate_publication_package.py",
213
  "docs/data/publication_audit.json"
214
  ],
215
+ "readout": "The public GitHub and Hugging Face bundles contain the intended release files.",
216
+ "scope_note": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
217
  },
218
  {
219
  "id": "website_integrity",
 
220
  "status": "verified",
221
  "evidence": [
222
  "scripts/validate_website_integrity.py",
223
  "docs/data/website_integrity.json"
224
  ],
225
+ "readout": "The public website has checked local references.",
226
+ "scope_note": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
227
  },
228
  {
229
  "id": "rendered_site_check",
 
230
  "status": "verified",
231
  "evidence": [
232
  "RENDERED_SITE_CHECK.md",
233
  "scripts/build_rendered_site_check.py",
234
  "docs/data/rendered_site_check.json"
235
  ],
236
+ "readout": "The rendered website walkthrough has a browser-level interaction check.",
237
+ "scope_note": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
238
  },
239
  {
240
  "id": "quality_gates",
 
241
  "status": "verified",
242
  "evidence": [
243
  "QUALITY_GATES.md",
244
  "scripts/build_quality_gates.py",
245
  "docs/data/quality_gates.json"
246
  ],
247
+ "readout": "The release gate is explicit.",
248
+ "scope_note": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
249
  },
250
  {
251
  "id": "live_publication_status",
 
252
  "status": "verified",
253
  "evidence": [
254
  "scripts/verify_live_publication.py",
255
  "docs/data/live_publication_status.json"
256
  ],
257
+ "readout": "The live public mirrors are checked after upload.",
258
+ "scope_note": "fetches public GitHub/HF URLs; it does not validate private training state"
259
  },
260
  {
261
  "id": "citation_metadata",
 
262
  "status": "verified",
263
  "evidence": [
264
  "CITATION.cff",
 
266
  "docs/data/project_manifest.json",
267
  "LICENSE"
268
  ],
269
+ "readout": "The project is externally citable and machine-readable.",
270
+ "scope_note": "code license does not override original Xperience-10M dataset terms"
271
  },
272
  {
273
  "id": "project_path",
 
274
  "status": "verified",
275
  "evidence": [
276
  "docs/data/project_packet.json",
 
279
  "README.md",
280
  "docs/index.html"
281
  ],
282
+ "readout": "A first-time reader has an explicit project path.",
283
+ "scope_note": "guides inspection across data, tasks, results, and scale-up status"
284
  },
285
  {
286
  "id": "artifact_index",
 
287
  "status": "verified",
288
  "evidence": [
289
  "ARTIFACT_GUIDE.md",
290
  "scripts/build_artifact_index.py",
291
  "docs/data/artifact_index.json"
292
  ],
293
+ "readout": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
294
+ "scope_note": "selective source-of-truth catalog, not a complete inventory of every output file"
295
  },
296
  {
297
  "id": "reproducibility_contract",
 
298
  "status": "verified",
299
  "evidence": [
300
  "REPRODUCIBILITY.md",
301
  "docs/data/reproducibility_matrix.json",
302
  "notes/reproducibility_audit.md"
303
  ],
304
+ "readout": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
305
+ "scope_note": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
306
  }
307
  ]
308
  }
data/mirror_parity.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/omni_model_comparison.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
- "generated_at_utc": "2026-06-21T15:17:00+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
@@ -1758,6 +1758,6 @@
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
- "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before claiming v6 is globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
 
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
+ "generated_at_utc": "2026-06-22T10:59:59+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
 
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
+ "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before presenting v6 as globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
data/project_brief.json CHANGED
@@ -56,7 +56,7 @@
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
- "next_stage": "Improve action/subtask quality through error analysis before larger robustness or alternative-backbone claims.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
 
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
+ "next_stage": "Improve action/subtask quality through error analysis before presenting larger robustness or alternative-backbone results.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
data/public_reader_map.json CHANGED
@@ -6,77 +6,124 @@
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
- "then_inspect": ["PROJECT_STATUS.md", "RESEARCH_TAKEAWAYS.md"]
 
 
 
10
  },
11
  {
12
  "reader_goal": "Understand the two evidence lines",
13
  "start_here": "TWO_EVIDENCE_LINES.md",
14
- "then_inspect": ["docs/data/two_evidence_lines.json", "docs/data/two_evidence_line_result_summary.json"]
 
 
 
15
  },
16
  {
17
  "reader_goal": "See the visual public dashboard",
18
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
19
- "then_inspect": ["docs/index.html", "docs/data/project_packet.json"]
 
 
 
20
  },
21
  {
22
  "reader_goal": "Decode project terminology",
23
  "start_here": "GLOSSARY.md",
24
- "then_inspect": ["docs/data/glossary.json", "Homepage Glossary section"]
 
 
 
25
  },
26
  {
27
  "reader_goal": "Understand the data unit",
28
  "start_here": "results/episode_task_suite/windows.csv",
29
- "then_inspect": ["results/episode_task_suite/feature_manifest.json", "docs/data/raw_sample_files.json"]
 
 
 
30
  },
31
  {
32
  "reader_goal": "Trace the 128-episode split",
33
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
34
- "then_inspect": ["docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/xperience10m_128_episode_selection.csv"]
 
 
 
35
  },
36
  {
37
  "reader_goal": "Inspect the 20-task benchmark",
38
  "start_here": "TASK_SUITE_20.md",
39
- "then_inspect": ["docs/data/task_suite_20.json", "EVALUATION_PROTOCOL.md"]
 
 
 
40
  },
41
  {
42
  "reader_goal": "Compare current results",
43
  "start_here": "RESEARCH_TAKEAWAYS.md",
44
- "then_inspect": ["docs/data/task_method_20_result_matrix.json", "docs/data/unified_task_model_radar.json"]
 
 
 
45
  },
46
  {
47
  "reader_goal": "Compare 1-episode and 128-episode methods",
48
  "start_here": "Homepage radar section",
49
- "then_inspect": ["docs/data/single_episode_task_model_radar.json", "docs/data/episode128_task_model_radar.json"]
 
 
 
50
  },
51
  {
52
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
53
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
54
- "then_inspect": ["docs/data/qwen3_omni_run_lineage.json", "docs/data/qwen3_v5_v6_comparison.json"]
 
 
 
55
  },
56
  {
57
  "reader_goal": "Find all derived artifacts",
58
  "start_here": "ARTIFACT_GUIDE.md",
59
- "then_inspect": ["Hugging Face artifact dataset", "docs/data/artifact_index.json"]
 
 
 
60
  },
61
  {
62
  "reader_goal": "Download model weights with their matching results",
63
  "start_here": "Hugging Face weights/results repo",
64
- "then_inspect": ["manifest.json", "analysis/docs/data/task_method_20_result_matrix.json", "results/"]
 
 
 
 
65
  },
66
  {
67
  "reader_goal": "Reproduce or extend the work",
68
  "start_here": "REPRODUCIBILITY.md",
69
- "then_inspect": ["QUALITY_GATES.md", "scripts/", "results/"]
 
 
 
 
70
  },
71
  {
72
  "reader_goal": "Understand foundation-model directions",
73
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
74
- "then_inspect": ["FOUNDATION_MODEL_PLAN.md", "docs/data/three_foundation_pipelines.json"]
 
 
 
75
  },
76
  {
77
  "reader_goal": "Check public-release health",
78
  "start_here": "PUBLIC_SURFACE_QA.md",
79
- "then_inspect": ["docs/data/live_publication_status.json", "docs/data/mirror_parity.json"]
 
 
 
80
  }
81
  ],
82
  "public_surfaces": [
@@ -125,31 +172,49 @@
125
  "Foundation directions",
126
  "Public-release checks"
127
  ],
128
- "claim_boundaries": [
129
- {
130
- "claim_type": "Single public-sample task behavior",
131
- "public_evidence": ["results/episode_task_suite/", "docs/data/task_suite_20.json"],
132
- "boundary": "Describes one public sample episode, not the full dataset distribution."
133
- },
134
- {
135
- "claim_type": "128-episode method comparison",
136
- "public_evidence": ["XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md", "docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/*128*", "docs/data/omni_model_comparison.json"],
137
- "boundary": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
138
- },
139
- {
140
- "claim_type": "Qwen3-Omni v1-v6 lineage",
141
- "public_evidence": ["QWEN3_OMNI_RUN_LINEAGE.md", "docs/data/qwen3_omni_run_lineage.json"],
142
- "boundary": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
143
- },
144
- {
145
- "claim_type": "Foundation-model track quality",
146
- "public_evidence": ["Verified Qwen3-Omni and Cosmos3 result packages", "model cards"],
147
- "boundary": "Numeric task scores appear only when a task-specific eval or probe exists."
148
- },
149
- {
150
- "claim_type": "Reproducibility",
151
- "public_evidence": ["REPRODUCIBILITY.md", "QUALITY_GATES.md", "release validators"],
152
- "boundary": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  }
154
  ]
155
  }
 
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
+ "then_inspect": [
10
+ "PROJECT_STATUS.md",
11
+ "RESEARCH_TAKEAWAYS.md"
12
+ ]
13
  },
14
  {
15
  "reader_goal": "Understand the two evidence lines",
16
  "start_here": "TWO_EVIDENCE_LINES.md",
17
+ "then_inspect": [
18
+ "docs/data/two_evidence_lines.json",
19
+ "docs/data/two_evidence_line_result_summary.json"
20
+ ]
21
  },
22
  {
23
  "reader_goal": "See the visual public dashboard",
24
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
25
+ "then_inspect": [
26
+ "docs/index.html",
27
+ "docs/data/project_packet.json"
28
+ ]
29
  },
30
  {
31
  "reader_goal": "Decode project terminology",
32
  "start_here": "GLOSSARY.md",
33
+ "then_inspect": [
34
+ "docs/data/glossary.json",
35
+ "Homepage Glossary section"
36
+ ]
37
  },
38
  {
39
  "reader_goal": "Understand the data unit",
40
  "start_here": "results/episode_task_suite/windows.csv",
41
+ "then_inspect": [
42
+ "results/episode_task_suite/feature_manifest.json",
43
+ "docs/data/raw_sample_files.json"
44
+ ]
45
  },
46
  {
47
  "reader_goal": "Trace the 128-episode split",
48
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
49
+ "then_inspect": [
50
+ "docs/data/xperience10m_128_episode_feature_index.json",
51
+ "results/omni_finetune/xperience10m_128_episode_selection.csv"
52
+ ]
53
  },
54
  {
55
  "reader_goal": "Inspect the 20-task benchmark",
56
  "start_here": "TASK_SUITE_20.md",
57
+ "then_inspect": [
58
+ "docs/data/task_suite_20.json",
59
+ "EVALUATION_PROTOCOL.md"
60
+ ]
61
  },
62
  {
63
  "reader_goal": "Compare current results",
64
  "start_here": "RESEARCH_TAKEAWAYS.md",
65
+ "then_inspect": [
66
+ "docs/data/task_method_20_result_matrix.json",
67
+ "docs/data/unified_task_model_radar.json"
68
+ ]
69
  },
70
  {
71
  "reader_goal": "Compare 1-episode and 128-episode methods",
72
  "start_here": "Homepage radar section",
73
+ "then_inspect": [
74
+ "docs/data/single_episode_task_model_radar.json",
75
+ "docs/data/episode128_task_model_radar.json"
76
+ ]
77
  },
78
  {
79
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
80
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
81
+ "then_inspect": [
82
+ "docs/data/qwen3_omni_run_lineage.json",
83
+ "docs/data/qwen3_v5_v6_comparison.json"
84
+ ]
85
  },
86
  {
87
  "reader_goal": "Find all derived artifacts",
88
  "start_here": "ARTIFACT_GUIDE.md",
89
+ "then_inspect": [
90
+ "Hugging Face artifact dataset",
91
+ "docs/data/artifact_index.json"
92
+ ]
93
  },
94
  {
95
  "reader_goal": "Download model weights with their matching results",
96
  "start_here": "Hugging Face weights/results repo",
97
+ "then_inspect": [
98
+ "manifest.json",
99
+ "analysis/docs/data/task_method_20_result_matrix.json",
100
+ "results/"
101
+ ]
102
  },
103
  {
104
  "reader_goal": "Reproduce or extend the work",
105
  "start_here": "REPRODUCIBILITY.md",
106
+ "then_inspect": [
107
+ "QUALITY_GATES.md",
108
+ "scripts/",
109
+ "results/"
110
+ ]
111
  },
112
  {
113
  "reader_goal": "Understand foundation-model directions",
114
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
115
+ "then_inspect": [
116
+ "FOUNDATION_MODEL_PLAN.md",
117
+ "docs/data/three_foundation_pipelines.json"
118
+ ]
119
  },
120
  {
121
  "reader_goal": "Check public-release health",
122
  "start_here": "PUBLIC_SURFACE_QA.md",
123
+ "then_inspect": [
124
+ "docs/data/live_publication_status.json",
125
+ "docs/data/mirror_parity.json"
126
+ ]
127
  }
128
  ],
129
  "public_surfaces": [
 
172
  "Foundation directions",
173
  "Public-release checks"
174
  ],
175
+ "reading_scopes": [
176
+ {
177
+ "public_evidence": [
178
+ "results/episode_task_suite/",
179
+ "docs/data/task_suite_20.json"
180
+ ],
181
+ "topic": "Single public-sample task behavior",
182
+ "scope_note": "Describes one public sample episode, not the full dataset distribution."
183
+ },
184
+ {
185
+ "public_evidence": [
186
+ "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
187
+ "docs/data/xperience10m_128_episode_feature_index.json",
188
+ "results/omni_finetune/*128*",
189
+ "docs/data/omni_model_comparison.json"
190
+ ],
191
+ "topic": "128-episode method comparison",
192
+ "scope_note": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
193
+ },
194
+ {
195
+ "public_evidence": [
196
+ "QWEN3_OMNI_RUN_LINEAGE.md",
197
+ "docs/data/qwen3_omni_run_lineage.json"
198
+ ],
199
+ "topic": "Qwen3-Omni v1-v6 lineage",
200
+ "scope_note": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
201
+ },
202
+ {
203
+ "public_evidence": [
204
+ "Verified Qwen3-Omni and Cosmos3 result packages",
205
+ "model cards"
206
+ ],
207
+ "topic": "Foundation-model track quality",
208
+ "scope_note": "Numeric task scores appear only when a task-specific eval or probe exists."
209
+ },
210
+ {
211
+ "public_evidence": [
212
+ "REPRODUCIBILITY.md",
213
+ "QUALITY_GATES.md",
214
+ "release validators"
215
+ ],
216
+ "topic": "Reproducibility",
217
+ "scope_note": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
218
  }
219
  ]
220
  }
data/public_surface_qa.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-22T10:20:26+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
@@ -18,7 +18,7 @@
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
- "generated_at_utc": "2026-06-22T10:09:34+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
@@ -28,27 +28,27 @@
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
- "generated_at_utc": "2026-06-21T20:58:21+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
- "generated_at_utc": "2026-06-22T10:19:22+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
- "generated_at_utc": "2026-06-22T10:19:19+00:00"
52
  }
53
  },
54
  "failures": {}
 
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-22T11:18:45+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
 
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
 
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
+ "generated_at_utc": "2026-06-22T11:17:08+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
+ "generated_at_utc": "2026-06-22T11:17:10+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
+ "generated_at_utc": "2026-06-22T11:18:16+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
+ "generated_at_utc": "2026-06-22T11:18:11+00:00"
52
  }
53
  },
54
  "failures": {}
data/publication_audit.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:19:22+00:00",
4
  "checks": [
5
  {
6
  "name": "required_publication_assets_present",
@@ -246,8 +246,8 @@
246
  "hf_space_bundle": {
247
  "root": "hf_publish/space",
248
  "exists": true,
249
- "file_count": 631,
250
- "text_file_count": 470,
251
  "largest_file": {
252
  "path": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/predictions.jsonl",
253
  "bytes": 10221085
@@ -257,8 +257,8 @@
257
  "hf_artifact_bundle": {
258
  "root": "hf_publish/artifacts",
259
  "exists": true,
260
- "file_count": 4702,
261
- "text_file_count": 1328,
262
  "largest_file": {
263
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
264
  "bytes": 135591061
@@ -268,8 +268,8 @@
268
  "hf_model_bundle": {
269
  "root": "hf_publish/model",
270
  "exists": true,
271
- "file_count": 5464,
272
- "text_file_count": 1502,
273
  "largest_file": {
274
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
275
  "bytes": 135591061
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:18:16+00:00",
4
  "checks": [
5
  {
6
  "name": "required_publication_assets_present",
 
246
  "hf_space_bundle": {
247
  "root": "hf_publish/space",
248
  "exists": true,
249
+ "file_count": 640,
250
+ "text_file_count": 479,
251
  "largest_file": {
252
  "path": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/predictions.jsonl",
253
  "bytes": 10221085
 
257
  "hf_artifact_bundle": {
258
  "root": "hf_publish/artifacts",
259
  "exists": true,
260
+ "file_count": 4708,
261
+ "text_file_count": 1334,
262
  "largest_file": {
263
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
264
  "bytes": 135591061
 
268
  "hf_model_bundle": {
269
  "root": "hf_publish/model",
270
  "exists": true,
271
+ "file_count": 5470,
272
+ "text_file_count": 1508,
273
  "largest_file": {
274
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
275
  "bytes": 135591061
docs/data/evidence_contract.json CHANGED
@@ -1,170 +1,169 @@
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
- "claims": [
5
  {
6
  "id": "project_status",
7
- "claim": "A first-pass reader has a compact current-state summary.",
8
  "status": "verified",
9
  "evidence": [
10
  "PROJECT_STATUS.md",
11
  "docs/data/project_status.json"
12
  ],
13
- "boundary": "summarizes existing evidence and current limitations"
 
14
  },
15
  {
16
  "id": "research_roadmap",
17
- "claim": "The research roadmap is explicit.",
18
  "status": "current",
19
  "evidence": [
20
  "RESEARCH_ROADMAP.md",
21
  "docs/data/research_roadmap.json"
22
  ],
23
- "boundary": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
 
24
  },
25
- {
26
- "id": "official_dataset_card_alignment",
27
- "claim": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
28
  "status": "verified",
29
  "evidence": [
30
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
31
  "docs/data/xperience10m_dataset_card_alignment.json",
32
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
33
  ],
34
- "boundary": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
- },
36
- {
37
- "id": "source_alignment",
38
- "claim": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
39
- "status": "verified",
40
- "evidence": [
41
- "SOURCE_ALIGNMENT_AUDIT.md",
42
- "docs/data/source_alignment_audit.json",
43
- "scripts/validate_source_alignment.py"
44
- ],
45
- "boundary": "offline committed-fact check; does not fetch private gated data"
46
- },
47
- {
48
- "id": "aligned_windows",
49
- "claim": "The public Xperience-10M sample has been converted into aligned model windows.",
50
  "status": "verified",
51
  "evidence": [
52
  "results/episode_task_suite/windows.csv",
53
  "results/episode_task_suite/shared_windows.npz",
54
  "results/episode_task_suite/summary_report.json"
55
  ],
56
- "boundary": "5,821 frames, 1,161 windows, one public sample episode"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  },
58
- {
59
- "id": "feature_contract",
60
- "claim": "The current feature contract is explicit and inspectable.",
61
- "status": "verified",
62
- "evidence": [
63
- "results/episode_task_suite/feature_manifest.json",
64
- "results/episode_task_suite/available_modalities.json"
65
- ],
66
- "boundary": "8,546-dimensional aligned multimodal window representation"
67
- },
68
- {
69
- "id": "evaluation_protocol",
70
- "claim": "The task evaluation protocol is explicit and generated from committed metrics.",
71
- "status": "verified",
72
- "evidence": [
73
- "EVALUATION_PROTOCOL.md",
74
- "docs/data/evaluation_protocol.json",
75
- "scripts/build_evaluation_protocol.py"
76
- ],
77
- "boundary": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
- },
79
  {
80
  "id": "modality_atlas",
81
- "claim": "The public sample modalities are inspectable without raw data redistribution.",
82
  "status": "verified",
83
  "evidence": [
84
  "docs/data/modality_atlas.json",
85
  "docs/assets/modalities/",
86
  "docs/index.html"
87
  ],
88
- "boundary": "derived thumbnails for presentation; raw data remains excluded"
 
89
  },
90
  {
91
  "id": "task_surface_integrity",
92
- "claim": "Public task cards stay readable for non-expert readers.",
93
  "status": "verified",
94
  "evidence": [
95
  "docs/data/task_surface_integrity.json",
96
  "scripts/validate_task_surface.py",
97
  "docs/index.html"
98
  ],
99
- "boundary": "presentation integrity for the public task surface"
 
100
  },
101
  {
102
  "id": "figure_index",
103
- "claim": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
104
  "status": "verified",
105
  "evidence": [
106
  "FIGURE_INDEX.md",
107
  "docs/data/figure_index.json",
108
  "scripts/build_figure_index.py"
109
  ],
110
- "boundary": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
 
111
  },
112
  {
113
  "id": "brand_assets",
114
- "claim": "A project logo is consistently applied across public surfaces.",
115
  "status": "verified",
116
  "evidence": [
117
  "docs/assets/brand/",
118
  "docs/data/brand_assets.json",
119
  "scripts/build_brand_assets.py"
120
  ],
121
- "boundary": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
 
122
  },
123
  {
124
  "id": "twelve_tasks",
125
- "claim": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
126
  "status": "verified",
127
  "evidence": [
128
  "scripts/episode_task_suite.py",
129
  "results/episode_task_suite/*/metrics.json",
130
  "results/episode_task_suite/*/predictions.*"
131
  ],
132
- "boundary": "chronological single-episode split, not cross-episode generalization"
 
133
  },
134
  {
135
  "id": "minimal_vs_neural",
136
- "claim": "Minimal and neural heads use the same task contracts.",
137
  "status": "verified",
138
  "evidence": [
139
  "scripts/neural_task_models.py",
140
  "results/episode_task_suite/neural_mlp/",
141
  "docs/assets/task_architectures.png"
142
  ],
143
- "boundary": "small heads only; not a foundation model"
 
144
  },
145
  {
146
  "id": "research_directions",
147
- "claim": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
148
  "status": "verified",
149
  "evidence": [
150
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
151
  "docs/data/research_directions.json"
152
  ],
153
- "boundary": "some directions remain proxy-only"
 
154
  },
155
  {
156
  "id": "direction_extensions",
157
- "claim": "Four extra direction probes are coded and evaluated.",
158
  "status": "verified",
159
  "evidence": [
160
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
161
  "docs/data/research_direction_extensions.json"
162
  ],
163
- "boundary": "single-episode probes, not full research-direction solutions"
 
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
167
- "claim": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
168
  "status": "verified_diagnostic",
169
  "evidence": [
170
  "docs/data/omni_finetune_verified_result.json",
@@ -172,94 +171,94 @@
172
  "scripts/omni/package_verified_omni_result.py",
173
  "scripts/omni/audit_verified_omni_package.py"
174
  ],
175
- "boundary": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
 
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
179
- "claim": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
180
  "status": "active_next_step",
181
  "evidence": [
182
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
183
  "docs/data/omni_finetune_verified_result.json",
184
  "FOUNDATION_MODEL_PLAN.md"
185
  ],
186
- "boundary": "stronger model quality requires output-format improvements and action/subtask error analysis"
 
187
  },
188
  {
189
  "id": "scale_up_status_check",
190
- "claim": "Older pilot path strings are tracked as setup-file provenance.",
191
  "status": "verified",
192
  "evidence": [
193
  "scripts/validate_scope_claims.py",
194
  "docs/data/scope_claims_audit.json"
195
  ],
196
- "boundary": "run/path identifiers stay separate from completed held-out-episode results"
 
197
  },
198
  {
199
  "id": "mirror_parity",
200
- "claim": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
201
  "status": "verified",
202
  "evidence": [
203
  "scripts/validate_mirror_parity.py",
204
  "docs/data/mirror_parity.json"
205
  ],
206
- "boundary": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
 
207
  },
208
  {
209
  "id": "publication_package",
210
- "claim": "The public GitHub and Hugging Face bundles contain the intended release files.",
211
  "status": "verified",
212
  "evidence": [
213
  "scripts/validate_publication_package.py",
214
  "docs/data/publication_audit.json"
215
  ],
216
- "boundary": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
 
217
  },
218
  {
219
  "id": "website_integrity",
220
- "claim": "The public website has checked local references.",
221
  "status": "verified",
222
  "evidence": [
223
  "scripts/validate_website_integrity.py",
224
  "docs/data/website_integrity.json"
225
  ],
226
- "boundary": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
 
227
  },
228
  {
229
  "id": "rendered_site_check",
230
- "claim": "The rendered website walkthrough has a browser-level interaction check.",
231
  "status": "verified",
232
  "evidence": [
233
  "RENDERED_SITE_CHECK.md",
234
  "scripts/build_rendered_site_check.py",
235
  "docs/data/rendered_site_check.json"
236
  ],
237
- "boundary": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
 
238
  },
239
  {
240
  "id": "quality_gates",
241
- "claim": "The release gate is explicit.",
242
  "status": "verified",
243
  "evidence": [
244
  "QUALITY_GATES.md",
245
  "scripts/build_quality_gates.py",
246
  "docs/data/quality_gates.json"
247
  ],
248
- "boundary": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
 
249
  },
250
  {
251
  "id": "live_publication_status",
252
- "claim": "The live public mirrors are checked after upload.",
253
  "status": "verified",
254
  "evidence": [
255
  "scripts/verify_live_publication.py",
256
  "docs/data/live_publication_status.json"
257
  ],
258
- "boundary": "fetches public GitHub/HF URLs; it does not validate private training state"
 
259
  },
260
  {
261
  "id": "citation_metadata",
262
- "claim": "The project is externally citable and machine-readable.",
263
  "status": "verified",
264
  "evidence": [
265
  "CITATION.cff",
@@ -267,11 +266,11 @@
267
  "docs/data/project_manifest.json",
268
  "LICENSE"
269
  ],
270
- "boundary": "code license does not override original Xperience-10M dataset terms"
 
271
  },
272
  {
273
  "id": "project_path",
274
- "claim": "A first-time reader has an explicit project path.",
275
  "status": "verified",
276
  "evidence": [
277
  "docs/data/project_packet.json",
@@ -280,29 +279,30 @@
280
  "README.md",
281
  "docs/index.html"
282
  ],
283
- "boundary": "guides inspection across data, tasks, results, and scale-up status"
 
284
  },
285
  {
286
  "id": "artifact_index",
287
- "claim": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
288
  "status": "verified",
289
  "evidence": [
290
  "ARTIFACT_GUIDE.md",
291
  "scripts/build_artifact_index.py",
292
  "docs/data/artifact_index.json"
293
  ],
294
- "boundary": "selective source-of-truth catalog, not a complete inventory of every output file"
 
295
  },
296
  {
297
  "id": "reproducibility_contract",
298
- "claim": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
299
  "status": "verified",
300
  "evidence": [
301
  "REPRODUCIBILITY.md",
302
  "docs/data/reproducibility_matrix.json",
303
  "notes/reproducibility_audit.md"
304
  ],
305
- "boundary": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
 
306
  }
307
  ]
308
  }
 
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
+ "readouts": [
5
  {
6
  "id": "project_status",
 
7
  "status": "verified",
8
  "evidence": [
9
  "PROJECT_STATUS.md",
10
  "docs/data/project_status.json"
11
  ],
12
+ "readout": "A first-pass reader has a compact current-state summary.",
13
+ "scope_note": "summarizes existing evidence and current limitations"
14
  },
15
  {
16
  "id": "research_roadmap",
 
17
  "status": "current",
18
  "evidence": [
19
  "RESEARCH_ROADMAP.md",
20
  "docs/data/research_roadmap.json"
21
  ],
22
+ "readout": "The research roadmap is explicit.",
23
+ "scope_note": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
24
  },
25
+ {
26
+ "id": "official_dataset_card_alignment",
 
27
  "status": "verified",
28
  "evidence": [
29
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
30
  "docs/data/xperience10m_dataset_card_alignment.json",
31
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
32
  ],
33
+ "readout": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
34
+ "scope_note": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
+ },
36
+ {
37
+ "id": "source_alignment",
38
+ "status": "verified",
39
+ "evidence": [
40
+ "SOURCE_ALIGNMENT_AUDIT.md",
41
+ "docs/data/source_alignment_audit.json",
42
+ "scripts/validate_source_alignment.py"
43
+ ],
44
+ "readout": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
45
+ "scope_note": "offline committed-fact check; does not fetch private gated data"
46
+ },
47
+ {
48
+ "id": "aligned_windows",
49
  "status": "verified",
50
  "evidence": [
51
  "results/episode_task_suite/windows.csv",
52
  "results/episode_task_suite/shared_windows.npz",
53
  "results/episode_task_suite/summary_report.json"
54
  ],
55
+ "readout": "The public Xperience-10M sample has been converted into aligned model windows.",
56
+ "scope_note": "5,821 frames, 1,161 windows, one public sample episode"
57
+ },
58
+ {
59
+ "id": "feature_contract",
60
+ "status": "verified",
61
+ "evidence": [
62
+ "results/episode_task_suite/feature_manifest.json",
63
+ "results/episode_task_suite/available_modalities.json"
64
+ ],
65
+ "readout": "The current feature contract is explicit and inspectable.",
66
+ "scope_note": "8,546-dimensional aligned multimodal window representation"
67
+ },
68
+ {
69
+ "id": "evaluation_protocol",
70
+ "status": "verified",
71
+ "evidence": [
72
+ "EVALUATION_PROTOCOL.md",
73
+ "docs/data/evaluation_protocol.json",
74
+ "scripts/build_evaluation_protocol.py"
75
+ ],
76
+ "readout": "The task evaluation protocol is explicit and generated from committed metrics.",
77
+ "scope_note": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "id": "modality_atlas",
 
81
  "status": "verified",
82
  "evidence": [
83
  "docs/data/modality_atlas.json",
84
  "docs/assets/modalities/",
85
  "docs/index.html"
86
  ],
87
+ "readout": "The public sample modalities are inspectable without raw data redistribution.",
88
+ "scope_note": "derived thumbnails for presentation; raw data remains excluded"
89
  },
90
  {
91
  "id": "task_surface_integrity",
 
92
  "status": "verified",
93
  "evidence": [
94
  "docs/data/task_surface_integrity.json",
95
  "scripts/validate_task_surface.py",
96
  "docs/index.html"
97
  ],
98
+ "readout": "Public task cards stay readable for non-expert readers.",
99
+ "scope_note": "presentation integrity for the public task surface"
100
  },
101
  {
102
  "id": "figure_index",
 
103
  "status": "verified",
104
  "evidence": [
105
  "FIGURE_INDEX.md",
106
  "docs/data/figure_index.json",
107
  "scripts/build_figure_index.py"
108
  ],
109
+ "readout": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
110
+ "scope_note": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
111
  },
112
  {
113
  "id": "brand_assets",
 
114
  "status": "verified",
115
  "evidence": [
116
  "docs/assets/brand/",
117
  "docs/data/brand_assets.json",
118
  "scripts/build_brand_assets.py"
119
  ],
120
+ "readout": "A project logo is consistently applied across public surfaces.",
121
+ "scope_note": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
122
  },
123
  {
124
  "id": "twelve_tasks",
 
125
  "status": "verified",
126
  "evidence": [
127
  "scripts/episode_task_suite.py",
128
  "results/episode_task_suite/*/metrics.json",
129
  "results/episode_task_suite/*/predictions.*"
130
  ],
131
+ "readout": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
132
+ "scope_note": "chronological single-episode split, not cross-episode generalization"
133
  },
134
  {
135
  "id": "minimal_vs_neural",
 
136
  "status": "verified",
137
  "evidence": [
138
  "scripts/neural_task_models.py",
139
  "results/episode_task_suite/neural_mlp/",
140
  "docs/assets/task_architectures.png"
141
  ],
142
+ "readout": "Minimal and neural heads use the same task contracts.",
143
+ "scope_note": "small heads only; not a foundation model"
144
  },
145
  {
146
  "id": "research_directions",
 
147
  "status": "verified",
148
  "evidence": [
149
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
150
  "docs/data/research_directions.json"
151
  ],
152
+ "readout": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
153
+ "scope_note": "some directions remain proxy-only"
154
  },
155
  {
156
  "id": "direction_extensions",
 
157
  "status": "verified",
158
  "evidence": [
159
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
160
  "docs/data/research_direction_extensions.json"
161
  ],
162
+ "readout": "Four extra direction probes are coded and evaluated.",
163
+ "scope_note": "single-episode probes, not full research-direction solutions"
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
 
167
  "status": "verified_diagnostic",
168
  "evidence": [
169
  "docs/data/omni_finetune_verified_result.json",
 
171
  "scripts/omni/package_verified_omni_result.py",
172
  "scripts/omni/audit_verified_omni_package.py"
173
  ],
174
+ "readout": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
175
+ "scope_note": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
 
179
  "status": "active_next_step",
180
  "evidence": [
181
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
182
  "docs/data/omni_finetune_verified_result.json",
183
  "FOUNDATION_MODEL_PLAN.md"
184
  ],
185
+ "readout": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
186
+ "scope_note": "stronger model quality requires output-format improvements and action/subtask error analysis"
187
  },
188
  {
189
  "id": "scale_up_status_check",
 
190
  "status": "verified",
191
  "evidence": [
192
  "scripts/validate_scope_claims.py",
193
  "docs/data/scope_claims_audit.json"
194
  ],
195
+ "readout": "Older pilot path strings are tracked as setup-file provenance.",
196
+ "scope_note": "run/path identifiers stay separate from completed held-out-episode results"
197
  },
198
  {
199
  "id": "mirror_parity",
 
200
  "status": "verified",
201
  "evidence": [
202
  "scripts/validate_mirror_parity.py",
203
  "docs/data/mirror_parity.json"
204
  ],
205
+ "readout": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
206
+ "scope_note": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
207
  },
208
  {
209
  "id": "publication_package",
 
210
  "status": "verified",
211
  "evidence": [
212
  "scripts/validate_publication_package.py",
213
  "docs/data/publication_audit.json"
214
  ],
215
+ "readout": "The public GitHub and Hugging Face bundles contain the intended release files.",
216
+ "scope_note": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
217
  },
218
  {
219
  "id": "website_integrity",
 
220
  "status": "verified",
221
  "evidence": [
222
  "scripts/validate_website_integrity.py",
223
  "docs/data/website_integrity.json"
224
  ],
225
+ "readout": "The public website has checked local references.",
226
+ "scope_note": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
227
  },
228
  {
229
  "id": "rendered_site_check",
 
230
  "status": "verified",
231
  "evidence": [
232
  "RENDERED_SITE_CHECK.md",
233
  "scripts/build_rendered_site_check.py",
234
  "docs/data/rendered_site_check.json"
235
  ],
236
+ "readout": "The rendered website walkthrough has a browser-level interaction check.",
237
+ "scope_note": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
238
  },
239
  {
240
  "id": "quality_gates",
 
241
  "status": "verified",
242
  "evidence": [
243
  "QUALITY_GATES.md",
244
  "scripts/build_quality_gates.py",
245
  "docs/data/quality_gates.json"
246
  ],
247
+ "readout": "The release gate is explicit.",
248
+ "scope_note": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
249
  },
250
  {
251
  "id": "live_publication_status",
 
252
  "status": "verified",
253
  "evidence": [
254
  "scripts/verify_live_publication.py",
255
  "docs/data/live_publication_status.json"
256
  ],
257
+ "readout": "The live public mirrors are checked after upload.",
258
+ "scope_note": "fetches public GitHub/HF URLs; it does not validate private training state"
259
  },
260
  {
261
  "id": "citation_metadata",
 
262
  "status": "verified",
263
  "evidence": [
264
  "CITATION.cff",
 
266
  "docs/data/project_manifest.json",
267
  "LICENSE"
268
  ],
269
+ "readout": "The project is externally citable and machine-readable.",
270
+ "scope_note": "code license does not override original Xperience-10M dataset terms"
271
  },
272
  {
273
  "id": "project_path",
 
274
  "status": "verified",
275
  "evidence": [
276
  "docs/data/project_packet.json",
 
279
  "README.md",
280
  "docs/index.html"
281
  ],
282
+ "readout": "A first-time reader has an explicit project path.",
283
+ "scope_note": "guides inspection across data, tasks, results, and scale-up status"
284
  },
285
  {
286
  "id": "artifact_index",
 
287
  "status": "verified",
288
  "evidence": [
289
  "ARTIFACT_GUIDE.md",
290
  "scripts/build_artifact_index.py",
291
  "docs/data/artifact_index.json"
292
  ],
293
+ "readout": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
294
+ "scope_note": "selective source-of-truth catalog, not a complete inventory of every output file"
295
  },
296
  {
297
  "id": "reproducibility_contract",
 
298
  "status": "verified",
299
  "evidence": [
300
  "REPRODUCIBILITY.md",
301
  "docs/data/reproducibility_matrix.json",
302
  "notes/reproducibility_audit.md"
303
  ],
304
+ "readout": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
305
+ "scope_note": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
306
  }
307
  ]
308
  }
docs/data/mirror_parity.json CHANGED
The diff for this file is too large to render. See raw diff
 
docs/data/omni_model_comparison.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
- "generated_at_utc": "2026-06-21T15:17:00+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
@@ -1758,6 +1758,6 @@
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
- "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before claiming v6 is globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
 
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
+ "generated_at_utc": "2026-06-22T10:59:59+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
 
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
+ "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before presenting v6 as globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
docs/data/project_brief.json CHANGED
@@ -56,7 +56,7 @@
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
- "next_stage": "Improve action/subtask quality through error analysis before larger robustness or alternative-backbone claims.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
 
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
+ "next_stage": "Improve action/subtask quality through error analysis before presenting larger robustness or alternative-backbone results.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
docs/data/public_reader_map.json CHANGED
@@ -6,77 +6,124 @@
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
- "then_inspect": ["PROJECT_STATUS.md", "RESEARCH_TAKEAWAYS.md"]
 
 
 
10
  },
11
  {
12
  "reader_goal": "Understand the two evidence lines",
13
  "start_here": "TWO_EVIDENCE_LINES.md",
14
- "then_inspect": ["docs/data/two_evidence_lines.json", "docs/data/two_evidence_line_result_summary.json"]
 
 
 
15
  },
16
  {
17
  "reader_goal": "See the visual public dashboard",
18
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
19
- "then_inspect": ["docs/index.html", "docs/data/project_packet.json"]
 
 
 
20
  },
21
  {
22
  "reader_goal": "Decode project terminology",
23
  "start_here": "GLOSSARY.md",
24
- "then_inspect": ["docs/data/glossary.json", "Homepage Glossary section"]
 
 
 
25
  },
26
  {
27
  "reader_goal": "Understand the data unit",
28
  "start_here": "results/episode_task_suite/windows.csv",
29
- "then_inspect": ["results/episode_task_suite/feature_manifest.json", "docs/data/raw_sample_files.json"]
 
 
 
30
  },
31
  {
32
  "reader_goal": "Trace the 128-episode split",
33
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
34
- "then_inspect": ["docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/xperience10m_128_episode_selection.csv"]
 
 
 
35
  },
36
  {
37
  "reader_goal": "Inspect the 20-task benchmark",
38
  "start_here": "TASK_SUITE_20.md",
39
- "then_inspect": ["docs/data/task_suite_20.json", "EVALUATION_PROTOCOL.md"]
 
 
 
40
  },
41
  {
42
  "reader_goal": "Compare current results",
43
  "start_here": "RESEARCH_TAKEAWAYS.md",
44
- "then_inspect": ["docs/data/task_method_20_result_matrix.json", "docs/data/unified_task_model_radar.json"]
 
 
 
45
  },
46
  {
47
  "reader_goal": "Compare 1-episode and 128-episode methods",
48
  "start_here": "Homepage radar section",
49
- "then_inspect": ["docs/data/single_episode_task_model_radar.json", "docs/data/episode128_task_model_radar.json"]
 
 
 
50
  },
51
  {
52
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
53
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
54
- "then_inspect": ["docs/data/qwen3_omni_run_lineage.json", "docs/data/qwen3_v5_v6_comparison.json"]
 
 
 
55
  },
56
  {
57
  "reader_goal": "Find all derived artifacts",
58
  "start_here": "ARTIFACT_GUIDE.md",
59
- "then_inspect": ["Hugging Face artifact dataset", "docs/data/artifact_index.json"]
 
 
 
60
  },
61
  {
62
  "reader_goal": "Download model weights with their matching results",
63
  "start_here": "Hugging Face weights/results repo",
64
- "then_inspect": ["manifest.json", "analysis/docs/data/task_method_20_result_matrix.json", "results/"]
 
 
 
 
65
  },
66
  {
67
  "reader_goal": "Reproduce or extend the work",
68
  "start_here": "REPRODUCIBILITY.md",
69
- "then_inspect": ["QUALITY_GATES.md", "scripts/", "results/"]
 
 
 
 
70
  },
71
  {
72
  "reader_goal": "Understand foundation-model directions",
73
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
74
- "then_inspect": ["FOUNDATION_MODEL_PLAN.md", "docs/data/three_foundation_pipelines.json"]
 
 
 
75
  },
76
  {
77
  "reader_goal": "Check public-release health",
78
  "start_here": "PUBLIC_SURFACE_QA.md",
79
- "then_inspect": ["docs/data/live_publication_status.json", "docs/data/mirror_parity.json"]
 
 
 
80
  }
81
  ],
82
  "public_surfaces": [
@@ -125,31 +172,49 @@
125
  "Foundation directions",
126
  "Public-release checks"
127
  ],
128
- "claim_boundaries": [
129
- {
130
- "claim_type": "Single public-sample task behavior",
131
- "public_evidence": ["results/episode_task_suite/", "docs/data/task_suite_20.json"],
132
- "boundary": "Describes one public sample episode, not the full dataset distribution."
133
- },
134
- {
135
- "claim_type": "128-episode method comparison",
136
- "public_evidence": ["XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md", "docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/*128*", "docs/data/omni_model_comparison.json"],
137
- "boundary": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
138
- },
139
- {
140
- "claim_type": "Qwen3-Omni v1-v6 lineage",
141
- "public_evidence": ["QWEN3_OMNI_RUN_LINEAGE.md", "docs/data/qwen3_omni_run_lineage.json"],
142
- "boundary": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
143
- },
144
- {
145
- "claim_type": "Foundation-model track quality",
146
- "public_evidence": ["Verified Qwen3-Omni and Cosmos3 result packages", "model cards"],
147
- "boundary": "Numeric task scores appear only when a task-specific eval or probe exists."
148
- },
149
- {
150
- "claim_type": "Reproducibility",
151
- "public_evidence": ["REPRODUCIBILITY.md", "QUALITY_GATES.md", "release validators"],
152
- "boundary": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  }
154
  ]
155
  }
 
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
+ "then_inspect": [
10
+ "PROJECT_STATUS.md",
11
+ "RESEARCH_TAKEAWAYS.md"
12
+ ]
13
  },
14
  {
15
  "reader_goal": "Understand the two evidence lines",
16
  "start_here": "TWO_EVIDENCE_LINES.md",
17
+ "then_inspect": [
18
+ "docs/data/two_evidence_lines.json",
19
+ "docs/data/two_evidence_line_result_summary.json"
20
+ ]
21
  },
22
  {
23
  "reader_goal": "See the visual public dashboard",
24
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
25
+ "then_inspect": [
26
+ "docs/index.html",
27
+ "docs/data/project_packet.json"
28
+ ]
29
  },
30
  {
31
  "reader_goal": "Decode project terminology",
32
  "start_here": "GLOSSARY.md",
33
+ "then_inspect": [
34
+ "docs/data/glossary.json",
35
+ "Homepage Glossary section"
36
+ ]
37
  },
38
  {
39
  "reader_goal": "Understand the data unit",
40
  "start_here": "results/episode_task_suite/windows.csv",
41
+ "then_inspect": [
42
+ "results/episode_task_suite/feature_manifest.json",
43
+ "docs/data/raw_sample_files.json"
44
+ ]
45
  },
46
  {
47
  "reader_goal": "Trace the 128-episode split",
48
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
49
+ "then_inspect": [
50
+ "docs/data/xperience10m_128_episode_feature_index.json",
51
+ "results/omni_finetune/xperience10m_128_episode_selection.csv"
52
+ ]
53
  },
54
  {
55
  "reader_goal": "Inspect the 20-task benchmark",
56
  "start_here": "TASK_SUITE_20.md",
57
+ "then_inspect": [
58
+ "docs/data/task_suite_20.json",
59
+ "EVALUATION_PROTOCOL.md"
60
+ ]
61
  },
62
  {
63
  "reader_goal": "Compare current results",
64
  "start_here": "RESEARCH_TAKEAWAYS.md",
65
+ "then_inspect": [
66
+ "docs/data/task_method_20_result_matrix.json",
67
+ "docs/data/unified_task_model_radar.json"
68
+ ]
69
  },
70
  {
71
  "reader_goal": "Compare 1-episode and 128-episode methods",
72
  "start_here": "Homepage radar section",
73
+ "then_inspect": [
74
+ "docs/data/single_episode_task_model_radar.json",
75
+ "docs/data/episode128_task_model_radar.json"
76
+ ]
77
  },
78
  {
79
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
80
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
81
+ "then_inspect": [
82
+ "docs/data/qwen3_omni_run_lineage.json",
83
+ "docs/data/qwen3_v5_v6_comparison.json"
84
+ ]
85
  },
86
  {
87
  "reader_goal": "Find all derived artifacts",
88
  "start_here": "ARTIFACT_GUIDE.md",
89
+ "then_inspect": [
90
+ "Hugging Face artifact dataset",
91
+ "docs/data/artifact_index.json"
92
+ ]
93
  },
94
  {
95
  "reader_goal": "Download model weights with their matching results",
96
  "start_here": "Hugging Face weights/results repo",
97
+ "then_inspect": [
98
+ "manifest.json",
99
+ "analysis/docs/data/task_method_20_result_matrix.json",
100
+ "results/"
101
+ ]
102
  },
103
  {
104
  "reader_goal": "Reproduce or extend the work",
105
  "start_here": "REPRODUCIBILITY.md",
106
+ "then_inspect": [
107
+ "QUALITY_GATES.md",
108
+ "scripts/",
109
+ "results/"
110
+ ]
111
  },
112
  {
113
  "reader_goal": "Understand foundation-model directions",
114
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
115
+ "then_inspect": [
116
+ "FOUNDATION_MODEL_PLAN.md",
117
+ "docs/data/three_foundation_pipelines.json"
118
+ ]
119
  },
120
  {
121
  "reader_goal": "Check public-release health",
122
  "start_here": "PUBLIC_SURFACE_QA.md",
123
+ "then_inspect": [
124
+ "docs/data/live_publication_status.json",
125
+ "docs/data/mirror_parity.json"
126
+ ]
127
  }
128
  ],
129
  "public_surfaces": [
 
172
  "Foundation directions",
173
  "Public-release checks"
174
  ],
175
+ "reading_scopes": [
176
+ {
177
+ "public_evidence": [
178
+ "results/episode_task_suite/",
179
+ "docs/data/task_suite_20.json"
180
+ ],
181
+ "topic": "Single public-sample task behavior",
182
+ "scope_note": "Describes one public sample episode, not the full dataset distribution."
183
+ },
184
+ {
185
+ "public_evidence": [
186
+ "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
187
+ "docs/data/xperience10m_128_episode_feature_index.json",
188
+ "results/omni_finetune/*128*",
189
+ "docs/data/omni_model_comparison.json"
190
+ ],
191
+ "topic": "128-episode method comparison",
192
+ "scope_note": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
193
+ },
194
+ {
195
+ "public_evidence": [
196
+ "QWEN3_OMNI_RUN_LINEAGE.md",
197
+ "docs/data/qwen3_omni_run_lineage.json"
198
+ ],
199
+ "topic": "Qwen3-Omni v1-v6 lineage",
200
+ "scope_note": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
201
+ },
202
+ {
203
+ "public_evidence": [
204
+ "Verified Qwen3-Omni and Cosmos3 result packages",
205
+ "model cards"
206
+ ],
207
+ "topic": "Foundation-model track quality",
208
+ "scope_note": "Numeric task scores appear only when a task-specific eval or probe exists."
209
+ },
210
+ {
211
+ "public_evidence": [
212
+ "REPRODUCIBILITY.md",
213
+ "QUALITY_GATES.md",
214
+ "release validators"
215
+ ],
216
+ "topic": "Reproducibility",
217
+ "scope_note": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
218
  }
219
  ]
220
  }
docs/data/public_surface_qa.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-22T10:20:26+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
@@ -18,7 +18,7 @@
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
- "generated_at_utc": "2026-06-22T10:09:34+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
@@ -28,27 +28,27 @@
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
- "generated_at_utc": "2026-06-21T20:58:21+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
- "generated_at_utc": "2026-06-22T10:19:22+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
- "generated_at_utc": "2026-06-22T10:19:19+00:00"
52
  }
53
  },
54
  "failures": {}
 
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-22T11:18:45+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
 
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
 
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
+ "generated_at_utc": "2026-06-22T11:17:08+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
+ "generated_at_utc": "2026-06-22T11:17:10+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
+ "generated_at_utc": "2026-06-22T11:18:16+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
+ "generated_at_utc": "2026-06-22T11:18:11+00:00"
52
  }
53
  },
54
  "failures": {}
docs/data/publication_audit.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:19:22+00:00",
4
  "checks": [
5
  {
6
  "name": "required_publication_assets_present",
@@ -246,8 +246,8 @@
246
  "hf_space_bundle": {
247
  "root": "hf_publish/space",
248
  "exists": true,
249
- "file_count": 631,
250
- "text_file_count": 470,
251
  "largest_file": {
252
  "path": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/predictions.jsonl",
253
  "bytes": 10221085
@@ -257,8 +257,8 @@
257
  "hf_artifact_bundle": {
258
  "root": "hf_publish/artifacts",
259
  "exists": true,
260
- "file_count": 4702,
261
- "text_file_count": 1328,
262
  "largest_file": {
263
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
264
  "bytes": 135591061
@@ -268,8 +268,8 @@
268
  "hf_model_bundle": {
269
  "root": "hf_publish/model",
270
  "exists": true,
271
- "file_count": 5464,
272
- "text_file_count": 1502,
273
  "largest_file": {
274
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
275
  "bytes": 135591061
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:18:16+00:00",
4
  "checks": [
5
  {
6
  "name": "required_publication_assets_present",
 
246
  "hf_space_bundle": {
247
  "root": "hf_publish/space",
248
  "exists": true,
249
+ "file_count": 640,
250
+ "text_file_count": 479,
251
  "largest_file": {
252
  "path": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/predictions.jsonl",
253
  "bytes": 10221085
 
257
  "hf_artifact_bundle": {
258
  "root": "hf_publish/artifacts",
259
  "exists": true,
260
+ "file_count": 4708,
261
+ "text_file_count": 1334,
262
  "largest_file": {
263
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
264
  "bytes": 135591061
 
268
  "hf_model_bundle": {
269
  "root": "hf_publish/model",
270
  "exists": true,
271
+ "file_count": 5470,
272
+ "text_file_count": 1508,
273
  "largest_file": {
274
  "path": "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608/dense_multiscale_windows.jsonl",
275
  "bytes": 135591061
metrics/evidence_contract.json CHANGED
@@ -1,170 +1,169 @@
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
- "claims": [
5
  {
6
  "id": "project_status",
7
- "claim": "A first-pass reader has a compact current-state summary.",
8
  "status": "verified",
9
  "evidence": [
10
  "PROJECT_STATUS.md",
11
  "docs/data/project_status.json"
12
  ],
13
- "boundary": "summarizes existing evidence and current limitations"
 
14
  },
15
  {
16
  "id": "research_roadmap",
17
- "claim": "The research roadmap is explicit.",
18
  "status": "current",
19
  "evidence": [
20
  "RESEARCH_ROADMAP.md",
21
  "docs/data/research_roadmap.json"
22
  ],
23
- "boundary": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
 
24
  },
25
- {
26
- "id": "official_dataset_card_alignment",
27
- "claim": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
28
  "status": "verified",
29
  "evidence": [
30
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
31
  "docs/data/xperience10m_dataset_card_alignment.json",
32
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
33
  ],
34
- "boundary": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
- },
36
- {
37
- "id": "source_alignment",
38
- "claim": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
39
- "status": "verified",
40
- "evidence": [
41
- "SOURCE_ALIGNMENT_AUDIT.md",
42
- "docs/data/source_alignment_audit.json",
43
- "scripts/validate_source_alignment.py"
44
- ],
45
- "boundary": "offline committed-fact check; does not fetch private gated data"
46
- },
47
- {
48
- "id": "aligned_windows",
49
- "claim": "The public Xperience-10M sample has been converted into aligned model windows.",
50
  "status": "verified",
51
  "evidence": [
52
  "results/episode_task_suite/windows.csv",
53
  "results/episode_task_suite/shared_windows.npz",
54
  "results/episode_task_suite/summary_report.json"
55
  ],
56
- "boundary": "5,821 frames, 1,161 windows, one public sample episode"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  },
58
- {
59
- "id": "feature_contract",
60
- "claim": "The current feature contract is explicit and inspectable.",
61
- "status": "verified",
62
- "evidence": [
63
- "results/episode_task_suite/feature_manifest.json",
64
- "results/episode_task_suite/available_modalities.json"
65
- ],
66
- "boundary": "8,546-dimensional aligned multimodal window representation"
67
- },
68
- {
69
- "id": "evaluation_protocol",
70
- "claim": "The task evaluation protocol is explicit and generated from committed metrics.",
71
- "status": "verified",
72
- "evidence": [
73
- "EVALUATION_PROTOCOL.md",
74
- "docs/data/evaluation_protocol.json",
75
- "scripts/build_evaluation_protocol.py"
76
- ],
77
- "boundary": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
- },
79
  {
80
  "id": "modality_atlas",
81
- "claim": "The public sample modalities are inspectable without raw data redistribution.",
82
  "status": "verified",
83
  "evidence": [
84
  "docs/data/modality_atlas.json",
85
  "docs/assets/modalities/",
86
  "docs/index.html"
87
  ],
88
- "boundary": "derived thumbnails for presentation; raw data remains excluded"
 
89
  },
90
  {
91
  "id": "task_surface_integrity",
92
- "claim": "Public task cards stay readable for non-expert readers.",
93
  "status": "verified",
94
  "evidence": [
95
  "docs/data/task_surface_integrity.json",
96
  "scripts/validate_task_surface.py",
97
  "docs/index.html"
98
  ],
99
- "boundary": "presentation integrity for the public task surface"
 
100
  },
101
  {
102
  "id": "figure_index",
103
- "claim": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
104
  "status": "verified",
105
  "evidence": [
106
  "FIGURE_INDEX.md",
107
  "docs/data/figure_index.json",
108
  "scripts/build_figure_index.py"
109
  ],
110
- "boundary": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
 
111
  },
112
  {
113
  "id": "brand_assets",
114
- "claim": "A project logo is consistently applied across public surfaces.",
115
  "status": "verified",
116
  "evidence": [
117
  "docs/assets/brand/",
118
  "docs/data/brand_assets.json",
119
  "scripts/build_brand_assets.py"
120
  ],
121
- "boundary": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
 
122
  },
123
  {
124
  "id": "twelve_tasks",
125
- "claim": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
126
  "status": "verified",
127
  "evidence": [
128
  "scripts/episode_task_suite.py",
129
  "results/episode_task_suite/*/metrics.json",
130
  "results/episode_task_suite/*/predictions.*"
131
  ],
132
- "boundary": "chronological single-episode split, not cross-episode generalization"
 
133
  },
134
  {
135
  "id": "minimal_vs_neural",
136
- "claim": "Minimal and neural heads use the same task contracts.",
137
  "status": "verified",
138
  "evidence": [
139
  "scripts/neural_task_models.py",
140
  "results/episode_task_suite/neural_mlp/",
141
  "docs/assets/task_architectures.png"
142
  ],
143
- "boundary": "small heads only; not a foundation model"
 
144
  },
145
  {
146
  "id": "research_directions",
147
- "claim": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
148
  "status": "verified",
149
  "evidence": [
150
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
151
  "docs/data/research_directions.json"
152
  ],
153
- "boundary": "some directions remain proxy-only"
 
154
  },
155
  {
156
  "id": "direction_extensions",
157
- "claim": "Four extra direction probes are coded and evaluated.",
158
  "status": "verified",
159
  "evidence": [
160
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
161
  "docs/data/research_direction_extensions.json"
162
  ],
163
- "boundary": "single-episode probes, not full research-direction solutions"
 
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
167
- "claim": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
168
  "status": "verified_diagnostic",
169
  "evidence": [
170
  "docs/data/omni_finetune_verified_result.json",
@@ -172,94 +171,94 @@
172
  "scripts/omni/package_verified_omni_result.py",
173
  "scripts/omni/audit_verified_omni_package.py"
174
  ],
175
- "boundary": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
 
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
179
- "claim": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
180
  "status": "active_next_step",
181
  "evidence": [
182
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
183
  "docs/data/omni_finetune_verified_result.json",
184
  "FOUNDATION_MODEL_PLAN.md"
185
  ],
186
- "boundary": "stronger model quality requires output-format improvements and action/subtask error analysis"
 
187
  },
188
  {
189
  "id": "scale_up_status_check",
190
- "claim": "Older pilot path strings are tracked as setup-file provenance.",
191
  "status": "verified",
192
  "evidence": [
193
  "scripts/validate_scope_claims.py",
194
  "docs/data/scope_claims_audit.json"
195
  ],
196
- "boundary": "run/path identifiers stay separate from completed held-out-episode results"
 
197
  },
198
  {
199
  "id": "mirror_parity",
200
- "claim": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
201
  "status": "verified",
202
  "evidence": [
203
  "scripts/validate_mirror_parity.py",
204
  "docs/data/mirror_parity.json"
205
  ],
206
- "boundary": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
 
207
  },
208
  {
209
  "id": "publication_package",
210
- "claim": "The public GitHub and Hugging Face bundles contain the intended release files.",
211
  "status": "verified",
212
  "evidence": [
213
  "scripts/validate_publication_package.py",
214
  "docs/data/publication_audit.json"
215
  ],
216
- "boundary": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
 
217
  },
218
  {
219
  "id": "website_integrity",
220
- "claim": "The public website has checked local references.",
221
  "status": "verified",
222
  "evidence": [
223
  "scripts/validate_website_integrity.py",
224
  "docs/data/website_integrity.json"
225
  ],
226
- "boundary": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
 
227
  },
228
  {
229
  "id": "rendered_site_check",
230
- "claim": "The rendered website walkthrough has a browser-level interaction check.",
231
  "status": "verified",
232
  "evidence": [
233
  "RENDERED_SITE_CHECK.md",
234
  "scripts/build_rendered_site_check.py",
235
  "docs/data/rendered_site_check.json"
236
  ],
237
- "boundary": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
 
238
  },
239
  {
240
  "id": "quality_gates",
241
- "claim": "The release gate is explicit.",
242
  "status": "verified",
243
  "evidence": [
244
  "QUALITY_GATES.md",
245
  "scripts/build_quality_gates.py",
246
  "docs/data/quality_gates.json"
247
  ],
248
- "boundary": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
 
249
  },
250
  {
251
  "id": "live_publication_status",
252
- "claim": "The live public mirrors are checked after upload.",
253
  "status": "verified",
254
  "evidence": [
255
  "scripts/verify_live_publication.py",
256
  "docs/data/live_publication_status.json"
257
  ],
258
- "boundary": "fetches public GitHub/HF URLs; it does not validate private training state"
 
259
  },
260
  {
261
  "id": "citation_metadata",
262
- "claim": "The project is externally citable and machine-readable.",
263
  "status": "verified",
264
  "evidence": [
265
  "CITATION.cff",
@@ -267,11 +266,11 @@
267
  "docs/data/project_manifest.json",
268
  "LICENSE"
269
  ],
270
- "boundary": "code license does not override original Xperience-10M dataset terms"
 
271
  },
272
  {
273
  "id": "project_path",
274
- "claim": "A first-time reader has an explicit project path.",
275
  "status": "verified",
276
  "evidence": [
277
  "docs/data/project_packet.json",
@@ -280,29 +279,30 @@
280
  "README.md",
281
  "docs/index.html"
282
  ],
283
- "boundary": "guides inspection across data, tasks, results, and scale-up status"
 
284
  },
285
  {
286
  "id": "artifact_index",
287
- "claim": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
288
  "status": "verified",
289
  "evidence": [
290
  "ARTIFACT_GUIDE.md",
291
  "scripts/build_artifact_index.py",
292
  "docs/data/artifact_index.json"
293
  ],
294
- "boundary": "selective source-of-truth catalog, not a complete inventory of every output file"
 
295
  },
296
  {
297
  "id": "reproducibility_contract",
298
- "claim": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
299
  "status": "verified",
300
  "evidence": [
301
  "REPRODUCIBILITY.md",
302
  "docs/data/reproducibility_matrix.json",
303
  "notes/reproducibility_audit.md"
304
  ],
305
- "boundary": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
 
306
  }
307
  ]
308
  }
 
1
  {
2
  "project": "Ropedia Xperience-10M Task Suite",
3
  "scope": "single public Xperience-10M sample episode",
4
+ "readouts": [
5
  {
6
  "id": "project_status",
 
7
  "status": "verified",
8
  "evidence": [
9
  "PROJECT_STATUS.md",
10
  "docs/data/project_status.json"
11
  ],
12
+ "readout": "A first-pass reader has a compact current-state summary.",
13
+ "scope_note": "summarizes existing evidence and current limitations"
14
  },
15
  {
16
  "id": "research_roadmap",
 
17
  "status": "current",
18
  "evidence": [
19
  "RESEARCH_ROADMAP.md",
20
  "docs/data/research_roadmap.json"
21
  ],
22
+ "readout": "The research roadmap is explicit.",
23
+ "scope_note": "connects public-sample task development to multi-episode data preparation, Qwen3-Omni LoRA, robustness runs, and larger omni-model extensions"
24
  },
25
+ {
26
+ "id": "official_dataset_card_alignment",
 
27
  "status": "verified",
28
  "evidence": [
29
  "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
30
  "docs/data/xperience10m_dataset_card_alignment.json",
31
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m"
32
  ],
33
+ "readout": "The public dataset description is aligned with the official gated Xperience-10M dataset card and public sample card.",
34
+ "scope_note": "summarizes upstream public metadata, API listing facts, sample license/tooling, and dataset-card facts; does not grant access or mirror raw data"
35
+ },
36
+ {
37
+ "id": "source_alignment",
38
+ "status": "verified",
39
+ "evidence": [
40
+ "SOURCE_ALIGNMENT_AUDIT.md",
41
+ "docs/data/source_alignment_audit.json",
42
+ "scripts/validate_source_alignment.py"
43
+ ],
44
+ "readout": "Source facts, sample details, API-listing notes, and project coverage are validated across repo, website, and HF cards.",
45
+ "scope_note": "offline committed-fact check; does not fetch private gated data"
46
+ },
47
+ {
48
+ "id": "aligned_windows",
49
  "status": "verified",
50
  "evidence": [
51
  "results/episode_task_suite/windows.csv",
52
  "results/episode_task_suite/shared_windows.npz",
53
  "results/episode_task_suite/summary_report.json"
54
  ],
55
+ "readout": "The public Xperience-10M sample has been converted into aligned model windows.",
56
+ "scope_note": "5,821 frames, 1,161 windows, one public sample episode"
57
+ },
58
+ {
59
+ "id": "feature_contract",
60
+ "status": "verified",
61
+ "evidence": [
62
+ "results/episode_task_suite/feature_manifest.json",
63
+ "results/episode_task_suite/available_modalities.json"
64
+ ],
65
+ "readout": "The current feature contract is explicit and inspectable.",
66
+ "scope_note": "8,546-dimensional aligned multimodal window representation"
67
+ },
68
+ {
69
+ "id": "evaluation_protocol",
70
+ "status": "verified",
71
+ "evidence": [
72
+ "EVALUATION_PROTOCOL.md",
73
+ "docs/data/evaluation_protocol.json",
74
+ "scripts/build_evaluation_protocol.py"
75
+ ],
76
+ "readout": "The task evaluation protocol is explicit and generated from committed metrics.",
77
+ "scope_note": "defines windows, split, per-task metrics, leakage controls, and current limitations"
78
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "id": "modality_atlas",
 
81
  "status": "verified",
82
  "evidence": [
83
  "docs/data/modality_atlas.json",
84
  "docs/assets/modalities/",
85
  "docs/index.html"
86
  ],
87
+ "readout": "The public sample modalities are inspectable without raw data redistribution.",
88
+ "scope_note": "derived thumbnails for presentation; raw data remains excluded"
89
  },
90
  {
91
  "id": "task_surface_integrity",
 
92
  "status": "verified",
93
  "evidence": [
94
  "docs/data/task_surface_integrity.json",
95
  "scripts/validate_task_surface.py",
96
  "docs/index.html"
97
  ],
98
+ "readout": "Public task cards stay readable for non-expert readers.",
99
+ "scope_note": "presentation integrity for the public task surface"
100
  },
101
  {
102
  "id": "figure_index",
 
103
  "status": "verified",
104
  "evidence": [
105
  "FIGURE_INDEX.md",
106
  "docs/data/figure_index.json",
107
  "scripts/build_figure_index.py"
108
  ],
109
+ "readout": "Public figures, charts, and modality thumbnails are indexed as project evidence.",
110
+ "scope_note": "records derived visual assets, dimensions, hashes, roles, and source scripts; raw Xperience-10M data remains excluded"
111
  },
112
  {
113
  "id": "brand_assets",
 
114
  "status": "verified",
115
  "evidence": [
116
  "docs/assets/brand/",
117
  "docs/data/brand_assets.json",
118
  "scripts/build_brand_assets.py"
119
  ],
120
+ "readout": "A project logo is consistently applied across public surfaces.",
121
+ "scope_note": "generated logo and deterministic derivatives only; no raw dataset data or model weights"
122
  },
123
  {
124
  "id": "twelve_tasks",
 
125
  "status": "verified",
126
  "evidence": [
127
  "scripts/episode_task_suite.py",
128
  "results/episode_task_suite/*/metrics.json",
129
  "results/episode_task_suite/*/predictions.*"
130
  ],
131
+ "readout": "The 12 task heads are implemented as scripts with saved metrics and predictions.",
132
+ "scope_note": "chronological single-episode split, not cross-episode generalization"
133
  },
134
  {
135
  "id": "minimal_vs_neural",
 
136
  "status": "verified",
137
  "evidence": [
138
  "scripts/neural_task_models.py",
139
  "results/episode_task_suite/neural_mlp/",
140
  "docs/assets/task_architectures.png"
141
  ],
142
+ "readout": "Minimal and neural heads use the same task contracts.",
143
+ "scope_note": "small heads only; not a foundation model"
144
  },
145
  {
146
  "id": "research_directions",
 
147
  "status": "verified",
148
  "evidence": [
149
  "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
150
  "docs/data/research_directions.json"
151
  ],
152
+ "readout": "Four Ropedia research directions are mapped honestly as direct, proxy, or diagnostic evidence.",
153
+ "scope_note": "some directions remain proxy-only"
154
  },
155
  {
156
  "id": "direction_extensions",
 
157
  "status": "verified",
158
  "evidence": [
159
  "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
160
  "docs/data/research_direction_extensions.json"
161
  ],
162
+ "readout": "Four extra direction probes are coded and evaluated.",
163
+ "scope_note": "single-episode probes, not full research-direction solutions"
164
  },
165
  {
166
  "id": "qwen3_omni_diagnostic_pilot",
 
167
  "status": "verified_diagnostic",
168
  "evidence": [
169
  "docs/data/omni_finetune_verified_result.json",
 
171
  "scripts/omni/package_verified_omni_result.py",
172
  "scripts/omni/audit_verified_omni_package.py"
173
  ],
174
+ "readout": "Qwen3-Omni has a verified selected-episode held-out diagnostic pilot.",
175
+ "scope_note": "the pipeline is verified, but model quality is weak: JSON validity is below target and action/subtask metrics are low"
176
  },
177
  {
178
  "id": "multi_episode_quality_improvement",
 
179
  "status": "active_next_step",
180
  "evidence": [
181
  "scripts/omni/run_128_fullsplit_parallel_export_8gpu.sh",
182
  "docs/data/omni_finetune_verified_result.json",
183
  "FOUNDATION_MODEL_PLAN.md"
184
  ],
185
+ "readout": "The next Qwen3-Omni step is structured-output and task-quality improvement on the same selected split.",
186
+ "scope_note": "stronger model quality requires output-format improvements and action/subtask error analysis"
187
  },
188
  {
189
  "id": "scale_up_status_check",
 
190
  "status": "verified",
191
  "evidence": [
192
  "scripts/validate_scope_claims.py",
193
  "docs/data/scope_claims_audit.json"
194
  ],
195
+ "readout": "Older pilot path strings are tracked as setup-file provenance.",
196
+ "scope_note": "run/path identifiers stay separate from completed held-out-episode results"
197
  },
198
  {
199
  "id": "mirror_parity",
 
200
  "status": "verified",
201
  "evidence": [
202
  "scripts/validate_mirror_parity.py",
203
  "docs/data/mirror_parity.json"
204
  ],
205
+ "readout": "Prepared GitHub and Hugging Face mirrors carry matching critical data, visual, HTML, and validator files.",
206
+ "scope_note": "compares prepared local mirror bundles before upload; live URLs are checked after publishing"
207
  },
208
  {
209
  "id": "publication_package",
 
210
  "status": "verified",
211
  "evidence": [
212
  "scripts/validate_publication_package.py",
213
  "docs/data/publication_audit.json"
214
  ],
215
+ "readout": "The public GitHub and Hugging Face bundles contain the intended release files.",
216
+ "scope_note": "checks public files, HF bundles, and current public-card assets; temporary local outputs are excluded"
217
  },
218
  {
219
  "id": "website_integrity",
 
220
  "status": "verified",
221
  "evidence": [
222
  "scripts/validate_website_integrity.py",
223
  "docs/data/website_integrity.json"
224
  ],
225
+ "readout": "The public website has checked local references.",
226
+ "scope_note": "checks local links, anchors, JSON data, and referenced images; external URLs are not fetched"
227
  },
228
  {
229
  "id": "rendered_site_check",
 
230
  "status": "verified",
231
  "evidence": [
232
  "RENDERED_SITE_CHECK.md",
233
  "scripts/build_rendered_site_check.py",
234
  "docs/data/rendered_site_check.json"
235
  ],
236
+ "readout": "The rendered website walkthrough has a browser-level interaction check.",
237
+ "scope_note": "checks local page load, tab switch, walkthrough deep link, player controls, and console health"
238
  },
239
  {
240
  "id": "quality_gates",
 
241
  "status": "verified",
242
  "evidence": [
243
  "QUALITY_GATES.md",
244
  "scripts/build_quality_gates.py",
245
  "docs/data/quality_gates.json"
246
  ],
247
+ "readout": "The release gate is explicit.",
248
+ "scope_note": "summarizes packaging and live-mirror checks; cross-episode model quality is measured by later held-out reports"
249
  },
250
  {
251
  "id": "live_publication_status",
 
252
  "status": "verified",
253
  "evidence": [
254
  "scripts/verify_live_publication.py",
255
  "docs/data/live_publication_status.json"
256
  ],
257
+ "readout": "The live public mirrors are checked after upload.",
258
+ "scope_note": "fetches public GitHub/HF URLs; it does not validate private training state"
259
  },
260
  {
261
  "id": "citation_metadata",
 
262
  "status": "verified",
263
  "evidence": [
264
  "CITATION.cff",
 
266
  "docs/data/project_manifest.json",
267
  "LICENSE"
268
  ],
269
+ "readout": "The project is externally citable and machine-readable.",
270
+ "scope_note": "code license does not override original Xperience-10M dataset terms"
271
  },
272
  {
273
  "id": "project_path",
 
274
  "status": "verified",
275
  "evidence": [
276
  "docs/data/project_packet.json",
 
279
  "README.md",
280
  "docs/index.html"
281
  ],
282
+ "readout": "A first-time reader has an explicit project path.",
283
+ "scope_note": "guides inspection across data, tasks, results, and scale-up status"
284
  },
285
  {
286
  "id": "artifact_index",
 
287
  "status": "verified",
288
  "evidence": [
289
  "ARTIFACT_GUIDE.md",
290
  "scripts/build_artifact_index.py",
291
  "docs/data/artifact_index.json"
292
  ],
293
+ "readout": "The core project artifacts are grouped for human reading and indexed with existence, size, and hash metadata where stable.",
294
+ "scope_note": "selective source-of-truth catalog, not a complete inventory of every output file"
295
  },
296
  {
297
  "id": "reproducibility_contract",
 
298
  "status": "verified",
299
  "evidence": [
300
  "REPRODUCIBILITY.md",
301
  "docs/data/reproducibility_matrix.json",
302
  "notes/reproducibility_audit.md"
303
  ],
304
+ "readout": "The public reproduction path is documented with commands, expected outputs, and exact-match reproduction evidence.",
305
+ "scope_note": "publicly reproduces the single-episode pipeline; multi-episode Qwen3-Omni metrics are added only after data preparation and held-out evaluation"
306
  }
307
  ]
308
  }
metrics/mirror_parity.json CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/omni_model_comparison.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
- "generated_at_utc": "2026-06-21T15:17:00+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
@@ -1758,6 +1758,6 @@
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
- "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before claiming v6 is globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
 
1
  {
2
  "title": "Ropedia Xperience-10M Current Result Versions and Model Groups",
3
+ "generated_at_utc": "2026-06-22T10:59:59+00:00",
4
  "status": "pass",
5
  "version_count": 3,
6
  "model_group_count": 5,
 
1758
  ],
1759
  "pending": [
1760
  "Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.",
1761
+ "Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before presenting v6 as globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly."
1762
  ]
1763
  }
metrics/project_brief.json CHANGED
@@ -56,7 +56,7 @@
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
- "next_stage": "Improve action/subtask quality through error analysis before larger robustness or alternative-backbone claims.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
 
56
  "Use docs/data/omni_finetune_verified_result.json for the current multi-episode Qwen3-Omni pilot result."
57
  ],
58
  "scope_boundary": "The public sample is enough to build and verify task definitions, feature contracts, metrics, visualization, and baseline code. The final multi-episode Qwen3-Omni diagnostic result verifies the training loop and strict-JSON output reliability, but does not yet show strong action/subtask model quality.",
59
+ "next_stage": "Improve action/subtask quality through error analysis before presenting larger robustness or alternative-backbone results.",
60
  "entry_points": {
61
  "visual_dashboard": "https://chaoyue0307.github.io/ropedia-xperience-10m-task-suite/",
62
  "hf_space": "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite",
metrics/public_reader_map.json CHANGED
@@ -6,77 +6,124 @@
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
- "then_inspect": ["PROJECT_STATUS.md", "RESEARCH_TAKEAWAYS.md"]
 
 
 
10
  },
11
  {
12
  "reader_goal": "Understand the two evidence lines",
13
  "start_here": "TWO_EVIDENCE_LINES.md",
14
- "then_inspect": ["docs/data/two_evidence_lines.json", "docs/data/two_evidence_line_result_summary.json"]
 
 
 
15
  },
16
  {
17
  "reader_goal": "See the visual public dashboard",
18
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
19
- "then_inspect": ["docs/index.html", "docs/data/project_packet.json"]
 
 
 
20
  },
21
  {
22
  "reader_goal": "Decode project terminology",
23
  "start_here": "GLOSSARY.md",
24
- "then_inspect": ["docs/data/glossary.json", "Homepage Glossary section"]
 
 
 
25
  },
26
  {
27
  "reader_goal": "Understand the data unit",
28
  "start_here": "results/episode_task_suite/windows.csv",
29
- "then_inspect": ["results/episode_task_suite/feature_manifest.json", "docs/data/raw_sample_files.json"]
 
 
 
30
  },
31
  {
32
  "reader_goal": "Trace the 128-episode split",
33
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
34
- "then_inspect": ["docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/xperience10m_128_episode_selection.csv"]
 
 
 
35
  },
36
  {
37
  "reader_goal": "Inspect the 20-task benchmark",
38
  "start_here": "TASK_SUITE_20.md",
39
- "then_inspect": ["docs/data/task_suite_20.json", "EVALUATION_PROTOCOL.md"]
 
 
 
40
  },
41
  {
42
  "reader_goal": "Compare current results",
43
  "start_here": "RESEARCH_TAKEAWAYS.md",
44
- "then_inspect": ["docs/data/task_method_20_result_matrix.json", "docs/data/unified_task_model_radar.json"]
 
 
 
45
  },
46
  {
47
  "reader_goal": "Compare 1-episode and 128-episode methods",
48
  "start_here": "Homepage radar section",
49
- "then_inspect": ["docs/data/single_episode_task_model_radar.json", "docs/data/episode128_task_model_radar.json"]
 
 
 
50
  },
51
  {
52
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
53
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
54
- "then_inspect": ["docs/data/qwen3_omni_run_lineage.json", "docs/data/qwen3_v5_v6_comparison.json"]
 
 
 
55
  },
56
  {
57
  "reader_goal": "Find all derived artifacts",
58
  "start_here": "ARTIFACT_GUIDE.md",
59
- "then_inspect": ["Hugging Face artifact dataset", "docs/data/artifact_index.json"]
 
 
 
60
  },
61
  {
62
  "reader_goal": "Download model weights with their matching results",
63
  "start_here": "Hugging Face weights/results repo",
64
- "then_inspect": ["manifest.json", "analysis/docs/data/task_method_20_result_matrix.json", "results/"]
 
 
 
 
65
  },
66
  {
67
  "reader_goal": "Reproduce or extend the work",
68
  "start_here": "REPRODUCIBILITY.md",
69
- "then_inspect": ["QUALITY_GATES.md", "scripts/", "results/"]
 
 
 
 
70
  },
71
  {
72
  "reader_goal": "Understand foundation-model directions",
73
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
74
- "then_inspect": ["FOUNDATION_MODEL_PLAN.md", "docs/data/three_foundation_pipelines.json"]
 
 
 
75
  },
76
  {
77
  "reader_goal": "Check public-release health",
78
  "start_here": "PUBLIC_SURFACE_QA.md",
79
- "then_inspect": ["docs/data/live_publication_status.json", "docs/data/mirror_parity.json"]
 
 
 
80
  }
81
  ],
82
  "public_surfaces": [
@@ -125,31 +172,49 @@
125
  "Foundation directions",
126
  "Public-release checks"
127
  ],
128
- "claim_boundaries": [
129
- {
130
- "claim_type": "Single public-sample task behavior",
131
- "public_evidence": ["results/episode_task_suite/", "docs/data/task_suite_20.json"],
132
- "boundary": "Describes one public sample episode, not the full dataset distribution."
133
- },
134
- {
135
- "claim_type": "128-episode method comparison",
136
- "public_evidence": ["XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md", "docs/data/xperience10m_128_episode_feature_index.json", "results/omni_finetune/*128*", "docs/data/omni_model_comparison.json"],
137
- "boundary": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
138
- },
139
- {
140
- "claim_type": "Qwen3-Omni v1-v6 lineage",
141
- "public_evidence": ["QWEN3_OMNI_RUN_LINEAGE.md", "docs/data/qwen3_omni_run_lineage.json"],
142
- "boundary": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
143
- },
144
- {
145
- "claim_type": "Foundation-model track quality",
146
- "public_evidence": ["Verified Qwen3-Omni and Cosmos3 result packages", "model cards"],
147
- "boundary": "Numeric task scores appear only when a task-specific eval or probe exists."
148
- },
149
- {
150
- "claim_type": "Reproducibility",
151
- "public_evidence": ["REPRODUCIBILITY.md", "QUALITY_GATES.md", "release validators"],
152
- "boundary": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  }
154
  ]
155
  }
 
6
  {
7
  "reader_goal": "Understand the project in one pass",
8
  "start_here": "PROJECT_BRIEF.md",
9
+ "then_inspect": [
10
+ "PROJECT_STATUS.md",
11
+ "RESEARCH_TAKEAWAYS.md"
12
+ ]
13
  },
14
  {
15
  "reader_goal": "Understand the two evidence lines",
16
  "start_here": "TWO_EVIDENCE_LINES.md",
17
+ "then_inspect": [
18
+ "docs/data/two_evidence_lines.json",
19
+ "docs/data/two_evidence_line_result_summary.json"
20
+ ]
21
  },
22
  {
23
  "reader_goal": "See the visual public dashboard",
24
  "start_here": "GitHub Pages dashboard or Hugging Face Space",
25
+ "then_inspect": [
26
+ "docs/index.html",
27
+ "docs/data/project_packet.json"
28
+ ]
29
  },
30
  {
31
  "reader_goal": "Decode project terminology",
32
  "start_here": "GLOSSARY.md",
33
+ "then_inspect": [
34
+ "docs/data/glossary.json",
35
+ "Homepage Glossary section"
36
+ ]
37
  },
38
  {
39
  "reader_goal": "Understand the data unit",
40
  "start_here": "results/episode_task_suite/windows.csv",
41
+ "then_inspect": [
42
+ "results/episode_task_suite/feature_manifest.json",
43
+ "docs/data/raw_sample_files.json"
44
+ ]
45
  },
46
  {
47
  "reader_goal": "Trace the 128-episode split",
48
  "start_here": "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
49
+ "then_inspect": [
50
+ "docs/data/xperience10m_128_episode_feature_index.json",
51
+ "results/omni_finetune/xperience10m_128_episode_selection.csv"
52
+ ]
53
  },
54
  {
55
  "reader_goal": "Inspect the 20-task benchmark",
56
  "start_here": "TASK_SUITE_20.md",
57
+ "then_inspect": [
58
+ "docs/data/task_suite_20.json",
59
+ "EVALUATION_PROTOCOL.md"
60
+ ]
61
  },
62
  {
63
  "reader_goal": "Compare current results",
64
  "start_here": "RESEARCH_TAKEAWAYS.md",
65
+ "then_inspect": [
66
+ "docs/data/task_method_20_result_matrix.json",
67
+ "docs/data/unified_task_model_radar.json"
68
+ ]
69
  },
70
  {
71
  "reader_goal": "Compare 1-episode and 128-episode methods",
72
  "start_here": "Homepage radar section",
73
+ "then_inspect": [
74
+ "docs/data/single_episode_task_model_radar.json",
75
+ "docs/data/episode128_task_model_radar.json"
76
+ ]
77
  },
78
  {
79
  "reader_goal": "Read Qwen3-Omni v1-v6 correctly",
80
  "start_here": "QWEN3_OMNI_RUN_LINEAGE.md",
81
+ "then_inspect": [
82
+ "docs/data/qwen3_omni_run_lineage.json",
83
+ "docs/data/qwen3_v5_v6_comparison.json"
84
+ ]
85
  },
86
  {
87
  "reader_goal": "Find all derived artifacts",
88
  "start_here": "ARTIFACT_GUIDE.md",
89
+ "then_inspect": [
90
+ "Hugging Face artifact dataset",
91
+ "docs/data/artifact_index.json"
92
+ ]
93
  },
94
  {
95
  "reader_goal": "Download model weights with their matching results",
96
  "start_here": "Hugging Face weights/results repo",
97
+ "then_inspect": [
98
+ "manifest.json",
99
+ "analysis/docs/data/task_method_20_result_matrix.json",
100
+ "results/"
101
+ ]
102
  },
103
  {
104
  "reader_goal": "Reproduce or extend the work",
105
  "start_here": "REPRODUCIBILITY.md",
106
+ "then_inspect": [
107
+ "QUALITY_GATES.md",
108
+ "scripts/",
109
+ "results/"
110
+ ]
111
  },
112
  {
113
  "reader_goal": "Understand foundation-model directions",
114
  "start_here": "THREE_FOUNDATION_PIPELINES.md",
115
+ "then_inspect": [
116
+ "FOUNDATION_MODEL_PLAN.md",
117
+ "docs/data/three_foundation_pipelines.json"
118
+ ]
119
  },
120
  {
121
  "reader_goal": "Check public-release health",
122
  "start_here": "PUBLIC_SURFACE_QA.md",
123
+ "then_inspect": [
124
+ "docs/data/live_publication_status.json",
125
+ "docs/data/mirror_parity.json"
126
+ ]
127
  }
128
  ],
129
  "public_surfaces": [
 
172
  "Foundation directions",
173
  "Public-release checks"
174
  ],
175
+ "reading_scopes": [
176
+ {
177
+ "public_evidence": [
178
+ "results/episode_task_suite/",
179
+ "docs/data/task_suite_20.json"
180
+ ],
181
+ "topic": "Single public-sample task behavior",
182
+ "scope_note": "Describes one public sample episode, not the full dataset distribution."
183
+ },
184
+ {
185
+ "public_evidence": [
186
+ "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md",
187
+ "docs/data/xperience10m_128_episode_feature_index.json",
188
+ "results/omni_finetune/*128*",
189
+ "docs/data/omni_model_comparison.json"
190
+ ],
191
+ "topic": "128-episode method comparison",
192
+ "scope_note": "Uses selected held-out episodes and derived public-safe summaries; official raw files remain gated upstream."
193
+ },
194
+ {
195
+ "public_evidence": [
196
+ "QWEN3_OMNI_RUN_LINEAGE.md",
197
+ "docs/data/qwen3_omni_run_lineage.json"
198
+ ],
199
+ "topic": "Qwen3-Omni v1-v6 lineage",
200
+ "scope_note": "v1-v4 are pipeline and ablation evidence, v5 is the pinned prior release, and v6 is the current public 20-task Qwen row."
201
+ },
202
+ {
203
+ "public_evidence": [
204
+ "Verified Qwen3-Omni and Cosmos3 result packages",
205
+ "model cards"
206
+ ],
207
+ "topic": "Foundation-model track quality",
208
+ "scope_note": "Numeric task scores appear only when a task-specific eval or probe exists."
209
+ },
210
+ {
211
+ "public_evidence": [
212
+ "REPRODUCIBILITY.md",
213
+ "QUALITY_GATES.md",
214
+ "release validators"
215
+ ],
216
+ "topic": "Reproducibility",
217
+ "scope_note": "Raw gated Xperience-10M files and full foundation weights are not redistributed."
218
  }
219
  ]
220
  }
metrics/public_surface_qa.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-22T10:20:26+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
@@ -18,7 +18,7 @@
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
- "generated_at_utc": "2026-06-22T10:09:34+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
@@ -28,27 +28,27 @@
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
- "generated_at_utc": "2026-06-22T10:10:38+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
- "generated_at_utc": "2026-06-21T20:58:21+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
- "generated_at_utc": "2026-06-22T10:19:22+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
- "generated_at_utc": "2026-06-22T10:19:19+00:00"
52
  }
53
  },
54
  "failures": {}
 
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-22T11:18:45+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
 
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
 
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
+ "generated_at_utc": "2026-06-22T11:17:07+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
+ "generated_at_utc": "2026-06-22T11:17:08+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
+ "generated_at_utc": "2026-06-22T11:17:10+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
+ "generated_at_utc": "2026-06-22T11:18:16+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
+ "generated_at_utc": "2026-06-22T11:18:11+00:00"
52
  }
53
  },
54
  "failures": {}