{
"generated_at_utc": "2026-06-21T11:49:06+00:00",
"interpretation_rule": "Use the 1-episode line for task construction and reproducibility claims. Use the 128-episode line for same-split metadata/raw baselines, Qwen3-Omni v6 LoRA diagnostics, and Cosmos3 diagnostics.",
"lines": [
{
"artifact_entry_points": [
"docs/data/single_episode_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"results/episode_task_suite/summary_report.json",
"results/episode_task_suite/feature_manifest.json",
"docs/single_episode_explorer.html"
],
"claim_boundary": "Supports task construction, file inspection, local reproducibility, and controlled single-episode baseline claims.",
"data_unit": "One public Xperience-10M sample episode",
"direct_scored_method_task_count": 40,
"id": "single_public_sample_episode",
"label": "1 sample episode",
"method_count": 2,
"method_task_record_count": 40,
"methods": [
{
"direct_scored_task_count": 20,
"id": "minimal",
"label": "Minimal",
"method_detail": "Single-episode simple heads over the public sample split.",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "1 public sample episode",
"scored_task_count": 20,
"status_counts": {
"scored": 20
}
},
{
"direct_scored_task_count": 20,
"id": "neural_mlp",
"label": "Neural MLP",
"method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "1 public sample episode",
"scored_task_count": 20,
"status_counts": {
"scored": 20
}
}
],
"not_for": "Do not use this line as evidence of multi-episode generalization.",
"primary_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/single_episode_task_model_radar.svg"
],
"proxy_scored_method_task_count": 0,
"result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
"scored_method_task_count": 40,
"short_label": "Line 1",
"task_count": 20
},
{
"artifact_entry_points": [
"docs/data/episode128_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"docs/data/xperience10m_128_episode_feature_index.json",
"docs/data/omni_model_comparison.json",
"docs/data/qwen3_omni_run_lineage.json",
"docs/data/task_method_20_gap_audit.json"
],
"claim_boundary": "Supports same-split metadata/raw baseline comparison, Qwen3-Omni v6 diagnostics, Cosmos3 diagnostics, and scale-up planning on public-safe processed artifacts.",
"data_unit": "Selected held-out 96/16/16 split with public-safe processed features linked to official gated episode paths",
"direct_scored_method_task_count": 134,
"id": "selected_128_episode_surface",
"label": "128 selected episodes",
"method_count": 7,
"method_task_record_count": 140,
"methods": [
{
"direct_scored_task_count": 19,
"id": "metadata128_simple",
"label": "128ep Aligned Simple",
"method_detail": "128-episode aligned simple baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
"proxy_scored_task_count": 1,
"result_record_count": 20,
"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
"scored_task_count": 20,
"status_counts": {
"proxy_scored": 1,
"scored": 19
}
},
{
"direct_scored_task_count": 19,
"id": "metadata128_neural_mlp",
"label": "128ep Aligned NN",
"method_detail": "128-episode aligned MLP baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
"proxy_scored_task_count": 1,
"result_record_count": 20,
"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
"scored_task_count": 20,
"status_counts": {
"proxy_scored": 1,
"scored": 19
}
},
{
"direct_scored_task_count": 18,
"id": "raw128_simple",
"label": "128ep Raw Simple",
"method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.",
"proxy_scored_task_count": 2,
"result_record_count": 20,
"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
"scored_task_count": 20,
"status_counts": {
"proxy_scored": 2,
"scored": 18
}
},
{
"direct_scored_task_count": 18,
"id": "raw128_neural_mlp",
"label": "128ep Raw NN",
"method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.",
"proxy_scored_task_count": 2,
"result_record_count": 20,
"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
"scored_task_count": 20,
"status_counts": {
"proxy_scored": 2,
"scored": 18
}
},
{
"direct_scored_task_count": 20,
"id": "qwen3_omni_v6_lora",
"label": "Qwen3-Omni v6 LoRA",
"method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future/retrieval/sensor-target probes scored from task-specific JSON.",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"status_counts": {
"scored": 20
}
},
{
"direct_scored_task_count": 20,
"id": "cosmos3_super_reasoner",
"label": "Cosmos3-Super Reasoner",
"method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 5/8/9/10/11/12/13/14/16/17/18/19/20 probes where public metrics exist.",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"status_counts": {
"scored": 20
}
},
{
"direct_scored_task_count": 20,
"id": "cosmos3_nano_future_window",
"label": "Cosmos3-Nano Future Window",
"method_detail": "Verified Cosmos3-Nano future-window compatibility metrics, plus model-output probes for tasks 2/5/7/8/10/11/12/13/14/15/16/17/18/19 and a derived task-20 boundary timing probe scored from held-out future-window artifacts.",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"status_counts": {
"scored": 20
}
}
],
"not_for": "Do not read compact-proxy cells as direct raw-target measurements.",
"primary_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/episode128_task_model_radar.svg",
"docs/assets/charts/unified_task_model_radar.svg"
],
"proxy_scored_method_task_count": 6,
"result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
"scored_method_task_count": 140,
"short_label": "Line 2",
"task_count": 20
}
],
"method_blocks": [
{
"block": "Task-head baselines",
"direct_scored_method_task_count": 40,
"evidence_type": "Direct target metrics on the public sample windows.",
"line_id": "single_public_sample_episode",
"line_label": "1 sample episode",
"method_ids": [
"minimal",
"neural_mlp"
],
"method_task_record_count": 40,
"methods": [
"Minimal",
"Neural MLP"
],
"proxy_scored_method_task_count": 0,
"read_as": "Task construction, local reproducibility, and Minimal-vs-Neural behavior.",
"scored_method_task_count": 40
},
{
"block": "Aligned baseline heads",
"direct_scored_method_task_count": 74,
"evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"method_ids": [
"metadata128_simple",
"metadata128_neural_mlp",
"raw128_simple",
"raw128_neural_mlp"
],
"method_task_record_count": 80,
"methods": [
"128ep Aligned Simple",
"128ep Aligned NN",
"128ep Raw Simple",
"128ep Raw NN"
],
"proxy_scored_method_task_count": 6,
"read_as": "Same-split metadata/raw-feature baseline comparison.",
"scored_method_task_count": 80
},
{
"block": "Qwen3-Omni series",
"direct_scored_method_task_count": 20,
"evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"method_ids": [
"qwen3_omni_v6_lora"
],
"method_task_record_count": 20,
"methods": [
"Qwen3-Omni v6 LoRA"
],
"proxy_scored_method_task_count": 0,
"read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface.",
"scored_method_task_count": 20
},
{
"block": "Cosmos3 series",
"direct_scored_method_task_count": 40,
"evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"method_ids": [
"cosmos3_super_reasoner",
"cosmos3_nano_future_window"
],
"method_task_record_count": 40,
"methods": [
"Cosmos3-Super Reasoner",
"Cosmos3-Nano Future Window"
],
"proxy_scored_method_task_count": 0,
"read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface.",
"scored_method_task_count": 40
}
],
"proxy_records": [
{
"line_id": "selected_128_episode_surface",
"method": "128ep Raw Simple",
"metric_key": "macro_f1",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_simple",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
"task_id": "interaction_text_prediction",
"task_label": "Interaction Text Prediction",
"task_number": 15
},
{
"line_id": "selected_128_episode_surface",
"method": "128ep Raw NN",
"metric_key": "macro_f1",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_neural_mlp",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
"task_id": "interaction_text_prediction",
"task_label": "Interaction Text Prediction",
"task_number": 15
},
{
"line_id": "selected_128_episode_surface",
"method": "128ep Aligned Simple",
"metric_key": "mrr",
"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
"series_id": "metadata128_simple",
"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"line_id": "selected_128_episode_surface",
"method": "128ep Aligned NN",
"metric_key": "mrr",
"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
"series_id": "metadata128_neural_mlp",
"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"line_id": "selected_128_episode_surface",
"method": "128ep Raw Simple",
"metric_key": "mrr",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_simple",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"line_id": "selected_128_episode_surface",
"method": "128ep Raw NN",
"metric_key": "mrr",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_neural_mlp",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
}
],
"reader_policy": {
"proxy_policy": "Proxy-scored cells stay numeric only when the source artifact and reason are attached; they should not be read as direct raw-target measurements.",
"selected_128_episode_surface": "Use for held-out comparison, metadata/raw-feature baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, Cosmos3-Nano Future Window, and scale-up decisions.",
"single_public_sample_episode": "Use for task construction, raw-file inspection, local reproducibility, and controlled Minimal-vs-Neural baseline behavior."
},
"reader_summary": "The suite has two public evidence lines. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Do not mix the two when reading scores.",
"reading_order": [
{
"reason": "Line 1 answers task-lab and reproducibility questions; line 2 answers selected-128 comparison questions.",
"step": "Choose the evidence line"
},
{
"reason": "Use the 1-episode radar for Minimal-vs-Neural behavior and the 128-episode radar for metadata/raw baselines, Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano.",
"step": "Open the matching radar"
},
{
"reason": "Every numeric score is tied to a method, task, metric key, source artifact, and proxy flag.",
"step": "Inspect the matrix row"
},
{
"reason": "The six compact-proxy cells are numeric but are not direct raw-target measurements.",
"step": "Check proxy cells before interpreting totals"
}
],
"related_model_artifacts": [
{
"name": "Qwen3-Omni v1-v6 run lineage",
"repo": "docs/data/qwen3_omni_run_lineage.json",
"role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence."
},
{
"name": "Cosmos3-Super Forward-Dynamics LoRA",
"repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep",
"role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row."
}
],
"score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
"source_lines": "docs/data/two_evidence_lines.json",
"source_matrix": "docs/data/task_method_20_result_matrix.json",
"status": "pass",
"summary": {
"direct_scored_method_task_count": 174,
"line_count": 2,
"method_count": 9,
"method_task_record_count": 180,
"proxy_scored_method_task_count": 6,
"scored_method_task_count": 180,
"task_count": 20
},
"title": "Two Evidence-Line Result Summary"
}