cy0307 committed on
Commit
8a8926d
·
verified ·
1 Parent(s): 0f657b5

Refine reader-facing public wording (4/6)

Browse files
Files changed (20) hide show
  1. data/task_surface_integrity.json +1 -1
  2. data/website_integrity.json +10 -10
  3. data/xperience10m_dataset_card_alignment.json +2 -2
  4. docs/data/website_integrity.json +10 -10
  5. docs/data/xperience10m_dataset_card_alignment.json +2 -2
  6. metrics/task_surface_integrity.json +1 -1
  7. metrics/website_integrity.json +10 -10
  8. metrics/xperience10m_dataset_card_alignment.json +2 -2
  9. results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md +1 -1
  10. results/omni_finetune/OMNI_MODEL_COMPARISON.md +2 -2
  11. results/omni_finetune/XPERIENCE10M_128_DATA_PREPARATION_AND_FINETUNE_PLAN.md +2 -2
  12. results/omni_finetune/verified_public/xperience10m_cosmos3_super_forward_dynamics_lora_128ep_train1epoch_256_attn_full8gpu_20260608_eval_test_full_fsdp/dataset/target_manifest.json +1 -1
  13. results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/analysis/ERROR_ANALYSIS.md +1 -1
  14. results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/analysis/error_analysis_summary.json +1 -1
  15. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/ERROR_ANALYSIS.md +1 -1
  16. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/error_analysis_summary.json +1 -1
  17. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full/analysis/ERROR_ANALYSIS.md +1 -1
  18. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full/analysis/error_analysis_summary.json +1 -1
  19. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full/analysis/ERROR_ANALYSIS.md +1 -1
  20. results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full/analysis/error_analysis_summary.json +1 -1
data/task_surface_integrity.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:10:38+00:00",
4
  "summary": {
5
  "original_walkthrough_task_count": 12,
6
  "expected_original_walkthrough_task_count": 12,
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:17:07+00:00",
4
  "summary": {
5
  "original_walkthrough_task_count": 12,
6
  "expected_original_walkthrough_task_count": 12,
data/website_integrity.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:09:34+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
@@ -340,7 +340,7 @@
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
- "bytes": 12219,
344
  "top_level_type": "dict"
345
  },
346
  {
@@ -370,7 +370,7 @@
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
- "bytes": 1460402,
374
  "top_level_type": "dict"
375
  },
376
  {
@@ -385,12 +385,12 @@
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
- "bytes": 82102,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
- "bytes": 4032,
394
  "top_level_type": "dict"
395
  },
396
  {
@@ -410,7 +410,7 @@
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
- "bytes": 6951,
414
  "top_level_type": "dict"
415
  },
416
  {
@@ -480,12 +480,12 @@
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
- "bytes": 7165,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
- "bytes": 21313,
489
  "top_level_type": "dict"
490
  },
491
  {
@@ -505,7 +505,7 @@
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
- "bytes": 27518,
509
  "top_level_type": "dict"
510
  },
511
  {
@@ -515,7 +515,7 @@
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
- "bytes": 8500,
519
  "top_level_type": "dict"
520
  },
521
  {
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:17:07+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
 
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
+ "bytes": 12101,
344
  "top_level_type": "dict"
345
  },
346
  {
 
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
+ "bytes": 1461299,
374
  "top_level_type": "dict"
375
  },
376
  {
 
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
+ "bytes": 82104,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
+ "bytes": 4044,
394
  "top_level_type": "dict"
395
  },
396
  {
 
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
+ "bytes": 7454,
414
  "top_level_type": "dict"
415
  },
416
  {
 
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
+ "bytes": 7174,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
+ "bytes": 21322,
489
  "top_level_type": "dict"
490
  },
491
  {
 
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
+ "bytes": 27527,
509
  "top_level_type": "dict"
510
  },
511
  {
 
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
+ "bytes": 8501,
519
  "top_level_type": "dict"
520
  },
521
  {
data/xperience10m_dataset_card_alignment.json CHANGED
@@ -186,7 +186,7 @@
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
- "not_yet_claimed": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
@@ -194,7 +194,7 @@
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
- "real held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
 
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
+ "not_yet_demonstrated": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
 
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
+ "held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
docs/data/website_integrity.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:09:34+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
@@ -340,7 +340,7 @@
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
- "bytes": 12219,
344
  "top_level_type": "dict"
345
  },
346
  {
@@ -370,7 +370,7 @@
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
- "bytes": 1460402,
374
  "top_level_type": "dict"
375
  },
376
  {
@@ -385,12 +385,12 @@
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
- "bytes": 82102,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
- "bytes": 4032,
394
  "top_level_type": "dict"
395
  },
396
  {
@@ -410,7 +410,7 @@
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
- "bytes": 6951,
414
  "top_level_type": "dict"
415
  },
416
  {
@@ -480,12 +480,12 @@
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
- "bytes": 7165,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
- "bytes": 21313,
489
  "top_level_type": "dict"
490
  },
491
  {
@@ -505,7 +505,7 @@
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
- "bytes": 27518,
509
  "top_level_type": "dict"
510
  },
511
  {
@@ -515,7 +515,7 @@
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
- "bytes": 8500,
519
  "top_level_type": "dict"
520
  },
521
  {
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:17:07+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
 
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
+ "bytes": 12101,
344
  "top_level_type": "dict"
345
  },
346
  {
 
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
+ "bytes": 1461299,
374
  "top_level_type": "dict"
375
  },
376
  {
 
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
+ "bytes": 82104,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
+ "bytes": 4044,
394
  "top_level_type": "dict"
395
  },
396
  {
 
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
+ "bytes": 7454,
414
  "top_level_type": "dict"
415
  },
416
  {
 
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
+ "bytes": 7174,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
+ "bytes": 21322,
489
  "top_level_type": "dict"
490
  },
491
  {
 
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
+ "bytes": 27527,
509
  "top_level_type": "dict"
510
  },
511
  {
 
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
+ "bytes": 8501,
519
  "top_level_type": "dict"
520
  },
521
  {
docs/data/xperience10m_dataset_card_alignment.json CHANGED
@@ -186,7 +186,7 @@
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
- "not_yet_claimed": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
@@ -194,7 +194,7 @@
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
- "real held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
 
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
+ "not_yet_demonstrated": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
 
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
+ "held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
metrics/task_surface_integrity.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:10:38+00:00",
4
  "summary": {
5
  "original_walkthrough_task_count": 12,
6
  "expected_original_walkthrough_task_count": 12,
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:17:07+00:00",
4
  "summary": {
5
  "original_walkthrough_task_count": 12,
6
  "expected_original_walkthrough_task_count": 12,
metrics/website_integrity.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-22T10:09:34+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
@@ -340,7 +340,7 @@
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
- "bytes": 12219,
344
  "top_level_type": "dict"
345
  },
346
  {
@@ -370,7 +370,7 @@
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
- "bytes": 1460402,
374
  "top_level_type": "dict"
375
  },
376
  {
@@ -385,12 +385,12 @@
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
- "bytes": 82102,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
- "bytes": 4032,
394
  "top_level_type": "dict"
395
  },
396
  {
@@ -410,7 +410,7 @@
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
- "bytes": 6951,
414
  "top_level_type": "dict"
415
  },
416
  {
@@ -480,12 +480,12 @@
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
- "bytes": 7165,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
- "bytes": 21313,
489
  "top_level_type": "dict"
490
  },
491
  {
@@ -505,7 +505,7 @@
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
- "bytes": 27518,
509
  "top_level_type": "dict"
510
  },
511
  {
@@ -515,7 +515,7 @@
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
- "bytes": 8500,
519
  "top_level_type": "dict"
520
  },
521
  {
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-22T11:17:07+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
 
340
  },
341
  {
342
  "path": "data/evidence_contract.json",
343
+ "bytes": 12101,
344
  "top_level_type": "dict"
345
  },
346
  {
 
370
  },
371
  {
372
  "path": "data/mirror_parity.json",
373
+ "bytes": 1461299,
374
  "top_level_type": "dict"
375
  },
376
  {
 
385
  },
386
  {
387
  "path": "data/omni_model_comparison.json",
388
+ "bytes": 82104,
389
  "top_level_type": "dict"
390
  },
391
  {
392
  "path": "data/project_brief.json",
393
+ "bytes": 4044,
394
  "top_level_type": "dict"
395
  },
396
  {
 
410
  },
411
  {
412
  "path": "data/public_reader_map.json",
413
+ "bytes": 7454,
414
  "top_level_type": "dict"
415
  },
416
  {
 
480
  },
481
  {
482
  "path": "data/research_takeaways.json",
483
+ "bytes": 7174,
484
  "top_level_type": "dict"
485
  },
486
  {
487
  "path": "data/scope_claims_audit.json",
488
+ "bytes": 21322,
489
  "top_level_type": "dict"
490
  },
491
  {
 
505
  },
506
  {
507
  "path": "data/summary_metrics.json",
508
+ "bytes": 27527,
509
  "top_level_type": "dict"
510
  },
511
  {
 
515
  },
516
  {
517
  "path": "data/task_method_20_gap_audit.json",
518
+ "bytes": 8501,
519
  "top_level_type": "dict"
520
  },
521
  {
metrics/xperience10m_dataset_card_alignment.json CHANGED
@@ -186,7 +186,7 @@
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
- "not_yet_claimed": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
@@ -194,7 +194,7 @@
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
- "real held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
 
186
  "modality reconstruction",
187
  "misalignment detection"
188
  ],
189
+ "not_yet_demonstrated": [
190
  "large-scale audio-visual pretraining",
191
  "caption generation",
192
  "depth-pixel estimation",
 
194
  "neural rendering",
195
  "policy learning",
196
  "cross-episode generalization",
197
+ "held-out multi-episode Qwen3-Omni model quality"
198
  ]
199
  },
200
  "responsible_use_boundary": [
results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md CHANGED
@@ -36,7 +36,7 @@ final train loss of 0.4130 plus a validation loss of 0.0331. The result verifies
36
  the multi-episode pipeline and gives a real error-analysis baseline; it is still
37
  not a strong final model.
38
 
39
- A stronger model-quality pilot should be claimed only after:
40
 
41
  - selected valid episodes are available locally,
42
  - the manifest builder confirms complete held-out episode splits,
 
36
  the multi-episode pipeline and gives a real error-analysis baseline; it is still
37
  not a strong final model.
38
 
39
+ A stronger model-quality pilot should be presented only after:
40
 
41
  - selected valid episodes are available locally,
42
  - the manifest builder confirms complete held-out episode splits,
results/omni_finetune/OMNI_MODEL_COMPARISON.md CHANGED
@@ -1,6 +1,6 @@
1
  # Omni Model Comparison
2
 
3
- Generated: `2026-06-21T15:17:00+00:00`
4
 
5
  Compare only rows with the same scope and target. Single-episode raw-feature metrics, 128-episode metadata baselines, Qwen3 structured JSON metrics, and the two Cosmos3 targets answer different questions: Nano future-window retrieval versus Super structured JSON Reasoner evaluation.
6
 
@@ -133,4 +133,4 @@ This is the first verified Cosmos3-Super fine-tuned adapter branch. Its metric i
133
  ## Pending
134
 
135
  - Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.
136
- - Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before claiming v6 is globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly.
 
1
  # Omni Model Comparison
2
 
3
+ Generated: `2026-06-22T10:59:59+00:00`
4
 
5
  Compare only rows with the same scope and target. Single-episode raw-feature metrics, 128-episode metadata baselines, Qwen3 structured JSON metrics, and the two Cosmos3 targets answer different questions: Nano future-window retrieval versus Super structured JSON Reasoner evaluation.
6
 
 
133
  ## Pending
134
 
135
  - Use the verified Qwen3 v6 rank64/lr5e-5 dense multiscale full-eval package as the latest current Qwen row; the v5 release tag remains pinned as the previous verified release.
136
+ - Read results/omni_finetune/QWEN3_V5_V6_COMPARISON_20260614.md before presenting v6 as globally better than v5, because v6 improves action macro-F1 and contact accuracy but regresses subtask, next-action, object micro-F1, and JSON validity slightly.
results/omni_finetune/XPERIENCE10M_128_DATA_PREPARATION_AND_FINETUNE_PLAN.md CHANGED
@@ -1,7 +1,7 @@
1
  # Xperience-10M 128-Episode Data Preparation and Fine-Tune Plan
2
 
3
  This is the executable plan for moving from metadata selection to real
4
- multi-episode training. It does not claim model-quality results until data is
5
  downloaded, staged, audited, trained, and evaluated on held-out sessions.
6
 
7
  ## Current Preflight
@@ -21,7 +21,7 @@ Current execution status:
21
  - a 128-episode data-preparation job has been launched on an HF-reachable host,
22
  - staged-file transfer is active,
23
  - later batches are scheduled after storage checks,
24
- - no multi-episode model-quality training result is claimed yet.
25
 
26
  ## Selected Data
27
 
 
1
  # Xperience-10M 128-Episode Data Preparation and Fine-Tune Plan
2
 
3
  This is the executable plan for moving from metadata selection to real
4
+ multi-episode training. It does not present model-quality results until data is
5
  downloaded, staged, audited, trained, and evaluated on held-out sessions.
6
 
7
  ## Current Preflight
 
21
  - a 128-episode data-preparation job has been launched on an HF-reachable host,
22
  - staged-file transfer is active,
23
  - later batches are scheduled after storage checks,
24
+ - no multi-episode model-quality training result is reported yet.
25
 
26
  ## Selected Data
27
 
results/omni_finetune/verified_public/xperience10m_cosmos3_super_forward_dynamics_lora_128ep_train1epoch_256_attn_full8gpu_20260608_eval_test_full_fsdp/dataset/target_manifest.json CHANGED
@@ -16,7 +16,7 @@
16
  "issues": [],
17
  "limitations": [
18
  "This is an egocentric camera-motion proxy, not a robot gripper or human hand-control action.",
19
- "Use it for Cosmos3 action-packer and one-episode overfit smoke tests before claiming model-quality improvement.",
20
  "Fit any normalization on train episodes only before a full publishable Cosmos adapter run."
21
  ]
22
  }
 
16
  "issues": [],
17
  "limitations": [
18
  "This is an egocentric camera-motion proxy, not a robot gripper or human hand-control action.",
19
+ "Use it for Cosmos3 action-packer and one-episode overfit smoke tests before reporting model-quality improvement.",
20
  "Fit any normalization on train episodes only before a full publishable Cosmos adapter run."
21
  ]
22
  }
results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/analysis/ERROR_ANALYSIS.md CHANGED
@@ -66,7 +66,7 @@ This report is computed from the verified public package predictions. It contain
66
 
67
  ## Interpretation
68
 
69
- The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality.
70
 
71
  Generated files:
72
 
 
66
 
67
  ## Interpretation
68
 
69
+ The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality.
70
 
71
  Generated files:
72
 
results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/analysis/error_analysis_summary.json CHANGED
@@ -523,5 +523,5 @@
523
  "visualization.rrd"
524
  ]
525
  },
526
- "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality."
527
  }
 
523
  "visualization.rrd"
524
  ]
525
  },
526
+ "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality."
527
  }
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/ERROR_ANALYSIS.md CHANGED
@@ -66,7 +66,7 @@ This report is computed from the verified public package predictions. It contain
66
 
67
  ## Interpretation
68
 
69
- The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality.
70
 
71
  Generated files:
72
 
 
66
 
67
  ## Interpretation
68
 
69
+ The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality.
70
 
71
  Generated files:
72
 
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/error_analysis_summary.json CHANGED
@@ -663,5 +663,5 @@
663
  "visualization.rrd"
664
  ]
665
  },
666
- "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality."
667
  }
 
663
  "visualization.rrd"
664
  ]
665
  },
666
+ "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality."
667
  }
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full/analysis/ERROR_ANALYSIS.md CHANGED
@@ -66,7 +66,7 @@ This report is computed from the verified public package predictions. It contain
66
 
67
  ## Interpretation
68
 
69
- The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality.
70
 
71
  Generated files:
72
 
 
66
 
67
  ## Interpretation
68
 
69
+ The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality.
70
 
71
  Generated files:
72
 
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full/analysis/error_analysis_summary.json CHANGED
@@ -457,5 +457,5 @@
457
  "visualization.rrd"
458
  ]
459
  },
460
- "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality."
461
  }
 
457
  "visualization.rrd"
458
  ]
459
  },
460
+ "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality."
461
  }
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full/analysis/ERROR_ANALYSIS.md CHANGED
@@ -66,7 +66,7 @@ This report is computed from public-safe predictions and an episode manifest. It
66
 
67
  ## Interpretation
68
 
69
- The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality.
70
 
71
  Generated files:
72
 
 
66
 
67
  ## Interpretation
68
 
69
+ The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality.
70
 
71
  Generated files:
72
 
results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full/analysis/error_analysis_summary.json CHANGED
@@ -452,5 +452,5 @@
452
  "visualization.rrd"
453
  ]
454
  },
455
- "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before claiming stronger model quality."
456
  }
 
452
  "visualization.rrd"
453
  ]
454
  },
455
+ "interpretation": "The diagnostic pilot is dominated by invalid or weak structured outputs and exact-label failures. These tables identify where to tighten JSON constraints, action/subtask target formatting, object vocabularies, and missing-modality robustness before presenting stronger model quality."
456
  }