Add files using upload-large-folder tool

Browse files

Files changed (15) hide show

data/artifact_index.json +67 -34
data/episode128_task_model_radar.json +11 -11
data/public_surface_qa.json +8 -8
data/quality_gates.json +13 -1
data/single_episode_task_model_radar.json +1 -1
data/task_method_20_gap_audit.json +1 -1
data/task_method_20_result_matrix.json +6 -6
data/task_method_20_source_audit.json +17 -0
data/website_integrity.json +11 -6
scripts/omni/eval_cosmos3_super_future_task_probes.py +17 -8
scripts/omni/merge_cosmos3_super_future_task_probe_shards.py +18 -1
scripts/omni/merge_qwen3_omni_future_task_probe_shards.py +18 -1
scripts/omni/merge_qwen3_omni_retrieval_task_probe_shards.py +18 -1
scripts/omni/run_128_task_baselines.py +1 -3
scripts/omni/train_cosmos3_super_forward_dynamics_lora.py +2 -1

data/artifact_index.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "title": "Ropedia Xperience-10M Task Suite Artifact Index",
-  "generated_at_utc": "2026-06-20T19:54:40+00:00",
   "status": "pass",
-  "artifact_count": 222,
   "missing": [],
   "by_kind": {
     "project_path": 18,
@@ -10,12 +10,12 @@
     "visual_asset_source": 3,
     "scaleup_contract": 7,
     "scaleup_status": 52,
-    "publication_workflow": 6,
     "reproducibility": 4,
     "project_scope": 1,
     "source_alignment": 5,
-    "evaluation_protocol": 8,
-    "website_data": 10,
     "generated_figure": 7,
     "visualization_builder": 1,
     "model_result": 5,
@@ -301,8 +301,8 @@
       "surface": "repo_hf",
       "shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
       "exists": true,
-      "bytes": 74368,
-      "sha256": "6f54bfb963d5102ebd61eb8f8b6d8f6919db673378c9d5940d89ec5ea6f3d4b2"
     },
     {
       "id": "task_suite_enhancement_128",
@@ -610,7 +610,7 @@
       "shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
       "exists": true,
       "bytes": 4432,
-      "sha256": "169e325bc72c8de10a37b192948b69625ad51c3b9560b4249b03bbdc7f135f97"
     },
     {
       "id": "source_alignment_validator",
@@ -730,8 +730,8 @@
       "surface": "website_hf",
       "shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
       "exists": true,
-      "bytes": 228743,
-      "sha256": "89ec87e0c5abe27e2273e59e959565103aca7d360f9cc1fb086f6d76f96c1097"
     },
     {
       "id": "single_episode_task_model_radar_json",
@@ -742,7 +742,7 @@
       "shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
       "exists": true,
       "bytes": 51097,
-      "sha256": "60ec85be6976c92f9719e693b4aa6e0b2a14cd23366433c412d5077e4a79cd79"
     },
     {
       "id": "episode128_task_model_radar_json",
@@ -752,8 +752,8 @@
       "surface": "website_hf",
       "shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
       "exists": true,
-      "bytes": 184889,
-      "sha256": "b40c8f8721020b75c69a298e3650b6f469444e7c21dd1135e822d934960bab98"
     },
     {
       "id": "task_method_20_result_matrix_json",
@@ -763,8 +763,8 @@
       "surface": "website_hf",
       "shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
       "exists": true,
-      "bytes": 128481,
-      "sha256": "9b8806f4e4be69e2a074af2a091e59dd5a2409d87b61b426283e21763c995dce"
     },
     {
       "id": "task_method_20_result_matrix",
@@ -786,7 +786,7 @@
       "shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
       "exists": true,
       "bytes": 8500,
-      "sha256": "2347b06517a9b43e78769ba97cc2da4bafaf1bffd6ebcffa8a07508093ca2059"
     },
     {
       "id": "task_method_20_gap_audit",
@@ -797,7 +797,29 @@
       "shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
       "exists": true,
       "bytes": 3417,
-      "sha256": "1f3db1cacb53a26be98aeb71ac4ab9e9319ed7ad823b882d9c99b20ec9615626"
     },
     {
       "id": "unified_task_model_radar_chart",
@@ -840,8 +862,8 @@
       "surface": "repo_hf",
       "shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
       "exists": true,
-      "bytes": 67953,
-      "sha256": "3533f33e52af60edf49b1c1fa586d2b8c158e3925875ca46c38714018e6035a0"
     },
     {
       "id": "task_method_20_gap_audit_builder",
@@ -854,6 +876,17 @@
       "bytes": 10295,
       "sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
     },
     {
       "id": "all_task_model_scoring_waiter",
       "title": "All-task model scoring guarded waiter",
@@ -873,8 +906,8 @@
       "surface": "repo_hf",
       "shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
       "exists": true,
-      "bytes": 4770,
-      "sha256": "ad701c03153c6755f284281640465efde3313f48bc0189942f5637bb19328bfb"
     },
     {
       "id": "model_output_probe_script",
@@ -884,8 +917,8 @@
       "surface": "repo_hf",
       "shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
       "exists": true,
-      "bytes": 9133,
-      "sha256": "3a867d0333fe591999715158e311011db25da018ca39c9b4638930841f35efb8"
     },
     {
       "id": "existing_model_output_task_probe",
@@ -1104,8 +1137,8 @@
       "surface": "repo_hf",
       "shows": "Lists the automated and post-publish checks used to keep the release current.",
       "exists": true,
-      "bytes": 4880,
-      "sha256": "526a38edffdb8e96eb7be3fc4ae4c8fab5a43ac4ed6e57137e9e0857c75b0a27"
     },
     {
       "id": "quality_gate_manifest",
@@ -1115,8 +1148,8 @@
       "surface": "website_hf",
       "shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
       "exists": true,
-      "bytes": 8100,
-      "sha256": "48e27e70a590e881f25c6ee01dff8c0218b6b83bb2b5f8b17b68c8d38d1bf6a6"
     },
     {
       "id": "public_surface_qa",
@@ -1252,8 +1285,8 @@
       "surface": "repo",
       "shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
       "exists": true,
-      "bytes": 66834,
-      "sha256": "4cad7de030e0192ac2cf676738c30e070bf25414e482e594fbfe724aa3a7853a"
     },
     {
       "id": "reproducibility_contract",
@@ -1285,8 +1318,8 @@
       "surface": "repo_hf",
       "shows": "Generates the selective artifact catalog from local files.",
       "exists": true,
-      "bytes": 66058,
-      "sha256": "cc9c83c7094ef36b73125f902c6d8776203f8abc910cedb61610dadc1bb823a5"
     },
     {
       "id": "publication_audit",
@@ -1297,7 +1330,7 @@
       "volatile": true,
       "shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
       "exists": true,
-      "bytes": 10502,
       "hash_policy": "existence_and_size_only"
     },
     {
@@ -1321,7 +1354,7 @@
       "volatile": true,
       "shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
       "exists": true,
-      "bytes": 1392513,
       "hash_policy": "existence_and_size_only"
     },
     {
@@ -1333,7 +1366,7 @@
       "volatile": true,
       "shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
       "exists": true,
-      "bytes": 20022,
       "hash_policy": "existence_and_size_only"
     },
     {

 {
   "title": "Ropedia Xperience-10M Task Suite Artifact Index",
+  "generated_at_utc": "2026-06-20T20:48:09+00:00",
   "status": "pass",
+  "artifact_count": 225,
   "missing": [],
   "by_kind": {
     "project_path": 18,
     "visual_asset_source": 3,
     "scaleup_contract": 7,
     "scaleup_status": 52,
+    "publication_workflow": 7,
     "reproducibility": 4,
     "project_scope": 1,
     "source_alignment": 5,
+    "evaluation_protocol": 9,
+    "website_data": 11,
     "generated_figure": 7,
     "visualization_builder": 1,
     "model_result": 5,
       "surface": "repo_hf",
       "shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
       "exists": true,
+      "bytes": 74316,
+      "sha256": "164c908bee1d4a6e0db344692833787582e45317b240ef5afbfbdb609a5175e6"
     },
     {
       "id": "task_suite_enhancement_128",
       "shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
       "exists": true,
       "bytes": 4432,
+      "sha256": "c916b18a11917e46e8561520cf2307f190c671c82e710ebd0f3522ec8a4be2bd"
     },
     {
       "id": "source_alignment_validator",
       "surface": "website_hf",
       "shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
       "exists": true,
+      "bytes": 228799,
+      "sha256": "c9c708f64963dac10e764eaae8e1b14c7161a938afa5ef5723fe59dc4ce764af"
     },
     {
       "id": "single_episode_task_model_radar_json",
       "shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
       "exists": true,
       "bytes": 51097,
+      "sha256": "d5e882120633f4d3ae90f1491682701c7593a42fc09e39b83fc5f375258e76e7"
     },
     {
       "id": "episode128_task_model_radar_json",
       "surface": "website_hf",
       "shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
       "exists": true,
+      "bytes": 184945,
+      "sha256": "8d4ef9c4cf1cf334fd41417d40fa0687ceefa964da9f8338c82f8cc6d36a3e76"
     },
     {
       "id": "task_method_20_result_matrix_json",
       "surface": "website_hf",
       "shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
       "exists": true,
+      "bytes": 128509,
+      "sha256": "382e538dff284c5e2cf19fe2b3eb014d1b48fb33082bb2ece532ce3de6c1e9bb"
     },
     {
       "id": "task_method_20_result_matrix",
       "shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
       "exists": true,
       "bytes": 8500,
+      "sha256": "9cfd2ce8c4eb3bbe7e2af3f41df3b3ab74db9db08d9ea2e4f569f612358470dd"
     },
     {
       "id": "task_method_20_gap_audit",
       "shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
       "exists": true,
       "bytes": 3417,
+      "sha256": "3afc5db9803b6419ce4f40d6fb0dd5380ae182fb85b4f7b0f6ea6a46ae065c63"
+    },
+    {
+      "id": "task_method_20_source_audit_json",
+      "title": "Task-method 20-result source audit JSON",
+      "path": "docs/data/task_method_20_source_audit.json",
+      "kind": "website_data",
+      "surface": "website_hf",
+      "shows": "Machine-readable check that scored JSON-backed matrix cells match their declared metric source values.",
+      "exists": true,
+      "bytes": 561,
+      "sha256": "c795c8f387648a90e66146efc44a4be2f272d4a44097f0b9b39a7347df83daa0"
+    },
+    {
+      "id": "task_method_20_source_audit",
+      "title": "Task-method 20-result source audit",
+      "path": "TASK_METHOD_20_SOURCE_AUDIT.md",
+      "kind": "evaluation_protocol",
+      "surface": "repo_hf",
+      "shows": "Reader-facing source-value audit for the 180-result matrix.",
+      "exists": true,
+      "bytes": 447,
+      "sha256": "2b8bc99b7157894d59fa2f23ebaee33ce9e6e01c0b7316c7555ab0071c85eb41"
     },
     {
       "id": "unified_task_model_radar_chart",
       "surface": "repo_hf",
       "shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
       "exists": true,
+      "bytes": 68542,
+      "sha256": "470b4c8acc437114b51d96987cd6324b9bf1d2ca16e9721d7fb00708aa58b383"
     },
     {
       "id": "task_method_20_gap_audit_builder",
       "bytes": 10295,
       "sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
     },
+    {
+      "id": "task_method_20_source_audit_validator",
+      "title": "Task-method source-audit validator",
+      "path": "scripts/validate_task_method_matrix_sources.py",
+      "kind": "publication_workflow",
+      "surface": "repo_hf",
+      "shows": "Fails release checks if a scored matrix row disagrees with its JSON metric source.",
+      "exists": true,
+      "bytes": 7877,
+      "sha256": "97edc3f064f77d544eff539bb7f16f8162e58ec581a63b91c473bada080f86ae"
+    },
     {
       "id": "all_task_model_scoring_waiter",
       "title": "All-task model scoring guarded waiter",
       "surface": "repo_hf",
       "shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
       "exists": true,
+      "bytes": 4320,
+      "sha256": "11cff26749bf6ad8b8ee028b18e0b4be5713ed8b5325578caa03be25d894263b"
     },
     {
       "id": "model_output_probe_script",
       "surface": "repo_hf",
       "shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
       "exists": true,
+      "bytes": 10520,
+      "sha256": "741ee733068e87c52c8da2bd15987e2b4538b5e705592182d76c42b5cf34fe96"
     },
     {
       "id": "existing_model_output_task_probe",
       "surface": "repo_hf",
       "shows": "Lists the automated and post-publish checks used to keep the release current.",
       "exists": true,
+      "bytes": 5184,
+      "sha256": "4931d4457c4c5b0978fdf31861b6e3e2da6e24368398cf1756120a32cbff98f0"
     },
     {
       "id": "quality_gate_manifest",
       "surface": "website_hf",
       "shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
       "exists": true,
+      "bytes": 8640,
+      "sha256": "445196830bb913bfa075ae4174e7b1f5b64f623cf13a2afde7513add9dbefc21"
     },
     {
       "id": "public_surface_qa",
       "surface": "repo",
       "shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
       "exists": true,
+      "bytes": 67647,
+      "sha256": "d2b4af98e6fd8b23fd86cd068f2bbf887e5d69686dd62fe3bfc7e8251a6d75d6"
     },
     {
       "id": "reproducibility_contract",
       "surface": "repo_hf",
       "shows": "Generates the selective artifact catalog from local files.",
       "exists": true,
+      "bytes": 67105,
+      "sha256": "8fc1a2b5d4a50d49ff5738ec1e5e91088dbfa514c9f0485d3afe708add6d94a1"
     },
     {
       "id": "publication_audit",
       "volatile": true,
       "shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
       "exists": true,
+      "bytes": 10662,
       "hash_policy": "existence_and_size_only"
     },
     {
       "volatile": true,
       "shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
       "exists": true,
+      "bytes": 1395239,
       "hash_policy": "existence_and_size_only"
     },
     {
       "volatile": true,
       "shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
       "exists": true,
+      "bytes": 20141,
       "hash_policy": "existence_and_size_only"
     },
     {

data/episode128_task_model_radar.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "128-Episode 20-Task Radar",
   "status": "pass",
-  "generated_at_utc": "2026-06-20T19:54:37+00:00",
   "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
   "task_count": 20,
   "method_count": 7,
@@ -1166,7 +1166,7 @@
         "cosmos3_super_reasoner": {
           "raw": 0.6286317274823326,
           "metric_key": "temporal_order_f1",
-          "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
@@ -1257,7 +1257,7 @@
         "cosmos3_super_reasoner": {
           "raw": 0.37271645981034185,
           "metric_key": "misalignment_detection_f1",
-          "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
@@ -1439,7 +1439,7 @@
         "cosmos3_super_reasoner": {
           "raw": 0.0,
           "metric_key": "next_subtask_forecast_macro_f1",
-          "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
@@ -1519,7 +1519,7 @@
         "qwen3_omni_v6_lora": {
           "raw": 0.4318674027510605,
           "metric_key": "macro_f1",
-          "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
@@ -1712,7 +1712,7 @@
         "cosmos3_super_reasoner": {
           "raw": 0.0009279881217520415,
           "metric_key": "object_set_forecast_micro_f1",
-          "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
@@ -3372,7 +3372,7 @@
       "raw_text": "0.6286",
       "normalized_score": 0.6286317274823326,
       "metric_key": "temporal_order_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -3498,7 +3498,7 @@
       "raw_text": "0.3727",
       "normalized_score": 0.37271645981034185,
       "metric_key": "misalignment_detection_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -3750,7 +3750,7 @@
       "raw_text": "0.0000",
       "normalized_score": 0.0,
       "metric_key": "next_subtask_forecast_macro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -3858,7 +3858,7 @@
       "raw_text": "0.4319",
       "normalized_score": 0.4318674027510605,
       "metric_key": "macro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -4128,7 +4128,7 @@
       "raw_text": "0.0009",
       "normalized_score": 0.0009279881217520415,
       "metric_key": "object_set_forecast_micro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },

 {
   "title": "128-Episode 20-Task Radar",
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:38:21+00:00",
   "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
   "task_count": 20,
   "method_count": 7,
         "cosmos3_super_reasoner": {
           "raw": 0.6286317274823326,
           "metric_key": "temporal_order_f1",
+          "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
         "cosmos3_super_reasoner": {
           "raw": 0.37271645981034185,
           "metric_key": "misalignment_detection_f1",
+          "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
         "cosmos3_super_reasoner": {
           "raw": 0.0,
           "metric_key": "next_subtask_forecast_macro_f1",
+          "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
         "qwen3_omni_v6_lora": {
           "raw": 0.4318674027510605,
           "metric_key": "macro_f1",
+          "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
         "cosmos3_super_reasoner": {
           "raw": 0.0009279881217520415,
           "metric_key": "object_set_forecast_micro_f1",
+          "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
           "scope": "multi_episode_128_partial_model_overlay",
           "status": "scored",
           "reason": null,
       "raw_text": "0.6286",
       "normalized_score": 0.6286317274823326,
       "metric_key": "temporal_order_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.3727",
       "normalized_score": 0.37271645981034185,
       "metric_key": "misalignment_detection_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.0000",
       "normalized_score": 0.0,
       "metric_key": "next_subtask_forecast_macro_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.4319",
       "normalized_score": 0.4318674027510605,
       "metric_key": "macro_f1",
+      "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.0009",
       "normalized_score": 0.0009279881217520415,
       "metric_key": "object_set_forecast_micro_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },

data/public_surface_qa.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "Ropedia Xperience-10M Public Project Surface",
   "status": "pass",
-  "generated_at_utc": "2026-06-20T19:55:18+00:00",
   "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
   "checks": [
     {
@@ -18,7 +18,7 @@
         "website_integrity": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T19:39:18+00:00"
         },
         "rendered_site_check": {
           "exists": true,
@@ -28,27 +28,27 @@
         "task_surface_integrity": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T18:44:50+00:00"
         },
         "source_alignment": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T18:44:49+00:00"
         },
         "scale_up_status": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T18:45:07+00:00"
         },
         "publication_package": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T19:39:40+00:00"
         },
         "mirror_parity": {
           "exists": true,
           "status": "pass",
-          "generated_at_utc": "2026-06-20T19:40:53+00:00"
         }
       },
       "failures": {}
@@ -111,7 +111,7 @@
         "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
         "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
         "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
-        "https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results": 9,
         "https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
         "https://ropedia.com/dataset": 5
       }

 {
   "title": "Ropedia Xperience-10M Public Project Surface",
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:48:08+00:00",
   "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
   "checks": [
     {
         "website_integrity": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T20:41:45+00:00"
         },
         "rendered_site_check": {
           "exists": true,
         "task_surface_integrity": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T19:55:17+00:00"
         },
         "source_alignment": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T19:55:18+00:00"
         },
         "scale_up_status": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T19:55:26+00:00"
         },
         "publication_package": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T20:42:41+00:00"
         },
         "mirror_parity": {
           "exists": true,
           "status": "pass",
+          "generated_at_utc": "2026-06-20T20:47:51+00:00"
         }
       },
       "failures": {}
         "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
         "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
         "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
+        "https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results": 6,
         "https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
         "https://ropedia.com/dataset": 5
       }

data/quality_gates.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "Ropedia Xperience-10M Release Checks",
   "status": "pass",
-  "generated_at_utc": "2026-06-20T18:44:28+00:00",
   "rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
   "automated_gates": [
     {
@@ -76,6 +76,18 @@
         "status": "pass"
       }
     },
     {
       "id": "figure_index",
       "title": "Figure index",

 {
   "title": "Ropedia Xperience-10M Release Checks",
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:48:18+00:00",
   "rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
   "automated_gates": [
     {
         "status": "pass"
       }
     },
+    {
+      "id": "task_method_source_audit",
+      "title": "Task-method source audit",
+      "command": "python scripts/validate_task_method_matrix_sources.py",
+      "report": "docs/data/task_method_20_source_audit.json",
+      "blocks_if": "A scored 20-task matrix cell points to a JSON metric source that does not contain the same metric value.",
+      "shows": "Public 20-task scores remain traceable to their task-specific metric artifacts.",
+      "current_report": {
+        "exists": true,
+        "status": "pass"
+      }
+    },
     {
       "id": "figure_index",
       "title": "Figure index",

data/single_episode_task_model_radar.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "Single-Episode 20-Task Radar",
   "status": "pass",
-  "generated_at_utc": "2026-06-20T19:54:37+00:00",
   "description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
   "task_count": 20,
   "method_count": 2,

 {
   "title": "Single-Episode 20-Task Radar",
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:38:21+00:00",
   "description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
   "task_count": 20,
   "method_count": 2,

data/task_method_20_gap_audit.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "generated_at_utc": "2026-06-20T19:54:37+00:00",
   "immediate_actions": [
     {
       "artifact": "docs/data/task_method_20_gap_audit.json",

 {
+  "generated_at_utc": "2026-06-20T20:38:59+00:00",
   "immediate_actions": [
     {
       "artifact": "docs/data/task_method_20_gap_audit.json",

data/task_method_20_result_matrix.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "Task Method 20-Result Matrix",
   "status": "pass",
-  "generated_at_utc": "2026-06-20T19:54:37+00:00",
   "task_count": 20,
   "method_count": 9,
   "method_task_record_count": 180,
@@ -1980,7 +1980,7 @@
       "raw_text": "0.6286",
       "normalized_score": 0.6286317274823326,
       "metric_key": "temporal_order_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -2142,7 +2142,7 @@
       "raw_text": "0.3727",
       "normalized_score": 0.37271645981034185,
       "metric_key": "misalignment_detection_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -2466,7 +2466,7 @@
       "raw_text": "0.0000",
       "normalized_score": 0.0,
       "metric_key": "next_subtask_forecast_macro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -2610,7 +2610,7 @@
       "raw_text": "0.4319",
       "normalized_score": 0.4318674027510605,
       "metric_key": "macro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
@@ -2952,7 +2952,7 @@
       "raw_text": "0.0009",
       "normalized_score": 0.0009279881217520415,
       "metric_key": "object_set_forecast_micro_f1",
-      "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },

 {
   "title": "Task Method 20-Result Matrix",
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:38:21+00:00",
   "task_count": 20,
   "method_count": 9,
   "method_task_record_count": 180,
       "raw_text": "0.6286",
       "normalized_score": 0.6286317274823326,
       "metric_key": "temporal_order_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.3727",
       "normalized_score": 0.37271645981034185,
       "metric_key": "misalignment_detection_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.0000",
       "normalized_score": 0.0,
       "metric_key": "next_subtask_forecast_macro_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.4319",
       "normalized_score": 0.4318674027510605,
       "metric_key": "macro_f1",
+      "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },
       "raw_text": "0.0009",
       "normalized_score": 0.0009279881217520415,
       "metric_key": "object_set_forecast_micro_f1",
+      "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
       "scope": "multi_episode_128_partial_model_overlay",
       "reason": null
     },

data/task_method_20_source_audit.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "checked_json_metric_count": 180,
+  "failure_count": 0,
+  "failures": [],
+  "generated_at_utc": "2026-06-20T20:48:41+00:00",
+  "method_task_record_count": 180,
+  "rule": "Every scored row that declares a JSON metric source must have the same numeric value under that row's metric_key.",
+  "scored_method_task_count": 180,
+  "skipped_record_count": 0,
+  "skipped_records": [],
+  "source_matrix": "docs/data/task_method_20_result_matrix.json",
+  "status": "pass",
+  "status_counts": {
+    "checked": 180
+  },
+  "title": "Task Method 20 Matrix Source Audit"
+}

data/website_integrity.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
   "status": "pass",
-  "generated_at_utc": "2026-06-20T19:55:48+00:00",
   "docs_root": "docs",
   "site_base": "/ropedia-xperience-10m-task-suite/",
   "summary": {
     "html_pages": 4,
     "local_references": 213,
     "external_reference_count": 152,
-    "json_files": 50,
     "image_assets_referenced": 28,
     "failure_count": 0
   },
@@ -301,7 +301,7 @@
     },
     {
       "path": "data/artifact_index.json",
-      "bytes": 121435,
       "top_level_type": "dict"
     },
     {
@@ -316,7 +316,7 @@
     },
     {
       "path": "data/episode128_task_model_radar.json",
-      "bytes": 184889,
       "top_level_type": "dict"
     },
     {
@@ -491,7 +491,12 @@
     },
     {
       "path": "data/task_method_20_result_matrix.json",
-      "bytes": 128481,
       "top_level_type": "dict"
     },
     {
@@ -526,7 +531,7 @@
     },
     {
       "path": "data/unified_task_model_radar.json",
-      "bytes": 228743,
       "top_level_type": "dict"
     },
     {

 {
   "status": "pass",
+  "generated_at_utc": "2026-06-20T20:41:45+00:00",
   "docs_root": "docs",
   "site_base": "/ropedia-xperience-10m-task-suite/",
   "summary": {
     "html_pages": 4,
     "local_references": 213,
     "external_reference_count": 152,
+    "json_files": 51,
     "image_assets_referenced": 28,
     "failure_count": 0
   },
     },
     {
       "path": "data/artifact_index.json",
+      "bytes": 122823,
       "top_level_type": "dict"
     },
     {
     },
     {
       "path": "data/episode128_task_model_radar.json",
+      "bytes": 184945,
       "top_level_type": "dict"
     },
     {
     },
     {
       "path": "data/task_method_20_result_matrix.json",
+      "bytes": 128509,
+      "top_level_type": "dict"
+    },
+    {
+      "path": "data/task_method_20_source_audit.json",
+      "bytes": 561,
       "top_level_type": "dict"
     },
     {
     },
     {
       "path": "data/unified_task_model_radar.json",
+      "bytes": 228799,
       "top_level_type": "dict"
     },
     {

scripts/omni/eval_cosmos3_super_future_task_probes.py CHANGED Viewed

@@ -31,6 +31,7 @@ from eval_qwen3_omni_future_task_probes import (
     score_task as qwen_score_task,
     select_eval_indices,
     select_tasks,
     task_target_value,
     time_to_transition_map,
     write_json,
@@ -212,9 +213,14 @@ def main() -> int:
     samples = load_jsonl(args.dataset_jsonl)
     future_map = future_index_map(samples, args.future_frames)
     transition_targets = time_to_transition_map(samples)
-    eval_indices = [idx for idx in select_eval_indices(samples, args) if idx in future_map]
-    if not eval_indices:
-        raise ValueError("No evaluation samples with future targets selected.")
     write_json(args.output_dir / "server_info.json", server_info(args))
     append_jsonl(
@@ -224,7 +230,9 @@ def main() -> int:
             "timestamp": time.time(),
             "run_id": args.run_id,
             "tasks": selected_tasks,
-            "num_eval_samples_with_future": len(eval_indices),
             "sample_offset": args.sample_offset,
             "sample_stride": args.sample_stride,
             "future_frames": args.future_frames,
@@ -246,9 +254,10 @@ def main() -> int:
     for task_id in selected_tasks:
         spec = TASK_SPECS[task_id]
         partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
-        for local_pos, sample_idx in enumerate(eval_indices, start=1):
             sample = samples[sample_idx]
-            future_sample = samples[future_map[sample_idx]]
             pred_id = prediction_id(task_id, sample)
             if args.resume and pred_id in partial_by_task[task_id]:
                 continue
@@ -301,7 +310,7 @@ def main() -> int:
                     "timestamp": time.time(),
                     "task_id": task_id,
                     "sample_index": local_pos,
-                    "num_eval_samples": len(eval_indices),
                     "completed_samples_for_task": len(partial_by_task[task_id]),
                     "sample_id": sample.get("id"),
                     "seconds": round(time.time() - started, 3),
@@ -310,7 +319,7 @@ def main() -> int:
     task_metrics = {}
     for task_id in selected_tasks:
-        rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in eval_indices]
         task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
     display_name = model_display_name(args)

     score_task as qwen_score_task,
     select_eval_indices,
     select_tasks,
+    task_requires_future_sample,
     task_target_value,
     time_to_transition_map,
     write_json,
     samples = load_jsonl(args.dataset_jsonl)
     future_map = future_index_map(samples, args.future_frames)
     transition_targets = time_to_transition_map(samples)
+    base_eval_indices = select_eval_indices(samples, args)
+    eval_indices_by_task = {
+        task_id: [idx for idx in base_eval_indices if (not task_requires_future_sample(task_id) or idx in future_map)]
+        for task_id in selected_tasks
+    }
+    empty_tasks = [task_id for task_id, indices in eval_indices_by_task.items() if not indices]
+    if empty_tasks:
+        raise ValueError(f"No evaluation samples selected for tasks: {', '.join(empty_tasks)}")
     write_json(args.output_dir / "server_info.json", server_info(args))
     append_jsonl(
             "timestamp": time.time(),
             "run_id": args.run_id,
             "tasks": selected_tasks,
+            "num_base_eval_samples": len(base_eval_indices),
+            "num_eval_samples_by_task": {task_id: len(indices) for task_id, indices in eval_indices_by_task.items()},
+            "num_eval_samples_with_future": sum(1 for idx in base_eval_indices if idx in future_map),
             "sample_offset": args.sample_offset,
             "sample_stride": args.sample_stride,
             "future_frames": args.future_frames,
     for task_id in selected_tasks:
         spec = TASK_SPECS[task_id]
         partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
+        task_eval_indices = eval_indices_by_task[task_id]
+        for local_pos, sample_idx in enumerate(task_eval_indices, start=1):
             sample = samples[sample_idx]
+            future_sample = samples[future_map[sample_idx]] if task_requires_future_sample(task_id) else sample
             pred_id = prediction_id(task_id, sample)
             if args.resume and pred_id in partial_by_task[task_id]:
                 continue
                     "timestamp": time.time(),
                     "task_id": task_id,
                     "sample_index": local_pos,
+                    "num_eval_samples": len(task_eval_indices),
                     "completed_samples_for_task": len(partial_by_task[task_id]),
                     "sample_id": sample.get("id"),
                     "seconds": round(time.time() - started, 3),
     task_metrics = {}
     for task_id in selected_tasks:
+        rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in eval_indices_by_task[task_id]]
         task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
     display_name = model_display_name(args)

scripts/omni/merge_cosmos3_super_future_task_probe_shards.py CHANGED Viewed

@@ -56,13 +56,27 @@ def main() -> int:
     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
-                rows_by_id.setdefault(key, row)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
@@ -90,6 +104,9 @@ def main() -> int:
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
+    duplicate_predictions: list[dict[str, Any]] = []
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
+        row_sources: dict[str, str] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
+                if key in rows_by_id:
+                    duplicate_predictions.append(
+                        {
+                            "task_id": task_id,
+                            "prediction_id": key,
+                            "kept_shard": row_sources.get(key),
+                            "duplicate_shard": str(shard_dir),
+                            "conflict": rows_by_id[key] != row,
+                        }
+                    )
+                    continue
+                rows_by_id[key] = row
+                row_sources[key] = str(shard_dir)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
+        "duplicate_prediction_count": len(duplicate_predictions),
+        "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
+        "duplicate_predictions": duplicate_predictions[:50],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

scripts/omni/merge_qwen3_omni_future_task_probe_shards.py CHANGED Viewed

@@ -54,13 +54,27 @@ def main() -> int:
     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
-                rows_by_id.setdefault(key, row)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
@@ -82,6 +96,9 @@ def main() -> int:
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
+    duplicate_predictions: list[dict[str, Any]] = []
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
+        row_sources: dict[str, str] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
+                if key in rows_by_id:
+                    duplicate_predictions.append(
+                        {
+                            "task_id": task_id,
+                            "prediction_id": key,
+                            "kept_shard": row_sources.get(key),
+                            "duplicate_shard": str(shard_dir),
+                            "conflict": rows_by_id[key] != row,
+                        }
+                    )
+                    continue
+                rows_by_id[key] = row
+                row_sources[key] = str(shard_dir)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
+        "duplicate_prediction_count": len(duplicate_predictions),
+        "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
+        "duplicate_predictions": duplicate_predictions[:50],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

scripts/omni/merge_qwen3_omni_retrieval_task_probe_shards.py CHANGED Viewed

@@ -55,13 +55,27 @@ def main() -> int:
     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
-                rows_by_id.setdefault(key, row)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
@@ -86,6 +100,9 @@ def main() -> int:
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

     args.output_dir.mkdir(parents=True, exist_ok=True)
     task_metrics: dict[str, dict[str, Any]] = {}
     first_metrics: dict[str, Any] | None = None
+    duplicate_predictions: list[dict[str, Any]] = []
     for task_id, spec in TASK_SPECS.items():
         rows_by_id: dict[str, dict[str, Any]] = {}
+        row_sources: dict[str, str] = {}
         for shard_dir in args.shard_dir:
             for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
                 key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
+                if key in rows_by_id:
+                    duplicate_predictions.append(
+                        {
+                            "task_id": task_id,
+                            "prediction_id": key,
+                            "kept_shard": row_sources.get(key),
+                            "duplicate_shard": str(shard_dir),
+                            "conflict": rows_by_id[key] != row,
+                        }
+                    )
+                    continue
+                rows_by_id[key] = row
+                row_sources[key] = str(shard_dir)
             shard_metrics = read_json(shard_dir / task_id / "metrics.json")
             if shard_metrics and first_metrics is None:
                 first_metrics = shard_metrics
         "status": "pass",
         "run_id": args.run_id,
         "shard_dirs": [str(path) for path in args.shard_dir],
+        "duplicate_prediction_count": len(duplicate_predictions),
+        "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
+        "duplicate_predictions": duplicate_predictions[:50],
         "tasks": {
             task_id: {
                 "task_number": metrics["task_number"],

scripts/omni/run_128_task_baselines.py CHANGED Viewed

@@ -133,7 +133,7 @@ def parse_args() -> argparse.Namespace:
         default=256,
         help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
     )
-    parser.add_argument("--include-neural", action="store_true", default=True)
     parser.add_argument("--neural-epochs", type=int, default=35)
     parser.add_argument("--neural-hidden-dim", type=int, default=128)
     parser.add_argument("--neural-batch-size", type=int, default=256)
@@ -335,8 +335,6 @@ def row_text_features(row: dict[str, Any], episode: dict[str, Any] | None) -> st
         parts.extend([
             "main_task:",
             norm(episode.get("main_task")),
-            "episode_split:",
-            norm(episode.get("split")),
         ])
     media = row.get("media") or {}
     parts.extend([

         default=256,
         help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
     )
+    parser.add_argument("--include-neural", action=argparse.BooleanOptionalAction, default=True)
     parser.add_argument("--neural-epochs", type=int, default=35)
     parser.add_argument("--neural-hidden-dim", type=int, default=128)
     parser.add_argument("--neural-batch-size", type=int, default=256)
         parts.extend([
             "main_task:",
             norm(episode.get("main_task")),
         ])
     media = row.get("media") or {}
     parts.extend([

scripts/omni/train_cosmos3_super_forward_dynamics_lora.py CHANGED Viewed

@@ -884,7 +884,8 @@ def main() -> int:
         if accelerator.is_main_process:
             write_json(output_dir / "training_metadata.json", payload)
             write_report(output_dir, payload)
-            append_jsonl(progress_path, {"event": "complete", "timestamp": time.time(), "status": status})
     if accelerator.is_main_process:
         print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))

         if accelerator.is_main_process:
             write_json(output_dir / "training_metadata.json", payload)
             write_report(output_dir, payload)
+            final_event = "complete" if status in {"complete", "dry_run_complete"} else "finalized_failed"
+            append_jsonl(progress_path, {"event": final_event, "timestamp": time.time(), "status": status})
     if accelerator.is_main_process:
         print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))