cy0307 commited on
Commit
d272538
·
verified ·
1 Parent(s): 5331178

Add files using upload-large-folder tool

Browse files
data/artifact_index.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "title": "Ropedia Xperience-10M Task Suite Artifact Index",
3
- "generated_at_utc": "2026-06-20T19:54:40+00:00",
4
  "status": "pass",
5
- "artifact_count": 222,
6
  "missing": [],
7
  "by_kind": {
8
  "project_path": 18,
@@ -10,12 +10,12 @@
10
  "visual_asset_source": 3,
11
  "scaleup_contract": 7,
12
  "scaleup_status": 52,
13
- "publication_workflow": 6,
14
  "reproducibility": 4,
15
  "project_scope": 1,
16
  "source_alignment": 5,
17
- "evaluation_protocol": 8,
18
- "website_data": 10,
19
  "generated_figure": 7,
20
  "visualization_builder": 1,
21
  "model_result": 5,
@@ -301,8 +301,8 @@
301
  "surface": "repo_hf",
302
  "shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
303
  "exists": true,
304
- "bytes": 74368,
305
- "sha256": "6f54bfb963d5102ebd61eb8f8b6d8f6919db673378c9d5940d89ec5ea6f3d4b2"
306
  },
307
  {
308
  "id": "task_suite_enhancement_128",
@@ -610,7 +610,7 @@
610
  "shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
611
  "exists": true,
612
  "bytes": 4432,
613
- "sha256": "169e325bc72c8de10a37b192948b69625ad51c3b9560b4249b03bbdc7f135f97"
614
  },
615
  {
616
  "id": "source_alignment_validator",
@@ -730,8 +730,8 @@
730
  "surface": "website_hf",
731
  "shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
732
  "exists": true,
733
- "bytes": 228743,
734
- "sha256": "89ec87e0c5abe27e2273e59e959565103aca7d360f9cc1fb086f6d76f96c1097"
735
  },
736
  {
737
  "id": "single_episode_task_model_radar_json",
@@ -742,7 +742,7 @@
742
  "shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
743
  "exists": true,
744
  "bytes": 51097,
745
- "sha256": "60ec85be6976c92f9719e693b4aa6e0b2a14cd23366433c412d5077e4a79cd79"
746
  },
747
  {
748
  "id": "episode128_task_model_radar_json",
@@ -752,8 +752,8 @@
752
  "surface": "website_hf",
753
  "shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
754
  "exists": true,
755
- "bytes": 184889,
756
- "sha256": "b40c8f8721020b75c69a298e3650b6f469444e7c21dd1135e822d934960bab98"
757
  },
758
  {
759
  "id": "task_method_20_result_matrix_json",
@@ -763,8 +763,8 @@
763
  "surface": "website_hf",
764
  "shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
765
  "exists": true,
766
- "bytes": 128481,
767
- "sha256": "9b8806f4e4be69e2a074af2a091e59dd5a2409d87b61b426283e21763c995dce"
768
  },
769
  {
770
  "id": "task_method_20_result_matrix",
@@ -786,7 +786,7 @@
786
  "shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
787
  "exists": true,
788
  "bytes": 8500,
789
- "sha256": "2347b06517a9b43e78769ba97cc2da4bafaf1bffd6ebcffa8a07508093ca2059"
790
  },
791
  {
792
  "id": "task_method_20_gap_audit",
@@ -797,7 +797,29 @@
797
  "shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
798
  "exists": true,
799
  "bytes": 3417,
800
- "sha256": "1f3db1cacb53a26be98aeb71ac4ab9e9319ed7ad823b882d9c99b20ec9615626"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  },
802
  {
803
  "id": "unified_task_model_radar_chart",
@@ -840,8 +862,8 @@
840
  "surface": "repo_hf",
841
  "shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
842
  "exists": true,
843
- "bytes": 67953,
844
- "sha256": "3533f33e52af60edf49b1c1fa586d2b8c158e3925875ca46c38714018e6035a0"
845
  },
846
  {
847
  "id": "task_method_20_gap_audit_builder",
@@ -854,6 +876,17 @@
854
  "bytes": 10295,
855
  "sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
856
  },
 
 
 
 
 
 
 
 
 
 
 
857
  {
858
  "id": "all_task_model_scoring_waiter",
859
  "title": "All-task model scoring guarded waiter",
@@ -873,8 +906,8 @@
873
  "surface": "repo_hf",
874
  "shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
875
  "exists": true,
876
- "bytes": 4770,
877
- "sha256": "ad701c03153c6755f284281640465efde3313f48bc0189942f5637bb19328bfb"
878
  },
879
  {
880
  "id": "model_output_probe_script",
@@ -884,8 +917,8 @@
884
  "surface": "repo_hf",
885
  "shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
886
  "exists": true,
887
- "bytes": 9133,
888
- "sha256": "3a867d0333fe591999715158e311011db25da018ca39c9b4638930841f35efb8"
889
  },
890
  {
891
  "id": "existing_model_output_task_probe",
@@ -1104,8 +1137,8 @@
1104
  "surface": "repo_hf",
1105
  "shows": "Lists the automated and post-publish checks used to keep the release current.",
1106
  "exists": true,
1107
- "bytes": 4880,
1108
- "sha256": "526a38edffdb8e96eb7be3fc4ae4c8fab5a43ac4ed6e57137e9e0857c75b0a27"
1109
  },
1110
  {
1111
  "id": "quality_gate_manifest",
@@ -1115,8 +1148,8 @@
1115
  "surface": "website_hf",
1116
  "shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
1117
  "exists": true,
1118
- "bytes": 8100,
1119
- "sha256": "48e27e70a590e881f25c6ee01dff8c0218b6b83bb2b5f8b17b68c8d38d1bf6a6"
1120
  },
1121
  {
1122
  "id": "public_surface_qa",
@@ -1252,8 +1285,8 @@
1252
  "surface": "repo",
1253
  "shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
1254
  "exists": true,
1255
- "bytes": 66834,
1256
- "sha256": "4cad7de030e0192ac2cf676738c30e070bf25414e482e594fbfe724aa3a7853a"
1257
  },
1258
  {
1259
  "id": "reproducibility_contract",
@@ -1285,8 +1318,8 @@
1285
  "surface": "repo_hf",
1286
  "shows": "Generates the selective artifact catalog from local files.",
1287
  "exists": true,
1288
- "bytes": 66058,
1289
- "sha256": "cc9c83c7094ef36b73125f902c6d8776203f8abc910cedb61610dadc1bb823a5"
1290
  },
1291
  {
1292
  "id": "publication_audit",
@@ -1297,7 +1330,7 @@
1297
  "volatile": true,
1298
  "shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
1299
  "exists": true,
1300
- "bytes": 10502,
1301
  "hash_policy": "existence_and_size_only"
1302
  },
1303
  {
@@ -1321,7 +1354,7 @@
1321
  "volatile": true,
1322
  "shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
1323
  "exists": true,
1324
- "bytes": 1392513,
1325
  "hash_policy": "existence_and_size_only"
1326
  },
1327
  {
@@ -1333,7 +1366,7 @@
1333
  "volatile": true,
1334
  "shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
1335
  "exists": true,
1336
- "bytes": 20022,
1337
  "hash_policy": "existence_and_size_only"
1338
  },
1339
  {
 
1
  {
2
  "title": "Ropedia Xperience-10M Task Suite Artifact Index",
3
+ "generated_at_utc": "2026-06-20T20:48:09+00:00",
4
  "status": "pass",
5
+ "artifact_count": 225,
6
  "missing": [],
7
  "by_kind": {
8
  "project_path": 18,
 
10
  "visual_asset_source": 3,
11
  "scaleup_contract": 7,
12
  "scaleup_status": 52,
13
+ "publication_workflow": 7,
14
  "reproducibility": 4,
15
  "project_scope": 1,
16
  "source_alignment": 5,
17
+ "evaluation_protocol": 9,
18
+ "website_data": 11,
19
  "generated_figure": 7,
20
  "visualization_builder": 1,
21
  "model_result": 5,
 
301
  "surface": "repo_hf",
302
  "shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
303
  "exists": true,
304
+ "bytes": 74316,
305
+ "sha256": "164c908bee1d4a6e0db344692833787582e45317b240ef5afbfbdb609a5175e6"
306
  },
307
  {
308
  "id": "task_suite_enhancement_128",
 
610
  "shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
611
  "exists": true,
612
  "bytes": 4432,
613
+ "sha256": "c916b18a11917e46e8561520cf2307f190c671c82e710ebd0f3522ec8a4be2bd"
614
  },
615
  {
616
  "id": "source_alignment_validator",
 
730
  "surface": "website_hf",
731
  "shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
732
  "exists": true,
733
+ "bytes": 228799,
734
+ "sha256": "c9c708f64963dac10e764eaae8e1b14c7161a938afa5ef5723fe59dc4ce764af"
735
  },
736
  {
737
  "id": "single_episode_task_model_radar_json",
 
742
  "shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
743
  "exists": true,
744
  "bytes": 51097,
745
+ "sha256": "d5e882120633f4d3ae90f1491682701c7593a42fc09e39b83fc5f375258e76e7"
746
  },
747
  {
748
  "id": "episode128_task_model_radar_json",
 
752
  "surface": "website_hf",
753
  "shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
754
  "exists": true,
755
+ "bytes": 184945,
756
+ "sha256": "8d4ef9c4cf1cf334fd41417d40fa0687ceefa964da9f8338c82f8cc6d36a3e76"
757
  },
758
  {
759
  "id": "task_method_20_result_matrix_json",
 
763
  "surface": "website_hf",
764
  "shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
765
  "exists": true,
766
+ "bytes": 128509,
767
+ "sha256": "382e538dff284c5e2cf19fe2b3eb014d1b48fb33082bb2ece532ce3de6c1e9bb"
768
  },
769
  {
770
  "id": "task_method_20_result_matrix",
 
786
  "shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
787
  "exists": true,
788
  "bytes": 8500,
789
+ "sha256": "9cfd2ce8c4eb3bbe7e2af3f41df3b3ab74db9db08d9ea2e4f569f612358470dd"
790
  },
791
  {
792
  "id": "task_method_20_gap_audit",
 
797
  "shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
798
  "exists": true,
799
  "bytes": 3417,
800
+ "sha256": "3afc5db9803b6419ce4f40d6fb0dd5380ae182fb85b4f7b0f6ea6a46ae065c63"
801
+ },
802
+ {
803
+ "id": "task_method_20_source_audit_json",
804
+ "title": "Task-method 20-result source audit JSON",
805
+ "path": "docs/data/task_method_20_source_audit.json",
806
+ "kind": "website_data",
807
+ "surface": "website_hf",
808
+ "shows": "Machine-readable check that scored JSON-backed matrix cells match their declared metric source values.",
809
+ "exists": true,
810
+ "bytes": 561,
811
+ "sha256": "c795c8f387648a90e66146efc44a4be2f272d4a44097f0b9b39a7347df83daa0"
812
+ },
813
+ {
814
+ "id": "task_method_20_source_audit",
815
+ "title": "Task-method 20-result source audit",
816
+ "path": "TASK_METHOD_20_SOURCE_AUDIT.md",
817
+ "kind": "evaluation_protocol",
818
+ "surface": "repo_hf",
819
+ "shows": "Reader-facing source-value audit for the 180-result matrix.",
820
+ "exists": true,
821
+ "bytes": 447,
822
+ "sha256": "2b8bc99b7157894d59fa2f23ebaee33ce9e6e01c0b7316c7555ab0071c85eb41"
823
  },
824
  {
825
  "id": "unified_task_model_radar_chart",
 
862
  "surface": "repo_hf",
863
  "shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
864
  "exists": true,
865
+ "bytes": 68542,
866
+ "sha256": "470b4c8acc437114b51d96987cd6324b9bf1d2ca16e9721d7fb00708aa58b383"
867
  },
868
  {
869
  "id": "task_method_20_gap_audit_builder",
 
876
  "bytes": 10295,
877
  "sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
878
  },
879
+ {
880
+ "id": "task_method_20_source_audit_validator",
881
+ "title": "Task-method source-audit validator",
882
+ "path": "scripts/validate_task_method_matrix_sources.py",
883
+ "kind": "publication_workflow",
884
+ "surface": "repo_hf",
885
+ "shows": "Fails release checks if a scored matrix row disagrees with its JSON metric source.",
886
+ "exists": true,
887
+ "bytes": 7877,
888
+ "sha256": "97edc3f064f77d544eff539bb7f16f8162e58ec581a63b91c473bada080f86ae"
889
+ },
890
  {
891
  "id": "all_task_model_scoring_waiter",
892
  "title": "All-task model scoring guarded waiter",
 
906
  "surface": "repo_hf",
907
  "shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
908
  "exists": true,
909
+ "bytes": 4320,
910
+ "sha256": "11cff26749bf6ad8b8ee028b18e0b4be5713ed8b5325578caa03be25d894263b"
911
  },
912
  {
913
  "id": "model_output_probe_script",
 
917
  "surface": "repo_hf",
918
  "shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
919
  "exists": true,
920
+ "bytes": 10520,
921
+ "sha256": "741ee733068e87c52c8da2bd15987e2b4538b5e705592182d76c42b5cf34fe96"
922
  },
923
  {
924
  "id": "existing_model_output_task_probe",
 
1137
  "surface": "repo_hf",
1138
  "shows": "Lists the automated and post-publish checks used to keep the release current.",
1139
  "exists": true,
1140
+ "bytes": 5184,
1141
+ "sha256": "4931d4457c4c5b0978fdf31861b6e3e2da6e24368398cf1756120a32cbff98f0"
1142
  },
1143
  {
1144
  "id": "quality_gate_manifest",
 
1148
  "surface": "website_hf",
1149
  "shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
1150
  "exists": true,
1151
+ "bytes": 8640,
1152
+ "sha256": "445196830bb913bfa075ae4174e7b1f5b64f623cf13a2afde7513add9dbefc21"
1153
  },
1154
  {
1155
  "id": "public_surface_qa",
 
1285
  "surface": "repo",
1286
  "shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
1287
  "exists": true,
1288
+ "bytes": 67647,
1289
+ "sha256": "d2b4af98e6fd8b23fd86cd068f2bbf887e5d69686dd62fe3bfc7e8251a6d75d6"
1290
  },
1291
  {
1292
  "id": "reproducibility_contract",
 
1318
  "surface": "repo_hf",
1319
  "shows": "Generates the selective artifact catalog from local files.",
1320
  "exists": true,
1321
+ "bytes": 67105,
1322
+ "sha256": "8fc1a2b5d4a50d49ff5738ec1e5e91088dbfa514c9f0485d3afe708add6d94a1"
1323
  },
1324
  {
1325
  "id": "publication_audit",
 
1330
  "volatile": true,
1331
  "shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
1332
  "exists": true,
1333
+ "bytes": 10662,
1334
  "hash_policy": "existence_and_size_only"
1335
  },
1336
  {
 
1354
  "volatile": true,
1355
  "shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
1356
  "exists": true,
1357
+ "bytes": 1395239,
1358
  "hash_policy": "existence_and_size_only"
1359
  },
1360
  {
 
1366
  "volatile": true,
1367
  "shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
1368
  "exists": true,
1369
+ "bytes": 20141,
1370
  "hash_policy": "existence_and_size_only"
1371
  },
1372
  {
data/episode128_task_model_radar.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "128-Episode 20-Task Radar",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-20T19:54:37+00:00",
5
  "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
6
  "task_count": 20,
7
  "method_count": 7,
@@ -1166,7 +1166,7 @@
1166
  "cosmos3_super_reasoner": {
1167
  "raw": 0.6286317274823326,
1168
  "metric_key": "temporal_order_f1",
1169
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
1170
  "scope": "multi_episode_128_partial_model_overlay",
1171
  "status": "scored",
1172
  "reason": null,
@@ -1257,7 +1257,7 @@
1257
  "cosmos3_super_reasoner": {
1258
  "raw": 0.37271645981034185,
1259
  "metric_key": "misalignment_detection_f1",
1260
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
1261
  "scope": "multi_episode_128_partial_model_overlay",
1262
  "status": "scored",
1263
  "reason": null,
@@ -1439,7 +1439,7 @@
1439
  "cosmos3_super_reasoner": {
1440
  "raw": 0.0,
1441
  "metric_key": "next_subtask_forecast_macro_f1",
1442
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
1443
  "scope": "multi_episode_128_partial_model_overlay",
1444
  "status": "scored",
1445
  "reason": null,
@@ -1519,7 +1519,7 @@
1519
  "qwen3_omni_v6_lora": {
1520
  "raw": 0.4318674027510605,
1521
  "metric_key": "macro_f1",
1522
- "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
1523
  "scope": "multi_episode_128_partial_model_overlay",
1524
  "status": "scored",
1525
  "reason": null,
@@ -1712,7 +1712,7 @@
1712
  "cosmos3_super_reasoner": {
1713
  "raw": 0.0009279881217520415,
1714
  "metric_key": "object_set_forecast_micro_f1",
1715
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
1716
  "scope": "multi_episode_128_partial_model_overlay",
1717
  "status": "scored",
1718
  "reason": null,
@@ -3372,7 +3372,7 @@
3372
  "raw_text": "0.6286",
3373
  "normalized_score": 0.6286317274823326,
3374
  "metric_key": "temporal_order_f1",
3375
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
3376
  "scope": "multi_episode_128_partial_model_overlay",
3377
  "reason": null
3378
  },
@@ -3498,7 +3498,7 @@
3498
  "raw_text": "0.3727",
3499
  "normalized_score": 0.37271645981034185,
3500
  "metric_key": "misalignment_detection_f1",
3501
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
3502
  "scope": "multi_episode_128_partial_model_overlay",
3503
  "reason": null
3504
  },
@@ -3750,7 +3750,7 @@
3750
  "raw_text": "0.0000",
3751
  "normalized_score": 0.0,
3752
  "metric_key": "next_subtask_forecast_macro_f1",
3753
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
3754
  "scope": "multi_episode_128_partial_model_overlay",
3755
  "reason": null
3756
  },
@@ -3858,7 +3858,7 @@
3858
  "raw_text": "0.4319",
3859
  "normalized_score": 0.4318674027510605,
3860
  "metric_key": "macro_f1",
3861
- "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
3862
  "scope": "multi_episode_128_partial_model_overlay",
3863
  "reason": null
3864
  },
@@ -4128,7 +4128,7 @@
4128
  "raw_text": "0.0009",
4129
  "normalized_score": 0.0009279881217520415,
4130
  "metric_key": "object_set_forecast_micro_f1",
4131
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
4132
  "scope": "multi_episode_128_partial_model_overlay",
4133
  "reason": null
4134
  },
 
1
  {
2
  "title": "128-Episode 20-Task Radar",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-20T20:38:21+00:00",
5
  "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
6
  "task_count": 20,
7
  "method_count": 7,
 
1166
  "cosmos3_super_reasoner": {
1167
  "raw": 0.6286317274823326,
1168
  "metric_key": "temporal_order_f1",
1169
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
1170
  "scope": "multi_episode_128_partial_model_overlay",
1171
  "status": "scored",
1172
  "reason": null,
 
1257
  "cosmos3_super_reasoner": {
1258
  "raw": 0.37271645981034185,
1259
  "metric_key": "misalignment_detection_f1",
1260
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
1261
  "scope": "multi_episode_128_partial_model_overlay",
1262
  "status": "scored",
1263
  "reason": null,
 
1439
  "cosmos3_super_reasoner": {
1440
  "raw": 0.0,
1441
  "metric_key": "next_subtask_forecast_macro_f1",
1442
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
1443
  "scope": "multi_episode_128_partial_model_overlay",
1444
  "status": "scored",
1445
  "reason": null,
 
1519
  "qwen3_omni_v6_lora": {
1520
  "raw": 0.4318674027510605,
1521
  "metric_key": "macro_f1",
1522
+ "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
1523
  "scope": "multi_episode_128_partial_model_overlay",
1524
  "status": "scored",
1525
  "reason": null,
 
1712
  "cosmos3_super_reasoner": {
1713
  "raw": 0.0009279881217520415,
1714
  "metric_key": "object_set_forecast_micro_f1",
1715
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
1716
  "scope": "multi_episode_128_partial_model_overlay",
1717
  "status": "scored",
1718
  "reason": null,
 
3372
  "raw_text": "0.6286",
3373
  "normalized_score": 0.6286317274823326,
3374
  "metric_key": "temporal_order_f1",
3375
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
3376
  "scope": "multi_episode_128_partial_model_overlay",
3377
  "reason": null
3378
  },
 
3498
  "raw_text": "0.3727",
3499
  "normalized_score": 0.37271645981034185,
3500
  "metric_key": "misalignment_detection_f1",
3501
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
3502
  "scope": "multi_episode_128_partial_model_overlay",
3503
  "reason": null
3504
  },
 
3750
  "raw_text": "0.0000",
3751
  "normalized_score": 0.0,
3752
  "metric_key": "next_subtask_forecast_macro_f1",
3753
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
3754
  "scope": "multi_episode_128_partial_model_overlay",
3755
  "reason": null
3756
  },
 
3858
  "raw_text": "0.4319",
3859
  "normalized_score": 0.4318674027510605,
3860
  "metric_key": "macro_f1",
3861
+ "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
3862
  "scope": "multi_episode_128_partial_model_overlay",
3863
  "reason": null
3864
  },
 
4128
  "raw_text": "0.0009",
4129
  "normalized_score": 0.0009279881217520415,
4130
  "metric_key": "object_set_forecast_micro_f1",
4131
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
4132
  "scope": "multi_episode_128_partial_model_overlay",
4133
  "reason": null
4134
  },
data/public_surface_qa.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-20T19:55:18+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
@@ -18,7 +18,7 @@
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
- "generated_at_utc": "2026-06-20T19:39:18+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
@@ -28,27 +28,27 @@
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
- "generated_at_utc": "2026-06-20T18:44:50+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
- "generated_at_utc": "2026-06-20T18:44:49+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
- "generated_at_utc": "2026-06-20T18:45:07+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
- "generated_at_utc": "2026-06-20T19:39:40+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
- "generated_at_utc": "2026-06-20T19:40:53+00:00"
52
  }
53
  },
54
  "failures": {}
@@ -111,7 +111,7 @@
111
  "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
112
  "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
113
  "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
114
- "https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results": 9,
115
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
116
  "https://ropedia.com/dataset": 5
117
  }
 
1
  {
2
  "title": "Ropedia Xperience-10M Public Project Surface",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-20T20:48:08+00:00",
5
  "scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
6
  "checks": [
7
  {
 
18
  "website_integrity": {
19
  "exists": true,
20
  "status": "pass",
21
+ "generated_at_utc": "2026-06-20T20:41:45+00:00"
22
  },
23
  "rendered_site_check": {
24
  "exists": true,
 
28
  "task_surface_integrity": {
29
  "exists": true,
30
  "status": "pass",
31
+ "generated_at_utc": "2026-06-20T19:55:17+00:00"
32
  },
33
  "source_alignment": {
34
  "exists": true,
35
  "status": "pass",
36
+ "generated_at_utc": "2026-06-20T19:55:18+00:00"
37
  },
38
  "scale_up_status": {
39
  "exists": true,
40
  "status": "pass",
41
+ "generated_at_utc": "2026-06-20T19:55:26+00:00"
42
  },
43
  "publication_package": {
44
  "exists": true,
45
  "status": "pass",
46
+ "generated_at_utc": "2026-06-20T20:42:41+00:00"
47
  },
48
  "mirror_parity": {
49
  "exists": true,
50
  "status": "pass",
51
+ "generated_at_utc": "2026-06-20T20:47:51+00:00"
52
  }
53
  },
54
  "failures": {}
 
111
  "https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
112
  "https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
113
  "https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
114
+ "https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results": 6,
115
  "https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
116
  "https://ropedia.com/dataset": 5
117
  }
data/quality_gates.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Ropedia Xperience-10M Release Checks",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-20T18:44:28+00:00",
5
  "rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
6
  "automated_gates": [
7
  {
@@ -76,6 +76,18 @@
76
  "status": "pass"
77
  }
78
  },
 
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "id": "figure_index",
81
  "title": "Figure index",
 
1
  {
2
  "title": "Ropedia Xperience-10M Release Checks",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-20T20:48:18+00:00",
5
  "rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
6
  "automated_gates": [
7
  {
 
76
  "status": "pass"
77
  }
78
  },
79
+ {
80
+ "id": "task_method_source_audit",
81
+ "title": "Task-method source audit",
82
+ "command": "python scripts/validate_task_method_matrix_sources.py",
83
+ "report": "docs/data/task_method_20_source_audit.json",
84
+ "blocks_if": "A scored 20-task matrix cell points to a JSON metric source that does not contain the same metric value.",
85
+ "shows": "Public 20-task scores remain traceable to their task-specific metric artifacts.",
86
+ "current_report": {
87
+ "exists": true,
88
+ "status": "pass"
89
+ }
90
+ },
91
  {
92
  "id": "figure_index",
93
  "title": "Figure index",
data/single_episode_task_model_radar.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Single-Episode 20-Task Radar",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-20T19:54:37+00:00",
5
  "description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
6
  "task_count": 20,
7
  "method_count": 2,
 
1
  {
2
  "title": "Single-Episode 20-Task Radar",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-20T20:38:21+00:00",
5
  "description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
6
  "task_count": 20,
7
  "method_count": 2,
data/task_method_20_gap_audit.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "generated_at_utc": "2026-06-20T19:54:37+00:00",
3
  "immediate_actions": [
4
  {
5
  "artifact": "docs/data/task_method_20_gap_audit.json",
 
1
  {
2
+ "generated_at_utc": "2026-06-20T20:38:59+00:00",
3
  "immediate_actions": [
4
  {
5
  "artifact": "docs/data/task_method_20_gap_audit.json",
data/task_method_20_result_matrix.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "title": "Task Method 20-Result Matrix",
3
  "status": "pass",
4
- "generated_at_utc": "2026-06-20T19:54:37+00:00",
5
  "task_count": 20,
6
  "method_count": 9,
7
  "method_task_record_count": 180,
@@ -1980,7 +1980,7 @@
1980
  "raw_text": "0.6286",
1981
  "normalized_score": 0.6286317274823326,
1982
  "metric_key": "temporal_order_f1",
1983
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
1984
  "scope": "multi_episode_128_partial_model_overlay",
1985
  "reason": null
1986
  },
@@ -2142,7 +2142,7 @@
2142
  "raw_text": "0.3727",
2143
  "normalized_score": 0.37271645981034185,
2144
  "metric_key": "misalignment_detection_f1",
2145
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
2146
  "scope": "multi_episode_128_partial_model_overlay",
2147
  "reason": null
2148
  },
@@ -2466,7 +2466,7 @@
2466
  "raw_text": "0.0000",
2467
  "normalized_score": 0.0,
2468
  "metric_key": "next_subtask_forecast_macro_f1",
2469
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
2470
  "scope": "multi_episode_128_partial_model_overlay",
2471
  "reason": null
2472
  },
@@ -2610,7 +2610,7 @@
2610
  "raw_text": "0.4319",
2611
  "normalized_score": 0.4318674027510605,
2612
  "metric_key": "macro_f1",
2613
- "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json",
2614
  "scope": "multi_episode_128_partial_model_overlay",
2615
  "reason": null
2616
  },
@@ -2952,7 +2952,7 @@
2952
  "raw_text": "0.0009",
2953
  "normalized_score": 0.0009279881217520415,
2954
  "metric_key": "object_set_forecast_micro_f1",
2955
- "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json",
2956
  "scope": "multi_episode_128_partial_model_overlay",
2957
  "reason": null
2958
  },
 
1
  {
2
  "title": "Task Method 20-Result Matrix",
3
  "status": "pass",
4
+ "generated_at_utc": "2026-06-20T20:38:21+00:00",
5
  "task_count": 20,
6
  "method_count": 9,
7
  "method_task_record_count": 180,
 
1980
  "raw_text": "0.6286",
1981
  "normalized_score": 0.6286317274823326,
1982
  "metric_key": "temporal_order_f1",
1983
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
1984
  "scope": "multi_episode_128_partial_model_overlay",
1985
  "reason": null
1986
  },
 
2142
  "raw_text": "0.3727",
2143
  "normalized_score": 0.37271645981034185,
2144
  "metric_key": "misalignment_detection_f1",
2145
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
2146
  "scope": "multi_episode_128_partial_model_overlay",
2147
  "reason": null
2148
  },
 
2466
  "raw_text": "0.0000",
2467
  "normalized_score": 0.0,
2468
  "metric_key": "next_subtask_forecast_macro_f1",
2469
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
2470
  "scope": "multi_episode_128_partial_model_overlay",
2471
  "reason": null
2472
  },
 
2610
  "raw_text": "0.4319",
2611
  "normalized_score": 0.4318674027510605,
2612
  "metric_key": "macro_f1",
2613
+ "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
2614
  "scope": "multi_episode_128_partial_model_overlay",
2615
  "reason": null
2616
  },
 
2952
  "raw_text": "0.0009",
2953
  "normalized_score": 0.0009279881217520415,
2954
  "metric_key": "object_set_forecast_micro_f1",
2955
+ "source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
2956
  "scope": "multi_episode_128_partial_model_overlay",
2957
  "reason": null
2958
  },
data/task_method_20_source_audit.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checked_json_metric_count": 180,
3
+ "failure_count": 0,
4
+ "failures": [],
5
+ "generated_at_utc": "2026-06-20T20:48:41+00:00",
6
+ "method_task_record_count": 180,
7
+ "rule": "Every scored row that declares a JSON metric source must have the same numeric value under that row's metric_key.",
8
+ "scored_method_task_count": 180,
9
+ "skipped_record_count": 0,
10
+ "skipped_records": [],
11
+ "source_matrix": "docs/data/task_method_20_result_matrix.json",
12
+ "status": "pass",
13
+ "status_counts": {
14
+ "checked": 180
15
+ },
16
+ "title": "Task Method 20 Matrix Source Audit"
17
+ }
data/website_integrity.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "status": "pass",
3
- "generated_at_utc": "2026-06-20T19:55:48+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
7
  "html_pages": 4,
8
  "local_references": 213,
9
  "external_reference_count": 152,
10
- "json_files": 50,
11
  "image_assets_referenced": 28,
12
  "failure_count": 0
13
  },
@@ -301,7 +301,7 @@
301
  },
302
  {
303
  "path": "data/artifact_index.json",
304
- "bytes": 121435,
305
  "top_level_type": "dict"
306
  },
307
  {
@@ -316,7 +316,7 @@
316
  },
317
  {
318
  "path": "data/episode128_task_model_radar.json",
319
- "bytes": 184889,
320
  "top_level_type": "dict"
321
  },
322
  {
@@ -491,7 +491,12 @@
491
  },
492
  {
493
  "path": "data/task_method_20_result_matrix.json",
494
- "bytes": 128481,
 
 
 
 
 
495
  "top_level_type": "dict"
496
  },
497
  {
@@ -526,7 +531,7 @@
526
  },
527
  {
528
  "path": "data/unified_task_model_radar.json",
529
- "bytes": 228743,
530
  "top_level_type": "dict"
531
  },
532
  {
 
1
  {
2
  "status": "pass",
3
+ "generated_at_utc": "2026-06-20T20:41:45+00:00",
4
  "docs_root": "docs",
5
  "site_base": "/ropedia-xperience-10m-task-suite/",
6
  "summary": {
7
  "html_pages": 4,
8
  "local_references": 213,
9
  "external_reference_count": 152,
10
+ "json_files": 51,
11
  "image_assets_referenced": 28,
12
  "failure_count": 0
13
  },
 
301
  },
302
  {
303
  "path": "data/artifact_index.json",
304
+ "bytes": 122823,
305
  "top_level_type": "dict"
306
  },
307
  {
 
316
  },
317
  {
318
  "path": "data/episode128_task_model_radar.json",
319
+ "bytes": 184945,
320
  "top_level_type": "dict"
321
  },
322
  {
 
491
  },
492
  {
493
  "path": "data/task_method_20_result_matrix.json",
494
+ "bytes": 128509,
495
+ "top_level_type": "dict"
496
+ },
497
+ {
498
+ "path": "data/task_method_20_source_audit.json",
499
+ "bytes": 561,
500
  "top_level_type": "dict"
501
  },
502
  {
 
531
  },
532
  {
533
  "path": "data/unified_task_model_radar.json",
534
+ "bytes": 228799,
535
  "top_level_type": "dict"
536
  },
537
  {
scripts/omni/eval_cosmos3_super_future_task_probes.py CHANGED
@@ -31,6 +31,7 @@ from eval_qwen3_omni_future_task_probes import (
31
  score_task as qwen_score_task,
32
  select_eval_indices,
33
  select_tasks,
 
34
  task_target_value,
35
  time_to_transition_map,
36
  write_json,
@@ -212,9 +213,14 @@ def main() -> int:
212
  samples = load_jsonl(args.dataset_jsonl)
213
  future_map = future_index_map(samples, args.future_frames)
214
  transition_targets = time_to_transition_map(samples)
215
- eval_indices = [idx for idx in select_eval_indices(samples, args) if idx in future_map]
216
- if not eval_indices:
217
- raise ValueError("No evaluation samples with future targets selected.")
 
 
 
 
 
218
 
219
  write_json(args.output_dir / "server_info.json", server_info(args))
220
  append_jsonl(
@@ -224,7 +230,9 @@ def main() -> int:
224
  "timestamp": time.time(),
225
  "run_id": args.run_id,
226
  "tasks": selected_tasks,
227
- "num_eval_samples_with_future": len(eval_indices),
 
 
228
  "sample_offset": args.sample_offset,
229
  "sample_stride": args.sample_stride,
230
  "future_frames": args.future_frames,
@@ -246,9 +254,10 @@ def main() -> int:
246
  for task_id in selected_tasks:
247
  spec = TASK_SPECS[task_id]
248
  partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
249
- for local_pos, sample_idx in enumerate(eval_indices, start=1):
 
250
  sample = samples[sample_idx]
251
- future_sample = samples[future_map[sample_idx]]
252
  pred_id = prediction_id(task_id, sample)
253
  if args.resume and pred_id in partial_by_task[task_id]:
254
  continue
@@ -301,7 +310,7 @@ def main() -> int:
301
  "timestamp": time.time(),
302
  "task_id": task_id,
303
  "sample_index": local_pos,
304
- "num_eval_samples": len(eval_indices),
305
  "completed_samples_for_task": len(partial_by_task[task_id]),
306
  "sample_id": sample.get("id"),
307
  "seconds": round(time.time() - started, 3),
@@ -310,7 +319,7 @@ def main() -> int:
310
 
311
  task_metrics = {}
312
  for task_id in selected_tasks:
313
- rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in eval_indices]
314
  task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
315
 
316
  display_name = model_display_name(args)
 
31
  score_task as qwen_score_task,
32
  select_eval_indices,
33
  select_tasks,
34
+ task_requires_future_sample,
35
  task_target_value,
36
  time_to_transition_map,
37
  write_json,
 
213
  samples = load_jsonl(args.dataset_jsonl)
214
  future_map = future_index_map(samples, args.future_frames)
215
  transition_targets = time_to_transition_map(samples)
216
+ base_eval_indices = select_eval_indices(samples, args)
217
+ eval_indices_by_task = {
218
+ task_id: [idx for idx in base_eval_indices if (not task_requires_future_sample(task_id) or idx in future_map)]
219
+ for task_id in selected_tasks
220
+ }
221
+ empty_tasks = [task_id for task_id, indices in eval_indices_by_task.items() if not indices]
222
+ if empty_tasks:
223
+ raise ValueError(f"No evaluation samples selected for tasks: {', '.join(empty_tasks)}")
224
 
225
  write_json(args.output_dir / "server_info.json", server_info(args))
226
  append_jsonl(
 
230
  "timestamp": time.time(),
231
  "run_id": args.run_id,
232
  "tasks": selected_tasks,
233
+ "num_base_eval_samples": len(base_eval_indices),
234
+ "num_eval_samples_by_task": {task_id: len(indices) for task_id, indices in eval_indices_by_task.items()},
235
+ "num_eval_samples_with_future": sum(1 for idx in base_eval_indices if idx in future_map),
236
  "sample_offset": args.sample_offset,
237
  "sample_stride": args.sample_stride,
238
  "future_frames": args.future_frames,
 
254
  for task_id in selected_tasks:
255
  spec = TASK_SPECS[task_id]
256
  partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
257
+ task_eval_indices = eval_indices_by_task[task_id]
258
+ for local_pos, sample_idx in enumerate(task_eval_indices, start=1):
259
  sample = samples[sample_idx]
260
+ future_sample = samples[future_map[sample_idx]] if task_requires_future_sample(task_id) else sample
261
  pred_id = prediction_id(task_id, sample)
262
  if args.resume and pred_id in partial_by_task[task_id]:
263
  continue
 
310
  "timestamp": time.time(),
311
  "task_id": task_id,
312
  "sample_index": local_pos,
313
+ "num_eval_samples": len(task_eval_indices),
314
  "completed_samples_for_task": len(partial_by_task[task_id]),
315
  "sample_id": sample.get("id"),
316
  "seconds": round(time.time() - started, 3),
 
319
 
320
  task_metrics = {}
321
  for task_id in selected_tasks:
322
+ rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in eval_indices_by_task[task_id]]
323
  task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
324
 
325
  display_name = model_display_name(args)
scripts/omni/merge_cosmos3_super_future_task_probe_shards.py CHANGED
@@ -56,13 +56,27 @@ def main() -> int:
56
  args.output_dir.mkdir(parents=True, exist_ok=True)
57
  task_metrics: dict[str, dict[str, Any]] = {}
58
  first_metrics: dict[str, Any] | None = None
 
59
 
60
  for task_id, spec in TASK_SPECS.items():
61
  rows_by_id: dict[str, dict[str, Any]] = {}
 
62
  for shard_dir in args.shard_dir:
63
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
64
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
65
- rows_by_id.setdefault(key, row)
 
 
 
 
 
 
 
 
 
 
 
 
66
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
67
  if shard_metrics and first_metrics is None:
68
  first_metrics = shard_metrics
@@ -90,6 +104,9 @@ def main() -> int:
90
  "status": "pass",
91
  "run_id": args.run_id,
92
  "shard_dirs": [str(path) for path in args.shard_dir],
 
 
 
93
  "tasks": {
94
  task_id: {
95
  "task_number": metrics["task_number"],
 
56
  args.output_dir.mkdir(parents=True, exist_ok=True)
57
  task_metrics: dict[str, dict[str, Any]] = {}
58
  first_metrics: dict[str, Any] | None = None
59
+ duplicate_predictions: list[dict[str, Any]] = []
60
 
61
  for task_id, spec in TASK_SPECS.items():
62
  rows_by_id: dict[str, dict[str, Any]] = {}
63
+ row_sources: dict[str, str] = {}
64
  for shard_dir in args.shard_dir:
65
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
66
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
67
+ if key in rows_by_id:
68
+ duplicate_predictions.append(
69
+ {
70
+ "task_id": task_id,
71
+ "prediction_id": key,
72
+ "kept_shard": row_sources.get(key),
73
+ "duplicate_shard": str(shard_dir),
74
+ "conflict": rows_by_id[key] != row,
75
+ }
76
+ )
77
+ continue
78
+ rows_by_id[key] = row
79
+ row_sources[key] = str(shard_dir)
80
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
81
  if shard_metrics and first_metrics is None:
82
  first_metrics = shard_metrics
 
104
  "status": "pass",
105
  "run_id": args.run_id,
106
  "shard_dirs": [str(path) for path in args.shard_dir],
107
+ "duplicate_prediction_count": len(duplicate_predictions),
108
+ "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
109
+ "duplicate_predictions": duplicate_predictions[:50],
110
  "tasks": {
111
  task_id: {
112
  "task_number": metrics["task_number"],
scripts/omni/merge_qwen3_omni_future_task_probe_shards.py CHANGED
@@ -54,13 +54,27 @@ def main() -> int:
54
  args.output_dir.mkdir(parents=True, exist_ok=True)
55
  task_metrics: dict[str, dict[str, Any]] = {}
56
  first_metrics: dict[str, Any] | None = None
 
57
 
58
  for task_id, spec in TASK_SPECS.items():
59
  rows_by_id: dict[str, dict[str, Any]] = {}
 
60
  for shard_dir in args.shard_dir:
61
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
62
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
63
- rows_by_id.setdefault(key, row)
 
 
 
 
 
 
 
 
 
 
 
 
64
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
65
  if shard_metrics and first_metrics is None:
66
  first_metrics = shard_metrics
@@ -82,6 +96,9 @@ def main() -> int:
82
  "status": "pass",
83
  "run_id": args.run_id,
84
  "shard_dirs": [str(path) for path in args.shard_dir],
 
 
 
85
  "tasks": {
86
  task_id: {
87
  "task_number": metrics["task_number"],
 
54
  args.output_dir.mkdir(parents=True, exist_ok=True)
55
  task_metrics: dict[str, dict[str, Any]] = {}
56
  first_metrics: dict[str, Any] | None = None
57
+ duplicate_predictions: list[dict[str, Any]] = []
58
 
59
  for task_id, spec in TASK_SPECS.items():
60
  rows_by_id: dict[str, dict[str, Any]] = {}
61
+ row_sources: dict[str, str] = {}
62
  for shard_dir in args.shard_dir:
63
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
64
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
65
+ if key in rows_by_id:
66
+ duplicate_predictions.append(
67
+ {
68
+ "task_id": task_id,
69
+ "prediction_id": key,
70
+ "kept_shard": row_sources.get(key),
71
+ "duplicate_shard": str(shard_dir),
72
+ "conflict": rows_by_id[key] != row,
73
+ }
74
+ )
75
+ continue
76
+ rows_by_id[key] = row
77
+ row_sources[key] = str(shard_dir)
78
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
79
  if shard_metrics and first_metrics is None:
80
  first_metrics = shard_metrics
 
96
  "status": "pass",
97
  "run_id": args.run_id,
98
  "shard_dirs": [str(path) for path in args.shard_dir],
99
+ "duplicate_prediction_count": len(duplicate_predictions),
100
+ "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
101
+ "duplicate_predictions": duplicate_predictions[:50],
102
  "tasks": {
103
  task_id: {
104
  "task_number": metrics["task_number"],
scripts/omni/merge_qwen3_omni_retrieval_task_probe_shards.py CHANGED
@@ -55,13 +55,27 @@ def main() -> int:
55
  args.output_dir.mkdir(parents=True, exist_ok=True)
56
  task_metrics: dict[str, dict[str, Any]] = {}
57
  first_metrics: dict[str, Any] | None = None
 
58
 
59
  for task_id, spec in TASK_SPECS.items():
60
  rows_by_id: dict[str, dict[str, Any]] = {}
 
61
  for shard_dir in args.shard_dir:
62
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
63
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
64
- rows_by_id.setdefault(key, row)
 
 
 
 
 
 
 
 
 
 
 
 
65
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
66
  if shard_metrics and first_metrics is None:
67
  first_metrics = shard_metrics
@@ -86,6 +100,9 @@ def main() -> int:
86
  "status": "pass",
87
  "run_id": args.run_id,
88
  "shard_dirs": [str(path) for path in args.shard_dir],
 
 
 
89
  "tasks": {
90
  task_id: {
91
  "task_number": metrics["task_number"],
 
55
  args.output_dir.mkdir(parents=True, exist_ok=True)
56
  task_metrics: dict[str, dict[str, Any]] = {}
57
  first_metrics: dict[str, Any] | None = None
58
+ duplicate_predictions: list[dict[str, Any]] = []
59
 
60
  for task_id, spec in TASK_SPECS.items():
61
  rows_by_id: dict[str, dict[str, Any]] = {}
62
+ row_sources: dict[str, str] = {}
63
  for shard_dir in args.shard_dir:
64
  for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
65
  key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
66
+ if key in rows_by_id:
67
+ duplicate_predictions.append(
68
+ {
69
+ "task_id": task_id,
70
+ "prediction_id": key,
71
+ "kept_shard": row_sources.get(key),
72
+ "duplicate_shard": str(shard_dir),
73
+ "conflict": rows_by_id[key] != row,
74
+ }
75
+ )
76
+ continue
77
+ rows_by_id[key] = row
78
+ row_sources[key] = str(shard_dir)
79
  shard_metrics = read_json(shard_dir / task_id / "metrics.json")
80
  if shard_metrics and first_metrics is None:
81
  first_metrics = shard_metrics
 
100
  "status": "pass",
101
  "run_id": args.run_id,
102
  "shard_dirs": [str(path) for path in args.shard_dir],
103
+ "duplicate_prediction_count": len(duplicate_predictions),
104
+ "duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
105
+ "duplicate_predictions": duplicate_predictions[:50],
106
  "tasks": {
107
  task_id: {
108
  "task_number": metrics["task_number"],
scripts/omni/run_128_task_baselines.py CHANGED
@@ -133,7 +133,7 @@ def parse_args() -> argparse.Namespace:
133
  default=256,
134
  help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
135
  )
136
- parser.add_argument("--include-neural", action="store_true", default=True)
137
  parser.add_argument("--neural-epochs", type=int, default=35)
138
  parser.add_argument("--neural-hidden-dim", type=int, default=128)
139
  parser.add_argument("--neural-batch-size", type=int, default=256)
@@ -335,8 +335,6 @@ def row_text_features(row: dict[str, Any], episode: dict[str, Any] | None) -> st
335
  parts.extend([
336
  "main_task:",
337
  norm(episode.get("main_task")),
338
- "episode_split:",
339
- norm(episode.get("split")),
340
  ])
341
  media = row.get("media") or {}
342
  parts.extend([
 
133
  default=256,
134
  help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
135
  )
136
+ parser.add_argument("--include-neural", action=argparse.BooleanOptionalAction, default=True)
137
  parser.add_argument("--neural-epochs", type=int, default=35)
138
  parser.add_argument("--neural-hidden-dim", type=int, default=128)
139
  parser.add_argument("--neural-batch-size", type=int, default=256)
 
335
  parts.extend([
336
  "main_task:",
337
  norm(episode.get("main_task")),
 
 
338
  ])
339
  media = row.get("media") or {}
340
  parts.extend([
scripts/omni/train_cosmos3_super_forward_dynamics_lora.py CHANGED
@@ -884,7 +884,8 @@ def main() -> int:
884
  if accelerator.is_main_process:
885
  write_json(output_dir / "training_metadata.json", payload)
886
  write_report(output_dir, payload)
887
- append_jsonl(progress_path, {"event": "complete", "timestamp": time.time(), "status": status})
 
888
 
889
  if accelerator.is_main_process:
890
  print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))
 
884
  if accelerator.is_main_process:
885
  write_json(output_dir / "training_metadata.json", payload)
886
  write_report(output_dir, payload)
887
+ final_event = "complete" if status in {"complete", "dry_run_complete"} else "finalized_failed"
888
+ append_jsonl(progress_path, {"event": final_event, "timestamp": time.time(), "status": status})
889
 
890
  if accelerator.is_main_process:
891
  print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))