Add files using upload-large-folder tool
Browse files- data/artifact_index.json +67 -34
- data/episode128_task_model_radar.json +11 -11
- data/public_surface_qa.json +8 -8
- data/quality_gates.json +13 -1
- data/single_episode_task_model_radar.json +1 -1
- data/task_method_20_gap_audit.json +1 -1
- data/task_method_20_result_matrix.json +6 -6
- data/task_method_20_source_audit.json +17 -0
- data/website_integrity.json +11 -6
- scripts/omni/eval_cosmos3_super_future_task_probes.py +17 -8
- scripts/omni/merge_cosmos3_super_future_task_probe_shards.py +18 -1
- scripts/omni/merge_qwen3_omni_future_task_probe_shards.py +18 -1
- scripts/omni/merge_qwen3_omni_retrieval_task_probe_shards.py +18 -1
- scripts/omni/run_128_task_baselines.py +1 -3
- scripts/omni/train_cosmos3_super_forward_dynamics_lora.py +2 -1
data/artifact_index.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Task Suite Artifact Index",
|
| 3 |
-
"generated_at_utc": "2026-06-
|
| 4 |
"status": "pass",
|
| 5 |
-
"artifact_count":
|
| 6 |
"missing": [],
|
| 7 |
"by_kind": {
|
| 8 |
"project_path": 18,
|
|
@@ -10,12 +10,12 @@
|
|
| 10 |
"visual_asset_source": 3,
|
| 11 |
"scaleup_contract": 7,
|
| 12 |
"scaleup_status": 52,
|
| 13 |
-
"publication_workflow":
|
| 14 |
"reproducibility": 4,
|
| 15 |
"project_scope": 1,
|
| 16 |
"source_alignment": 5,
|
| 17 |
-
"evaluation_protocol":
|
| 18 |
-
"website_data":
|
| 19 |
"generated_figure": 7,
|
| 20 |
"visualization_builder": 1,
|
| 21 |
"model_result": 5,
|
|
@@ -301,8 +301,8 @@
|
|
| 301 |
"surface": "repo_hf",
|
| 302 |
"shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
|
| 303 |
"exists": true,
|
| 304 |
-
"bytes":
|
| 305 |
-
"sha256": "
|
| 306 |
},
|
| 307 |
{
|
| 308 |
"id": "task_suite_enhancement_128",
|
|
@@ -610,7 +610,7 @@
|
|
| 610 |
"shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
|
| 611 |
"exists": true,
|
| 612 |
"bytes": 4432,
|
| 613 |
-
"sha256": "
|
| 614 |
},
|
| 615 |
{
|
| 616 |
"id": "source_alignment_validator",
|
|
@@ -730,8 +730,8 @@
|
|
| 730 |
"surface": "website_hf",
|
| 731 |
"shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
|
| 732 |
"exists": true,
|
| 733 |
-
"bytes":
|
| 734 |
-
"sha256": "
|
| 735 |
},
|
| 736 |
{
|
| 737 |
"id": "single_episode_task_model_radar_json",
|
|
@@ -742,7 +742,7 @@
|
|
| 742 |
"shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
|
| 743 |
"exists": true,
|
| 744 |
"bytes": 51097,
|
| 745 |
-
"sha256": "
|
| 746 |
},
|
| 747 |
{
|
| 748 |
"id": "episode128_task_model_radar_json",
|
|
@@ -752,8 +752,8 @@
|
|
| 752 |
"surface": "website_hf",
|
| 753 |
"shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
|
| 754 |
"exists": true,
|
| 755 |
-
"bytes":
|
| 756 |
-
"sha256": "
|
| 757 |
},
|
| 758 |
{
|
| 759 |
"id": "task_method_20_result_matrix_json",
|
|
@@ -763,8 +763,8 @@
|
|
| 763 |
"surface": "website_hf",
|
| 764 |
"shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
|
| 765 |
"exists": true,
|
| 766 |
-
"bytes":
|
| 767 |
-
"sha256": "
|
| 768 |
},
|
| 769 |
{
|
| 770 |
"id": "task_method_20_result_matrix",
|
|
@@ -786,7 +786,7 @@
|
|
| 786 |
"shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
|
| 787 |
"exists": true,
|
| 788 |
"bytes": 8500,
|
| 789 |
-
"sha256": "
|
| 790 |
},
|
| 791 |
{
|
| 792 |
"id": "task_method_20_gap_audit",
|
|
@@ -797,7 +797,29 @@
|
|
| 797 |
"shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
|
| 798 |
"exists": true,
|
| 799 |
"bytes": 3417,
|
| 800 |
-
"sha256": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
},
|
| 802 |
{
|
| 803 |
"id": "unified_task_model_radar_chart",
|
|
@@ -840,8 +862,8 @@
|
|
| 840 |
"surface": "repo_hf",
|
| 841 |
"shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
|
| 842 |
"exists": true,
|
| 843 |
-
"bytes":
|
| 844 |
-
"sha256": "
|
| 845 |
},
|
| 846 |
{
|
| 847 |
"id": "task_method_20_gap_audit_builder",
|
|
@@ -854,6 +876,17 @@
|
|
| 854 |
"bytes": 10295,
|
| 855 |
"sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
|
| 856 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
{
|
| 858 |
"id": "all_task_model_scoring_waiter",
|
| 859 |
"title": "All-task model scoring guarded waiter",
|
|
@@ -873,8 +906,8 @@
|
|
| 873 |
"surface": "repo_hf",
|
| 874 |
"shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
|
| 875 |
"exists": true,
|
| 876 |
-
"bytes":
|
| 877 |
-
"sha256": "
|
| 878 |
},
|
| 879 |
{
|
| 880 |
"id": "model_output_probe_script",
|
|
@@ -884,8 +917,8 @@
|
|
| 884 |
"surface": "repo_hf",
|
| 885 |
"shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
|
| 886 |
"exists": true,
|
| 887 |
-
"bytes":
|
| 888 |
-
"sha256": "
|
| 889 |
},
|
| 890 |
{
|
| 891 |
"id": "existing_model_output_task_probe",
|
|
@@ -1104,8 +1137,8 @@
|
|
| 1104 |
"surface": "repo_hf",
|
| 1105 |
"shows": "Lists the automated and post-publish checks used to keep the release current.",
|
| 1106 |
"exists": true,
|
| 1107 |
-
"bytes":
|
| 1108 |
-
"sha256": "
|
| 1109 |
},
|
| 1110 |
{
|
| 1111 |
"id": "quality_gate_manifest",
|
|
@@ -1115,8 +1148,8 @@
|
|
| 1115 |
"surface": "website_hf",
|
| 1116 |
"shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
|
| 1117 |
"exists": true,
|
| 1118 |
-
"bytes":
|
| 1119 |
-
"sha256": "
|
| 1120 |
},
|
| 1121 |
{
|
| 1122 |
"id": "public_surface_qa",
|
|
@@ -1252,8 +1285,8 @@
|
|
| 1252 |
"surface": "repo",
|
| 1253 |
"shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
|
| 1254 |
"exists": true,
|
| 1255 |
-
"bytes":
|
| 1256 |
-
"sha256": "
|
| 1257 |
},
|
| 1258 |
{
|
| 1259 |
"id": "reproducibility_contract",
|
|
@@ -1285,8 +1318,8 @@
|
|
| 1285 |
"surface": "repo_hf",
|
| 1286 |
"shows": "Generates the selective artifact catalog from local files.",
|
| 1287 |
"exists": true,
|
| 1288 |
-
"bytes":
|
| 1289 |
-
"sha256": "
|
| 1290 |
},
|
| 1291 |
{
|
| 1292 |
"id": "publication_audit",
|
|
@@ -1297,7 +1330,7 @@
|
|
| 1297 |
"volatile": true,
|
| 1298 |
"shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
|
| 1299 |
"exists": true,
|
| 1300 |
-
"bytes":
|
| 1301 |
"hash_policy": "existence_and_size_only"
|
| 1302 |
},
|
| 1303 |
{
|
|
@@ -1321,7 +1354,7 @@
|
|
| 1321 |
"volatile": true,
|
| 1322 |
"shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
|
| 1323 |
"exists": true,
|
| 1324 |
-
"bytes":
|
| 1325 |
"hash_policy": "existence_and_size_only"
|
| 1326 |
},
|
| 1327 |
{
|
|
@@ -1333,7 +1366,7 @@
|
|
| 1333 |
"volatile": true,
|
| 1334 |
"shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
|
| 1335 |
"exists": true,
|
| 1336 |
-
"bytes":
|
| 1337 |
"hash_policy": "existence_and_size_only"
|
| 1338 |
},
|
| 1339 |
{
|
|
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Task Suite Artifact Index",
|
| 3 |
+
"generated_at_utc": "2026-06-20T20:48:09+00:00",
|
| 4 |
"status": "pass",
|
| 5 |
+
"artifact_count": 225,
|
| 6 |
"missing": [],
|
| 7 |
"by_kind": {
|
| 8 |
"project_path": 18,
|
|
|
|
| 10 |
"visual_asset_source": 3,
|
| 11 |
"scaleup_contract": 7,
|
| 12 |
"scaleup_status": 52,
|
| 13 |
+
"publication_workflow": 7,
|
| 14 |
"reproducibility": 4,
|
| 15 |
"project_scope": 1,
|
| 16 |
"source_alignment": 5,
|
| 17 |
+
"evaluation_protocol": 9,
|
| 18 |
+
"website_data": 11,
|
| 19 |
"generated_figure": 7,
|
| 20 |
"visualization_builder": 1,
|
| 21 |
"model_result": 5,
|
|
|
|
| 301 |
"surface": "repo_hf",
|
| 302 |
"shows": "Runs simple metadata and neural MLP baselines on the same selected 96/16/16 episode split used by the Qwen3-Omni diagnostic pilot.",
|
| 303 |
"exists": true,
|
| 304 |
+
"bytes": 74316,
|
| 305 |
+
"sha256": "164c908bee1d4a6e0db344692833787582e45317b240ef5afbfbdb609a5175e6"
|
| 306 |
},
|
| 307 |
{
|
| 308 |
"id": "task_suite_enhancement_128",
|
|
|
|
| 610 |
"shows": "Machine-readable source-alignment pass/fail check for repo, website, and HF surfaces.",
|
| 611 |
"exists": true,
|
| 612 |
"bytes": 4432,
|
| 613 |
+
"sha256": "c916b18a11917e46e8561520cf2307f190c671c82e710ebd0f3522ec8a4be2bd"
|
| 614 |
},
|
| 615 |
{
|
| 616 |
"id": "source_alignment_validator",
|
|
|
|
| 730 |
"surface": "website_hf",
|
| 731 |
"shows": "Stores normalized 20-axis radar values, raw task metrics, Qwen3/Cosmos overlay mappings, branch-card caveats, proxy flags, and source artifacts.",
|
| 732 |
"exists": true,
|
| 733 |
+
"bytes": 228799,
|
| 734 |
+
"sha256": "c9c708f64963dac10e764eaae8e1b14c7161a938afa5ef5723fe59dc4ce764af"
|
| 735 |
},
|
| 736 |
{
|
| 737 |
"id": "single_episode_task_model_radar_json",
|
|
|
|
| 742 |
"shows": "Machine-readable split radar for the one-episode Minimal and Neural MLP baselines, both scored on all 20 task contracts.",
|
| 743 |
"exists": true,
|
| 744 |
"bytes": 51097,
|
| 745 |
+
"sha256": "d5e882120633f4d3ae90f1491682701c7593a42fc09e39b83fc5f375258e76e7"
|
| 746 |
},
|
| 747 |
{
|
| 748 |
"id": "episode128_task_model_radar_json",
|
|
|
|
| 752 |
"surface": "website_hf",
|
| 753 |
"shows": "Machine-readable split radar for selected 128-episode metadata/raw baselines and verified Qwen3/Cosmos branches, now complete at 140/140 scored rows with proxy notes retained.",
|
| 754 |
"exists": true,
|
| 755 |
+
"bytes": 184945,
|
| 756 |
+
"sha256": "8d4ef9c4cf1cf334fd41417d40fa0687ceefa964da9f8338c82f8cc6d36a3e76"
|
| 757 |
},
|
| 758 |
{
|
| 759 |
"id": "task_method_20_result_matrix_json",
|
|
|
|
| 763 |
"surface": "website_hf",
|
| 764 |
"shows": "Machine-readable 9-method by 20-task matrix where every method has 20 records and the current release is complete at 180/180 scored rows.",
|
| 765 |
"exists": true,
|
| 766 |
+
"bytes": 128509,
|
| 767 |
+
"sha256": "382e538dff284c5e2cf19fe2b3eb014d1b48fb33082bb2ece532ce3de6c1e9bb"
|
| 768 |
},
|
| 769 |
{
|
| 770 |
"id": "task_method_20_result_matrix",
|
|
|
|
| 786 |
"shows": "Machine-readable 180-record completion ledger with numeric scores, proxy flags, explicit status reasons, and source artifacts.",
|
| 787 |
"exists": true,
|
| 788 |
"bytes": 8500,
|
| 789 |
+
"sha256": "9cfd2ce8c4eb3bbe7e2af3f41df3b3ab74db9db08d9ea2e4f569f612358470dd"
|
| 790 |
},
|
| 791 |
{
|
| 792 |
"id": "task_method_20_gap_audit",
|
|
|
|
| 797 |
"shows": "Reader-facing ledger confirming 180/180 scored method-task cells and listing the six compact-proxy records separately.",
|
| 798 |
"exists": true,
|
| 799 |
"bytes": 3417,
|
| 800 |
+
"sha256": "3afc5db9803b6419ce4f40d6fb0dd5380ae182fb85b4f7b0f6ea6a46ae065c63"
|
| 801 |
+
},
|
| 802 |
+
{
|
| 803 |
+
"id": "task_method_20_source_audit_json",
|
| 804 |
+
"title": "Task-method 20-result source audit JSON",
|
| 805 |
+
"path": "docs/data/task_method_20_source_audit.json",
|
| 806 |
+
"kind": "website_data",
|
| 807 |
+
"surface": "website_hf",
|
| 808 |
+
"shows": "Machine-readable check that scored JSON-backed matrix cells match their declared metric source values.",
|
| 809 |
+
"exists": true,
|
| 810 |
+
"bytes": 561,
|
| 811 |
+
"sha256": "c795c8f387648a90e66146efc44a4be2f272d4a44097f0b9b39a7347df83daa0"
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"id": "task_method_20_source_audit",
|
| 815 |
+
"title": "Task-method 20-result source audit",
|
| 816 |
+
"path": "TASK_METHOD_20_SOURCE_AUDIT.md",
|
| 817 |
+
"kind": "evaluation_protocol",
|
| 818 |
+
"surface": "repo_hf",
|
| 819 |
+
"shows": "Reader-facing source-value audit for the 180-result matrix.",
|
| 820 |
+
"exists": true,
|
| 821 |
+
"bytes": 447,
|
| 822 |
+
"sha256": "2b8bc99b7157894d59fa2f23ebaee33ce9e6e01c0b7316c7555ab0071c85eb41"
|
| 823 |
},
|
| 824 |
{
|
| 825 |
"id": "unified_task_model_radar_chart",
|
|
|
|
| 862 |
"surface": "repo_hf",
|
| 863 |
"shows": "Regenerates the direction-aware radar chart and machine-readable metric overlay JSON.",
|
| 864 |
"exists": true,
|
| 865 |
+
"bytes": 68542,
|
| 866 |
+
"sha256": "470b4c8acc437114b51d96987cd6324b9bf1d2ca16e9721d7fb00708aa58b383"
|
| 867 |
},
|
| 868 |
{
|
| 869 |
"id": "task_method_20_gap_audit_builder",
|
|
|
|
| 876 |
"bytes": 10295,
|
| 877 |
"sha256": "e2a3b41d3cca6efee7076b68c35693a4c53f5f2549e2eecbf035b98a717a3f65"
|
| 878 |
},
|
| 879 |
+
{
|
| 880 |
+
"id": "task_method_20_source_audit_validator",
|
| 881 |
+
"title": "Task-method source-audit validator",
|
| 882 |
+
"path": "scripts/validate_task_method_matrix_sources.py",
|
| 883 |
+
"kind": "publication_workflow",
|
| 884 |
+
"surface": "repo_hf",
|
| 885 |
+
"shows": "Fails release checks if a scored matrix row disagrees with its JSON metric source.",
|
| 886 |
+
"exists": true,
|
| 887 |
+
"bytes": 7877,
|
| 888 |
+
"sha256": "97edc3f064f77d544eff539bb7f16f8162e58ec581a63b91c473bada080f86ae"
|
| 889 |
+
},
|
| 890 |
{
|
| 891 |
"id": "all_task_model_scoring_waiter",
|
| 892 |
"title": "All-task model scoring guarded waiter",
|
|
|
|
| 906 |
"surface": "repo_hf",
|
| 907 |
"shows": "Checks whether Qwen3/Cosmos branches have train, validation, and test prediction files before extending model overlays to all 20 task contracts.",
|
| 908 |
"exists": true,
|
| 909 |
+
"bytes": 4320,
|
| 910 |
+
"sha256": "11cff26749bf6ad8b8ee028b18e0b4be5713ed8b5325578caa03be25d894263b"
|
| 911 |
},
|
| 912 |
{
|
| 913 |
"id": "model_output_probe_script",
|
|
|
|
| 917 |
"surface": "repo_hf",
|
| 918 |
"shows": "Audits model-output split availability and writes a readiness report without assigning new numeric task scores.",
|
| 919 |
"exists": true,
|
| 920 |
+
"bytes": 10520,
|
| 921 |
+
"sha256": "741ee733068e87c52c8da2bd15987e2b4538b5e705592182d76c42b5cf34fe96"
|
| 922 |
},
|
| 923 |
{
|
| 924 |
"id": "existing_model_output_task_probe",
|
|
|
|
| 1137 |
"surface": "repo_hf",
|
| 1138 |
"shows": "Lists the automated and post-publish checks used to keep the release current.",
|
| 1139 |
"exists": true,
|
| 1140 |
+
"bytes": 5184,
|
| 1141 |
+
"sha256": "4931d4457c4c5b0978fdf31861b6e3e2da6e24368398cf1756120a32cbff98f0"
|
| 1142 |
},
|
| 1143 |
{
|
| 1144 |
"id": "quality_gate_manifest",
|
|
|
|
| 1148 |
"surface": "website_hf",
|
| 1149 |
"shows": "Machine-readable release-check summary for validators, mirrors, and public project surfaces.",
|
| 1150 |
"exists": true,
|
| 1151 |
+
"bytes": 8640,
|
| 1152 |
+
"sha256": "445196830bb913bfa075ae4174e7b1f5b64f623cf13a2afde7513add9dbefc21"
|
| 1153 |
},
|
| 1154 |
{
|
| 1155 |
"id": "public_surface_qa",
|
|
|
|
| 1285 |
"surface": "repo",
|
| 1286 |
"shows": "Fetches the published GitHub/HF URLs and compares live hashes and public-card markers against the release assets.",
|
| 1287 |
"exists": true,
|
| 1288 |
+
"bytes": 67647,
|
| 1289 |
+
"sha256": "d2b4af98e6fd8b23fd86cd068f2bbf887e5d69686dd62fe3bfc7e8251a6d75d6"
|
| 1290 |
},
|
| 1291 |
{
|
| 1292 |
"id": "reproducibility_contract",
|
|
|
|
| 1318 |
"surface": "repo_hf",
|
| 1319 |
"shows": "Generates the selective artifact catalog from local files.",
|
| 1320 |
"exists": true,
|
| 1321 |
+
"bytes": 67105,
|
| 1322 |
+
"sha256": "8fc1a2b5d4a50d49ff5738ec1e5e91088dbfa514c9f0485d3afe708add6d94a1"
|
| 1323 |
},
|
| 1324 |
{
|
| 1325 |
"id": "publication_audit",
|
|
|
|
| 1330 |
"volatile": true,
|
| 1331 |
"shows": "Confirms public bundles exclude raw data, caches, heavy archives, and credential text.",
|
| 1332 |
"exists": true,
|
| 1333 |
+
"bytes": 10662,
|
| 1334 |
"hash_policy": "existence_and_size_only"
|
| 1335 |
},
|
| 1336 |
{
|
|
|
|
| 1354 |
"volatile": true,
|
| 1355 |
"shows": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
|
| 1356 |
"exists": true,
|
| 1357 |
+
"bytes": 1395239,
|
| 1358 |
"hash_policy": "existence_and_size_only"
|
| 1359 |
},
|
| 1360 |
{
|
|
|
|
| 1366 |
"volatile": true,
|
| 1367 |
"shows": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
|
| 1368 |
"exists": true,
|
| 1369 |
+
"bytes": 20141,
|
| 1370 |
"hash_policy": "existence_and_size_only"
|
| 1371 |
},
|
| 1372 |
{
|
data/episode128_task_model_radar.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "128-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 7,
|
|
@@ -1166,7 +1166,7 @@
|
|
| 1166 |
"cosmos3_super_reasoner": {
|
| 1167 |
"raw": 0.6286317274823326,
|
| 1168 |
"metric_key": "temporal_order_f1",
|
| 1169 |
-
"source": "results/omni_finetune/
|
| 1170 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1171 |
"status": "scored",
|
| 1172 |
"reason": null,
|
|
@@ -1257,7 +1257,7 @@
|
|
| 1257 |
"cosmos3_super_reasoner": {
|
| 1258 |
"raw": 0.37271645981034185,
|
| 1259 |
"metric_key": "misalignment_detection_f1",
|
| 1260 |
-
"source": "results/omni_finetune/
|
| 1261 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1262 |
"status": "scored",
|
| 1263 |
"reason": null,
|
|
@@ -1439,7 +1439,7 @@
|
|
| 1439 |
"cosmos3_super_reasoner": {
|
| 1440 |
"raw": 0.0,
|
| 1441 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 1442 |
-
"source": "results/omni_finetune/
|
| 1443 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1444 |
"status": "scored",
|
| 1445 |
"reason": null,
|
|
@@ -1519,7 +1519,7 @@
|
|
| 1519 |
"qwen3_omni_v6_lora": {
|
| 1520 |
"raw": 0.4318674027510605,
|
| 1521 |
"metric_key": "macro_f1",
|
| 1522 |
-
"source": "results/omni_finetune/
|
| 1523 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1524 |
"status": "scored",
|
| 1525 |
"reason": null,
|
|
@@ -1712,7 +1712,7 @@
|
|
| 1712 |
"cosmos3_super_reasoner": {
|
| 1713 |
"raw": 0.0009279881217520415,
|
| 1714 |
"metric_key": "object_set_forecast_micro_f1",
|
| 1715 |
-
"source": "results/omni_finetune/
|
| 1716 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1717 |
"status": "scored",
|
| 1718 |
"reason": null,
|
|
@@ -3372,7 +3372,7 @@
|
|
| 3372 |
"raw_text": "0.6286",
|
| 3373 |
"normalized_score": 0.6286317274823326,
|
| 3374 |
"metric_key": "temporal_order_f1",
|
| 3375 |
-
"source": "results/omni_finetune/
|
| 3376 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3377 |
"reason": null
|
| 3378 |
},
|
|
@@ -3498,7 +3498,7 @@
|
|
| 3498 |
"raw_text": "0.3727",
|
| 3499 |
"normalized_score": 0.37271645981034185,
|
| 3500 |
"metric_key": "misalignment_detection_f1",
|
| 3501 |
-
"source": "results/omni_finetune/
|
| 3502 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3503 |
"reason": null
|
| 3504 |
},
|
|
@@ -3750,7 +3750,7 @@
|
|
| 3750 |
"raw_text": "0.0000",
|
| 3751 |
"normalized_score": 0.0,
|
| 3752 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 3753 |
-
"source": "results/omni_finetune/
|
| 3754 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3755 |
"reason": null
|
| 3756 |
},
|
|
@@ -3858,7 +3858,7 @@
|
|
| 3858 |
"raw_text": "0.4319",
|
| 3859 |
"normalized_score": 0.4318674027510605,
|
| 3860 |
"metric_key": "macro_f1",
|
| 3861 |
-
"source": "results/omni_finetune/
|
| 3862 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3863 |
"reason": null
|
| 3864 |
},
|
|
@@ -4128,7 +4128,7 @@
|
|
| 4128 |
"raw_text": "0.0009",
|
| 4129 |
"normalized_score": 0.0009279881217520415,
|
| 4130 |
"metric_key": "object_set_forecast_micro_f1",
|
| 4131 |
-
"source": "results/omni_finetune/
|
| 4132 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 4133 |
"reason": null
|
| 4134 |
},
|
|
|
|
| 1 |
{
|
| 2 |
"title": "128-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-20T20:38:21+00:00",
|
| 5 |
"description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 7,
|
|
|
|
| 1166 |
"cosmos3_super_reasoner": {
|
| 1167 |
"raw": 0.6286317274823326,
|
| 1168 |
"metric_key": "temporal_order_f1",
|
| 1169 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
|
| 1170 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1171 |
"status": "scored",
|
| 1172 |
"reason": null,
|
|
|
|
| 1257 |
"cosmos3_super_reasoner": {
|
| 1258 |
"raw": 0.37271645981034185,
|
| 1259 |
"metric_key": "misalignment_detection_f1",
|
| 1260 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
|
| 1261 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1262 |
"status": "scored",
|
| 1263 |
"reason": null,
|
|
|
|
| 1439 |
"cosmos3_super_reasoner": {
|
| 1440 |
"raw": 0.0,
|
| 1441 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 1442 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
|
| 1443 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1444 |
"status": "scored",
|
| 1445 |
"reason": null,
|
|
|
|
| 1519 |
"qwen3_omni_v6_lora": {
|
| 1520 |
"raw": 0.4318674027510605,
|
| 1521 |
"metric_key": "macro_f1",
|
| 1522 |
+
"source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
|
| 1523 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1524 |
"status": "scored",
|
| 1525 |
"reason": null,
|
|
|
|
| 1712 |
"cosmos3_super_reasoner": {
|
| 1713 |
"raw": 0.0009279881217520415,
|
| 1714 |
"metric_key": "object_set_forecast_micro_f1",
|
| 1715 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
|
| 1716 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1717 |
"status": "scored",
|
| 1718 |
"reason": null,
|
|
|
|
| 3372 |
"raw_text": "0.6286",
|
| 3373 |
"normalized_score": 0.6286317274823326,
|
| 3374 |
"metric_key": "temporal_order_f1",
|
| 3375 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
|
| 3376 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3377 |
"reason": null
|
| 3378 |
},
|
|
|
|
| 3498 |
"raw_text": "0.3727",
|
| 3499 |
"normalized_score": 0.37271645981034185,
|
| 3500 |
"metric_key": "misalignment_detection_f1",
|
| 3501 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
|
| 3502 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3503 |
"reason": null
|
| 3504 |
},
|
|
|
|
| 3750 |
"raw_text": "0.0000",
|
| 3751 |
"normalized_score": 0.0,
|
| 3752 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 3753 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
|
| 3754 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3755 |
"reason": null
|
| 3756 |
},
|
|
|
|
| 3858 |
"raw_text": "0.4319",
|
| 3859 |
"normalized_score": 0.4318674027510605,
|
| 3860 |
"metric_key": "macro_f1",
|
| 3861 |
+
"source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
|
| 3862 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 3863 |
"reason": null
|
| 3864 |
},
|
|
|
|
| 4128 |
"raw_text": "0.0009",
|
| 4129 |
"normalized_score": 0.0009279881217520415,
|
| 4130 |
"metric_key": "object_set_forecast_micro_f1",
|
| 4131 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
|
| 4132 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 4133 |
"reason": null
|
| 4134 |
},
|
data/public_surface_qa.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Public Project Surface",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
|
| 6 |
"checks": [
|
| 7 |
{
|
|
@@ -18,7 +18,7 @@
|
|
| 18 |
"website_integrity": {
|
| 19 |
"exists": true,
|
| 20 |
"status": "pass",
|
| 21 |
-
"generated_at_utc": "2026-06-
|
| 22 |
},
|
| 23 |
"rendered_site_check": {
|
| 24 |
"exists": true,
|
|
@@ -28,27 +28,27 @@
|
|
| 28 |
"task_surface_integrity": {
|
| 29 |
"exists": true,
|
| 30 |
"status": "pass",
|
| 31 |
-
"generated_at_utc": "2026-06-
|
| 32 |
},
|
| 33 |
"source_alignment": {
|
| 34 |
"exists": true,
|
| 35 |
"status": "pass",
|
| 36 |
-
"generated_at_utc": "2026-06-
|
| 37 |
},
|
| 38 |
"scale_up_status": {
|
| 39 |
"exists": true,
|
| 40 |
"status": "pass",
|
| 41 |
-
"generated_at_utc": "2026-06-
|
| 42 |
},
|
| 43 |
"publication_package": {
|
| 44 |
"exists": true,
|
| 45 |
"status": "pass",
|
| 46 |
-
"generated_at_utc": "2026-06-
|
| 47 |
},
|
| 48 |
"mirror_parity": {
|
| 49 |
"exists": true,
|
| 50 |
"status": "pass",
|
| 51 |
-
"generated_at_utc": "2026-06-
|
| 52 |
}
|
| 53 |
},
|
| 54 |
"failures": {}
|
|
@@ -111,7 +111,7 @@
|
|
| 111 |
"https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
|
| 112 |
"https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
|
| 113 |
"https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
|
| 114 |
-
"https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results":
|
| 115 |
"https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
|
| 116 |
"https://ropedia.com/dataset": 5
|
| 117 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Public Project Surface",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-20T20:48:08+00:00",
|
| 5 |
"scope": "Repo README, GitHub Pages HTML, Hugging Face Space card, artifact dataset card, and model card.",
|
| 6 |
"checks": [
|
| 7 |
{
|
|
|
|
| 18 |
"website_integrity": {
|
| 19 |
"exists": true,
|
| 20 |
"status": "pass",
|
| 21 |
+
"generated_at_utc": "2026-06-20T20:41:45+00:00"
|
| 22 |
},
|
| 23 |
"rendered_site_check": {
|
| 24 |
"exists": true,
|
|
|
|
| 28 |
"task_surface_integrity": {
|
| 29 |
"exists": true,
|
| 30 |
"status": "pass",
|
| 31 |
+
"generated_at_utc": "2026-06-20T19:55:17+00:00"
|
| 32 |
},
|
| 33 |
"source_alignment": {
|
| 34 |
"exists": true,
|
| 35 |
"status": "pass",
|
| 36 |
+
"generated_at_utc": "2026-06-20T19:55:18+00:00"
|
| 37 |
},
|
| 38 |
"scale_up_status": {
|
| 39 |
"exists": true,
|
| 40 |
"status": "pass",
|
| 41 |
+
"generated_at_utc": "2026-06-20T19:55:26+00:00"
|
| 42 |
},
|
| 43 |
"publication_package": {
|
| 44 |
"exists": true,
|
| 45 |
"status": "pass",
|
| 46 |
+
"generated_at_utc": "2026-06-20T20:42:41+00:00"
|
| 47 |
},
|
| 48 |
"mirror_parity": {
|
| 49 |
"exists": true,
|
| 50 |
"status": "pass",
|
| 51 |
+
"generated_at_utc": "2026-06-20T20:47:51+00:00"
|
| 52 |
}
|
| 53 |
},
|
| 54 |
"failures": {}
|
|
|
|
| 111 |
"https://huggingface.co/spaces/cy0307/ropedia-xperience-10m-task-suite": 11,
|
| 112 |
"https://huggingface.co/datasets/cy0307/ropedia-xperience-10m-task-suite-artifacts": 11,
|
| 113 |
"https://huggingface.co/cy0307/ropedia-xperience-10m-task-baselines": 14,
|
| 114 |
+
"https://huggingface.co/cy0307/ropedia-xperience-10m-weights-results": 6,
|
| 115 |
"https://huggingface.co/datasets/ropedia-ai/xperience-10m": 38,
|
| 116 |
"https://ropedia.com/dataset": 5
|
| 117 |
}
|
data/quality_gates.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Release Checks",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
|
| 6 |
"automated_gates": [
|
| 7 |
{
|
|
@@ -76,6 +76,18 @@
|
|
| 76 |
"status": "pass"
|
| 77 |
}
|
| 78 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
{
|
| 80 |
"id": "figure_index",
|
| 81 |
"title": "Figure index",
|
|
|
|
| 1 |
{
|
| 2 |
"title": "Ropedia Xperience-10M Release Checks",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-20T20:48:18+00:00",
|
| 5 |
"rule": "A release is current when the automated reports pass and the live GitHub/Hugging Face mirrors are verified after publishing.",
|
| 6 |
"automated_gates": [
|
| 7 |
{
|
|
|
|
| 76 |
"status": "pass"
|
| 77 |
}
|
| 78 |
},
|
| 79 |
+
{
|
| 80 |
+
"id": "task_method_source_audit",
|
| 81 |
+
"title": "Task-method source audit",
|
| 82 |
+
"command": "python scripts/validate_task_method_matrix_sources.py",
|
| 83 |
+
"report": "docs/data/task_method_20_source_audit.json",
|
| 84 |
+
"blocks_if": "A scored 20-task matrix cell points to a JSON metric source that does not contain the same metric value.",
|
| 85 |
+
"shows": "Public 20-task scores remain traceable to their task-specific metric artifacts.",
|
| 86 |
+
"current_report": {
|
| 87 |
+
"exists": true,
|
| 88 |
+
"status": "pass"
|
| 89 |
+
}
|
| 90 |
+
},
|
| 91 |
{
|
| 92 |
"id": "figure_index",
|
| 93 |
"title": "Figure index",
|
data/single_episode_task_model_radar.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "Single-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 2,
|
|
|
|
| 1 |
{
|
| 2 |
"title": "Single-Episode 20-Task Radar",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-20T20:38:21+00:00",
|
| 5 |
"description": "Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
|
| 6 |
"task_count": 20,
|
| 7 |
"method_count": 2,
|
data/task_method_20_gap_audit.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"generated_at_utc": "2026-06-
|
| 3 |
"immediate_actions": [
|
| 4 |
{
|
| 5 |
"artifact": "docs/data/task_method_20_gap_audit.json",
|
|
|
|
| 1 |
{
|
| 2 |
+
"generated_at_utc": "2026-06-20T20:38:59+00:00",
|
| 3 |
"immediate_actions": [
|
| 4 |
{
|
| 5 |
"artifact": "docs/data/task_method_20_gap_audit.json",
|
data/task_method_20_result_matrix.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"title": "Task Method 20-Result Matrix",
|
| 3 |
"status": "pass",
|
| 4 |
-
"generated_at_utc": "2026-06-
|
| 5 |
"task_count": 20,
|
| 6 |
"method_count": 9,
|
| 7 |
"method_task_record_count": 180,
|
|
@@ -1980,7 +1980,7 @@
|
|
| 1980 |
"raw_text": "0.6286",
|
| 1981 |
"normalized_score": 0.6286317274823326,
|
| 1982 |
"metric_key": "temporal_order_f1",
|
| 1983 |
-
"source": "results/omni_finetune/
|
| 1984 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1985 |
"reason": null
|
| 1986 |
},
|
|
@@ -2142,7 +2142,7 @@
|
|
| 2142 |
"raw_text": "0.3727",
|
| 2143 |
"normalized_score": 0.37271645981034185,
|
| 2144 |
"metric_key": "misalignment_detection_f1",
|
| 2145 |
-
"source": "results/omni_finetune/
|
| 2146 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2147 |
"reason": null
|
| 2148 |
},
|
|
@@ -2466,7 +2466,7 @@
|
|
| 2466 |
"raw_text": "0.0000",
|
| 2467 |
"normalized_score": 0.0,
|
| 2468 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 2469 |
-
"source": "results/omni_finetune/
|
| 2470 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2471 |
"reason": null
|
| 2472 |
},
|
|
@@ -2610,7 +2610,7 @@
|
|
| 2610 |
"raw_text": "0.4319",
|
| 2611 |
"normalized_score": 0.4318674027510605,
|
| 2612 |
"metric_key": "macro_f1",
|
| 2613 |
-
"source": "results/omni_finetune/
|
| 2614 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2615 |
"reason": null
|
| 2616 |
},
|
|
@@ -2952,7 +2952,7 @@
|
|
| 2952 |
"raw_text": "0.0009",
|
| 2953 |
"normalized_score": 0.0009279881217520415,
|
| 2954 |
"metric_key": "object_set_forecast_micro_f1",
|
| 2955 |
-
"source": "results/omni_finetune/
|
| 2956 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2957 |
"reason": null
|
| 2958 |
},
|
|
|
|
| 1 |
{
|
| 2 |
"title": "Task Method 20-Result Matrix",
|
| 3 |
"status": "pass",
|
| 4 |
+
"generated_at_utc": "2026-06-20T20:38:21+00:00",
|
| 5 |
"task_count": 20,
|
| 6 |
"method_count": 9,
|
| 7 |
"method_task_record_count": 180,
|
|
|
|
| 1980 |
"raw_text": "0.6286",
|
| 1981 |
"normalized_score": 0.6286317274823326,
|
| 1982 |
"metric_key": "temporal_order_f1",
|
| 1983 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/temporal_order/metrics.json",
|
| 1984 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 1985 |
"reason": null
|
| 1986 |
},
|
|
|
|
| 2142 |
"raw_text": "0.3727",
|
| 2143 |
"normalized_score": 0.37271645981034185,
|
| 2144 |
"metric_key": "misalignment_detection_f1",
|
| 2145 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/misalignment_detection/metrics.json",
|
| 2146 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2147 |
"reason": null
|
| 2148 |
},
|
|
|
|
| 2466 |
"raw_text": "0.0000",
|
| 2467 |
"normalized_score": 0.0,
|
| 2468 |
"metric_key": "next_subtask_forecast_macro_f1",
|
| 2469 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/next_subtask_forecast/metrics.json",
|
| 2470 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2471 |
"reason": null
|
| 2472 |
},
|
|
|
|
| 2610 |
"raw_text": "0.4319",
|
| 2611 |
"normalized_score": 0.4318674027510605,
|
| 2612 |
"metric_key": "macro_f1",
|
| 2613 |
+
"source": "results/omni_finetune/xperience10m_qwen3_omni_v6_interaction_text_task15_a100_20260620T010305Z/interaction_text_prediction/metrics.json",
|
| 2614 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2615 |
"reason": null
|
| 2616 |
},
|
|
|
|
| 2952 |
"raw_text": "0.0009",
|
| 2953 |
"normalized_score": 0.0009279881217520415,
|
| 2954 |
"metric_key": "object_set_forecast_micro_f1",
|
| 2955 |
+
"source": "results/omni_finetune/xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620/object_set_forecast/metrics.json",
|
| 2956 |
"scope": "multi_episode_128_partial_model_overlay",
|
| 2957 |
"reason": null
|
| 2958 |
},
|
data/task_method_20_source_audit.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"checked_json_metric_count": 180,
|
| 3 |
+
"failure_count": 0,
|
| 4 |
+
"failures": [],
|
| 5 |
+
"generated_at_utc": "2026-06-20T20:48:41+00:00",
|
| 6 |
+
"method_task_record_count": 180,
|
| 7 |
+
"rule": "Every scored row that declares a JSON metric source must have the same numeric value under that row's metric_key.",
|
| 8 |
+
"scored_method_task_count": 180,
|
| 9 |
+
"skipped_record_count": 0,
|
| 10 |
+
"skipped_records": [],
|
| 11 |
+
"source_matrix": "docs/data/task_method_20_result_matrix.json",
|
| 12 |
+
"status": "pass",
|
| 13 |
+
"status_counts": {
|
| 14 |
+
"checked": 180
|
| 15 |
+
},
|
| 16 |
+
"title": "Task Method 20 Matrix Source Audit"
|
| 17 |
+
}
|
data/website_integrity.json
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
{
|
| 2 |
"status": "pass",
|
| 3 |
-
"generated_at_utc": "2026-06-
|
| 4 |
"docs_root": "docs",
|
| 5 |
"site_base": "/ropedia-xperience-10m-task-suite/",
|
| 6 |
"summary": {
|
| 7 |
"html_pages": 4,
|
| 8 |
"local_references": 213,
|
| 9 |
"external_reference_count": 152,
|
| 10 |
-
"json_files":
|
| 11 |
"image_assets_referenced": 28,
|
| 12 |
"failure_count": 0
|
| 13 |
},
|
|
@@ -301,7 +301,7 @@
|
|
| 301 |
},
|
| 302 |
{
|
| 303 |
"path": "data/artifact_index.json",
|
| 304 |
-
"bytes":
|
| 305 |
"top_level_type": "dict"
|
| 306 |
},
|
| 307 |
{
|
|
@@ -316,7 +316,7 @@
|
|
| 316 |
},
|
| 317 |
{
|
| 318 |
"path": "data/episode128_task_model_radar.json",
|
| 319 |
-
"bytes":
|
| 320 |
"top_level_type": "dict"
|
| 321 |
},
|
| 322 |
{
|
|
@@ -491,7 +491,12 @@
|
|
| 491 |
},
|
| 492 |
{
|
| 493 |
"path": "data/task_method_20_result_matrix.json",
|
| 494 |
-
"bytes":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
"top_level_type": "dict"
|
| 496 |
},
|
| 497 |
{
|
|
@@ -526,7 +531,7 @@
|
|
| 526 |
},
|
| 527 |
{
|
| 528 |
"path": "data/unified_task_model_radar.json",
|
| 529 |
-
"bytes":
|
| 530 |
"top_level_type": "dict"
|
| 531 |
},
|
| 532 |
{
|
|
|
|
| 1 |
{
|
| 2 |
"status": "pass",
|
| 3 |
+
"generated_at_utc": "2026-06-20T20:41:45+00:00",
|
| 4 |
"docs_root": "docs",
|
| 5 |
"site_base": "/ropedia-xperience-10m-task-suite/",
|
| 6 |
"summary": {
|
| 7 |
"html_pages": 4,
|
| 8 |
"local_references": 213,
|
| 9 |
"external_reference_count": 152,
|
| 10 |
+
"json_files": 51,
|
| 11 |
"image_assets_referenced": 28,
|
| 12 |
"failure_count": 0
|
| 13 |
},
|
|
|
|
| 301 |
},
|
| 302 |
{
|
| 303 |
"path": "data/artifact_index.json",
|
| 304 |
+
"bytes": 122823,
|
| 305 |
"top_level_type": "dict"
|
| 306 |
},
|
| 307 |
{
|
|
|
|
| 316 |
},
|
| 317 |
{
|
| 318 |
"path": "data/episode128_task_model_radar.json",
|
| 319 |
+
"bytes": 184945,
|
| 320 |
"top_level_type": "dict"
|
| 321 |
},
|
| 322 |
{
|
|
|
|
| 491 |
},
|
| 492 |
{
|
| 493 |
"path": "data/task_method_20_result_matrix.json",
|
| 494 |
+
"bytes": 128509,
|
| 495 |
+
"top_level_type": "dict"
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"path": "data/task_method_20_source_audit.json",
|
| 499 |
+
"bytes": 561,
|
| 500 |
"top_level_type": "dict"
|
| 501 |
},
|
| 502 |
{
|
|
|
|
| 531 |
},
|
| 532 |
{
|
| 533 |
"path": "data/unified_task_model_radar.json",
|
| 534 |
+
"bytes": 228799,
|
| 535 |
"top_level_type": "dict"
|
| 536 |
},
|
| 537 |
{
|
scripts/omni/eval_cosmos3_super_future_task_probes.py
CHANGED
|
@@ -31,6 +31,7 @@ from eval_qwen3_omni_future_task_probes import (
|
|
| 31 |
score_task as qwen_score_task,
|
| 32 |
select_eval_indices,
|
| 33 |
select_tasks,
|
|
|
|
| 34 |
task_target_value,
|
| 35 |
time_to_transition_map,
|
| 36 |
write_json,
|
|
@@ -212,9 +213,14 @@ def main() -> int:
|
|
| 212 |
samples = load_jsonl(args.dataset_jsonl)
|
| 213 |
future_map = future_index_map(samples, args.future_frames)
|
| 214 |
transition_targets = time_to_transition_map(samples)
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
write_json(args.output_dir / "server_info.json", server_info(args))
|
| 220 |
append_jsonl(
|
|
@@ -224,7 +230,9 @@ def main() -> int:
|
|
| 224 |
"timestamp": time.time(),
|
| 225 |
"run_id": args.run_id,
|
| 226 |
"tasks": selected_tasks,
|
| 227 |
-
"
|
|
|
|
|
|
|
| 228 |
"sample_offset": args.sample_offset,
|
| 229 |
"sample_stride": args.sample_stride,
|
| 230 |
"future_frames": args.future_frames,
|
|
@@ -246,9 +254,10 @@ def main() -> int:
|
|
| 246 |
for task_id in selected_tasks:
|
| 247 |
spec = TASK_SPECS[task_id]
|
| 248 |
partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
|
| 249 |
-
|
|
|
|
| 250 |
sample = samples[sample_idx]
|
| 251 |
-
future_sample = samples[future_map[sample_idx]]
|
| 252 |
pred_id = prediction_id(task_id, sample)
|
| 253 |
if args.resume and pred_id in partial_by_task[task_id]:
|
| 254 |
continue
|
|
@@ -301,7 +310,7 @@ def main() -> int:
|
|
| 301 |
"timestamp": time.time(),
|
| 302 |
"task_id": task_id,
|
| 303 |
"sample_index": local_pos,
|
| 304 |
-
"num_eval_samples": len(
|
| 305 |
"completed_samples_for_task": len(partial_by_task[task_id]),
|
| 306 |
"sample_id": sample.get("id"),
|
| 307 |
"seconds": round(time.time() - started, 3),
|
|
@@ -310,7 +319,7 @@ def main() -> int:
|
|
| 310 |
|
| 311 |
task_metrics = {}
|
| 312 |
for task_id in selected_tasks:
|
| 313 |
-
rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in
|
| 314 |
task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
|
| 315 |
|
| 316 |
display_name = model_display_name(args)
|
|
|
|
| 31 |
score_task as qwen_score_task,
|
| 32 |
select_eval_indices,
|
| 33 |
select_tasks,
|
| 34 |
+
task_requires_future_sample,
|
| 35 |
task_target_value,
|
| 36 |
time_to_transition_map,
|
| 37 |
write_json,
|
|
|
|
| 213 |
samples = load_jsonl(args.dataset_jsonl)
|
| 214 |
future_map = future_index_map(samples, args.future_frames)
|
| 215 |
transition_targets = time_to_transition_map(samples)
|
| 216 |
+
base_eval_indices = select_eval_indices(samples, args)
|
| 217 |
+
eval_indices_by_task = {
|
| 218 |
+
task_id: [idx for idx in base_eval_indices if (not task_requires_future_sample(task_id) or idx in future_map)]
|
| 219 |
+
for task_id in selected_tasks
|
| 220 |
+
}
|
| 221 |
+
empty_tasks = [task_id for task_id, indices in eval_indices_by_task.items() if not indices]
|
| 222 |
+
if empty_tasks:
|
| 223 |
+
raise ValueError(f"No evaluation samples selected for tasks: {', '.join(empty_tasks)}")
|
| 224 |
|
| 225 |
write_json(args.output_dir / "server_info.json", server_info(args))
|
| 226 |
append_jsonl(
|
|
|
|
| 230 |
"timestamp": time.time(),
|
| 231 |
"run_id": args.run_id,
|
| 232 |
"tasks": selected_tasks,
|
| 233 |
+
"num_base_eval_samples": len(base_eval_indices),
|
| 234 |
+
"num_eval_samples_by_task": {task_id: len(indices) for task_id, indices in eval_indices_by_task.items()},
|
| 235 |
+
"num_eval_samples_with_future": sum(1 for idx in base_eval_indices if idx in future_map),
|
| 236 |
"sample_offset": args.sample_offset,
|
| 237 |
"sample_stride": args.sample_stride,
|
| 238 |
"future_frames": args.future_frames,
|
|
|
|
| 254 |
for task_id in selected_tasks:
|
| 255 |
spec = TASK_SPECS[task_id]
|
| 256 |
partial_path = args.output_dir / task_id / "predictions.partial.jsonl"
|
| 257 |
+
task_eval_indices = eval_indices_by_task[task_id]
|
| 258 |
+
for local_pos, sample_idx in enumerate(task_eval_indices, start=1):
|
| 259 |
sample = samples[sample_idx]
|
| 260 |
+
future_sample = samples[future_map[sample_idx]] if task_requires_future_sample(task_id) else sample
|
| 261 |
pred_id = prediction_id(task_id, sample)
|
| 262 |
if args.resume and pred_id in partial_by_task[task_id]:
|
| 263 |
continue
|
|
|
|
| 310 |
"timestamp": time.time(),
|
| 311 |
"task_id": task_id,
|
| 312 |
"sample_index": local_pos,
|
| 313 |
+
"num_eval_samples": len(task_eval_indices),
|
| 314 |
"completed_samples_for_task": len(partial_by_task[task_id]),
|
| 315 |
"sample_id": sample.get("id"),
|
| 316 |
"seconds": round(time.time() - started, 3),
|
|
|
|
| 319 |
|
| 320 |
task_metrics = {}
|
| 321 |
for task_id in selected_tasks:
|
| 322 |
+
rows = [partial_by_task[task_id][prediction_id(task_id, samples[idx])] for idx in eval_indices_by_task[task_id]]
|
| 323 |
task_metrics[task_id] = score_task(task_id, TASK_SPECS[task_id], rows, args.output_dir, args)
|
| 324 |
|
| 325 |
display_name = model_display_name(args)
|
scripts/omni/merge_cosmos3_super_future_task_probe_shards.py
CHANGED
|
@@ -56,13 +56,27 @@ def main() -> int:
|
|
| 56 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 57 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 58 |
first_metrics: dict[str, Any] | None = None
|
|
|
|
| 59 |
|
| 60 |
for task_id, spec in TASK_SPECS.items():
|
| 61 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
|
|
|
| 62 |
for shard_dir in args.shard_dir:
|
| 63 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 64 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 67 |
if shard_metrics and first_metrics is None:
|
| 68 |
first_metrics = shard_metrics
|
|
@@ -90,6 +104,9 @@ def main() -> int:
|
|
| 90 |
"status": "pass",
|
| 91 |
"run_id": args.run_id,
|
| 92 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
|
|
|
|
|
|
|
|
|
| 93 |
"tasks": {
|
| 94 |
task_id: {
|
| 95 |
"task_number": metrics["task_number"],
|
|
|
|
| 56 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 57 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 58 |
first_metrics: dict[str, Any] | None = None
|
| 59 |
+
duplicate_predictions: list[dict[str, Any]] = []
|
| 60 |
|
| 61 |
for task_id, spec in TASK_SPECS.items():
|
| 62 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
| 63 |
+
row_sources: dict[str, str] = {}
|
| 64 |
for shard_dir in args.shard_dir:
|
| 65 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 66 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 67 |
+
if key in rows_by_id:
|
| 68 |
+
duplicate_predictions.append(
|
| 69 |
+
{
|
| 70 |
+
"task_id": task_id,
|
| 71 |
+
"prediction_id": key,
|
| 72 |
+
"kept_shard": row_sources.get(key),
|
| 73 |
+
"duplicate_shard": str(shard_dir),
|
| 74 |
+
"conflict": rows_by_id[key] != row,
|
| 75 |
+
}
|
| 76 |
+
)
|
| 77 |
+
continue
|
| 78 |
+
rows_by_id[key] = row
|
| 79 |
+
row_sources[key] = str(shard_dir)
|
| 80 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 81 |
if shard_metrics and first_metrics is None:
|
| 82 |
first_metrics = shard_metrics
|
|
|
|
| 104 |
"status": "pass",
|
| 105 |
"run_id": args.run_id,
|
| 106 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
| 107 |
+
"duplicate_prediction_count": len(duplicate_predictions),
|
| 108 |
+
"duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
|
| 109 |
+
"duplicate_predictions": duplicate_predictions[:50],
|
| 110 |
"tasks": {
|
| 111 |
task_id: {
|
| 112 |
"task_number": metrics["task_number"],
|
scripts/omni/merge_qwen3_omni_future_task_probe_shards.py
CHANGED
|
@@ -54,13 +54,27 @@ def main() -> int:
|
|
| 54 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 55 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 56 |
first_metrics: dict[str, Any] | None = None
|
|
|
|
| 57 |
|
| 58 |
for task_id, spec in TASK_SPECS.items():
|
| 59 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
|
|
|
| 60 |
for shard_dir in args.shard_dir:
|
| 61 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 62 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 65 |
if shard_metrics and first_metrics is None:
|
| 66 |
first_metrics = shard_metrics
|
|
@@ -82,6 +96,9 @@ def main() -> int:
|
|
| 82 |
"status": "pass",
|
| 83 |
"run_id": args.run_id,
|
| 84 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
|
|
|
|
|
|
|
|
|
| 85 |
"tasks": {
|
| 86 |
task_id: {
|
| 87 |
"task_number": metrics["task_number"],
|
|
|
|
| 54 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 55 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 56 |
first_metrics: dict[str, Any] | None = None
|
| 57 |
+
duplicate_predictions: list[dict[str, Any]] = []
|
| 58 |
|
| 59 |
for task_id, spec in TASK_SPECS.items():
|
| 60 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
| 61 |
+
row_sources: dict[str, str] = {}
|
| 62 |
for shard_dir in args.shard_dir:
|
| 63 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 64 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 65 |
+
if key in rows_by_id:
|
| 66 |
+
duplicate_predictions.append(
|
| 67 |
+
{
|
| 68 |
+
"task_id": task_id,
|
| 69 |
+
"prediction_id": key,
|
| 70 |
+
"kept_shard": row_sources.get(key),
|
| 71 |
+
"duplicate_shard": str(shard_dir),
|
| 72 |
+
"conflict": rows_by_id[key] != row,
|
| 73 |
+
}
|
| 74 |
+
)
|
| 75 |
+
continue
|
| 76 |
+
rows_by_id[key] = row
|
| 77 |
+
row_sources[key] = str(shard_dir)
|
| 78 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 79 |
if shard_metrics and first_metrics is None:
|
| 80 |
first_metrics = shard_metrics
|
|
|
|
| 96 |
"status": "pass",
|
| 97 |
"run_id": args.run_id,
|
| 98 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
| 99 |
+
"duplicate_prediction_count": len(duplicate_predictions),
|
| 100 |
+
"duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
|
| 101 |
+
"duplicate_predictions": duplicate_predictions[:50],
|
| 102 |
"tasks": {
|
| 103 |
task_id: {
|
| 104 |
"task_number": metrics["task_number"],
|
scripts/omni/merge_qwen3_omni_retrieval_task_probe_shards.py
CHANGED
|
@@ -55,13 +55,27 @@ def main() -> int:
|
|
| 55 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 56 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 57 |
first_metrics: dict[str, Any] | None = None
|
|
|
|
| 58 |
|
| 59 |
for task_id, spec in TASK_SPECS.items():
|
| 60 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
|
|
|
| 61 |
for shard_dir in args.shard_dir:
|
| 62 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 63 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 66 |
if shard_metrics and first_metrics is None:
|
| 67 |
first_metrics = shard_metrics
|
|
@@ -86,6 +100,9 @@ def main() -> int:
|
|
| 86 |
"status": "pass",
|
| 87 |
"run_id": args.run_id,
|
| 88 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
|
|
|
|
|
|
|
|
|
| 89 |
"tasks": {
|
| 90 |
task_id: {
|
| 91 |
"task_number": metrics["task_number"],
|
|
|
|
| 55 |
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 56 |
task_metrics: dict[str, dict[str, Any]] = {}
|
| 57 |
first_metrics: dict[str, Any] | None = None
|
| 58 |
+
duplicate_predictions: list[dict[str, Any]] = []
|
| 59 |
|
| 60 |
for task_id, spec in TASK_SPECS.items():
|
| 61 |
rows_by_id: dict[str, dict[str, Any]] = {}
|
| 62 |
+
row_sources: dict[str, str] = {}
|
| 63 |
for shard_dir in args.shard_dir:
|
| 64 |
for row in read_jsonl(shard_dir / task_id / "predictions.jsonl"):
|
| 65 |
key = str(row.get("prediction_id") or f"{task_id}::{row.get('id')}")
|
| 66 |
+
if key in rows_by_id:
|
| 67 |
+
duplicate_predictions.append(
|
| 68 |
+
{
|
| 69 |
+
"task_id": task_id,
|
| 70 |
+
"prediction_id": key,
|
| 71 |
+
"kept_shard": row_sources.get(key),
|
| 72 |
+
"duplicate_shard": str(shard_dir),
|
| 73 |
+
"conflict": rows_by_id[key] != row,
|
| 74 |
+
}
|
| 75 |
+
)
|
| 76 |
+
continue
|
| 77 |
+
rows_by_id[key] = row
|
| 78 |
+
row_sources[key] = str(shard_dir)
|
| 79 |
shard_metrics = read_json(shard_dir / task_id / "metrics.json")
|
| 80 |
if shard_metrics and first_metrics is None:
|
| 81 |
first_metrics = shard_metrics
|
|
|
|
| 100 |
"status": "pass",
|
| 101 |
"run_id": args.run_id,
|
| 102 |
"shard_dirs": [str(path) for path in args.shard_dir],
|
| 103 |
+
"duplicate_prediction_count": len(duplicate_predictions),
|
| 104 |
+
"duplicate_prediction_conflict_count": sum(1 for row in duplicate_predictions if row["conflict"]),
|
| 105 |
+
"duplicate_predictions": duplicate_predictions[:50],
|
| 106 |
"tasks": {
|
| 107 |
task_id: {
|
| 108 |
"task_number": metrics["task_number"],
|
scripts/omni/run_128_task_baselines.py
CHANGED
|
@@ -133,7 +133,7 @@ def parse_args() -> argparse.Namespace:
|
|
| 133 |
default=256,
|
| 134 |
help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
|
| 135 |
)
|
| 136 |
-
parser.add_argument("--include-neural", action=
|
| 137 |
parser.add_argument("--neural-epochs", type=int, default=35)
|
| 138 |
parser.add_argument("--neural-hidden-dim", type=int, default=128)
|
| 139 |
parser.add_argument("--neural-batch-size", type=int, default=256)
|
|
@@ -335,8 +335,6 @@ def row_text_features(row: dict[str, Any], episode: dict[str, Any] | None) -> st
|
|
| 335 |
parts.extend([
|
| 336 |
"main_task:",
|
| 337 |
norm(episode.get("main_task")),
|
| 338 |
-
"episode_split:",
|
| 339 |
-
norm(episode.get("split")),
|
| 340 |
])
|
| 341 |
media = row.get("media") or {}
|
| 342 |
parts.extend([
|
|
|
|
| 133 |
default=256,
|
| 134 |
help="Use centroid classification instead of dense softmax when the train label space is larger than this.",
|
| 135 |
)
|
| 136 |
+
parser.add_argument("--include-neural", action=argparse.BooleanOptionalAction, default=True)
|
| 137 |
parser.add_argument("--neural-epochs", type=int, default=35)
|
| 138 |
parser.add_argument("--neural-hidden-dim", type=int, default=128)
|
| 139 |
parser.add_argument("--neural-batch-size", type=int, default=256)
|
|
|
|
| 335 |
parts.extend([
|
| 336 |
"main_task:",
|
| 337 |
norm(episode.get("main_task")),
|
|
|
|
|
|
|
| 338 |
])
|
| 339 |
media = row.get("media") or {}
|
| 340 |
parts.extend([
|
scripts/omni/train_cosmos3_super_forward_dynamics_lora.py
CHANGED
|
@@ -884,7 +884,8 @@ def main() -> int:
|
|
| 884 |
if accelerator.is_main_process:
|
| 885 |
write_json(output_dir / "training_metadata.json", payload)
|
| 886 |
write_report(output_dir, payload)
|
| 887 |
-
|
|
|
|
| 888 |
|
| 889 |
if accelerator.is_main_process:
|
| 890 |
print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))
|
|
|
|
| 884 |
if accelerator.is_main_process:
|
| 885 |
write_json(output_dir / "training_metadata.json", payload)
|
| 886 |
write_report(output_dir, payload)
|
| 887 |
+
final_event = "complete" if status in {"complete", "dry_run_complete"} else "finalized_failed"
|
| 888 |
+
append_jsonl(progress_path, {"event": final_event, "timestamp": time.time(), "status": status})
|
| 889 |
|
| 890 |
if accelerator.is_main_process:
|
| 891 |
print(json.dumps({"status": status, "output_dir": str(output_dir), "adapter_dir": str(adapter_dir) if adapter_dir else None}, indent=2))
|