{
"status": "current",
"updated_utc": "2026-06-21T00:00:00Z",
"interpretation_rule": "Read the 1-episode line as the inspectable task lab. Read the 128-episode line as the selected comparison surface for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.",
"reader_summary": "The suite has two public reading lanes. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Compare scores within the same lane first.",
"score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
"lines": [
{
"id": "single_public_sample_episode",
"label": "1 sample episode",
"short_label": "Line 1",
"data_unit": "One public Xperience-10M sample episode",
"result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
"best_read_as": "Inspect the raw sample, understand file organization, reproduce the 20 task targets, and compare Minimal vs Neural MLP behavior inside one episode.",
"read_separately_from": "The selected-128 comparison rows and broader held-out model behavior.",
"frames": 5821,
"windows": 1161,
"window_definition": "20-frame aligned windows with 5-frame stride",
"feature_dimensions": 8546,
"methods": [
"Minimal heads",
"Neural MLP heads"
],
"task_axes": 20,
"method_task_records": 40,
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"best_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/single_episode_task_model_radar.svg"
],
"primary_artifacts": [
"docs/data/single_episode_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"results/episode_task_suite/summary_report.json",
"results/episode_task_suite/feature_manifest.json",
"docs/single_episode_explorer.html"
]
},
{
"id": "selected_128_episode_surface",
"label": "128 selected episodes",
"short_label": "Line 2",
"data_unit": "Selected held-out 96/16/16 train/validation/test split with public-safe processed features linked to official gated episode paths",
"result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
"best_read_as": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano while keeping the 6 compact-proxy cells visible.",
"read_separately_from": "Direct raw-target interpretation for the proxy-marked cells.",
"episodes": 128,
"split": {
"train": 96,
"validation": 16,
"test": 16
},
"exported_windows": 34269,
"methods": [
"Metadata simple",
"Metadata NN",
"Raw-feature simple",
"Raw-feature NN",
"Qwen3-Omni",
"Cosmos3-Super",
"Cosmos3-Nano"
],
"task_axes": 20,
"method_task_records": 140,
"scored_records": 140,
"direct_scored_records": 134,
"proxy_scored_records": 6,
"proxy_policy": "Proxy flags remain visible where the public export lacks a direct raw target.",
"best_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/episode128_task_model_radar.svg",
"docs/assets/charts/unified_task_model_radar.svg"
],
"primary_artifacts": [
"docs/data/episode128_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"docs/data/xperience10m_128_episode_feature_index.json",
"docs/data/omni_model_comparison.json",
"docs/data/qwen3_omni_run_lineage.json",
"docs/data/task_method_20_gap_audit.json"
]
}
],
"method_blocks": [
{
"line_id": "single_public_sample_episode",
"line_label": "1 sample episode",
"block": "Task-head baselines",
"methods": [
"Minimal",
"Neural MLP"
],
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"evidence_type": "Direct target metrics on the public sample windows.",
"read_as": "Task-lab reproducibility and simple-vs-neural behavior."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Aligned baseline heads",
"methods": [
"Metadata simple",
"Metadata NN",
"Raw-feature simple",
"Raw-feature NN"
],
"scored_records": 80,
"direct_scored_records": 74,
"proxy_scored_records": 6,
"evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
"read_as": "Same-split metadata/raw-feature baseline comparison."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Qwen3-Omni series",
"methods": [
"Qwen3-Omni v6 LoRA"
],
"scored_records": 20,
"direct_scored_records": 20,
"proxy_scored_records": 0,
"evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
"read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Cosmos3 series",
"methods": [
"Cosmos3-Super Reasoner",
"Cosmos3-Nano Future Window"
],
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
"read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface."
}
],
"related_model_artifacts": [
{
"name": "Qwen3-Omni v1-v6 run lineage",
"role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence.",
"repo": "docs/data/qwen3_omni_run_lineage.json"
},
{
"name": "Cosmos3-Super Forward-Dynamics LoRA",
"role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row.",
"repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep"
}
],
"combined_public_matrix": {
"task_axes": 20,
"methods": 9,
"method_task_records": 180,
"scored_records": 180,
"direct_scored_records": 174,
"proxy_scored_records": 6,
"summary_artifact": "docs/data/two_evidence_line_result_summary.json",
"artifact": "docs/data/task_method_20_result_matrix.json"
}
}