{
"generated_at_utc": "2026-06-20T20:38:59+00:00",
"immediate_actions": [
{
"artifact": "docs/data/task_method_20_gap_audit.json",
"id": "gap_audit",
"purpose": "Verify the 180/180 scored result records and keep proxy flags reproducible."
},
{
"artifact": "scripts/omni/score_model_output_probes.py",
"id": "model_output_probe",
"purpose": "Rescore verified model-output probes when new held-out artifacts arrive without fabricating unsupported cells."
},
{
"artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
"id": "guarded_gpu_launcher",
"purpose": "Launch future replacement scoring runs only after enough private GPU capacity is idle."
}
],
"methods": {
"cosmos3_nano_future_window": {
"kind": "partial_128_episode_world_model_overlay",
"label": "Cosmos3-Nano Future Window",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"scored": 20
}
},
"cosmos3_super_reasoner": {
"kind": "partial_128_episode_foundation_model_overlay",
"label": "Cosmos3-Super Reasoner",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"scored": 20
}
},
"metadata128_neural_mlp": {
"kind": "partial_128_episode_aligned_baseline",
"label": "128ep Aligned NN",
"proxy_scored_task_count": 1,
"result_record_count": 20,
"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"proxy_scored": 1,
"scored": 19
}
},
"metadata128_simple": {
"kind": "partial_128_episode_aligned_baseline",
"label": "128ep Aligned Simple",
"proxy_scored_task_count": 1,
"result_record_count": 20,
"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"proxy_scored": 1,
"scored": 19
}
},
"minimal": {
"kind": "full_20_task_baseline",
"label": "Minimal",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "1 public sample episode",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"scored": 20
}
},
"neural_mlp": {
"kind": "full_20_task_baseline",
"label": "Neural MLP",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "1 public sample episode",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"scored": 20
}
},
"qwen3_omni_v6_lora": {
"kind": "partial_128_episode_foundation_model_overlay",
"label": "Qwen3-Omni v6 LoRA",
"proxy_scored_task_count": 0,
"result_record_count": 20,
"scope": "128 selected episodes, held-out test",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"scored": 20
}
},
"raw128_neural_mlp": {
"kind": "complete_128_episode_raw_feature_baseline",
"label": "128ep Raw NN",
"proxy_scored_task_count": 2,
"result_record_count": 20,
"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"proxy_scored": 2,
"scored": 18
}
},
"raw128_simple": {
"kind": "complete_128_episode_raw_feature_baseline",
"label": "128ep Raw Simple",
"proxy_scored_task_count": 2,
"result_record_count": 20,
"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
"scored_task_count": 20,
"scoreless_task_count": 0,
"status_counts": {
"proxy_scored": 2,
"scored": 18
}
}
},
"missing_by_method": {},
"missing_by_status": {},
"missing_by_task": {},
"missing_records": [],
"proxy_records": [
{
"method": "128ep Raw Simple",
"metric_key": "macro_f1",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_simple",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
"task_id": "interaction_text_prediction",
"task_label": "Interaction Text Prediction",
"task_number": 15
},
{
"method": "128ep Raw NN",
"metric_key": "macro_f1",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_neural_mlp",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
"task_id": "interaction_text_prediction",
"task_label": "Interaction Text Prediction",
"task_number": 15
},
{
"method": "128ep Aligned Simple",
"metric_key": "mrr",
"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
"series_id": "metadata128_simple",
"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"method": "128ep Aligned NN",
"metric_key": "mrr",
"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
"series_id": "metadata128_neural_mlp",
"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"method": "128ep Raw Simple",
"metric_key": "mrr",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_simple",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
},
{
"method": "128ep Raw NN",
"metric_key": "mrr",
"reason": "documented compact proxy completion for this raw128 task axis",
"series_id": "raw128_neural_mlp",
"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
"task_id": "camera_view_sync_retrieval",
"task_label": "Camera-View Synchronization Retrieval",
"task_number": 19
}
],
"score_summary": {
"method_count": 9,
"method_task_record_count": 180,
"proxy_scored_method_task_count": 6,
"scored_method_task_count": 180,
"scoreless_method_task_count": 0,
"task_count": 20
},
"source_matrix": "docs/data/task_method_20_result_matrix.json",
"status": "pass",
"target_policy": {
"numeric_score_gate": "A method-task cell is numeric only when a runner or verified package emits that exact task target and metric.",
"proxy_policy": "Proxy scores are allowed only when the matrix marks them as proxy_scored and keeps the reason/source attached.",
"scoreless_cell_policy": "If future unsupported or not-evaluated cells appear, they must stay explicit in the public matrix instead of being hidden or backfilled with proxy model claims. The current release has zero scoreless cells."
},
"title": "Task Method 20-Result Completion Audit"
}