#!/usr/bin/env python3
"""Build an explicit completion/proxy audit for the 9-method x 20-task matrix.

The script reads the result matrix JSON, derives which method-task records are
scoreless or proxy-scored, and writes two artifacts: a JSON audit payload and
a Markdown report rendered from that payload.
"""

from __future__ import annotations

import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
MATRIX_JSON = ROOT / "docs/data/task_method_20_result_matrix.json"
OUTPUT_JSON = ROOT / "docs/data/task_method_20_gap_audit.json"
OUTPUT_MD = ROOT / "TASK_METHOD_20_GAP_AUDIT.md"

# Maps a scoreless record's ``status`` to the follow-up text shown in the audit.
STATUS_NEXT_STEPS = {
    "not_supported_by_metadata_only_package": (
        "Run the task with raw sensor-feature blocks or add a task-specific "
        "metadata target builder before assigning a numeric score."
    ),
    "unsupported_without_required_target": (
        "Export the missing target field for this 128-episode method, then "
        "rerun the same train/validation/test split."
    ),
    "not_evaluated_in_verified_package": (
        "Generate verified model outputs for this task contract and score them "
        "against the held-out labels."
    ),
}


def read_json(path: Path) -> dict:
    """Return the parsed JSON content of *path*, read as UTF-8."""
    return json.loads(path.read_text(encoding="utf-8"))


def write_json(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as indented, key-sorted JSON plus a newline.

    Missing parent directories of *path* are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render *headers* and *rows* as a pipe-delimited Markdown table.

    Each cell is passed through ``str``; newlines become spaces and ``|`` is
    backslash-escaped so a cell cannot break the table layout. The result has
    no trailing newline.
    """
    lines = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join("---" for _ in headers) + " |",
    ]
    for row in rows:
        clean = [str(cell).replace("\n", " ").replace("|", "\\|") for cell in row]
        lines.append("| " + " | ".join(clean) + " |")
    return "\n".join(lines)


def compact_record(record: dict) -> dict:
    """Reduce a matrix record to the fields reported for a scoreless cell.

    Identification fields and ``status`` are required (``KeyError`` if absent);
    the remaining fields are optional and default to ``None``. A
    ``recommended_next_step`` is attached from ``STATUS_NEXT_STEPS``, falling
    back to a generic review instruction for an unlisted status.
    """
    return {
        "task_number": record["task_number"],
        "task_id": record["task_id"],
        "task_label": record["task_label"],
        "series_id": record["series_id"],
        "method": record["method"],
        "status": record["status"],
        "status_label": record.get("status_label"),
        "metric_key": record.get("metric_key"),
        "scope": record.get("scope"),
        "reason": record.get("reason"),
        "recommended_next_step": STATUS_NEXT_STEPS.get(
            record["status"], "Review the matrix status and source artifact before scoring."
        ),
    }


def build_payload(matrix: dict) -> dict:
    """Assemble the audit payload from the parsed result *matrix*.

    Records whose ``scored`` flag is missing or falsy are listed as missing
    records and tallied by status, by method (``series_id``) and by task;
    records with a truthy ``proxy_scored`` flag are listed as proxy records.
    Totals in ``score_summary`` are copied from the matrix, and the scoreless
    total is the record count minus the scored count.
    """
    records = matrix["records"]
    missing_records = [compact_record(row) for row in records if not row.get("scored")]
    proxy_records = [
        {
            "task_number": row["task_number"],
            "task_id": row["task_id"],
            "task_label": row["task_label"],
            "series_id": row["series_id"],
            "method": row["method"],
            "metric_key": row.get("metric_key"),
            "source": row.get("source"),
            "reason": row.get("reason"),
        }
        for row in records
        if row.get("proxy_scored")
    ]

    missing_by_status = Counter(row["status"] for row in missing_records)
    missing_by_method = Counter(row["series_id"] for row in missing_records)
    missing_by_task = defaultdict(list)
    for row in missing_records:
        missing_by_task[f"{row['task_number']:02d} {row['task_label']}"].append(row["series_id"])

    methods = {
        series["id"]: {
            "label": series["label"],
            "scope": series["scope"],
            "kind": series["kind"],
            "result_record_count": series["result_record_count"],
            "scored_task_count": series["scored_task_count"],
            "scoreless_task_count": series["scoreless_task_count"],
            "proxy_scored_task_count": series["proxy_scored_task_count"],
            "status_counts": series["status_counts"],
        }
        for series in matrix["series"]
    }

    record_total = matrix["method_task_record_count"]
    scored_total = matrix["scored_method_task_count"]
    scoreless_total = record_total - scored_total

    # State the scoreless count that was actually computed instead of always
    # claiming zero; the zero case keeps the established wording.
    if scoreless_total == 0:
        scoreless_note = "The current release has zero scoreless cells."
    else:
        noun = "cell" if scoreless_total == 1 else "cells"
        scoreless_note = f"The current release has {scoreless_total} scoreless {noun}."

    return {
        "title": "Task Method 20-Result Completion Audit",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source_matrix": "docs/data/task_method_20_result_matrix.json",
        "score_summary": {
            "task_count": matrix["task_count"],
            "method_count": matrix["method_count"],
            "method_task_record_count": record_total,
            "scored_method_task_count": scored_total,
            "scoreless_method_task_count": scoreless_total,
            "proxy_scored_method_task_count": len(proxy_records),
        },
        "target_policy": {
            "numeric_score_gate": (
                "A method-task cell is numeric only when a runner or verified package "
                "emits that exact task target and metric."
            ),
            "scoreless_cell_policy": (
                "If future unsupported or not-evaluated cells appear, they must stay explicit "
                "in the public matrix instead of being hidden or backfilled with proxy model "
                "claims. " + scoreless_note
            ),
            "proxy_policy": (
                "Proxy scores are allowed only when the matrix marks them as proxy_scored "
                "and keeps the reason/source attached."
            ),
        },
        "methods": methods,
        "missing_by_status": dict(sorted(missing_by_status.items())),
        "missing_by_method": dict(sorted(missing_by_method.items())),
        "missing_by_task": {
            task: sorted(series_ids) for task, series_ids in sorted(missing_by_task.items())
        },
        "missing_records": missing_records,
        "proxy_records": proxy_records,
        "immediate_actions": [
            {
                "id": "gap_audit",
                "artifact": "docs/data/task_method_20_gap_audit.json",
                "purpose": (
                    f"Verify the {scored_total}/{record_total} scored result records and keep "
                    "proxy flags reproducible."
                ),
            },
            {
                "id": "model_output_probe",
                "artifact": "scripts/omni/score_model_output_probes.py",
                "purpose": (
                    "Rescore verified model-output probes when new held-out artifacts arrive "
                    "without fabricating unsupported cells."
                ),
            },
            {
                "id": "guarded_gpu_launcher",
                "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
                "purpose": (
                    "Launch future replacement scoring runs only after enough private GPU "
                    "capacity is idle."
                ),
            },
        ],
    }


def render_markdown(payload: dict) -> str:
    """Return the Markdown audit report for *payload* as a single string.

    *payload* is expected to have the shape produced by ``build_payload``.
    Counts shown in the prose and in the per-method "Scored" column come from
    ``payload["score_summary"]``, so the report cannot contradict the tables
    it contains. The returned text ends with a newline.
    """
    summary = payload["score_summary"]
    task_count = summary["task_count"]
    record_total = summary["method_task_record_count"]
    scored_total = summary["scored_method_task_count"]
    scoreless_total = summary["scoreless_method_task_count"]

    method_rows = [
        [
            method["label"],
            method_id,
            # Denominator is the matrix's task count rather than a fixed 20.
            f"{method['scored_task_count']}/{task_count}",
            str(method["scoreless_task_count"]),
            str(method["proxy_scored_task_count"]),
            ", ".join(f"{key}: {value}" for key, value in method["status_counts"].items()),
        ]
        for method_id, method in payload["methods"].items()
    ]
    status_rows = [
        [status, str(count), STATUS_NEXT_STEPS.get(status, "Review matrix status.")]
        for status, count in payload["missing_by_status"].items()
    ]
    missing_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            row["status_label"] or row["status"],
            row["recommended_next_step"],
        ]
        for row in payload["missing_records"]
    ]
    proxy_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            # Optional fields may be None; show an empty cell, not "None".
            row["metric_key"] or "",
            row["reason"] or "",
        ]
        for row in payload["proxy_records"]
    ]

    # Only claim completeness when every record is actually scored.
    if scored_total == record_total:
        completion = f"is complete at {scored_total}/{record_total} scored records"
    else:
        completion = (
            f"has {scored_total}/{record_total} scored records "
            f"({scoreless_total} scoreless)"
        )

    lines = [
        "# Task Method 20-Result Completion Audit",
        "",
        f"Generated: `{payload['generated_at_utc']}`",
        "",
        (
            "This audit is the explicit completion ledger for the "
            f"{summary['method_count']}-method x {task_count}-task result matrix. "
            f"The current public matrix {completion} while preserving the rule that "
            "every numeric score needs a source artifact, and every compact substitute "
            "target remains marked as a proxy."
        ),
        "",
        "## Score Summary",
        "",
        f"- Method-task records: `{record_total}`",
        f"- Numeric scored records: `{scored_total}`",
        f"- Scoreless records: `{scoreless_total}`",
        f"- Proxy-scored records: `{summary['proxy_scored_method_task_count']}`",
        (
            "- Source matrix: [`docs/data/task_method_20_result_matrix.json`]"
            "(docs/data/task_method_20_result_matrix.json)"
        ),
        "",
        "## Method Coverage",
        "",
        markdown_table(
            ["Method", "ID", "Scored", "Scoreless", "Proxy", "Status counts"], method_rows
        ),
        "",
        "## Scoreless Classes",
        "",
        markdown_table(["Status", "Count", "Next step"], status_rows),
        "",
        "## Scoreless Records",
        "",
        markdown_table(
            ["Task", "Task label", "Method", "Status", "Required evidence"], missing_rows
        ),
        "",
        "## Proxy Records",
        "",
        markdown_table(["Task", "Task label", "Method", "Metric", "Proxy note"], proxy_rows),
        "",
        "## Reproducibility Actions",
        "",
        (
            "- Keep [`docs/data/task_method_20_gap_audit.json`]"
            "(docs/data/task_method_20_gap_audit.json) next to the radar and matrix so "
            "readers can distinguish direct scored rows from proxy-scored rows."
        ),
        (
            "- Use [`scripts/omni/score_model_output_probes.py`]"
            "(scripts/omni/score_model_output_probes.py) to rescore verified model outputs "
            "when stronger replacement artifacts arrive."
        ),
        (
            "- Use [`scripts/omni/launch_all_task_model_scoring_when_free.sh`]"
            "(scripts/omni/launch_all_task_model_scoring_when_free.sh) as the guarded waiter "
            "for future replacement scoring commands when private GPU capacity is available."
        ),
    ]
    return "\n".join(lines) + "\n"


def write_markdown(payload: dict) -> None:
    """Render *payload* with ``render_markdown`` and write it to ``OUTPUT_MD``."""
    OUTPUT_MD.write_text(render_markdown(payload), encoding="utf-8")


def main() -> None:
    """Read the matrix, write the JSON and Markdown audits, and print both paths."""
    matrix = read_json(MATRIX_JSON)
    payload = build_payload(matrix)
    write_json(OUTPUT_JSON, payload)
    write_markdown(payload)
    print(f"wrote {OUTPUT_JSON.relative_to(ROOT)}")
    print(f"wrote {OUTPUT_MD.relative_to(ROOT)}")


if __name__ == "__main__":
    main()