| |
| """Build an explicit completion/proxy audit for the 9-method x 20-task matrix.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from collections import Counter, defaultdict |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
|
|
# ROOT is the parent of the directory that contains this script; every
# artifact path below is anchored to it.
_THIS_FILE = Path(__file__).resolve()
ROOT = _THIS_FILE.parents[1]
_DATA_DIR = ROOT / "docs" / "data"

# Input: the method x task result matrix this audit is derived from.
MATRIX_JSON = _DATA_DIR / "task_method_20_result_matrix.json"
# Outputs: the machine-readable audit and its Markdown rendering.
OUTPUT_JSON = _DATA_DIR / "task_method_20_gap_audit.json"
OUTPUT_MD = ROOT / "TASK_METHOD_20_GAP_AUDIT.md"
|
|
|
|
# Recommended follow-up for each known matrix status of an unscored
# method-task record. Statuses not listed here fall back to a generic
# message chosen at the lookup site.
STATUS_NEXT_STEPS = {
    "not_supported_by_metadata_only_package": (
        "Run the task with raw sensor-feature blocks "
        "or add a task-specific metadata target builder "
        "before assigning a numeric score."
    ),
    "unsupported_without_required_target": (
        "Export the missing target field for this 128-episode method, "
        "then rerun the same train/validation/test split."
    ),
    "not_evaluated_in_verified_package": (
        "Generate verified model outputs for this task contract "
        "and score them against the held-out labels."
    ),
}
|
|
|
|
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
def write_json(path: Path, payload: dict) -> None:
    """Serialize ``payload`` as JSON and write it to ``path`` (UTF-8).

    Missing parent directories are created first. The output uses two-space
    indentation, sorted keys, and a single trailing newline.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, sort_keys=True, indent=2)
    path.write_text(f"{serialized}\n", encoding="utf-8")
|
|
|
|
def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a pipe-delimited Markdown table.

    Headers are emitted verbatim. Every body cell is converted with ``str``,
    has newlines replaced by spaces, and has ``|`` escaped as ``\\|``. The
    result has no trailing newline.
    """

    def as_line(cells) -> str:
        # One table line: cells separated by " | " and wrapped in outer pipes.
        return "| " + " | ".join(cells) + " |"

    def sanitize(cell) -> str:
        # Keep the cell on one line and stop "|" from splitting the column.
        return str(cell).replace("\n", " ").replace("|", "\\|")

    rendered = [as_line(headers), as_line(["---"] * len(headers))]
    rendered.extend(as_line([sanitize(cell) for cell in row]) for row in rows)
    return "\n".join(rendered)
|
|
|
|
def compact_record(record: dict) -> dict:
    """Reduce a matrix record to the fields reported for an unscored cell.

    The identifying fields and ``status`` are required (a missing one raises
    ``KeyError``); the descriptive fields default to ``None`` when absent.
    ``recommended_next_step`` is looked up in ``STATUS_NEXT_STEPS`` by status,
    with a generic fallback for statuses that have no entry.
    """
    required_keys = ("task_number", "task_id", "task_label", "series_id", "method", "status")
    optional_keys = ("status_label", "metric_key", "scope", "reason")

    compact = {key: record[key] for key in required_keys}
    for key in optional_keys:
        compact[key] = record.get(key)

    fallback = "Review the matrix status and source artifact before scoring."
    compact["recommended_next_step"] = STATUS_NEXT_STEPS.get(compact["status"], fallback)
    return compact
|
|
|
|
def build_payload(matrix: dict) -> dict:
    """Assemble the gap-audit payload from the parsed result matrix.

    Args:
        matrix: Parsed result-matrix document. It must provide ``records``,
            ``series``, ``task_count``, ``method_count``,
            ``method_task_record_count`` and ``scored_method_task_count``.

    Returns:
        A dict holding the score summary, the target policy text, per-method
        coverage, the unscored records grouped by status / method / task,
        the proxy-scored records, and the list of follow-up actions.

    Raises:
        KeyError: If the matrix, a series entry, or a reported record lacks
            a required key.
    """
    records = matrix["records"]

    # A record is reported as missing when its "scored" flag is absent or falsy.
    missing_records = [compact_record(row) for row in records if not row.get("scored")]
    proxy_records = [
        {
            "task_number": row["task_number"],
            "task_id": row["task_id"],
            "task_label": row["task_label"],
            "series_id": row["series_id"],
            "method": row["method"],
            "metric_key": row.get("metric_key"),
            "source": row.get("source"),
            "reason": row.get("reason"),
        }
        for row in records
        if row.get("proxy_scored")
    ]

    missing_by_status = Counter(row["status"] for row in missing_records)
    missing_by_method = Counter(row["series_id"] for row in missing_records)
    missing_by_task = defaultdict(list)
    for row in missing_records:
        # The zero-padded task number keeps the string keys in task order when sorted.
        missing_by_task[f"{row['task_number']:02d} {row['task_label']}"].append(row["series_id"])

    methods = {
        series["id"]: {
            "label": series["label"],
            "scope": series["scope"],
            "kind": series["kind"],
            "result_record_count": series["result_record_count"],
            "scored_task_count": series["scored_task_count"],
            "scoreless_task_count": series["scoreless_task_count"],
            "proxy_scored_task_count": series["proxy_scored_task_count"],
            "status_counts": series["status_counts"],
        }
        for series in matrix["series"]
    }

    record_count = matrix["method_task_record_count"]
    scored_count = matrix["scored_method_task_count"]
    scoreless_count = record_count - scored_count

    # Derive the release statement from the same count reported in the score
    # summary, so the policy text cannot claim "zero scoreless cells" while
    # the summary reports gaps. The zero case keeps the original wording.
    if scoreless_count == 0:
        release_note = "The current release has zero scoreless cells."
    else:
        noun = "cell" if scoreless_count == 1 else "cells"
        release_note = f"The current release has {scoreless_count} scoreless {noun}."

    return {
        "title": "Task Method 20-Result Completion Audit",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source_matrix": "docs/data/task_method_20_result_matrix.json",
        "score_summary": {
            "task_count": matrix["task_count"],
            "method_count": matrix["method_count"],
            "method_task_record_count": record_count,
            "scored_method_task_count": scored_count,
            "scoreless_method_task_count": scoreless_count,
            "proxy_scored_method_task_count": len(proxy_records),
        },
        "target_policy": {
            "numeric_score_gate": (
                "A method-task cell is numeric only when a runner or verified package "
                "emits that exact task target and metric."
            ),
            "scoreless_cell_policy": (
                "If future unsupported or not-evaluated cells appear, they must stay explicit "
                "in the public matrix instead of being hidden or backfilled with proxy model "
                "claims. " + release_note
            ),
            "proxy_policy": (
                "Proxy scores are allowed only when the matrix marks them as proxy_scored "
                "and keeps the reason/source attached."
            ),
        },
        "methods": methods,
        "missing_by_status": dict(sorted(missing_by_status.items())),
        "missing_by_method": dict(sorted(missing_by_method.items())),
        "missing_by_task": {
            task: sorted(series_ids) for task, series_ids in sorted(missing_by_task.items())
        },
        "missing_records": missing_records,
        "proxy_records": proxy_records,
        "immediate_actions": [
            {
                "id": "gap_audit",
                "artifact": "docs/data/task_method_20_gap_audit.json",
                "purpose": (
                    f"Verify the {scored_count}/"
                    f"{record_count} scored result records and keep "
                    "proxy flags reproducible."
                ),
            },
            {
                "id": "model_output_probe",
                "artifact": "scripts/omni/score_model_output_probes.py",
                "purpose": (
                    "Rescore verified model-output probes when new held-out artifacts arrive "
                    "without fabricating unsupported cells."
                ),
            },
            {
                "id": "guarded_gpu_launcher",
                "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
                "purpose": (
                    "Launch future replacement scoring runs only after enough private GPU "
                    "capacity is idle."
                ),
            },
        ],
    }
|
|
|
|
def write_markdown(payload: dict) -> None:
    """Render the audit payload as Markdown and write it to ``OUTPUT_MD``.

    Args:
        payload: Audit payload providing ``score_summary``, ``methods``,
            ``missing_by_status``, ``missing_records``, ``proxy_records``
            and ``generated_at_utc``.

    Raises:
        KeyError: If the payload lacks one of the required keys.
    """
    summary = payload["score_summary"]
    total_records = summary["method_task_record_count"]
    scored_records = summary["scored_method_task_count"]
    # Per-method coverage denominator; fall back to the historical 20 for
    # payloads whose summary carries no task count.
    task_count = summary.get("task_count", 20)

    # State completeness from the counts instead of a fixed "180/180", so the
    # prose cannot contradict the summary list printed below it.
    if scored_records == total_records:
        completeness = f"is complete at {scored_records}/{total_records} scored records"
    else:
        completeness = f"stands at {scored_records}/{total_records} scored records"

    method_rows = [
        [
            method["label"],
            method_id,
            f"{method['scored_task_count']}/{task_count}",
            str(method["scoreless_task_count"]),
            str(method["proxy_scored_task_count"]),
            ", ".join(f"{key}: {value}" for key, value in method["status_counts"].items()),
        ]
        for method_id, method in payload["methods"].items()
    ]
    status_rows = [
        [status, str(count), STATUS_NEXT_STEPS.get(status, "Review matrix status.")]
        for status, count in payload["missing_by_status"].items()
    ]
    missing_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            row["status_label"] or row["status"],
            row["recommended_next_step"],
        ]
        for row in payload["missing_records"]
    ]
    proxy_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            # Optional fields may be None; show an empty cell rather than "None".
            row["metric_key"] or "",
            row["reason"] or "",
        ]
        for row in payload["proxy_records"]
    ]

    method_table = markdown_table(
        ["Method", "ID", "Scored", "Scoreless", "Proxy", "Status counts"], method_rows
    )
    status_table = markdown_table(["Status", "Count", "Next step"], status_rows)
    missing_table = markdown_table(
        ["Task", "Task label", "Method", "Status", "Required evidence"], missing_rows
    )
    proxy_table = markdown_table(
        ["Task", "Task label", "Method", "Metric", "Proxy note"], proxy_rows
    )

    lines = [
        "# Task Method 20-Result Completion Audit",
        "",
        f"Generated: `{payload['generated_at_utc']}`",
        "",
        "This audit is the explicit completion ledger for the 9-method x 20-task result",
        f"matrix. The current public matrix {completeness} while",
        "preserving the rule that every numeric score needs a source artifact, and every",
        "compact substitute target remains marked as a proxy.",
        "",
        "## Score Summary",
        "",
        f"- Method-task records: `{total_records}`",
        f"- Numeric scored records: `{scored_records}`",
        f"- Scoreless records: `{summary['scoreless_method_task_count']}`",
        f"- Proxy-scored records: `{summary['proxy_scored_method_task_count']}`",
        "- Source matrix: [`docs/data/task_method_20_result_matrix.json`]"
        "(docs/data/task_method_20_result_matrix.json)",
        "",
        "## Method Coverage",
        "",
        method_table,
        "",
        "## Scoreless Classes",
        "",
        status_table,
        "",
        "## Scoreless Records",
        "",
        missing_table,
        "",
        "## Proxy Records",
        "",
        proxy_table,
        "",
        "## Reproducibility Actions",
        "",
        "- Keep [`docs/data/task_method_20_gap_audit.json`]"
        "(docs/data/task_method_20_gap_audit.json) next to the radar and matrix so "
        "readers can distinguish direct scored rows from proxy-scored rows.",
        "- Use [`scripts/omni/score_model_output_probes.py`]"
        "(scripts/omni/score_model_output_probes.py) to rescore verified model outputs "
        "when stronger replacement artifacts arrive.",
        "- Use [`scripts/omni/launch_all_task_model_scoring_when_free.sh`]"
        "(scripts/omni/launch_all_task_model_scoring_when_free.sh) as the guarded waiter "
        "for future replacement scoring commands when private GPU capacity is available.",
    ]
    # The document ends with exactly one trailing newline.
    OUTPUT_MD.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
|
|
def main() -> None:
    """Rebuild the JSON and Markdown audit artifacts from the matrix file.

    Reads ``MATRIX_JSON``, writes ``OUTPUT_JSON`` and ``OUTPUT_MD``, then
    prints each written path relative to ``ROOT``.
    """
    payload = build_payload(read_json(MATRIX_JSON))
    write_json(OUTPUT_JSON, payload)
    write_markdown(payload)
    for artifact in (OUTPUT_JSON, OUTPUT_MD):
        print(f"wrote {artifact.relative_to(ROOT)}")


# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|