ropedia-xperience-10m-task-baselines / scripts /build_task_method_20_gap_audit.py
cy0307's picture
Add files using upload-large-folder tool
f52ad36 verified
Raw
History Blame
10.3 kB
#!/usr/bin/env python3
"""Build an explicit completion/proxy audit for the 9-method x 20-task matrix."""
from __future__ import annotations
import json
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
# Directory two levels above this file (the parent of the folder that holds
# this script); all input and output paths below are resolved against it.
ROOT = Path(__file__).resolve().parents[1]
# Input: the method x task result matrix that the audit is derived from.
MATRIX_JSON = ROOT / "docs/data/task_method_20_result_matrix.json"
# Outputs: the JSON audit payload and its Markdown rendering.
OUTPUT_JSON = ROOT / "docs/data/task_method_20_gap_audit.json"
OUTPUT_MD = ROOT / "TASK_METHOD_20_GAP_AUDIT.md"

# Maps a record's ``status`` value to the recommended follow-up text.
# Statuses that are not listed here get a generic fallback message at the
# lookup sites (see the ``.get(...)`` calls in this file).
STATUS_NEXT_STEPS = {
    "not_supported_by_metadata_only_package": (
        "Run the task with raw sensor-feature blocks or add a task-specific "
        "metadata target builder before assigning a numeric score."
    ),
    "unsupported_without_required_target": (
        "Export the missing target field for this 128-episode method, then "
        "rerun the same train/validation/test split."
    ),
    "not_evaluated_in_verified_package": (
        "Generate verified model outputs for this task contract and score them "
        "against the held-out labels."
    ),
}
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return the result.

    Args:
        path: Location of the JSON document to load.

    Returns:
        The decoded JSON value (callers in this file expect an object/dict).
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def write_json(path: Path, payload: dict) -> None:
    """Serialize *payload* as indented, key-sorted JSON and write it to *path*.

    Missing parent directories are created first. The file is written as
    UTF-8 and ends with a single trailing newline.

    Args:
        path: Destination file.
        payload: JSON-serializable mapping to store.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(f"{serialized}\n", encoding="utf-8")
def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render *headers* and *rows* as a Markdown pipe table.

    Every body cell is converted with ``str()``. Line breaks inside a cell
    (CRLF, a bare CR, or LF) are replaced by a single space and literal pipe
    characters are backslash-escaped, so cell content can neither end its row
    early nor open an extra column. Headers are emitted unchanged.

    Args:
        headers: Column titles; one ``---`` delimiter cell is emitted per title.
        rows: Table body, one list of cells per row.

    Returns:
        The table as a newline-joined string without a trailing newline. With
        no rows, only the header and delimiter lines are returned.
    """

    def render_row(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    def clean(cell) -> str:
        text = str(cell)
        # Replace CRLF before the single-character forms so a Windows line
        # ending collapses to one space; a leftover bare CR would still count
        # as a line ending in Markdown and split the row.
        for line_break in ("\r\n", "\r", "\n"):
            text = text.replace(line_break, " ")
        return text.replace("|", "\\|")

    lines = [render_row(headers), render_row("---" for _ in headers)]
    lines.extend(render_row(clean(cell) for cell in row) for row in rows)
    return "\n".join(lines)
def compact_record(record: dict) -> dict:
    """Reduce a matrix record to the fields reported for a scoreless cell.

    Identification fields and ``status`` are mandatory (a missing one raises
    ``KeyError``); descriptive fields are optional and default to ``None``.
    A ``recommended_next_step`` is attached, looked up from
    ``STATUS_NEXT_STEPS`` by status with a generic fallback.

    Args:
        record: One entry from the matrix ``records`` list.

    Returns:
        A new dict holding the selected fields plus ``recommended_next_step``.
    """
    required_keys = ("task_number", "task_id", "task_label", "series_id", "method", "status")
    optional_keys = ("status_label", "metric_key", "scope", "reason")

    compact = {key: record[key] for key in required_keys}
    for key in optional_keys:
        compact[key] = record.get(key)
    compact["recommended_next_step"] = STATUS_NEXT_STEPS.get(
        compact["status"], "Review the matrix status and source artifact before scoring."
    )
    return compact
def build_payload(matrix: dict) -> dict:
    """Assemble the audit payload from the loaded result *matrix*.

    Records whose ``scored`` flag is falsy are collected (in compact form) as
    missing records; records whose ``proxy_scored`` flag is truthy are
    collected as proxy records. Missing records are additionally tallied by
    status, by series id, and grouped per task. Per-series summary fields are
    copied from ``matrix["series"]``.

    Args:
        matrix: Parsed result matrix; must provide ``records``, ``series`` and
            the top-level count fields read below.

    Returns:
        The audit payload dict (title, timestamp, score summary, policies,
        per-method summary, missing/proxy breakdowns, and follow-up actions).
    """
    all_rows = matrix["records"]

    # Cells without a numeric score, reduced to the audit-relevant fields.
    unscored = []
    for entry in all_rows:
        if entry.get("scored"):
            continue
        unscored.append(compact_record(entry))

    # Cells scored against a proxy target keep their source and reason.
    proxy_required = ("task_number", "task_id", "task_label", "series_id", "method")
    proxy_optional = ("metric_key", "source", "reason")
    proxied = []
    for entry in all_rows:
        if not entry.get("proxy_scored"):
            continue
        item = {key: entry[key] for key in proxy_required}
        for key in proxy_optional:
            item[key] = entry.get(key)
        proxied.append(item)

    status_tally = Counter()
    series_tally = Counter()
    for gap in unscored:
        status_tally[gap["status"]] += 1
        series_tally[gap["series_id"]] += 1

    series_by_task = defaultdict(list)
    for gap in unscored:
        task_key = f"{gap['task_number']:02d} {gap['task_label']}"
        series_by_task[task_key].append(gap["series_id"])

    series_fields = (
        "label",
        "scope",
        "kind",
        "result_record_count",
        "scored_task_count",
        "scoreless_task_count",
        "proxy_scored_task_count",
        "status_counts",
    )
    method_index = {}
    for series in matrix["series"]:
        series_id = series["id"]
        method_index[series_id] = {field: series[field] for field in series_fields}

    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
    score_summary = {
        "task_count": matrix["task_count"],
        "method_count": matrix["method_count"],
        "method_task_record_count": matrix["method_task_record_count"],
        "scored_method_task_count": matrix["scored_method_task_count"],
        "scoreless_method_task_count": (
            matrix["method_task_record_count"] - matrix["scored_method_task_count"]
        ),
        "proxy_scored_method_task_count": len(proxied),
    }
    target_policy = {
        "numeric_score_gate": (
            "A method-task cell is numeric only when a runner or verified package "
            "emits that exact task target and metric."
        ),
        "scoreless_cell_policy": (
            "If future unsupported or not-evaluated cells appear, they must stay explicit "
            "in the public matrix instead of being hidden or backfilled with proxy model "
            "claims. The current release has zero scoreless cells."
        ),
        "proxy_policy": (
            "Proxy scores are allowed only when the matrix marks them as proxy_scored "
            "and keeps the reason/source attached."
        ),
    }
    immediate_actions = [
        {
            "id": "gap_audit",
            "artifact": "docs/data/task_method_20_gap_audit.json",
            "purpose": (
                f"Verify the {matrix['scored_method_task_count']}/"
                f"{matrix['method_task_record_count']} scored result records and keep "
                "proxy flags reproducible."
            ),
        },
        {
            "id": "model_output_probe",
            "artifact": "scripts/omni/score_model_output_probes.py",
            "purpose": (
                "Rescore verified model-output probes when new held-out artifacts arrive "
                "without fabricating unsupported cells."
            ),
        },
        {
            "id": "guarded_gpu_launcher",
            "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
            "purpose": (
                "Launch future replacement scoring runs only after enough private GPU "
                "capacity is idle."
            ),
        },
    ]

    # Breakdown mappings are emitted with sorted keys for a stable layout.
    missing_by_task = {}
    for task_key, series_ids in sorted(series_by_task.items()):
        missing_by_task[task_key] = sorted(series_ids)

    return {
        "title": "Task Method 20-Result Completion Audit",
        "status": "pass",
        "generated_at_utc": generated_at,
        "source_matrix": "docs/data/task_method_20_result_matrix.json",
        "score_summary": score_summary,
        "target_policy": target_policy,
        "methods": method_index,
        "missing_by_status": dict(sorted(status_tally.items())),
        "missing_by_method": dict(sorted(series_tally.items())),
        "missing_by_task": missing_by_task,
        "missing_records": unscored,
        "proxy_records": proxied,
        "immediate_actions": immediate_actions,
    }
def write_markdown(payload: dict) -> None:
    """Render the audit *payload* as Markdown and write it to ``OUTPUT_MD``.

    The document contains a score summary, a per-method coverage table, the
    scoreless status classes, the individual scoreless and proxy records, and
    a fixed list of reproducibility actions.

    Counts quoted in the prose and in the "Scored" column are taken from
    ``payload["score_summary"]`` so the text cannot disagree with the data.
    ``task_count`` and ``method_count`` fall back to 20 and 9 (the values the
    text previously hard-coded) when a payload does not carry them.

    Args:
        payload: Audit payload with ``score_summary``, ``methods``,
            ``missing_by_status``, ``missing_records``, ``proxy_records`` and
            ``generated_at_utc`` entries.
    """
    summary = payload["score_summary"]
    task_count = summary.get("task_count", 20)
    method_count = summary.get("method_count", 9)
    total_records = summary["method_task_record_count"]
    scored_records = summary["scored_method_task_count"]
    scoreless_records = summary["scoreless_method_task_count"]
    proxy_records = summary["proxy_scored_method_task_count"]

    method_rows = [
        [
            method["label"],
            method_id,
            f"{method['scored_task_count']}/{task_count}",
            str(method["scoreless_task_count"]),
            str(method["proxy_scored_task_count"]),
            ", ".join(f"{key}: {value}" for key, value in method["status_counts"].items()),
        ]
        for method_id, method in payload["methods"].items()
    ]
    status_rows = [
        [status, str(count), STATUS_NEXT_STEPS.get(status, "Review matrix status.")]
        for status, count in payload["missing_by_status"].items()
    ]
    missing_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            row["status_label"] or row["status"],
            row["recommended_next_step"],
        ]
        for row in payload["missing_records"]
    ]
    proxy_rows = [
        [
            f"{row['task_number']:02d}",
            row["task_label"],
            row["method"],
            # Optional fields may be None; show an empty cell, not "None".
            row["metric_key"] or "",
            row["reason"] or "",
        ]
        for row in payload["proxy_records"]
    ]

    # Only call the matrix "complete" when the payload reports no scoreless cells.
    coverage = f"{scored_records}/{total_records}"
    if not scoreless_records:
        completeness = f"is complete at {coverage} scored records"
    else:
        completeness = f"has {coverage} scored records ({scoreless_records} still scoreless)"
    intro = (
        f"This audit is the explicit completion ledger for the {method_count}-method x "
        f"{task_count}-task result\n"
        f"matrix. The current public matrix {completeness} while\n"
        "preserving the rule that every numeric score needs a source artifact, and every\n"
        "compact substitute target remains marked as a proxy."
    )
    score_bullets = "\n".join(
        [
            f"- Method-task records: `{total_records}`",
            f"- Numeric scored records: `{scored_records}`",
            f"- Scoreless records: `{scoreless_records}`",
            f"- Proxy-scored records: `{proxy_records}`",
            "- Source matrix: [`docs/data/task_method_20_result_matrix.json`]"
            "(docs/data/task_method_20_result_matrix.json)",
        ]
    )
    action_bullets = "\n".join(
        [
            "- Keep [`docs/data/task_method_20_gap_audit.json`]"
            "(docs/data/task_method_20_gap_audit.json) next to the radar and matrix so "
            "readers can distinguish direct scored rows from proxy-scored rows.",
            "- Use [`scripts/omni/score_model_output_probes.py`]"
            "(scripts/omni/score_model_output_probes.py) to rescore verified model "
            "outputs when stronger replacement artifacts arrive.",
            "- Use [`scripts/omni/launch_all_task_model_scoring_when_free.sh`]"
            "(scripts/omni/launch_all_task_model_scoring_when_free.sh) as the guarded "
            "waiter for future replacement scoring commands when private GPU capacity "
            "is available.",
        ]
    )

    sections = [
        "# Task Method 20-Result Completion Audit",
        f"Generated: `{payload['generated_at_utc']}`",
        intro,
        "## Score Summary",
        score_bullets,
        "## Method Coverage",
        markdown_table(
            ["Method", "ID", "Scored", "Scoreless", "Proxy", "Status counts"], method_rows
        ),
        "## Scoreless Classes",
        markdown_table(["Status", "Count", "Next step"], status_rows),
        "## Scoreless Records",
        markdown_table(
            ["Task", "Task label", "Method", "Status", "Required evidence"], missing_rows
        ),
        "## Proxy Records",
        markdown_table(["Task", "Task label", "Method", "Metric", "Proxy note"], proxy_rows),
        "## Reproducibility Actions",
        action_bullets,
    ]
    # Blank lines between blocks keep the "Generated:" line from merging into
    # the intro paragraph and keep headings, tables and lists separate.
    OUTPUT_MD.write_text("\n\n".join(sections) + "\n", encoding="utf-8")
def main() -> None:
    """Regenerate both audit artifacts from the result matrix.

    Reads ``MATRIX_JSON``, builds the audit payload, writes it to
    ``OUTPUT_JSON`` and ``OUTPUT_MD``, then prints each written path relative
    to ``ROOT``.
    """
    payload = build_payload(read_json(MATRIX_JSON))
    write_json(OUTPUT_JSON, payload)
    write_markdown(payload)
    for artifact in (OUTPUT_JSON, OUTPUT_MD):
        print(f"wrote {artifact.relative_to(ROOT)}")


if __name__ == "__main__":
    main()