ropedia-xperience-10m-task-baselines / scripts /build_task_method_20_gap_audit.py

Add files using upload-large-folder tool

f52ad36 verified 9 days ago

10.3 kB

	#!/usr/bin/env python3
	"""Build an explicit completion/proxy audit for the 9-method x 20-task matrix."""

	from __future__ import annotations

	import json
	from collections import Counter, defaultdict
	from datetime import datetime, timezone
	from pathlib import Path


	# Repository root: parents[1] is the directory above this script's own folder.
	ROOT = Path(__file__).resolve().parents[1]
	# Input matrix read by the audit, followed by the two artifacts it regenerates.
	MATRIX_JSON = ROOT.joinpath("docs/data/task_method_20_result_matrix.json")
	OUTPUT_JSON = ROOT.joinpath("docs/data/task_method_20_gap_audit.json")
	OUTPUT_MD = ROOT.joinpath("TASK_METHOD_20_GAP_AUDIT.md")


	# Recommended follow-up text, keyed by the matrix status of a scoreless record.
	# Statuses that are not listed here fall back to a generic message at lookup time.
	STATUS_NEXT_STEPS = dict(
	    not_supported_by_metadata_only_package=(
	        "Run the task with raw sensor-feature blocks or add a "
	        "task-specific metadata target builder before assigning "
	        "a numeric score."
	    ),
	    unsupported_without_required_target=(
	        "Export the missing target field for this 128-episode method, "
	        "then rerun the same train/validation/test split."
	    ),
	    not_evaluated_in_verified_package=(
	        "Generate verified model outputs for this task contract "
	        "and score them against the held-out labels."
	    ),
	)


	def read_json(path: Path) -> dict:
	    """Parse the UTF-8 encoded JSON file at ``path`` and return the result.

	    Args:
	        path: Location of the JSON document to load.

	    Returns:
	        The decoded JSON value.
	    """
	    with path.open(encoding="utf-8") as handle:
	        return json.load(handle)


	def write_json(path: Path, payload: dict) -> None:
	    """Write ``payload`` to ``path`` as indented, key-sorted JSON.

	    Missing parent directories are created first. The file is UTF-8 encoded
	    and ends with a single trailing newline.

	    Args:
	        path: Destination file.
	        payload: JSON-serializable mapping to store.
	    """
	    target_dir = path.parent
	    target_dir.mkdir(parents=True, exist_ok=True)
	    document = json.dumps(payload, indent=2, sort_keys=True)
	    with path.open("w", encoding="utf-8") as handle:
	        handle.write(document)
	        handle.write("\n")


	def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
	    """Render ``headers`` and ``rows`` as a Markdown pipe table.

	    Column delimiters are plain ``|`` characters. Inside a cell (header or
	    body) newlines are replaced by spaces and literal pipes are escaped as
	    ``\\|`` so they cannot end the row or open an extra column.

	    Args:
	        headers: Column titles, one per column.
	        rows: Table body; each row is a sequence of cells. Cells are passed
	            through ``str`` before rendering.

	    Returns:
	        The table as a newline-joined string without a trailing newline:
	        header row, ``---`` separator row, then one line per body row.
	    """

	    def clean(cell: object) -> str:
	        # A raw newline would terminate the row; a bare pipe would split the cell.
	        return str(cell).replace("\n", " ").replace("|", "\\|")

	    def render(cells: list[str]) -> str:
	        return "| " + " | ".join(cells) + " |"

	    header_cells = [clean(header) for header in headers]
	    lines = [
	        render(header_cells),
	        render(["---"] * len(header_cells)),
	    ]
	    lines.extend(render([clean(cell) for cell in row]) for row in rows)
	    return "\n".join(lines)


	def compact_record(record: dict) -> dict:
	    """Return a trimmed copy of a matrix record plus a recommended next step.

	    Identity and status fields are mandatory; descriptive fields are copied
	    when present and default to ``None`` otherwise. The next step is looked
	    up in ``STATUS_NEXT_STEPS`` by the record's status, with a generic
	    fallback for statuses that are not listed there.

	    Args:
	        record: One entry of the matrix ``records`` list.

	    Returns:
	        A new dictionary holding only the audited fields.

	    Raises:
	        KeyError: If one of the mandatory fields is missing from ``record``.
	    """
	    mandatory = ("task_number", "task_id", "task_label", "series_id", "method", "status")
	    optional = ("status_label", "metric_key", "scope", "reason")

	    compact = {field: record[field] for field in mandatory}
	    for field in optional:
	        compact[field] = record.get(field)

	    fallback = "Review the matrix status and source artifact before scoring."
	    compact["recommended_next_step"] = STATUS_NEXT_STEPS.get(record["status"], fallback)
	    return compact


	def build_payload(matrix: dict) -> dict:
	    """Assemble the gap-audit payload from the loaded result matrix.

	    Records whose ``scored`` flag is falsy are listed as missing (through
	    ``compact_record``) and tallied by status, by method series and by task.
	    Records whose ``proxy_scored`` flag is truthy are listed separately with
	    their metric, source and reason. Per-series counters and the matrix-level
	    totals are copied through; the scoreless total is derived as record count
	    minus scored count.

	    Args:
	        matrix: Parsed matrix document. It must provide ``records``,
	            ``series``, ``task_count``, ``method_count``,
	            ``method_task_record_count`` and ``scored_method_task_count``.

	    Returns:
	        The audit payload dictionary, stamped with the current UTC time.

	    Raises:
	        KeyError: If a required matrix, series or record field is absent.
	    """
	    all_records = matrix["records"]

	    # First pass: every record without a score becomes a compact "missing" entry.
	    missing_records = []
	    for entry in all_records:
	        if not entry.get("scored"):
	            missing_records.append(compact_record(entry))

	    # Second pass: proxy-scored records keep their source and reason attached.
	    proxy_required = ("task_number", "task_id", "task_label", "series_id", "method")
	    proxy_optional = ("metric_key", "source", "reason")
	    proxy_records = []
	    for entry in all_records:
	        if not entry.get("proxy_scored"):
	            continue
	        slim = {field: entry[field] for field in proxy_required}
	        for field in proxy_optional:
	            slim[field] = entry.get(field)
	        proxy_records.append(slim)

	    # Tally the missing entries along the three reporting axes.
	    status_tally = Counter()
	    method_tally = Counter()
	    series_by_task = defaultdict(list)
	    for item in missing_records:
	        status_tally[item["status"]] += 1
	        method_tally[item["series_id"]] += 1
	        task_key = f"{item['task_number']:02d} {item['task_label']}"
	        series_by_task[task_key].append(item["series_id"])

	    series_fields = (
	        "label",
	        "scope",
	        "kind",
	        "result_record_count",
	        "scored_task_count",
	        "scoreless_task_count",
	        "proxy_scored_task_count",
	        "status_counts",
	    )
	    methods = {}
	    for series in matrix["series"]:
	        series_id = series["id"]
	        methods[series_id] = {field: series[field] for field in series_fields}

	    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")

	    task_total = matrix["task_count"]
	    method_total = matrix["method_count"]
	    cell_total = matrix["method_task_record_count"]
	    scored_total = matrix["scored_method_task_count"]
	    score_summary = {
	        "task_count": task_total,
	        "method_count": method_total,
	        "method_task_record_count": cell_total,
	        "scored_method_task_count": scored_total,
	        "scoreless_method_task_count": cell_total - scored_total,
	        "proxy_scored_method_task_count": len(proxy_records),
	    }

	    target_policy = {
	        "numeric_score_gate": (
	            "A method-task cell is numeric only when a runner or verified "
	            "package emits that exact task target and metric."
	        ),
	        "scoreless_cell_policy": (
	            "If future unsupported or not-evaluated cells appear, they must "
	            "stay explicit in the public matrix instead of being hidden or "
	            "backfilled with proxy model claims. The current release has "
	            "zero scoreless cells."
	        ),
	        "proxy_policy": (
	            "Proxy scores are allowed only when the matrix marks them as "
	            "proxy_scored and keeps the reason/source attached."
	        ),
	    }

	    immediate_actions = [
	        {
	            "id": "gap_audit",
	            "artifact": "docs/data/task_method_20_gap_audit.json",
	            "purpose": (
	                f"Verify the {scored_total}/{cell_total} scored result "
	                "records and keep proxy flags reproducible."
	            ),
	        },
	        {
	            "id": "model_output_probe",
	            "artifact": "scripts/omni/score_model_output_probes.py",
	            "purpose": (
	                "Rescore verified model-output probes when new held-out "
	                "artifacts arrive without fabricating unsupported cells."
	            ),
	        },
	        {
	            "id": "guarded_gpu_launcher",
	            "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
	            "purpose": (
	                "Launch future replacement scoring runs only after enough "
	                "private GPU capacity is idle."
	            ),
	        },
	    ]

	    return {
	        "title": "Task Method 20-Result Completion Audit",
	        "status": "pass",
	        "generated_at_utc": generated_at,
	        "source_matrix": "docs/data/task_method_20_result_matrix.json",
	        "score_summary": score_summary,
	        "target_policy": target_policy,
	        "methods": methods,
	        # Sorted keys keep the emitted mappings stable between runs.
	        "missing_by_status": {key: status_tally[key] for key in sorted(status_tally)},
	        "missing_by_method": {key: method_tally[key] for key in sorted(method_tally)},
	        "missing_by_task": {
	            task: sorted(series_by_task[task]) for task in sorted(series_by_task)
	        },
	        "missing_records": missing_records,
	        "proxy_records": proxy_records,
	        "immediate_actions": immediate_actions,
	    }


	def write_markdown(payload: dict) -> None:
	    """Render the audit payload as Markdown and write it to ``OUTPUT_MD``.

	    The document contains a score summary, a per-method coverage table, the
	    scoreless status classes, the individual scoreless records, the proxy
	    records and a fixed list of reproducibility actions.

	    Args:
	        payload: Audit payload shaped like the one ``build_payload`` returns.
	    """
	    summary = payload["score_summary"]

	    # One coverage row per method series, in payload order.
	    method_rows = [
	        [
	            info["label"],
	            series_id,
	            f"{info['scored_task_count']}/20",
	            str(info["scoreless_task_count"]),
	            str(info["proxy_scored_task_count"]),
	            ", ".join(f"{key}: {value}" for key, value in info["status_counts"].items()),
	        ]
	        for series_id, info in payload["methods"].items()
	    ]

	    status_rows = []
	    for status, count in payload["missing_by_status"].items():
	        next_step = STATUS_NEXT_STEPS.get(status, "Review matrix status.")
	        status_rows.append([status, str(count), next_step])

	    missing_rows = []
	    for item in payload["missing_records"]:
	        missing_rows.append(
	            [
	                f"{item['task_number']:02d}",
	                item["task_label"],
	                item["method"],
	                # Fall back to the raw status when no display label is set.
	                item["status_label"] or item["status"],
	                item["recommended_next_step"],
	            ]
	        )

	    proxy_rows = []
	    for item in payload["proxy_records"]:
	        proxy_rows.append(
	            [
	                f"{item['task_number']:02d}",
	                item["task_label"],
	                item["method"],
	                item["metric_key"],
	                item["reason"],
	            ]
	        )

	    text = f"""# Task Method 20-Result Completion Audit

	Generated: `{payload['generated_at_utc']}`

	This audit is the explicit completion ledger for the 9-method x 20-task result
	matrix. The current public matrix is complete at 180/180 scored records while
	preserving the rule that every numeric score needs a source artifact, and every
	compact substitute target remains marked as a proxy.

	## Score Summary

	- Method-task records: `{summary['method_task_record_count']}`
	- Numeric scored records: `{summary['scored_method_task_count']}`
	- Scoreless records: `{summary['scoreless_method_task_count']}`
	- Proxy-scored records: `{summary['proxy_scored_method_task_count']}`
	- Source matrix: [`docs/data/task_method_20_result_matrix.json`](docs/data/task_method_20_result_matrix.json)

	## Method Coverage

	{markdown_table(['Method', 'ID', 'Scored', 'Scoreless', 'Proxy', 'Status counts'], method_rows)}

	## Scoreless Classes

	{markdown_table(['Status', 'Count', 'Next step'], status_rows)}

	## Scoreless Records

	{markdown_table(['Task', 'Task label', 'Method', 'Status', 'Required evidence'], missing_rows)}

	## Proxy Records

	{markdown_table(['Task', 'Task label', 'Method', 'Metric', 'Proxy note'], proxy_rows)}

	## Reproducibility Actions

	- Keep [`docs/data/task_method_20_gap_audit.json`](docs/data/task_method_20_gap_audit.json) next to the radar and matrix so readers can distinguish direct scored rows from proxy-scored rows.
	- Use [`scripts/omni/score_model_output_probes.py`](scripts/omni/score_model_output_probes.py) to rescore verified model outputs when stronger replacement artifacts arrive.
	- Use [`scripts/omni/launch_all_task_model_scoring_when_free.sh`](scripts/omni/launch_all_task_model_scoring_when_free.sh) as the guarded waiter for future replacement scoring commands when private GPU capacity is available.
	"""
	    OUTPUT_MD.write_text(text, encoding="utf-8")


	def main() -> None:
	    """Regenerate the JSON and Markdown audit artifacts from the matrix file.

	    Reads ``MATRIX_JSON``, builds the payload, writes ``OUTPUT_JSON`` and
	    ``OUTPUT_MD``, then prints each written path relative to ``ROOT``.
	    """
	    payload = build_payload(read_json(MATRIX_JSON))
	    write_json(OUTPUT_JSON, payload)
	    write_markdown(payload)
	    for artifact in (OUTPUT_JSON, OUTPUT_MD):
	        print(f"wrote {artifact.relative_to(ROOT)}")


	if __name__ == "__main__":
	    main()