#!/usr/bin/env python3
"""Build a concise result summary for the two public evidence lines."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
# Base directory for every path below: the parent of the directory that
# contains this script (parents[0] is the script's own directory).
ROOT = Path(__file__).resolve().parents[1]
# Input documents loaded by main() and handed to build_payload().
MATRIX_JSON = ROOT / "docs/data/task_method_20_result_matrix.json"
LINES_JSON = ROOT / "docs/data/two_evidence_lines.json"
# Generated outputs: the JSON payload and the Markdown report.
OUTPUT_JSON = ROOT / "docs/data/two_evidence_line_result_summary.json"
OUTPUT_MD = ROOT / "TWO_EVIDENCE_LINE_RESULT_SUMMARY.md"
def read_json(path: Path) -> dict:
    """Load the UTF-8 encoded JSON document stored at ``path``.

    Returns whatever the ``json`` module decodes from the file contents.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def write_json(path: Path, payload: dict) -> None:
    """Serialize ``payload`` to ``path`` as indented, key-sorted JSON.

    Missing parent directories are created first. The file is written as
    UTF-8 and ends with a single trailing newline.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    document = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(f"{document}\n", encoding="utf-8")
def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a pipe-delimited Markdown table.

    Header text is emitted verbatim. Body cells are stringified, newlines are
    turned into spaces and ``|`` is backslash-escaped so a cell cannot break
    the row layout. The returned text has no trailing newline.
    """

    def as_row(cells) -> str:
        # Wrap already-prepared cell texts in the leading/trailing pipes.
        return "| " + " | ".join(cells) + " |"

    def clean(cell) -> str:
        # Flatten line breaks first, then escape the column delimiter.
        return str(cell).replace("\n", " ").replace("|", "\\|")

    header_line = as_row(headers)
    divider_line = as_row(["---" for _ in headers])
    body_lines = [as_row([clean(cell) for cell in row]) for row in rows]
    return "\n".join([header_line, divider_line, *body_lines])
def line_for_series(scope: str) -> str:
    """Map a matrix series ``scope`` string to its evidence-line id.

    The scope is matched by prefix. Raises ``ValueError`` when it starts with
    neither of the two known prefixes.
    """
    prefix_to_line = (
        ("1 public sample episode", "single_public_sample_episode"),
        ("128 selected episodes", "selected_128_episode_surface"),
    )
    for prefix, line_id in prefix_to_line:
        if scope.startswith(prefix):
            return line_id
    raise ValueError(f"Cannot map series scope to evidence line: {scope}")
def build_method_blocks(lines_out: list[dict]) -> list[dict]:
    """Group per-method score counts into the fixed public method blocks.

    ``lines_out`` is the list of evidence-line dicts. Each needs ``id``,
    ``label`` and a ``methods`` list whose entries carry ``id``, ``label``
    and the four count fields summed below.

    Returns one dict per block holding its static description plus the
    member labels and summed counts. Raises ``KeyError`` when a block names
    a method id that does not appear in ``lines_out``.
    """
    # Index every method by id; a repeated id keeps its last occurrence.
    method_index: dict[str, dict] = {}
    for line in lines_out:
        for method in line["methods"]:
            method_id = method["id"]
            method_index[method_id] = {
                **method,
                "line_label": line["label"],
                "line_id": line["id"],
            }

    def total(members: list[dict], field: str) -> int:
        # Sum one count field over the methods that make up a block.
        return sum(member[field] for member in members)

    # (line_id, line_label, block, method_ids, evidence_type, read_as)
    block_specs = [
        (
            "single_public_sample_episode",
            "1 sample episode",
            "Task-head baselines",
            ["minimal", "neural_mlp"],
            "Direct target metrics on the public sample windows.",
            "Task construction, local reproducibility, and Minimal-vs-Neural behavior.",
        ),
        (
            "selected_128_episode_surface",
            "128 selected episodes",
            "Aligned baseline heads",
            [
                "metadata128_simple",
                "metadata128_neural_mlp",
                "raw128_simple",
                "raw128_neural_mlp",
            ],
            "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
            "Same-split metadata/raw-feature baseline comparison.",
        ),
        (
            "selected_128_episode_surface",
            "128 selected episodes",
            "Qwen3-Omni series",
            ["qwen3_omni_v6_lora"],
            "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
            "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface.",
        ),
        (
            "selected_128_episode_surface",
            "128 selected episodes",
            "Cosmos3 series",
            [
                "cosmos3_super_reasoner",
                "cosmos3_nano_future_window",
            ],
            "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
            "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface.",
        ),
    ]

    blocks = []
    for line_id, line_label, block_name, method_ids, evidence_type, read_as in block_specs:
        members = [method_index[method_id] for method_id in method_ids]
        blocks.append(
            {
                "line_id": line_id,
                "line_label": line_label,
                "block": block_name,
                "method_ids": method_ids,
                "evidence_type": evidence_type,
                "read_as": read_as,
                "methods": [member["label"] for member in members],
                "scored_method_task_count": total(members, "scored_task_count"),
                "method_task_record_count": total(members, "result_record_count"),
                "direct_scored_method_task_count": total(members, "direct_scored_task_count"),
                "proxy_scored_method_task_count": total(members, "proxy_scored_task_count"),
            }
        )
    return blocks
def build_payload(matrix: dict, lines: dict) -> dict:
    """Assemble the summary payload from the result matrix and line metadata.

    ``matrix`` supplies ``task_count``, ``method_count``, ``series`` and
    ``records``; ``lines`` supplies the per-line metadata under ``lines``
    plus ``interpretation_rule`` and several optional descriptive fields.

    Every series is assigned to an evidence line via ``line_for_series`` and
    its counts are accumulated on that line. Records flagged
    ``proxy_scored`` are collected separately. Returns the payload dict that
    is written to JSON and rendered to Markdown.

    Raises ``KeyError`` for missing required keys (including a series whose
    line id is absent from ``lines``) and ``ValueError`` for a series scope
    that ``line_for_series`` cannot map.
    """
    line_meta: dict[str, dict] = {}
    for entry in lines["lines"]:
        line_meta[entry["id"]] = entry

    def blank_row(line_id: str, meta: dict) -> dict:
        # Static line description with every counter starting at zero.
        return {
            "id": line_id,
            "label": meta["label"],
            "short_label": meta.get("short_label"),
            "data_unit": meta["data_unit"],
            "result_statement": meta.get("result_statement"),
            "best_read_as": meta.get("best_read_as"),
            "read_separately_from": meta.get("read_separately_from"),
            "primary_use": meta["best_use"],
            "task_count": matrix["task_count"],
            "method_count": 0,
            "method_task_record_count": 0,
            "scored_method_task_count": 0,
            "direct_scored_method_task_count": 0,
            "proxy_scored_method_task_count": 0,
            "methods": [],
            "primary_visuals": meta.get("primary_visuals", []),
            "artifact_entry_points": meta["primary_artifacts"],
        }

    line_rows: dict[str, dict] = {}
    for line_id, meta in line_meta.items():
        line_rows[line_id] = blank_row(line_id, meta)

    series_to_line: dict[str, str] = {}
    for series in matrix["series"]:
        owner_id = line_for_series(series["scope"])
        series_id = series["id"]
        series_to_line[series_id] = owner_id
        bucket = line_rows[owner_id]

        record_count = series["result_record_count"]
        scored_count = series["scored_task_count"]
        proxy_count = series.get("proxy_scored_task_count", 0)
        # Direct scores are whatever was scored without going through a proxy.
        direct_count = scored_count - proxy_count

        bucket["method_count"] += 1
        bucket["method_task_record_count"] += record_count
        bucket["scored_method_task_count"] += scored_count
        bucket["proxy_scored_method_task_count"] += proxy_count
        bucket["direct_scored_method_task_count"] += direct_count
        bucket["methods"].append(
            {
                "id": series_id,
                "label": series["label"],
                "scope": series["scope"],
                "method_detail": series.get("method_detail"),
                "scored_task_count": scored_count,
                "result_record_count": record_count,
                "direct_scored_task_count": direct_count,
                "proxy_scored_task_count": proxy_count,
                "status_counts": series.get("status_counts", {}),
            }
        )

    # Only records explicitly flagged as proxy-scored are listed separately.
    proxy_records = [
        {
            "line_id": series_to_line[record["series_id"]],
            "task_number": record["task_number"],
            "task_id": record["task_id"],
            "task_label": record["task_label"],
            "series_id": record["series_id"],
            "method": record["method"],
            "metric_key": record.get("metric_key"),
            "source": record.get("source"),
            "reason": record.get("reason"),
        }
        for record in matrix["records"]
        if record.get("proxy_scored")
    ]

    lines_out = list(line_rows.values())

    def grand_total(field: str) -> int:
        # Sum one counter across every evidence line.
        return sum(row[field] for row in lines_out)

    total_records = grand_total("method_task_record_count")
    total_scored = grand_total("scored_method_task_count")
    total_direct = grand_total("direct_scored_method_task_count")
    total_proxy = grand_total("proxy_scored_method_task_count")

    return {
        "title": "Two Evidence-Line Result Summary",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source_matrix": "docs/data/task_method_20_result_matrix.json",
        "source_lines": "docs/data/two_evidence_lines.json",
        "interpretation_rule": lines["interpretation_rule"],
        "reader_summary": lines.get("reader_summary"),
        "score_formula": lines.get("score_formula"),
        "summary": {
            "line_count": len(lines_out),
            "task_count": matrix["task_count"],
            "method_count": matrix["method_count"],
            "method_task_record_count": total_records,
            "scored_method_task_count": total_scored,
            "direct_scored_method_task_count": total_direct,
            "proxy_scored_method_task_count": total_proxy,
        },
        "lines": lines_out,
        "method_blocks": build_method_blocks(lines_out),
        "related_model_artifacts": lines.get("related_model_artifacts", []),
        "proxy_records": proxy_records,
        "reading_order": [
            {
                "step": "Choose the evidence line",
                "reason": "Line 1 answers task-lab and reproducibility questions; line 2 answers selected-128 comparison questions.",
            },
            {
                "step": "Open the matching radar",
                "reason": "Use the 1-episode radar for Minimal-vs-Neural behavior and the 128-episode radar for metadata/raw baselines, Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano.",
            },
            {
                "step": "Inspect the matrix row",
                "reason": "Every numeric score is tied to a method, task, metric key, source artifact, and proxy flag.",
            },
            {
                "step": "Check proxy cells before interpreting totals",
                "reason": "The six compact-proxy cells are numeric but are not direct raw-target measurements.",
            },
        ],
        "reader_policy": {
            "single_public_sample_episode": (
                "Use for task construction, raw-file inspection, local reproducibility, "
                "and controlled Minimal-vs-Neural baseline behavior."
            ),
            "selected_128_episode_surface": (
                "Use for held-out comparison, metadata/raw-feature baselines, Qwen3-Omni v6 LoRA, "
                "Cosmos3-Super Reasoner, Cosmos3-Nano Future Window, and scale-up decisions."
            ),
            "proxy_policy": (
                "Proxy-scored cells stay numeric only when the source artifact and reason "
                "are attached; they should not be read as direct raw-target measurements."
            ),
        },
    }
def write_markdown(payload: dict) -> None:
    """Render ``payload`` as the Markdown summary and write it to ``OUTPUT_MD``.

    ``payload`` is the dict produced by ``build_payload``. The keys read here
    are ``summary``, ``lines``, ``proxy_records``, ``method_blocks``,
    ``reading_order``, ``reader_policy``, ``generated_at_utc``,
    ``source_matrix`` and ``interpretation_rule``, plus the optional
    ``reader_summary``, ``score_formula`` and ``related_model_artifacts``.

    Every table goes through ``markdown_table`` so that pipes and newlines in
    cell text cannot break the row layout. The file is written as UTF-8.
    Raises ``KeyError`` when a required key is missing.
    """
    summary = payload["summary"]
    # Separator for multi-value cells. markdown_table flattens newlines to a
    # single space, so the values end up space-separated inside the cell.
    # NOTE(review): confirm a newline (rather than an HTML line break) is the
    # intended separator here.
    cell_separator = "\n"

    line_rows = []
    entry_rows = []
    method_rows = []
    for line in payload["lines"]:
        line_rows.append(
            [
                line["label"],
                line.get("result_statement") or "",
                line.get("best_read_as") or line["primary_use"],
                line.get("read_separately_from") or "",
            ]
        )
        entry_rows.append(
            [
                line["label"],
                str(line["method_count"]),
                str(line["task_count"]),
                f"{line['scored_method_task_count']}/{line['method_task_record_count']}",
                str(line["direct_scored_method_task_count"]),
                str(line["proxy_scored_method_task_count"]),
                cell_separator.join(line.get("primary_visuals", [])),
                cell_separator.join(line["artifact_entry_points"]),
            ]
        )
        for method in line["methods"]:
            method_rows.append(
                [
                    line["label"],
                    method["label"],
                    method.get("method_detail") or "",
                    f"{method['scored_task_count']}/{method['result_record_count']}",
                    str(method["direct_scored_task_count"]),
                    str(method["proxy_scored_task_count"]),
                ]
            )

    proxy_rows = [
        [
            row["task_number"],
            row["task_label"],
            row["method"],
            row.get("metric_key") or "",
            row.get("reason") or "",
        ]
        for row in payload["proxy_records"]
    ]
    method_block_rows = [
        [
            block["line_label"],
            block["block"],
            ", ".join(block["methods"]),
            f"{block['scored_method_task_count']}/{block['method_task_record_count']}",
            str(block["direct_scored_method_task_count"]),
            str(block["proxy_scored_method_task_count"]),
            block["evidence_type"],
            block["read_as"],
        ]
        for block in payload["method_blocks"]
    ]
    related_artifact_rows = [
        [row.get("name", ""), row.get("role", ""), row.get("repo", "")]
        for row in payload.get("related_model_artifacts", [])
    ]
    reading_rows = [[row["step"], row["reason"]] for row in payload["reading_order"]]

    # The overview table uses the shared helper like every other table, so
    # newlines in its cells are flattened as well as pipes being escaped.
    line_table = markdown_table(
        ["Line", "What the scores mean", "Best use", "Read separately from"],
        line_rows,
    )

    text = f"""# Two Evidence-Line Result Summary
Generated: `{payload['generated_at_utc']}`.
Source matrix: [`{payload['source_matrix']}`]({payload['source_matrix']})
Interpretation rule: {payload['interpretation_rule']}
## Read This First
{payload.get('reader_summary') or ''}
Score formula: {payload.get('score_formula') or ''}
{line_table}
## Public Score Totals
- Lines: {summary['line_count']}
- Tasks per method: {summary['task_count']}
- Methods: {summary['method_count']}
- Scored records: {summary['scored_method_task_count']}/{summary['method_task_record_count']}
- Direct scores: {summary['direct_scored_method_task_count']}
- Compact-proxy scores: {summary['proxy_scored_method_task_count']} documented cells
## Line Ledger And Entry Points
{markdown_table(['Line', 'Methods', 'Tasks', 'Scored records', 'Direct scores', 'Proxy scores', 'Primary visuals', 'Source artifacts'], entry_rows)}
## Method Blocks By Evidence Line
{markdown_table(['Line', 'Method block', 'Methods', 'Scored records', 'Direct scores', 'Proxy scores', 'Evidence type', 'Read as'], method_block_rows)}
## Method Detail By Line
{markdown_table(['Line', 'Method', 'Method detail', 'Scored records', 'Direct scores', 'Proxy scores'], method_rows)}
## Related Model Artifacts
{markdown_table(['Artifact', 'Role', 'Link or path'], related_artifact_rows)}
## Proxy-Scored Cells
{markdown_table(['Task', 'Task label', 'Method', 'Metric', 'Reason'], proxy_rows)}
## Reading Order
{markdown_table(['Step', 'Reason'], reading_rows)}
## Reader Policy
- 1 sample episode: {payload['reader_policy']['single_public_sample_episode']}
- 128 selected episodes: {payload['reader_policy']['selected_128_episode_surface']}
- Proxy scores: {payload['reader_policy']['proxy_policy']}
"""
    OUTPUT_MD.write_text(text, encoding="utf-8")
def main() -> int:
    """Regenerate the JSON and Markdown summaries and return exit status 0.

    Reads both input documents, builds the payload, writes both outputs and
    prints their paths relative to ``ROOT``.
    """
    matrix = read_json(MATRIX_JSON)
    lines = read_json(LINES_JSON)
    payload = build_payload(matrix, lines)

    write_json(OUTPUT_JSON, payload)
    write_markdown(payload)

    json_rel = OUTPUT_JSON.relative_to(ROOT)
    markdown_rel = OUTPUT_MD.relative_to(ROOT)
    print(f"Wrote {json_rel} and {markdown_rel}")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())