#!/usr/bin/env python3
"""
Render a ChatGPT-image-backed 12-task infographic.

The background bitmap is AI-generated. The task names, inputs, and metrics are
read from results/episode_task_suite/summary_report.json so the published image
does not rely on image-model text generation.
"""

from __future__ import annotations

import argparse
import html
import json
import subprocess
import tempfile
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"

# One entry per column of the infographic. Each task is a
# (task name, kind label) pair; the task name is the key used to look the
# task up in the summary report.
GROUPS = [
    {
        "name": "Label + State",
        "color": "#008b9a",
        "left": 94,
        "top": 374,
        "width": 246,
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "color": "#1f63e9",
        "left": 472,
        "top": 374,
        "width": 248,
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "color": "#b65b04",
        "left": 848,
        "top": 374,
        "width": 220,
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "color": "#b42318",
        "left": 1202,
        "top": 374,
        "width": 244,
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]

# Tasks with a dedicated headline metric: task name -> (display label, key in
# the task's metrics dict).
_TASK_METRICS = {
    "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
    "cross_modal_retrieval": ("top-5", "top5_accuracy"),
    "caption_grounding": ("MRR", "mrr"),
    "object_relevance": ("micro-F1", "micro_f1"),
    "modality_reconstruction": ("R2", "r2"),
    "temporal_order": ("F1", "f1"),
    "misalignment_detection": ("F1", "f1"),
}

# Tried in order for every task without a dedicated entry above.
_FALLBACK_METRICS = (
    ("macro-F1", "macro_f1"),
    ("accuracy", "accuracy"),
)

# Short "input -> output" captions, keyed by task name.
_TASK_IO = {
    "timeline_action": "all modalities -> action label",
    "timeline_subtask": "all modalities -> subtask label",
    "transition_detection": "all modalities -> boundary / steady",
    "next_action": "window at t -> action at t+20",
    "hand_trajectory_forecast": "all modalities -> future hand joints",
    "contact_prediction": "non-contact modalities -> contact",
    "object_relevance": "non-caption modalities -> object set",
    "caption_grounding": "text query -> matching window",
    "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
    "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
    "temporal_order": "two windows -> correct order?",
    "misalignment_detection": "motion + visual -> aligned / shifted",
}


def load_summary() -> dict:
    """Read and parse the JSON summary report at ``SUMMARY_PATH``."""
    return json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))


def fmt(value: float) -> str:
    """Format ``value`` as a float with four decimal places."""
    return f"{float(value):.4f}"


def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Return the ``(label, formatted value)`` of the headline metric.

    Tasks listed in ``_TASK_METRICS`` use their dedicated metric; any other
    task falls back to ``macro_f1`` and then ``accuracy``.

    Raises:
        KeyError: if a dedicated metric key is absent from ``metrics``, or if
            no fallback metric is present either.
    """
    configured = _TASK_METRICS.get(task_name)
    if configured is not None:
        label, key = configured
        return label, fmt(metrics[key])

    for label, key in _FALLBACK_METRICS:
        if key in metrics:
            return label, fmt(metrics[key])

    raise KeyError(f"No main metric configured for {task_name}")


def short_io(task_name: str, metrics: dict) -> str:
    """Return the short input/output caption for ``task_name``.

    Unknown tasks fall back to ``metrics["input"]``, or to an empty string
    when that key is missing.
    """
    return _TASK_IO.get(task_name, metrics.get("input", ""))


def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
    """Return the HTML-escaped text fragment for one task card.

    The fragment lists the kind, the task name, the input/output caption and
    the headline metric (label immediately followed by its value).
    """
    label, value = metric_for(task_name, metrics)
    io = short_io(task_name, metrics)
    # NOTE(review): ``name_size``, ``top`` and ``group`` are not referenced by
    # the template text below -- confirm against the intended card markup.
    name_size = 17 if len(task_name) > 22 else 18
    return f"""
{html.escape(kind)}
{html.escape(task_name)}
{html.escape(io)}
{html.escape(label)}{html.escape(value)}
"""


def build_html(summary: dict, base_image: Path) -> str:
    """Assemble the overlay page text from the summary report.

    Args:
        summary: parsed report; this function reads ``tasks``, ``num_frames``,
            ``num_windows`` and ``feature_dim``.
        base_image: path of the background bitmap.

    Raises:
        KeyError: if a task listed in ``GROUPS`` is missing from
            ``summary["tasks"]`` (or a required metric is missing).
    """
    suite = summary["tasks"]
    task_count = len(suite)
    group_headers = []
    cards = []
    row_tops = [374, 552, 730]
    header_lefts = [38, 417, 792, 1143]

    for group, header_left in zip(GROUPS, header_lefts):
        # NOTE(review): ``header_left`` is not referenced by the header text
        # below -- confirm against the intended header markup.
        group_headers.append(f'\n{html.escape(group["name"])}\n')
        for row_idx, (task_name, kind) in enumerate(group["tasks"]):
            try:
                task_metrics = suite[task_name]
            except KeyError:
                # Same exception type as a plain lookup, but says where the
                # task was expected.
                raise KeyError(
                    f"Task {task_name!r} is missing from the summary report"
                ) from None
            cards.append(task_html(task_name, kind, task_metrics, row_tops[row_idx], group))

    stats = [
        f"{summary['num_frames']:,} frames",
        f"{summary['num_windows']:,} windows",
        f"{summary['feature_dim']:,} features",
        f"{task_count} tasks",
        "chronological split",
    ]
    stat_html = "".join(f"{html.escape(item)}" for item in stats)
    # NOTE(review): ``base_uri`` is not referenced by the template text below
    # -- confirm against the intended page markup.
    base_uri = base_image.resolve().as_uri()
    return f""" Ropedia 12-Task Episode Suite Infographic

Ropedia 12-Task Episode Suite

All labels and metrics are overlaid from the verified single-episode results.
{stat_html}
fisheye video
depth
3D / SLAM
IMU
hands
text / objects
{''.join(group_headers)} {''.join(cards)}
"""


def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot ``html_path`` into ``output_path`` via ``npx playwright``.

    Creates the output directory if needed.

    Raises:
        subprocess.CalledProcessError: if the screenshot command exits with a
            non-zero status (``check=True``).
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "npx",
            "--yes",
            "playwright",
            "screenshot",
            "--full-page",
            "--viewport-size=1536,1024",
            html_path.resolve().as_uri(),
            str(output_path),
        ],
        check=True,
    )


def main() -> int:
    """Command-line entry point: build the overlay and optionally export it.

    Returns:
        Process exit status (always 0 when no exception is raised).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
    args = parser.parse_args()

    summary = load_summary()
    html_text = build_html(summary, args.base_image)

    if args.html is None:
        # delete=False: the file has to outlive this ``with`` block because it
        # is rendered and its path is reported below.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)
    else:
        html_path = args.html
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")

    if not args.no_export:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote overlay HTML: {html_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())