#!/usr/bin/env python3
"""
Render a ChatGPT-image-backed 12-task infographic.
The background bitmap is AI-generated. The task names, inputs, and metrics are
read from results/episode_task_suite/summary_report.json so the published image
does not rely on image-model text generation.
"""
from __future__ import annotations
import argparse
import html
import json
import subprocess
import tempfile
from pathlib import Path
# Directory two levels above this file: parents[0] is the folder containing
# the script and parents[1] is that folder's parent.
# NOTE(review): presumably the repository root -- confirm against the layout.
ROOT = Path(__file__).resolve().parents[1]
# JSON report decoded by load_summary(); per the module docstring it is the
# source of the task names, inputs, and metrics printed on the infographic.
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
# NOTE(review): presumably the default AI-generated background bitmap and the
# default location of the rendered image; their use is outside this excerpt.
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
# Layout table for the four task groups, three tasks each (12 in total).
# Every entry carries:
#   name   - heading text of the group
#   color  - hex colour string associated with the group
#   left / top / width - integer geometry of the group
#       (presumably pixels on the base image -- TODO confirm against the renderer)
#   tasks  - (task_name, kind) pairs, in display order
GROUPS = [
    {
        "name": "Label + State",
        "color": "#008b9a",
        "left": 94,
        "top": 374,
        "width": 246,
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "color": "#1f63e9",
        "left": 472,
        "top": 374,
        "width": 248,
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "color": "#b65b04",
        "left": 848,
        "top": 374,
        "width": 220,
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "color": "#b42318",
        "left": 1202,
        "top": 374,
        "width": 244,
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]
def load_summary() -> dict:
    """Read and decode the summary report stored at ``SUMMARY_PATH``.

    Returns:
        The parsed JSON document (annotated as a ``dict``).

    Raises:
        OSError: If the report file cannot be read.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    report_text = SUMMARY_PATH.read_text(encoding="utf-8")
    return json.loads(report_text)
def fmt(value: float) -> str:
    """Return *value* as fixed-point text with exactly four decimal places.

    The argument is passed through ``float()`` first, so ints and numeric
    strings are accepted as well.
    """
    number = float(value)
    return format(number, ".4f")
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Select the headline metric of a task.

    Args:
        task_name: Identifier of the task.
        metrics: Mapping from metric key to numeric value for that task.

    Returns:
        A ``(display label, formatted value)`` pair; the value is rendered
        by ``fmt``.

    Raises:
        KeyError: If a task with a dedicated metric lacks that key in
            *metrics*, or if the task has no dedicated metric and *metrics*
            holds neither ``macro_f1`` nor ``accuracy``.
    """
    # Tasks with a dedicated headline metric:
    # task name -> (display label, key looked up in `metrics`).
    dedicated = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in dedicated:
        display, key = dedicated[task_name]
        # Direct indexing on purpose: a missing dedicated key is an error,
        # not a reason to fall through to the generic metrics below.
        return display, fmt(metrics[key])

    # Generic fallbacks, tried in priority order.
    for display, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return display, fmt(metrics[key])

    raise KeyError(f"No main metric configured for {task_name}")
def short_io(task_name: str, metrics: dict) -> str:
    """Return a short "input -> output" description for a task.

    Known task names map to a fixed hand-written string. Any other task
    falls back to ``metrics["input"]`` when present, else an empty string.

    Args:
        task_name: Identifier of the task.
        metrics: Per-task mapping; only its optional ``"input"`` entry is read.
    """
    descriptions = {
        "timeline_action": "all modalities -> action label",
        "timeline_subtask": "all modalities -> subtask label",
        "transition_detection": "all modalities -> boundary / steady",
        "next_action": "window at t -> action at t+20",
        "hand_trajectory_forecast": "all modalities -> future hand joints",
        "contact_prediction": "non-contact modalities -> contact",
        "object_relevance": "non-caption modalities -> object set",
        "caption_grounding": "text query -> matching window",
        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
        "temporal_order": "two windows -> correct order?",
        "misalignment_detection": "motion + visual -> aligned / shifted",
    }
    # The fallback is computed unconditionally so `metrics` is consulted for
    # every task name, not only for the unknown ones.
    fallback = metrics.get("input", "")
    if task_name in descriptions:
        return descriptions[task_name]
    return fallback
def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
label, value = metric_for(task_name, metrics)
io = short_io(task_name, metrics)
name_size = 17 if len(task_name) > 22 else 18
return f"""