ropedia-xperience-10m-task-baselines / scripts /render_task_suite_infographic.py

Publish Ropedia minimal task baseline weights

eea471e verified 29 days ago

11.6 kB

	#!/usr/bin/env python3
	"""
	Render a ChatGPT-image-backed 12-task infographic.

	The background bitmap is AI-generated. The task names, inputs, and metrics are
	read from results/episode_task_suite/summary_report.json so the published image
	does not rely on image-model text generation.
	"""

	from __future__ import annotations

	import argparse
	import html
	import json
	import subprocess
	import tempfile
	from pathlib import Path


	# Parent of the directory that contains this script (resolved to an absolute path).
	ROOT = Path(__file__).resolve().parents[1]
	# JSON report that supplies the task names and metric values drawn on the overlay.
	SUMMARY_PATH = ROOT.joinpath("results/episode_task_suite/summary_report.json")
	# Default background bitmap; can be overridden with --base-image.
	DEFAULT_BASE = ROOT.joinpath("docs/assets/task_suite_infographic_base.png")
	# Default destination of the rendered PNG; can be overridden with --output.
	DEFAULT_OUTPUT = ROOT.joinpath("docs/assets/task_suite_infographic.png")


	# Layout table for the task columns of the infographic, listed left to right.
	#
	# Per-entry keys:
	#   name   - column heading text
	#   color  - accent colour (heading colour and the cards' --accent CSS variable)
	#   left   - CSS "left" offset, in px, applied to every card in the column
	#   top    - pixel offset; not read by task_html or build_html, which take the
	#            row positions from their own row list
	#   width  - CSS width, in px, applied to every card in the column
	#   tasks  - (task name, kind label) pairs, listed top to bottom
	GROUPS = [
	    {
	        "name": "Label + State",
	        "color": "#008b9a",
	        "left": 94,
	        "top": 374,
	        "width": 246,
	        "tasks": [
	            ("timeline_action", "supervised"),
	            ("timeline_subtask", "supervised"),
	            ("next_action", "supervised"),
	        ],
	    },
	    {
	        "name": "Prediction + Reconstruction",
	        "color": "#1f63e9",
	        "left": 472,
	        "top": 374,
	        "width": 248,
	        "tasks": [
	            ("hand_trajectory_forecast", "forecast"),
	            ("modality_reconstruction", "forecast"),
	            ("contact_prediction", "supervised"),
	        ],
	    },
	    {
	        "name": "Grounding + Retrieval",
	        "color": "#b65b04",
	        "left": 848,
	        "top": 374,
	        "width": 220,
	        "tasks": [
	            ("caption_grounding", "retrieval"),
	            ("cross_modal_retrieval", "retrieval"),
	            ("object_relevance", "supervised"),
	        ],
	    },
	    {
	        "name": "Temporal Diagnostics",
	        "color": "#b42318",
	        "left": 1202,
	        "top": 374,
	        "width": 244,
	        "tasks": [
	            ("transition_detection", "diagnostic"),
	            ("temporal_order", "diagnostic"),
	            ("misalignment_detection", "diagnostic"),
	        ],
	    },
	]


	def load_summary() -> dict:
	    """Read the UTF-8 JSON report at ``SUMMARY_PATH`` and return the decoded data."""
	    with SUMMARY_PATH.open("r", encoding="utf-8") as report:
	        return json.load(report)


	def fmt(value: float) -> str:
	    """Return *value* as fixed-point text with exactly four decimal places.

	    The argument is converted with ``float()`` first, so any input that
	    ``float()`` accepts (ints, numeric strings, ...) is allowed.
	    """
	    number = float(value)
	    return format(number, ".4f")


	def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
	    """Pick the headline metric for a task and return ``(label, formatted value)``.

	    Tasks with a dedicated entry always use that entry's key; a missing key
	    then raises ``KeyError`` from the lookup itself. All other tasks fall
	    back to ``macro_f1`` and then ``accuracy``, whichever is present first.

	    Raises:
	        KeyError: if the task has no dedicated entry and neither fallback
	            key is present in *metrics*.
	    """
	    # task name -> (display label, key in the metrics dict)
	    dedicated = {
	        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
	        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
	        "caption_grounding": ("MRR", "mrr"),
	        "object_relevance": ("micro-F1", "micro_f1"),
	        "modality_reconstruction": ("R2", "r2"),
	        "temporal_order": ("F1", "f1"),
	        "misalignment_detection": ("F1", "f1"),
	    }
	    if task_name in dedicated:
	        label, key = dedicated[task_name]
	        return label, fmt(metrics[key])

	    # Generic fallbacks, tried in priority order.
	    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
	        if key in metrics:
	            return label, fmt(metrics[key])

	    raise KeyError(f"No main metric configured for {task_name}")


	def short_io(task_name: str, metrics: dict) -> str:
	    """Return the one-line "input -> output" description shown on a task card.

	    Known task names map to a fixed description. Any other name falls back to
	    ``metrics["input"]`` when that key exists, otherwise to an empty string.
	    """
	    # Looked up unconditionally, so *metrics* must support ``.get`` even for
	    # task names that have a fixed description.
	    fallback = metrics.get("input", "")
	    descriptions = {
	        "timeline_action": "all modalities -> action label",
	        "timeline_subtask": "all modalities -> subtask label",
	        "transition_detection": "all modalities -> boundary / steady",
	        "next_action": "window at t -> action at t+20",
	        "hand_trajectory_forecast": "all modalities -> future hand joints",
	        "contact_prediction": "non-contact modalities -> contact",
	        "object_relevance": "non-caption modalities -> object set",
	        "caption_grounding": "text query -> matching window",
	        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
	        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
	        "temporal_order": "two windows -> correct order?",
	        "misalignment_detection": "motion + visual -> aligned / shifted",
	    }
	    if task_name in descriptions:
	        return descriptions[task_name]
	    return fallback


	def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
	    """Build the ``<section class="task">`` markup for one task card.

	    Args:
	        task_name: Task identifier; shown as the card title.
	        kind: Short category label shown in the badge above the title.
	        metrics: Metrics mapping for this task, passed to ``metric_for`` and
	            ``short_io``.
	        top: CSS ``top`` offset of the card, in px.
	        group: Column entry providing ``left``, ``width`` and ``color``.

	    Returns:
	        The HTML fragment; all text content is passed through ``html.escape``.
	    """
	    metric_label, metric_value = metric_for(task_name, metrics)
	    io_text = short_io(task_name, metrics)
	    # Names longer than 22 characters get a slightly smaller title font.
	    if len(task_name) > 22:
	        name_px = 17
	    else:
	        name_px = 18
	    return f"""
	<section class="task" style="left:{group['left']}px;top:{top}px;width:{group['width']}px;--accent:{group['color']};">
	<div class="kind">{html.escape(kind)}</div>
	<div class="task-name" style="font-size:{name_px}px;">{html.escape(task_name)}</div>
	<div class="io">{html.escape(io_text)}</div>
	<div class="metric"><span>{html.escape(metric_label)}</span><strong>{html.escape(metric_value)}</strong></div>
	</section>
	"""


	def build_html(summary: dict, base_image: Path) -> str:
	    """Assemble the complete overlay HTML document.

	    Args:
	        summary: Decoded summary report. Reads ``summary["tasks"]`` (a mapping
	            indexed by task name) plus ``num_frames``, ``num_windows`` and
	            ``feature_dim``, which are formatted with the ``,`` thousands
	            separator (presumably integers; confirm against the report).
	        base_image: Background bitmap; embedded as a ``file://`` URI of its
	            resolved path.

	    Returns:
	        The HTML page as a string, laid out on a fixed 1536x1024 canvas.

	    Raises:
	        KeyError: if a task listed in ``GROUPS`` is absent from
	            ``summary["tasks"]`` or a required summary key is missing.
	    """
	    suite = summary["tasks"]
	    task_count = len(suite)
	    row_tops = (374, 552, 730)  # CSS top (px) of the card rows
	    header_lefts = (38, 417, 792, 1143)  # CSS left (px) of each column heading
	    group_headers = []
	    cards = []
	    # zip() stops at the shorter sequence, so only groups that have a heading
	    # offset are rendered.
	    for group, header_left in zip(GROUPS, header_lefts):
	        group_headers.append(
	            f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>'
	        )
	        cards.extend(
	            task_html(task_name, kind, suite[task_name], row_tops[row_idx], group)
	            for row_idx, (task_name, kind) in enumerate(group["tasks"])
	        )

	    stats = (
	        f"{summary['num_frames']:,} frames",
	        f"{summary['num_windows']:,} windows",
	        f"{summary['feature_dim']:,} features",
	        f"{task_count} tasks",
	        "chronological split",
	    )
	    pills = [f"<span>{html.escape(item)}</span>" for item in stats]
	    stat_html = "".join(pills)
	    resolved_base = base_image.resolve()
	    base_uri = resolved_base.as_uri()
	    # Doubled braces below are literal CSS braces inside the f-string.
	    return f"""<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=1536, initial-scale=1">
	<title>Ropedia 12-Task Episode Suite Infographic</title>
	<style>
	* {{ box-sizing: border-box; }}
	html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }}
	body {{
	font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
	color: #10141f;
	}}
	.canvas {{
	position: relative;
	width: 1536px;
	height: 1024px;
	overflow: hidden;
	background-image: url("{base_uri}");
	background-size: 1536px 1024px;
	background-repeat: no-repeat;
	}}
	.title {{
	position: absolute;
	left: 330px;
	top: 42px;
	width: 876px;
	text-align: center;
	}}
	h1 {{
	margin: 0;
	font-size: 38px;
	line-height: 1.05;
	letter-spacing: 0;
	font-weight: 820;
	}}
	.subtitle {{
	margin-top: 8px;
	color: #425067;
	font-size: 15px;
	line-height: 1.35;
	font-weight: 520;
	}}
	.stats {{
	margin-top: 12px;
	display: flex;
	justify-content: center;
	gap: 8px;
	}}
	.stats span {{
	display: inline-flex;
	align-items: center;
	height: 24px;
	padding: 0 10px;
	border: 1px solid #cdd8e8;
	background: rgba(255, 255, 255, 0.82);
	border-radius: 999px;
	color: #253046;
	font-size: 12px;
	font-weight: 720;
	}}
	.modality {{
	position: absolute;
	top: 256px;
	width: 180px;
	text-align: center;
	font-size: 12px;
	color: #536074;
	font-weight: 720;
	text-transform: uppercase;
	letter-spacing: 0;
	}}
	.group-title {{
	position: absolute;
	width: 322px;
	text-align: center;
	font-size: 18px;
	line-height: 1;
	font-weight: 830;
	letter-spacing: 0;
	}}
	.task {{
	position: absolute;
	padding: 0;
	}}
	.kind {{
	display: inline-flex;
	align-items: center;
	height: 22px;
	padding: 0 8px;
	border-radius: 6px;
	border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff);
	color: var(--accent);
	background: rgba(255, 255, 255, 0.76);
	text-transform: uppercase;
	font-size: 10px;
	line-height: 1;
	font-weight: 840;
	letter-spacing: 0;
	}}
	.task-name {{
	margin-top: 7px;
	color: #111827;
	line-height: 1.05;
	font-weight: 850;
	letter-spacing: 0;
	white-space: nowrap;
	}}
	.io {{
	margin-top: 8px;
	min-height: 36px;
	color: #475569;
	font-size: 13.5px;
	line-height: 1.28;
	font-weight: 570;
	}}
	.metric {{
	display: inline-flex;
	align-items: center;
	gap: 9px;
	margin-top: 8px;
	height: 30px;
	padding: 0 10px;
	border-radius: 7px;
	border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff);
	background: rgba(255, 255, 255, 0.90);
	box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07);
	}}
	.metric span {{
	color: #64748b;
	font-size: 12px;
	font-weight: 760;
	}}
	.metric strong {{
	color: var(--accent);
	font-size: 16px;
	line-height: 1;
	font-weight: 860;
	}}
	.footer {{
	position: absolute;
	left: 360px;
	top: 932px;
	width: 816px;
	text-align: center;
	color: #536074;
	font-size: 14px;
	font-weight: 650;
	}}
	</style>
	</head>
	<body>
	<main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
	<div class="title">
	<h1>Ropedia 12-Task Episode Suite</h1>
	<div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div>
	<div class="stats">{stat_html}</div>
	</div>
	<div class="modality" style="left:50px;">fisheye video</div>
	<div class="modality" style="left:270px;">depth</div>
	<div class="modality" style="left:530px;">3D / SLAM</div>
	<div class="modality" style="left:770px;">IMU</div>
	<div class="modality" style="left:1030px;">hands</div>
	<div class="modality" style="left:1278px;">text / objects</div>
	{''.join(group_headers)}
	{''.join(cards)}
	<div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div>
	</main>
	</body>
	</html>
	"""


	def render_html(html_path: Path, output_path: Path) -> None:
	    """Screenshot the overlay HTML page into an image via the Playwright CLI.

	    Creates the output directory if needed, then runs
	    ``npx --yes playwright screenshot`` on the page's ``file://`` URI with a
	    1536x1024 viewport in full-page mode.

	    Raises:
	        subprocess.CalledProcessError: if the command exits with a non-zero
	            status (``check=True``).
	    """
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    page_uri = html_path.resolve().as_uri()
	    command = [
	        "npx",
	        "--yes",
	        "playwright",
	        "screenshot",
	        "--full-page",
	        "--viewport-size=1536,1024",
	        page_uri,
	        str(output_path),
	    ]
	    subprocess.run(command, check=True)


	def main() -> int:
	    """Command-line entry point: write the overlay HTML and optionally export it.

	    Options:
	        --base-image  background bitmap (defaults to ``DEFAULT_BASE``)
	        --output      rendered image path (defaults to ``DEFAULT_OUTPUT``)
	        --html        where to write the overlay HTML; when omitted, a
	                      temporary ``.html`` file is created and kept on disk
	        --no-export   write the HTML only and skip the screenshot step

	    Returns:
	        ``0`` on success.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
	    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
	    parser.add_argument("--html", type=Path)
	    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
	    args = parser.parse_args()

	    html_text = build_html(load_summary(), args.base_image)

	    if args.html is not None:
	        html_path = args.html
	        html_path.parent.mkdir(parents=True, exist_ok=True)
	        html_path.write_text(html_text, encoding="utf-8")
	    else:
	        # delete=False keeps the file after the handle closes, so it can
	        # still be rendered and its path reported below.
	        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
	            handle.write(html_text)
	        html_path = Path(handle.name)

	    export_requested = not args.no_export
	    if export_requested:
	        render_html(html_path, args.output)
	        print(f"Wrote image: {args.output}")
	    print(f"Wrote overlay HTML: {html_path}")
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    exit_status = main()
	    raise SystemExit(exit_status)