ropedia-xperience-10m-task-baselines / scripts /render_task_suite_infographic.py

Add files using upload-large-folder tool

33d2f83 verified 8 days ago

40.9 kB

	#!/usr/bin/env python3
	"""
	Render a polished Ropedia Xperience-10M 20-task infographic.

	The task names, inputs, and metrics are read from docs/data/task_suite_20.json.
	The output is a deterministic PNG rendered from HTML/CSS so the labels stay
	legible and inspectable.
	"""

	from __future__ import annotations

	import argparse
	import base64
	import html
	import io
	import json
	import os
	import subprocess
	import tempfile
	from pathlib import Path

	from task_display import task_display_name


	# Repository root: the parent of the directory that contains this script.
	ROOT = Path(__file__).resolve().parents[1]
	# JSON summary that supplies the task rows and dataset statistics.
	SUMMARY_PATH = ROOT / "docs/data/task_suite_20.json"
	# Optional background image layered behind the infographic content.
	DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
	# Sample-episode locations probed by resolve_sample_dir (a sibling of the repo,
	# then a per-user Dropbox checkout under the home directory).
	DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
	DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
	DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
	# Page size in CSS pixels; also passed to the screenshot tool as the viewport size.
	CANVAS_WIDTH = 1800
	CANVAS_HEIGHT = 5000
	# Default pixel size of every generated modality thumbnail.
	THUMB_WIDTH = 880
	THUMB_HEIGHT = 520
	# Modality name -> file name under docs/assets/modalities, used by build_html
	# when no thumbnail could be generated from the sample episode.
	MODALITY_ASSET_FALLBACKS = {
	"video": "video.jpg",
	"audio": "audio.png",
	"depth": "depth.jpg",
	"pose / SLAM": "pose_slam.png",
	"motion capture": "motion_capture.png",
	"inertial": "inertial.png",
	"language": "language.png",
	}


	# Task families shown on the infographic, in display order.
	# Per family:
	#   name  - heading text of the family section
	#   tone  - short label rendered as visible text next to the heading
	#   color - CSS colour exposed to the cards as --accent
	#   soft  - CSS colour exposed to the cards as --soft
	#   tasks - (task id, kind) pairs; the id is looked up in the summary and the
	#           kind is shown as a chip on the task card
	# NOTE(review): the tone words (e.g. "blue", "amber", "red") do not obviously
	# match the hex colours next to them, yet they are rendered on the page -
	# confirm the labels are intended.
	GROUPS = [
	{
	"name": "Action + Procedure",
	"tone": "teal",
	"color": "#9bdfff",
	"soft": "#071d20",
	"tasks": [
	("timeline_action", "supervised"),
	("timeline_subtask", "supervised"),
	("transition_detection", "diagnostic"),
	("next_action", "supervised"),
	],
	},
	{
	"name": "Motion + Objects",
	"tone": "blue",
	"color": "#ccffa0",
	"soft": "#10210a",
	"tasks": [
	("hand_trajectory_forecast", "forecast"),
	("contact_prediction", "supervised"),
	("object_relevance", "supervised"),
	("caption_grounding", "retrieval"),
	],
	},
	{
	"name": "Retrieval + Alignment",
	"tone": "amber",
	"color": "#7ae5c3",
	"soft": "#092019",
	"tasks": [
	("cross_modal_retrieval", "retrieval"),
	("modality_reconstruction", "forecast"),
	("temporal_order", "diagnostic"),
	("misalignment_detection", "diagnostic"),
	],
	},
	{
	"name": "Long-Horizon Semantics",
	"tone": "green",
	"color": "#d8f4a5",
	"soft": "#1b210d",
	"tasks": [
	("long_horizon_next_action", "forecast"),
	("next_subtask_forecast", "forecast"),
	("interaction_text_prediction", "language"),
	("action_object_relation", "relation"),
	],
	},
	{
	"name": "Future Sets + Sensors",
	"tone": "red",
	"color": "#b7ff91",
	"soft": "#1b210d",
	"tasks": [
	("object_set_forecast", "multi-label"),
	("imu_to_hand_pose", "regression"),
	("camera_view_sync_retrieval", "retrieval"),
	("time_to_transition", "regression"),
	],
	},
	]

	# Modality cards in display order. Each tuple is
	# (name, modality type, "Sample contains" text, "Current baseline use" text).
	# The name is also the key used to look up the card's thumbnail.
	MODALITIES = [
	("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
	("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
	("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
	("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
	("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
	("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
	("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
	]

	# Pairs of hand-joint indices that mocap_thumb connects with a line segment:
	# five chains of four segments, all starting at index 0.
	# NOTE(review): presumably index 0 is the wrist and each chain is one finger
	# of a 21-joint hand layout - confirm against the mocap joint order.
	HAND_EDGES = [
	(0, 1), (1, 2), (2, 3), (3, 4),
	(0, 5), (5, 6), (6, 7), (7, 8),
	(0, 9), (9, 10), (10, 11), (11, 12),
	(0, 13), (13, 14), (14, 15), (15, 16),
	(0, 17), (17, 18), (18, 19), (19, 20),
	]


	def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
	    """Encode an image as a base64 ``data:`` URI.

	    Args:
	        image: Object exposing ``save(file_obj, **kwargs)`` (a Pillow image).
	        fmt: Image format name. ``"JPEG"`` and ``"JPG"`` (any case) are both
	            written as JPEG.
	        quality: JPEG quality; ignored for every other format.

	    Returns:
	        A ``data:image/<subtype>;base64,...`` string.
	    """
	    is_jpeg = fmt.upper() in {"JPEG", "JPG"}
	    # Pillow registers its JPEG writer under the name "JPEG" only, so the
	    # "JPG" alias accepted above has to be normalized before saving.
	    save_kwargs = {"format": "JPEG" if is_jpeg else fmt}
	    if is_jpeg:
	        save_kwargs.update({"quality": quality, "optimize": True})
	    buffer = io.BytesIO()
	    image.save(buffer, **save_kwargs)
	    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
	    # Label the payload with the format that was actually written instead of
	    # calling every non-JPEG image a PNG.
	    mime = "jpeg" if is_jpeg else fmt.lower()
	    return f"data:image/{mime};base64,{encoded}"


	def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
	    """Return a new RGB image of ``size`` filled with ``color``."""
	    from PIL import Image as pil_image

	    blank = pil_image.new(mode="RGB", size=size, color=color)
	    return blank


	def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
	    """Convert ``image`` to RGB, then scale and centre-crop it to ``size``."""
	    from PIL import ImageOps

	    rgb_image = image.convert("RGB")
	    # method=3 is the integer value of Pillow's bicubic resampling filter.
	    fitted = ImageOps.fit(rgb_image, size, method=3, centering=(0.5, 0.5))
	    return fitted


	def read_video_frame(video_path: Path, frame_index: int = 2400):
	    """Decode one frame of a video file into an RGB Pillow image.

	    Args:
	        video_path: Path of the video to open.
	        frame_index: Zero-based frame to read. When the container reports a
	            frame count, the index is clamped into ``[0, count - 1]``.

	    Returns:
	        The decoded frame as a ``PIL.Image.Image`` with RGB channel order.

	    Raises:
	        RuntimeError: If the video cannot be opened or the frame cannot be read.
	    """
	    import cv2
	    from PIL import Image

	    cap = cv2.VideoCapture(str(video_path))
	    try:
	        if not cap.isOpened():
	            raise RuntimeError(f"Could not open video: {video_path}")
	        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
	        if total:
	            frame_index = max(0, min(frame_index, total - 1))
	        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
	        ok, frame = cap.read()
	    finally:
	        # Release the capture handle on every path, including the failure
	        # branches above and any exception raised while seeking or decoding.
	        cap.release()
	    if not ok:
	        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
	    # OpenCV returns BGR; Pillow expects RGB.
	    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	    return Image.fromarray(frame)


	def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
	    """Draw ``text`` at ``xy`` on ``draw``.

	    Uses the Arial Bold font file at a fixed system path when it loads, and
	    Pillow's built-in default font otherwise.
	    """
	    from PIL import ImageFont

	    font_path = "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
	    try:
	        label_font = ImageFont.truetype(font_path, size)
	    except Exception:
	        # Any failure to load the font falls back to the default font.
	        label_font = ImageFont.load_default()
	    draw.text(xy, text, fill=fill, font=label_font)


	def video_thumb(sample_dir: Path) -> str:
	    """Build the side-by-side fisheye / stereo thumbnail as a JPEG data URI.

	    The left panel is frame 2450 of ``fisheye_cam0.mp4``. The right panel is
	    the same frame of ``stereo_left.mp4`` when that file exists, otherwise a
	    copy of the fisheye panel.
	    """
	    from PIL import ImageDraw

	    gutter = 18
	    half_width = (THUMB_WIDTH - gutter) // 2
	    panel_size = (half_width, THUMB_HEIGHT)
	    right_x = half_width + gutter

	    fisheye_panel = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", 2450), panel_size)
	    stereo_file = sample_dir / "stereo_left.mp4"
	    if stereo_file.exists():
	        stereo_panel = fit_image(read_video_frame(stereo_file, 2450), panel_size)
	    else:
	        stereo_panel = fisheye_panel.copy()

	    canvas = make_canvas()
	    canvas.paste(fisheye_panel, (0, 0))
	    canvas.paste(stereo_panel, (right_x, 0))

	    draw = ImageDraw.Draw(canvas, "RGBA")
	    # Dark strip over the gutter, overlapping each panel by 4 px.
	    draw.rounded_rectangle((half_width - 4, 0, right_x + 4, THUMB_HEIGHT), radius=0, fill=(2, 5, 2, 220))
	    for label_x, caption in ((18, "fisheye"), (right_x + 18, "stereo")):
	        draw_label(draw, (label_x, 20), caption, fill=(255, 255, 255), size=22)
	    return image_data_uri(canvas, "JPEG")


	def colorize(values):
	    """Map values in ``[0, 1]`` onto a five-stop dark-to-light green palette.

	    Args:
	        values: Array-like of floats. Values outside ``[0, 1]`` are clamped.
	            NaN and ``-inf`` map to the first stop, ``+inf`` to the last.

	    Returns:
	        ``uint8`` array shaped like ``values`` with a trailing RGB axis.
	    """
	    import numpy as np

	    stops = np.array([
	        [2, 5, 2],
	        [58, 136, 102],
	        [122, 229, 195],
	        [167, 240, 120],
	        [216, 244, 165],
	    ], dtype=np.float32)
	    # NaN passes straight through np.clip and turns into a garbage index once
	    # floor() is cast to int, so neutralize non-finite inputs first.
	    finite = np.nan_to_num(values, nan=0.0, posinf=1.0, neginf=0.0)
	    x = np.clip(finite, 0, 1)
	    scaled = x * (len(stops) - 1)
	    lo = np.floor(scaled).astype(int)
	    # At x == 1 the upper stop would be one past the end; clamp it.
	    hi = np.clip(lo + 1, 0, len(stops) - 1)
	    frac = scaled - lo
	    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
	    return rgb.astype("uint8")


	def depth_thumb(h5) -> str:
	    """Build the depth / confidence thumbnail as a JPEG data URI.

	    Reads entry 2450 of ``depth/depth`` and ``depth/confidence`` from ``h5``.
	    The depth frame is stretched between its 3rd and 97th percentiles
	    (computed over finite values) and coloured with ``colorize``; the
	    confidence frame is shown as greyscale.
	    """
	    import numpy as np
	    from PIL import Image, ImageDraw

	    gutter = 18
	    half_width = (THUMB_WIDTH - gutter) // 2
	    panel_size = (half_width, THUMB_HEIGHT)
	    right_x = half_width + gutter

	    depth_frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
	    finite_mask = np.isfinite(depth_frame)
	    p_low, p_high = np.percentile(depth_frame[finite_mask], [3, 97])
	    stretched = (depth_frame - p_low) / max(p_high - p_low, 1e-6)
	    depth_panel = fit_image(Image.fromarray(colorize(stretched)), panel_size)

	    confidence = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
	    confidence_rgb = Image.fromarray(confidence, mode="L").convert("RGB")
	    confidence_panel = fit_image(confidence_rgb, panel_size)

	    canvas = make_canvas()
	    canvas.paste(depth_panel, (0, 0))
	    canvas.paste(confidence_panel, (right_x, 0))

	    draw = ImageDraw.Draw(canvas, "RGBA")
	    # Translucent backdrops behind the two captions.
	    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=(2, 5, 2, 178))
	    draw.rounded_rectangle((right_x, 0, right_x + 220, 44), radius=8, fill=(2, 5, 2, 178))
	    for label_x, caption in ((14, "depth"), (right_x + 14, "confidence")):
	        draw_label(draw, (label_x, 11), caption, fill=(255, 255, 255), size=22)
	    return image_data_uri(canvas, "JPEG")


	def audio_thumb(sample_dir: Path) -> str:
	    """Build the audio thumbnail as a PNG data URI.

	    Decodes six seconds of mono 16 kHz audio, starting at 45 s, from
	    ``fisheye_cam0.mp4`` with ``ffmpeg``. It then draws per-bin RMS bars plus
	    a per-bin mean trace. If decoding or drawing fails for any reason, a
	    warning is printed and a fixed placeholder bar pattern is drawn instead,
	    so the thumbnail is always produced.
	    """
	    import numpy as np
	    from PIL import ImageDraw

	    canvas = make_canvas()
	    draw = ImageDraw.Draw(canvas, "RGBA")
	    try:
	        raw = subprocess.run(
	            [
	                "ffmpeg",
	                "-v",
	                "error",
	                "-ss",
	                "45",
	                "-t",
	                "6",
	                "-i",
	                str(sample_dir / "fisheye_cam0.mp4"),
	                "-ac",
	                "1",
	                "-ar",
	                "16000",
	                "-f",
	                "s16le",
	                "pipe:1",
	            ],
	            check=True,
	            stdout=subprocess.PIPE,
	        ).stdout
	        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
	        if len(samples) == 0:
	            raise RuntimeError("empty audio stream")
	        # Peak-normalize; the floor of 1.0 avoids dividing by zero on silence.
	        samples = samples / max(float(np.max(np.abs(samples))), 1.0)
	        bins = 220
	        trimmed = samples[: bins * max(1, len(samples) // bins)]
	        chunks = np.array_split(trimmed, bins)
	        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
	        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])
	        baseline = THUMB_HEIGHT - 72
	        for i, value in enumerate(rms):
	            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
	            h = 14 + np.clip(value * 158, 0, 158)
	            draw.line((x, baseline, x, baseline - h), fill=(167, 240, 120, 170), width=2)
	        points = []
	        for i, value in enumerate(waveform):
	            x = 18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36)
	            y = 126 - np.clip(value, -1, 1) * 82
	            points.append((x, y))
	        draw.line(points, fill=(122, 229, 195, 220), width=2)
	    except Exception as exc:
	        # Best-effort by design (ffmpeg may be missing or the file may have no
	        # audio), but say so: otherwise the placeholder bars look like data.
	        print(f"Warning: could not render audio waveform, using placeholder bars: {exc}")
	        for i in range(48):
	            x = 22 + i * 8
	            h = 16 + (i % 7) * 7
	            draw.rounded_rectangle((x, THUMB_HEIGHT - 72 - h, x + 4, THUMB_HEIGHT - 72), radius=2, fill=(167, 240, 120, 170))
	    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
	    return image_data_uri(canvas, "PNG")


	def normalize_points(points, width, height, pad=16):
	    """Scale the first two columns of ``points`` into a padded pixel box.

	    Each column is stretched between its 2nd and 98th percentiles, clamped to
	    ``[0, 1]``, and mapped onto ``[pad, width - pad]`` horizontally and
	    ``[pad, height - pad]`` vertically. The second column is flipped so larger
	    values land nearer the top. The input array is not modified.
	    """
	    import numpy as np

	    planar = points[:, :2].copy()
	    low, high = (np.percentile(planar, q, axis=0) for q in (2, 98))
	    # Guard against a zero-width range when all points coincide on an axis.
	    unit = np.clip((planar - low) / np.maximum(high - low, 1e-6), 0, 1)
	    unit[:, 1] = 1 - unit[:, 1]

	    pixels = np.empty_like(unit)
	    for axis, extent in enumerate((width, height)):
	        pixels[:, axis] = pad + unit[:, axis] * (extent - pad * 2)
	    return pixels


	def slam_thumb(h5) -> str:
	    """Build the SLAM point-cloud / camera-trajectory thumbnail as a PNG data URI.

	    Plots columns 0 and 2 of ``slam/point_cloud`` (finite rows only, thinned
	    to at most 2600 points), coloured by column 1. On top it draws
	    ``slam/trans_xyz`` sampled every 36th entry up to index 2450.
	    """
	    import numpy as np
	    from PIL import ImageDraw

	    canvas = make_canvas()
	    draw = ImageDraw.Draw(canvas, "RGBA")

	    cloud = np.array(h5["slam/point_cloud"], dtype=np.float64)
	    cloud = cloud[np.isfinite(cloud).all(axis=1)]
	    max_points = 2600
	    if len(cloud) > max_points:
	        # Evenly spaced subsample so the plot stays fast and readable.
	        keep = np.linspace(0, len(cloud) - 1, max_points).astype(int)
	        cloud = cloud[keep]
	    cloud_xy = normalize_points(cloud[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)

	    axis1 = cloud[:, 1]
	    axis1_low = np.percentile(axis1, 2)
	    axis1_high = np.percentile(axis1, 98)
	    shades = colorize((axis1 - axis1_low) / max(axis1_high - axis1_low, 1e-6))
	    radius = 1.2
	    for (px, py), shade in zip(cloud_xy, shades):
	        draw.ellipse((px - radius, py - radius, px + radius, py + radius), fill=tuple(shade.tolist()) + (165,))

	    # NOTE(review): the trajectory is normalized against its own percentiles,
	    # not the point cloud's, so the two layers do not share a scale - confirm
	    # this is intended.
	    trajectory = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
	    trajectory_xy = normalize_points(trajectory[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
	    for start, end in zip(trajectory_xy[:-1], trajectory_xy[1:]):
	        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 205), width=2)

	    draw_label(draw, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
	    return image_data_uri(canvas, "PNG")


	def imu_thumb(h5) -> str:
	    """Build the accelerometer / gyroscope trace thumbnail as a PNG data URI.

	    Takes a window of up to 220 IMU samples either side of the index stored
	    at ``imu/keyframe_indices[2450]``. Each of the six channels is drawn as
	    its own trace, stretched between its 3rd and 97th percentiles.
	    """
	    import numpy as np
	    from PIL import ImageDraw

	    canvas = make_canvas()
	    draw = ImageDraw.Draw(canvas, "RGBA")

	    center = int(h5["imu/keyframe_indices"][2450])
	    first, last = max(0, center - 220), center + 220
	    accel = np.array(h5["imu/accel_xyz"][first:last], dtype=np.float64)
	    gyro = np.array(h5["imu/gyro_xyz"][first:last], dtype=np.float64)
	    channels = [accel[:, axis] for axis in range(3)] + [gyro[:, axis] for axis in range(3)]
	    palette = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]

	    # Faint horizontal guide lines.
	    for guide_y in (68 + row * 44 for row in range(6)):
	        draw.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(167, 240, 120, 48), width=1)

	    plot_width = THUMB_WIDTH - 36
	    plot_height = THUMB_HEIGHT - 116
	    for channel, rgb in zip(channels, palette):
	        channel = channel[:420]
	        count = len(channel)
	        if count < 2:
	            continue
	        p_low, p_high = np.percentile(channel, [3, 97])
	        unit = np.clip((channel - p_low) / max(p_high - p_low, 1e-6), 0, 1)
	        last_index = max(count - 1, 1)
	        trace = [
	            (18 + i / last_index * plot_width, THUMB_HEIGHT - 48 - level * plot_height)
	            for i, level in enumerate(unit)
	        ]
	        draw.line(trace, fill=rgb + (200,), width=2)

	    draw_label(draw, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
	    return image_data_uri(canvas, "PNG")


	def mocap_thumb(h5) -> str:
	    """Build the body / hand motion-capture thumbnail as a PNG data URI.

	    Reads entry 2450 of the body keypoints and of both hands' 3D joints. The
	    first two coordinates of all points share one scale (2nd to 98th
	    percentile of the combined set). The body and each hand are then drawn in
	    their own horizontal band.
	    """
	    import numpy as np
	    from PIL import ImageDraw

	    canvas = make_canvas()
	    draw = ImageDraw.Draw(canvas, "RGBA")

	    body = np.array(h5["full_body_mocap/keypoints"][2450], dtype=np.float32)
	    left = np.array(h5["hand_mocap/left_joints_3d"][2450], dtype=np.float32)
	    right = np.array(h5["hand_mocap/right_joints_3d"][2450], dtype=np.float32)

	    combined_xy = np.concatenate([body, left, right], axis=0)[:, :2]
	    lo = np.percentile(combined_xy, 2, axis=0)
	    hi = np.percentile(combined_xy, 98, axis=0)
	    span = np.maximum(hi - lo, 1e-6)
	    dot = 2.4

	    def project(points, x_offset, width):
	        # Shared normalization, vertical flip, then placement in a band that
	        # starts at x_offset and is `width` pixels wide.
	        unit = (points[:, :2] - lo) / span
	        screen = np.empty_like(unit)
	        screen[:, 0] = x_offset + unit[:, 0] * width
	        screen[:, 1] = 72 + (1 - unit[:, 1]) * (THUMB_HEIGHT - 136)
	        return screen

	    body_xy = project(body, 28, 270)
	    for px, py in body_xy:
	        draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=(167, 240, 120, 185))
	    # Faint lines join consecutive body keypoints in array order.
	    for start, end in zip(body_xy[:-1], body_xy[1:]):
	        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 82), width=1)

	    hands = [(left, 392, (122, 229, 195)), (right, 562, (216, 244, 165))]
	    for joints, x_offset, rgb in hands:
	        joints_xy = project(joints, x_offset, 126)
	        for src, dst in HAND_EDGES:
	            draw.line((joints_xy[src][0], joints_xy[src][1], joints_xy[dst][0], joints_xy[dst][1]), fill=rgb + (180,), width=2)
	        for px, py in joints_xy:
	            draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=rgb + (220,))

	    draw_label(draw, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
	    return image_data_uri(canvas, "PNG")


	def text_thumb(h5) -> str:
	    """Build the language-annotation thumbnail as a PNG data URI.

	    Parses the JSON stored under ``caption`` and uses its first segment: up to
	    five distinct object names (sorted) are drawn as chips on the left, and up
	    to two ``Current Action`` labels as boxes on the right. Labels longer than
	    66 characters are cut and suffixed with ``...``.
	    """
	    from PIL import ImageDraw

	    width = THUMB_WIDTH
	    payload = h5["caption"][()]
	    if isinstance(payload, bytes):
	        payload = payload.decode("utf-8", errors="replace")
	    first_segment = json.loads(payload)["segments"][0]

	    object_names = sorted(set().union(*first_segment.get("objects", {}).values()))[:5]
	    action_labels = [entry.get("label", "") for entry in first_segment.get("Current Action", [])][:2]

	    canvas = make_canvas((width, THUMB_HEIGHT))
	    draw = ImageDraw.Draw(canvas, "RGBA")
	    draw_label(draw, (28, 24), "language annotation", fill=(244, 248, 239), size=28)

	    # Object chips: stacked down the left edge, sized from the label length.
	    for row, name in enumerate(object_names):
	        top = 82 + row * 47
	        chip_width = 52 + len(name) * 16
	        draw.rounded_rectangle((28, top, 28 + chip_width, top + 38), radius=8, fill=(7, 18, 7, 235), outline=(167, 240, 120, 170), width=2)
	        draw_label(draw, (44, top + 8), name, fill=(244, 248, 239), size=18)

	    # Action boxes: stacked in the right-hand column.
	    left_edge = 340
	    for row, action in enumerate(action_labels):
	        top = 92 + row * 68
	        shown = action[:66]
	        if len(action) > 66:
	            shown += "..."
	        draw.rounded_rectangle((left_edge, top, width - 28, top + 54), radius=9, fill=(7, 18, 7, 235), outline=(122, 229, 195, 180), width=2)
	        draw_label(draw, (left_edge + 22, top + 15), shown, fill=(244, 248, 239), size=20)
	    return image_data_uri(canvas, "PNG")


	def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
	    """Generate one thumbnail data URI per modality from the sample episode.

	    Args:
	        sample_dir: Directory expected to contain ``fisheye_cam0.mp4`` and
	            ``annotation.hdf5``; may be ``None``.

	    Returns:
	        Mapping of modality name to data URI. The mapping is empty when the
	        directory or either required file is missing, or when any thumbnail
	        fails to build (a warning is printed in that case).
	    """
	    if sample_dir is None or not sample_dir.exists():
	        return {}
	    hdf5_path = sample_dir / "annotation.hdf5"
	    required = [sample_dir / "fisheye_cam0.mp4", hdf5_path]
	    if not all(path.exists() for path in required):
	        return {}
	    try:
	        # Imported lazily so the script still runs without h5py when no
	        # sample episode is available.
	        import h5py

	        thumbnails = {"video": video_thumb(sample_dir), "audio": audio_thumb(sample_dir)}
	        with h5py.File(hdf5_path, "r") as h5:
	            thumbnails.update({
	                "depth": depth_thumb(h5),
	                "pose / SLAM": slam_thumb(h5),
	                "motion capture": mocap_thumb(h5),
	                "inertial": imu_thumb(h5),
	                "language": text_thumb(h5),
	            })
	        return thumbnails
	    except Exception as exc:
	        # All-or-nothing: one failing modality discards the whole set so the
	        # caller falls back to the static assets for every card.
	        print(f"Warning: could not build sample modality thumbnails: {exc}")
	        return {}


	def valid_sample_dir(sample_dir: Path | None) -> bool:
	    """Return True when ``sample_dir`` holds both files the thumbnails need.

	    The required files are ``annotation.hdf5`` and ``fisheye_cam0.mp4``.
	    ``None`` is never valid.
	    """
	    if sample_dir is None:
	        return False
	    return (sample_dir / "annotation.hdf5").exists() and (sample_dir / "fisheye_cam0.mp4").exists()


	def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
	    """Pick the first usable sample-episode directory.

	    Candidates are tried in this order and the first one accepted by
	    ``valid_sample_dir`` wins:

	    1. ``$XPERIENCE10M_SAMPLE_DIR``
	    2. ``$WORKSPACE/data/sample/xperience-10m-sample``
	    3. the ``sample_dir`` argument
	    4. ``DEFAULT_SAMPLE_DIR``
	    5. ``DROPBOX_SAMPLE_DIR``

	    Args:
	        sample_dir: Caller-supplied directory, or ``None``.

	    Returns:
	        The first valid candidate, or ``sample_dir`` unchanged (which may be
	        ``None`` or invalid) when no candidate is valid.
	    """
	    candidates: list[Path] = []
	    # NOTE(review): the environment variables are consulted before the
	    # explicit argument, so they override a --sample-dir given on the command
	    # line - confirm that precedence is intended.
	    env_sample_dir = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
	    if env_sample_dir:
	        candidates.append(Path(env_sample_dir).expanduser())
	    workspace = os.environ.get("WORKSPACE")
	    if workspace:
	        candidates.append(Path(workspace).expanduser() / "data/sample/xperience-10m-sample")
	    if sample_dir is not None:
	        candidates.append(sample_dir)
	    candidates.extend([
	        DEFAULT_SAMPLE_DIR,
	        DROPBOX_SAMPLE_DIR,
	    ])
	    for candidate in candidates:
	        if valid_sample_dir(candidate):
	            return candidate
	    return sample_dir


	def load_summary() -> dict:
	    """Read and parse the UTF-8 JSON summary at ``SUMMARY_PATH``."""
	    with SUMMARY_PATH.open("r", encoding="utf-8") as handle:
	        return json.load(handle)


	def fmt(value: float) -> str:
	    """Render ``value`` as a float with exactly four decimal places."""
	    return format(float(value), ".4f")


	def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
	if "minimal_primary_metric" in metrics:
	label = metrics.get("metric_name") or metrics.get("metric_key") or "score"
	value = metrics.get("minimal_primary_metric")
	return str(label), "n/a" if value is None else fmt(value)
	if task_name == "hand_trajectory_forecast":
	return "MPJPE", fmt(metrics["mpjpe"])
	if task_name == "cross_modal_retrieval":
	return "top-5", fmt(metrics["top5_accuracy"])
	if task_name == "caption_grounding":
	return "MRR", fmt(metrics["mrr"])
	if task_name == "object_relevance":
	return "micro-F1", fmt(metrics["micro_f1"])
	if task_name == "modality_reconstruction":
	return "R2", fmt(metrics["r2"])
	if task_name in {"temporal_order", "misalignment_detection"}:
	return "F1", fmt(metrics["f1"])
	if "macro_f1" in metrics:
	return "macro-F1", fmt(metrics["macro_f1"])
	if "accuracy" in metrics:
	return "accuracy", fmt(metrics["accuracy"])
	raise KeyError(f"No main metric configured for {task_name}")


	def short_io(task_name: str, metrics: dict) -> str:
	    """Return the one-line "input -> output" description for a task card.

	    When ``metrics`` has a truthy ``input_short`` or ``output_short``, the two
	    are joined, with ``"input"`` / ``"target"`` standing in for a missing
	    side. Otherwise the built-in description for the task is used, falling
	    back to ``metrics["input"]`` and finally to an empty string.
	    """
	    source = metrics.get("input_short")
	    target = metrics.get("output_short")
	    if source or target:
	        return f"{source or 'input'} -> {target or 'target'}"

	    built_in = {
	        "timeline_action": "all featurized modalities -> action label",
	        "timeline_subtask": "all featurized modalities -> subtask label",
	        "transition_detection": "all featurized modalities -> boundary vs steady",
	        "next_action": "window at t -> action at t+20 frames",
	        "hand_trajectory_forecast": "all featurized modalities -> future hand joints",
	        "contact_prediction": "non-contact modalities -> contact state",
	        "object_relevance": "non-caption feature blocks -> relevant objects",
	        "caption_grounding": "text query -> matching sensor window",
	        "cross_modal_retrieval": "motion / IMU / camera -> depth / video match",
	        "modality_reconstruction": "motion / IMU / camera -> depth / video vector",
	        "temporal_order": "two adjacent windows -> correct order",
	        "misalignment_detection": "motion + visual pair -> aligned or shifted",
	    }
	    if task_name in built_in:
	        return built_in[task_name]
	    return metrics.get("input", "")


	def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
	    """Render the HTML ``<article>`` for one task.

	    Args:
	        task_name: Task id, used for metric selection and the fallback title.
	        kind: Short task-type label shown as a chip.
	        metrics: The task's row from the summary.
	        group: Family entry supplying the ``color`` and ``soft`` CSS colours.
	        index: 1-based card number, shown zero-padded to two digits.
	        neural_metrics: Optional separate metrics row for the neural baseline.
	            Used only when ``metrics`` carries no ``neural_primary_metric``
	            value and the row has no ``error`` key.

	    Returns:
	        HTML markup with all text values escaped.
	    """
	    label, value = metric_for(task_name, metrics)

	    # Work out the optional neural-baseline chip once, then render it once.
	    neural_label = neural_value = None
	    if metrics.get("neural_primary_metric") is not None:
	        neural_label = metrics.get("metric_name") or metrics.get("metric_key") or "score"
	        neural_value = fmt(metrics["neural_primary_metric"])
	    elif neural_metrics and "error" not in neural_metrics:
	        neural_label, neural_value = metric_for(task_name, neural_metrics)

	    neural_html = ""
	    if neural_value is not None:
	        neural_html = f"""
	<div class="metric neural">
	<span>NN {html.escape(str(neural_label))}</span>
	<strong>{html.escape(neural_value)}</strong>
	</div>
	"""
	    # Named io_text rather than io so the imported io module is not shadowed.
	    io_text = short_io(task_name, metrics)
	    return f"""
	<article class="task-card" style="--accent:{group['color']};--soft:{group['soft']};">
	<div class="task-meta">
	<span class="index">{index:02d}</span>
	<span class="kind">{html.escape(kind)}</span>
	</div>
	<h3>{html.escape(metrics.get("task_display_name") or task_display_name(task_name))}</h3>
	<p>{html.escape(io_text)}</p>
	<div class="metric">
	<span>min {html.escape(label)}</span>
	<strong>{html.escape(value)}</strong>
	</div>
	{neural_html}
	</article>
	"""


	def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str:
	    """Render the HTML ``<article>`` for one modality.

	    Args:
	        name: Modality name shown as the card heading.
	        modality_type: Short type label shown beside the heading.
	        sample_text: Text for the "Sample contains" row.
	        feature_text: Text for the "Current baseline use" row.
	        index: 1-based card number, shown zero-padded to two digits.
	        thumbnail: Image URL (data or file URI) for the card, or ``None`` /
	            empty to omit the thumbnail.

	    Returns:
	        HTML markup with all inserted values escaped.
	    """
	    thumb_html = ""
	    if thumbnail:
	        # Escaped like every other inserted value so an unusual URI cannot
	        # break out of the attribute.
	        thumb_html = f'<div class="modality-thumb"><img src="{html.escape(thumbnail, quote=True)}" alt=""></div>'
	    return f"""
	<article class="modality">
	<div class="modality-heading">
	<div>
	<span class="modality-index">{index:02d}</span>
	<h3>{html.escape(name)}</h3>
	</div>
	<span class="modality-type">{html.escape(modality_type)}</span>
	</div>
	{thumb_html}
	<div class="modality-copy">
	<div class="modality-row">
	<span>Sample contains</span>
	<p>{html.escape(sample_text)}</p>
	</div>
	<div class="modality-row">
	<span>Current baseline use</span>
	<p>{html.escape(feature_text)}</p>
	</div>
	</div>
	</article>
	"""


	def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str:
	    """Assemble the complete infographic HTML document.

	    Args:
	        summary: Parsed summary JSON. Two layouts are accepted. In the first,
	            ``tasks`` is a list of rows keyed by ``task_id``, with the scope
	            numbers under ``dataset_scope``. In the second, ``tasks`` is a
	            mapping from task id to metrics, with the scope numbers at the
	            top level and an optional ``neural_tasks`` mapping.
	        base_image: Optional background image; ignored when ``None`` or
	            missing on disk.
	        sample_dir: Sample-episode directory used to generate modality
	            thumbnails. Static assets under ``docs/assets/modalities`` are
	            used for any modality without a generated thumbnail.

	    Returns:
	        The HTML text of the page.

	    Raises:
	        KeyError: If a task listed in ``GROUPS`` is absent from ``summary``.
	    """
	    if isinstance(summary.get("tasks"), list):
	        task_rows = summary["tasks"]
	        suite = {task["task_id"]: task for task in task_rows}
	        neural_suite = {}
	        dataset_scope = summary.get("dataset_scope", {})
	        num_frames = int(dataset_scope.get("num_frames", 0))
	        num_windows = int(dataset_scope.get("num_windows", 0))
	        feature_dim = int(dataset_scope.get("feature_dim", 0))
	        window_frames = int(dataset_scope.get("window_frames", 20))
	        stride_frames = int(dataset_scope.get("stride_frames", 5))
	        task_count = int(summary.get("task_count", len(suite)))
	        # NOTE(review): hard-coded to match the "180-result matrix" wording in
	        # the template - confirm it should not be derived from the summary.
	        scored_records = 180
	    else:
	        suite = summary["tasks"]
	        neural_suite = summary.get("neural_tasks", {})
	        num_frames = int(summary["num_frames"])
	        num_windows = int(summary["num_windows"])
	        feature_dim = int(summary["feature_dim"])
	        window_frames = int(summary.get("window_frames", 20))
	        stride_frames = int(summary.get("stride_frames", 5))
	        task_count = len(suite)
	        scored_records = len(suite) + len(neural_suite)

	    # Fail fast, and name every missing task, before the slower thumbnail
	    # generation starts.
	    missing_tasks = [task_name for group in GROUPS for task_name, _ in group["tasks"] if task_name not in suite]
	    if missing_tasks:
	        raise KeyError(f"Summary is missing tasks required by the infographic: {missing_tasks}")

	    thumbnails = load_sample_thumbnails(sample_dir)
	    for modality_name, asset_name in MODALITY_ASSET_FALLBACKS.items():
	        if thumbnails.get(modality_name):
	            continue
	        fallback = ROOT / "docs/assets/modalities" / asset_name
	        if fallback.exists():
	            thumbnails[modality_name] = fallback.resolve().as_uri()
	    base_layer = ""
	    if base_image is not None and base_image.exists():
	        base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>'
	    # NOTE(review): six entries are rendered into the five-column `.stats`
	    # grid below, so the last one wraps onto a second row - confirm intended.
	    stats = [
	        (f"{num_frames:,}", "frames"),
	        (f"{num_windows:,}", "windows"),
	        (f"{feature_dim:,}", "features"),
	        (f"{task_count}", "unified tasks"),
	        (f"{scored_records}", "method-task results"),
	        ("70/30", "chronological split"),
	    ]
	    stats_html = "".join(
	        f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>"
	        for value, label in stats
	    )
	    modalities_html = "".join(
	        modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name))
	        for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1)
	    )

	    # Card numbers run continuously across all families.
	    task_index = 1
	    families = []
	    for group in GROUPS:
	        cards = []
	        for task_name, kind in group["tasks"]:
	            cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name)))
	            task_index += 1
	        families.append(
	            f"""
	<section class="family" style="--accent:{group['color']};--soft:{group['soft']};">
	<div class="family-head">
	<span>{html.escape(group['tone'])}</span>
	<h2>{html.escape(group['name'])}</h2>
	</div>
	<div class="family-cards">{''.join(cards)}</div>
	</section>
	"""
	        )

	    return f"""<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1">
	<title>Xperience-10M 20-Task Episode Suite Infographic</title>
	<style>
	* {{ box-sizing: border-box; }}
	html,
	body {{
	margin: 0;
	width: {CANVAS_WIDTH}px;
	height: {CANVAS_HEIGHT}px;
	background: #020502;
	}}
	body {{
	font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
	color: #f4f8ef;
	text-rendering: optimizeLegibility;
	}}
	.canvas {{
	position: relative;
	width: {CANVAS_WIDTH}px;
	height: {CANVAS_HEIGHT}px;
	overflow: hidden;
	padding: 54px 64px 44px;
	background:
	radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%),
	radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px),
	#020502;
	background-size: auto, 18px 18px, auto;
	}}
	.image-background {{
	position: absolute;
	inset: 0;
	background-position: center;
	background-repeat: no-repeat;
	background-size: cover;
	opacity: 0.36;
	filter: saturate(1.05) contrast(1.08) brightness(0.42);
	}}
	.content {{
	position: relative;
	z-index: 1;
	}}
	.header {{
	display: grid;
	grid-template-columns: 1.25fr 0.75fr;
	gap: 44px;
	align-items: end;
	padding-bottom: 30px;
	border-bottom: 1px solid rgba(167,240,120,0.20);
	}}
	.kicker {{
	display: inline-flex;
	align-items: center;
	gap: 12px;
	color: #ccffa0;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 15px;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	}}
	.kicker::before {{
	content: "";
	width: 44px;
	height: 1px;
	background: #ccffa0;
	}}
	h1 {{
	margin: 18px 0 0;
	max-width: 930px;
	font-size: 72px;
	line-height: 0.95;
	letter-spacing: 0;
	}}
	.subtitle {{
	margin: 18px 0 0;
	max-width: 900px;
	color: #dce8d7;
	font-size: 23px;
	line-height: 1.35;
	font-weight: 520;
	}}
	.stats {{
	display: grid;
	grid-template-columns: repeat(5, minmax(0, 1fr));
	gap: 10px;
	}}
	.stat {{
	min-height: 78px;
	padding: 14px 15px;
	border: 1px solid rgba(167,240,120,0.24);
	background: rgba(7,18,7,0.80);
	border-radius: 8px;
	}}
	.stat strong {{
	display: block;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 25px;
	line-height: 1;
	font-variant-numeric: tabular-nums;
	}}
	.stat span {{
	display: block;
	margin-top: 8px;
	color: #a5afa2;
	font-size: 13px;
	line-height: 1.15;
	}}
	.section-label {{
	display: grid;
	grid-template-columns: 1fr;
	gap: 12px;
	align-items: start;
	margin: 44px 0 24px;
	color: #a5afa2;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 22px;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	}}
	.section-label span:last-child {{
	max-width: 1400px;
	color: #dce8d7;
	text-transform: none;
	letter-spacing: 0;
	font-family: inherit;
	font-size: 21px;
	line-height: 1.42;
	text-align: left;
	}}
	.modalities {{
	display: grid;
	grid-template-columns: repeat(2, minmax(0, 1fr));
	gap: 24px;
	}}
	.modality {{
	min-height: 254px;
	padding: 22px;
	border: 1px solid rgba(167,240,120,0.22);
	background: rgba(7,18,7,0.84);
	border-radius: 8px;
	display: grid;
	grid-template-columns: 310px minmax(0, 1fr);
	grid-template-areas:
	"thumb heading"
	"thumb copy";
	column-gap: 24px;
	row-gap: 16px;
	align-items: start;
	}}
	.modality-thumb {{
	grid-area: thumb;
	height: 210px;
	overflow: hidden;
	border: 1px solid rgba(167,240,120,0.16);
	border-radius: 8px;
	background: #020502;
	}}
	.modality-thumb img {{
	display: block;
	width: 100%;
	height: 100%;
	object-fit: cover;
	}}
	.modality-index,
	.index {{
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-variant-numeric: tabular-nums;
	}}
	.modality-heading {{
	grid-area: heading;
	display: flex;
	align-items: start;
	justify-content: space-between;
	gap: 16px;
	padding-bottom: 14px;
	border-bottom: 1px solid rgba(167,240,120,0.16);
	}}
	.modality-index {{
	color: #a5afa2;
	font-size: 18px;
	}}
	.modality-type {{
	color: #ccffa0;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 13px;
	line-height: 1.15;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	text-align: right;
	max-width: 210px;
	padding-top: 4px;
	}}
	.modality h3 {{
	margin: 8px 0 0;
	font-size: 36px;
	line-height: 1.02;
	text-transform: uppercase;
	}}
	.modality-copy {{
	grid-area: copy;
	display: grid;
	grid-template-columns: repeat(2, minmax(0, 1fr));
	gap: 12px;
	}}
	.modality-row {{
	display: grid;
	grid-template-columns: 1fr;
	gap: 8px;
	align-items: baseline;
	padding: 14px 16px;
	border: 1px solid rgba(167,240,120,0.16);
	border-radius: 8px;
	background: rgba(2,5,2,0.40);
	}}
	.modality-row span {{
	display: block;
	color: #a5afa2;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 12px;
	letter-spacing: 0.06em;
	line-height: 1.25;
	text-transform: uppercase;
	}}
	.modality-row p {{
	margin: 0;
	color: #dce8d7;
	font-size: 21px;
	font-weight: 650;
	line-height: 1.2;
	}}
	.shared-band {{
	display: grid;
	grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr;
	gap: 12px;
	align-items: center;
	margin-top: 30px;
	padding: 14px;
	border: 1px solid rgba(167,240,120,0.22);
	background: rgba(7,18,7,0.72);
	border-radius: 8px;
	}}
	.step {{
	min-height: 62px;
	padding: 13px 15px;
	background: rgba(7,18,7,0.92);
	border: 1px solid rgba(167,240,120,0.16);
	border-radius: 8px;
	}}
	.step strong {{
	display: block;
	font-size: 17px;
	line-height: 1.1;
	}}
	.step span {{
	display: block;
	margin-top: 5px;
	color: #a5afa2;
	font-size: 13px;
	}}
	.arrow {{
	color: #ccffa0;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 22px;
	}}
	.families {{
	display: grid;
	grid-template-columns: repeat(2, minmax(0, 1fr));
	gap: 24px;
	margin-top: 30px;
	}}
	.family {{
	padding: 20px;
	border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
	background: rgba(7,18,7,0.82);
	border-radius: 8px;
	}}
	.family-head {{
	display: flex;
	align-items: end;
	justify-content: space-between;
	gap: 16px;
	min-height: 66px;
	padding-bottom: 16px;
	border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502);
	}}
	.family-head span {{
	color: var(--accent);
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 12px;
	text-transform: uppercase;
	letter-spacing: 0.08em;
	}}
	.family-head h2 {{
	margin: 0;
	color: var(--accent);
	font-size: 32px;
	line-height: 1.02;
	text-align: right;
	}}
	.family-cards {{
	display: grid;
	gap: 16px;
	margin-top: 18px;
	}}
	.task-card {{
	min-height: 178px;
	padding: 18px 20px;
	border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
	background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207));
	border-radius: 8px;
	}}
	.task-meta {{
	display: flex;
	align-items: center;
	justify-content: space-between;
	gap: 12px;
	}}
	.index {{
	color: #a5afa2;
	font-size: 12px;
	}}
	.kind {{
	display: inline-flex;
	align-items: center;
	height: 24px;
	padding: 0 9px;
	border-radius: 6px;
	border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502);
	color: var(--accent);
	background: rgba(2,5,2,0.48);
	text-transform: uppercase;
	font-size: 11px;
	line-height: 1;
	font-weight: 830;
	}}
	.task-card h3 {{
	margin: 12px 0 0;
	color: #f4f8ef;
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 21px;
	line-height: 1.18;
	overflow-wrap: anywhere;
	}}
	.task-card p {{
	margin: 11px 0 0;
	min-height: 39px;
	color: #dce8d7;
	font-size: 15px;
	line-height: 1.28;
	font-weight: 560;
	}}
	.metric {{
	display: inline-flex;
	align-items: baseline;
	gap: 10px;
	margin-top: 10px;
	min-height: 32px;
	padding: 7px 10px;
	border-radius: 8px;
	border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502);
	background: rgba(2,5,2,0.42);
	}}
	.metric.neural {{
	margin-left: 8px;
	border-color: rgba(255,255,255,0.20);
	background: rgba(255,255,255,0.08);
	}}
	.metric span {{
	color: #a5afa2;
	font-size: 13px;
	font-weight: 760;
	}}
	.metric strong {{
	color: var(--accent);
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	font-size: 20px;
	line-height: 1;
	font-weight: 860;
	font-variant-numeric: tabular-nums;
	}}
	.footer {{
	display: flex;
	align-items: center;
	justify-content: space-between;
	gap: 32px;
	margin-top: 22px;
	padding-top: 20px;
	border-top: 1px solid rgba(167,240,120,0.20);
	color: #a5afa2;
	font-size: 18px;
	line-height: 1.35;
	font-weight: 620;
	}}
	.footer code {{
	font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
	color: #020502;
	background: #ccffa0;
	border: 1px solid #ccffa0;
	border-radius: 7px;
	padding: 6px 9px;
	white-space: nowrap;
	}}
	</style>
	</head>
	<body>
	<main class="canvas" aria-label="Ropedia Xperience-10M unified 20-task infographic">
	{base_layer}
	<div class="content">
	<header class="header">
	<div>
	<div class="kicker">verified unified 20-task release</div>
	<h1>Ropedia Xperience-10M task map</h1>
	<p class="subtitle">A clean map from synchronized multimodal windows to 20 task contracts, comparing minimal heads, neural MLP heads, and the public 180-result matrix.</p>
	</div>
	<div class="stats">{stats_html}</div>
	</header>

	<section class="shared-band" aria-label="shared processing contract">
	<div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div>
	<div class="arrow">-></div>
	<div class="step"><strong>{window_frames}-frame windows</strong><span>stride {stride_frames}, chronological order</span></div>
	<div class="arrow">-></div>
	<div class="step"><strong>{feature_dim:,}-d vector</strong><span>current manifest includes audio features</span></div>
	<div class="arrow">-></div>
	<div class="step"><strong>20 task contracts</strong><span>minimal/NN baselines plus Qwen3-Omni/Cosmos3 diagnostics</span></div>
	</section>

	<div class="section-label">
	<span>20 task contracts</span>
	<span>Every task below is part of one unified public-sample suite with shared window/split discipline and source-linked scores in the 180-result matrix.</span>
	</div>
	<section class="families">{''.join(families)}</section>

	<div class="section-label">
	<span>Xperience-10M modalities</span>
	<span>Each public-sample stream is shown with a compact derived thumbnail, what the sample contains, and how the current baseline uses it. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span>
	</div>
	<section class="modalities">{modalities_html}</section>

	<footer class="footer">
	<span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span>
	<code>results/episode_task_suite/summary_report.json</code>
	</footer>
	</div>
	</main>
	</body>
	</html>
	"""


	def render_html(html_path: Path, output_path: Path) -> None:
	    """Screenshot ``html_path`` to ``output_path`` via ``npx playwright``.

	    Creates the output directory if needed. A non-zero exit status from the
	    command raises ``subprocess.CalledProcessError``.
	    """
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    viewport_flag = f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}"
	    command = [
	        "npx",
	        "--yes",
	        "playwright",
	        "screenshot",
	        "--full-page",
	        viewport_flag,
	        html_path.resolve().as_uri(),
	        str(output_path),
	    ]
	    subprocess.run(command, check=True)


	def main() -> int:
	    """Command-line entry point: build the HTML and, unless disabled, the PNG.

	    The HTML is written to ``--html`` when given, otherwise to a temporary
	    ``.html`` file that is kept on disk. Returns 0.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
	    parser.add_argument("--sample-dir", type=Path, default=DEFAULT_SAMPLE_DIR)
	    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
	    parser.add_argument("--html", type=Path)
	    parser.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
	    args = parser.parse_args()

	    html_text = build_html(load_summary(), args.base_image, resolve_sample_dir(args.sample_dir))

	    if args.html is not None:
	        html_path = args.html
	        html_path.parent.mkdir(parents=True, exist_ok=True)
	        html_path.write_text(html_text, encoding="utf-8")
	    else:
	        # delete=False: the file must outlive this block so it can be rendered
	        # and its path reported below.
	        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
	            handle.write(html_text)
	            html_path = Path(handle.name)

	    export_image = not args.no_export
	    if export_image:
	        render_html(html_path, args.output)
	        print(f"Wrote image: {args.output}")
	    print(f"Wrote render HTML: {html_path}")
	    return 0


	# When run as a script, execute main() and use its return value as the exit status.
	if __name__ == "__main__":
	raise SystemExit(main())