ropedia-xperience-10m-task-baselines / scripts /render_task_suite_infographic.py
cy0307's picture
Add files using upload-large-folder tool
33d2f83 verified
Raw
History Blame
40.9 kB
#!/usr/bin/env python3
"""
Render a polished Ropedia Xperience-10M 20-task infographic.
The task names, inputs, and metrics are read from docs/data/task_suite_20.json.
The output is a deterministic PNG rendered from HTML/CSS so the labels stay
legible and inspectable.
"""
from __future__ import annotations
import argparse
import base64
import html
import io
import json
import os
import subprocess
import tempfile
from pathlib import Path
from task_display import task_display_name
# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]
# JSON summary read by load_summary(); supplies the task rows and dataset counts.
SUMMARY_PATH = ROOT / "docs/data/task_suite_20.json"
# Default for --base-image; build_html() layers it behind the page when the file exists.
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
# Default for --sample-dir and a fallback candidate in resolve_sample_dir().
# Note that it lives next to the repository root, not inside it.
DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
# Last candidate tried by resolve_sample_dir(): a Dropbox folder under the user's home.
DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
# Default for --output.
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
# Page size in CSS pixels; also passed to Playwright as the viewport in render_html().
CANVAS_WIDTH = 1800
CANVAS_HEIGHT = 5000
# Pixel size of every generated modality thumbnail (see make_canvas / fit_image).
THUMB_WIDTH = 880
THUMB_HEIGHT = 520
# Modality name -> file name under docs/assets/modalities. build_html() uses these
# pre-rendered images for any modality that load_sample_thumbnails() did not produce.
# Keys must match the first element of each MODALITIES entry.
MODALITY_ASSET_FALLBACKS = {
    "video": "video.jpg",
    "audio": "audio.png",
    "depth": "depth.jpg",
    "pose / SLAM": "pose_slam.png",
    "motion capture": "motion_capture.png",
    "inertial": "inertial.png",
    "language": "language.png",
}
# Task families shown on the infographic, in display order. build_html() walks this
# list and numbers the task cards consecutively across groups.
#   name  - family heading
#   tone  - short label printed next to the heading
#   color - exposed to CSS as --accent
#   soft  - exposed to CSS as --soft
#   tasks - (task id looked up in the summary, "kind" badge text) pairs
# NOTE(review): the tone words are rendered verbatim and do not appear to describe
# the hex colours beside them (e.g. "red" next to #b7ff91) - confirm this is intended.
GROUPS = [
    {
        "name": "Action + Procedure",
        "tone": "teal",
        "color": "#9bdfff",
        "soft": "#071d20",
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("transition_detection", "diagnostic"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Motion + Objects",
        "tone": "blue",
        "color": "#ccffa0",
        "soft": "#10210a",
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("contact_prediction", "supervised"),
            ("object_relevance", "supervised"),
            ("caption_grounding", "retrieval"),
        ],
    },
    {
        "name": "Retrieval + Alignment",
        "tone": "amber",
        "color": "#7ae5c3",
        "soft": "#092019",
        "tasks": [
            ("cross_modal_retrieval", "retrieval"),
            ("modality_reconstruction", "forecast"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
    {
        "name": "Long-Horizon Semantics",
        "tone": "green",
        "color": "#d8f4a5",
        "soft": "#1b210d",
        "tasks": [
            ("long_horizon_next_action", "forecast"),
            ("next_subtask_forecast", "forecast"),
            ("interaction_text_prediction", "language"),
            ("action_object_relation", "relation"),
        ],
    },
    {
        "name": "Future Sets + Sensors",
        "tone": "red",
        "color": "#b7ff91",
        "soft": "#1b210d",
        "tasks": [
            ("object_set_forecast", "multi-label"),
            ("imu_to_hand_pose", "regression"),
            ("camera_view_sync_retrieval", "retrieval"),
            ("time_to_transition", "regression"),
        ],
    },
]
# One row per modality card, in display order:
#   (name, modality type, "Sample contains" text, "Current baseline use" text)
# The name doubles as the lookup key for the card's thumbnail in build_html().
MODALITIES = [
    ("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
    ("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
    ("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
    ("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
    ("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
    ("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
    ("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
]
# Pairs of joint indices that mocap_thumb() connects with line segments when drawing
# each hand. Index 0 starts five chains of four joints each - presumably wrist plus
# five fingers in a 21-joint layout; confirm against the mocap joint ordering.
HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]
def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    """Encode *image* as a base64 ``data:`` URI usable in an ``<img src>``.

    Args:
        image: Object exposing ``save(file_obj, **kwargs)``; callers in this
            file pass Pillow images.
        fmt: Target format name, case-insensitive. ``"JPG"`` is accepted as
            an alias of ``"JPEG"``.
        quality: JPEG quality setting; ignored for every other format.

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.
    """
    format_name = fmt.upper()
    if format_name == "JPG":
        # Pillow registers its JPEG writer under "JPEG"; forwarding the "JPG"
        # alias unchanged as ``format=`` is rejected by Image.save().
        format_name = "JPEG"

    save_kwargs = {"format": format_name}
    if format_name == "JPEG":
        save_kwargs.update({"quality": quality, "optimize": True})

    buffer = io.BytesIO()
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    # Derive the MIME subtype from the format actually written instead of
    # labelling every non-JPEG payload as PNG.
    mime = format_name.lower()
    return f"data:image/{mime};base64,{encoded}"
def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
    """Return a new solid-colour RGB Pillow image.

    Args:
        size: ``(width, height)`` in pixels; defaults to the thumbnail size.
        color: RGB fill; the default is the near-black page background.
    """
    from PIL import Image as pil_image

    blank = pil_image.new("RGB", size, color)
    return blank
def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    """Return *image* converted to RGB and fitted to *size* around its centre.

    Delegates to ``ImageOps.fit``, which scales and crops to the exact size.
    """
    from PIL import ImageOps

    rgb_image = image.convert("RGB")
    # method=3 is passed straight through as the resampling filter
    # (presumably Pillow's BICUBIC constant - confirm against the Pillow docs).
    fitted = ImageOps.fit(rgb_image, size, method=3, centering=(0.5, 0.5))
    return fitted
def read_video_frame(video_path: Path, frame_index: int = 2400):
    """Decode one frame of a video file and return it as an RGB Pillow image.

    Args:
        video_path: Path of the video to open with OpenCV.
        frame_index: Zero-based frame to read. Negative values are raised to
            0, and when the container reports a frame count the index is
            also capped at the last frame.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be
            decoded.
    """
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")
        # Some containers report no frame count; `or 0` maps that to "unknown".
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        frame_index = max(0, frame_index)
        if total:
            frame_index = min(frame_index, total - 1)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = cap.read()
    finally:
        # Release the handle on every path, including the error ones.
        cap.release()

    if not ok:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    # OpenCV decodes to BGR channel order; Pillow expects RGB.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame)
def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
    """Draw *text* at *xy* on a Pillow ``ImageDraw`` object.

    Tries the macOS "Arial Bold" system font at the requested size. If it
    cannot be loaded for any reason, Pillow's default font is used instead
    (*size* is not forwarded in that case).
    """
    from PIL import ImageFont as pil_fonts

    bold_font_path = "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
    try:
        label_font = pil_fonts.truetype(bold_font_path, size)
    except Exception:
        # Best effort: a missing font must never abort thumbnail rendering.
        label_font = pil_fonts.load_default()
    draw.text(xy, text, fill=fill, font=label_font)
def video_thumb(sample_dir: Path) -> str:
    """Build the "video" thumbnail: fisheye and stereo frames side by side.

    Frame 2450 of ``fisheye_cam0.mp4`` fills the left panel. The right panel
    shows the same frame of ``stereo_left.mp4`` when that file exists, and
    otherwise a copy of the fisheye panel.

    Returns:
        The composed image as a JPEG data URI.
    """
    from PIL import ImageDraw

    gutter = 18
    half_width = (THUMB_WIDTH - gutter) // 2
    panel_size = (half_width, THUMB_HEIGHT)
    right_x = half_width + gutter
    frame_no = 2450

    fisheye_panel = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", frame_no), panel_size)
    stereo_path = sample_dir / "stereo_left.mp4"
    if stereo_path.exists():
        stereo_panel = fit_image(read_video_frame(stereo_path, frame_no), panel_size)
    else:
        stereo_panel = fisheye_panel.copy()

    canvas = make_canvas()
    canvas.paste(fisheye_panel, (0, 0))
    canvas.paste(stereo_panel, (right_x, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Dark strip over the gutter, overlapping each panel edge by 4 px.
    divider_box = (half_width - 4, 0, right_x + 4, THUMB_HEIGHT)
    draw.rounded_rectangle(divider_box, radius=0, fill=(2, 5, 2, 220))
    white = (255, 255, 255)
    draw_label(draw, (18, 20), "fisheye", fill=white, size=22)
    draw_label(draw, (right_x + 18, 20), "stereo", fill=white, size=22)
    return image_data_uri(canvas, "JPEG")
def colorize(values):
    """Map values in ``[0, 1]`` onto the infographic's dark-to-lime gradient.

    Values are clipped to ``[0, 1]`` and linearly interpolated between five
    RGB colour stops. NaN entries are treated as 0 and therefore take the
    darkest stop.

    Args:
        values: Array-like of any shape.

    Returns:
        ``uint8`` array shaped ``values.shape + (3,)``.
    """
    import numpy as np

    stops = np.array([
        [2, 5, 2],
        [58, 136, 102],
        [122, 229, 195],
        [167, 240, 120],
        [216, 244, 165],
    ], dtype=np.float32)
    # NaN passes through np.clip untouched and would become an out-of-range
    # integer index after floor().astype(int); neutralise it first.
    x = np.clip(np.nan_to_num(values, nan=0.0), 0, 1)
    scaled = x * (len(stops) - 1)
    lo = np.floor(scaled).astype(int)
    # At x == 1 the upper stop would fall off the table, so clamp it.
    hi = np.clip(lo + 1, 0, len(stops) - 1)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")
def depth_thumb(h5) -> str:
    """Build the "depth" thumbnail: colourised depth beside its confidence map.

    Reads entry 2450 of the ``depth/depth`` and ``depth/confidence`` datasets.
    Depth is normalised between its 3rd and 97th percentile, computed over
    finite pixels only. Non-finite pixels are painted with the darkest
    gradient colour so they cannot break the colour lookup.

    Args:
        h5: Open HDF5 file (or mapping) that provides both datasets.

    Returns:
        The composed image as a JPEG data URI.
    """
    import numpy as np
    from PIL import Image, ImageDraw

    gutter = 18
    panel_width = (THUMB_WIDTH - gutter) // 2

    frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
    valid = np.isfinite(frame)
    if valid.any():
        lo, hi = np.percentile(frame[valid], [3, 97])
    else:
        # No usable pixel at all: use a dummy range so the panel renders dark.
        lo, hi = 0.0, 1.0
    # Replace NaN/inf with the low bound so they normalise to exactly 0.
    frame = np.where(valid, frame, np.float32(lo))
    norm = (frame - lo) / max(hi - lo, 1e-6)
    rgb = colorize(norm)
    depth = fit_image(Image.fromarray(rgb), (panel_width, THUMB_HEIGHT))

    conf = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
    conf_img = Image.fromarray(conf, mode="L").convert("RGB")
    conf_img = fit_image(conf_img, (panel_width, THUMB_HEIGHT))

    canvas = make_canvas()
    canvas.paste(depth, (0, 0))
    canvas.paste(conf_img, (panel_width + gutter, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Translucent plates behind the two captions keep them legible.
    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=(2, 5, 2, 178))
    draw.rounded_rectangle((panel_width + gutter, 0, panel_width + gutter + 220, 44), radius=8, fill=(2, 5, 2, 178))
    draw_label(draw, (14, 11), "depth", fill=(255, 255, 255), size=22)
    draw_label(draw, (panel_width + gutter + 14, 11), "confidence", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")
def audio_thumb(sample_dir: Path) -> str:
    """Build the "audio" thumbnail from the fisheye video's audio track.

    ``ffmpeg`` extracts six seconds (starting at 45 s) of mono 16 kHz PCM from
    ``fisheye_cam0.mp4``. The samples are split into 220 bins, drawn as RMS
    bars along the bottom and a mean-amplitude trace near the top. If any
    step fails, a fixed decorative bar pattern is drawn instead.

    Returns:
        The image as a PNG data URI.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    floor_y = THUMB_HEIGHT - 72
    bar_fill = (167, 240, 120, 170)

    try:
        ffmpeg_cmd = [
            "ffmpeg",
            "-v", "error",
            "-ss", "45",
            "-t", "6",
            "-i", str(sample_dir / "fisheye_cam0.mp4"),
            "-ac", "1",
            "-ar", "16000",
            "-f", "s16le",
            "pipe:1",
        ]
        pcm = subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE).stdout
        samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            raise RuntimeError("empty audio stream")
        # Peak-normalise; the 1.0 floor avoids dividing by zero on silence.
        peak = max(float(np.max(np.abs(samples))), 1.0)
        samples = samples / peak

        bins = 220
        usable = bins * max(1, len(samples) // bins)
        chunks = np.array_split(samples[:usable], bins)
        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])

        def column_x(position):
            # Spread the bins evenly between the 18 px side margins.
            return 18 + position / max(bins - 1, 1) * (THUMB_WIDTH - 36)

        for position, level in enumerate(rms):
            x = column_x(position)
            bar = 14 + np.clip(level * 158, 0, 158)
            draw.line((x, floor_y, x, floor_y - bar), fill=bar_fill, width=2)

        trace = [
            (column_x(position), 126 - np.clip(mean, -1, 1) * 82)
            for position, mean in enumerate(waveform)
        ]
        draw.line(trace, fill=(122, 229, 195, 220), width=2)
    except Exception:
        # Fallback decoration when ffmpeg or the audio stream is unavailable.
        for slot in range(48):
            left = 22 + slot * 8
            bar = 16 + (slot % 7) * 7
            draw.rounded_rectangle((left, floor_y - bar, left + 4, floor_y), radius=2, fill=bar_fill)

    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def normalize_points(points, width, height, pad=16):
    """Scale the first two columns of *points* into a padded pixel rectangle.

    Each axis is normalised between its 2nd and 98th percentile and clipped
    to that range. The second axis is flipped so larger values appear higher
    on screen.

    Args:
        points: 2-D array with at least two columns.
        width: Target width in pixels.
        height: Target height in pixels.
        pad: Margin kept free on every side.

    Returns:
        Array of ``(x, y)`` pixel coordinates, one row per input point.
    """
    import numpy as np

    planar = points[:, :2].copy()
    low = np.percentile(planar, 2, axis=0)
    high = np.percentile(planar, 98, axis=0)
    extent = np.maximum(high - low, 1e-6)  # guards against a degenerate axis
    unit = np.clip((planar - low) / extent, 0, 1)
    unit[:, 1] = 1 - unit[:, 1]  # image y grows downwards

    pixels = np.empty_like(unit)
    for axis, length in enumerate((width, height)):
        pixels[:, axis] = pad + unit[:, axis] * (length - pad * 2)
    return pixels
def slam_thumb(h5) -> str:
    """Build the "pose / SLAM" thumbnail: point cloud plus camera trajectory.

    The ``slam/point_cloud`` rows with non-finite values are dropped, the
    rest is subsampled to at most 2600 points, and columns 0 and 2 are
    plotted, coloured by column 1. Every 36th entry of ``slam/trans_xyz`` up
    to index 2450 is drawn on top as a polyline.

    Returns:
        The image as a PNG data URI.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    plot_columns = [0, 2, 1]
    max_points = 2600
    dot = 1.2

    cloud = np.array(h5["slam/point_cloud"], dtype=np.float64)
    cloud = cloud[np.isfinite(cloud).all(axis=1)]
    if len(cloud) > max_points:
        keep = np.linspace(0, len(cloud) - 1, max_points).astype(int)
        cloud = cloud[keep]
    cloud_xy = normalize_points(cloud[:, plot_columns], THUMB_WIDTH, THUMB_HEIGHT)

    heights = cloud[:, 1]
    h_low = np.percentile(heights, 2)
    h_high = np.percentile(heights, 98)
    dot_colors = colorize((heights - h_low) / max(h_high - h_low, 1e-6))
    for (px, py), rgb in zip(cloud_xy, dot_colors):
        draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=tuple(rgb.tolist()) + (165,))

    # NOTE(review): the trajectory is normalised against its own percentiles,
    # not the point cloud's, so the two overlays do not share a scale -
    # confirm this is intended.
    path = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    path_xy = normalize_points(path[:, plot_columns], THUMB_WIDTH, THUMB_HEIGHT)
    for start, end in zip(path_xy[:-1], path_xy[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 205), width=2)

    draw_label(draw, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def imu_thumb(h5) -> str:
    """Build the "inertial" thumbnail: six overlaid accelerometer/gyro traces.

    Takes the IMU sample index stored at ``imu/keyframe_indices[2450]`` and
    plots up to 220 samples on either side of it for each of the three
    ``imu/accel_xyz`` and three ``imu/gyro_xyz`` axes. Each trace is
    normalised on its own between its 3rd and 97th percentile.

    Returns:
        The image as a PNG data URI.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    centre = int(h5["imu/keyframe_indices"][2450])
    window = slice(max(0, centre - 220), centre + 220)
    accel = np.array(h5["imu/accel_xyz"][window], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][window], dtype=np.float64)
    traces = [sensor[:, axis] for sensor in (accel, gyro) for axis in range(3)]
    palette = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]

    # Faint horizontal guide lines behind the traces.
    for guide_y in (68 + row * 44 for row in range(6)):
        draw.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(167, 240, 120, 48), width=1)

    plot_width = THUMB_WIDTH - 36
    plot_height = THUMB_HEIGHT - 116
    for trace, rgb in zip(traces, palette):
        trace = trace[:420]
        if len(trace) < 2:
            continue  # a single sample cannot form a line
        low, high = np.percentile(trace, [3, 97])
        unit = (trace - low) / max(high - low, 1e-6)
        last = max(len(trace) - 1, 1)
        polyline = [
            (18 + step / last * plot_width, THUMB_HEIGHT - 48 - np.clip(level, 0, 1) * plot_height)
            for step, level in enumerate(unit)
        ]
        draw.line(polyline, fill=rgb + (200,), width=2)

    draw_label(draw, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def mocap_thumb(h5) -> str:
    """Build the "motion capture" thumbnail: body keypoints and both hands.

    Reads entry 2450 of ``full_body_mocap/keypoints``,
    ``hand_mocap/left_joints_3d`` and ``hand_mocap/right_joints_3d``. All
    three are projected using their first two coordinates and one shared
    2nd-98th percentile range, then drawn in three side-by-side columns.

    Returns:
        The image as a PNG data URI.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    frame_no = 2450
    dot = 2.4

    body = np.array(h5["full_body_mocap/keypoints"][frame_no], dtype=np.float32)
    left = np.array(h5["hand_mocap/left_joints_3d"][frame_no], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][frame_no], dtype=np.float32)

    planar = np.concatenate([body, left, right], axis=0)[:, :2]
    lo = np.percentile(planar, 2, axis=0)
    hi = np.percentile(planar, 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)

    def project(points, x_offset, width):
        # Shared normalisation, vertical flip, then placement in a column.
        unit = (points[:, :2] - lo) / span
        unit[:, 1] = 1 - unit[:, 1]
        unit[:, 0] = x_offset + unit[:, 0] * width
        unit[:, 1] = 72 + unit[:, 1] * (THUMB_HEIGHT - 136)
        return unit

    def draw_dots(coords, fill):
        for px, py in coords:
            draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=fill)

    body_xy = project(body, 28, 270)
    draw_dots(body_xy, (167, 240, 120, 185))
    # Consecutive keypoints are joined in index order (no explicit bone list).
    for start, end in zip(body_xy[:-1], body_xy[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 82), width=1)

    hands = [(left, 392, (122, 229, 195)), (right, 562, (216, 244, 165))]
    for joints, x_offset, rgb in hands:
        hand_xy = project(joints, x_offset, 126)
        for first, second in HAND_EDGES:
            draw.line(
                (hand_xy[first][0], hand_xy[first][1], hand_xy[second][0], hand_xy[second][1]),
                fill=rgb + (180,),
                width=2,
            )
        draw_dots(hand_xy, rgb + (220,))

    draw_label(draw, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def text_thumb(h5) -> str:
    """Build the "language" thumbnail from the first caption segment.

    The ``caption`` dataset holds a JSON document. From its first segment
    the function draws up to five distinct object names (alphabetical) as
    chips on the left and up to two "Current Action" labels as boxes on the
    right; labels longer than 66 characters are cut and suffixed with "...".

    Returns:
        The image as a PNG data URI.
    """
    from PIL import ImageDraw

    width = THUMB_WIDTH
    payload = h5["caption"][()]
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8", errors="replace")
    first_segment = json.loads(payload)["segments"][0]

    distinct_objects = set()
    for names in first_segment.get("objects", {}).values():
        distinct_objects.update(names)
    object_labels = sorted(distinct_objects)[:5]
    action_labels = [entry.get("label", "") for entry in first_segment.get("Current Action", [])][:2]

    canvas = make_canvas((width, THUMB_HEIGHT))
    draw = ImageDraw.Draw(canvas, "RGBA")
    text_fill = (244, 248, 239)
    box_fill = (7, 18, 7, 235)
    draw_label(draw, (28, 24), "language annotation", fill=text_fill, size=28)

    for row, name in enumerate(object_labels):
        top = 82 + row * 47
        # Chip width is estimated from the character count.
        chip_right = 28 + 52 + len(name) * 16
        draw.rounded_rectangle((28, top, chip_right, top + 38), radius=8, fill=box_fill, outline=(167, 240, 120, 170), width=2)
        draw_label(draw, (44, top + 8), name, fill=text_fill, size=18)

    left = 340
    for row, action in enumerate(action_labels):
        top = 92 + row * 68
        shown = action[:66] + ("..." if len(action) > 66 else "")
        draw.rounded_rectangle((left, top, width - 28, top + 54), radius=9, fill=box_fill, outline=(122, 229, 195, 180), width=2)
        draw_label(draw, (left + 22, top + 15), shown, fill=text_fill, size=20)

    return image_data_uri(canvas, "PNG")
def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
    """Render every modality thumbnail from a local sample episode.

    Args:
        sample_dir: Directory expected to contain ``fisheye_cam0.mp4`` and
            ``annotation.hdf5``; may be ``None``.

    Returns:
        Mapping of modality name to data URI. The mapping is empty when the
        directory or a required file is missing, or when any renderer
        fails (a warning is printed in that case).
    """
    if sample_dir is None or not sample_dir.exists():
        return {}

    hdf5_path = sample_dir / "annotation.hdf5"
    for needed in (sample_dir / "fisheye_cam0.mp4", hdf5_path):
        if not needed.exists():
            return {}

    try:
        import h5py

        thumbs = {}
        thumbs["video"] = video_thumb(sample_dir)
        thumbs["audio"] = audio_thumb(sample_dir)
        with h5py.File(hdf5_path, "r") as h5:
            thumbs["depth"] = depth_thumb(h5)
            thumbs["pose / SLAM"] = slam_thumb(h5)
            thumbs["motion capture"] = mocap_thumb(h5)
            thumbs["inertial"] = imu_thumb(h5)
            thumbs["language"] = text_thumb(h5)
        return thumbs
    except Exception as exc:
        # All-or-nothing: the caller falls back to static assets instead.
        print(f"Warning: could not build sample modality thumbnails: {exc}")
        return {}
def valid_sample_dir(sample_dir: Path | None) -> bool:
    """Return True when *sample_dir* holds both files the thumbnails need.

    The required files are ``annotation.hdf5`` and ``fisheye_cam0.mp4``;
    ``None`` is never valid.
    """
    if sample_dir is None:
        return False
    required_names = ("annotation.hdf5", "fisheye_cam0.mp4")
    return all((sample_dir / name).exists() for name in required_names)
def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
    """Pick the first usable sample directory from a fixed priority list.

    Candidates are tried in this order:
    1. the ``XPERIENCE10M_SAMPLE_DIR`` environment variable,
    2. ``$WORKSPACE/data/sample/xperience-10m-sample``,
    3. the *sample_dir* argument,
    4. ``DEFAULT_SAMPLE_DIR``,
    5. ``DROPBOX_SAMPLE_DIR``.

    Returns:
        The first candidate accepted by ``valid_sample_dir``, or *sample_dir*
        unchanged when none qualifies.
    """
    env_dir = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
    workspace = os.environ.get("WORKSPACE")

    ordered = [
        Path(env_dir).expanduser() if env_dir else None,
        Path(workspace).expanduser() / "data/sample/xperience-10m-sample" if workspace else None,
        sample_dir,
        DEFAULT_SAMPLE_DIR,
        DROPBOX_SAMPLE_DIR,
    ]
    usable = (path for path in ordered if path is not None and valid_sample_dir(path))
    return next(usable, sample_dir)
def load_summary() -> dict:
    """Read and parse the UTF-8 JSON task summary at ``SUMMARY_PATH``."""
    raw_text = SUMMARY_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
def fmt(value: float) -> str:
    """Format *value* as a fixed-point string with four decimal places."""
    return format(float(value), ".4f")
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
if "minimal_primary_metric" in metrics:
label = metrics.get("metric_name") or metrics.get("metric_key") or "score"
value = metrics.get("minimal_primary_metric")
return str(label), "n/a" if value is None else fmt(value)
if task_name == "hand_trajectory_forecast":
return "MPJPE", fmt(metrics["mpjpe"])
if task_name == "cross_modal_retrieval":
return "top-5", fmt(metrics["top5_accuracy"])
if task_name == "caption_grounding":
return "MRR", fmt(metrics["mrr"])
if task_name == "object_relevance":
return "micro-F1", fmt(metrics["micro_f1"])
if task_name == "modality_reconstruction":
return "R2", fmt(metrics["r2"])
if task_name in {"temporal_order", "misalignment_detection"}:
return "F1", fmt(metrics["f1"])
if "macro_f1" in metrics:
return "macro-F1", fmt(metrics["macro_f1"])
if "accuracy" in metrics:
return "accuracy", fmt(metrics["accuracy"])
raise KeyError(f"No main metric configured for {task_name}")
def short_io(task_name: str, metrics: dict) -> str:
    """Return the one-line "input -> output" description for a task card.

    If *metrics* carries ``input_short`` and/or ``output_short``, they are
    used, with ``"input"`` / ``"target"`` standing in for a missing side.
    Otherwise a built-in description for the task is returned, and failing
    that the ``input`` entry of *metrics* (or an empty string).
    """
    source = metrics.get("input_short")
    target = metrics.get("output_short")
    if source or target:
        return f"{source or 'input'} -> {target or 'target'}"

    built_in = {
        "timeline_action": "all featurized modalities -> action label",
        "timeline_subtask": "all featurized modalities -> subtask label",
        "transition_detection": "all featurized modalities -> boundary vs steady",
        "next_action": "window at t -> action at t+20 frames",
        "hand_trajectory_forecast": "all featurized modalities -> future hand joints",
        "contact_prediction": "non-contact modalities -> contact state",
        "object_relevance": "non-caption feature blocks -> relevant objects",
        "caption_grounding": "text query -> matching sensor window",
        "cross_modal_retrieval": "motion / IMU / camera -> depth / video match",
        "modality_reconstruction": "motion / IMU / camera -> depth / video vector",
        "temporal_order": "two adjacent windows -> correct order",
        "misalignment_detection": "motion + visual pair -> aligned or shifted",
    }
    if task_name in built_in:
        return built_in[task_name]
    return metrics.get("input", "")
def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
    """Render one task card as an HTML ``<article>`` fragment.

    Args:
        task_name: Task id; used for metric and title lookups.
        kind: Text of the badge in the card's top-right corner.
        metrics: Summary entry for the task.
        group: Task family; its ``color`` and ``soft`` become CSS variables.
        index: Card number, printed zero-padded to two digits.
        neural_metrics: Optional separate neural result. It is only used when
            *metrics* has no ``neural_primary_metric`` and the mapping has no
            ``error`` key.

    Returns:
        The card markup with all dynamic text HTML-escaped.
    """
    label, value = metric_for(task_name, metrics)

    # Decide which (label, value) pair, if any, feeds the "NN" badge.
    neural_pair = None
    if metrics.get("neural_primary_metric") is not None:
        neural_pair = (
            metrics.get("metric_name") or metrics.get("metric_key") or "score",
            fmt(metrics["neural_primary_metric"]),
        )
    elif neural_metrics and "error" not in neural_metrics:
        neural_pair = metric_for(task_name, neural_metrics)

    neural_html = ""
    if neural_pair is not None:
        nn_label, nn_value = neural_pair
        neural_html = f"""
<div class="metric neural">
<span>NN {html.escape(str(nn_label))}</span>
<strong>{html.escape(nn_value)}</strong>
</div>
"""

    io_text = short_io(task_name, metrics)
    title = metrics.get("task_display_name") or task_display_name(task_name)
    return f"""
<article class="task-card" style="--accent:{group['color']};--soft:{group['soft']};">
<div class="task-meta">
<span class="index">{index:02d}</span>
<span class="kind">{html.escape(kind)}</span>
</div>
<h3>{html.escape(title)}</h3>
<p>{html.escape(io_text)}</p>
<div class="metric">
<span>min {html.escape(label)}</span>
<strong>{html.escape(value)}</strong>
</div>
{neural_html}
</article>
"""
def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str:
    """Render one modality card as an HTML ``<article>`` fragment.

    Args:
        name: Modality heading.
        modality_type: Short type label shown beside the heading.
        sample_text: Body of the "Sample contains" row.
        feature_text: Body of the "Current baseline use" row.
        index: Card number, printed zero-padded to two digits.
        thumbnail: Image URI placed verbatim in ``<img src>``; the thumbnail
            block is omitted when this is empty or ``None``.

    Returns:
        The card markup; the four text fields are HTML-escaped.
    """
    thumb_html = (
        f'<div class="modality-thumb"><img src="{thumbnail}" alt=""></div>'
        if thumbnail
        else ""
    )
    safe_name = html.escape(name)
    safe_type = html.escape(modality_type)
    safe_sample = html.escape(sample_text)
    safe_feature = html.escape(feature_text)
    return f"""
<article class="modality">
<div class="modality-heading">
<div>
<span class="modality-index">{index:02d}</span>
<h3>{safe_name}</h3>
</div>
<span class="modality-type">{safe_type}</span>
</div>
{thumb_html}
<div class="modality-copy">
<div class="modality-row">
<span>Sample contains</span>
<p>{safe_sample}</p>
</div>
<div class="modality-row">
<span>Current baseline use</span>
<p>{safe_feature}</p>
</div>
</div>
</article>
"""
def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str:
    """Assemble the complete infographic page as one HTML string.

    Two summary layouts are accepted:

    * ``summary["tasks"]`` is a list of rows that each carry ``task_id``;
      dataset counts are then read from ``summary["dataset_scope"]``.
    * ``summary["tasks"]`` is a mapping keyed by task id; counts are read
      from the top level and ``summary["neural_tasks"]`` may hold separate
      neural results.

    Args:
        summary: Parsed task summary (see ``load_summary``).
        base_image: Optional background image; used only if the file exists.
        sample_dir: Optional sample episode used to render live thumbnails.

    Returns:
        A standalone HTML document with inline CSS.

    Raises:
        KeyError: If a task listed in ``GROUPS`` is missing from the summary,
            or a required count is missing in the mapping layout.
    """
    if isinstance(summary.get("tasks"), list):
        # List layout: index the rows by task id; counts default to 0 if absent.
        task_rows = summary["tasks"]
        suite = {task["task_id"]: task for task in task_rows}
        neural_suite = {}
        dataset_scope = summary.get("dataset_scope", {})
        num_frames = int(dataset_scope.get("num_frames", 0))
        num_windows = int(dataset_scope.get("num_windows", 0))
        feature_dim = int(dataset_scope.get("feature_dim", 0))
        window_frames = int(dataset_scope.get("window_frames", 20))
        stride_frames = int(dataset_scope.get("stride_frames", 5))
        task_count = int(summary.get("task_count", len(suite)))
        # Fixed value for this layout; it matches the "180-result matrix"
        # wording in the page text below.
        scored_records = 180
    else:
        # Mapping layout: the three counts are mandatory top-level keys.
        suite = summary["tasks"]
        neural_suite = summary.get("neural_tasks", {})
        num_frames = int(summary["num_frames"])
        num_windows = int(summary["num_windows"])
        feature_dim = int(summary["feature_dim"])
        window_frames = int(summary.get("window_frames", 20))
        stride_frames = int(summary.get("stride_frames", 5))
        task_count = len(suite)
        scored_records = len(suite) + len(neural_suite)
    # Prefer thumbnails rendered from the sample; fill any gaps with the
    # static assets under docs/assets/modalities, referenced by file:// URI.
    thumbnails = load_sample_thumbnails(sample_dir)
    for modality_name, asset_name in MODALITY_ASSET_FALLBACKS.items():
        if thumbnails.get(modality_name):
            continue
        fallback = ROOT / "docs/assets/modalities" / asset_name
        if fallback.exists():
            thumbnails[modality_name] = fallback.resolve().as_uri()
    base_layer = ""
    if base_image is not None and base_image.exists():
        base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>'
    # NOTE(review): six tiles are emitted here while the .stats grid in the
    # stylesheet declares five columns, so the sixth tile wraps to a second
    # row - confirm this is intended.
    stats = [
        (f"{num_frames:,}", "frames"),
        (f"{num_windows:,}", "windows"),
        (f"{feature_dim:,}", "features"),
        (f"{task_count}", "unified tasks"),
        (f"{scored_records}", "method-task results"),
        ("70/30", "chronological split"),
    ]
    stats_html = "".join(
        f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>"
        for value, label in stats
    )
    modalities_html = "".join(
        modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name))
        for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1)
    )
    # Task cards are numbered continuously across all families.
    task_index = 1
    families = []
    for group in GROUPS:
        cards = []
        for task_name, kind in group["tasks"]:
            cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name)))
            task_index += 1
        families.append(
            f"""
<section class="family" style="--accent:{group['color']};--soft:{group['soft']};">
<div class="family-head">
<span>{html.escape(group['tone'])}</span>
<h2>{html.escape(group['name'])}</h2>
</div>
<div class="family-cards">{''.join(cards)}</div>
</section>
"""
        )
    # Page template. Literal CSS braces are doubled because this is an f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1">
<title>Xperience-10M 20-Task Episode Suite Infographic</title>
<style>
* {{ box-sizing: border-box; }}
html,
body {{
margin: 0;
width: {CANVAS_WIDTH}px;
height: {CANVAS_HEIGHT}px;
background: #020502;
}}
body {{
font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
color: #f4f8ef;
text-rendering: optimizeLegibility;
}}
.canvas {{
position: relative;
width: {CANVAS_WIDTH}px;
height: {CANVAS_HEIGHT}px;
overflow: hidden;
padding: 54px 64px 44px;
background:
radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%),
radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px),
#020502;
background-size: auto, 18px 18px, auto;
}}
.image-background {{
position: absolute;
inset: 0;
background-position: center;
background-repeat: no-repeat;
background-size: cover;
opacity: 0.36;
filter: saturate(1.05) contrast(1.08) brightness(0.42);
}}
.content {{
position: relative;
z-index: 1;
}}
.header {{
display: grid;
grid-template-columns: 1.25fr 0.75fr;
gap: 44px;
align-items: end;
padding-bottom: 30px;
border-bottom: 1px solid rgba(167,240,120,0.20);
}}
.kicker {{
display: inline-flex;
align-items: center;
gap: 12px;
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 15px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.kicker::before {{
content: "";
width: 44px;
height: 1px;
background: #ccffa0;
}}
h1 {{
margin: 18px 0 0;
max-width: 930px;
font-size: 72px;
line-height: 0.95;
letter-spacing: 0;
}}
.subtitle {{
margin: 18px 0 0;
max-width: 900px;
color: #dce8d7;
font-size: 23px;
line-height: 1.35;
font-weight: 520;
}}
.stats {{
display: grid;
grid-template-columns: repeat(5, minmax(0, 1fr));
gap: 10px;
}}
.stat {{
min-height: 78px;
padding: 14px 15px;
border: 1px solid rgba(167,240,120,0.24);
background: rgba(7,18,7,0.80);
border-radius: 8px;
}}
.stat strong {{
display: block;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 25px;
line-height: 1;
font-variant-numeric: tabular-nums;
}}
.stat span {{
display: block;
margin-top: 8px;
color: #a5afa2;
font-size: 13px;
line-height: 1.15;
}}
.section-label {{
display: grid;
grid-template-columns: 1fr;
gap: 12px;
align-items: start;
margin: 44px 0 24px;
color: #a5afa2;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 22px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.section-label span:last-child {{
max-width: 1400px;
color: #dce8d7;
text-transform: none;
letter-spacing: 0;
font-family: inherit;
font-size: 21px;
line-height: 1.42;
text-align: left;
}}
.modalities {{
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 24px;
}}
.modality {{
min-height: 254px;
padding: 22px;
border: 1px solid rgba(167,240,120,0.22);
background: rgba(7,18,7,0.84);
border-radius: 8px;
display: grid;
grid-template-columns: 310px minmax(0, 1fr);
grid-template-areas:
"thumb heading"
"thumb copy";
column-gap: 24px;
row-gap: 16px;
align-items: start;
}}
.modality-thumb {{
grid-area: thumb;
height: 210px;
overflow: hidden;
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
background: #020502;
}}
.modality-thumb img {{
display: block;
width: 100%;
height: 100%;
object-fit: cover;
}}
.modality-index,
.index {{
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-variant-numeric: tabular-nums;
}}
.modality-heading {{
grid-area: heading;
display: flex;
align-items: start;
justify-content: space-between;
gap: 16px;
padding-bottom: 14px;
border-bottom: 1px solid rgba(167,240,120,0.16);
}}
.modality-index {{
color: #a5afa2;
font-size: 18px;
}}
.modality-type {{
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 13px;
line-height: 1.15;
text-transform: uppercase;
letter-spacing: 0.08em;
text-align: right;
max-width: 210px;
padding-top: 4px;
}}
.modality h3 {{
margin: 8px 0 0;
font-size: 36px;
line-height: 1.02;
text-transform: uppercase;
}}
.modality-copy {{
grid-area: copy;
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 12px;
}}
.modality-row {{
display: grid;
grid-template-columns: 1fr;
gap: 8px;
align-items: baseline;
padding: 14px 16px;
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
background: rgba(2,5,2,0.40);
}}
.modality-row span {{
display: block;
color: #a5afa2;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 12px;
letter-spacing: 0.06em;
line-height: 1.25;
text-transform: uppercase;
}}
.modality-row p {{
margin: 0;
color: #dce8d7;
font-size: 21px;
font-weight: 650;
line-height: 1.2;
}}
.shared-band {{
display: grid;
grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr;
gap: 12px;
align-items: center;
margin-top: 30px;
padding: 14px;
border: 1px solid rgba(167,240,120,0.22);
background: rgba(7,18,7,0.72);
border-radius: 8px;
}}
.step {{
min-height: 62px;
padding: 13px 15px;
background: rgba(7,18,7,0.92);
border: 1px solid rgba(167,240,120,0.16);
border-radius: 8px;
}}
.step strong {{
display: block;
font-size: 17px;
line-height: 1.1;
}}
.step span {{
display: block;
margin-top: 5px;
color: #a5afa2;
font-size: 13px;
}}
.arrow {{
color: #ccffa0;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 22px;
}}
.families {{
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 24px;
margin-top: 30px;
}}
.family {{
padding: 20px;
border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
background: rgba(7,18,7,0.82);
border-radius: 8px;
}}
.family-head {{
display: flex;
align-items: end;
justify-content: space-between;
gap: 16px;
min-height: 66px;
padding-bottom: 16px;
border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502);
}}
.family-head span {{
color: var(--accent);
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0.08em;
}}
.family-head h2 {{
margin: 0;
color: var(--accent);
font-size: 32px;
line-height: 1.02;
text-align: right;
}}
.family-cards {{
display: grid;
gap: 16px;
margin-top: 18px;
}}
.task-card {{
min-height: 178px;
padding: 18px 20px;
border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502);
background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207));
border-radius: 8px;
}}
.task-meta {{
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}}
.index {{
color: #a5afa2;
font-size: 12px;
}}
.kind {{
display: inline-flex;
align-items: center;
height: 24px;
padding: 0 9px;
border-radius: 6px;
border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502);
color: var(--accent);
background: rgba(2,5,2,0.48);
text-transform: uppercase;
font-size: 11px;
line-height: 1;
font-weight: 830;
}}
.task-card h3 {{
margin: 12px 0 0;
color: #f4f8ef;
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 21px;
line-height: 1.18;
overflow-wrap: anywhere;
}}
.task-card p {{
margin: 11px 0 0;
min-height: 39px;
color: #dce8d7;
font-size: 15px;
line-height: 1.28;
font-weight: 560;
}}
.metric {{
display: inline-flex;
align-items: baseline;
gap: 10px;
margin-top: 10px;
min-height: 32px;
padding: 7px 10px;
border-radius: 8px;
border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502);
background: rgba(2,5,2,0.42);
}}
.metric.neural {{
margin-left: 8px;
border-color: rgba(255,255,255,0.20);
background: rgba(255,255,255,0.08);
}}
.metric span {{
color: #a5afa2;
font-size: 13px;
font-weight: 760;
}}
.metric strong {{
color: var(--accent);
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
font-size: 20px;
line-height: 1;
font-weight: 860;
font-variant-numeric: tabular-nums;
}}
.footer {{
display: flex;
align-items: center;
justify-content: space-between;
gap: 32px;
margin-top: 22px;
padding-top: 20px;
border-top: 1px solid rgba(167,240,120,0.20);
color: #a5afa2;
font-size: 18px;
line-height: 1.35;
font-weight: 620;
}}
.footer code {{
font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
color: #020502;
background: #ccffa0;
border: 1px solid #ccffa0;
border-radius: 7px;
padding: 6px 9px;
white-space: nowrap;
}}
</style>
</head>
<body>
<main class="canvas" aria-label="Ropedia Xperience-10M unified 20-task infographic">
{base_layer}
<div class="content">
<header class="header">
<div>
<div class="kicker">verified unified 20-task release</div>
<h1>Ropedia Xperience-10M task map</h1>
<p class="subtitle">A clean map from synchronized multimodal windows to 20 task contracts, comparing minimal heads, neural MLP heads, and the public 180-result matrix.</p>
</div>
<div class="stats">{stats_html}</div>
</header>
<section class="shared-band" aria-label="shared processing contract">
<div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div>
<div class="arrow">-></div>
<div class="step"><strong>{window_frames}-frame windows</strong><span>stride {stride_frames}, chronological order</span></div>
<div class="arrow">-></div>
<div class="step"><strong>{feature_dim:,}-d vector</strong><span>current manifest includes audio features</span></div>
<div class="arrow">-></div>
<div class="step"><strong>20 task contracts</strong><span>minimal/NN baselines plus Qwen3-Omni/Cosmos3 diagnostics</span></div>
</section>
<div class="section-label">
<span>20 task contracts</span>
<span>Every task below is part of one unified public-sample suite with shared window/split discipline and source-linked scores in the 180-result matrix.</span>
</div>
<section class="families">{''.join(families)}</section>
<div class="section-label">
<span>Xperience-10M modalities</span>
<span>Each public-sample stream is shown with a compact derived thumbnail, what the sample contains, and how the current baseline uses it. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span>
</div>
<section class="modalities">{modalities_html}</section>
<footer class="footer">
<span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span>
<code>results/episode_task_suite/summary_report.json</code>
</footer>
</div>
</main>
</body>
</html>
"""
def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot *html_path* to *output_path* using the Playwright CLI.

    Creates the output directory if needed, then runs
    ``npx --yes playwright screenshot`` in full-page mode with a viewport of
    ``CANVAS_WIDTH`` x ``CANVAS_HEIGHT``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    viewport_flag = f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}"
    page_uri = html_path.resolve().as_uri()
    command = [
        "npx",
        "--yes",
        "playwright",
        "screenshot",
        "--full-page",
        viewport_flag,
        page_uri,
        str(output_path),
    ]
    subprocess.run(command, check=True)
def main() -> int:
    """Command-line entry point: build the HTML and optionally export the PNG.

    The HTML is written to ``--html`` when given, otherwise to a temporary
    ``.html`` file that is kept on disk. Unless ``--no-export`` is set, the
    page is then rendered to ``--output``.

    Returns:
        Process exit status (always 0 on success).
    """
    parser = argparse.ArgumentParser()
    path_options = (
        ("--base-image", DEFAULT_BASE),
        ("--sample-dir", DEFAULT_SAMPLE_DIR),
        ("--output", DEFAULT_OUTPUT),
    )
    for flag, default in path_options:
        parser.add_argument(flag, type=Path, default=default)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
    args = parser.parse_args()

    summary = load_summary()
    sample_dir = resolve_sample_dir(args.sample_dir)
    html_text = build_html(summary, args.base_image, sample_dir)

    html_path = args.html
    if html_path is not None:
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")
    else:
        # delete=False keeps the file so its path can be reported below.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
            html_path = Path(handle.name)

    export_requested = not args.no_export
    if export_requested:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote render HTML: {html_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())