Publish Ropedia minimal task baseline weights

Browse files

Files changed (3) hide show

README.md +1 -0
assets/task_suite_infographic.png +2 -2
scripts/render_task_suite_infographic.py +710 -167

README.md CHANGED Viewed

@@ -64,6 +64,7 @@ Their purpose is to make every input/output contract auditable before scaling to
 | `artifacts/**/metrics.json` | records the committed metric values |
 | `artifacts/**/feature_manifest.json` | maps feature blocks back to source modalities |
 | `assets/task_architectures.svg` | shows the shared pipeline and all 12 heads |
 ## Included

 | `artifacts/**/metrics.json` | records the committed metric values |
 | `artifacts/**/feature_manifest.json` | maps feature blocks back to source modalities |
 | `assets/task_architectures.svg` | shows the shared pipeline and all 12 heads |
+| `assets/task_suite_infographic.png` | presents the 12 heads with public-sample modality thumbnails and verified metrics |
 ## Included

assets/task_suite_infographic.png CHANGED Viewed

Git LFS Details

SHA256: 38ba0968f53333b74069e36bec35382cb9c97568da8be528536acc2d69fdb168
Pointer size: 132 Bytes
Size of remote file: 1.32 MB

Git LFS Details

SHA256: 3a7055b5a3ac9ae4362d784347071002fb5ebf572061c65f100a2720e3311036
Pointer size: 132 Bytes
Size of remote file: 1.3 MB

scripts/render_task_suite_infographic.py CHANGED Viewed

@@ -1,16 +1,18 @@
 #!/usr/bin/env python3
 """
-Render a ChatGPT-image-backed 12-task infographic.
-The background bitmap is AI-generated. The task names, inputs, and metrics are
-read from results/episode_task_suite/summary_report.json so the published image
-does not rely on image-model text generation.
 """
 from __future__ import annotations
 import argparse
 import html
 import json
 import subprocess
 import tempfile
@@ -20,16 +22,20 @@ from pathlib import Path
 ROOT = Path(__file__).resolve().parents[1]
 SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
 DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
 DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
 GROUPS = [
     {
         "name": "Label + State",
-        "color": "#008b9a",
-        "left": 94,
-        "top": 374,
-        "width": 246,
         "tasks": [
             ("timeline_action", "supervised"),
             ("timeline_subtask", "supervised"),
@@ -38,10 +44,9 @@ GROUPS = [
     },
     {
         "name": "Prediction + Reconstruction",
-        "color": "#1f63e9",
-        "left": 472,
-        "top": 374,
-        "width": 248,
         "tasks": [
             ("hand_trajectory_forecast", "forecast"),
             ("modality_reconstruction", "forecast"),
@@ -50,10 +55,9 @@ GROUPS = [
     },
     {
         "name": "Grounding + Retrieval",
-        "color": "#b65b04",
-        "left": 848,
-        "top": 374,
-        "width": 220,
         "tasks": [
             ("caption_grounding", "retrieval"),
             ("cross_modal_retrieval", "retrieval"),
@@ -62,10 +66,9 @@ GROUPS = [
     },
     {
         "name": "Temporal Diagnostics",
-        "color": "#b42318",
-        "left": 1202,
-        "top": 374,
-        "width": 244,
         "tasks": [
             ("transition_detection", "diagnostic"),
             ("temporal_order", "diagnostic"),
@@ -74,6 +77,287 @@ GROUPS = [
     },
 ]
 def load_summary() -> dict:
     return json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))
@@ -105,226 +389,484 @@ def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
 def short_io(task_name: str, metrics: dict) -> str:
     custom = {
-        "timeline_action": "all modalities -> action label",
-        "timeline_subtask": "all modalities -> subtask label",
-        "transition_detection": "all modalities -> boundary / steady",
-        "next_action": "window at t -> action at t+20",
         "hand_trajectory_forecast": "all modalities -> future hand joints",
-        "contact_prediction": "non-contact modalities -> contact",
-        "object_relevance": "non-caption modalities -> object set",
-        "caption_grounding": "text query -> matching window",
-        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
-        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
-        "temporal_order": "two windows -> correct order?",
-        "misalignment_detection": "motion + visual -> aligned / shifted",
     }
     return custom.get(task_name, metrics.get("input", ""))
-def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
     label, value = metric_for(task_name, metrics)
     io = short_io(task_name, metrics)
-    name_size = 17 if len(task_name) > 22 else 18
     return f"""
-      <section class="task" style="left:{group['left']}px;top:{top}px;width:{group['width']}px;--accent:{group['color']};">
-        <div class="kind">{html.escape(kind)}</div>
-        <div class="task-name" style="font-size:{name_size}px;">{html.escape(task_name)}</div>
-        <div class="io">{html.escape(io)}</div>
-        <div class="metric"><span>{html.escape(label)}</span><strong>{html.escape(value)}</strong></div>
-      </section>
     """
-def build_html(summary: dict, base_image: Path) -> str:
-    suite = summary["tasks"]
-    task_count = len(suite)
-    group_headers = []
-    cards = []
-    row_tops = [374, 552, 730]
-    header_lefts = [38, 417, 792, 1143]
-    for group, header_left in zip(GROUPS, header_lefts):
-        group_headers.append(
-            f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>'
-        )
-        for row_idx, (task_name, kind) in enumerate(group["tasks"]):
-            cards.append(task_html(task_name, kind, suite[task_name], row_tops[row_idx], group))
     stats = [
-        f"{summary['num_frames']:,} frames",
-        f"{summary['num_windows']:,} windows",
-        f"{summary['feature_dim']:,} features",
-        f"{task_count} tasks",
-        "chronological split",
     ]
-    stat_html = "".join(f"<span>{html.escape(item)}</span>" for item in stats)
-    base_uri = base_image.resolve().as_uri()
     return f"""<!doctype html>
 <html lang="en">
 <head>
   <meta charset="utf-8">
-  <meta name="viewport" content="width=1536, initial-scale=1">
   <title>Ropedia 12-Task Episode Suite Infographic</title>
   <style>
     * {{ box-sizing: border-box; }}
-    html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }}
     body {{
-      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
-      color: #10141f;
     }}
     .canvas {{
       position: relative;
-      width: 1536px;
-      height: 1024px;
       overflow: hidden;
-      background-image: url("{base_uri}");
-      background-size: 1536px 1024px;
-      background-repeat: no-repeat;
     }}
-    .title {{
       position: absolute;
-      left: 330px;
-      top: 42px;
-      width: 876px;
-      text-align: center;
     }}
     h1 {{
-      margin: 0;
-      font-size: 38px;
-      line-height: 1.05;
       letter-spacing: 0;
-      font-weight: 820;
     }}
     .subtitle {{
-      margin-top: 8px;
-      color: #425067;
-      font-size: 15px;
       line-height: 1.35;
       font-weight: 520;
     }}
     .stats {{
-      margin-top: 12px;
-      display: flex;
-      justify-content: center;
-      gap: 8px;
     }}
-    .stats span {{
-      display: inline-flex;
       align-items: center;
-      height: 24px;
-      padding: 0 10px;
-      border: 1px solid #cdd8e8;
-      background: rgba(255, 255, 255, 0.82);
-      border-radius: 999px;
-      color: #253046;
-      font-size: 12px;
-      font-weight: 720;
     }}
     .modality {{
-      position: absolute;
-      top: 256px;
-      width: 180px;
-      text-align: center;
       font-size: 12px;
-      color: #536074;
-      font-weight: 720;
-      text-transform: uppercase;
-      letter-spacing: 0;
     }}
-    .group-title {{
-      position: absolute;
-      width: 322px;
-      text-align: center;
-      font-size: 18px;
       line-height: 1;
-      font-weight: 830;
-      letter-spacing: 0;
     }}
-    .task {{
-      position: absolute;
-      padding: 0;
     }}
     .kind {{
       display: inline-flex;
       align-items: center;
-      height: 22px;
-      padding: 0 8px;
       border-radius: 6px;
-      border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff);
       color: var(--accent);
-      background: rgba(255, 255, 255, 0.76);
       text-transform: uppercase;
-      font-size: 10px;
       line-height: 1;
-      font-weight: 840;
-      letter-spacing: 0;
     }}
-    .task-name {{
-      margin-top: 7px;
       color: #111827;
-      line-height: 1.05;
-      font-weight: 850;
-      letter-spacing: 0;
-      white-space: nowrap;
     }}
-    .io {{
-      margin-top: 8px;
-      min-height: 36px;
-      color: #475569;
-      font-size: 13.5px;
       line-height: 1.28;
-      font-weight: 570;
     }}
     .metric {{
       display: inline-flex;
-      align-items: center;
-      gap: 9px;
-      margin-top: 8px;
-      height: 30px;
-      padding: 0 10px;
-      border-radius: 7px;
-      border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff);
-      background: rgba(255, 255, 255, 0.90);
-      box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07);
     }}
     .metric span {{
       color: #64748b;
-      font-size: 12px;
       font-weight: 760;
     }}
     .metric strong {{
       color: var(--accent);
-      font-size: 16px;
       line-height: 1;
       font-weight: 860;
     }}
     .footer {{
-      position: absolute;
-      left: 360px;
-      top: 932px;
-      width: 816px;
-      text-align: center;
-      color: #536074;
-      font-size: 14px;
-      font-weight: 650;
     }}
   </style>
 </head>
 <body>
   <main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
-    <div class="title">
-      <h1>Ropedia 12-Task Episode Suite</h1>
-      <div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div>
-      <div class="stats">{stat_html}</div>
     </div>
-    <div class="modality" style="left:50px;">fisheye video</div>
-    <div class="modality" style="left:270px;">depth</div>
-    <div class="modality" style="left:530px;">3D / SLAM</div>
-    <div class="modality" style="left:770px;">IMU</div>
-    <div class="modality" style="left:1030px;">hands</div>
-    <div class="modality" style="left:1278px;">text / objects</div>
-    {''.join(group_headers)}
-    {''.join(cards)}
-    <div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div>
   </main>
 </body>
 </html>
@@ -340,7 +882,7 @@ def render_html(html_path: Path, output_path: Path) -> None:
             "playwright",
             "screenshot",
             "--full-page",
-            "--viewport-size=1536,1024",
             html_path.resolve().as_uri(),
             str(output_path),
         ],
@@ -351,13 +893,14 @@ def render_html(html_path: Path, output_path: Path) -> None:
 def main() -> int:
     parser = argparse.ArgumentParser()
     parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
     parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
     parser.add_argument("--html", type=Path)
-    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
     args = parser.parse_args()
     summary = load_summary()
-    html_text = build_html(summary, args.base_image)
     if args.html is None:
         with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
             handle.write(html_text)
@@ -370,7 +913,7 @@ def main() -> int:
     if not args.no_export:
         render_html(html_path, args.output)
         print(f"Wrote image: {args.output}")
-    print(f"Wrote overlay HTML: {html_path}")
     return 0

 #!/usr/bin/env python3
 """
+Render a polished 12-task Ropedia episode-suite infographic.
+The task names, inputs, and metrics are read from
+results/episode_task_suite/summary_report.json. The output is a deterministic
+PNG rendered from HTML/CSS so the labels stay legible and reviewable.
 """
 from __future__ import annotations
 import argparse
+import base64
 import html
+import io
 import json
 import subprocess
 import tempfile
 ROOT = Path(__file__).resolve().parents[1]
 SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
 DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
+DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
 DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
+CANVAS_WIDTH = 1800
+CANVAS_HEIGHT = 1650
+THUMB_WIDTH = 420
+THUMB_HEIGHT = 160
 GROUPS = [
     {
         "name": "Label + State",
+        "tone": "teal",
+        "color": "#197d83",
+        "soft": "#e8f4f3",
         "tasks": [
             ("timeline_action", "supervised"),
             ("timeline_subtask", "supervised"),
     },
     {
         "name": "Prediction + Reconstruction",
+        "tone": "blue",
+        "color": "#1f6c9f",
+        "soft": "#e8f1fb",
         "tasks": [
             ("hand_trajectory_forecast", "forecast"),
             ("modality_reconstruction", "forecast"),
     },
     {
         "name": "Grounding + Retrieval",
+        "tone": "amber",
+        "color": "#9b6516",
+        "soft": "#fbf3df",
         "tasks": [
             ("caption_grounding", "retrieval"),
             ("cross_modal_retrieval", "retrieval"),
     },
     {
         "name": "Temporal Diagnostics",
+        "tone": "red",
+        "color": "#b0443e",
+        "soft": "#fdeceb",
         "tasks": [
             ("transition_detection", "diagnostic"),
             ("temporal_order", "diagnostic"),
     },
 ]
# One entry per modality card, as (name, line_one, line_two).
# The name doubles as the lookup key for that card's thumbnail.
MODALITIES = [
    ("video", "6 camera streams", "fisheye + stereo"),
    ("depth", "confidence maps", "spatial geometry"),
    ("3D / SLAM", "point-cloud summaries", "scene structure"),
    ("IMU", "accel + gyro", "body motion"),
    ("hands", "future joints", "embodied action"),
    ("text", "objects + captions", "semantic grounding"),
]
# Joint-index pairs connected by a line when a hand is drawn: five chains of
# four segments, each starting at joint 0 and together covering joints 0-20.
# NOTE(review): presumably joint 0 is the wrist and each chain is one finger;
# confirm against the joint order of the hand mocap data.
HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]
def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    """Serialize an image object into a base64 ``data:`` URI.

    Args:
        image: Object exposing ``save(fp, **kwargs)``, e.g. a PIL image.
        fmt: Output format name. ``"JPG"`` is accepted as an alias of
            ``"JPEG"`` (case-insensitive).
        quality: JPEG quality setting; ignored for every other format.

    Returns:
        ``data:image/<mime>;base64,<payload>`` where ``<mime>`` is ``jpeg``
        for JPEG output and ``png`` otherwise.
    """
    is_jpeg = fmt.upper() in {"JPEG", "JPG"}
    # Pillow registers its encoder under "JPEG" only, so the "JPG" alias has
    # to be normalized before it reaches save().
    save_kwargs = {"format": "JPEG" if is_jpeg else fmt}
    if is_jpeg:
        save_kwargs.update({"quality": quality, "optimize": True})
    buffer = io.BytesIO()
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    mime = "jpeg" if is_jpeg else "png"
    return f"data:image/{mime};base64,{encoded}"
def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(255, 254, 253)):
    """Create a blank RGB image of ``size`` filled with ``color``.

    Pillow is imported inside the function, so it is only required once a
    canvas is actually requested.
    """
    import PIL.Image

    return PIL.Image.new("RGB", size, color)
def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    """Return an RGB version of ``image`` resized and center-cropped to ``size``."""
    from PIL import Image, ImageOps

    # Image.BICUBIC is the named form of the resampling filter that the bare
    # literal 3 used to select.
    return ImageOps.fit(image.convert("RGB"), size, method=Image.BICUBIC, centering=(0.5, 0.5))
def read_video_frame(video_path: Path, frame_index: int = 2400):
    """Decode a single frame of a video and return it as an RGB PIL image.

    Args:
        video_path: Video file to open with OpenCV.
        frame_index: Frame to read. When the container reports a frame count,
            the index is clamped into ``[0, count - 1]``.

    Returns:
        The decoded frame converted from BGR to RGB.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
    """
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        if total:
            frame_index = max(0, min(frame_index, total - 1))
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = cap.read()
    finally:
        # Release the capture handle on every path, including failures.
        cap.release()
    if not ok:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
def draw_label(draw, xy, text, fill=(31, 36, 33), size=18):
    """Draw ``text`` at ``xy`` using a bold sans-serif font when one is available.

    Args:
        draw: Drawing context exposing ``text(xy, text, fill=..., font=...)``.
        xy: Top-left anchor of the text.
        text: String to render.
        fill: Text color.
        size: Requested font size; only honored when a TrueType font loads.
    """
    from PIL import ImageFont

    # The first entry is the original macOS path. The bare file names let
    # Pillow search the platform font directories, so the requested size is
    # honored on Linux and Windows too instead of always falling back.
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
        "DejaVuSans-Bold.ttf",
        "arialbd.ttf",
    )
    font = None
    for candidate in candidates:
        try:
            font = ImageFont.truetype(candidate, size)
            break
        except (OSError, ImportError, ValueError):
            # Missing font file, FreeType unavailable, or rejected size.
            continue
    if font is None:
        # Last resort: Pillow's built-in font, which does not follow ``size``.
        font = ImageFont.load_default()
    draw.text(xy, text, fill=fill, font=font)
def video_thumb(sample_dir: Path) -> str:
    """Build the "video" modality thumbnail and return it as a JPEG data URI.

    The left panel is frame 2450 of ``fisheye_cam0.mp4``; the right panel is
    the same frame of ``stereo_left.mp4`` when that file exists, otherwise a
    copy of the fisheye panel.

    Raises:
        RuntimeError: Propagated from ``read_video_frame`` when a video cannot
            be opened or decoded.
    """
    from PIL import ImageDraw

    panel_size = (194, THUMB_HEIGHT)
    fish = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", 2450), panel_size)
    stereo_path = sample_dir / "stereo_left.mp4"
    if stereo_path.exists():
        stereo = fit_image(read_video_frame(stereo_path, 2450), panel_size)
    else:
        # No stereo stream available: reuse the fisheye panel so both halves are filled.
        stereo = fish.copy()
    canvas = make_canvas()
    canvas.paste(fish, (0, 0))
    canvas.paste(stereo, (226, 0))
    draw = ImageDraw.Draw(canvas, "RGBA")
    # Light divider band between the two panels.
    draw.rounded_rectangle((188, 0, 232, THUMB_HEIGHT), radius=0, fill=(251, 250, 247, 235))
    # NOTE(review): the "fisheye" label is white and starts on the near-white
    # divider band; confirm this placement is legible in the rendered image.
    draw_label(draw, (194, 16), "fisheye", fill=(255, 255, 255), size=14)
    draw_label(draw, (240, 16), "stereo", fill=(255, 255, 255), size=14)
    return image_data_uri(canvas, "JPEG")
def colorize(values):
    """Map scalars in ``[0, 1]`` to RGB colors along a five-stop gradient.

    Args:
        values: Array-like of scalars. Values outside ``[0, 1]`` are clipped
            and NaN entries are treated as 0.

    Returns:
        ``uint8`` array with one trailing RGB axis added to the input shape.
    """
    import numpy as np

    stops = np.array([
        [26, 35, 126],
        [36, 123, 160],
        [68, 170, 122],
        [238, 190, 76],
        [197, 79, 51],
    ], dtype=np.float32)
    # NaN passes through np.clip unchanged and would turn into an invalid
    # index after the integer cast below, so pin it to the low end first.
    x = np.clip(np.nan_to_num(values, nan=0.0), 0, 1)
    scaled = x * (len(stops) - 1)
    lo = np.floor(scaled).astype(int)
    # At x == 1 the upper stop would fall off the table; clamp it to the last stop.
    hi = np.clip(lo + 1, 0, len(stops) - 1)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")
def depth_thumb(h5) -> str:
    """Build the "depth" modality thumbnail and return it as a JPEG data URI.

    The left panel is entry 2450 of ``depth/depth`` colorized on its
    3rd-97th percentile range; the right panel is entry 2450 of
    ``depth/confidence`` shown as grayscale.

    Args:
        h5: Mapping-like handle exposing the ``depth/depth`` and
            ``depth/confidence`` datasets.
    """
    import numpy as np
    from PIL import Image, ImageDraw

    frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
    valid = np.isfinite(frame)
    if valid.any():
        lo, hi = np.percentile(frame[valid], [3, 97])
    else:
        # No finite pixel at all: fall back to a fixed range rather than
        # taking percentiles of an empty selection.
        lo, hi = 0.0, 1.0
    norm = (frame - lo) / max(hi - lo, 1e-6)
    # NaN pixels cannot be mapped to a color; render them at the low end.
    norm = np.nan_to_num(norm, nan=0.0)
    rgb = colorize(norm)
    depth = fit_image(Image.fromarray(rgb), (204, THUMB_HEIGHT))
    # NOTE(review): the cast assumes confidence is already on a 0-255 scale;
    # confirm against the dataset.
    conf = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
    conf_img = Image.fromarray(conf, mode="L").convert("RGB")
    conf_img = fit_image(conf_img, (204, THUMB_HEIGHT))
    canvas = make_canvas()
    canvas.paste(depth, (0, 0))
    canvas.paste(conf_img, (216, 0))
    draw = ImageDraw.Draw(canvas, "RGBA")
    # Translucent dark plates behind the two white captions.
    draw.rounded_rectangle((0, 0, 116, 28), radius=6, fill=(31, 36, 33, 150))
    draw.rounded_rectangle((216, 0, 350, 28), radius=6, fill=(31, 36, 33, 150))
    draw_label(draw, (10, 6), "depth", fill=(255, 255, 255), size=14)
    draw_label(draw, (226, 6), "confidence", fill=(255, 255, 255), size=14)
    return image_data_uri(canvas, "JPEG")
def normalize_points(points, width, height, pad=16):
    """Project the first two columns of ``points`` into a padded pixel box.

    Each axis is rescaled so that its 2nd-98th percentile range spans
    ``[pad, size - pad]``; values outside that range are clamped to the box
    edge, and the second axis is inverted.

    Args:
        points: Array with at least two columns, one point per row.
        width: Box width in pixels.
        height: Box height in pixels.
        pad: Margin kept free on every side.

    Returns:
        Array of shape ``(N, 2)`` with pixel coordinates; empty when
        ``points`` has no rows.
    """
    import numpy as np

    xy = np.asarray(points)[:, :2]
    if len(xy) == 0:
        # Percentiles are undefined for zero samples; there is nothing to place.
        return np.empty((0, 2), dtype=np.float64)
    lo = np.percentile(xy, 2, axis=0)
    hi = np.percentile(xy, 98, axis=0)
    # Floor the span so a degenerate (constant) axis does not divide by zero.
    span = np.maximum(hi - lo, 1e-6)
    norm = np.clip((xy - lo) / span, 0, 1)
    norm[:, 1] = 1 - norm[:, 1]
    out = np.empty_like(norm)
    out[:, 0] = pad + norm[:, 0] * (width - pad * 2)
    out[:, 1] = pad + norm[:, 1] * (height - pad * 2)
    return out
def slam_thumb(h5) -> str:
    """Build the "3D / SLAM" modality thumbnail and return it as a PNG data URI.

    Draws up to 2600 evenly sampled points of ``slam/point_cloud`` using
    columns 0 and 2 as the plane and column 1 for color, then overlays every
    36th entry of the first 2450 rows of ``slam/trans_xyz`` as a polyline.

    Args:
        h5: Mapping-like handle exposing the two ``slam/*`` datasets.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    points = np.array(h5["slam/point_cloud"], dtype=np.float64)
    points = points[np.isfinite(points).all(axis=1)]
    if len(points) > 2600:
        points = points[np.linspace(0, len(points) - 1, 2600).astype(int)]
    if len(points):  # an empty or all-invalid cloud leaves nothing to scatter
        xy = normalize_points(points[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
        z = points[:, 1]
        z_lo, z_hi = np.percentile(z, [2, 98])
        colors = colorize((z - z_lo) / max(z_hi - z_lo, 1e-6))
        for (x, y), color in zip(xy, colors):
            draw.ellipse((x - 1.2, y - 1.2, x + 1.2, y + 1.2), fill=tuple(color.tolist()) + (165,))

    traj = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    # Drop poses with non-finite coordinates; they cannot be drawn.
    traj = traj[np.isfinite(traj).all(axis=1)]
    if len(traj) >= 2:  # a polyline needs at least one segment
        # NOTE(review): the trajectory is rescaled on its own percentile
        # range, independently of the point cloud above; confirm the overlay
        # is meant to be schematic rather than spatially registered.
        traj_xy = normalize_points(traj[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
        for a, b in zip(traj_xy[:-1], traj_xy[1:]):
            draw.line((a[0], a[1], b[0], b[1]), fill=(31, 108, 159, 190), width=2)

    draw_label(draw, (16, 14), "SLAM point cloud + pose", fill=(31, 36, 33), size=17)
    return image_data_uri(canvas, "PNG")
def imu_thumb(h5) -> str:
    """Build the "IMU" modality thumbnail and return it as a PNG data URI.

    Plots the three ``imu/accel_xyz`` and three ``imu/gyro_xyz`` channels over
    a window of up to 440 samples around the index read from
    ``imu/keyframe_indices[2450]``. Each trace uses at most its first 420
    samples and is scaled on its own 3rd-97th percentile range.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    center = int(h5["imu/keyframe_indices"][2450])
    window = slice(max(0, center - 220), center + 220)
    accel = np.array(h5["imu/accel_xyz"][window], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][window], dtype=np.float64)
    channels = [block[:, axis] for block in (accel, gyro) for axis in range(3)]
    palette = [
        (31, 108, 159),
        (52, 101, 56),
        (176, 68, 62),
        (155, 101, 22),
        (46, 119, 117),
        (96, 109, 128),
    ]

    # Four faint horizontal guide lines behind the traces.
    for guide_y in range(26, 26 + 4 * 33, 33):
        draw.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(228, 222, 212, 180), width=1)

    for channel, rgb in zip(channels, palette):
        samples = channel[:420]
        count = len(samples)
        if count < 2:
            continue
        lo, hi = np.percentile(samples, [3, 97])
        unit = np.clip((samples - lo) / max(hi - lo, 1e-6), 0, 1)
        xs = 18 + np.arange(count) / max(count - 1, 1) * (THUMB_WIDTH - 36)
        ys = 138 - unit * 112
        draw.line(list(zip(xs.tolist(), ys.tolist())), fill=rgb + (200,), width=2)

    draw_label(draw, (16, 12), "accel / gyro traces", fill=(31, 36, 33), size=17)
    return image_data_uri(canvas, "PNG")
def hands_thumb(h5) -> str:
    """Build the "hands" modality thumbnail and return it as a PNG data URI.

    Reads entry 2450 of ``hand_mocap/left_joints_3d`` and
    ``hand_mocap/right_joints_3d``, scales the first two coordinates of both
    hands on one shared 2nd-98th percentile range, and draws each hand as a
    skeleton (``HAND_EDGES``) with a dot per joint, side by side.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    left = np.array(h5["hand_mocap/left_joints_3d"][2450], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][2450], dtype=np.float32)

    # One shared range keeps both hands at the same scale.
    both_xy = np.concatenate([left, right], axis=0)[:, :2]
    lo = np.percentile(both_xy, 2, axis=0)
    hi = np.percentile(both_xy, 98, axis=0)
    span = np.maximum(hi - lo, 1e-6)

    hands = (
        (left, 28, (31, 108, 159)),
        (right, 224, (155, 101, 22)),
    )
    for joints, x_offset, rgb in hands:
        unit = (joints[:, :2] - lo) / span
        px = x_offset + unit[:, 0] * 150
        py = 26 + (1 - unit[:, 1]) * 108  # second axis inverted
        for start, end in HAND_EDGES:
            draw.line((px[start], py[start], px[end], py[end]), fill=rgb + (185,), width=3)
        for x, y in zip(px, py):
            draw.ellipse((x - 3, y - 3, x + 3, y + 3), fill=rgb + (230,))

    draw_label(draw, (16, 12), "left / right 3D hand joints", fill=(31, 36, 33), size=17)
    return image_data_uri(canvas, "PNG")
def text_thumb(h5) -> str:
    """Build the "text" modality thumbnail and return it as a PNG data URI.

    Parses the JSON stored in the ``caption`` dataset and draws the
    ``config["Main Task"]`` title, up to five object chips and up to two
    action labels taken from the first segment. Missing ``segments`` or
    ``config`` entries yield an otherwise empty card instead of an error.

    Args:
        h5: Mapping-like handle exposing the ``caption`` dataset.
    """
    from PIL import ImageDraw

    raw = h5["caption"][()]
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", errors="replace")
    data = json.loads(raw)
    # A caption without segments or config should not abort the thumbnail:
    # the caller discards every thumbnail as soon as one builder raises.
    segments = data.get("segments") or [{}]
    segment = segments[0]
    objects = sorted({item for values in segment.get("objects", {}).values() for item in values})[:5]
    actions = [a.get("label", "") for a in segment.get("Current Action", [])][:2]
    title = data.get("config", {}).get("Main Task", "caption")

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw_label(draw, (16, 13), title, fill=(31, 36, 33), size=17)

    # Left column: one chip per object, sized from the label length.
    y = 46
    for label in objects:
        if y >= THUMB_HEIGHT:
            # Chips starting below the canvas would never be visible.
            break
        draw.rounded_rectangle((16, y, 16 + 20 + len(label) * 8, y + 24), radius=6, fill=(251, 243, 219, 230), outline=(226, 200, 144, 255))
        draw_label(draw, (26, y + 5), label, fill=(83, 74, 56), size=12)
        y += 30

    # Right column: action labels cut to 32 characters.
    x = 184
    y = 48
    for action in actions:
        clipped = action[:32] + ("..." if len(action) > 32 else "")
        draw.rounded_rectangle((x, y, THUMB_WIDTH - 16, y + 36), radius=7, fill=(232, 244, 243, 230), outline=(169, 204, 202, 255))
        draw_label(draw, (x + 10, y + 10), clipped, fill=(31, 36, 33), size=12)
        y += 44
    return image_data_uri(canvas, "PNG")
def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
    """Build the per-modality thumbnails from a sample episode directory.

    Thumbnails are optional decoration: whenever the directory, its
    ``fisheye_cam0.mp4`` or its ``annotation.hdf5`` is missing, or any builder
    fails, an empty dict is returned.

    Args:
        sample_dir: Directory holding the sample episode, or ``None``.

    Returns:
        Mapping from modality name to an image data URI; empty on any failure.
    """
    import sys

    if sample_dir is None or not sample_dir.exists():
        return {}
    hdf5_path = sample_dir / "annotation.hdf5"
    required = [sample_dir / "fisheye_cam0.mp4", hdf5_path]
    if not all(path.exists() for path in required):
        return {}
    try:
        import h5py

        thumbnails = {"video": video_thumb(sample_dir)}
        with h5py.File(hdf5_path, "r") as h5:
            thumbnails.update({
                "depth": depth_thumb(h5),
                "3D / SLAM": slam_thumb(h5),
                "IMU": imu_thumb(h5),
                "hands": hands_thumb(h5),
                "text": text_thumb(h5),
            })
        return thumbnails
    except Exception as exc:
        # Deliberate best-effort boundary: any failure here (missing optional
        # dependency, unexpected dataset layout, decode error) downgrades to
        # "no thumbnails". The warning goes to stderr so it does not mix with
        # normal output on stdout.
        print(f"Warning: could not build sample modality thumbnails: {exc}", file=sys.stderr)
        return {}
 def load_summary() -> dict:
     """Parse and return the JSON summary report stored at ``SUMMARY_PATH``."""
     with SUMMARY_PATH.open(encoding="utf-8") as handle:
         return json.load(handle)
 def short_io(task_name: str, metrics: dict) -> str:
     """Return the one-line "input -> output" caption for a task.

     Tasks listed below get their curated caption; any other task falls back
     to the ``"input"`` entry of ``metrics``, or an empty string when that
     entry is absent.
     """
     fallback = metrics.get("input", "")
     captions = {
         "timeline_action": "all modalities -> current action label",
         "timeline_subtask": "all modalities -> current subtask label",
         "transition_detection": "all modalities -> boundary vs steady",
         "next_action": "window at t -> action at t+20 frames",
         "hand_trajectory_forecast": "all modalities -> future hand joints",
         "contact_prediction": "non-contact modalities -> contact state",
         "object_relevance": "non-caption modalities -> relevant objects",
         "caption_grounding": "text query -> matching sensor window",
         "cross_modal_retrieval": "motion / IMU / camera -> depth / video match",
         "modality_reconstruction": "motion / IMU / camera -> depth / video vector",
         "temporal_order": "two adjacent windows -> correct order",
         "misalignment_detection": "motion + visual pair -> aligned or shifted",
     }
     try:
         return captions[task_name]
     except KeyError:
         return fallback
def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int) -> str:
    """Return the HTML ``<article>`` for one task card.

    Args:
        task_name: Task identifier, shown as the card heading.
        kind: Task kind shown next to the index.
        metrics: Metrics entry for the task, passed to ``metric_for`` and
            ``short_io``.
        group: Task family entry; its ``"color"`` and ``"soft"`` values are
            set as the ``--accent`` and ``--soft`` CSS variables.
        index: Card number, rendered zero-padded to two digits.
    """
    metric_label, metric_value = metric_for(task_name, metrics)
    # Named io_text so the local does not shadow the ``io`` module.
    io_text = short_io(task_name, metrics)
    accent = group["color"]
    soft = group["soft"]
    return f"""
      <article class="task-card" style="--accent:{accent};--soft:{soft};">
        <div class="task-meta">
          <span class="index">{index:02d}</span>
          <span class="kind">{html.escape(kind)}</span>
        </div>
        <h3>{html.escape(task_name)}</h3>
        <p>{html.escape(io_text)}</p>
        <div class="metric">
          <span>{html.escape(metric_label)}</span>
          <strong>{html.escape(metric_value)}</strong>
        </div>
      </article>
    """
def modality_card(name: str, line_one: str, line_two: str, index: int, thumbnail: str | None) -> str:
    """Return the HTML ``<article>`` for one modality card.

    Args:
        name: Modality name, shown as the card heading.
        line_one: Primary description line.
        line_two: Secondary description line.
        index: Card number, rendered zero-padded to two digits.
        thumbnail: Image URL or data URI for the card; no image block is
            emitted when this is ``None`` or empty.
    """
    thumb_html = ""
    if thumbnail:
        # Escaped like every other interpolated value so the string can never
        # break out of the src attribute; base64 data URIs pass through unchanged.
        thumb_src = html.escape(thumbnail, quote=True)
        thumb_html = f'<div class="modality-thumb"><img src="{thumb_src}" alt=""></div>'
    return f"""
      <article class="modality">
        {thumb_html}
        <div class="modality-index">{index:02d}</div>
        <h3>{html.escape(name)}</h3>
        <p>{html.escape(line_one)}</p>
        <span>{html.escape(line_two)}</span>
      </article>
    """
+def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str:
+    suite = summary["tasks"]
+    thumbnails = load_sample_thumbnails(sample_dir)
+    base_layer = ""
+    if base_image is not None and base_image.exists():
+        base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>'
     stats = [
+        (f"{summary['num_frames']:,}", "frames"),
+        (f"{summary['num_windows']:,}", "windows"),
+        (f"{summary['feature_dim']:,}", "features"),
+        (f"{len(suite)}", "tasks"),
+        ("70/30", "chronological split"),
     ]
+    stats_html = "".join(
+        f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>"
+        for value, label in stats
+    )
+    modalities_html = "".join(
+        modality_card(name, line_one, line_two, index, thumbnails.get(name))
+        for index, (name, line_one, line_two) in enumerate(MODALITIES, start=1)
+    )
+    task_index = 1
+    families = []
+    for group in GROUPS:
+        cards = []
+        for task_name, kind in group["tasks"]:
+            cards.append(task_card(task_name, kind, suite[task_name], group, task_index))
+            task_index += 1
+        families.append(
+            f"""
+            <section class="family" style="--accent:{group['color']};--soft:{group['soft']};">
+              <div class="family-head">
+                <span>{html.escape(group['tone'])}</span>
+                <h2>{html.escape(group['name'])}</h2>
+              </div>
+              <div class="family-cards">{''.join(cards)}</div>
+            </section>
+            """
+        )
     return f"""<!doctype html>
 <html lang="en">
 <head>
   <meta charset="utf-8">
+  <meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1">
   <title>Ropedia 12-Task Episode Suite Infographic</title>
   <style>
     * {{ box-sizing: border-box; }}
+    html,
     body {{
+      margin: 0;
+      width: {CANVAS_WIDTH}px;
+      height: {CANVAS_HEIGHT}px;
+      background: #fbfaf7;
+    }}
+    body {{
+      font-family: "Avenir Next", "SF Pro Display", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+      color: #1f2421;
+      text-rendering: optimizeLegibility;
     }}
     .canvas {{
       position: relative;
+      width: {CANVAS_WIDTH}px;
+      height: {CANVAS_HEIGHT}px;
       overflow: hidden;
+      padding: 54px 64px 44px;
+      background:
+        radial-gradient(circle at 9% 6%, rgba(31,108,159,0.13), transparent 20%),
+        radial-gradient(circle at 90% 9%, rgba(155,101,22,0.10), transparent 22%),
+        linear-gradient(90deg, rgba(68,55,38,0.035) 1px, transparent 1px),
+        linear-gradient(0deg, rgba(68,55,38,0.027) 1px, transparent 1px),
+        #fbfaf7;
+      background-size: auto, auto, 54px 54px, 54px 54px, auto;
     }}
+    .image-background {{
       position: absolute;
+      inset: 0;
+      background-position: center;
+      background-repeat: no-repeat;
+      background-size: cover;
+      opacity: 0.30;
+      filter: saturate(0.85) contrast(0.98);
+    }}
+    .content {{
+      position: relative;
+      z-index: 1;
+    }}
+    .header {{
+      display: grid;
+      grid-template-columns: 1.25fr 0.75fr;
+      gap: 44px;
+      align-items: end;
+      padding-bottom: 30px;
+      border-bottom: 1px solid #e4ded4;
+    }}
+    .kicker {{
+      display: inline-flex;
+      align-items: center;
+      gap: 12px;
+      color: #5f625d;
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 15px;
+      text-transform: uppercase;
+      letter-spacing: 0.08em;
+    }}
+    .kicker::before {{
+      content: "";
+      width: 44px;
+      height: 1px;
+      background: #1f2421;
     }}
     h1 {{
+      margin: 18px 0 0;
+      max-width: 930px;
+      font-size: 72px;
+      line-height: 0.95;
       letter-spacing: 0;
     }}
     .subtitle {{
+      margin: 18px 0 0;
+      max-width: 900px;
+      color: #5f625d;
+      font-size: 23px;
       line-height: 1.35;
       font-weight: 520;
     }}
     .stats {{
+      display: grid;
+      grid-template-columns: repeat(5, minmax(0, 1fr));
+      gap: 10px;
     }}
+    .stat {{
+      min-height: 78px;
+      padding: 14px 15px;
+      border: 1px solid #e4ded4;
+      background: rgba(255,254,253,0.76);
+      border-radius: 10px;
+    }}
+    .stat strong {{
+      display: block;
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 25px;
+      line-height: 1;
+      font-variant-numeric: tabular-nums;
+    }}
+    .stat span {{
+      display: block;
+      margin-top: 8px;
+      color: #6f716c;
+      font-size: 13px;
+      line-height: 1.15;
+    }}
+    .section-label {{
+      display: flex;
       align-items: center;
+      justify-content: space-between;
+      margin: 28px 0 14px;
+      color: #5f625d;
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 14px;
+      text-transform: uppercase;
+      letter-spacing: 0.08em;
+    }}
+    .section-label span:last-child {{
+      color: #7e817b;
+      text-transform: none;
+      letter-spacing: 0;
+      font-family: inherit;
+    }}
+    .modalities {{
+      display: grid;
+      grid-template-columns: repeat(6, minmax(0, 1fr));
+      gap: 14px;
     }}
     .modality {{
+      min-height: 204px;
+      padding: 11px 12px 14px;
+      border: 1px solid #e4ded4;
+      background: rgba(255,254,253,0.84);
+      border-radius: 12px;
+    }}
+    .modality-thumb {{
+      height: 86px;
+      overflow: hidden;
+      border: 1px solid #eee9e1;
+      border-radius: 9px;
+      background: #f5f1e9;
+    }}
+    .modality-thumb img {{
+      display: block;
+      width: 100%;
+      height: 100%;
+      object-fit: cover;
+    }}
+    .modality-index,
+    .index {{
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-variant-numeric: tabular-nums;
+    }}
+    .modality-index {{
+      color: #8a8072;
       font-size: 12px;
+      margin-top: 10px;
     }}
+    .modality h3 {{
+      margin: 8px 0 0;
+      font-size: 22px;
       line-height: 1;
+      text-transform: uppercase;
     }}
+    .modality p {{
+      margin: 9px 0 0;
+      color: #4f565f;
+      font-size: 15px;
+      font-weight: 650;
+    }}
+    .modality span {{
+      display: block;
+      margin-top: 5px;
+      color: #7a7d77;
+      font-size: 13px;
+    }}
+    .shared-band {{
+      display: grid;
+      grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr;
+      gap: 12px;
+      align-items: center;
+      margin-top: 20px;
+      padding: 14px;
+      border: 1px solid #e4ded4;
+      background: rgba(245,241,233,0.82);
+      border-radius: 12px;
+    }}
+    .step {{
+      min-height: 62px;
+      padding: 13px 15px;
+      background: #fffefd;
+      border: 1px solid #eee9e1;
+      border-radius: 9px;
+    }}
+    .step strong {{
+      display: block;
+      font-size: 17px;
+      line-height: 1.1;
+    }}
+    .step span {{
+      display: block;
+      margin-top: 5px;
+      color: #6f716c;
+      font-size: 13px;
+    }}
+    .arrow {{
+      color: #938a7d;
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 22px;
+    }}
+    .families {{
+      display: grid;
+      grid-template-columns: repeat(4, minmax(0, 1fr));
+      gap: 20px;
+      margin-top: 26px;
+    }}
+    .family {{
+      padding: 17px;
+      border: 1px solid color-mix(in srgb, var(--accent) 24%, #e4ded4);
+      background: rgba(255,254,253,0.82);
+      border-radius: 16px;
+    }}
+    .family-head {{
+      display: flex;
+      align-items: end;
+      justify-content: space-between;
+      gap: 16px;
+      min-height: 78px;
+      padding-bottom: 14px;
+      border-bottom: 1px solid color-mix(in srgb, var(--accent) 18%, #eee9e1);
+    }}
+    .family-head span {{
+      color: var(--accent);
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 12px;
+      text-transform: uppercase;
+      letter-spacing: 0.08em;
+    }}
+    .family-head h2 {{
+      margin: 0;
+      color: var(--accent);
+      font-size: 29px;
+      line-height: 1.02;
+      text-align: right;
+    }}
+    .family-cards {{
+      display: grid;
+      gap: 13px;
+      margin-top: 15px;
+    }}
+    .task-card {{
+      min-height: 168px;
+      padding: 17px 18px;
+      border: 1px solid color-mix(in srgb, var(--accent) 22%, #e4ded4);
+      background: linear-gradient(180deg, #fffefd, color-mix(in srgb, var(--soft) 45%, #fffefd));
+      border-radius: 13px;
+    }}
+    .task-meta {{
+      display: flex;
+      align-items: center;
+      justify-content: space-between;
+      gap: 12px;
+    }}
+    .index {{
+      color: #8a8072;
+      font-size: 12px;
     }}
     .kind {{
       display: inline-flex;
       align-items: center;
+      height: 24px;
+      padding: 0 9px;
       border-radius: 6px;
+      border: 1px solid color-mix(in srgb, var(--accent) 30%, #ffffff);
       color: var(--accent);
+      background: rgba(255,255,255,0.72);
       text-transform: uppercase;
+      font-size: 11px;
       line-height: 1;
+      font-weight: 830;
     }}
+    .task-card h3 {{
+      margin: 12px 0 0;
       color: #111827;
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 21px;
+      line-height: 1.18;
+      overflow-wrap: anywhere;
     }}
+    .task-card p {{
+      margin: 11px 0 0;
+      min-height: 39px;
+      color: #4f565f;
+      font-size: 15px;
       line-height: 1.28;
+      font-weight: 560;
     }}
     .metric {{
       display: inline-flex;
+      align-items: baseline;
+      gap: 10px;
+      margin-top: 14px;
+      min-height: 32px;
+      padding: 7px 10px;
+      border-radius: 8px;
+      border: 1px solid color-mix(in srgb, var(--accent) 32%, #ffffff);
+      background: rgba(255,255,255,0.82);
     }}
     .metric span {{
       color: #64748b;
+      font-size: 13px;
       font-weight: 760;
     }}
     .metric strong {{
       color: var(--accent);
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      font-size: 20px;
       line-height: 1;
       font-weight: 860;
+      font-variant-numeric: tabular-nums;
     }}
     .footer {{
+      display: flex;
+      align-items: center;
+      justify-content: space-between;
+      gap: 32px;
+      margin-top: 22px;
+      padding-top: 20px;
+      border-top: 1px solid #e4ded4;
+      color: #5f625d;
+      font-size: 18px;
+      line-height: 1.35;
+      font-weight: 620;
+    }}
+    .footer code {{
+      font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace;
+      color: #1f2421;
+      background: #f5f1e9;
+      border: 1px solid #e4ded4;
+      border-radius: 7px;
+      padding: 6px 9px;
+      white-space: nowrap;
     }}
   </style>
 </head>
 <body>
   <main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
+    {base_layer}
+    <div class="content">
+    <header class="header">
+      <div>
+        <div class="kicker">verified single-episode task suite</div>
+        <h1>Ropedia 12-task episode suite</h1>
+        <p class="subtitle">A clean map from synchronized multimodal windows to 12 auditable task heads, with metrics loaded from the committed summary report.</p>
+      </div>
+      <div class="stats">{stats_html}</div>
+    </header>
+    <div class="section-label">
+      <span>input modalities</span>
+      <span>all signals align to the same sliding-window contract</span>
+    </div>
+    <section class="modalities">{modalities_html}</section>
+    <section class="shared-band" aria-label="shared processing contract">
+      <div class="step"><strong>raw public episode</strong><span>videos, depth, motion, IMU, text</span></div>
+      <div class="arrow">-></div>
+      <div class="step"><strong>20-frame windows</strong><span>stride 5, chronological order</span></div>
+      <div class="arrow">-></div>
+      <div class="step"><strong>8,378-d vector</strong><span>explicit feature manifest</span></div>
+      <div class="arrow">-></div>
+      <div class="step"><strong>12 minimal heads</strong><span>softmax, ridge, logistic</span></div>
+    </section>
+    <section class="families">{''.join(families)}</section>
+    <footer class="footer">
+      <span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span>
+      <code>results/episode_task_suite/summary_report.json</code>
+    </footer>
     </div>
   </main>
 </body>
 </html>
             "playwright",
             "screenshot",
             "--full-page",
+            f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}",
             html_path.resolve().as_uri(),
             str(output_path),
         ],
 def main() -> int:
     """Build the infographic HTML and, unless disabled, export it as an image.

     Command-line options:
         --base-image  Path handed to ``build_html`` as the base image.
         --sample-dir  Path handed to ``build_html`` as the sample directory.
         --output      Destination passed to ``render_html`` for the image.
         --html        Optional path for the intermediate HTML. When omitted,
                       the HTML is written to a named temporary file that is
                       kept on disk so it can be rendered and inspected.
         --no-export   Write the HTML only and skip ``render_html``.

     Returns:
         0 once the HTML (and, unless ``--no-export`` is set, the image) has
         been written.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
     parser.add_argument("--sample-dir", type=Path, default=DEFAULT_SAMPLE_DIR)
     parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
     parser.add_argument("--html", type=Path)
     parser.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
     args = parser.parse_args()

     summary = load_summary()
     html_text = build_html(summary, args.base_image, args.sample_dir)

     if args.html is None:
         # delete=False keeps the file after the handle closes, so the renderer
         # can still open it by path once this ``with`` block has exited.
         with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
             handle.write(html_text)
         # Capture the temp file's location; without this, html_path is unbound
         # when it is used below.
         html_path = Path(handle.name)
     else:
         # An explicit --html target must actually receive the HTML; create its
         # parent directory first so a fresh output location does not fail.
         html_path = args.html
         html_path.parent.mkdir(parents=True, exist_ok=True)
         html_path.write_text(html_text, encoding="utf-8")

     if not args.no_export:
         render_html(html_path, args.output)
         print(f"Wrote image: {args.output}")
     print(f"Wrote render HTML: {html_path}")
     return 0