Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Render a polished Ropedia Xperience-10M 12-task infographic. | |
| The task names, inputs, and metrics are read from | |
| results/episode_task_suite/summary_report.json. The output is a deterministic | |
| PNG rendered from HTML/CSS so the labels stay legible and inspectable. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import base64 | |
| import html | |
| import io | |
| import json | |
| import os | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
# Two levels up from this file: parents[0] is the script's own directory,
# parents[1] is the directory that contains it.
ROOT = Path(__file__).resolve().parents[1]
# JSON report parsed by load_summary().
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
# Presumably the default background image for the infographic — its consumer
# is outside this view; confirm against the CLI setup.
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
# Fallback sample-episode locations probed, in this order, by
# resolve_sample_dir() after the environment overrides and the explicit argument.
DEFAULT_SAMPLE_DIR = ROOT.parent / "data/sample/xperience-10m-sample"
DROPBOX_SAMPLE_DIR = Path.home() / "Library/CloudStorage/Dropbox/Ropedia/data/sample/xperience-10m-sample"
# Presumably the default PNG output path — consumer is outside this view.
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
# Pixel dimensions of the HTML page/canvas (interpolated into the page CSS).
CANVAS_WIDTH = 1800
CANVAS_HEIGHT = 6600
# Pixel dimensions of every modality thumbnail image built by the *_thumb helpers.
THUMB_WIDTH = 880
THUMB_HEIGHT = 520
# Task families shown in the infographic, in display order.
#   name  -> family heading text
#   tone  -> short label rendered next to the heading
#   color -> value of the CSS custom property --accent for the family and its cards
#   soft  -> value of the CSS custom property --soft (card gradient tint)
#   tasks -> (task key looked up in summary["tasks"], kind badge text) pairs
# NOTE(review): the tone words ("blue", "amber", "red") do not describe the hex
# colours they sit beside and are printed verbatim in the output — confirm
# whether they are intentional labels or leftovers from an older palette.
GROUPS = [
    {
        "name": "Label + State",
        "tone": "teal",
        "color": "#9bdfff",
        "soft": "#071d20",
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "tone": "blue",
        "color": "#ccffa0",
        "soft": "#10210a",
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "tone": "amber",
        "color": "#7ae5c3",
        "soft": "#092019",
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "tone": "red",
        "color": "#d8f4a5",
        "soft": "#1b210d",
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]
# One row per modality card, in display order. Each tuple is
# (name, modality_type, sample_text, feature_text), matching the leading
# parameters of modality_card(). The name is also the key used to look up the
# card's thumbnail in the dict returned by load_sample_thumbnails().
MODALITIES = [
    ("video", "visual stream", "6 synchronized camera MP4 streams", "RGB/fisheye/stereo frame statistics"),
    ("audio", "acoustic stream", "audio stream embedded in MP4", "audio feature group"),
    ("depth", "geometry map", "depth map + confidence channel", "spatial geometry feature block"),
    ("pose / SLAM", "camera pose", "trajectory + sparse SLAM map", "position + orientation features"),
    ("motion capture", "human motion", "body + hand joint tracks", "3D mocap feature statistics"),
    ("inertial", "wearable sensor", "accelerometer + gyroscope", "wearable motion statistics"),
    ("language", "semantic annotation", "object tags + action captions", "task labels + semantic targets"),
]
# Joint-index pairs that mocap_thumb() connects with line segments when drawing
# a hand. Each row is a chain that starts at joint 0 and walks through four
# consecutive joints, so indices 0..20 (21 joints) are referenced.
# Presumably joint 0 is the wrist and each row is one finger — confirm against
# the dataset's hand-joint ordering.
HAND_EDGES = [
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
]
def image_data_uri(image, fmt: str = "PNG", quality: int = 92) -> str:
    """Encode *image* as a base64 ``data:image/...`` URI.

    Args:
        image: Object exposing a Pillow-style ``save(fp, **kwargs)`` method.
        fmt: Encoder name, case-insensitive. ``"JPG"`` is accepted as an alias
            for ``"JPEG"``.
        quality: JPEG quality; only passed to ``save`` for JPEG output.

    Returns:
        The URI string, with the MIME subtype derived from the encoder name
        (``jpeg`` for JPEG/JPG, ``png`` for PNG, and so on).
    """
    encoder = fmt.upper()
    if encoder == "JPG":
        # Pillow registers its JPEG writer under "JPEG" only; handing "JPG"
        # to save() fails with KeyError, so translate the alias up front.
        encoder = "JPEG"

    save_kwargs = {"format": encoder}
    if encoder == "JPEG":
        save_kwargs.update({"quality": quality, "optimize": True})

    buffer = io.BytesIO()
    image.save(buffer, **save_kwargs)
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    # Label the payload with the format that was actually written instead of
    # calling every non-JPEG image "png".
    return f"data:image/{encoder.lower()};base64,{encoded}"
def make_canvas(size=(THUMB_WIDTH, THUMB_HEIGHT), color=(2, 5, 2)):
    """Return a new RGB Pillow image of *size* filled with *color*."""
    from PIL import Image as pil_image

    blank = pil_image.new("RGB", size, color)
    return blank
def fit_image(image, size=(THUMB_WIDTH, THUMB_HEIGHT)):
    """Convert *image* to RGB and scale/crop it to exactly *size*, centred."""
    from PIL import ImageOps

    rgb_image = image.convert("RGB")
    # method=3 is the integer id of Pillow's BICUBIC resampling filter.
    return ImageOps.fit(rgb_image, size, method=3, centering=(0.5, 0.5))
def read_video_frame(video_path: Path, frame_index: int = 2400):
    """Decode one frame of a video file and return it as an RGB Pillow image.

    Args:
        video_path: Path of the video to open with OpenCV.
        frame_index: Zero-based frame to fetch. Negative values are treated as
            0, and the index is clamped to the last frame when the container
            reports a frame count.

    Returns:
        The requested frame as a ``PIL.Image.Image`` in RGB channel order.

    Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
    """
    import cv2
    from PIL import Image

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        # Lower bound applies even when the frame count is unknown (0).
        frame_index = max(0, frame_index)
        if total:
            frame_index = min(frame_index, total - 1)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = cap.read()
    finally:
        # Release the decoder on every path, including the failure ones.
        cap.release()

    if not ok:
        raise RuntimeError(f"Could not read frame {frame_index} from {video_path}")
    # OpenCV yields BGR; Pillow expects RGB.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
def draw_label(draw, xy, text, fill=(244, 248, 239), size=18):
    """Draw *text* at *xy* on *draw* in bold Arial, or Pillow's default font.

    The default font is used whenever the Arial file cannot be loaded.
    """
    from PIL import ImageFont

    # NOTE(review): this path looks macOS-specific; elsewhere the fallback
    # below is presumably what gets used — confirm on the render host.
    arial_bold = "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
    try:
        label_font = ImageFont.truetype(arial_bold, size)
    except Exception:
        label_font = ImageFont.load_default()
    draw.text(xy, text, fill=fill, font=label_font)
def video_thumb(sample_dir: Path) -> str:
    """Build the side-by-side fisheye/stereo thumbnail as a JPEG data URI.

    Frame 2450 of ``fisheye_cam0.mp4`` fills the left panel. The right panel
    uses frame 2450 of ``stereo_left.mp4`` when that file exists, otherwise a
    copy of the fisheye panel.
    """
    from PIL import Image, ImageDraw

    gutter = 18
    half_width = (THUMB_WIDTH - gutter) // 2
    panel_size = (half_width, THUMB_HEIGHT)
    right_x = half_width + gutter

    fisheye_panel = fit_image(read_video_frame(sample_dir / "fisheye_cam0.mp4", 2450), panel_size)
    stereo_file = sample_dir / "stereo_left.mp4"
    if stereo_file.exists():
        stereo_panel = fit_image(read_video_frame(stereo_file, 2450), panel_size)
    else:
        stereo_panel = fisheye_panel.copy()

    canvas = make_canvas()
    canvas.paste(fisheye_panel, (0, 0))
    canvas.paste(stereo_panel, (right_x, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Dark strip over the gutter, overlapping each panel edge by 4 px.
    draw.rounded_rectangle((half_width - 4, 0, right_x + 4, THUMB_HEIGHT), radius=0, fill=(2, 5, 2, 220))
    for label_x, caption in ((18, "fisheye"), (right_x + 18, "stereo")):
        draw_label(draw, (label_x, 20), caption, fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")
def colorize(values):
    """Map values in [0, 1] to RGB colours along a five-stop gradient.

    Values are clamped to [0, 1] and linearly interpolated between adjacent
    colour stops. NaN entries are rendered with the first stop (the same colour
    as value 0) instead of producing an invalid index; +/-inf clamp to the
    gradient ends.

    Args:
        values: Array-like of numbers, any shape.

    Returns:
        ``uint8`` array shaped ``values.shape + (3,)``.
    """
    import numpy as np

    stops = np.array([
        [2, 5, 2],
        [58, 136, 102],
        [122, 229, 195],
        [167, 240, 120],
        [216, 244, 165],
    ], dtype=np.float32)
    last = len(stops) - 1

    # NaN passes through np.clip unchanged and floor(NaN).astype(int) is not a
    # valid index, so neutralise it before clamping.
    x = np.clip(np.nan_to_num(values, nan=0.0), 0, 1)
    scaled = x * last
    lo = np.floor(scaled).astype(int)
    # At x == 1 lo is already the last stop; keep hi in range as well.
    hi = np.clip(lo + 1, 0, last)
    frac = scaled - lo
    rgb = stops[lo] * (1 - frac[..., None]) + stops[hi] * frac[..., None]
    return rgb.astype("uint8")
def depth_thumb(h5) -> str:
    """Build the depth/confidence thumbnail as a JPEG data URI.

    Index 2450 of ``depth/depth`` is contrast-stretched between the 3rd and
    97th percentile of its finite values and colourised for the left panel.
    Index 2450 of ``depth/confidence`` is shown as greyscale on the right.

    Args:
        h5: Mapping-style handle that exposes the two datasets by key.
    """
    import numpy as np
    from PIL import Image, ImageDraw

    gutter = 18
    half_width = (THUMB_WIDTH - gutter) // 2
    panel_size = (half_width, THUMB_HEIGHT)
    right_x = half_width + gutter

    depth_frame = np.array(h5["depth/depth"][2450], dtype=np.float32)
    finite_mask = np.isfinite(depth_frame)
    low, high = np.percentile(depth_frame[finite_mask], [3, 97])
    stretched = (depth_frame - low) / max(high - low, 1e-6)
    depth_panel = fit_image(Image.fromarray(colorize(stretched)), panel_size)

    confidence = np.array(h5["depth/confidence"][2450], dtype=np.uint8)
    confidence_panel = fit_image(Image.fromarray(confidence, mode="L").convert("RGB"), panel_size)

    canvas = make_canvas()
    canvas.paste(depth_panel, (0, 0))
    canvas.paste(confidence_panel, (right_x, 0))

    draw = ImageDraw.Draw(canvas, "RGBA")
    # Translucent plates behind the two captions.
    plate_fill = (2, 5, 2, 178)
    draw.rounded_rectangle((0, 0, 158, 44), radius=8, fill=plate_fill)
    draw.rounded_rectangle((right_x, 0, right_x + 220, 44), radius=8, fill=plate_fill)
    draw_label(draw, (14, 11), "depth", fill=(255, 255, 255), size=22)
    draw_label(draw, (right_x + 14, 11), "confidence", fill=(255, 255, 255), size=22)
    return image_data_uri(canvas, "JPEG")
def audio_thumb(sample_dir: Path) -> str:
    """Build the audio thumbnail (RMS bars plus mean trace) as a PNG data URI.

    ``ffmpeg`` extracts 6 seconds of mono 16 kHz PCM starting at 45 s from
    ``fisheye_cam0.mp4``. The samples are split into 220 bins. If anything in
    that path raises, a fixed placeholder bar pattern is drawn instead.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")
    floor_y = THUMB_HEIGHT - 72
    try:
        command = [
            "ffmpeg",
            "-v", "error",
            "-ss", "45",
            "-t", "6",
            "-i", str(sample_dir / "fisheye_cam0.mp4"),
            "-ac", "1",
            "-ar", "16000",
            "-f", "s16le",
            "pipe:1",
        ]
        pcm = subprocess.run(command, check=True, stdout=subprocess.PIPE).stdout
        samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            raise RuntimeError("empty audio stream")
        # Peak-normalise; the floor of 1.0 avoids dividing by zero on silence.
        samples = samples / max(float(np.max(np.abs(samples))), 1.0)

        bins = 220
        usable = bins * max(1, len(samples) // bins)
        chunks = np.array_split(samples[:usable], bins)
        rms = np.array([np.sqrt(np.mean(chunk * chunk)) if len(chunk) else 0.0 for chunk in chunks])
        waveform = np.array([float(np.mean(chunk)) if len(chunk) else 0.0 for chunk in chunks])

        # One x position per bin, spread across the canvas with 18 px margins.
        columns = [18 + i / max(bins - 1, 1) * (THUMB_WIDTH - 36) for i in range(bins)]
        for x, level in zip(columns, rms):
            bar = 14 + np.clip(level * 158, 0, 158)
            draw.line((x, floor_y, x, floor_y - bar), fill=(167, 240, 120, 170), width=2)
        trace = [(x, 126 - np.clip(level, -1, 1) * 82) for x, level in zip(columns, waveform)]
        draw.line(trace, fill=(122, 229, 195, 220), width=2)
    except Exception:
        # Best-effort fallback: synthetic bars so the card still has artwork.
        for i in range(48):
            left = 22 + i * 8
            bar = 16 + (i % 7) * 7
            draw.rounded_rectangle((left, floor_y - bar, left + 4, floor_y), radius=2, fill=(167, 240, 120, 170))
    draw_label(draw, (18, 18), "Audio waveform", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def normalize_points(points, width, height, pad=16):
    """Scale the first two columns of *points* into a padded pixel box.

    Each axis is stretched between its 2nd and 98th percentile, clamped to
    that range, and the second axis is flipped so larger values land nearer
    the top. The input array is not modified.

    Returns:
        Array shaped ``(len(points), 2)`` with x in ``[pad, width - pad]`` and
        y in ``[pad, height - pad]``.
    """
    import numpy as np

    planar = points[:, :2].copy()
    low = np.percentile(planar, 2, axis=0)
    high = np.percentile(planar, 98, axis=0)
    # Guard against a zero range when all points coincide on an axis.
    extent = np.maximum(high - low, 1e-6)

    unit = np.clip((planar - low) / extent, 0, 1)
    unit[:, 1] = 1 - unit[:, 1]

    inner_width = width - pad * 2
    inner_height = height - pad * 2
    pixels = np.empty_like(unit)
    pixels[:, 0] = pad + unit[:, 0] * inner_width
    pixels[:, 1] = pad + unit[:, 1] * inner_height
    return pixels
def slam_thumb(h5) -> str:
    """Build the SLAM map + trajectory thumbnail as a PNG data URI.

    Finite rows of ``slam/point_cloud`` (thinned to at most 2600 evenly spaced
    rows) are plotted using columns 0 and 2, coloured by column 1. Every 36th
    entry of the first 2450 rows of ``slam/trans_xyz`` is drawn on top.

    NOTE(review): the map and the trajectory are each normalised to the canvas
    separately, so the overlay presumably is not in a shared coordinate frame —
    confirm this is intended.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    cloud = np.array(h5["slam/point_cloud"], dtype=np.float64)
    finite_rows = np.isfinite(cloud).all(axis=1)
    cloud = cloud[finite_rows]
    if len(cloud) > 2600:
        keep = np.linspace(0, len(cloud) - 1, 2600).astype(int)
        cloud = cloud[keep]

    cloud_px = normalize_points(cloud[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    shade_axis = cloud[:, 1]
    shade = (shade_axis - np.percentile(shade_axis, 2)) / max(
        np.percentile(shade_axis, 98) - np.percentile(shade_axis, 2), 1e-6
    )
    radius = 1.2
    for (px, py), rgb in zip(cloud_px, colorize(shade)):
        draw.ellipse((px - radius, py - radius, px + radius, py + radius), fill=tuple(rgb.tolist()) + (165,))

    path = np.array(h5["slam/trans_xyz"][:2450:36], dtype=np.float64)
    path_px = normalize_points(path[:, [0, 2, 1]], THUMB_WIDTH, THUMB_HEIGHT)
    # Draw segment by segment, as consecutive point pairs.
    for start, end in zip(path_px[:-1], path_px[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 205), width=2)

    draw_label(draw, (18, 18), "camera pose + SLAM map", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def imu_thumb(h5) -> str:
    """Build the accelerometer/gyroscope trace thumbnail as a PNG data URI.

    The IMU index stored at position 2450 of ``imu/keyframe_indices`` centres
    a window of up to 440 samples. The three accel and three gyro columns are
    each stretched between their 3rd and 97th percentile and overlaid.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    centre = int(h5["imu/keyframe_indices"][2450])
    window = slice(max(0, centre - 220), centre + 220)
    accel = np.array(h5["imu/accel_xyz"][window], dtype=np.float64)
    gyro = np.array(h5["imu/gyro_xyz"][window], dtype=np.float64)
    series = [accel[:, axis] for axis in range(3)] + [gyro[:, axis] for axis in range(3)]
    colors = [(167, 240, 120), (122, 229, 195), (155, 223, 255), (216, 244, 165), (244, 248, 239), (165, 175, 162)]

    # Faint horizontal guide lines behind the traces.
    for row in range(6):
        guide_y = 68 + row * 44
        draw.line((18, guide_y, THUMB_WIDTH - 18, guide_y), fill=(167, 240, 120, 48), width=1)

    plot_width = THUMB_WIDTH - 36
    plot_height = THUMB_HEIGHT - 116
    for channel, color in zip(series, colors):
        channel = channel[:420]
        if len(channel) < 2:
            continue
        low, high = np.percentile(channel, [3, 97])
        unit = (channel - low) / max(high - low, 1e-6)
        last = max(len(channel) - 1, 1)
        trace = [
            (18 + i / last * plot_width, THUMB_HEIGHT - 48 - np.clip(level, 0, 1) * plot_height)
            for i, level in enumerate(unit)
        ]
        draw.line(trace, fill=color + (200,), width=2)

    draw_label(draw, (18, 18), "inertial accel / gyro", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def mocap_thumb(h5) -> str:
    """Build the body + hands mocap thumbnail as a PNG data URI.

    Index 2450 of the body keypoints and both hand joint sets is projected
    onto the first two coordinates, using one shared 2nd-98th percentile
    range. The body sits on the left, followed by the left and right hands.
    """
    import numpy as np
    from PIL import ImageDraw

    canvas = make_canvas()
    draw = ImageDraw.Draw(canvas, "RGBA")

    body = np.array(h5["full_body_mocap/keypoints"][2450], dtype=np.float32)
    left = np.array(h5["hand_mocap/left_joints_3d"][2450], dtype=np.float32)
    right = np.array(h5["hand_mocap/right_joints_3d"][2450], dtype=np.float32)

    stacked = np.concatenate([body, left, right], axis=0)
    low = np.percentile(stacked[:, :2], 2, axis=0)
    high = np.percentile(stacked[:, :2], 98, axis=0)
    extent = np.maximum(high - low, 1e-6)
    dot = 2.4

    def project(joints, x_offset, width):
        # Shared normalisation, flipped vertically, placed in a column that
        # starts at x_offset and is `width` pixels wide.
        placed = (joints[:, :2] - low) / extent
        placed[:, 1] = 1 - placed[:, 1]
        placed[:, 0] = x_offset + placed[:, 0] * width
        placed[:, 1] = 72 + placed[:, 1] * (THUMB_HEIGHT - 136)
        return placed

    body_px = project(body, 28, 270)
    for px, py in body_px:
        draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=(167, 240, 120, 185))
    # Body keypoints are linked in array order, not by a skeleton table.
    for start, end in zip(body_px[:-1], body_px[1:]):
        draw.line((start[0], start[1], end[0], end[1]), fill=(167, 240, 120, 82), width=1)

    hands = [(left, 392, (122, 229, 195)), (right, 562, (216, 244, 165))]
    for joints, x_offset, color in hands:
        hand_px = project(joints, x_offset, 126)
        for first, second in HAND_EDGES:
            draw.line(
                (hand_px[first][0], hand_px[first][1], hand_px[second][0], hand_px[second][1]),
                fill=color + (180,),
                width=2,
            )
        for px, py in hand_px:
            draw.ellipse((px - dot, py - dot, px + dot, py + dot), fill=color + (220,))

    draw_label(draw, (18, 18), "body + hand mocap", fill=(244, 248, 239), size=22)
    return image_data_uri(canvas, "PNG")
def text_thumb(h5) -> str:
    """Build the language-annotation thumbnail as a PNG data URI.

    The ``caption`` entry is parsed as JSON. From its first segment, up to
    five distinct object names (sorted) are drawn as chips on the left and up
    to two "Current Action" labels as boxes on the right.
    """
    from PIL import ImageDraw

    width = THUMB_WIDTH
    payload = h5["caption"][()]
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8", errors="replace")
    first_segment = json.loads(payload)["segments"][0]

    distinct_objects = set()
    for names in first_segment.get("objects", {}).values():
        distinct_objects.update(names)
    objects = sorted(distinct_objects)[:5]
    actions = [entry.get("label", "") for entry in first_segment.get("Current Action", [])][:2]

    canvas = make_canvas((width, THUMB_HEIGHT))
    draw = ImageDraw.Draw(canvas, "RGBA")
    draw_label(draw, (28, 24), "language annotation", fill=(244, 248, 239), size=28)

    chip_top = 82
    for label in objects:
        # Chip width grows with the label length (16 px per character).
        chip_width = 52 + len(label) * 16
        draw.rounded_rectangle(
            (28, chip_top, 28 + chip_width, chip_top + 38),
            radius=8,
            fill=(7, 18, 7, 235),
            outline=(167, 240, 120, 170),
            width=2,
        )
        draw_label(draw, (44, chip_top + 8), label, fill=(244, 248, 239), size=18)
        chip_top += 47

    box_left = 340
    box_top = 92
    for action in actions:
        # Truncate long captions to 66 characters plus an ellipsis.
        shown = action[:66] + ("..." if len(action) > 66 else "")
        draw.rounded_rectangle(
            (box_left, box_top, width - 28, box_top + 54),
            radius=9,
            fill=(7, 18, 7, 235),
            outline=(122, 229, 195, 180),
            width=2,
        )
        draw_label(draw, (box_left + 22, box_top + 15), shown, fill=(244, 248, 239), size=20)
        box_top += 68
    return image_data_uri(canvas, "PNG")
def load_sample_thumbnails(sample_dir: Path | None) -> dict[str, str]:
    """Build every modality thumbnail for a sample episode.

    Returns:
        A dict mapping modality name to data URI. The dict is empty when
        *sample_dir* is missing, lacks ``fisheye_cam0.mp4`` or
        ``annotation.hdf5``, or when any thumbnail builder raises (a warning
        is printed in that last case).
    """
    if sample_dir is None or not sample_dir.exists():
        return {}
    hdf5_path = sample_dir / "annotation.hdf5"
    for needed in (sample_dir / "fisheye_cam0.mp4", hdf5_path):
        if not needed.exists():
            return {}

    try:
        import h5py

        thumbnails = {}
        thumbnails["video"] = video_thumb(sample_dir)
        thumbnails["audio"] = audio_thumb(sample_dir)
        hdf5_builders = (
            ("depth", depth_thumb),
            ("pose / SLAM", slam_thumb),
            ("motion capture", mocap_thumb),
            ("inertial", imu_thumb),
            ("language", text_thumb),
        )
        with h5py.File(hdf5_path, "r") as h5:
            for modality, build in hdf5_builders:
                thumbnails[modality] = build(h5)
        return thumbnails
    except Exception as exc:
        # All-or-nothing: one failing modality drops every thumbnail.
        print(f"Warning: could not build sample modality thumbnails: {exc}")
        return {}
def valid_sample_dir(sample_dir: Path | None) -> bool:
    """Return True when *sample_dir* holds both files the thumbnails need."""
    if sample_dir is None:
        return False
    required_files = ("annotation.hdf5", "fisheye_cam0.mp4")
    return all((sample_dir / filename).exists() for filename in required_files)
def resolve_sample_dir(sample_dir: Path | None) -> Path | None:
    """Pick the first usable sample directory from a fixed search order.

    Order: ``$XPERIENCE10M_SAMPLE_DIR``, ``$WORKSPACE/data/sample/...``, the
    *sample_dir* argument, ``DEFAULT_SAMPLE_DIR``, ``DROPBOX_SAMPLE_DIR``.
    Unset or empty environment variables and a ``None`` argument are skipped.

    Returns:
        The first candidate accepted by ``valid_sample_dir``; otherwise
        *sample_dir* unchanged.
    """
    env_override = os.environ.get("XPERIENCE10M_SAMPLE_DIR")
    workspace_root = os.environ.get("WORKSPACE")

    search_order: list[Path] = []
    if env_override:
        search_order.append(Path(env_override).expanduser())
    if workspace_root:
        search_order.append(Path(workspace_root).expanduser() / "data/sample/xperience-10m-sample")
    if sample_dir is not None:
        search_order.append(sample_dir)
    search_order += [DEFAULT_SAMPLE_DIR, DROPBOX_SAMPLE_DIR]

    return next((path for path in search_order if valid_sample_dir(path)), sample_dir)
def load_summary() -> dict:
    """Parse and return the JSON document stored at ``SUMMARY_PATH``."""
    with SUMMARY_PATH.open(encoding="utf-8") as handle:
        return json.load(handle)
def fmt(value: float) -> str:
    """Format *value* as a float with exactly four decimal places."""
    return format(float(value), ".4f")
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Return the ``(label, formatted value)`` of a task's headline metric.

    Tasks with a dedicated headline metric are looked up by name. Any other
    task falls back to ``macro_f1`` and then ``accuracy``.

    Raises:
        KeyError: If a dedicated metric key is missing from *metrics*, or if
            neither fallback key is present.
    """
    # task name -> (display label, key in `metrics`)
    dedicated = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in dedicated:
        label, key = dedicated[task_name]
        return label, fmt(metrics[key])

    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return label, fmt(metrics[key])
    raise KeyError(f"No main metric configured for {task_name}")
def short_io(task_name: str, metrics: dict) -> str:
    """Return the one-line "input -> output" caption for a task card.

    Known tasks use a hand-written caption. Any other task uses
    ``metrics["input"]``, or an empty string when that key is absent.
    """
    fallback = metrics.get("input", "")
    captions = dict(
        timeline_action="all featurized modalities -> action label",
        timeline_subtask="all featurized modalities -> subtask label",
        transition_detection="all featurized modalities -> boundary vs steady",
        next_action="window at t -> action at t+20 frames",
        hand_trajectory_forecast="all featurized modalities -> future hand joints",
        contact_prediction="non-contact modalities -> contact state",
        object_relevance="non-caption feature blocks -> relevant objects",
        caption_grounding="text query -> matching sensor window",
        cross_modal_retrieval="motion / IMU / camera -> depth / video match",
        modality_reconstruction="motion / IMU / camera -> depth / video vector",
        temporal_order="two adjacent windows -> correct order",
        misalignment_detection="motion + visual pair -> aligned or shifted",
    )
    if task_name in captions:
        return captions[task_name]
    return fallback
def task_card(task_name: str, kind: str, metrics: dict, group: dict, index: int, neural_metrics: dict | None = None) -> str:
    """Render one task as an ``<article class="task-card">`` HTML fragment.

    Args:
        task_name: Task key; shown as the card title.
        kind: Badge text for the card.
        metrics: Metric dict of the baseline run for this task.
        group: Family entry supplying the ``color`` and ``soft`` CSS values.
        index: Card number, rendered zero-padded to two digits.
        neural_metrics: Optional metric dict for the neural run. A second
            metric pill is added when it is non-empty and has no ``"error"``
            key.
    """
    label, value = metric_for(task_name, metrics)

    neural_html = ""
    show_neural = bool(neural_metrics) and "error" not in neural_metrics
    if show_neural:
        neural_label, neural_value = metric_for(task_name, neural_metrics)
        neural_html = f"""
        <div class="metric neural">
          <span>NN {html.escape(neural_label)}</span>
          <strong>{html.escape(neural_value)}</strong>
        </div>
        """

    io_caption = short_io(task_name, metrics)
    accent = group['color']
    soft = group['soft']
    return f"""
    <article class="task-card" style="--accent:{accent};--soft:{soft};">
      <div class="task-meta">
        <span class="index">{index:02d}</span>
        <span class="kind">{html.escape(kind)}</span>
      </div>
      <h3>{html.escape(task_name)}</h3>
      <p>{html.escape(io_caption)}</p>
      <div class="metric">
        <span>min {html.escape(label)}</span>
        <strong>{html.escape(value)}</strong>
      </div>
      {neural_html}
    </article>
    """
def modality_card(name: str, modality_type: str, sample_text: str, feature_text: str, index: int, thumbnail: str | None) -> str:
    """Render one modality as an ``<article class="modality">`` HTML fragment.

    All text arguments are HTML-escaped. *thumbnail* is inserted as the image
    ``src`` without escaping, and the thumbnail block is omitted when it is
    empty or ``None``.
    """
    if thumbnail:
        thumb_html = f'<div class="modality-thumb"><img src="{thumbnail}" alt=""></div>'
    else:
        thumb_html = ""

    safe_name = html.escape(name)
    safe_type = html.escape(modality_type)
    safe_sample = html.escape(sample_text)
    safe_feature = html.escape(feature_text)
    return f"""
    <article class="modality">
      <div class="modality-heading">
        <div>
          <span class="modality-index">{index:02d}</span>
          <h3>{safe_name}</h3>
        </div>
        <span class="modality-type">{safe_type}</span>
      </div>
      {thumb_html}
      <div class="modality-copy">
        <div class="modality-row">
          <span>Sample contains</span>
          <p>{safe_sample}</p>
        </div>
        <div class="modality-row">
          <span>Current baseline use</span>
          <p>{safe_feature}</p>
        </div>
      </div>
    </article>
    """
| def build_html(summary: dict, base_image: Path | None, sample_dir: Path | None) -> str: | |
| suite = summary["tasks"] | |
| neural_suite = summary.get("neural_tasks", {}) | |
| thumbnails = load_sample_thumbnails(sample_dir) | |
| base_layer = "" | |
| if base_image is not None and base_image.exists(): | |
| base_layer = f'<div class="image-background" style="background-image:url(\'{base_image.resolve().as_uri()}\');"></div>' | |
| stats = [ | |
| (f"{summary['num_frames']:,}", "frames"), | |
| (f"{summary['num_windows']:,}", "windows"), | |
| (f"{summary['feature_dim']:,}", "features"), | |
| (f"{len(suite)}+{len(neural_suite)}", "min + NN tasks"), | |
| ("70/30", "chronological split"), | |
| ] | |
| stats_html = "".join( | |
| f"<div class=\"stat\"><strong>{html.escape(value)}</strong><span>{html.escape(label)}</span></div>" | |
| for value, label in stats | |
| ) | |
| modalities_html = "".join( | |
| modality_card(name, modality_type, sample_text, feature_text, index, thumbnails.get(name)) | |
| for index, (name, modality_type, sample_text, feature_text) in enumerate(MODALITIES, start=1) | |
| ) | |
| task_index = 1 | |
| families = [] | |
| for group in GROUPS: | |
| cards = [] | |
| for task_name, kind in group["tasks"]: | |
| cards.append(task_card(task_name, kind, suite[task_name], group, task_index, neural_suite.get(task_name))) | |
| task_index += 1 | |
| families.append( | |
| f""" | |
| <section class="family" style="--accent:{group['color']};--soft:{group['soft']};"> | |
| <div class="family-head"> | |
| <span>{html.escape(group['tone'])}</span> | |
| <h2>{html.escape(group['name'])}</h2> | |
| </div> | |
| <div class="family-cards">{''.join(cards)}</div> | |
| </section> | |
| """ | |
| ) | |
| return f"""<!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <meta name="viewport" content="width={CANVAS_WIDTH}, initial-scale=1"> | |
| <title>Xperience-10M 12-Task Episode Suite Infographic</title> | |
| <style> | |
| * {{ box-sizing: border-box; }} | |
| html, | |
| body {{ | |
| margin: 0; | |
| width: {CANVAS_WIDTH}px; | |
| height: {CANVAS_HEIGHT}px; | |
| background: #020502; | |
| }} | |
| body {{ | |
| font-family: "Inter Tight", "Space Grotesk", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; | |
| color: #f4f8ef; | |
| text-rendering: optimizeLegibility; | |
| }} | |
| .canvas {{ | |
| position: relative; | |
| width: {CANVAS_WIDTH}px; | |
| height: {CANVAS_HEIGHT}px; | |
| overflow: hidden; | |
| padding: 54px 64px 44px; | |
| background: | |
| radial-gradient(circle at 72% 10%, rgba(167,240,120,0.18), transparent 24%), | |
| radial-gradient(circle at 20% 28%, rgba(255,255,255,0.10) 1px, transparent 2px), | |
| #020502; | |
| background-size: auto, 18px 18px, auto; | |
| }} | |
| .image-background {{ | |
| position: absolute; | |
| inset: 0; | |
| background-position: center; | |
| background-repeat: no-repeat; | |
| background-size: cover; | |
| opacity: 0.36; | |
| filter: saturate(1.05) contrast(1.08) brightness(0.42); | |
| }} | |
| .content {{ | |
| position: relative; | |
| z-index: 1; | |
| }} | |
| .header {{ | |
| display: grid; | |
| grid-template-columns: 1.25fr 0.75fr; | |
| gap: 44px; | |
| align-items: end; | |
| padding-bottom: 30px; | |
| border-bottom: 1px solid rgba(167,240,120,0.20); | |
| }} | |
| .kicker {{ | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 12px; | |
| color: #ccffa0; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 15px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| }} | |
| .kicker::before {{ | |
| content: ""; | |
| width: 44px; | |
| height: 1px; | |
| background: #ccffa0; | |
| }} | |
| h1 {{ | |
| margin: 18px 0 0; | |
| max-width: 930px; | |
| font-size: 72px; | |
| line-height: 0.95; | |
| letter-spacing: 0; | |
| }} | |
| .subtitle {{ | |
| margin: 18px 0 0; | |
| max-width: 900px; | |
| color: #dce8d7; | |
| font-size: 23px; | |
| line-height: 1.35; | |
| font-weight: 520; | |
| }} | |
| .stats {{ | |
| display: grid; | |
| grid-template-columns: repeat(5, minmax(0, 1fr)); | |
| gap: 10px; | |
| }} | |
| .stat {{ | |
| min-height: 78px; | |
| padding: 14px 15px; | |
| border: 1px solid rgba(167,240,120,0.24); | |
| background: rgba(7,18,7,0.80); | |
| border-radius: 8px; | |
| }} | |
| .stat strong {{ | |
| display: block; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 25px; | |
| line-height: 1; | |
| font-variant-numeric: tabular-nums; | |
| }} | |
| .stat span {{ | |
| display: block; | |
| margin-top: 8px; | |
| color: #a5afa2; | |
| font-size: 13px; | |
| line-height: 1.15; | |
| }} | |
| .section-label {{ | |
| display: grid; | |
| grid-template-columns: 1fr; | |
| gap: 12px; | |
| align-items: start; | |
| margin: 44px 0 24px; | |
| color: #a5afa2; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 22px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| }} | |
| .section-label span:last-child {{ | |
| max-width: 1400px; | |
| color: #dce8d7; | |
| text-transform: none; | |
| letter-spacing: 0; | |
| font-family: inherit; | |
| font-size: 21px; | |
| line-height: 1.42; | |
| text-align: left; | |
| }} | |
| .modalities {{ | |
| display: grid; | |
| grid-template-columns: 1fr; | |
| gap: 34px; | |
| }} | |
| .modality {{ | |
| min-height: 560px; | |
| padding: 34px; | |
| border: 1px solid rgba(167,240,120,0.22); | |
| background: rgba(7,18,7,0.84); | |
| border-radius: 8px; | |
| display: grid; | |
| grid-template-columns: 880px minmax(0, 1fr); | |
| grid-template-areas: | |
| "thumb heading" | |
| "thumb copy"; | |
| column-gap: 46px; | |
| row-gap: 28px; | |
| align-items: start; | |
| }} | |
| .modality-thumb {{ | |
| grid-area: thumb; | |
| height: 492px; | |
| overflow: hidden; | |
| border: 1px solid rgba(167,240,120,0.16); | |
| border-radius: 8px; | |
| background: #020502; | |
| }} | |
| .modality-thumb img {{ | |
| display: block; | |
| width: 100%; | |
| height: 100%; | |
| object-fit: cover; | |
| }} | |
| .modality-index, | |
| .index {{ | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-variant-numeric: tabular-nums; | |
| }} | |
| .modality-heading {{ | |
| grid-area: heading; | |
| display: flex; | |
| align-items: start; | |
| justify-content: space-between; | |
| gap: 24px; | |
| padding-bottom: 26px; | |
| border-bottom: 1px solid rgba(167,240,120,0.16); | |
| }} | |
| .modality-index {{ | |
| color: #a5afa2; | |
| font-size: 24px; | |
| }} | |
| .modality-type {{ | |
| color: #ccffa0; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 16px; | |
| line-height: 1.15; | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| text-align: right; | |
| max-width: 330px; | |
| padding-top: 8px; | |
| }} | |
| .modality h3 {{ | |
| margin: 14px 0 0; | |
| font-size: 76px; | |
| line-height: 0.98; | |
| text-transform: uppercase; | |
| }} | |
| .modality-copy {{ | |
| grid-area: copy; | |
| display: grid; | |
| grid-template-columns: 1fr; | |
| gap: 22px; | |
| }} | |
| .modality-row {{ | |
| display: grid; | |
| grid-template-columns: 1fr; | |
| gap: 10px; | |
| align-items: baseline; | |
| padding: 22px 24px; | |
| border: 1px solid rgba(167,240,120,0.16); | |
| border-radius: 8px; | |
| background: rgba(2,5,2,0.40); | |
| }} | |
| .modality-row span {{ | |
| display: block; | |
| color: #a5afa2; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 16px; | |
| letter-spacing: 0.06em; | |
| line-height: 1.25; | |
| text-transform: uppercase; | |
| }} | |
| .modality-row p {{ | |
| margin: 0; | |
| color: #dce8d7; | |
| font-size: 40px; | |
| font-weight: 650; | |
| line-height: 1.15; | |
| }} | |
| .shared-band {{ | |
| display: grid; | |
| grid-template-columns: 1fr auto 1fr auto 1fr auto 1fr; | |
| gap: 12px; | |
| align-items: center; | |
| margin-top: 30px; | |
| padding: 14px; | |
| border: 1px solid rgba(167,240,120,0.22); | |
| background: rgba(7,18,7,0.72); | |
| border-radius: 8px; | |
| }} | |
| .step {{ | |
| min-height: 62px; | |
| padding: 13px 15px; | |
| background: rgba(7,18,7,0.92); | |
| border: 1px solid rgba(167,240,120,0.16); | |
| border-radius: 8px; | |
| }} | |
| .step strong {{ | |
| display: block; | |
| font-size: 17px; | |
| line-height: 1.1; | |
| }} | |
| .step span {{ | |
| display: block; | |
| margin-top: 5px; | |
| color: #a5afa2; | |
| font-size: 13px; | |
| }} | |
| .arrow {{ | |
| color: #ccffa0; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 22px; | |
| }} | |
| .families {{ | |
| display: grid; | |
| grid-template-columns: repeat(2, minmax(0, 1fr)); | |
| gap: 24px; | |
| margin-top: 30px; | |
| }} | |
| .family {{ | |
| padding: 20px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502); | |
| background: rgba(7,18,7,0.82); | |
| border-radius: 8px; | |
| }} | |
| .family-head {{ | |
| display: flex; | |
| align-items: end; | |
| justify-content: space-between; | |
| gap: 16px; | |
| min-height: 66px; | |
| padding-bottom: 16px; | |
| border-bottom: 1px solid color-mix(in srgb, var(--accent) 24%, #020502); | |
| }} | |
| .family-head span {{ | |
| color: var(--accent); | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 12px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| }} | |
| .family-head h2 {{ | |
| margin: 0; | |
| color: var(--accent); | |
| font-size: 32px; | |
| line-height: 1.02; | |
| text-align: right; | |
| }} | |
| .family-cards {{ | |
| display: grid; | |
| gap: 16px; | |
| margin-top: 18px; | |
| }} | |
| .task-card {{ | |
| min-height: 178px; | |
| padding: 18px 20px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 28%, #020502); | |
| background: linear-gradient(180deg, rgba(10,24,10,0.96), color-mix(in srgb, var(--soft) 24%, #071207)); | |
| border-radius: 8px; | |
| }} | |
| .task-meta {{ | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| gap: 12px; | |
| }} | |
| .index {{ | |
| color: #a5afa2; | |
| font-size: 12px; | |
| }} | |
| .kind {{ | |
| display: inline-flex; | |
| align-items: center; | |
| height: 24px; | |
| padding: 0 9px; | |
| border-radius: 6px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 40%, #020502); | |
| color: var(--accent); | |
| background: rgba(2,5,2,0.48); | |
| text-transform: uppercase; | |
| font-size: 11px; | |
| line-height: 1; | |
| font-weight: 830; | |
| }} | |
| .task-card h3 {{ | |
| margin: 12px 0 0; | |
| color: #f4f8ef; | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 21px; | |
| line-height: 1.18; | |
| overflow-wrap: anywhere; | |
| }} | |
| .task-card p {{ | |
| margin: 11px 0 0; | |
| min-height: 39px; | |
| color: #dce8d7; | |
| font-size: 15px; | |
| line-height: 1.28; | |
| font-weight: 560; | |
| }} | |
| .metric {{ | |
| display: inline-flex; | |
| align-items: baseline; | |
| gap: 10px; | |
| margin-top: 10px; | |
| min-height: 32px; | |
| padding: 7px 10px; | |
| border-radius: 8px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 42%, #020502); | |
| background: rgba(2,5,2,0.42); | |
| }} | |
| .metric.neural {{ | |
| margin-left: 8px; | |
| border-color: rgba(255,255,255,0.20); | |
| background: rgba(255,255,255,0.08); | |
| }} | |
| .metric span {{ | |
| color: #a5afa2; | |
| font-size: 13px; | |
| font-weight: 760; | |
| }} | |
| .metric strong {{ | |
| color: var(--accent); | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| font-size: 20px; | |
| line-height: 1; | |
| font-weight: 860; | |
| font-variant-numeric: tabular-nums; | |
| }} | |
| .footer {{ | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| gap: 32px; | |
| margin-top: 22px; | |
| padding-top: 20px; | |
| border-top: 1px solid rgba(167,240,120,0.20); | |
| color: #a5afa2; | |
| font-size: 18px; | |
| line-height: 1.35; | |
| font-weight: 620; | |
| }} | |
| .footer code {{ | |
| font-family: "SF Mono", "JetBrains Mono", ui-monospace, monospace; | |
| color: #020502; | |
| background: #ccffa0; | |
| border: 1px solid #ccffa0; | |
| border-radius: 7px; | |
| padding: 6px 9px; | |
| white-space: nowrap; | |
| }} | |
| </style> | |
| </head> | |
| <body> | |
| <main class="canvas" aria-label="Ropedia Xperience-10M 12-task suite infographic"> | |
| {base_layer} | |
| <div class="content"> | |
| <header class="header"> | |
| <div> | |
| <div class="kicker">verified single-episode task suite</div> | |
| <h1>Ropedia Xperience-10M 12-task suite</h1> | |
| <p class="subtitle">A clean map from synchronized multimodal windows to 12 research task heads, comparing minimal heads with neural MLP results. Next milestone: Qwen3-Omni fine-tuning with sensor-bridge evaluation.</p> | |
| </div> | |
| <div class="stats">{stats_html}</div> | |
| </header> | |
| <section class="shared-band" aria-label="shared processing contract"> | |
| <div class="step"><strong>raw public episode</strong><span>video, audio, depth, pose, mocap, IMU, language</span></div> | |
| <div class="arrow">-></div> | |
| <div class="step"><strong>20-frame windows</strong><span>stride 5, chronological order</span></div> | |
| <div class="arrow">-></div> | |
| <div class="step"><strong>{summary['feature_dim']:,}-d vector</strong><span>current manifest includes audio features</span></div> | |
| <div class="arrow">-></div> | |
| <div class="step"><strong>12 minimal + NN heads</strong><span>softmax/ridge/logistic plus PyTorch MLP</span></div> | |
| </section> | |
| <div class="section-label"> | |
| <span>12 task families</span> | |
| <span>Every task below has a minimal baseline and a neural MLP head over the same aligned window contract, making the suite easy to compare, extend, and scale to held-out episodes.</span> | |
| </div> | |
| <section class="families">{''.join(families)}</section> | |
| <div class="section-label"> | |
| <span>Xperience-10M modalities</span> | |
| <span>Public-sample thumbnails are enlarged here so each data stream is legible. Audio is present in the sample MP4 stream and is now extracted into the current baseline manifest.</span> | |
| </div> | |
| <section class="modalities">{modalities_html}</section> | |
| <footer class="footer"> | |
| <span>Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</span> | |
| <code>results/episode_task_suite/summary_report.json</code> | |
| </footer> | |
| </div> | |
| </main> | |
| </body> | |
| </html> | |
| """ | |
def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot the rendered HTML page into an image with the Playwright CLI.

    Creates the parent directory of *output_path* if needed, then runs
    ``npx --yes playwright screenshot --full-page`` with a viewport of
    ``CANVAS_WIDTH`` x ``CANVAS_HEIGHT`` against the ``file://`` URI of
    *html_path*.

    Args:
        html_path: HTML file to render; resolved to an absolute ``file://`` URI.
        output_path: Destination path handed to Playwright for the screenshot.

    Raises:
        FileNotFoundError: If no ``npx`` executable can be found on ``PATH``.
        subprocess.CalledProcessError: If the Playwright command exits non-zero.
    """
    # Function-scope import: shutil is only needed to locate the launcher here.
    import shutil

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Resolve the launcher explicitly instead of passing a bare "npx". Without a
    # shell, subprocess does not apply PATHEXT-style lookup, so wrapper scripts
    # such as the Windows ``npx.cmd`` shim are not found by name alone. A missing
    # Node.js install is also reported with an actionable message this way.
    npx_executable = shutil.which("npx")
    if npx_executable is None:
        raise FileNotFoundError(
            "Could not find 'npx' on PATH; install Node.js (with npx) to export the infographic image."
        )

    command = [
        npx_executable,
        "--yes",
        "playwright",
        "screenshot",
        "--full-page",
        f"--viewport-size={CANVAS_WIDTH},{CANVAS_HEIGHT}",
        html_path.resolve().as_uri(),
        str(output_path),
    ]
    # check=True surfaces a failed render as CalledProcessError instead of
    # silently leaving a stale or missing image behind.
    subprocess.run(command, check=True)
def main() -> int:
    """Command-line entry point: build the infographic HTML and optionally export it.

    The HTML is written to the path given by ``--html`` (its parent directory
    is created if missing) or, when that flag is omitted, to a temporary
    ``.html`` file that is kept on disk. Unless ``--no-export`` is passed, the
    HTML is then rendered to ``--output`` via ``render_html``.

    Returns:
        ``0`` once the HTML (and, if requested, the image) has been written.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
    cli.add_argument("--sample-dir", type=Path, default=DEFAULT_SAMPLE_DIR)
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    cli.add_argument("--html", type=Path)
    cli.add_argument("--no-export", action="store_true", help="Only write the HTML used to render the image.")
    options = cli.parse_args()

    # Arguments are evaluated left to right, so the summary is loaded before
    # the sample directory is resolved.
    page = build_html(
        load_summary(),
        options.base_image,
        resolve_sample_dir(options.sample_dir),
    )

    render_source = options.html
    if render_source is not None:
        # Explicit destination requested: make sure its directory exists.
        render_source.parent.mkdir(parents=True, exist_ok=True)
        render_source.write_text(page, encoding="utf-8")
    else:
        # No destination given: keep a temp file (delete=False) so the path
        # printed below still points at an inspectable document.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as scratch:
            scratch.write(page)
        render_source = Path(scratch.name)

    if not options.no_export:
        render_html(render_source, options.output)
        print(f"Wrote image: {options.output}")

    print(f"Wrote render HTML: {render_source}")
    return 0
# Run the CLI only when executed as a script (not on import); main()'s return
# value is handed to SystemExit so it becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())