Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Render a ChatGPT-image-backed 12-task infographic. | |
| The background bitmap is AI-generated. The task names, inputs, and metrics are | |
| read from results/episode_task_suite/summary_report.json so the published image | |
| does not rely on image-model text generation. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import html | |
| import json | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parents[1] | |
| SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json" | |
| DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png" | |
| DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png" | |
| GROUPS = [ | |
| { | |
| "name": "Label + State", | |
| "color": "#008b9a", | |
| "left": 94, | |
| "top": 374, | |
| "width": 246, | |
| "tasks": [ | |
| ("timeline_action", "supervised"), | |
| ("timeline_subtask", "supervised"), | |
| ("next_action", "supervised"), | |
| ], | |
| }, | |
| { | |
| "name": "Prediction + Reconstruction", | |
| "color": "#1f63e9", | |
| "left": 472, | |
| "top": 374, | |
| "width": 248, | |
| "tasks": [ | |
| ("hand_trajectory_forecast", "forecast"), | |
| ("modality_reconstruction", "forecast"), | |
| ("contact_prediction", "supervised"), | |
| ], | |
| }, | |
| { | |
| "name": "Grounding + Retrieval", | |
| "color": "#b65b04", | |
| "left": 848, | |
| "top": 374, | |
| "width": 220, | |
| "tasks": [ | |
| ("caption_grounding", "retrieval"), | |
| ("cross_modal_retrieval", "retrieval"), | |
| ("object_relevance", "supervised"), | |
| ], | |
| }, | |
| { | |
| "name": "Temporal Diagnostics", | |
| "color": "#b42318", | |
| "left": 1202, | |
| "top": 374, | |
| "width": 244, | |
| "tasks": [ | |
| ("transition_detection", "diagnostic"), | |
| ("temporal_order", "diagnostic"), | |
| ("misalignment_detection", "diagnostic"), | |
| ], | |
| }, | |
| ] | |
| def load_summary() -> dict: | |
| return json.loads(SUMMARY_PATH.read_text(encoding="utf-8")) | |
| def fmt(value: float) -> str: | |
| return f"{float(value):.4f}" | |
| def metric_for(task_name: str, metrics: dict) -> tuple[str, str]: | |
| if task_name == "hand_trajectory_forecast": | |
| return "MPJPE", fmt(metrics["mpjpe"]) | |
| if task_name == "cross_modal_retrieval": | |
| return "top-5", fmt(metrics["top5_accuracy"]) | |
| if task_name == "caption_grounding": | |
| return "MRR", fmt(metrics["mrr"]) | |
| if task_name == "object_relevance": | |
| return "micro-F1", fmt(metrics["micro_f1"]) | |
| if task_name == "modality_reconstruction": | |
| return "R2", fmt(metrics["r2"]) | |
| if task_name in {"temporal_order", "misalignment_detection"}: | |
| return "F1", fmt(metrics["f1"]) | |
| if "macro_f1" in metrics: | |
| return "macro-F1", fmt(metrics["macro_f1"]) | |
| if "accuracy" in metrics: | |
| return "accuracy", fmt(metrics["accuracy"]) | |
| raise KeyError(f"No main metric configured for {task_name}") | |
| def short_io(task_name: str, metrics: dict) -> str: | |
| custom = { | |
| "timeline_action": "all modalities -> action label", | |
| "timeline_subtask": "all modalities -> subtask label", | |
| "transition_detection": "all modalities -> boundary / steady", | |
| "next_action": "window at t -> action at t+20", | |
| "hand_trajectory_forecast": "all modalities -> future hand joints", | |
| "contact_prediction": "non-contact modalities -> contact", | |
| "object_relevance": "non-caption modalities -> object set", | |
| "caption_grounding": "text query -> matching window", | |
| "cross_modal_retrieval": "motion / IMU / camera -> depth / video", | |
| "modality_reconstruction": "motion / IMU / camera -> depth / video vec", | |
| "temporal_order": "two windows -> correct order?", | |
| "misalignment_detection": "motion + visual -> aligned / shifted", | |
| } | |
| return custom.get(task_name, metrics.get("input", "")) | |
| def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str: | |
| label, value = metric_for(task_name, metrics) | |
| io = short_io(task_name, metrics) | |
| name_size = 17 if len(task_name) > 22 else 18 | |
| return f""" | |
| <section class="task" style="left:{group['left']}px;top:{top}px;width:{group['width']}px;--accent:{group['color']};"> | |
| <div class="kind">{html.escape(kind)}</div> | |
| <div class="task-name" style="font-size:{name_size}px;">{html.escape(task_name)}</div> | |
| <div class="io">{html.escape(io)}</div> | |
| <div class="metric"><span>{html.escape(label)}</span><strong>{html.escape(value)}</strong></div> | |
| </section> | |
| """ | |
| def build_html(summary: dict, base_image: Path) -> str: | |
| suite = summary["tasks"] | |
| task_count = len(suite) | |
| group_headers = [] | |
| cards = [] | |
| row_tops = [374, 552, 730] | |
| header_lefts = [38, 417, 792, 1143] | |
| for group, header_left in zip(GROUPS, header_lefts): | |
| group_headers.append( | |
| f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>' | |
| ) | |
| for row_idx, (task_name, kind) in enumerate(group["tasks"]): | |
| cards.append(task_html(task_name, kind, suite[task_name], row_tops[row_idx], group)) | |
| stats = [ | |
| f"{summary['num_frames']:,} frames", | |
| f"{summary['num_windows']:,} windows", | |
| f"{summary['feature_dim']:,} features", | |
| f"{task_count} tasks", | |
| "chronological split", | |
| ] | |
| stat_html = "".join(f"<span>{html.escape(item)}</span>" for item in stats) | |
| base_uri = base_image.resolve().as_uri() | |
| return f"""<!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <meta name="viewport" content="width=1536, initial-scale=1"> | |
| <title>Ropedia 12-Task Episode Suite Infographic</title> | |
| <style> | |
| * {{ box-sizing: border-box; }} | |
| html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }} | |
| body {{ | |
| font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif; | |
| color: #10141f; | |
| }} | |
| .canvas {{ | |
| position: relative; | |
| width: 1536px; | |
| height: 1024px; | |
| overflow: hidden; | |
| background-image: url("{base_uri}"); | |
| background-size: 1536px 1024px; | |
| background-repeat: no-repeat; | |
| }} | |
| .title {{ | |
| position: absolute; | |
| left: 330px; | |
| top: 42px; | |
| width: 876px; | |
| text-align: center; | |
| }} | |
| h1 {{ | |
| margin: 0; | |
| font-size: 38px; | |
| line-height: 1.05; | |
| letter-spacing: 0; | |
| font-weight: 820; | |
| }} | |
| .subtitle {{ | |
| margin-top: 8px; | |
| color: #425067; | |
| font-size: 15px; | |
| line-height: 1.35; | |
| font-weight: 520; | |
| }} | |
| .stats {{ | |
| margin-top: 12px; | |
| display: flex; | |
| justify-content: center; | |
| gap: 8px; | |
| }} | |
| .stats span {{ | |
| display: inline-flex; | |
| align-items: center; | |
| height: 24px; | |
| padding: 0 10px; | |
| border: 1px solid #cdd8e8; | |
| background: rgba(255, 255, 255, 0.82); | |
| border-radius: 999px; | |
| color: #253046; | |
| font-size: 12px; | |
| font-weight: 720; | |
| }} | |
| .modality {{ | |
| position: absolute; | |
| top: 256px; | |
| width: 180px; | |
| text-align: center; | |
| font-size: 12px; | |
| color: #536074; | |
| font-weight: 720; | |
| text-transform: uppercase; | |
| letter-spacing: 0; | |
| }} | |
| .group-title {{ | |
| position: absolute; | |
| width: 322px; | |
| text-align: center; | |
| font-size: 18px; | |
| line-height: 1; | |
| font-weight: 830; | |
| letter-spacing: 0; | |
| }} | |
| .task {{ | |
| position: absolute; | |
| padding: 0; | |
| }} | |
| .kind {{ | |
| display: inline-flex; | |
| align-items: center; | |
| height: 22px; | |
| padding: 0 8px; | |
| border-radius: 6px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff); | |
| color: var(--accent); | |
| background: rgba(255, 255, 255, 0.76); | |
| text-transform: uppercase; | |
| font-size: 10px; | |
| line-height: 1; | |
| font-weight: 840; | |
| letter-spacing: 0; | |
| }} | |
| .task-name {{ | |
| margin-top: 7px; | |
| color: #111827; | |
| line-height: 1.05; | |
| font-weight: 850; | |
| letter-spacing: 0; | |
| white-space: nowrap; | |
| }} | |
| .io {{ | |
| margin-top: 8px; | |
| min-height: 36px; | |
| color: #475569; | |
| font-size: 13.5px; | |
| line-height: 1.28; | |
| font-weight: 570; | |
| }} | |
| .metric {{ | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 9px; | |
| margin-top: 8px; | |
| height: 30px; | |
| padding: 0 10px; | |
| border-radius: 7px; | |
| border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff); | |
| background: rgba(255, 255, 255, 0.90); | |
| box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07); | |
| }} | |
| .metric span {{ | |
| color: #64748b; | |
| font-size: 12px; | |
| font-weight: 760; | |
| }} | |
| .metric strong {{ | |
| color: var(--accent); | |
| font-size: 16px; | |
| line-height: 1; | |
| font-weight: 860; | |
| }} | |
| .footer {{ | |
| position: absolute; | |
| left: 360px; | |
| top: 932px; | |
| width: 816px; | |
| text-align: center; | |
| color: #536074; | |
| font-size: 14px; | |
| font-weight: 650; | |
| }} | |
| </style> | |
| </head> | |
| <body> | |
| <main class="canvas" aria-label="Ropedia 12-task episode suite infographic"> | |
| <div class="title"> | |
| <h1>Ropedia 12-Task Episode Suite</h1> | |
| <div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div> | |
| <div class="stats">{stat_html}</div> | |
| </div> | |
| <div class="modality" style="left:50px;">fisheye video</div> | |
| <div class="modality" style="left:270px;">depth</div> | |
| <div class="modality" style="left:530px;">3D / SLAM</div> | |
| <div class="modality" style="left:770px;">IMU</div> | |
| <div class="modality" style="left:1030px;">hands</div> | |
| <div class="modality" style="left:1278px;">text / objects</div> | |
| {''.join(group_headers)} | |
| {''.join(cards)} | |
| <div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div> | |
| </main> | |
| </body> | |
| </html> | |
| """ | |
| def render_html(html_path: Path, output_path: Path) -> None: | |
| output_path.parent.mkdir(parents=True, exist_ok=True) | |
| subprocess.run( | |
| [ | |
| "npx", | |
| "--yes", | |
| "playwright", | |
| "screenshot", | |
| "--full-page", | |
| "--viewport-size=1536,1024", | |
| html_path.resolve().as_uri(), | |
| str(output_path), | |
| ], | |
| check=True, | |
| ) | |
| def main() -> int: | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE) | |
| parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) | |
| parser.add_argument("--html", type=Path) | |
| parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.") | |
| args = parser.parse_args() | |
| summary = load_summary() | |
| html_text = build_html(summary, args.base_image) | |
| if args.html is None: | |
| with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle: | |
| handle.write(html_text) | |
| html_path = Path(handle.name) | |
| else: | |
| html_path = args.html | |
| html_path.parent.mkdir(parents=True, exist_ok=True) | |
| html_path.write_text(html_text, encoding="utf-8") | |
| if not args.no_export: | |
| render_html(html_path, args.output) | |
| print(f"Wrote image: {args.output}") | |
| print(f"Wrote overlay HTML: {html_path}") | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |