File size: 25,866 Bytes

#!/usr/bin/env python3
"""Organize the 12 Xperience-10M tasks into the four Ropedia research tracks.

The script is intentionally deterministic: it reads the committed task metrics,
adds a manually curated taxonomy, and writes machine-readable artifacts used by the
README, website, and Hugging Face pages.
"""

from __future__ import annotations

import csv
import html
import json
from collections import OrderedDict
from pathlib import Path
from typing import Any

from task_display import task_display_name


# Directory one level above the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Task-suite results directory; the summary report read by load_summary() lives here.
RESULTS = ROOT / "results" / "episode_task_suite"
# Destination for the generated taxonomy JSON, CSV, and Markdown files.
OUT_DIR = RESULTS / "research_directions"
# Second destination for the taxonomy JSON (written by main()).
DOCS_DATA = ROOT / "docs" / "data"
# Destination for the generated coverage SVG (written by write_svg()).
CHARTS = ROOT / "docs" / "assets" / "charts"

# Input file parsed by load_summary().
SUMMARY_REPORT = RESULTS / "summary_report.json"


# The four research directions, keyed by a single-letter code ("A"-"D") in the
# order they are emitted. Every entry holds descriptive text fields plus a
# "next_steps" list; build_taxonomy() copies these fields into its output and
# adds the linked task ids, their display names, and per-role link counts.
DIRECTIONS: OrderedDict[str, dict[str, Any]] = OrderedDict(
    [
        (
            "A",
            {
                "id": "human_motion",
                "name": "Human Modeling & Motion Understanding",
                "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
                "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
                "current_status": "partially implemented",
                "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
                "next_steps": [
                    "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
                    "Train sequence models over multi-episode motion trajectories instead of isolated windows.",
                    "Evaluate affordance prediction on held-out objects and held-out episodes.",
                ],
            },
        ),
        (
            "B",
            {
                "id": "reconstruction_rendering",
                "name": "3D/4D Reconstruction & Neural Rendering",
                "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
                "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
                "current_status": "proxy tasks only",
                "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
                "next_steps": [
                    "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
                    "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
                    "Evaluate novel-view synthesis and temporal consistency across held-out views/time.",
                ],
            },
        ),
        (
            "C",
            {
                "id": "egocentric_interaction",
                "name": "Egocentric Vision & Interaction",
                "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
                "preferred_background": "Video understanding, action recognition, or egocentric vision.",
                "current_status": "strongest implemented track",
                "current_readout": "Most of the 12 tasks directly target egocentric action, task state, interaction, grounding, and alignment.",
                "next_steps": [
                    "Move from single-episode chronological splits to held-out-episode splits.",
                    "Use audio together with stronger multimodal backbones for action, intent, and grounding.",
                    "Evaluate long-horizon task success prediction and action-conditioned generation.",
                ],
            },
        ),
        (
            "D",
            {
                "id": "world_modeling",
                "name": "Scene Reconstruction & World Modeling",
                "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
                "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
                "current_status": "early proxy tasks",
                "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
                "next_steps": [
                    "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
                    "Add map consistency, object permanence, and spatial relation prediction tasks.",
                    "Train held-out-episode world models that predict future observations and task state.",
                ],
            },
        ),
    ]
)


# Manually curated metadata for each task, keyed by task id in the order the
# tasks are emitted. "direction_roles" maps a direction code from DIRECTIONS to
# one of the roles tallied by build_taxonomy() ("direct", "proxy",
# "diagnostic"); "primary_direction" names one direction code per task. The
# remaining fields are descriptive text copied into the generated artifacts.
TASK_TAXONOMY: OrderedDict[str, dict[str, Any]] = OrderedDict(
    [
        (
            "timeline_action",
            {
                "name": "Timeline action recognition",
                "family": "supervised",
                "input": "all featurized modalities",
                "output": "current action label",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "A": "proxy"},
                "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout.",
                "current_limit": "Chronological single-episode split creates unseen future action classes.",
            },
        ),
        (
            "timeline_subtask",
            {
                "name": "Timeline subtask recognition",
                "family": "supervised",
                "input": "all featurized modalities",
                "output": "current subtask label",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "D": "proxy"},
                "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state.",
                "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
            },
        ),
        (
            "transition_detection",
            {
                "name": "Action transition detection",
                "family": "diagnostic",
                "input": "all featurized modalities",
                "output": "boundary vs steady state",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "D": "diagnostic"},
                "why": "Localizes egocentric task boundaries and diagnoses temporal state changes.",
                "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
            },
        ),
        (
            "next_action",
            {
                "name": "Short-horizon next action",
                "family": "supervised",
                "input": "current multimodal window",
                "output": "action 20 frames later",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "D": "proxy"},
                "why": "Tests action intention/task-flow prediction from egocentric context.",
                "current_limit": "Unseen future labels dominate the single-episode chronological test.",
            },
        ),
        (
            "hand_trajectory_forecast",
            {
                "name": "Hand trajectory forecasting",
                "family": "forecast",
                "input": "current multimodal window",
                "output": "future left/right hand 3D joints",
                "primary_direction": "A",
                "direction_roles": {"A": "direct", "C": "proxy"},
                "why": "Directly predicts human hand motion and supports hand-object interaction modeling.",
                "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
            },
        ),
        (
            "contact_prediction",
            {
                "name": "Body/object contact prediction",
                "family": "supervised",
                "input": "non-contact/non-caption features",
                "output": "binary contact label",
                "primary_direction": "A",
                "direction_roles": {"A": "direct", "C": "proxy"},
                "why": "Targets physical interaction state, a core affordance and manipulation signal.",
                "current_limit": "The public sample is degenerate for this target because one class dominates.",
            },
        ),
        (
            "object_relevance",
            {
                "name": "Relevant object set prediction",
                "family": "supervised",
                "input": "non-caption feature blocks",
                "output": "multi-label object set",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "A": "proxy", "D": "proxy"},
                "why": "Connects egocentric activity to manipulated objects and early object-centric state.",
                "current_limit": "Object labels are language-derived and sparse in one episode.",
            },
        ),
        (
            "caption_grounding",
            {
                "name": "Caption-to-window grounding",
                "family": "retrieval",
                "input": "caption objects/interaction query and candidate sensor windows",
                "output": "matching time window",
                "primary_direction": "C",
                "direction_roles": {"C": "direct", "D": "proxy"},
                "why": "Grounds language annotation into egocentric sensor time and task state.",
                "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
            },
        ),
        (
            "cross_modal_retrieval",
            {
                "name": "Cross-modal retrieval",
                "family": "retrieval",
                "input": "motion/IMU/camera query",
                "output": "matching depth/video window",
                "primary_direction": "C",
                "direction_roles": {"C": "diagnostic", "B": "proxy", "D": "proxy"},
                "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling.",
                "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
            },
        ),
        (
            "modality_reconstruction",
            {
                "name": "Modality reconstruction",
                "family": "forecast",
                "input": "motion/IMU/camera",
                "output": "depth/video feature vector",
                "primary_direction": "B",
                "direction_roles": {"B": "proxy", "D": "proxy"},
                "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective.",
                "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
            },
        ),
        (
            "temporal_order",
            {
                "name": "Temporal order verification",
                "family": "diagnostic",
                "input": "two adjacent windows",
                "output": "correct vs reversed order",
                "primary_direction": "C",
                "direction_roles": {"C": "diagnostic", "D": "diagnostic"},
                "why": "Checks whether features encode local time direction and task progression.",
                "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
            },
        ),
        (
            "misalignment_detection",
            {
                "name": "Cross-modal misalignment detection",
                "family": "diagnostic",
                "input": "motion plus visual/depth pair",
                "output": "aligned vs shifted",
                "primary_direction": "C",
                "direction_roles": {"C": "diagnostic", "B": "diagnostic", "D": "diagnostic"},
                "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models.",
                "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
            },
        ),
    ]
)


# Headline metric per task id, as a 3-tuple:
#   (key looked up in the task's metrics dict by metric_value(),
#    metric name shown in the generated CSV/Markdown,
#    "higher" or "lower" - which direction choose_better() treats as better).
METRIC_SPECS = {
    "timeline_action": ("macro_f1", "macro-F1", "higher"),
    "timeline_subtask": ("macro_f1", "macro-F1", "higher"),
    "transition_detection": ("macro_f1", "macro-F1", "higher"),
    "next_action": ("macro_f1", "macro-F1", "higher"),
    "hand_trajectory_forecast": ("mpjpe", "MPJPE", "lower"),
    "contact_prediction": ("macro_f1", "macro-F1", "higher"),
    "object_relevance": ("micro_f1", "micro-F1", "higher"),
    "caption_grounding": ("mrr", "MRR", "higher"),
    "cross_modal_retrieval": ("mrr", "MRR", "higher"),
    "modality_reconstruction": ("r2", "R2", "higher"),
    "temporal_order": ("f1", "F1", "higher"),
    "misalignment_detection": ("f1", "F1", "higher"),
}


def load_summary() -> dict[str, Any]:
    """Parse the UTF-8 JSON file at ``SUMMARY_REPORT`` and return its content."""
    with SUMMARY_REPORT.open(encoding="utf-8") as handle:
        return json.load(handle)


def metric_value(metrics: dict[str, Any] | None, task: str) -> float | None:
    """Return the headline metric for *task* as a float, if it is present.

    Args:
        metrics: Metric dict for one task, or ``None``/empty when the task has
            no recorded metrics.
        task: Task id used to find the metric key in ``METRIC_SPECS``.

    Returns:
        The value stored under the task's metric key converted with ``float``,
        or ``None`` when *metrics* is falsy or the key is absent/``None``.
    """
    if metrics:
        raw = metrics.get(METRIC_SPECS[task][0])
        if raw is not None:
            return float(raw)
    return None


def choose_better(task: str, minimal: float | None, neural: float | None) -> str:
    """Name the baseline with the better headline metric for *task*.

    Args:
        task: Task id; its entry in ``METRIC_SPECS`` says whether a higher or a
            lower metric value is better.
        minimal: Metric of the minimal baseline, or ``None`` if missing.
        neural: Metric of the neural MLP baseline, or ``None`` if missing.

    Returns:
        ``"unavailable"`` if either metric is ``None``, ``"tie"`` if the two
        differ by less than ``1e-9``, otherwise ``"neural_mlp"`` or
        ``"minimal"``.
    """
    either_missing = minimal is None or neural is None
    if either_missing:
        return "unavailable"

    lower_is_better = METRIC_SPECS[task][2] == "lower"
    gap = neural - minimal
    if abs(gap) < 1e-9:
        return "tie"

    # The neural baseline wins when the gap points in the metric's good direction.
    neural_wins = gap < 0 if lower_is_better else gap > 0
    if neural_wins:
        return "neural_mlp"
    return "minimal"


def fmt_metric(value: float | None) -> str:
    """Format a metric for display.

    Returns ``"n/a"`` for ``None``; otherwise fixed-point text with three
    decimals when the magnitude is at least 10 and four decimals below that.
    """
    if value is None:
        return "n/a"
    decimals = 3 if abs(value) >= 10 else 4
    return f"{value:.{decimals}f}"


def baseline_readout(label: str) -> str:
    """Turn a ``choose_better`` label into a short sentence fragment.

    Any label other than ``"tie"``, ``"minimal"`` or ``"neural_mlp"`` yields
    the "unavailable" wording.
    """
    wording = (
        ("tie", "Both baselines are tied"),
        ("minimal", "Minimal baseline is stronger"),
        ("neural_mlp", "Neural MLP is stronger"),
    )
    for known_label, sentence in wording:
        if label == known_label:
            return sentence
    return "Baseline comparison is unavailable"


def build_taxonomy(summary: dict[str, Any]) -> dict[str, Any]:
    """Combine the curated taxonomy with the metrics found in *summary*.

    Args:
        summary: Parsed summary report. ``summary["tasks"]`` is required and is
            read as a mapping from task id to a metrics dict. ``neural_tasks``,
            ``num_frames``, ``num_windows`` and ``feature_dim`` are optional;
            a missing or null value is tolerated for each of them.

    Returns:
        A dict with the keys ``source``, ``dataset_scope``, ``baselines``,
        ``directions`` and ``tasks``. ``tasks`` holds one record per entry of
        ``TASK_TAXONOMY`` (its curated fields plus ``display_name``,
        ``artifact_id`` and a ``metric`` sub-dict); ``directions`` holds one
        record per entry of ``DIRECTIONS`` (its curated fields plus the linked
        task ids, their display names, and per-role link counts).

    Raises:
        KeyError: If ``summary`` has no ``"tasks"`` entry.
    """
    minimal_tasks = summary["tasks"]
    # ``or {}`` also covers an explicit null in the report, which a plain
    # ``.get(key, {})`` would pass through as None.
    neural_tasks = summary.get("neural_tasks") or {}

    # Formatting None with the ``:,`` spec raises TypeError, so the dimension is
    # only mentioned in the baseline description when the report provides it.
    feature_dim = summary.get("feature_dim")
    feature_phrase = f"{feature_dim:,}-d " if feature_dim is not None else ""

    task_records: OrderedDict[str, dict[str, Any]] = OrderedDict()
    direction_counts = {
        code: {"direct": 0, "proxy": 0, "diagnostic": 0, "total_links": 0}
        for code in DIRECTIONS
    }

    for task, spec in TASK_TAXONOMY.items():
        metric_key, metric_name, metric_direction = METRIC_SPECS[task]
        minimal_metric = metric_value(minimal_tasks.get(task), task)
        neural_metric = metric_value(neural_tasks.get(task), task)
        better = choose_better(task, minimal_metric, neural_metric)

        # Tally every (direction, role) link so each direction knows how many
        # direct/proxy/diagnostic tasks point at it.
        for direction_code, role in spec["direction_roles"].items():
            direction_counts[direction_code][role] += 1
            direction_counts[direction_code]["total_links"] += 1

        task_records[task] = {
            **spec,
            "display_name": task_display_name(task),
            "artifact_id": task,
            "metric": {
                "key": metric_key,
                "name": metric_name,
                "direction": metric_direction,
                "minimal": minimal_metric,
                "neural_mlp": neural_metric,
                "better_baseline": better,
            },
        }

    direction_records = OrderedDict()
    for code, info in DIRECTIONS.items():
        # A task is linked to a direction when the direction code appears in
        # its role mapping, regardless of the role.
        linked_tasks = [
            task
            for task, spec in task_records.items()
            if code in spec["direction_roles"]
        ]
        direction_records[code] = {
            **info,
            "tasks": linked_tasks,
            "task_display_names": [task_display_name(task) for task in linked_tasks],
            "counts": direction_counts[code],
        }

    return {
        "source": "results/episode_task_suite/summary_report.json",
        "dataset_scope": {
            "sample_episode_count": 1,
            "num_frames": summary.get("num_frames"),
            "num_windows": summary.get("num_windows"),
            "feature_dim": feature_dim,
            "warning": "Single public sample episode; this supports pipeline/task evidence, while cross-episode generalization requires held-out episodes.",
        },
        "baselines": {
            "minimal": f"Interpretable softmax, logistic, ridge, and retrieval heads over the {feature_phrase}window feature vector.",
            "neural_mlp": "Small PyTorch MLP classifiers/regressors using the same features, splits, and task contracts.",
        },
        "directions": direction_records,
        "tasks": task_records,
    }


def write_csv(taxonomy: dict[str, Any]) -> None:
    """Write one CSV row per (task, direction) link.

    The file ``research_direction_task_map.csv`` is created under ``OUT_DIR``
    with a header row followed by a row for every entry in each task's
    ``direction_roles``. Missing metrics are written as empty cells; present
    ones use up to 12 significant digits.
    """
    header = (
        "direction",
        "direction_name",
        "task",
        "task_display_name",
        "task_name",
        "family",
        "relationship",
        "primary_direction",
        "metric_name",
        "minimal_metric",
        "neural_mlp_metric",
        "better_baseline",
        "why",
        "current_limit",
    )

    def as_cell(number: float | None) -> str:
        # Empty cell for a missing metric, compact general format otherwise.
        if number is None:
            return ""
        return f"{number:.12g}"

    target = OUT_DIR / "research_direction_task_map.csv"
    with target.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle, lineterminator="\n")
        out.writerow(header)
        for task_id, record in taxonomy["tasks"].items():
            metric = record["metric"]
            for code, relationship in record["direction_roles"].items():
                row = (
                    code,
                    taxonomy["directions"][code]["name"],
                    task_id,
                    record["display_name"],
                    record["name"],
                    record["family"],
                    relationship,
                    record["primary_direction"],
                    metric["name"],
                    as_cell(metric["minimal"]),
                    as_cell(metric["neural_mlp"]),
                    metric["better_baseline"],
                    record["why"],
                    record["current_limit"],
                )
                out.writerow(row)


def write_markdown(taxonomy: dict[str, Any]) -> None:
    """Render the taxonomy as ``research_direction_summary.md`` under ``OUT_DIR``.

    The document contains, in order: a title and introduction, a table of the
    two baseline families, a per-direction coverage table, a per-task table
    with both baseline metrics and a readout, and a per-direction section
    listing the current readout and next steps. Trailing whitespace is
    stripped and the file ends with a single newline.
    """

    def table_row(*cells: Any) -> str:
        # One Markdown table row: a pipe before, between, and after the cells.
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    baselines = taxonomy["baselines"]
    doc = [
        "# Four-Direction Task Taxonomy",
        "",
        "This file is generated by `scripts/research_direction_taxonomy.py` from the committed 12-task metrics.",
        "It maps the current Xperience-10M sample tasks to the four Ropedia research directions and marks which parts require multi-episode evidence.",
        "",
        "## Baseline Families",
        "",
        "| Baseline | Meaning |",
        "| --- | --- |",
        table_row("Minimal", baselines["minimal"]),
        table_row("Neural MLP", baselines["neural_mlp"]),
        "",
        "## Direction Coverage",
        "",
        "| Direction | Current status | Direct | Proxy | Diagnostic | Current readout |",
        "| --- | --- | ---: | ---: | ---: | --- |",
    ]

    directions = taxonomy["directions"]
    for code, info in directions.items():
        counts = info["counts"]
        doc.append(
            table_row(
                f"{code}. {info['name']}",
                info["current_status"],
                counts["direct"],
                counts["proxy"],
                counts["diagnostic"],
                info["current_readout"],
            )
        )

    doc += [
        "",
        "## Task Mapping With Two Baselines",
        "",
        "| Task | Artifact id | Primary direction | Related directions | Minimal | Neural MLP | Readout |",
        "| --- | --- | --- | --- | ---: | ---: | --- |",
    ]
    for task_id, record in taxonomy["tasks"].items():
        metric = record["metric"]
        metric_name = metric["name"]
        links = [f"{code}:{role}" for code, role in record["direction_roles"].items()]
        minimal_cell = f"{fmt_metric(metric['minimal'])} {metric_name}"
        neural_cell = f"{fmt_metric(metric['neural_mlp'])} {metric_name}"
        verdict = baseline_readout(metric["better_baseline"])
        readout_cell = f"{verdict}. {record['current_limit']}"
        doc.append(
            table_row(
                record["display_name"],
                f"`{task_id}`",
                record["primary_direction"],
                ", ".join(links),
                minimal_cell,
                neural_cell,
                readout_cell,
            )
        )

    doc += ["", "## Next-Step Interpretation", ""]
    for code, info in directions.items():
        doc += [f"### {code}. {info['name']}", "", info["current_readout"], ""]
        doc.extend(f"- {step}" for step in info["next_steps"])
        doc.append("")

    body = "\n".join(doc).rstrip() + "\n"
    (OUT_DIR / "research_direction_summary.md").write_text(body, encoding="utf-8")


def svg_text(x: int, y: int, text: str, size: int = 16, weight: int = 500, color: str = "#f4f8ef") -> str:
    """Return an SVG ``<text>`` element with *text* HTML-escaped.

    Args:
        x: Value for the ``x`` attribute.
        y: Value for the ``y`` attribute.
        text: Element content; passed through ``html.escape``.
        size: Value for ``font-size``.
        weight: Value for ``font-weight``.
        color: Value for ``fill``.
    """
    attributes = (
        f'x="{x}" y="{y}" font-size="{size}" font-weight="{weight}" fill="{color}"'
    )
    content = html.escape(text)
    return f"<text {attributes}>{content}</text>"


def write_svg(taxonomy: dict[str, Any]) -> None:
    """Draw the direction-coverage chart and save it under ``CHARTS``.

    Every direction in ``taxonomy["directions"]`` becomes one card in a
    two-column grid. A card shows the direction name, its status, up to five
    task display names (with a ``+N`` suffix for the rest), a stacked bar of
    direct/proxy/diagnostic link counts, and the counts as text. A legend and a
    provenance line follow the cards. The result is written to
    ``research_direction_coverage.svg``.
    """
    width, height = 1180, 700
    margin = 58
    card_w, card_h = 515, 220
    gap = 34
    colors = {"direct": "#ccffa0", "proxy": "#7ae5c3", "diagnostic": "#d8f4a5"}

    card_markup = []
    for position, (code, info) in enumerate(taxonomy["directions"].items()):
        row, col = divmod(position, 2)
        x = margin + col * (card_w + gap)
        y = 130 + row * (card_h + gap)
        counts = info["counts"]
        # At least 1 so a direction without links does not divide by zero.
        total = max(1, counts["direct"] + counts["proxy"] + counts["diagnostic"])

        # Stacked bar: segments are laid out left to right in the order of
        # ``colors``; roles with a zero count draw nothing.
        bar_w = card_w - 48
        bar_y = y + 132
        cursor = x + 24
        segments = []
        for role, fill in colors.items():
            linked = counts[role]
            seg_w = round(bar_w * linked / total)
            if linked > 0:
                segments.append(
                    f'<rect x="{cursor}" y="{bar_y}" width="{seg_w}" height="16" rx="8" fill="{fill}"/>'
                )
            cursor += seg_w

        names = info["task_display_names"]
        overflow = len(names) - 5
        task_labels = ", ".join(names[:5])
        if overflow > 0:
            task_labels = f"{task_labels}, +{overflow}"

        pieces = [
            f'<rect x="{x}" y="{y}" width="{card_w}" height="{card_h}" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>',
            svg_text(x + 24, y + 42, f"{code}. {info['name']}", 21, 700),
            svg_text(x + 24, y + 75, info["current_status"], 15, 700, "#ccffa0"),
            svg_text(x + 24, y + 108, f"Tasks: {task_labels}", 14, 500, "#dce8d7"),
        ]
        pieces.extend(segments)
        pieces.append(svg_text(x + 24, y + 174, f"Direct {counts['direct']}", 14, 700, colors["direct"]))
        pieces.append(svg_text(x + 150, y + 174, f"Proxy {counts['proxy']}", 14, 700, colors["proxy"]))
        pieces.append(
            svg_text(x + 270, y + 174, f"Diagnostic {counts['diagnostic']}", 14, 700, colors["diagnostic"])
        )
        card_markup.append("\n".join(pieces))

    legend_entries = (
        ("direct", "Direct task"),
        ("proxy", "Proxy / prerequisite"),
        ("diagnostic", "Diagnostic probe"),
    )
    legend_markup = []
    for slot, (role, label) in enumerate(legend_entries):
        lx = margin + 200 * slot
        legend_markup.append(
            f'<rect x="{lx}" y="622" width="16" height="16" rx="4" fill="{colors[role]}"/>'
        )
        legend_markup.append(svg_text(lx + 24, 636, label, 14, 600, "#dce8d7"))

    # Cards and legend items are concatenated without separators; the document
    # lines themselves are newline-joined and the file ends with a newline.
    document = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}" role="img" aria-label="Xperience-10M task coverage across four research directions">',
        '  <rect width="100%" height="100%" fill="#020502"/>',
        '  <rect x="24" y="24" width="1132" height="652" rx="20" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>',
        "  " + svg_text(margin, 64, "Xperience-10M 12-Task Suite: Four Research Directions", 30, 800),
        "  " + svg_text(margin, 96, "One public sample episode, two baseline families, explicit direct/proxy/diagnostic coverage.", 16, 500, "#a5afa2"),
        "  " + "".join(card_markup),
        "  " + "".join(legend_markup),
        "  " + svg_text(margin, 670, "Generated from results/episode_task_suite/summary_report.json and scripts/research_direction_taxonomy.py", 13, 500, "#a5afa2"),
        "</svg>",
    ]
    svg = "\n".join(document) + "\n"
    (CHARTS / "research_direction_coverage.svg").write_text(svg, encoding="utf-8")


def main() -> None:
    """Regenerate every taxonomy artifact and report the written paths.

    Creates the output directories if needed, builds the taxonomy from the
    summary report, writes the JSON to both ``OUT_DIR`` and ``DOCS_DATA``, then
    writes the CSV, Markdown, and SVG files and prints one line per artifact.
    """
    for directory in (OUT_DIR, DOCS_DATA, CHARTS):
        directory.mkdir(parents=True, exist_ok=True)

    taxonomy = build_taxonomy(load_summary())

    # The same JSON text goes to the results tree and to the docs data folder.
    payload = json.dumps(taxonomy, indent=2, ensure_ascii=False) + "\n"
    taxonomy_json = OUT_DIR / "research_direction_taxonomy.json"
    docs_json = DOCS_DATA / "research_directions.json"
    for destination in (taxonomy_json, docs_json):
        destination.write_text(payload, encoding="utf-8")

    write_csv(taxonomy)
    write_markdown(taxonomy)
    write_svg(taxonomy)

    written = (
        taxonomy_json,
        OUT_DIR / "research_direction_task_map.csv",
        OUT_DIR / "research_direction_summary.md",
        docs_json,
        CHARTS / "research_direction_coverage.svg",
    )
    for path in written:
        print(f"Wrote {path}")


if __name__ == "__main__":
    main()