ropedia-xperience-10m-task-baselines / scripts /research_direction_taxonomy.py

Update final Qwen model scripts

627e5d7 verified 21 days ago

25.9 kB

	#!/usr/bin/env python3
	"""Organize the 12 Xperience-10M tasks into the four Ropedia research tracks.

	The script is intentionally deterministic: it reads the committed task metrics,
	adds a manually curated taxonomy, and writes machine-readable artifacts used by the
	README, website, and Hugging Face pages.
	"""

	from __future__ import annotations

	import csv
	import html
	import json
	from collections import OrderedDict
	from pathlib import Path
	from typing import Any

	from task_display import task_display_name


	# Repository root: two levels up from this file's resolved path (the parent of the script's folder).
	ROOT = Path(__file__).resolve().parents[1]
	# Directory of the episode task-suite results; the summary report below lives here.
	RESULTS = ROOT / "results" / "episode_task_suite"
	# Destination for the generated taxonomy JSON, CSV, and Markdown artifacts.
	OUT_DIR = RESULTS / "research_directions"
	# Docs data directory that receives a second copy of the taxonomy JSON.
	DOCS_DATA = ROOT / "docs" / "data"
	# Destination for the generated coverage SVG chart.
	CHARTS = ROOT / "docs" / "assets" / "charts"

	# Input metrics file parsed by load_summary().
	SUMMARY_REPORT = RESULTS / "summary_report.json"


	# The four research tracks, keyed by track letter. Insertion order (A, B, C, D)
	# is the order in which tracks appear in every generated artifact.
	DIRECTIONS: OrderedDict[str, dict[str, Any]] = OrderedDict(
	    {
	        "A": {
	            "id": "human_motion",
	            "name": "Human Modeling & Motion Understanding",
	            "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
	            "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
	            "current_status": "partially implemented",
	            "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
	            "next_steps": [
	                "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
	                "Train sequence models over multi-episode motion trajectories instead of isolated windows.",
	                "Evaluate affordance prediction on held-out objects and held-out episodes.",
	            ],
	        },
	        "B": {
	            "id": "reconstruction_rendering",
	            "name": "3D/4D Reconstruction & Neural Rendering",
	            "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
	            "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
	            "current_status": "proxy tasks only",
	            "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
	            "next_steps": [
	                "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
	                "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
	                "Evaluate novel-view synthesis and temporal consistency across held-out views/time.",
	            ],
	        },
	        "C": {
	            "id": "egocentric_interaction",
	            "name": "Egocentric Vision & Interaction",
	            "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
	            "preferred_background": "Video understanding, action recognition, or egocentric vision.",
	            "current_status": "strongest implemented track",
	            "current_readout": "Most of the 12 tasks directly target egocentric action, task state, interaction, grounding, and alignment.",
	            "next_steps": [
	                "Move from single-episode chronological splits to held-out-episode splits.",
	                "Use audio together with stronger multimodal backbones for action, intent, and grounding.",
	                "Evaluate long-horizon task success prediction and action-conditioned generation.",
	            ],
	        },
	        "D": {
	            "id": "world_modeling",
	            "name": "Scene Reconstruction & World Modeling",
	            "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
	            "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
	            "current_status": "early proxy tasks",
	            "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
	            "next_steps": [
	                "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
	                "Add map consistency, object permanence, and spatial relation prediction tasks.",
	                "Train held-out-episode world models that predict future observations and task state.",
	            ],
	        },
	    }
	)


	# Manually curated description of each of the 12 tasks, keyed by artifact id.
	# "direction_roles" maps a track letter from DIRECTIONS to the relationship the
	# task has with that track: "direct", "proxy", or "diagnostic".
	TASK_TAXONOMY: OrderedDict[str, dict[str, Any]] = OrderedDict(
	    {
	        "timeline_action": {
	            "name": "Timeline action recognition",
	            "family": "supervised",
	            "input": "all featurized modalities",
	            "output": "current action label",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "A": "proxy"},
	            "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout.",
	            "current_limit": "Chronological single-episode split creates unseen future action classes.",
	        },
	        "timeline_subtask": {
	            "name": "Timeline subtask recognition",
	            "family": "supervised",
	            "input": "all featurized modalities",
	            "output": "current subtask label",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "D": "proxy"},
	            "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state.",
	            "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
	        },
	        "transition_detection": {
	            "name": "Action transition detection",
	            "family": "diagnostic",
	            "input": "all featurized modalities",
	            "output": "boundary vs steady state",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "D": "diagnostic"},
	            "why": "Localizes egocentric task boundaries and diagnoses temporal state changes.",
	            "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
	        },
	        "next_action": {
	            "name": "Short-horizon next action",
	            "family": "supervised",
	            "input": "current multimodal window",
	            "output": "action 20 frames later",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "D": "proxy"},
	            "why": "Tests action intention/task-flow prediction from egocentric context.",
	            "current_limit": "Unseen future labels dominate the single-episode chronological test.",
	        },
	        "hand_trajectory_forecast": {
	            "name": "Hand trajectory forecasting",
	            "family": "forecast",
	            "input": "current multimodal window",
	            "output": "future left/right hand 3D joints",
	            "primary_direction": "A",
	            "direction_roles": {"A": "direct", "C": "proxy"},
	            "why": "Directly predicts human hand motion and supports hand-object interaction modeling.",
	            "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
	        },
	        "contact_prediction": {
	            "name": "Body/object contact prediction",
	            "family": "supervised",
	            "input": "non-contact/non-caption features",
	            "output": "binary contact label",
	            "primary_direction": "A",
	            "direction_roles": {"A": "direct", "C": "proxy"},
	            "why": "Targets physical interaction state, a core affordance and manipulation signal.",
	            "current_limit": "The public sample is degenerate for this target because one class dominates.",
	        },
	        "object_relevance": {
	            "name": "Relevant object set prediction",
	            "family": "supervised",
	            "input": "non-caption feature blocks",
	            "output": "multi-label object set",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "A": "proxy", "D": "proxy"},
	            "why": "Connects egocentric activity to manipulated objects and early object-centric state.",
	            "current_limit": "Object labels are language-derived and sparse in one episode.",
	        },
	        "caption_grounding": {
	            "name": "Caption-to-window grounding",
	            "family": "retrieval",
	            "input": "caption objects/interaction query and candidate sensor windows",
	            "output": "matching time window",
	            "primary_direction": "C",
	            "direction_roles": {"C": "direct", "D": "proxy"},
	            "why": "Grounds language annotation into egocentric sensor time and task state.",
	            "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
	        },
	        "cross_modal_retrieval": {
	            "name": "Cross-modal retrieval",
	            "family": "retrieval",
	            "input": "motion/IMU/camera query",
	            "output": "matching depth/video window",
	            "primary_direction": "C",
	            "direction_roles": {"C": "diagnostic", "B": "proxy", "D": "proxy"},
	            "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling.",
	            "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
	        },
	        "modality_reconstruction": {
	            "name": "Modality reconstruction",
	            "family": "forecast",
	            "input": "motion/IMU/camera",
	            "output": "depth/video feature vector",
	            "primary_direction": "B",
	            "direction_roles": {"B": "proxy", "D": "proxy"},
	            "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective.",
	            "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
	        },
	        "temporal_order": {
	            "name": "Temporal order verification",
	            "family": "diagnostic",
	            "input": "two adjacent windows",
	            "output": "correct vs reversed order",
	            "primary_direction": "C",
	            "direction_roles": {"C": "diagnostic", "D": "diagnostic"},
	            "why": "Checks whether features encode local time direction and task progression.",
	            "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
	        },
	        "misalignment_detection": {
	            "name": "Cross-modal misalignment detection",
	            "family": "diagnostic",
	            "input": "motion plus visual/depth pair",
	            "output": "aligned vs shifted",
	            "primary_direction": "C",
	            "direction_roles": {"C": "diagnostic", "B": "diagnostic", "D": "diagnostic"},
	            "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models.",
	            "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
	        },
	    }
	)


	# Headline metric per task as (key in the metrics dict, display name, which
	# direction is better: "higher" or "lower").
	METRIC_SPECS = dict(
	    timeline_action=("macro_f1", "macro-F1", "higher"),
	    timeline_subtask=("macro_f1", "macro-F1", "higher"),
	    transition_detection=("macro_f1", "macro-F1", "higher"),
	    next_action=("macro_f1", "macro-F1", "higher"),
	    hand_trajectory_forecast=("mpjpe", "MPJPE", "lower"),
	    contact_prediction=("macro_f1", "macro-F1", "higher"),
	    object_relevance=("micro_f1", "micro-F1", "higher"),
	    caption_grounding=("mrr", "MRR", "higher"),
	    cross_modal_retrieval=("mrr", "MRR", "higher"),
	    modality_reconstruction=("r2", "R2", "higher"),
	    temporal_order=("f1", "F1", "higher"),
	    misalignment_detection=("f1", "F1", "higher"),
	)


	def load_summary() -> dict[str, Any]:
	return json.loads(SUMMARY_REPORT.read_text(encoding="utf-8"))


	def metric_value(metrics: dict[str, Any] \| None, task: str) -> float \| None:
	if not metrics:
	return None
	key = METRIC_SPECS[task][0]
	value = metrics.get(key)
	return float(value) if value is not None else None


	def choose_better(task: str, minimal: float \| None, neural: float \| None) -> str:
	if minimal is None or neural is None:
	return "unavailable"
	_, _, direction = METRIC_SPECS[task]
	delta = neural - minimal
	if abs(delta) < 1e-9:
	return "tie"
	if direction == "lower":
	return "neural_mlp" if delta < 0 else "minimal"
	return "neural_mlp" if delta > 0 else "minimal"


	def fmt_metric(value: float \| None) -> str:
	if value is None:
	return "n/a"
	if abs(value) >= 10:
	return f"{value:.3f}"
	return f"{value:.4f}"


	def baseline_readout(label: str) -> str:
	    """Translate a ``choose_better`` label into a short human-readable phrase.

	    Args:
	        label: One of ``"tie"``, ``"minimal"``, ``"neural_mlp"``; any other
	            value is treated as an unavailable comparison.

	    Returns:
	        The phrase used at the start of the Markdown readout column.
	    """
	    phrases = {
	        "tie": "Both baselines are tied",
	        "minimal": "Minimal baseline is stronger",
	        "neural_mlp": "Neural MLP is stronger",
	    }
	    return phrases.get(label, "Baseline comparison is unavailable")


	def build_taxonomy(summary: dict[str, Any]) -> dict[str, Any]:
	    """Combine the curated taxonomy with the metrics from the summary report.

	    Args:
	        summary: Parsed summary report. ``summary["tasks"]`` (minimal-baseline
	            metrics per task) is required; ``"neural_tasks"``, ``"num_frames"``,
	            ``"num_windows"`` and ``"feature_dim"`` are optional.

	    Returns:
	        A JSON-serializable dict with the keys ``"source"``, ``"dataset_scope"``,
	        ``"baselines"``, ``"directions"`` and ``"tasks"``.

	    Raises:
	        KeyError: If ``summary`` has no ``"tasks"`` entry.
	    """
	    minimal_tasks = summary["tasks"]
	    # "or {}" also covers a report that stores an explicit null for neural_tasks.
	    neural_tasks = summary.get("neural_tasks") or {}

	    task_records: OrderedDict[str, dict[str, Any]] = OrderedDict()
	    direction_counts = {
	        code: {"direct": 0, "proxy": 0, "diagnostic": 0, "total_links": 0}
	        for code in DIRECTIONS
	    }

	    for task, spec in TASK_TAXONOMY.items():
	        metric_key, metric_name, metric_direction = METRIC_SPECS[task]
	        minimal_metric = metric_value(minimal_tasks.get(task), task)
	        neural_metric = metric_value(neural_tasks.get(task), task)
	        better = choose_better(task, minimal_metric, neural_metric)

	        # Tally how each direction is linked to tasks, per relationship kind.
	        for direction_code, role in spec["direction_roles"].items():
	            direction_counts[direction_code][role] += 1
	            direction_counts[direction_code]["total_links"] += 1

	        task_records[task] = {
	            **spec,
	            "display_name": task_display_name(task),
	            "artifact_id": task,
	            "metric": {
	                "key": metric_key,
	                "name": metric_name,
	                "direction": metric_direction,
	                "minimal": minimal_metric,
	                "neural_mlp": neural_metric,
	                "better_baseline": better,
	            },
	        }

	    direction_records = OrderedDict()
	    for code, info in DIRECTIONS.items():
	        linked_tasks = [
	            task
	            for task, record in task_records.items()
	            if code in record["direction_roles"]
	        ]
	        direction_records[code] = {
	            **info,
	            "tasks": linked_tasks,
	            "task_display_names": [task_display_name(task) for task in linked_tasks],
	            "counts": direction_counts[code],
	        }

	    feature_dim = summary.get("feature_dim")
	    # Formatting None with ":," raises TypeError, so drop the dimension from
	    # the sentence when the report does not provide it.
	    if feature_dim is None:
	        feature_phrase = "window feature vector"
	    else:
	        feature_phrase = f"{feature_dim:,}-d window feature vector"

	    return {
	        "source": "results/episode_task_suite/summary_report.json",
	        "dataset_scope": {
	            "sample_episode_count": 1,
	            "num_frames": summary.get("num_frames"),
	            "num_windows": summary.get("num_windows"),
	            "feature_dim": feature_dim,
	            "warning": "Single public sample episode; this supports pipeline/task evidence, while cross-episode generalization requires held-out episodes.",
	        },
	        "baselines": {
	            "minimal": f"Interpretable softmax, logistic, ridge, and retrieval heads over the {feature_phrase}.",
	            "neural_mlp": "Small PyTorch MLP classifiers/regressors using the same features, splits, and task contracts.",
	        },
	        "directions": direction_records,
	        "tasks": task_records,
	    }


	def write_csv(taxonomy: dict[str, Any]) -> None:
	    """Write ``research_direction_task_map.csv`` into ``OUT_DIR``.

	    The file contains one row per (task, linked direction) pair, so a task
	    linked to several directions appears several times.

	    Args:
	        taxonomy: The dict returned by ``build_taxonomy``.
	    """
	    header = [
	        "direction",
	        "direction_name",
	        "task",
	        "task_display_name",
	        "task_name",
	        "family",
	        "relationship",
	        "primary_direction",
	        "metric_name",
	        "minimal_metric",
	        "neural_mlp_metric",
	        "better_baseline",
	        "why",
	        "current_limit",
	    ]
	    path = OUT_DIR / "research_direction_task_map.csv"
	    # newline="" plus an explicit "\n" terminator gives identical bytes on every platform.
	    with path.open("w", newline="", encoding="utf-8") as handle:
	        writer = csv.writer(handle, lineterminator="\n")
	        writer.writerow(header)
	        for task, spec in taxonomy["tasks"].items():
	            metric = spec["metric"]
	            # Missing metrics become empty cells; present ones keep 12 significant digits.
	            minimal_cell = "" if metric["minimal"] is None else f"{metric['minimal']:.12g}"
	            neural_cell = "" if metric["neural_mlp"] is None else f"{metric['neural_mlp']:.12g}"
	            for direction_code, relationship in spec["direction_roles"].items():
	                writer.writerow(
	                    [
	                        direction_code,
	                        taxonomy["directions"][direction_code]["name"],
	                        task,
	                        spec["display_name"],
	                        spec["name"],
	                        spec["family"],
	                        relationship,
	                        spec["primary_direction"],
	                        metric["name"],
	                        minimal_cell,
	                        neural_cell,
	                        metric["better_baseline"],
	                        spec["why"],
	                        spec["current_limit"],
	                    ]
	                )

	def write_markdown(taxonomy: dict[str, Any]) -> None:
	    """Write ``research_direction_summary.md`` into ``OUT_DIR``.

	    The document has four parts: the baseline families, a per-direction
	    coverage table, a per-task table comparing both baselines, and the
	    next steps of every direction.

	    Args:
	        taxonomy: The dict returned by ``build_taxonomy``.
	    """
	    baselines = taxonomy["baselines"]
	    # Table cells are delimited with plain "|"; an escaped "\|" would be
	    # rendered as a literal pipe and collapse each row into a single cell.
	    lines = [
	        "# Four-Direction Task Taxonomy",
	        "",
	        "This file is generated by `scripts/research_direction_taxonomy.py` from the committed 12-task metrics.",
	        "It maps the current Xperience-10M sample tasks to the four Ropedia research directions and marks which parts require multi-episode evidence.",
	        "",
	        "## Baseline Families",
	        "",
	        "| Baseline | Meaning |",
	        "| --- | --- |",
	        f"| Minimal | {baselines['minimal']} |",
	        f"| Neural MLP | {baselines['neural_mlp']} |",
	        "",
	        "## Direction Coverage",
	        "",
	        "| Direction | Current status | Direct | Proxy | Diagnostic | Current readout |",
	        "| --- | --- | ---: | ---: | ---: | --- |",
	    ]
	    for code, info in taxonomy["directions"].items():
	        counts = info["counts"]
	        lines.append(
	            f"| {code}. {info['name']} | {info['current_status']} | {counts['direct']} | {counts['proxy']} | {counts['diagnostic']} | {info['current_readout']} |"
	        )

	    lines.extend(
	        [
	            "",
	            "## Task Mapping With Two Baselines",
	            "",
	            "| Task | Artifact id | Primary direction | Related directions | Minimal | Neural MLP | Readout |",
	            "| --- | --- | --- | --- | ---: | ---: | --- |",
	        ]
	    )
	    for task, spec in taxonomy["tasks"].items():
	        metric = spec["metric"]
	        related = ", ".join(
	            f"{code}:{role}" for code, role in spec["direction_roles"].items()
	        )
	        minimal = f"{fmt_metric(metric['minimal'])} {metric['name']}"
	        neural = f"{fmt_metric(metric['neural_mlp'])} {metric['name']}"
	        readout = f"{baseline_readout(metric['better_baseline'])}. {spec['current_limit']}"
	        lines.append(
	            f"| {spec['display_name']} | `{task}` | {spec['primary_direction']} | {related} | {minimal} | {neural} | {readout} |"
	        )

	    lines.extend(["", "## Next-Step Interpretation", ""])
	    for code, info in taxonomy["directions"].items():
	        lines.append(f"### {code}. {info['name']}")
	        lines.append("")
	        lines.append(info["current_readout"])
	        lines.append("")
	        lines.extend(f"- {step}" for step in info["next_steps"])
	        lines.append("")

	    # Strip the trailing blank entries so the file ends with exactly one newline.
	    (OUT_DIR / "research_direction_summary.md").write_text(
	        "\n".join(lines).rstrip() + "\n", encoding="utf-8"
	    )


	def svg_text(x: int, y: int, text: str, size: int = 16, weight: int = 500, color: str = "#f4f8ef") -> str:
	    """Return one SVG ``<text>`` element with HTML-escaped content.

	    Args:
	        x: Value of the ``x`` attribute.
	        y: Value of the ``y`` attribute.
	        text: Label to display; ``&``, ``<``, ``>`` and quotes are escaped.
	        size: Value of the ``font-size`` attribute.
	        weight: Value of the ``font-weight`` attribute.
	        color: Value of the ``fill`` attribute.

	    Returns:
	        The element as a single-line string.
	    """
	    escaped = html.escape(text)
	    return (
	        f'<text x="{x}" y="{y}" font-size="{size}" font-weight="{weight}" '
	        f'fill="{color}">{escaped}</text>'
	    )


	def write_svg(taxonomy: dict[str, Any]) -> None:
	    """Write ``research_direction_coverage.svg`` into ``CHARTS``.

	    The chart shows one card per direction in a two-column grid. Each card
	    lists up to five linked tasks and a stacked bar whose segments are
	    proportional to the direct/proxy/diagnostic link counts.

	    Args:
	        taxonomy: The dict returned by ``build_taxonomy``.
	    """
	    width = 1180
	    height = 700
	    margin = 58
	    card_w = 515
	    card_h = 220
	    colors = {"direct": "#ccffa0", "proxy": "#7ae5c3", "diagnostic": "#d8f4a5"}
	    cards = []

	    for idx, (code, info) in enumerate(taxonomy["directions"].items()):
	        row, col = divmod(idx, 2)
	        x = margin + col * (card_w + 34)
	        y = 130 + row * (card_h + 34)
	        counts = info["counts"]
	        # max(1, ...) avoids dividing by zero for a direction without any links.
	        total = max(1, counts["direct"] + counts["proxy"] + counts["diagnostic"])
	        bar_x = x + 24
	        bar_y = y + 132
	        bar_w = card_w - 48

	        # Segment edges are derived from the running total, so the widths
	        # always add up to exactly bar_w; rounding each width on its own can
	        # overshoot the bar (for example two equal halves of an odd width).
	        segments = []
	        cursor = bar_x
	        running = 0
	        for key in ("direct", "proxy", "diagnostic"):
	            running += counts[key]
	            seg_end = bar_x + round(bar_w * running / total)
	            if counts[key] > 0:
	                segments.append(
	                    f'<rect x="{cursor}" y="{bar_y}" width="{seg_end - cursor}" height="16" rx="8" fill="{colors[key]}"/>'
	                )
	            cursor = seg_end

	        # Show at most five task names and summarize the remainder as "+N".
	        names = info["task_display_names"]
	        task_labels = ", ".join(names[:5])
	        if len(names) > 5:
	            task_labels += f", +{len(names) - 5}"

	        cards.append(
	            "\n".join(
	                [
	                    f'<rect x="{x}" y="{y}" width="{card_w}" height="{card_h}" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>',
	                    svg_text(x + 24, y + 42, f"{code}. {info['name']}", 21, 700),
	                    svg_text(x + 24, y + 75, info["current_status"], 15, 700, "#ccffa0"),
	                    svg_text(x + 24, y + 108, f"Tasks: {task_labels}", 14, 500, "#dce8d7"),
	                    *segments,
	                    svg_text(x + 24, y + 174, f"Direct {counts['direct']}", 14, 700, colors["direct"]),
	                    svg_text(x + 150, y + 174, f"Proxy {counts['proxy']}", 14, 700, colors["proxy"]),
	                    svg_text(x + 270, y + 174, f"Diagnostic {counts['diagnostic']}", 14, 700, colors["diagnostic"]),
	                ]
	            )
	        )

	    legend = []
	    lx = margin
	    for key, label in (
	        ("direct", "Direct task"),
	        ("proxy", "Proxy / prerequisite"),
	        ("diagnostic", "Diagnostic probe"),
	    ):
	        legend.extend(
	            [
	                f'<rect x="{lx}" y="622" width="16" height="16" rx="4" fill="{colors[key]}"/>',
	                svg_text(lx + 24, 636, label, 14, 600, "#dce8d7"),
	            ]
	        )
	        lx += 200

	    # The document is assembled line by line so that no source indentation
	    # can leak into the generated file.
	    svg_lines = [
	        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}" role="img" aria-label="Xperience-10M task coverage across four research directions">',
	        '<rect width="100%" height="100%" fill="#020502"/>',
	        '<rect x="24" y="24" width="1132" height="652" rx="20" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>',
	        svg_text(margin, 64, "Xperience-10M 12-Task Suite: Four Research Directions", 30, 800),
	        svg_text(margin, 96, "One public sample episode, two baseline families, explicit direct/proxy/diagnostic coverage.", 16, 500, "#a5afa2"),
	        "".join(cards),
	        "".join(legend),
	        svg_text(margin, 670, "Generated from results/episode_task_suite/summary_report.json and scripts/research_direction_taxonomy.py", 13, 500, "#a5afa2"),
	        "</svg>",
	    ]
	    (CHARTS / "research_direction_coverage.svg").write_text(
	        "\n".join(svg_lines) + "\n", encoding="utf-8"
	    )


	def main() -> None:
	    """Regenerate every taxonomy artifact from the committed summary report.

	    Creates the output directories if needed, builds the taxonomy, writes the
	    JSON (twice: results and docs copies), CSV, Markdown and SVG files, and
	    prints the path of each written file.
	    """
	    for directory in (OUT_DIR, DOCS_DATA, CHARTS):
	        directory.mkdir(parents=True, exist_ok=True)

	    taxonomy = build_taxonomy(load_summary())

	    # Both JSON files receive identical content, terminated by one newline.
	    json_text = json.dumps(taxonomy, indent=2, ensure_ascii=False) + "\n"
	    results_json = OUT_DIR / "research_direction_taxonomy.json"
	    docs_json = DOCS_DATA / "research_directions.json"
	    results_json.write_text(json_text, encoding="utf-8")
	    docs_json.write_text(json_text, encoding="utf-8")

	    write_csv(taxonomy)
	    write_markdown(taxonomy)
	    write_svg(taxonomy)

	    written = (
	        results_json,
	        OUT_DIR / "research_direction_task_map.csv",
	        OUT_DIR / "research_direction_summary.md",
	        docs_json,
	        CHARTS / "research_direction_coverage.svg",
	    )
	    for path in written:
	        print(f"Wrote {path}")


	# Run the generator only when the file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()