#!/usr/bin/env python3
"""Build a unified 20-task radar chart for baseline and model-branch metrics."""

from __future__ import annotations

import html
import json
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Task-suite description that drives the per-task rows of the chart.
TASK_SUITE_PATH = ROOT / "docs/data/task_suite_20.json"

# Metrics files of the verified public model packages.
QWEN_V6_METRICS_PATH = (
    ROOT
    / "results/omni_finetune/verified_public"
    / "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full"
    / "eval/metrics.json"
)
COSMOS_SUPER_REASONER_METRICS_PATH = (
    ROOT
    / "results/omni_finetune/verified_public"
    / "xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607"
    / "eval/metrics.json"
)
COSMOS_NANO_METRICS_PATH = (
    ROOT
    / "results/omni_finetune/verified_public"
    / "xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full"
    / "eval/metrics.json"
)
COSMOS_SUPER_FD_METRICS_PATH = (
    ROOT
    / "results/omni_finetune/verified_public"
    / "xperience10m_cosmos3_super_forward_dynamics_lora_128ep_train1epoch_256_attn_full8gpu_20260608_eval_test_full_fsdp"
    / "eval/metrics.json"
)

# Result directories of the 128-episode baselines and the task-specific probes.
METADATA128_BASELINE_DIR = ROOT / "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2"
RAW128_BASELINE_DIR = ROOT / "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z"
MODEL_OUTPUT_TASK_PROBE_DIR = ROOT / "results/omni_finetune/model_output_task_probes_20260616"
QWEN_FUTURE_TASK_PROBE_DIR = (
    ROOT
    / "results/omni_finetune"
    / "xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z"
)
QWEN_ORDER_SYNC_TIME_PROBE_DIR = (
    ROOT
    / "results/omni_finetune"
    / "xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z"
)
QWEN_RETRIEVAL_TASK_PROBE_DIR = (
    ROOT
    / "results/omni_finetune"
    / "xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z"
)
QWEN_CROSS_MODAL_RETRIEVAL_PROBE_DIR = (
    ROOT
    / "results/omni_finetune"
    / "xperience10m_qwen3_omni_v6_cross_modal_retrieval_probe_a100_20260618T000000Z"
)
QWEN_ACTION_OBJECT_METRICS_PATH = (
    MODEL_OUTPUT_TASK_PROBE_DIR / "action_object_relation/qwen3_omni_v6_lora/metrics.json"
)
COSMOS_SUPER_ACTION_OBJECT_METRICS_PATH = (
    MODEL_OUTPUT_TASK_PROBE_DIR / "action_object_relation/cosmos3_super_reasoner/metrics.json"
)

# Task id -> metrics file of the Qwen task-specific probe for that task.
QWEN_FUTURE_TASK_METRIC_PATHS = {
    "caption_grounding": QWEN_RETRIEVAL_TASK_PROBE_DIR / "caption_grounding/metrics.json",
    "cross_modal_retrieval": QWEN_CROSS_MODAL_RETRIEVAL_PROBE_DIR / "cross_modal_retrieval/metrics.json",
    "temporal_order": QWEN_ORDER_SYNC_TIME_PROBE_DIR / "temporal_order/metrics.json",
    "misalignment_detection": QWEN_ORDER_SYNC_TIME_PROBE_DIR / "misalignment_detection/metrics.json",
    "long_horizon_next_action": QWEN_FUTURE_TASK_PROBE_DIR / "long_horizon_next_action/metrics.json",
    "next_subtask_forecast": QWEN_FUTURE_TASK_PROBE_DIR / "next_subtask_forecast/metrics.json",
    "object_set_forecast": QWEN_FUTURE_TASK_PROBE_DIR / "object_set_forecast/metrics.json",
    "time_to_transition": QWEN_ORDER_SYNC_TIME_PROBE_DIR / "time_to_transition/metrics.json",
}

# Task id -> key inside the probe's metrics payload that carries the score.
# Must stay key-aligned with QWEN_FUTURE_TASK_METRIC_PATHS.
QWEN_FUTURE_TASK_METRIC_KEYS = {
    "caption_grounding": "caption_grounding_mrr",
    "cross_modal_retrieval": "cross_modal_retrieval_mrr",
    "temporal_order": "temporal_order_f1",
    "misalignment_detection": "misalignment_detection_f1",
    "long_horizon_next_action": "long_horizon_next_action_macro_f1",
    "next_subtask_forecast": "next_subtask_forecast_macro_f1",
    "object_set_forecast": "object_set_forecast_micro_f1",
    "time_to_transition": "time_to_transition_mae",
}

# Generated artifacts.
OUTPUT_JSON = ROOT / "docs/data/unified_task_model_radar.json"
OUTPUT_SINGLE_JSON = ROOT / "docs/data/single_episode_task_model_radar.json"
OUTPUT_128_JSON = ROOT / "docs/data/episode128_task_model_radar.json"
OUTPUT_MATRIX_JSON = ROOT / "docs/data/task_method_20_result_matrix.json"
OUTPUT_MATRIX_MD = ROOT / "TASK_METHOD_20_RESULT_MATRIX.md"
OUTPUT_SVG = ROOT / "docs/assets/charts/unified_task_model_radar.svg"
OUTPUT_SINGLE_SVG = ROOT / "docs/assets/charts/single_episode_task_model_radar.svg" OUTPUT_128_SVG = ROOT / "docs/assets/charts/episode128_task_model_radar.svg" SERIES = { "minimal": { "label": "Minimal", "short_label": "Min", "color": "#ccffa0", "kind": "full_20_task_baseline", "scope": "1 public sample episode", "stroke_dasharray": None, }, "neural_mlp": { "label": "Neural MLP", "short_label": "NN", "color": "#67e8d1", "kind": "full_20_task_baseline", "scope": "1 public sample episode", "stroke_dasharray": None, }, "metadata128_simple": { "label": "128ep Metadata Simple", "short_label": "128-S", "color": "#ffd166", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "9 6", }, "metadata128_neural_mlp": { "label": "128ep Metadata NN", "short_label": "128-NN", "color": "#f472b6", "kind": "partial_128_episode_metadata_baseline", "scope": "128 selected episodes, JSONL metadata/text only", "stroke_dasharray": "3 6", }, "raw128_simple": { "label": "128ep Raw Simple", "short_label": "128-RS", "color": "#f59e0b", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "8 4", }, "raw128_neural_mlp": { "label": "128ep Raw NN", "short_label": "128-RN", "color": "#22d3ee", "kind": "complete_128_episode_raw_feature_baseline", "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", "stroke_dasharray": "2 5", }, "qwen3_omni_v6_lora": { "label": "Qwen3-Omni v6 LoRA", "short_label": "Qwen3", "color": "#9bb8ff", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "7 7", }, "cosmos3_super_reasoner": { "label": "Cosmos3-Super Reasoner", "short_label": "C3-S", "color": "#ff9c7a", "kind": "partial_128_episode_foundation_model_overlay", "scope": "128 selected episodes, held-out test", 
"stroke_dasharray": "4 7", }, "cosmos3_nano_future_window": { "label": "Cosmos3-Nano Future Window", "short_label": "C3-N", "color": "#d9c7ff", "kind": "partial_128_episode_world_model_overlay", "scope": "128 selected episodes, held-out test", "stroke_dasharray": "2 7", }, } FOUNDATION_TASK_METRICS = { "timeline_action": { "qwen3_omni_v6_lora": "action_macro_f1", "cosmos3_super_reasoner": "action_macro_f1", "cosmos3_nano_future_window": "action_accuracy_from_retrieved_future", }, "timeline_subtask": { "qwen3_omni_v6_lora": "subtask_accuracy", "cosmos3_super_reasoner": "subtask_accuracy", }, "transition_detection": { "qwen3_omni_v6_lora": "transition_accuracy", "cosmos3_super_reasoner": "transition_accuracy", "cosmos3_nano_future_window": "transition_accuracy", }, "next_action": { "qwen3_omni_v6_lora": "next_action_accuracy", "cosmos3_super_reasoner": "next_action_accuracy", "cosmos3_nano_future_window": "action_accuracy_from_retrieved_future", }, "contact_prediction": { "qwen3_omni_v6_lora": "contact_accuracy", "cosmos3_super_reasoner": "contact_accuracy", "cosmos3_nano_future_window": "contact_accuracy", }, "object_relevance": { "qwen3_omni_v6_lora": "object_micro_f1", "cosmos3_super_reasoner": "object_micro_f1", }, "action_object_relation": { "qwen3_omni_v6_lora": "action_object_relation_macro_f1", "cosmos3_super_reasoner": "action_object_relation_macro_f1", }, "cross_modal_retrieval": { "cosmos3_nano_future_window": "future_retrieval_mrr", }, } FOUNDATION_METRIC_PATHS = { "qwen3_omni_v6_lora": QWEN_V6_METRICS_PATH, "cosmos3_super_reasoner": COSMOS_SUPER_REASONER_METRICS_PATH, "cosmos3_nano_future_window": COSMOS_NANO_METRICS_PATH, } FOUNDATION_METRIC_SOURCE_OVERRIDES = { ("qwen3_omni_v6_lora", "action_object_relation"): QWEN_ACTION_OBJECT_METRICS_PATH, ("cosmos3_super_reasoner", "action_object_relation"): COSMOS_SUPER_ACTION_OBJECT_METRICS_PATH, ("qwen3_omni_v6_lora", "caption_grounding"): QWEN_FUTURE_TASK_METRIC_PATHS["caption_grounding"], 
("qwen3_omni_v6_lora", "cross_modal_retrieval"): QWEN_FUTURE_TASK_METRIC_PATHS["cross_modal_retrieval"], ("qwen3_omni_v6_lora", "temporal_order"): QWEN_FUTURE_TASK_METRIC_PATHS["temporal_order"], ("qwen3_omni_v6_lora", "misalignment_detection"): QWEN_FUTURE_TASK_METRIC_PATHS["misalignment_detection"], ("qwen3_omni_v6_lora", "long_horizon_next_action"): QWEN_FUTURE_TASK_METRIC_PATHS["long_horizon_next_action"], ("qwen3_omni_v6_lora", "next_subtask_forecast"): QWEN_FUTURE_TASK_METRIC_PATHS["next_subtask_forecast"], ("qwen3_omni_v6_lora", "object_set_forecast"): QWEN_FUTURE_TASK_METRIC_PATHS["object_set_forecast"], ("qwen3_omni_v6_lora", "time_to_transition"): QWEN_FUTURE_TASK_METRIC_PATHS["time_to_transition"], } SHORT_TASK_LABELS = { "timeline_action": "Action", "timeline_subtask": "Step", "transition_detection": "Boundary", "next_action": "Next act", "hand_trajectory_forecast": "Hand traj", "contact_prediction": "Contact", "object_relevance": "Objects", "caption_grounding": "Language", "cross_modal_retrieval": "X-modal", "modality_reconstruction": "Recon", "temporal_order": "Order", "misalignment_detection": "Sync", "long_horizon_next_action": "Long act", "next_subtask_forecast": "Long step", "interaction_text_prediction": "Interact txt", "action_object_relation": "Act+obj", "object_set_forecast": "Future obj", "imu_to_hand_pose": "IMU->hand", "camera_view_sync_retrieval": "Cam sync", "time_to_transition": "Time2bdry", } METHOD_DETAILS = { "minimal": "Single-episode simple heads over the public sample split.", "neural_mlp": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.", "metadata128_simple": "128-episode JSONL metadata/text simple baselines.", "metadata128_neural_mlp": "128-episode JSONL metadata/text MLP baselines.", "raw128_simple": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", "raw128_neural_mlp": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", "qwen3_omni_v6_lora": 
"Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future-task probes scored from task-specific JSON.", "cosmos3_super_reasoner": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 16 scored from existing verified action/object JSON.", "cosmos3_nano_future_window": "Verified Cosmos3-Nano future-window compatibility metrics.", } PROXY_TASK_IDS = {"interaction_text_prediction", "camera_view_sync_retrieval"} SINGLE_EPISODE_SERIES = ("minimal", "neural_mlp") EPISODE128_SERIES = ( "metadata128_simple", "metadata128_neural_mlp", "raw128_simple", "raw128_neural_mlp", "qwen3_omni_v6_lora", "cosmos3_super_reasoner", "cosmos3_nano_future_window", ) STATUS_LABELS = { "scored": "scored", "proxy_scored": "proxy scored", "unsupported_without_required_target": "unsupported", "not_supported_by_metadata_only_package": "not supported", "not_evaluated_in_verified_package": "not evaluated", "missing_public_metric": "missing metric", } STATUS_SHORT = { "scored": "score", "proxy_scored": "proxy", "unsupported_without_required_target": "unsupported", "not_supported_by_metadata_only_package": "not supported", "not_evaluated_in_verified_package": "not evaluated", "missing_public_metric": "missing", } def read_json(path: Path) -> dict[str, Any]: return json.loads(path.read_text(encoding="utf-8")) if path.exists() else {} def foundation_task_metric_mapping(qwen_metrics: dict[str, Any]) -> dict[str, dict[str, str]]: mapping = {task_id: dict(series_metrics) for task_id, series_metrics in FOUNDATION_TASK_METRICS.items()} for task_id, path in QWEN_FUTURE_TASK_METRIC_PATHS.items(): payload = read_json(path) metric_key = QWEN_FUTURE_TASK_METRIC_KEYS[task_id] metric_value = payload.get(metric_key) if payload.get("status") != "pass" or not isinstance(metric_value, (int, float)): continue qwen_metrics[metric_key] = metric_value mapping.setdefault(task_id, {})["qwen3_omni_v6_lora"] = metric_key return mapping def 
read_a100_metadata_record(task_id: str, *, neural: bool = False) -> dict[str, Any] | None: path = METADATA128_BASELINE_DIR / ("neural_mlp" if neural else "") / task_id / "metrics.json" if not path.exists(): return None payload = read_json(path) status = payload.get("status", "missing_public_metric") score = payload.get("primary_score") if status == "pass" else None return { "raw": score, "metric_key": payload.get("primary_metric"), "source": str(path.relative_to(ROOT)), "scope": "multi_episode_128_metadata_baseline", "status": "scored" if status == "pass" and score is not None else "unsupported_without_required_target", "reason": payload.get("reason") or payload.get("error") or ( "metadata-only package has a metrics artifact for this task, but it does not contain a numeric public score" if status != "pass" else None ), } def read_a100_raw_metric(task_id: str, *, neural: bool = False) -> dict[str, Any] | None: candidates = ( [RAW128_BASELINE_DIR / "neural_mlp_raw128" / task_id / "metrics.json"] if neural else [ RAW128_BASELINE_DIR / "simple_raw128" / task_id / "metrics.json", RAW128_BASELINE_DIR / "simple_raw128_centroid" / task_id / "metrics.json", RAW128_BASELINE_DIR / "simple_raw128_ridge" / task_id / "metrics.json", ] ) for path in candidates: if not path.exists(): continue payload = read_json(path) if payload.get("status") != "pass": continue score = payload.get("primary_score") if score is None: continue return { "raw": score, "metric_key": payload.get("primary_metric"), "source": str(path.relative_to(ROOT)), "scope": "multi_episode_128_raw_sensor_feature_baseline", "status": "proxy_scored" if task_id in PROXY_TASK_IDS else "scored", "reason": "documented compact proxy completion for this raw128 task axis" if task_id in PROXY_TASK_IDS else None, } return None def clamp01(value: float) -> float: return max(0.0, min(1.0, value)) def score_from_raw(value: float | None, direction: str, best_lower: float | None = None) -> float | None: if value is None: return None 
if direction == "lower": if value <= 0: return 1.0 if best_lower is None or best_lower <= 0: return None return clamp01(best_lower / value) return clamp01(value) def format_metric(value: float | None) -> str: if value is None: return "n/a" if abs(value) >= 10: return f"{value:.2f}" if abs(value) >= 1: return f"{value:.3f}" return f"{value:.4f}" def status_label(status: str | None) -> str: return STATUS_LABELS.get(status or "", status or "unknown") def make_missing_record(series_id: str, task_id: str, metric_key: str | None) -> dict[str, Any]: if series_id.startswith("metadata128"): status = "not_supported_by_metadata_only_package" reason = ( "the 128-episode metadata/text rerun did not produce this task target; " "raw sensor blocks or a task-specific metadata target builder are required" ) scope = "multi_episode_128_metadata_baseline" elif series_id in {"qwen3_omni_v6_lora", "cosmos3_super_reasoner", "cosmos3_nano_future_window"}: status = "not_evaluated_in_verified_package" reason = ( "the verified public model package did not ask this branch to emit that task target; " "a new task-specific evaluation package is required for a numeric score" ) scope = "multi_episode_128_partial_model_overlay" else: status = "missing_public_metric" reason = "no public metric artifact was found for this method-task pair" scope = SERIES.get(series_id, {}).get("scope") return { "raw": None, "metric_key": metric_key, "source": None, "scope": scope, "status": status, "reason": reason, "normalized_score": None, "raw_text": "n/a", } def finalize_value_record(item: dict[str, Any], direction: str, best_lower: float | None) -> None: raw = item.get("raw") item.setdefault("status", "scored" if isinstance(raw, (int, float)) else "missing_public_metric") item["normalized_score"] = score_from_raw(raw if isinstance(raw, (int, float)) else None, direction, best_lower) if item["normalized_score"] is None and item.get("status") in {"scored", "proxy_scored"}: item["status"] = "missing_public_metric" 
item.setdefault("reason", "numeric raw score could not be normalized for this task") item["raw_text"] = format_metric(raw if isinstance(raw, (int, float)) else None) item["status_label"] = status_label(item.get("status")) def matrix_rows(payload: dict[str, Any]) -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for task in payload["tasks"]: for series_id, series_spec in SERIES.items(): value = task["values"][series_id] rows.append( { "task_number": task["task_number"], "task_id": task["task_id"], "task_label": task["label"], "series_id": series_id, "method": series_spec["label"], "status": value.get("status"), "status_label": value.get("status_label", status_label(value.get("status"))), "scored": value.get("normalized_score") is not None, "proxy_scored": value.get("status") == "proxy_scored", "raw": value.get("raw"), "raw_text": value.get("raw_text", "n/a"), "normalized_score": value.get("normalized_score"), "metric_key": value.get("metric_key"), "source": value.get("source"), "scope": value.get("scope"), "reason": value.get("reason"), } ) return rows def render_matrix_markdown(payload: dict[str, Any]) -> str: lines = [ "# Task Method 20-Result Matrix", "", "Every method has one record for each of the 20 unified task contracts. 
Numeric scores appear only where a committed runner or verified package produced that task target.", "", "Legend: `score` = numeric task score, `proxy` = documented raw128 compact proxy score, `unsupported` = artifact exists but required target is not present, `not supported` = metadata-only package cannot form that target, `not evaluated` = verified model package did not request that target.", "", "| Method | Records | Scored | Proxy scored | Scoreless | Status counts |", "| --- | ---: | ---: | ---: | ---: | --- |", ] for record in payload["series"]: counts = record["status_counts"] count_text = ", ".join(f"{status_label(key)} {value}" for key, value in sorted(counts.items())) lines.append( f"| {record['label']} | {record['result_record_count']} | {record['scored_task_count']} | " f"{record['proxy_scored_task_count']} | {record['scoreless_task_count']} | {count_text} |" ) lines.extend( [ "", "| # | Task | " + " | ".join(spec["short_label"] for spec in SERIES.values()) + " |", "| ---: | --- | " + " | ".join("---" for _ in SERIES) + " |", ] ) for task in payload["tasks"]: cells = [STATUS_SHORT.get(task["values"][series_id].get("status"), "unknown") for series_id in SERIES] lines.append(f"| {task['task_number']:02d} | {task['label']} | " + " | ".join(cells) + " |") lines.extend( [ "", "Sources and raw values are in `docs/data/task_method_20_result_matrix.json` and `docs/data/unified_task_model_radar.json`.", "", ] ) return "\n".join(lines) def filtered_radar_payload( payload: dict[str, Any], series_ids: tuple[str, ...], *, title: str, description: str, ) -> dict[str, Any]: selected = set(series_ids) series = [json.loads(json.dumps(record)) for record in payload["series"] if record["id"] in selected] tasks = [] for task in payload["tasks"]: task_copy = {key: json.loads(json.dumps(value)) for key, value in task.items() if key != "values"} task_copy["values"] = { series_id: json.loads(json.dumps(task["values"][series_id])) for series_id in series_ids if series_id in 
task["values"] } tasks.append(task_copy) rows = [ json.loads(json.dumps(row)) for row in payload["task_method_result_matrix"] if row.get("series_id") in selected ] return { "title": title, "status": payload["status"], "generated_at_utc": payload["generated_at_utc"], "description": description, "task_count": payload["task_count"], "method_count": len(series), "method_task_record_count": sum(record.get("result_record_count", 0) for record in series), "scored_method_task_count": sum(record.get("scored_task_count", 0) for record in series), "normalization_policy": payload["normalization_policy"], "source_unified_radar": "docs/data/unified_task_model_radar.json", "source_result_matrix": "docs/data/task_method_20_result_matrix.json", "series": series, "tasks": tasks, "task_method_result_matrix": rows, } def point(cx: float, cy: float, radius: float, angle: float) -> tuple[float, float]: return cx + math.cos(angle) * radius, cy + math.sin(angle) * radius def svg_text( x: float, y: float, text: str, *, size: int = 16, fill: str = "#f4f8ef", anchor: str = "start", weight: int | str = 600, opacity: float = 1.0, ) -> str: return ( f'{html.escape(text)}' ) def split_text(text: str, max_chars: int) -> list[str]: words = text.split() if not words: return [""] lines: list[str] = [] current = words[0] for word in words[1:]: if len(current) + 1 + len(word) <= max_chars: current += " " + word else: lines.append(current) current = word lines.append(current) return lines def svg_text_lines( x: float, y: float, lines: list[str], *, size: int = 14, fill: str = "#f4f8ef", anchor: str = "start", weight: int | str = 600, line_height: float = 18, opacity: float = 1.0, ) -> list[str]: return [ svg_text(x, y + idx * line_height, line, size=size, fill=fill, anchor=anchor, weight=weight, opacity=opacity) for idx, line in enumerate(lines) ] def polyline(points: list[tuple[float, float]], *, fill: str, stroke: str, opacity: float, stroke_width: float, dash: str | None = None) -> str: coords = " 
".join(f"{x:.1f},{y:.1f}" for x, y in points) dash_attr = f' stroke-dasharray="{dash}"' if dash else "" return ( f'' ) def build_payload() -> dict[str, Any]: suite = read_json(TASK_SUITE_PATH) qwen = read_json(QWEN_V6_METRICS_PATH) cosmos_super = read_json(COSMOS_SUPER_REASONER_METRICS_PATH) cosmos_nano = read_json(COSMOS_NANO_METRICS_PATH) cosmos_fd = read_json(COSMOS_SUPER_FD_METRICS_PATH) qwen.update(read_json(QWEN_ACTION_OBJECT_METRICS_PATH)) cosmos_super.update(read_json(COSMOS_SUPER_ACTION_OBJECT_METRICS_PATH)) foundation_task_metrics = foundation_task_metric_mapping(qwen) foundation_metrics = { "qwen3_omni_v6_lora": qwen, "cosmos3_super_reasoner": cosmos_super, "cosmos3_nano_future_window": cosmos_nano, } tasks: list[dict[str, Any]] = [] for row in suite.get("tasks", []): values: dict[str, dict[str, Any]] = { "minimal": { "raw": row.get("minimal_primary_metric"), "metric_key": row.get("metric_key"), "source": row.get("artifact_sources", {}).get("minimal_metrics"), "scope": "single_episode_public_sample", "status": "scored", }, "neural_mlp": { "raw": row.get("neural_primary_metric"), "metric_key": row.get("metric_key"), "source": row.get("artifact_sources", {}).get("neural_metrics"), "scope": "single_episode_public_sample", "status": "scored", }, } for series_id, metric_key in foundation_task_metrics.get(row["task_id"], {}).items(): raw = foundation_metrics.get(series_id, {}).get(metric_key) values[series_id] = { "raw": raw, "metric_key": metric_key, "source": str( FOUNDATION_METRIC_SOURCE_OVERRIDES.get( (series_id, row["task_id"]), FOUNDATION_METRIC_PATHS[series_id], ).relative_to(ROOT) ), "scope": "multi_episode_128_partial_model_overlay", "status": "scored" if isinstance(raw, (int, float)) else "missing_public_metric", "reason": None if isinstance(raw, (int, float)) else f"metric {metric_key} is absent from the verified public package", } metadata_simple = read_a100_metadata_record(row["task_id"], neural=False) if metadata_simple: 
values["metadata128_simple"] = metadata_simple metadata_neural = read_a100_metadata_record(row["task_id"], neural=True) if metadata_neural: values["metadata128_neural_mlp"] = metadata_neural raw_simple = read_a100_raw_metric(row["task_id"], neural=False) if raw_simple: values["raw128_simple"] = raw_simple raw_neural = read_a100_raw_metric(row["task_id"], neural=True) if raw_neural: values["raw128_neural_mlp"] = raw_neural lower_values = [ item["raw"] for item in values.values() if row.get("metric_direction") == "lower" and isinstance(item.get("raw"), (int, float)) and item["raw"] > 0 ] best_lower = min(lower_values) if lower_values else None for series_id in SERIES: values.setdefault(series_id, make_missing_record(series_id, row["task_id"], row.get("metric_key"))) for item in values.values(): finalize_value_record(item, row.get("metric_direction", "higher"), best_lower) tasks.append( { "task_number": row["task_number"], "task_id": row["task_id"], "label": row.get("task_display_name", row["task_id"]), "axis_label": f"{row['task_number']:02d} {row.get('task_display_name', row['task_id'])}", "short_label": SHORT_TASK_LABELS.get(row["task_id"], row["task_id"].replace("_", " ").title()), "origin": row.get("origin"), "metric_key": row.get("metric_key"), "metric_name": row.get("metric_name"), "metric_direction": row.get("metric_direction"), "raw128_proxy_axis": row["task_id"] in PROXY_TASK_IDS, "values": values, } ) series_records = [] for series_id, spec in SERIES.items(): status_counts: dict[str, int] = {} for task in tasks: status = task["values"][series_id].get("status", "unknown") status_counts[status] = status_counts.get(status, 0) + 1 covered = sum(1 for task in tasks if task["values"].get(series_id, {}).get("normalized_score") is not None) proxy_count = status_counts.get("proxy_scored", 0) scoreless = len(tasks) - covered series_records.append( { "id": series_id, **spec, "method_detail": METHOD_DETAILS.get(series_id, spec["scope"]), "plotted_as": "filled polygon" 
if spec["kind"].startswith("full_20_task_baseline") else "colored point overlay", "result_record_count": len(tasks), "scored_task_count": covered, "covered_task_count": covered, "proxy_scored_task_count": proxy_count, "scoreless_task_count": scoreless, "unsupported_task_count": status_counts.get("unsupported_without_required_target", 0) + status_counts.get("not_supported_by_metadata_only_package", 0), "not_evaluated_task_count": status_counts.get("not_evaluated_in_verified_package", 0), "status_counts": dict(sorted(status_counts.items())), "coverage_fraction": covered / max(len(tasks), 1), "result_record_fraction": len(tasks) / max(len(tasks), 1), } ) fd_loss = (cosmos_fd.get("loss_summary") or {}).get("mean") payload = { "title": "Unified 20-Task Model Radar", "status": "pass", "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), "task_count": len(tasks), "method_count": len(SERIES), "method_task_record_count": len(tasks) * len(SERIES), "scored_method_task_count": sum( 1 for task in tasks for series_id in SERIES if task["values"][series_id].get("normalized_score") is not None ), "normalization_policy": { "higher_is_better": "bounded metrics are plotted directly on 0-1 axes after clipping to [0, 1]", "lower_is_better": "lower-error metrics are converted to best_observed_value / raw_value within the same task", "raw_values": "raw metric values, metric keys, and sources are retained in this JSON; the SVG is an overview, not a replacement for the metric table", "result_record_policy": "every method has 20 task records; records without a numeric score carry explicit unsupported/not-evaluated status and reason fields", "foundation_model_overlay": "Qwen3/Cosmos points are plotted only on task-aligned axes. 
Scoreless records mean the public result does not evaluate that task contract.", "metadata_128_overlay": "128-episode metadata baselines have 20 records, but numeric scores only where the public JSONL contains enough task labels without raw feature blocks.", "raw_128_overlay": "128-episode raw-feature baselines use staged sensor NPZ features. Eighteen axes use direct task targets; interaction text and camera-view sync are completed with documented compact proxies because raw interaction strings and paired video-view embeddings are absent from the 128 export.", }, "series": series_records, "tasks": tasks, "model_branch_cards": [ { "id": "metadata128_simple", "title": "128ep Metadata Simple", "status": "a100_rerun_pass", "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'metadata128_simple')['scored_task_count']} scored JSONL-supported axes", "headline": "34,269 rows; train/val/test 25,629/4,608/4,032", "source": str((METADATA128_BASELINE_DIR / "summary_report.json").relative_to(ROOT)), }, { "id": "metadata128_neural_mlp", "title": "128ep Metadata NN", "status": "a100_rerun_pass", "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'metadata128_neural_mlp')['scored_task_count']} scored JSONL-supported axes", "headline": "compact MLP heads over metadata/text features", "source": str((METADATA128_BASELINE_DIR / "summary_report.json").relative_to(ROOT)), }, { "id": "raw128_simple", "title": "128ep Raw Simple", "status": "a100_raw20_complete_with_documented_proxies", "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'raw128_simple')['scored_task_count']} scored axes; 18 direct + 2 proxy", "headline": "34,269 windows; centroid/ridge heads over 4430-dim sensor blocks", "source": str((RAW128_BASELINE_DIR / "run_summary_all.json").relative_to(ROOT)), }, { "id": "raw128_neural_mlp", "title": "128ep Raw NN", "status": "a100_raw20_complete_with_documented_proxies", "coverage": f"20 
records / {next(item for item in series_records if item['id'] == 'raw128_neural_mlp')['scored_task_count']} scored axes; 18 direct + 2 proxy", "headline": "MLP heads over staged features; tasks 15/19 use compact proxies", "source": str((RAW128_BASELINE_DIR / "run_summary_all.json").relative_to(ROOT)), }, { "id": "qwen3_omni_v6_lora", "title": "Qwen3-Omni v6 LoRA", "status": "verified", "task_aligned_axes": SERIES["qwen3_omni_v6_lora"]["short_label"], "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'qwen3_omni_v6_lora')['scored_task_count']} scored task-aligned axes", "headline": f"JSON validity {format_metric(qwen.get('json_validity_rate'))}; action macro-F1 {format_metric(qwen.get('action_macro_f1'))}", "source": str(QWEN_V6_METRICS_PATH.relative_to(ROOT)), }, { "id": "cosmos3_super_reasoner", "title": "Cosmos3-Super Reasoner", "status": "verified_base_weight_eval", "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'cosmos3_super_reasoner')['scored_task_count']} scored task-aligned axes", "headline": f"JSON validity {format_metric(cosmos_super.get('json_validity_rate'))}; action macro-F1 {format_metric(cosmos_super.get('action_macro_f1'))}", "source": str(COSMOS_SUPER_REASONER_METRICS_PATH.relative_to(ROOT)), }, { "id": "cosmos3_nano_future_window", "title": "Cosmos3-Nano Future Window", "status": "verified_compatibility_eval", "coverage": f"20 records / {next(item for item in series_records if item['id'] == 'cosmos3_nano_future_window')['scored_task_count']} scored task-aligned axes", "headline": f"future retrieval MRR {format_metric(cosmos_nano.get('future_retrieval_mrr'))}; transition accuracy {format_metric(cosmos_nano.get('transition_accuracy'))}", "source": str(COSMOS_NANO_METRICS_PATH.relative_to(ROOT)), }, { "id": "cosmos3_super_forward_dynamics_lora", "title": "Cosmos3-Super Forward-Dynamics LoRA", "status": "verified_finetuned_adapter", "coverage": "separate camera-pose proxy target, not 
def render_svg(
    payload: dict[str, Any],
    *,
    series_ids: tuple[str, ...] | None = None,
    polygon_series_ids: tuple[str, ...] = ("minimal", "neural_mlp"),
    title: str | None = None,
    subtitle: str | None = None,
    context_line: str | None = None,
    chip_specs: list[tuple[str, str]] | None = None,
    reading_rules: tuple[str, str, str] | None = None,
) -> str:
    """Render a radar payload as one SVG document string.

    The document is built as a list of text fragments, in this order: header
    text, summary chips, radar grid, per-series scores, the "Methods compared"
    legend, the "Task axis key" and the "Reading rules" footer. The fragments
    are joined with newlines.

    NOTE(review): several markup literals below are empty strings in the text
    under review, and some locals (``width``, ``height``, several ``x``/``y``
    pairs) are assigned but never read. Presumably the SVG element markup was
    stripped from this copy. The literals are reproduced unchanged; confirm
    them against the checked-in file before applying this block.

    Args:
        payload: Radar payload. ``payload["tasks"]`` and ``payload["series"]``
            are required; ``payload["title"]`` is optional. When
            ``chip_specs`` is omitted, ``payload["method_task_record_count"]``
            and ``payload["scored_method_task_count"]`` are read as well.
            Each series record must provide ``id``, ``color``, ``label``,
            ``scored_task_count`` and ``scope``.
        series_ids: Series to draw, in drawing order. ``None`` selects every
            series in ``payload["series"]``.
        polygon_series_ids: Series drawn through ``polyline``; a missing score
            is plotted at radius 0. Every other selected series is plotted
            per task, and only where a normalized score exists.
        title: Heading text; falls back to ``payload["title"]`` and then to
            ``"20-Task Model Radar"``.
        subtitle: Second header line; a default sentence is used when falsy.
        context_line: Third header line; a default sentence is used when falsy.
        chip_specs: ``(label, color)`` pairs for the summary chips. ``None``
            builds five chips from the payload counts.
        reading_rules: Exactly three footer sentences. ``None`` uses the
            default rules for the unified radar.

    Returns:
        The SVG text, terminated by a newline.
    """
    # NOTE(review): width/height are not read by any visible code; presumably
    # the stripped root-element literal used them. Verify before removing.
    width, height = 2400, 1840
    cx, cy, radius = 650, 860, 355
    tasks = payload["tasks"]
    n = len(tasks)
    # One axis per task, starting at 12 o'clock (-pi/2) with equal spacing.
    angles = [-math.pi / 2 + 2 * math.pi * i / n for i in range(n)]
    if series_ids is None:
        series_ids = tuple(record["id"] for record in payload["series"])
    polygon_series_set = set(polygon_series_ids)
    # Built once instead of once per record inside the comprehension.
    selected_series = set(series_ids)
    series_records = [record for record in payload["series"] if record["id"] in selected_series]
    parts = [
        # NOTE(review): the next eight literals are empty in the reviewed text.
        f'',
        "",
        '',
        '',
        "",
        '',
        '',
        '',
        svg_text(70, 86, title or payload.get("title", "20-Task Model Radar"), size=36, weight=800),
        svg_text(
            70,
            122,
            subtitle or "Task names, methods, coverage, and metric normalization in one comparison view.",
            size=18,
            fill="#dce8d7",
            weight=650,
        ),
        svg_text(
            70,
            150,
            context_line
            or "Filled areas show complete scored baselines; colored points show partial branches on task-aligned axes.",
            size=15,
            fill="#a5afa2",
            weight=560,
        ),
    ]

    # Summary chips under the header.
    if chip_specs is None:
        chip_specs = [
            ("20 task axes", "#ccffa0"),
            (f"{payload['method_task_record_count']} method-task records", "#67e8d1"),
            (f"{payload['scored_method_task_count']} scored axes", "#22d3ee"),
            ("40/40 raw128 pass", "#f59e0b"),
            ("2 compact proxy axes", "#f472b6"),
        ]
    chip_x = 70
    for label, color in chip_specs:
        # Two fixed chip widths: narrow for labels under 15 characters.
        chip_w = 168 if len(label) < 15 else 250
        parts.append(f'')
        parts.append(svg_text(chip_x + 16, 197, label, size=13, fill=color, weight=760))
        chip_x += chip_w + 12

    # Radar panel heading.
    parts.append('')
    parts.append(svg_text(84, 276, "Normalized task scores", size=23, weight=800))
    parts.append(svg_text(84, 302, "Each axis is one task. Longer radius means better after metric-direction normalization.", size=13, fill="#a5afa2", weight=560))

    # Five concentric grid rings at 0.2 .. 1.0 of the full radius.
    for level in range(1, 6):
        r = radius * level / 5
        ring = [point(cx, cy, r, angle) for angle in angles]
        parts.append(polyline(ring, fill="none", stroke="#ccffa0", opacity=0, stroke_width=1.1))
        # The ring markup is patched by string replacement, so this depends on
        # the exact attribute text that polyline() emits (not visible here).
        parts[-1] = parts[-1].replace('fill="none" fill-opacity="0.000"', 'fill="none"').replace('stroke-opacity="0.92"', 'stroke-opacity="0.15"')
        parts.append(svg_text(cx + 8, cy - r + 4, f"{level / 5:.1f}", size=11, fill="#a5afa2", weight=600, opacity=0.75))

    # Axis spokes and the two-digit task-number labels placed outside the rim.
    for task, angle in zip(tasks, angles):
        x, y = point(cx, cy, radius, angle)
        parts.append(f'')
        lx, ly = point(cx, cy, radius + 82, angle)
        parts.append(f'')
        parts.append(svg_text(lx, ly + 4, f"{task['task_number']:02d}", size=11, fill="#ccffa0", anchor="middle", weight=800, opacity=0.98))

    # Polygon series: one closed outline per series; a missing score counts as 0.
    for series_id in series_ids:
        if series_id not in polygon_series_set:
            continue
        spec = SERIES[series_id]
        points = []
        for task, angle in zip(tasks, angles):
            score = task["values"].get(series_id, {}).get("normalized_score")
            points.append(point(cx, cy, radius * float(score or 0.0), angle))
        parts.append(polyline(points, fill=spec["color"], stroke=spec["color"], opacity=0.18 if series_id in {"minimal", "raw128_simple"} else 0.16, stroke_width=4.2, dash=spec.get("stroke_dasharray")))
        for x, y in points:
            parts.append(f'')

    # Non-polygon series: one entry per task that has a score; others are skipped.
    for series_id in series_ids:
        if series_id in polygon_series_set:
            continue
        spec = SERIES[series_id]
        for task, angle in zip(tasks, angles):
            score = task["values"].get(series_id, {}).get("normalized_score")
            if score is None:
                continue
            x, y = point(cx, cy, radius * float(score), angle)
            # NOTE(review): radius_px is only assigned in the visible code;
            # presumably the stripped marker literal used it.
            radius_px = 6.5 if series_id.startswith(("metadata128", "raw128")) else 8.0
            parts.append(
                f''
            )

    # Legend: one 50-unit-high row per selected series record.
    legend_x, legend_y = 1315, 178
    parts.append(f'')
    parts.append(svg_text(legend_x, legend_y, "Methods compared", size=25, weight=800))
    parts.append(svg_text(legend_x, legend_y + 30, "Each method has 20 records; scored axes and scoreless statuses stay in the JSON matrix.", size=13, fill="#a5afa2", weight=560))
    cursor = legend_y + 74
    for record in series_records:
        color = record["color"]
        parts.append(f'')
        if record["id"] not in polygon_series_set:
            parts.append(f'')
        parts.append(svg_text(legend_x + 66, cursor - 12, record["label"], size=15, weight=800))
        parts.append(svg_text(legend_x + 392, cursor - 12, f"20 records / {record['scored_task_count']} scored", size=13, fill=color, weight=800))
        # At most two wrapped detail lines; falls back to the record's scope text.
        detail_lines = split_text(METHOD_DETAILS.get(record["id"], record["scope"]), 78)[:2]
        parts.extend(svg_text_lines(legend_x + 66, cursor + 8, detail_lines, size=11, fill="#a5afa2", weight=560, line_height=15))
        cursor += 50

    # Task axis key: the first ten tasks go in column 0, all later ones in column 1.
    key_x, key_y = 1315, 780
    parts.append(f'')
    parts.append(svg_text(key_x, key_y, "Task axis key", size=25, weight=800))
    parts.append(svg_text(key_x, key_y + 30, "Full task names are listed here so the polygon remains readable at homepage scale.", size=13, fill="#a5afa2", weight=560))
    for idx, task in enumerate(tasks):
        col = 0 if idx < 10 else 1
        row = idx if idx < 10 else idx - 10
        x0 = key_x + col * 500
        y0 = key_y + 74 + row * 54
        proxy = task["task_id"] in PROXY_TASK_IDS
        # Proxy tasks get a different badge color than direct tasks.
        badge_fill = "#f472b6" if proxy else "#ccffa0"
        parts.append(f'')
        parts.append(svg_text(x0 + 18, y0 + 2, f"{task['task_number']:02d}", size=11, fill=badge_fill, anchor="middle", weight=800))
        name_lines = split_text(str(task["label"]), 42)[:2]
        parts.extend(svg_text_lines(x0 + 48, y0 - 3, name_lines, size=12, fill="#f4f8ef", weight=760, line_height=14))
        metric_label = f"{task.get('metric_name') or task.get('metric_key')} / {'lower better' if task.get('metric_direction') == 'lower' else 'higher better'}"
        if proxy:
            metric_label += " / raw128 proxy"
        parts.append(svg_text(x0 + 48, y0 + 29, metric_label, size=10, fill="#a5afa2", weight=560))

    # Footer with three reading rules.
    table_y = 1680
    if reading_rules is None:
        reading_rules = (
            "Every method has 20 task records; radius appears only where a numeric task score exists.",
            "Raw128 completion: 18 direct task targets plus 2 compact proxies. Task 15 predicts the dominant caption/object/interaction hash bin; task 19 retrieves depth/audio sync from camera pose.",
            "Scoreless metadata/Qwen/Cosmos records are explicit unsupported or not-evaluated cells in docs/data/task_method_20_result_matrix.json.",
        )
    parts.append(f'')
    parts.append(svg_text(100, table_y - 10, "Reading rules", size=16, fill="#ccffa0", weight=800))
    parts.append(svg_text(220, table_y - 10, reading_rules[0], size=14, fill="#dce8d7", weight=650))
    parts.append(svg_text(220, table_y + 18, reading_rules[1], size=13, fill="#a5afa2", weight=560))
    parts.append(svg_text(220, table_y + 44, reading_rules[2], size=13, fill="#a5afa2", weight=560))
    # NOTE(review): empty in the reviewed text; presumably the closing markup.
    parts.append("")
    return "\n".join(parts) + "\n"


def main() -> int:
    """Build every radar payload and write the JSON, Markdown and SVG outputs.

    Writes the unified, single-episode and 128-episode payloads, the
    task/method result matrix (JSON and Markdown) and one SVG per payload,
    then prints one ``PASS: wrote <path>`` line per output.

    The record and scored-axis counts shown on the SVG chips are read from
    the filtered payloads rather than typed as literals, so the chips cannot
    drift from the data (or from each other) when the series selection
    changes.

    Returns:
        ``0``, used as the process exit status by the script entry point.
    """
    payload = build_payload()
    single_payload = filtered_radar_payload(
        payload,
        SINGLE_EPISODE_SERIES,
        title="Single-Episode 20-Task Radar",
        description="Minimal and Neural MLP baselines on the one public sample episode, both scored on all 20 task contracts.",
    )
    episode128_payload = filtered_radar_payload(
        payload,
        EPISODE128_SERIES,
        title="128-Episode 20-Task Radar",
        description="Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.",
    )

    # Listed in the order the PASS lines are printed.
    outputs = (
        OUTPUT_JSON,
        OUTPUT_SINGLE_JSON,
        OUTPUT_128_JSON,
        OUTPUT_MATRIX_JSON,
        OUTPUT_MATRIX_MD,
        OUTPUT_SVG,
        OUTPUT_SINGLE_SVG,
        OUTPUT_128_SVG,
    )
    # Create all target directories before the first write. exist_ok makes
    # this a no-op for directories that already exist.
    for path in outputs:
        path.parent.mkdir(parents=True, exist_ok=True)

    for path, data in (
        (OUTPUT_JSON, payload),
        (OUTPUT_SINGLE_JSON, single_payload),
        (OUTPUT_128_JSON, episode128_payload),
    ):
        path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")

    matrix_payload = {
        "title": "Task Method 20-Result Matrix",
        "status": "pass",
        "generated_at_utc": payload["generated_at_utc"],
        "task_count": payload["task_count"],
        "method_count": payload["method_count"],
        "method_task_record_count": payload["method_task_record_count"],
        "scored_method_task_count": payload["scored_method_task_count"],
        "series": payload["series"],
        "records": payload["task_method_result_matrix"],
    }
    OUTPUT_MATRIX_JSON.write_text(json.dumps(matrix_payload, indent=2) + "\n", encoding="utf-8")
    OUTPUT_MATRIX_MD.write_text(render_matrix_markdown(payload), encoding="utf-8")
    OUTPUT_SVG.write_text(render_svg(payload), encoding="utf-8")

    single_records = single_payload["method_task_record_count"]
    single_scored = single_payload["scored_method_task_count"]
    OUTPUT_SINGLE_SVG.write_text(
        render_svg(
            single_payload,
            series_ids=SINGLE_EPISODE_SERIES,
            polygon_series_ids=SINGLE_EPISODE_SERIES,
            title="Single-Episode 20-Task Radar",
            subtitle="One public sample episode; both baseline heads score every task axis.",
            context_line="This view isolates the 1-episode task-head setup from the multi-episode model branches.",
            chip_specs=[
                ("20 task axes", "#ccffa0"),
                (f"{single_records} method-task records", "#67e8d1"),
                (f"{single_scored} scored axes", "#22d3ee"),
                ("2 filled baseline polygons", "#f472b6"),
            ],
            reading_rules=(
                "Both single-episode methods have numeric scores on every one of the 20 task contracts.",
                "This radar is the cleanest view of public-sample Minimal vs Neural MLP behavior before any 128-episode scale-up.",
                "Raw metric values and sources remain in docs/data/single_episode_task_model_radar.json and docs/data/task_method_20_result_matrix.json.",
            ),
        ),
        encoding="utf-8",
    )

    records_128 = episode128_payload["method_task_record_count"]
    scored_128 = episode128_payload["scored_method_task_count"]
    OUTPUT_128_SVG.write_text(
        render_svg(
            episode128_payload,
            series_ids=EPISODE128_SERIES,
            polygon_series_ids=("raw128_simple", "raw128_neural_mlp"),
            title="128-Episode 20-Task Radar",
            subtitle="Selected 96/16/16 episode split; raw-feature heads score all 20 axes.",
            context_line="Raw128 baselines are filled polygons; metadata, Qwen3, and Cosmos branches plot only evaluated task targets.",
            chip_specs=[
                ("20 task axes", "#ccffa0"),
                (f"{records_128} method-task records", "#67e8d1"),
                (f"{scored_128} scored axes", "#22d3ee"),
                ("40/40 raw128 pass", "#f59e0b"),
                # Records minus scored axes, from the same payload fields.
                (f"{records_128 - scored_128} explicit scoreless", "#f472b6"),
            ],
            reading_rules=(
                "Every 128-episode method has 20 result records; radius appears only where a numeric score exists.",
                "Raw128 Simple and Raw128 NN are complete 20/20 scored multi-episode baselines; tasks 15/19 are documented compact proxies.",
                "Qwen3/Cosmos task 16 uses existing verified action/object JSON; other scoreless cells remain explicit not-supported or not-evaluated records.",
            ),
        ),
        encoding="utf-8",
    )

    for path in outputs:
        print(f"PASS: wrote {path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())