#!/usr/bin/env python3 """Build research takeaways from committed Xperience-10M metric artifacts.""" from __future__ import annotations import json from datetime import datetime, timezone from pathlib import Path ROOT = Path(__file__).resolve().parents[1] SUMMARY_PATH = ROOT / "docs/data/summary_metrics.json" OUTPUT_JSON = ROOT / "docs/data/research_takeaways.json" OUTPUT_MD = ROOT / "RESEARCH_TAKEAWAYS.md" def pct_delta(new: float, old: float, higher_is_better: bool = True) -> float: if old == 0: return 0.0 if higher_is_better: return (new - old) / abs(old) return (old - new) / abs(old) def fmt(value: float | int | None, digits: int = 4) -> str: if value is None: return "n/a" if isinstance(value, int): return f"{value:,}" return f"{value:.{digits}f}" def task_metric(tasks: dict, task: str, key: str) -> float: return float(tasks[task][key]) def build_payload() -> dict: summary = json.loads(SUMMARY_PATH.read_text(encoding="utf-8")) suite = summary["suite"] tasks = suite["tasks"] neural = suite.get("neural_tasks", {}) models = summary["models"] omni = summary.get("omni_relay", {}) hand_min = task_metric(tasks, "hand_trajectory_forecast", "mpjpe") hand_neural = task_metric(neural, "hand_trajectory_forecast", "mpjpe") temporal_min = task_metric(tasks, "temporal_order", "f1") temporal_neural = task_metric(neural, "temporal_order", "f1") misalign_min = task_metric(tasks, "misalignment_detection", "f1") misalign_neural = task_metric(neural, "misalignment_detection", "f1") retrieval_min_mrr = task_metric(tasks, "cross_modal_retrieval", "mrr") retrieval_neural_mrr = task_metric(neural, "cross_modal_retrieval", "mrr") recon_min_r2 = task_metric(tasks, "modality_reconstruction", "r2") recon_neural_r2 = task_metric(neural, "modality_reconstruction", "r2") action_chrono = task_metric(tasks, "timeline_action", "macro_f1") subtask_chrono = task_metric(tasks, "timeline_subtask", "macro_f1") takeaways = [ { "id": "episode_to_benchmark", "title": "One episode can become a real benchmark contract", "readout": ( "The public sample is converted into 5,821 frames, 1,161 aligned " f"20-frame windows, and an {suite['feature_dim']:,}-dimensional feature contract." ), "evidence": [ {"label": "frames", "value": suite["num_frames"]}, {"label": "windows", "value": suite["num_windows"]}, {"label": "feature_dim", "value": suite["feature_dim"]}, ], "source": "docs/data/summary_metrics.json", "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage.", }, { "id": "chronological_split_exposes_class_shift", "title": "Chronological splits expose action-class shift", "readout": ( "Earlier all-feature action classifiers reach high macro-F1 on their " "local split, but the 12-task chronological action/subtask heads are " "much harder because later held-out windows include unseen labels." ), "evidence": [ {"label": "all_feature_action_macro_f1", "value": models["all_modalities_action"]["macro_f1"]}, {"label": "suite_action_macro_f1", "value": action_chrono}, {"label": "suite_subtask_macro_f1", "value": subtask_chrono}, {"label": "unseen_action_test_classes", "value": len(tasks["timeline_action"].get("unseen_test_classes", []))}, ], "source": "results/episode_task_suite/summary_report.json", "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes.", }, { "id": "neural_heads_help_dynamics", "title": "Small neural heads help dynamic and temporal probes", "readout": ( "The MLP heads substantially improve hand trajectory forecasting, " "temporal-order verification, and motion/visual synchronization." ), "evidence": [ {"label": "hand_mpjpe_minimal", "value": hand_min}, {"label": "hand_mpjpe_neural", "value": hand_neural}, {"label": "hand_mpjpe_relative_improvement", "value": pct_delta(hand_neural, hand_min, higher_is_better=False)}, {"label": "temporal_order_f1_minimal", "value": temporal_min}, {"label": "temporal_order_f1_neural", "value": temporal_neural}, {"label": "misalignment_f1_minimal", "value": misalign_min}, {"label": "misalignment_f1_neural", "value": misalign_neural}, ], "source": "results/episode_task_suite/neural_mlp/*/metrics.json", "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing.", }, { "id": "retrieval_and_reconstruction_remain_open", "title": "Retrieval and reconstruction remain the harder multimodal problems", "readout": ( "Ridge/cosine retrieval remains stronger than the neural projection on " "this sample, and cross-modal reconstruction still has negative R2." ), "evidence": [ {"label": "retrieval_mrr_minimal", "value": retrieval_min_mrr}, {"label": "retrieval_mrr_neural", "value": retrieval_neural_mrr}, {"label": "retrieval_top5_minimal", "value": tasks["cross_modal_retrieval"]["top5_accuracy"]}, {"label": "reconstruction_r2_minimal", "value": recon_min_r2}, {"label": "reconstruction_r2_neural", "value": recon_neural_r2}, ], "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants.", }, { "id": "scale_requires_episodes", "title": "The next scientific unit is held-out episodes, not more adjacent windows", "readout": ( "The prepared Qwen3-Omni path targets 32 episodes from 32 sessions, " "but it remains data-gated until access and held-out evaluation complete." ), "evidence": [ {"label": "target_episodes", "value": omni.get("target_episodes")}, {"label": "selected_sessions", "value": omni.get("selected_sessions")}, {"label": "valid_candidates", "value": omni.get("valid_candidates")}, ], "source": "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md", "current_scope": omni.get( "current_scope", "The 32-episode fine-tune requires gated data staging and held-out evaluation.", ), }, ] return { "title": "Ropedia Xperience-10M Research Takeaways", "status": "pass", "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), "source_files": [ "docs/data/summary_metrics.json", "results/episode_task_suite/summary_report.json", "results/episode_task_suite/neural_mlp/*/metrics.json", "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md", ], "scope": { "validated_episode_count": 1, "num_frames": suite["num_frames"], "num_windows": suite["num_windows"], "feature_dim": suite["feature_dim"], "audio_featurized": True, "raw_data_redistributed": False, }, "takeaways": takeaways, } def render_md(payload: dict) -> str: lines = [ "# Research Takeaways", "", "This generated note summarizes what the current public Xperience-10M sample", "pipeline actually shows. It is built from committed metric artifacts, not", "from hand-edited score text.", "", "## Scope", "", f"- validated episodes: {payload['scope']['validated_episode_count']}", f"- frames: {payload['scope']['num_frames']:,}", f"- aligned windows: {payload['scope']['num_windows']:,}", f"- current feature dimension: {payload['scope']['feature_dim']:,}", "- raw Xperience-10M data is not redistributed", "- AAC audio from the sample MP4 stream is extracted into the current feature vector", "", "## Takeaways", "", ] for item in payload["takeaways"]: lines.extend( [ f"### {item['title']}", "", item["readout"], "", "| Metric | Value |", "| --- | ---: |", ] ) for evidence in item["evidence"]: value = evidence["value"] if isinstance(value, float): value_text = fmt(value) elif isinstance(value, int): value_text = fmt(value) elif value is None: value_text = "n/a" else: value_text = str(value) lines.append(f"| `{evidence['label']}` | {value_text} |") lines.extend(["", f"Source: `{item['source']}`.", "", f"Current scope: {item['current_scope']}", ""]) lines.extend( [ "## How To Read These Results", "", "- High single-episode scores are useful pipeline checks for the current task contracts.", "- Low chronological action/subtask scores are informative because they expose later-label shift.", "- Neural gains on trajectory/order/alignment make those tasks good candidates for the next fine-tuning stage.", "- Retrieval and reconstruction remain the main multimodal representation challenges.", "- The next credible model-quality result needs held-out episodes.", "", ] ) return "\n".join(lines) def main() -> int: payload = build_payload() OUTPUT_JSON.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") OUTPUT_MD.write_text(render_md(payload), encoding="utf-8") print(f"PASS: wrote {OUTPUT_JSON}") print(f"PASS: wrote {OUTPUT_MD}") return 0 if __name__ == "__main__": raise SystemExit(main())