| |
| """Build research takeaways from committed Xperience-10M metric artifacts.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
|
|
# Repository root: the directory one level above the one containing this script.
ROOT = Path(__file__).resolve().parents[1]

# Input artifacts (read by build_payload).
SUMMARY_PATH = ROOT / "docs" / "data" / "summary_metrics.json"
AUDIO_PATH = ROOT / "docs" / "data" / "audio_ablation_summary.json"

# Generated outputs (written by main).
OUTPUT_JSON = ROOT / "docs" / "data" / "research_takeaways.json"
OUTPUT_MD = ROOT / "RESEARCH_TAKEAWAYS.md"
|
|
|
|
def pct_delta(new: float, old: float, higher_is_better: bool = True) -> float:
    """Return the relative improvement of ``new`` over ``old``.

    The result is signed so that a positive value always means "better":
    for higher-is-better metrics it is ``(new - old) / |old|``; for
    lower-is-better metrics the numerator is flipped to ``old - new``.

    A zero baseline has no meaningful relative change, so ``0.0`` is
    returned instead of dividing by zero.
    """
    if old == 0:
        return 0.0
    gain = new - old if higher_is_better else old - new
    return gain / abs(old)
|
|
|
|
def fmt(value: float | int | None, digits: int = 4) -> str:
    """Format a metric value for display.

    * ``None`` becomes ``"n/a"``.
    * Integers are rendered with thousands separators (``digits`` is ignored).
    * Anything else is rendered as fixed-point with ``digits`` decimals.
    """
    if value is None:
        return "n/a"
    return f"{value:,}" if isinstance(value, int) else f"{value:.{digits}f}"
|
|
|
|
def task_metric(tasks: dict, task: str, key: str) -> float:
    """Look up ``tasks[task][key]`` and coerce it to ``float``.

    Raises ``KeyError`` when either the task or the metric key is absent.
    """
    entry = tasks[task]
    return float(entry[key])
|
|
|
|
def build_payload() -> dict:
    """Assemble the research-takeaways payload from the metric artifacts.

    Reads ``SUMMARY_PATH`` (required) and ``AUDIO_PATH`` (optional; the audio
    takeaway is omitted when the file does not exist) and returns a dict with
    the keys ``title``, ``status``, ``generated_at_utc``, ``source_files``,
    ``scope`` and ``takeaways``. Each takeaway carries ``id``, ``title``,
    ``readout``, ``evidence`` (a list of ``{"label", "value"}`` dicts),
    ``source`` and ``current_scope``.

    Raises:
        FileNotFoundError: if ``SUMMARY_PATH`` does not exist.
        KeyError: if a required key is missing from the summary. This
            includes the per-task entries under ``suite["neural_tasks"]``:
            the ``{}`` default only defers the error to the first lookup.
    """
    summary = json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))
    audio_summary = json.loads(AUDIO_PATH.read_text(encoding="utf-8")) if AUDIO_PATH.exists() else None
    suite = summary["suite"]
    tasks = suite["tasks"]
    neural = suite.get("neural_tasks", {})
    models = summary["models"]
    omni = summary.get("omni_relay", {})

    # Paired "minimal" (suite) vs "neural" (MLP head) metrics per task.
    hand_min = task_metric(tasks, "hand_trajectory_forecast", "mpjpe")
    hand_neural = task_metric(neural, "hand_trajectory_forecast", "mpjpe")
    temporal_min = task_metric(tasks, "temporal_order", "f1")
    temporal_neural = task_metric(neural, "temporal_order", "f1")
    misalign_min = task_metric(tasks, "misalignment_detection", "f1")
    misalign_neural = task_metric(neural, "misalignment_detection", "f1")
    retrieval_min_mrr = task_metric(tasks, "cross_modal_retrieval", "mrr")
    retrieval_neural_mrr = task_metric(neural, "cross_modal_retrieval", "mrr")
    recon_min_r2 = task_metric(tasks, "modality_reconstruction", "r2")
    recon_neural_r2 = task_metric(neural, "modality_reconstruction", "r2")
    action_chrono = task_metric(tasks, "timeline_action", "macro_f1")
    subtask_chrono = task_metric(tasks, "timeline_subtask", "macro_f1")

    takeaways = [
        {
            "id": "episode_to_benchmark",
            "title": "One episode can become a real benchmark contract",
            # Frame and window counts are interpolated from the artifact (same
            # values as the evidence rows below) so the prose cannot drift
            # from the committed metrics when they are regenerated.
            "readout": (
                f"The public sample is converted into {suite['num_frames']:,} frames, "
                f"{suite['num_windows']:,} aligned "
                f"20-frame windows, and an {suite['feature_dim']:,}-dimensional feature contract."
            ),
            "evidence": [
                {"label": "frames", "value": suite["num_frames"]},
                {"label": "windows", "value": suite["num_windows"]},
                {"label": "feature_dim", "value": suite["feature_dim"]},
            ],
            "source": "docs/data/summary_metrics.json",
            "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage.",
        },
        {
            "id": "chronological_split_exposes_class_shift",
            "title": "Chronological splits expose action-class shift",
            "readout": (
                "Earlier all-feature action classifiers reach high macro-F1 on their "
                "local split, but the core chronological action/subtask heads are "
                "much harder because later held-out windows include unseen labels."
            ),
            "evidence": [
                {"label": "all_feature_action_macro_f1", "value": models["all_modalities_action"]["macro_f1"]},
                {"label": "suite_action_macro_f1", "value": action_chrono},
                {"label": "suite_subtask_macro_f1", "value": subtask_chrono},
                {"label": "unseen_action_test_classes", "value": len(tasks["timeline_action"].get("unseen_test_classes", []))},
            ],
            "source": "results/episode_task_suite/summary_report.json",
            "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes.",
        },
        {
            "id": "neural_heads_help_dynamics",
            "title": "Small neural heads help dynamic and temporal probes",
            "readout": (
                "The MLP heads substantially improve hand trajectory forecasting, "
                "temporal-order verification, and motion/visual synchronization."
            ),
            "evidence": [
                {"label": "hand_mpjpe_minimal", "value": hand_min},
                {"label": "hand_mpjpe_neural", "value": hand_neural},
                # MPJPE is an error metric, so lower is better.
                {"label": "hand_mpjpe_relative_improvement", "value": pct_delta(hand_neural, hand_min, higher_is_better=False)},
                {"label": "temporal_order_f1_minimal", "value": temporal_min},
                {"label": "temporal_order_f1_neural", "value": temporal_neural},
                {"label": "misalignment_f1_minimal", "value": misalign_min},
                {"label": "misalignment_f1_neural", "value": misalign_neural},
            ],
            "source": "results/episode_task_suite/neural_mlp/*/metrics.json",
            "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing.",
        },
        {
            "id": "retrieval_and_reconstruction_remain_open",
            "title": "Retrieval and reconstruction remain the harder multimodal problems",
            "readout": (
                "Ridge/cosine retrieval remains stronger than the neural projection on "
                "this sample, and cross-modal reconstruction still has negative R2."
            ),
            "evidence": [
                {"label": "retrieval_mrr_minimal", "value": retrieval_min_mrr},
                {"label": "retrieval_mrr_neural", "value": retrieval_neural_mrr},
                {"label": "retrieval_top5_minimal", "value": tasks["cross_modal_retrieval"]["top5_accuracy"]},
                {"label": "reconstruction_r2_minimal", "value": recon_min_r2},
                {"label": "reconstruction_r2_neural", "value": recon_neural_r2},
            ],
            "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
            "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants.",
        },
    ]

    if audio_summary is not None:
        audio_aggregate = audio_summary["aggregate"]
        # First matching per-task summary, or {} so the .get() calls below
        # yield None (rendered as "n/a") when the task is absent.
        modality_recon = next(
            (item for item in audio_summary["task_summaries"] if item["task"] == "modality_reconstruction"),
            {},
        )
        object_relevance = next(
            (item for item in audio_summary["task_summaries"] if item["task"] == "object_relevance"),
            {},
        )
        takeaways.append(
            {
                "id": "audio_contribution_is_task_specific",
                "title": "Audio helps some tasks and hurts others on the public sample",
                # NOTE(review): the two "6" counts below are hard-coded prose.
                # They presumably mirror the aggregate values in the evidence
                # rows, but the shape of those values (count vs. list) is not
                # visible here -- confirm before interpolating them.
                "readout": (
                    "Audio improves the primary metric on 6 walkthrough-backed task contracts, "
                    "while raw log-mel replacement improves over the current handcrafted block on 6 of those contracts. "
                    "The largest current-audio gain appears in feature reconstruction, not in action classification."
                ),
                "evidence": [
                    {"label": "tasks_where_current_audio_improves", "value": audio_aggregate["tasks_where_handcrafted_audio_improves"]},
                    {"label": "mean_current_audio_delta", "value": audio_aggregate["mean_handcrafted_audio_delta"]},
                    {"label": "tasks_where_raw_replacement_improves", "value": audio_aggregate["tasks_where_raw_replacement_improves_over_handcrafted"]},
                    {"label": "mean_raw_replacement_delta_vs_current", "value": audio_aggregate["mean_raw_replacement_delta_vs_handcrafted"]},
                    {"label": "reconstruction_current_audio_delta", "value": modality_recon.get("handcrafted_audio_delta")},
                    {"label": "object_relevance_current_audio_delta", "value": object_relevance.get("handcrafted_audio_delta")},
                ],
                "source": "results/audio_ablation/audio_ablation_summary.json",
                "current_scope": (
                    "This is a single-episode ablation over fixed ridge heads. It validates that audio is wired into the task suite "
                    "and shows where it changes metrics; it does not prove cross-episode audio generalization."
                ),
            }
        )

    # The omni section is optional: missing values become None in the evidence.
    takeaways.append(
        {
            "id": "scale_requires_episodes",
            "title": "The next scientific unit is held-out episodes, not more adjacent windows",
            "readout": (
                "The selected Qwen3-Omni path now has a verified two-epoch held-out diagnostic result. "
                "It proves the cross-episode train/validation/eval loop and meets the strict-JSON target, "
                "while weak action/subtask metrics remain the next modeling problem."
            ),
            "evidence": [
                {"label": "selected_episodes", "value": omni.get("target_episodes")},
                {"label": "held_out_test_windows", "value": omni.get("held_out_test_windows")},
                {"label": "json_validity_rate", "value": omni.get("json_validity_rate")},
                {"label": "action_macro_f1", "value": omni.get("action_macro_f1")},
            ],
            "source": "docs/data/omni_finetune_verified_result.json",
            "current_scope": omni.get(
                "current_scope",
                "This is a diagnostic multi-episode pilot, not a strong model result.",
            ),
        }
    )

    return {
        "title": "Ropedia Xperience-10M Research Takeaways",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source_files": [
            "docs/data/summary_metrics.json",
            "results/episode_task_suite/summary_report.json",
            "results/episode_task_suite/neural_mlp/*/metrics.json",
            "docs/data/audio_ablation_summary.json",
            "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
        ],
        "scope": {
            "validated_episode_count": 1,
            "num_frames": suite["num_frames"],
            "num_windows": suite["num_windows"],
            "feature_dim": suite["feature_dim"],
            "audio_featurized": True,
            "raw_data_redistributed": False,
        },
        "takeaways": takeaways,
    }
|
|
|
|
def render_md(payload: dict) -> str:
    """Render the takeaways payload as a Markdown document.

    The output has a fixed header, a "Scope" bullet list taken from
    ``payload["scope"]``, one section per entry in ``payload["takeaways"]``
    (title, readout, a two-column metric table, source and scope lines),
    and a fixed "How To Read These Results" footer. Lines are joined with
    ``"\\n"``; the final list element is empty, so the text ends in a newline.
    """
    scope = payload["scope"]
    out = [
        "# Research Takeaways",
        "",
        "This generated note summarizes what the current public Xperience-10M sample",
        "pipeline actually shows. It is built from committed metric artifacts, not",
        "from hand-edited score text.",
        "",
        "## Scope",
        "",
        f"- validated episodes: {scope['validated_episode_count']}",
        f"- frames: {scope['num_frames']:,}",
        f"- aligned windows: {scope['num_windows']:,}",
        f"- current feature dimension: {scope['feature_dim']:,}",
        "- raw Xperience-10M data is not redistributed",
        "- Audio from the sample MP4 stream is represented in the current feature vector",
        "",
        "## Takeaways",
        "",
    ]

    for takeaway in payload["takeaways"]:
        out += [
            f"### {takeaway['title']}",
            "",
            takeaway["readout"],
            "",
            "| Metric | Value |",
            "| --- | ---: |",
        ]
        for row in takeaway["evidence"]:
            raw = row["value"]
            # fmt() covers numbers and None ("n/a"); anything else is shown via str().
            numeric_or_missing = raw is None or isinstance(raw, (int, float))
            shown = fmt(raw) if numeric_or_missing else str(raw)
            out.append(f"| `{row['label']}` | {shown} |")
        out += [
            "",
            f"Source: `{takeaway['source']}`.",
            "",
            f"Current scope: {takeaway['current_scope']}",
            "",
        ]

    out += [
        "## How To Read These Results",
        "",
        "- High single-episode scores are useful pipeline checks for the current task contracts.",
        "- Low chronological action/subtask scores are informative because they expose later-label shift.",
        "- Neural gains on trajectory/order/alignment make those tasks good candidates for the next fine-tuning stage.",
        "- Audio ablation is task-specific: audio representation choices help some probes and hurt others.",
        "- Retrieval and reconstruction remain the main multimodal representation challenges.",
        "- The next credible model-quality result needs held-out episodes.",
        "",
    ]
    return "\n".join(out)
|
|
|
|
def main() -> int:
    """Build the payload, write the JSON and Markdown outputs, and return 0.

    The JSON file is written first (2-space indent, trailing newline), then
    the Markdown rendering; a ``PASS`` line is printed for each output path.
    """
    payload = build_payload()

    serialized = json.dumps(payload, indent=2) + "\n"
    OUTPUT_JSON.write_text(serialized, encoding="utf-8")

    markdown = render_md(payload)
    OUTPUT_MD.write_text(markdown, encoding="utf-8")

    for written in (OUTPUT_JSON, OUTPUT_MD):
        print(f"PASS: wrote {written}")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|