File size: 13,473 Bytes
d735235
 
 
 
 
 
 
 
 
 
 
 
ca4ac1c
d735235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca4ac1c
d735235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540e67a
d735235
a8124a8
d735235
 
 
 
 
 
 
cf07180
d735235
 
 
 
540e67a
d735235
 
 
 
 
 
 
 
 
 
 
cf07180
d735235
 
 
 
540e67a
d735235
 
 
 
 
 
 
 
 
 
 
 
 
cf07180
d735235
 
 
 
540e67a
d735235
 
 
 
 
 
 
 
 
 
 
cf07180
d735235
ca4ac1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45c1706
ca4ac1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d735235
 
 
540e67a
627e5d7
 
 
d735235
 
627e5d7
 
 
 
d735235
627e5d7
cf07180
 
627e5d7
cf07180
ca4ac1c
 
d735235
 
 
 
 
 
 
 
 
ca4ac1c
d735235
 
 
 
 
 
 
a8124a8
d735235
 
 
 
 
 
 
 
 
 
 
 
cf07180
d735235
 
 
 
 
 
 
 
45c1706
d735235
 
 
 
 
 
 
 
 
540e67a
d735235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf07180
d735235
 
 
 
cf07180
d735235
ca4ac1c
45c1706
ca4ac1c
d735235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python3
"""Build research takeaways from committed Xperience-10M metric artifacts."""

from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
SUMMARY_PATH = ROOT / "docs/data/summary_metrics.json"
AUDIO_PATH = ROOT / "docs/data/audio_ablation_summary.json"
OUTPUT_JSON = ROOT / "docs/data/research_takeaways.json"
OUTPUT_MD = ROOT / "RESEARCH_TAKEAWAYS.md"


def pct_delta(new: float, old: float, higher_is_better: bool = True) -> float:
    if old == 0:
        return 0.0
    if higher_is_better:
        return (new - old) / abs(old)
    return (old - new) / abs(old)


def fmt(value: float | int | None, digits: int = 4) -> str:
    if value is None:
        return "n/a"
    if isinstance(value, int):
        return f"{value:,}"
    return f"{value:.{digits}f}"


def task_metric(tasks: dict, task: str, key: str) -> float:
    return float(tasks[task][key])


def build_payload() -> dict:
    summary = json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))
    audio_summary = json.loads(AUDIO_PATH.read_text(encoding="utf-8")) if AUDIO_PATH.exists() else None
    suite = summary["suite"]
    tasks = suite["tasks"]
    neural = suite.get("neural_tasks", {})
    models = summary["models"]
    omni = summary.get("omni_relay", {})

    hand_min = task_metric(tasks, "hand_trajectory_forecast", "mpjpe")
    hand_neural = task_metric(neural, "hand_trajectory_forecast", "mpjpe")
    temporal_min = task_metric(tasks, "temporal_order", "f1")
    temporal_neural = task_metric(neural, "temporal_order", "f1")
    misalign_min = task_metric(tasks, "misalignment_detection", "f1")
    misalign_neural = task_metric(neural, "misalignment_detection", "f1")
    retrieval_min_mrr = task_metric(tasks, "cross_modal_retrieval", "mrr")
    retrieval_neural_mrr = task_metric(neural, "cross_modal_retrieval", "mrr")
    recon_min_r2 = task_metric(tasks, "modality_reconstruction", "r2")
    recon_neural_r2 = task_metric(neural, "modality_reconstruction", "r2")
    action_chrono = task_metric(tasks, "timeline_action", "macro_f1")
    subtask_chrono = task_metric(tasks, "timeline_subtask", "macro_f1")

    takeaways = [
        {
            "id": "episode_to_benchmark",
            "title": "One episode can become a real benchmark contract",
            "readout": (
                "The public sample is converted into 5,821 frames, 1,161 aligned "
                f"20-frame windows, and an {suite['feature_dim']:,}-dimensional feature contract."
            ),
            "evidence": [
                {"label": "frames", "value": suite["num_frames"]},
                {"label": "windows", "value": suite["num_windows"]},
                {"label": "feature_dim", "value": suite["feature_dim"]},
            ],
            "source": "docs/data/summary_metrics.json",
            "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage.",
        },
        {
            "id": "chronological_split_exposes_class_shift",
            "title": "Chronological splits expose action-class shift",
            "readout": (
                "Earlier all-feature action classifiers reach high macro-F1 on their "
                "local split, but the 12-task chronological action/subtask heads are "
                "much harder because later held-out windows include unseen labels."
            ),
            "evidence": [
                {"label": "all_feature_action_macro_f1", "value": models["all_modalities_action"]["macro_f1"]},
                {"label": "suite_action_macro_f1", "value": action_chrono},
                {"label": "suite_subtask_macro_f1", "value": subtask_chrono},
                {"label": "unseen_action_test_classes", "value": len(tasks["timeline_action"].get("unseen_test_classes", []))},
            ],
            "source": "results/episode_task_suite/summary_report.json",
            "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes.",
        },
        {
            "id": "neural_heads_help_dynamics",
            "title": "Small neural heads help dynamic and temporal probes",
            "readout": (
                "The MLP heads substantially improve hand trajectory forecasting, "
                "temporal-order verification, and motion/visual synchronization."
            ),
            "evidence": [
                {"label": "hand_mpjpe_minimal", "value": hand_min},
                {"label": "hand_mpjpe_neural", "value": hand_neural},
                {"label": "hand_mpjpe_relative_improvement", "value": pct_delta(hand_neural, hand_min, higher_is_better=False)},
                {"label": "temporal_order_f1_minimal", "value": temporal_min},
                {"label": "temporal_order_f1_neural", "value": temporal_neural},
                {"label": "misalignment_f1_minimal", "value": misalign_min},
                {"label": "misalignment_f1_neural", "value": misalign_neural},
            ],
            "source": "results/episode_task_suite/neural_mlp/*/metrics.json",
            "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing.",
        },
        {
            "id": "retrieval_and_reconstruction_remain_open",
            "title": "Retrieval and reconstruction remain the harder multimodal problems",
            "readout": (
                "Ridge/cosine retrieval remains stronger than the neural projection on "
                "this sample, and cross-modal reconstruction still has negative R2."
            ),
            "evidence": [
                {"label": "retrieval_mrr_minimal", "value": retrieval_min_mrr},
                {"label": "retrieval_mrr_neural", "value": retrieval_neural_mrr},
                {"label": "retrieval_top5_minimal", "value": tasks["cross_modal_retrieval"]["top5_accuracy"]},
                {"label": "reconstruction_r2_minimal", "value": recon_min_r2},
                {"label": "reconstruction_r2_neural", "value": recon_neural_r2},
            ],
            "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
            "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants.",
        },
    ]

    if audio_summary is not None:
        audio_aggregate = audio_summary["aggregate"]
        modality_recon = next(
            (item for item in audio_summary["task_summaries"] if item["task"] == "modality_reconstruction"),
            {},
        )
        object_relevance = next(
            (item for item in audio_summary["task_summaries"] if item["task"] == "object_relevance"),
            {},
        )
        takeaways.append(
            {
                "id": "audio_contribution_is_task_specific",
                "title": "Audio helps some tasks and hurts others on the public sample",
                "readout": (
                    "Audio improves the primary metric on 6 of 12 tasks, "
                    "while raw log-mel replacement improves over the current handcrafted block on 6 of 12 tasks. "
                    "The largest current-audio gain appears in feature reconstruction, not in action classification."
                ),
                "evidence": [
                    {"label": "tasks_where_current_audio_improves", "value": audio_aggregate["tasks_where_handcrafted_audio_improves"]},
                    {"label": "mean_current_audio_delta", "value": audio_aggregate["mean_handcrafted_audio_delta"]},
                    {"label": "tasks_where_raw_replacement_improves", "value": audio_aggregate["tasks_where_raw_replacement_improves_over_handcrafted"]},
                    {"label": "mean_raw_replacement_delta_vs_current", "value": audio_aggregate["mean_raw_replacement_delta_vs_handcrafted"]},
                    {"label": "reconstruction_current_audio_delta", "value": modality_recon.get("handcrafted_audio_delta")},
                    {"label": "object_relevance_current_audio_delta", "value": object_relevance.get("handcrafted_audio_delta")},
                ],
                "source": "results/audio_ablation/audio_ablation_summary.json",
                "current_scope": (
                    "This is a single-episode ablation over fixed ridge heads. It validates that audio is wired into the task suite "
                    "and shows where it changes metrics; it does not prove cross-episode audio generalization."
                ),
            }
        )

    takeaways.append(
        {
            "id": "scale_requires_episodes",
            "title": "The next scientific unit is held-out episodes, not more adjacent windows",
            "readout": (
                "The selected Qwen3-Omni path now has a verified two-epoch held-out diagnostic result. "
                "It proves the cross-episode train/validation/eval loop and meets the strict-JSON target, "
                "while weak action/subtask metrics remain the next modeling problem."
            ),
            "evidence": [
                {"label": "selected_episodes", "value": omni.get("target_episodes")},
                {"label": "held_out_test_windows", "value": omni.get("held_out_test_windows")},
                {"label": "json_validity_rate", "value": omni.get("json_validity_rate")},
                {"label": "action_macro_f1", "value": omni.get("action_macro_f1")},
            ],
            "source": "docs/data/omni_finetune_verified_result.json",
            "current_scope": omni.get(
                "current_scope",
                "This is a diagnostic multi-episode pilot, not a strong model result.",
            ),
        }
    )

    return {
        "title": "Ropedia Xperience-10M Research Takeaways",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "source_files": [
            "docs/data/summary_metrics.json",
            "results/episode_task_suite/summary_report.json",
            "results/episode_task_suite/neural_mlp/*/metrics.json",
            "docs/data/audio_ablation_summary.json",
            "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
        ],
        "scope": {
            "validated_episode_count": 1,
            "num_frames": suite["num_frames"],
            "num_windows": suite["num_windows"],
            "feature_dim": suite["feature_dim"],
            "audio_featurized": True,
            "raw_data_redistributed": False,
        },
        "takeaways": takeaways,
    }


def render_md(payload: dict) -> str:
    lines = [
        "# Research Takeaways",
        "",
        "This generated note summarizes what the current public Xperience-10M sample",
        "pipeline actually shows. It is built from committed metric artifacts, not",
        "from hand-edited score text.",
        "",
        "## Scope",
        "",
        f"- validated episodes: {payload['scope']['validated_episode_count']}",
        f"- frames: {payload['scope']['num_frames']:,}",
        f"- aligned windows: {payload['scope']['num_windows']:,}",
        f"- current feature dimension: {payload['scope']['feature_dim']:,}",
        "- raw Xperience-10M data is not redistributed",
        "- Audio from the sample MP4 stream is represented in the current feature vector",
        "",
        "## Takeaways",
        "",
    ]
    for item in payload["takeaways"]:
        lines.extend(
            [
                f"### {item['title']}",
                "",
                item["readout"],
                "",
                "| Metric | Value |",
                "| --- | ---: |",
            ]
        )
        for evidence in item["evidence"]:
            value = evidence["value"]
            if isinstance(value, float):
                value_text = fmt(value)
            elif isinstance(value, int):
                value_text = fmt(value)
            elif value is None:
                value_text = "n/a"
            else:
                value_text = str(value)
            lines.append(f"| `{evidence['label']}` | {value_text} |")
        lines.extend(["", f"Source: `{item['source']}`.", "", f"Current scope: {item['current_scope']}", ""])
    lines.extend(
        [
            "## How To Read These Results",
            "",
            "- High single-episode scores are useful pipeline checks for the current task contracts.",
            "- Low chronological action/subtask scores are informative because they expose later-label shift.",
        "- Neural gains on trajectory/order/alignment make those tasks good candidates for the next fine-tuning stage.",
        "- Audio ablation is task-specific: audio representation choices help some probes and hurt others.",
        "- Retrieval and reconstruction remain the main multimodal representation challenges.",
            "- The next credible model-quality result needs held-out episodes.",
            "",
        ]
    )
    return "\n".join(lines)


def main() -> int:
    payload = build_payload()
    OUTPUT_JSON.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    OUTPUT_MD.write_text(render_md(payload), encoding="utf-8")
    print(f"PASS: wrote {OUTPUT_JSON}")
    print(f"PASS: wrote {OUTPUT_MD}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())