Add files using upload-large-folder tool

6460b80 verified 3 days ago

13.5 kB

	#!/usr/bin/env python3
	"""Build research takeaways from committed Xperience-10M metric artifacts."""

	from __future__ import annotations

	import json
	from datetime import datetime, timezone
	from pathlib import Path


	# Repository root: the directory one level above the folder holding this script.
	ROOT = Path(__file__).resolve().parents[1]

	# Committed metric artifacts live under docs/data; the generated JSON is
	# written back next to them.
	_DATA_DIR = ROOT / "docs" / "data"

	# Inputs.
	SUMMARY_PATH = _DATA_DIR / "summary_metrics.json"
	AUDIO_PATH = _DATA_DIR / "audio_ablation_summary.json"  # optional; checked with exists()

	# Outputs.
	OUTPUT_JSON = _DATA_DIR / "research_takeaways.json"
	OUTPUT_MD = ROOT / "RESEARCH_TAKEAWAYS.md"


	def pct_delta(new: float, old: float, higher_is_better: bool = True) -> float:
	    """Return the relative improvement of ``new`` over ``old``.

	    The result is a fraction of ``abs(old)`` (``0.5`` means 50%), signed so
	    that a positive value is always an improvement: ``new - old`` when
	    higher is better, ``old - new`` otherwise.

	    A baseline of zero has no defined relative change; ``0.0`` is returned
	    in that case instead of dividing by zero.
	    """
	    if old == 0:
	        return 0.0
	    signed_change = (new - old) if higher_is_better else (old - new)
	    return signed_change / abs(old)


	def fmt(value: float | int | None, digits: int = 4) -> str:
	    """Format a metric value for display.

	    Args:
	        value: The number to format, or ``None`` when the metric is missing.
	        digits: Number of decimal places used for non-integer values.

	    Returns:
	        ``"n/a"`` for ``None``, a thousands-separated string for integers
	        (``5821`` -> ``"5,821"``), and a fixed-point string with ``digits``
	        decimals for everything else.
	    """
	    if value is None:
	        return "n/a"
	    # Integers are counts (frames, windows, ...): no decimals, grouped digits.
	    if isinstance(value, int):
	        return f"{value:,}"
	    return f"{value:.{digits}f}"


	def task_metric(tasks: dict, task: str, key: str) -> float:
	    """Look up ``tasks[task][key]`` and return it coerced to ``float``.

	    Raises:
	        KeyError: If ``task`` or ``key`` is not present.
	    """
	    task_entry = tasks[task]
	    raw_value = task_entry[key]
	    return float(raw_value)


	def build_payload() -> dict:
	    """Assemble the research-takeaways payload from the committed metric files.

	    Reads ``SUMMARY_PATH`` (required) and ``AUDIO_PATH`` (optional; the audio
	    takeaway is only emitted when the file exists), extracts the headline
	    metrics and packs them into a list of takeaway records. Each record has
	    ``id``, ``title``, ``readout``, ``evidence`` (a list of
	    ``{"label", "value"}`` dicts), ``source`` and ``current_scope``.

	    Returns:
	        A dict with the keys ``title``, ``status``, ``generated_at_utc``,
	        ``source_files``, ``scope`` and ``takeaways``.

	    Raises:
	        KeyError: If a metric key that is indexed directly is missing from
	            the loaded JSON.
	    """
	    summary = json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))
	    audio_summary = json.loads(AUDIO_PATH.read_text(encoding="utf-8")) if AUDIO_PATH.exists() else None
	    suite = summary["suite"]
	    tasks = suite["tasks"]
	    # NOTE: the ``{}`` default only protects this lookup; the task_metric()
	    # calls on ``neural`` below index it directly and raise KeyError when
	    # the neural results are absent.
	    neural = suite.get("neural_tasks", {})
	    models = summary["models"]
	    omni = summary.get("omni_relay", {})

	    # "min" = metric from suite["tasks"], "neural" = from suite["neural_tasks"].
	    hand_min = task_metric(tasks, "hand_trajectory_forecast", "mpjpe")
	    hand_neural = task_metric(neural, "hand_trajectory_forecast", "mpjpe")
	    temporal_min = task_metric(tasks, "temporal_order", "f1")
	    temporal_neural = task_metric(neural, "temporal_order", "f1")
	    misalign_min = task_metric(tasks, "misalignment_detection", "f1")
	    misalign_neural = task_metric(neural, "misalignment_detection", "f1")
	    retrieval_min_mrr = task_metric(tasks, "cross_modal_retrieval", "mrr")
	    retrieval_neural_mrr = task_metric(neural, "cross_modal_retrieval", "mrr")
	    recon_min_r2 = task_metric(tasks, "modality_reconstruction", "r2")
	    recon_neural_r2 = task_metric(neural, "modality_reconstruction", "r2")
	    action_chrono = task_metric(tasks, "timeline_action", "macro_f1")
	    subtask_chrono = task_metric(tasks, "timeline_subtask", "macro_f1")

	    takeaways = [
	        {
	            "id": "episode_to_benchmark",
	            "title": "One episode can become a real benchmark contract",
	            # Frame and window counts are interpolated from the artifact, like
	            # feature_dim, so the sentence cannot drift from the evidence
	            # rows below. The "20-frame" window length stays literal because
	            # no corresponding field is read from the summary here.
	            "readout": (
	                f"The public sample is converted into {suite['num_frames']:,} frames, "
	                f"{suite['num_windows']:,} aligned 20-frame windows, and an "
	                f"{suite['feature_dim']:,}-dimensional feature contract."
	            ),
	            "evidence": [
	                {"label": "frames", "value": suite["num_frames"]},
	                {"label": "windows", "value": suite["num_windows"]},
	                {"label": "feature_dim", "value": suite["feature_dim"]},
	            ],
	            "source": "docs/data/summary_metrics.json",
	            "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage.",
	        },
	        {
	            "id": "chronological_split_exposes_class_shift",
	            "title": "Chronological splits expose action-class shift",
	            "readout": (
	                "Earlier all-feature action classifiers reach high macro-F1 on their "
	                "local split, but the core chronological action/subtask heads are "
	                "much harder because later held-out windows include unseen labels."
	            ),
	            "evidence": [
	                {"label": "all_feature_action_macro_f1", "value": models["all_modalities_action"]["macro_f1"]},
	                {"label": "suite_action_macro_f1", "value": action_chrono},
	                {"label": "suite_subtask_macro_f1", "value": subtask_chrono},
	                {"label": "unseen_action_test_classes", "value": len(tasks["timeline_action"].get("unseen_test_classes", []))},
	            ],
	            "source": "results/episode_task_suite/summary_report.json",
	            "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes.",
	        },
	        {
	            "id": "neural_heads_help_dynamics",
	            "title": "Small neural heads help dynamic and temporal probes",
	            "readout": (
	                "The MLP heads substantially improve hand trajectory forecasting, "
	                "temporal-order verification, and motion/visual synchronization."
	            ),
	            "evidence": [
	                {"label": "hand_mpjpe_minimal", "value": hand_min},
	                {"label": "hand_mpjpe_neural", "value": hand_neural},
	                # MPJPE is an error, so a lower value counts as the improvement.
	                {"label": "hand_mpjpe_relative_improvement", "value": pct_delta(hand_neural, hand_min, higher_is_better=False)},
	                {"label": "temporal_order_f1_minimal", "value": temporal_min},
	                {"label": "temporal_order_f1_neural", "value": temporal_neural},
	                {"label": "misalignment_f1_minimal", "value": misalign_min},
	                {"label": "misalignment_f1_neural", "value": misalign_neural},
	            ],
	            "source": "results/episode_task_suite/neural_mlp/*/metrics.json",
	            "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing.",
	        },
	        {
	            "id": "retrieval_and_reconstruction_remain_open",
	            "title": "Retrieval and reconstruction remain the harder multimodal problems",
	            "readout": (
	                "Ridge/cosine retrieval remains stronger than the neural projection on "
	                "this sample, and cross-modal reconstruction still has negative R2."
	            ),
	            "evidence": [
	                {"label": "retrieval_mrr_minimal", "value": retrieval_min_mrr},
	                {"label": "retrieval_mrr_neural", "value": retrieval_neural_mrr},
	                {"label": "retrieval_top5_minimal", "value": tasks["cross_modal_retrieval"]["top5_accuracy"]},
	                {"label": "reconstruction_r2_minimal", "value": recon_min_r2},
	                {"label": "reconstruction_r2_neural", "value": recon_neural_r2},
	            ],
	            "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
	            "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants.",
	        },
	    ]

	    if audio_summary is not None:
	        audio_aggregate = audio_summary["aggregate"]
	        # First per-task summary for each task of interest; an empty dict
	        # (so .get() yields None) when the ablation has no such entry.
	        modality_recon = next(
	            (item for item in audio_summary["task_summaries"] if item["task"] == "modality_reconstruction"),
	            {},
	        )
	        object_relevance = next(
	            (item for item in audio_summary["task_summaries"] if item["task"] == "object_relevance"),
	            {},
	        )
	        takeaways.append(
	            {
	                "id": "audio_contribution_is_task_specific",
	                "title": "Audio helps some tasks and hurts others on the public sample",
	                # NOTE(review): the two "6" counts are literal text while the
	                # evidence rows read the aggregate; confirm they still match
	                # the artifact before trusting this sentence.
	                "readout": (
	                    "Audio improves the primary metric on 6 walkthrough-backed task contracts, "
	                    "while raw log-mel replacement improves over the current handcrafted block on 6 of those contracts. "
	                    "The largest current-audio gain appears in feature reconstruction, not in action classification."
	                ),
	                "evidence": [
	                    {"label": "tasks_where_current_audio_improves", "value": audio_aggregate["tasks_where_handcrafted_audio_improves"]},
	                    {"label": "mean_current_audio_delta", "value": audio_aggregate["mean_handcrafted_audio_delta"]},
	                    {"label": "tasks_where_raw_replacement_improves", "value": audio_aggregate["tasks_where_raw_replacement_improves_over_handcrafted"]},
	                    {"label": "mean_raw_replacement_delta_vs_current", "value": audio_aggregate["mean_raw_replacement_delta_vs_handcrafted"]},
	                    {"label": "reconstruction_current_audio_delta", "value": modality_recon.get("handcrafted_audio_delta")},
	                    {"label": "object_relevance_current_audio_delta", "value": object_relevance.get("handcrafted_audio_delta")},
	                ],
	                "source": "results/audio_ablation/audio_ablation_summary.json",
	                "current_scope": (
	                    "This is a single-episode ablation over fixed ridge heads. It validates that audio is wired into the task suite "
	                    "and shows where it changes metrics; it does not prove cross-episode audio generalization."
	                ),
	            }
	        )

	    # Every omni_relay field is optional: missing values become None here.
	    takeaways.append(
	        {
	            "id": "scale_requires_episodes",
	            "title": "The next scientific unit is held-out episodes, not more adjacent windows",
	            "readout": (
	                "The selected Qwen3-Omni path now has a verified two-epoch held-out diagnostic result. "
	                "It proves the cross-episode train/validation/eval loop and meets the strict-JSON target, "
	                "while weak action/subtask metrics remain the next modeling problem."
	            ),
	            "evidence": [
	                {"label": "selected_episodes", "value": omni.get("target_episodes")},
	                {"label": "held_out_test_windows", "value": omni.get("held_out_test_windows")},
	                {"label": "json_validity_rate", "value": omni.get("json_validity_rate")},
	                {"label": "action_macro_f1", "value": omni.get("action_macro_f1")},
	            ],
	            "source": "docs/data/omni_finetune_verified_result.json",
	            "current_scope": omni.get(
	                "current_scope",
	                "This is a diagnostic multi-episode pilot, not a strong model result.",
	            ),
	        }
	    )

	    return {
	        "title": "Ropedia Xperience-10M Research Takeaways",
	        "status": "pass",
	        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
	        "source_files": [
	            "docs/data/summary_metrics.json",
	            "results/episode_task_suite/summary_report.json",
	            "results/episode_task_suite/neural_mlp/*/metrics.json",
	            "docs/data/audio_ablation_summary.json",
	            "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
	        ],
	        "scope": {
	            "validated_episode_count": 1,
	            "num_frames": suite["num_frames"],
	            "num_windows": suite["num_windows"],
	            "feature_dim": suite["feature_dim"],
	            "audio_featurized": True,
	            "raw_data_redistributed": False,
	        },
	        "takeaways": takeaways,
	    }


	def render_md(payload: dict) -> str:
	    """Render the takeaways payload as a Markdown document.

	    Args:
	        payload: A dict with a ``scope`` mapping (``validated_episode_count``,
	            ``num_frames``, ``num_windows``, ``feature_dim``) and a
	            ``takeaways`` list whose items carry ``title``, ``readout``,
	            ``evidence``, ``source`` and ``current_scope``.

	    Returns:
	        The Markdown text, ending with a newline. Each takeaway becomes a
	        ``###`` section with a two-column metric table.
	    """
	    scope = payload["scope"]
	    lines = [
	        "# Research Takeaways",
	        "",
	        "This generated note summarizes what the current public Xperience-10M sample",
	        "pipeline actually shows. It is built from committed metric artifacts, not",
	        "from hand-edited score text.",
	        "",
	        "## Scope",
	        "",
	        f"- validated episodes: {scope['validated_episode_count']}",
	        f"- frames: {scope['num_frames']:,}",
	        f"- aligned windows: {scope['num_windows']:,}",
	        f"- current feature dimension: {scope['feature_dim']:,}",
	        "- raw Xperience-10M data is not redistributed",
	        "- Audio from the sample MP4 stream is represented in the current feature vector",
	        "",
	        "## Takeaways",
	        "",
	    ]
	    for item in payload["takeaways"]:
	        lines.extend(
	            [
	                f"### {item['title']}",
	                "",
	                item["readout"],
	                "",
	                # Plain pipes: a backslash-escaped pipe is literal cell text in
	                # Markdown and would stop these rows from forming a table.
	                "| Metric | Value |",
	                "| --- | ---: |",
	            ]
	        )
	        for evidence in item["evidence"]:
	            value = evidence["value"]
	            # fmt() covers None, int and float; any other value (e.g. a
	            # string) is shown verbatim.
	            if value is None or isinstance(value, (int, float)):
	                value_text = fmt(value)
	            else:
	                value_text = str(value)
	            lines.append(f"| `{evidence['label']}` | {value_text} |")
	        lines.extend(["", f"Source: `{item['source']}`.", "", f"Current scope: {item['current_scope']}", ""])
	    lines.extend(
	        [
	            "## How To Read These Results",
	            "",
	            "- High single-episode scores are useful pipeline checks for the current task contracts.",
	            "- Low chronological action/subtask scores are informative because they expose later-label shift.",
	            "- Neural gains on trajectory/order/alignment make those tasks good candidates for the next fine-tuning stage.",
	            "- Audio ablation is task-specific: audio representation choices help some probes and hurt others.",
	            "- Retrieval and reconstruction remain the main multimodal representation challenges.",
	            "- The next credible model-quality result needs held-out episodes.",
	            "",
	        ]
	    )
	    # The trailing "" entry makes the joined text end with a newline.
	    return "\n".join(lines)


	def main() -> int:
	    """Build the takeaways and write the JSON and Markdown outputs.

	    The JSON file is written first, then the Markdown report rendered from
	    the same payload. A ``PASS`` line is printed for each file once both
	    have been written.

	    Returns:
	        ``0``, used as the process exit code.
	    """
	    takeaways_payload = build_payload()

	    json_text = json.dumps(takeaways_payload, indent=2) + "\n"
	    OUTPUT_JSON.write_text(json_text, encoding="utf-8")

	    markdown_text = render_md(takeaways_payload)
	    OUTPUT_MD.write_text(markdown_text, encoding="utf-8")

	    for written_path in (OUTPUT_JSON, OUTPUT_MD):
	        print(f"PASS: wrote {written_path}")
	    return 0


	# Script entry point: run main() and exit with the code it returns.
	if __name__ == "__main__":
	    exit_code = main()
	    raise SystemExit(exit_code)