Publish Ropedia Xperience-10M task baseline cards

a8124a8 verified 23 days ago

5.36 kB

	{
	"title": "Ropedia Xperience-10M Research Takeaways",
	"status": "pass",
	"generated_at_utc": "2026-06-03T12:56:49+00:00",
	"source_files": [
	"docs/data/summary_metrics.json",
	"results/episode_task_suite/summary_report.json",
	"results/episode_task_suite/neural_mlp/*/metrics.json",
	"results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md"
	],
	"scope": {
	"validated_episode_count": 1,
	"num_frames": 5821,
	"num_windows": 1161,
	"feature_dim": 8546,
	"audio_featurized": true,
	"raw_data_redistributed": false
	},
	"takeaways": [
	{
	"id": "episode_to_benchmark",
	"title": "One episode can become a real benchmark contract",
	"readout": "The public sample is converted into 5,821 frames, 1,161 aligned 20-frame windows, and an 8,546-dimensional feature contract.",
	"evidence": [
	{
	"label": "frames",
	"value": 5821
	},
	{
	"label": "windows",
	"value": 1161
	},
	{
	"label": "feature_dim",
	"value": 8546
	}
	],
	"source": "docs/data/summary_metrics.json",
	"current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage."
	},
	{
	"id": "chronological_split_exposes_class_shift",
	"title": "Chronological splits expose action-class shift",
	"readout": "Earlier all-feature action classifiers reach high macro-F1 on their local split, but the action/subtask heads in the 12-task chronological suite score far lower because later held-out windows include unseen labels.",
	"evidence": [
	{
	"label": "all_feature_action_macro_f1",
	"value": 0.9828810433408773
	},
	{
	"label": "suite_action_macro_f1",
	"value": 0.05
	},
	{
	"label": "suite_subtask_macro_f1",
	"value": 0.05056355513846935
	},
	{
	"label": "unseen_action_test_classes",
	"value": 4
	}
	],
	"source": "results/episode_task_suite/summary_report.json",
	"current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes."
	},
	{
	"id": "neural_heads_help_dynamics",
	"title": "Small neural heads help dynamic and temporal probes",
	"readout": "The MLP heads substantially improve hand trajectory forecasting, temporal-order verification, and motion/visual synchronization.",
	"evidence": [
	{
	"label": "hand_mpjpe_minimal",
	"value": 0.8646570444107056
	},
	{
	"label": "hand_mpjpe_neural",
	"value": 0.10785018652677536
	},
	{
	"label": "hand_mpjpe_relative_improvement",
	"value": 0.8752682497367739
	},
	{
	"label": "temporal_order_f1_minimal",
	"value": 0.5399515738498789
	},
	{
	"label": "temporal_order_f1_neural",
	"value": 0.8520179372197308
	},
	{
	"label": "misalignment_f1_minimal",
	"value": 0.5051698670605613
	},
	{
	"label": "misalignment_f1_neural",
	"value": 0.7152682255845944
	}
	],
	"source": "results/episode_task_suite/neural_mlp/*/metrics.json",
	"current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing."
	},
	{
	"id": "retrieval_and_reconstruction_remain_open",
	"title": "Retrieval and reconstruction remain the harder multimodal problems",
	"readout": "Ridge/cosine retrieval remains stronger than the neural projection on this sample, and cross-modal reconstruction still has negative R2.",
	"evidence": [
	{
	"label": "retrieval_mrr_minimal",
	"value": 0.26925966892956127
	},
	{
	"label": "retrieval_mrr_neural",
	"value": 0.1299971898648288
	},
	{
	"label": "retrieval_top5_minimal",
	"value": 0.367816091954023
	},
	{
	"label": "reconstruction_r2_minimal",
	"value": -0.015271898913936655
	},
	{
	"label": "reconstruction_r2_neural",
	"value": -0.010171410134180991
	}
	],
	"source": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants."
	},
	{
	"id": "scale_requires_episodes",
	"title": "The next scientific unit is held-out episodes, not more adjacent windows",
	"readout": "The prepared Qwen3-Omni path targets 32 episodes from 32 sessions, but it remains data-gated until data access is granted and held-out evaluation is complete.",
	"evidence": [
	{
	"label": "target_episodes",
	"value": 32
	},
	{
	"label": "selected_sessions",
	"value": 32
	},
	{
	"label": "valid_candidates",
	"value": 680
	}
	],
	"source": "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
	"current_scope": "The 32-episode Qwen3-Omni fine-tune requires gated data staging and held-out evaluation."
	}
	]
	}