Add files using upload-large-folder tool

6460b80 verified 8 days ago

34.8 kB

	{
	"title": "Ropedia Xperience-10M Unified 20-Task Suite",
	"status": "pass",
	"generated_at_utc": "2026-06-21T15:21:12+00:00",
	"task_count": 20,
	"task_count_summary": {
	"total_unified_tasks": 20,
	"public_framing": "all 20 task contracts are presented as one suite",
	"legacy_provenance_rows": 8
	},
	"unification_policy": {
	"public_framing": "The suite is presented as one 20-task benchmark surface. All task contracts share the same window, split, feature, baseline, and leakage-control language.",
	"legacy_path_note": "The directory and file name tier2_task_suite are retained only for backward-compatible artifact links; they are not a separate public benchmark tier."
	},
	"dataset_scope": {
	"sample_episode_count": 1,
	"annotation": "data/sample/xperience-10m-sample/annotation.hdf5",
	"num_frames": 5821,
	"num_windows": 1161,
	"feature_dim": 8546,
	"window_frames": 20,
	"stride_frames": 5,
	"split_policy": "single_episode_chronological_70_30",
	"raw_hdf5_required_for_full_public_regeneration": true,
	"raw_data_redistributed": false
	},
	"setup_alignment": {
	"same_window_unit": "20-frame aligned windows",
	"same_stride": "5 frames",
	"same_feature_manifest": "results/episode_task_suite/feature_manifest.json",
	"same_shared_tensor": "results/episode_task_suite/shared_windows.npz",
	"same_split": "chronological 70/30 train/test split within the public sample episode",
	"same_baseline_pattern": "minimal interpretable heads plus compact neural MLP heads",
	"same_leakage_policy": "Target-side future, contact, object, caption, relation, and interaction signals are excluded from inputs unless language is explicitly the query."
	},
	"source_files": [
	"docs/data/summary_metrics.json",
	"docs/data/task_walkthroughs.json",
	"docs/data/tier2_task_suite.json",
	"results/episode_task_suite/summary_report.json",
	"results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json",
	"results/episode_task_suite/windows.csv",
	"results/episode_task_suite/feature_manifest.json"
	],
	"tasks": [
	{
	"task_id": "timeline_action",
	"task_display_name": "Action Recognition",
	"research_name": "Egocentric Action Recognition",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "supervised",
	"architecture_family": "multiclass classifier",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
	"input_short": "20-frame multimodal window",
	"process": "window features -> action label builder -> classifier",
	"output": "A single action class for the current window.",
	"output_short": "current action class",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.05,
	"neural_primary_metric": 0.014814814814814814,
	"counts": {
	"num_windows": 1144,
	"num_eval_windows": 343,
	"num_train_windows": 801,
	"num_test_windows": 343,
	"num_classes": 18
	},
	"meaning": "Recognize the current manipulation action from synchronized visual, motion, inertial, pose, and annotation context.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/timeline_action.md",
	"minimal_metrics": "results/episode_task_suite/timeline_action/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json"
	},
	"task_number": 1,
	"suite_label": "Task 01"
	},
	{
	"task_id": "timeline_subtask",
	"task_display_name": "Procedure Step Recognition",
	"research_name": "Temporal Subtask Recognition",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "supervised",
	"architecture_family": "multiclass classifier",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "The same all-modality window vector used by action recognition.",
	"input_short": "20-frame multimodal window",
	"process": "window features -> subtask label builder -> classifier",
	"output": "A single subtask label for the current window.",
	"output_short": "current procedure step",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.05056355513846935,
	"neural_primary_metric": 0.02810810810810811,
	"counts": {
	"num_windows": 1147,
	"num_eval_windows": 344,
	"num_train_windows": 803,
	"num_test_windows": 344,
	"num_classes": 14
	},
	"meaning": "Recognize the broader activity stage so fine actions become a readable procedure timeline.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/timeline_subtask.md",
	"minimal_metrics": "results/episode_task_suite/timeline_subtask/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json"
	},
	"task_number": 2,
	"suite_label": "Task 02"
	},
	{
	"task_id": "transition_detection",
	"task_display_name": "Action Boundary Detection",
	"research_name": "Temporal Action Segmentation",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "diagnostic",
	"architecture_family": "binary classifier",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "One all-modality window vector as input; boundary labels derived from action-change timestamps are the classification target.",
	"input_short": "current window with boundary target",
	"process": "action changes -> boundary labels -> binary classifier",
	"output": "A binary label: boundary or steady.",
	"output_short": "boundary or steady",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.6118237590630229,
	"neural_primary_metric": 0.5862068965517241,
	"counts": {
	"num_windows": 1161,
	"num_eval_windows": 348,
	"num_train_windows": 813,
	"num_test_windows": 348,
	"num_classes": 2
	},
	"meaning": "Detect the local moment where the episode changes from one action segment to the next.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/transition_detection.md",
	"minimal_metrics": "results/episode_task_suite/transition_detection/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json"
	},
	"task_number": 3,
	"suite_label": "Task 03"
	},
	{
	"task_id": "next_action",
	"task_display_name": "Next-Action Prediction",
	"research_name": "Short-Horizon Intention Prediction",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "supervised",
	"architecture_family": "future-label classifier",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current window at time t",
	"process": "current features -> future label shift -> classifier",
	"output": "A single action class for t+20 frames.",
	"output_short": "action at t+20 frames",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.05925925925925927,
	"neural_primary_metric": 0.04186046511627907,
	"counts": {
	"num_windows": 1161,
	"num_eval_windows": 348,
	"num_train_windows": 813,
	"num_test_windows": 348,
	"num_classes": 18
	},
	"meaning": "Forecast the near-future action from the current observations only.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/next_action.md",
	"minimal_metrics": "results/episode_task_suite/next_action/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/next_action/metrics.json"
	},
	"task_number": 4,
	"suite_label": "Task 04"
	},
	{
	"task_id": "hand_trajectory_forecast",
	"task_display_name": "Hand Trajectory Forecasting",
	"research_name": "3D Hand Motion Forecasting",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "forecast",
	"architecture_family": "continuous regressor",
	"primary_direction": "A. Human Modeling & Motion Understanding",
	"input": "The current all-modality window vector at time t.",
	"input_short": "current multimodal window",
	"process": "current features -> future mocap target -> regression head",
	"output": "A future trajectory vector for left and right hand joints.",
	"output_short": "future hand-joint trajectory",
	"metric_key": "mpjpe",
	"metric_name": "MPJPE",
	"metric_direction": "lower",
	"minimal_primary_metric": 0.8646570444107056,
	"neural_primary_metric": 0.10785018652677536,
	"counts": {
	"num_windows": 1159,
	"num_train_windows": 811,
	"num_test_windows": 348
	},
	"meaning": "Predict the future 3D left/right hand path from the current multimodal state.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/hand_trajectory_forecast.md",
	"minimal_metrics": "results/episode_task_suite/hand_trajectory_forecast/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json"
	},
	"task_number": 5,
	"suite_label": "Task 05"
	},
	{
	"task_id": "contact_prediction",
	"task_display_name": "Contact State Prediction",
	"research_name": "Human-Object Contact Prediction",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "supervised",
	"architecture_family": "binary classifier",
	"primary_direction": "A. Human Modeling & Motion Understanding",
	"input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
	"input_short": "non-contact, non-caption features",
	"process": "feature filter -> contact target -> binary classifier",
	"output": "A binary contact label.",
	"output_short": "contact or no contact",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 1.0,
	"neural_primary_metric": 1.0,
	"counts": {
	"num_windows": 1161,
	"num_eval_windows": 348,
	"num_train_windows": 813,
	"num_test_windows": 348,
	"num_classes": 1
	},
	"meaning": "Predict whether body or hand contact with the scene is occurring without leaking contact labels.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/contact_prediction.md",
	"minimal_metrics": "results/episode_task_suite/contact_prediction/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json"
	},
	"task_number": 6,
	"suite_label": "Task 06"
	},
	{
	"task_id": "object_relevance",
	"task_display_name": "Object Relevance Prediction",
	"research_name": "Object-Centric Interaction Recognition",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "supervised",
	"architecture_family": "multi-label classifier",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
	"input_short": "non-caption multimodal features",
	"process": "object vocabulary -> multi-hot labels -> sigmoid heads",
	"output": "A multi-label object set for the current window.",
	"output_short": "relevant object set",
	"metric_key": "micro_f1",
	"metric_name": "micro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.18034382095361662,
	"neural_primary_metric": 0.1679279279279279,
	"counts": {
	"num_windows": 1161,
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Infer which objects are relevant to the current manipulation window from non-caption features.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/object_relevance.md",
	"minimal_metrics": "results/episode_task_suite/object_relevance/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json"
	},
	"task_number": 7,
	"suite_label": "Task 07"
	},
	{
	"task_id": "caption_grounding",
	"task_display_name": "Language Grounding",
	"research_name": "Language-to-Moment Grounding",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "retrieval",
	"architecture_family": "retrieval ranker",
	"primary_direction": "C. Egocentric Vision & Interaction",
	"input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
	"input_short": "text-like query and candidate windows",
	"process": "query features -> candidate index -> cosine ranker",
	"output": "A ranked list of windows, with the correct matching window ideally near rank 1.",
	"output_short": "ranked matching moments",
	"metric_key": "mrr",
	"metric_name": "MRR",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.016023479050338015,
	"neural_primary_metric": 0.01684125567132316,
	"counts": {
	"num_queries": 348,
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Retrieve the matching time window for an annotation-derived text query.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/caption_grounding.md",
	"minimal_metrics": "results/episode_task_suite/caption_grounding/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json"
	},
	"task_number": 8,
	"suite_label": "Task 08"
	},
	{
	"task_id": "cross_modal_retrieval",
	"task_display_name": "Cross-Modal Retrieval",
	"research_name": "Multimodal Representation Retrieval",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "retrieval",
	"architecture_family": "two-tower retrieval head",
	"primary_direction": "D. Scene Reconstruction & World Modeling",
	"input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
	"input_short": "motion/IMU/pose query; depth/video candidates",
	"process": "modality split -> projection -> nearest-neighbor ranker",
	"output": "A ranked list of candidate depth/video windows.",
	"output_short": "ranked visual windows",
	"metric_key": "mrr",
	"metric_name": "MRR",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.26925966892956127,
	"neural_primary_metric": 0.1299971898648288,
	"counts": {
	"num_queries": 348,
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Use motion, IMU, and camera-pose signals to retrieve the matching depth/video window.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/cross_modal_retrieval.md",
	"minimal_metrics": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json"
	},
	"task_number": 9,
	"suite_label": "Task 09"
	},
	{
	"task_id": "modality_reconstruction",
	"task_display_name": "Cross-Modal Reconstruction",
	"research_name": "Modality Feature Reconstruction",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "forecast",
	"architecture_family": "feature regressor",
	"primary_direction": "B. 3D/4D Reconstruction & Neural Rendering",
	"input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
	"input_short": "motion, IMU, and camera/pose features",
	"process": "source-target split -> scaler -> regression head",
	"output": "A reconstructed depth/video feature vector.",
	"output_short": "reconstructed depth/video vector",
	"metric_key": "r2",
	"metric_name": "R2",
	"metric_direction": "higher",
	"minimal_primary_metric": -0.015271898913936655,
	"neural_primary_metric": -0.010171410134180991,
	"counts": {
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Predict compressed depth/video feature vectors from motion, IMU, and camera-pose features.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/modality_reconstruction.md",
	"minimal_metrics": "results/episode_task_suite/modality_reconstruction/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json"
	},
	"task_number": 10,
	"suite_label": "Task 10"
	},
	{
	"task_id": "temporal_order",
	"task_display_name": "Temporal Order Verification",
	"research_name": "Temporal Order Verification",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "diagnostic",
	"architecture_family": "pairwise classifier",
	"primary_direction": "D. Scene Reconstruction & World Modeling",
	"input": "A pair of adjacent window vectors, plus their difference vector.",
	"input_short": "two adjacent windows plus difference vector",
	"process": "pair builder -> feature combiner -> binary classifier",
	"output": "A binary label: correct order or reversed order.",
	"output_short": "correct or reversed",
	"metric_key": "f1",
	"metric_name": "F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.5399515738498789,
	"neural_primary_metric": 0.8520179372197308,
	"counts": {
	"num_samples": 2320,
	"num_train_samples": 1624,
	"num_test_samples": 696
	},
	"meaning": "Tell whether two neighboring windows are in chronological order or reversed.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/temporal_order.md",
	"minimal_metrics": "results/episode_task_suite/temporal_order/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json"
	},
	"task_number": 11,
	"suite_label": "Task 11"
	},
	{
	"task_id": "misalignment_detection",
	"task_display_name": "Multimodal Synchronization Detection",
	"research_name": "Cross-Modal Misalignment Detection",
	"provenance_source": "walkthrough_backed_task_contract",
	"origin_count_label": "unified task",
	"family": "diagnostic",
	"architecture_family": "pairwise classifier",
	"primary_direction": "B. 3D/4D Reconstruction & Neural Rendering",
	"input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
	"input_short": "motion-side and visual/depth-side feature groups",
	"process": "aligned/shifted pairs -> feature combiner -> binary classifier",
	"output": "A binary label: aligned or shifted.",
	"output_short": "aligned or shifted",
	"metric_key": "f1",
	"metric_name": "F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.5051698670605613,
	"neural_primary_metric": 0.7152682255845944,
	"counts": {
	"num_samples": 2306,
	"num_train_samples": 1614,
	"num_test_samples": 692
	},
	"meaning": "Detect whether motion and visual/depth streams have been artificially shifted out of sync.",
	"artifact_sources": {
	"walkthrough": "results/episode_task_suite/task_walkthroughs/misalignment_detection.md",
	"minimal_metrics": "results/episode_task_suite/misalignment_detection/metrics.json",
	"neural_metrics": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json"
	},
	"task_number": 12,
	"suite_label": "Task 12"
	},
	{
	"task_id": "long_horizon_next_action",
	"task_display_name": "Long-Horizon Next-Action Forecasting",
	"research_name": "Long-Horizon Next-Action Forecasting",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "classification",
	"architecture_family": "minimal_softmax",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame non-caption multimodal window.",
	"input_short": "Current 20-frame non-caption multimodal window.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Action label five seconds later.",
	"output_short": "Action label five seconds later.",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.07499999999999998,
	"neural_primary_metric": 0.06545454545454546,
	"counts": {
	"num_windows": 1073,
	"num_eval_windows": 322,
	"num_train_windows": 751,
	"num_test_windows": 322,
	"num_classes": 18
	},
	"meaning": "Tests whether the current state carries enough procedure context to forecast beyond the one-second (t+20 frames) next-action task (Task 04).",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/long_horizon_next_action/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/long_horizon_next_action/metrics.json"
	},
	"task_number": 13,
	"suite_label": "Task 13"
	},
	{
	"task_id": "next_subtask_forecast",
	"task_display_name": "Long-Horizon Next-Subtask Forecasting",
	"research_name": "Long-Horizon Next-Subtask Forecasting",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "classification",
	"architecture_family": "minimal_softmax",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame non-caption multimodal window.",
	"input_short": "Current 20-frame non-caption multimodal window.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Procedure subtask label five seconds later.",
	"output_short": "Procedure subtask label five seconds later.",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.04545454545454545,
	"neural_primary_metric": 0.050724637681159424,
	"counts": {
	"num_windows": 1141,
	"num_eval_windows": 342,
	"num_train_windows": 799,
	"num_test_windows": 342,
	"num_classes": 14
	},
	"meaning": "Moves from immediate action anticipation to higher-level procedure-state prediction.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/next_subtask_forecast/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/next_subtask_forecast/metrics.json"
	},
	"task_number": 14,
	"suite_label": "Task 14"
	},
	{
	"task_id": "interaction_text_prediction",
	"task_display_name": "Interaction Text Prediction",
	"research_name": "Interaction Text Prediction",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "classification",
	"architecture_family": "minimal_softmax",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame sensor window with caption-text features removed.",
	"input_short": "Current 20-frame sensor window with caption-text features removed.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Raw annotation interaction phrase for the same window.",
	"output_short": "Raw annotation interaction phrase for the same window.",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.04444444444444444,
	"neural_primary_metric": 0.0380952380952381,
	"counts": {
	"num_windows": 192,
	"num_eval_windows": 58,
	"num_train_windows": 134,
	"num_test_windows": 58,
	"num_classes": 46
	},
	"meaning": "Uses the raw caption JSON interaction field as a language target instead of only the hashed text feature.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/interaction_text_prediction/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/interaction_text_prediction/metrics.json"
	},
	"task_number": 15,
	"suite_label": "Task 15"
	},
	{
	"task_id": "action_object_relation",
	"task_display_name": "Action-Object Relation Prediction",
	"research_name": "Action-Object Relation Prediction",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "classification",
	"architecture_family": "minimal_softmax",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame sensor window with caption-text features removed.",
	"input_short": "Current 20-frame sensor window with caption-text features removed.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Joint action plus active object-set relation.",
	"output_short": "Joint action plus active object-set relation.",
	"metric_key": "macro_f1",
	"metric_name": "macro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.0,
	"neural_primary_metric": 0.0,
	"counts": {
	"num_windows": 178,
	"num_eval_windows": 53,
	"num_train_windows": 125,
	"num_test_windows": 53,
	"num_classes": 42
	},
	"meaning": "Evaluates whether a model can bind what action is happening to which objects are involved.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/action_object_relation/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/action_object_relation/metrics.json"
	},
	"task_number": 16,
	"suite_label": "Task 16"
	},
	{
	"task_id": "object_set_forecast",
	"task_display_name": "Future Object-Set Forecasting",
	"research_name": "Future Object-Set Forecasting",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "multi_label",
	"architecture_family": "minimal_ridge_multilabel",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame sensor window with caption-text features removed.",
	"input_short": "Current 20-frame sensor window with caption-text features removed.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Object set active five seconds later.",
	"output_short": "Object set active five seconds later.",
	"metric_key": "micro_f1",
	"metric_name": "micro-F1",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.16939890710382516,
	"neural_primary_metric": 0.19718309859154928,
	"counts": {
	"num_windows": 188,
	"num_train_windows": 132,
	"num_test_windows": 56
	},
	"meaning": "Predicts which objects will become relevant soon, not only which objects are relevant now.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/object_set_forecast/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/object_set_forecast/metrics.json"
	},
	"task_number": 17,
	"suite_label": "Task 17"
	},
	{
	"task_id": "imu_to_hand_pose",
	"task_display_name": "IMU-to-Hand Pose Reconstruction",
	"research_name": "IMU-to-Hand Pose Reconstruction",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "regression",
	"architecture_family": "minimal_ridge_regression",
	"primary_direction": "sample-supported extension",
	"input": "Current IMU acceleration/gyroscope feature block only.",
	"input_short": "Current IMU acceleration/gyroscope feature block only.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Current left/right hand joint feature blocks.",
	"output_short": "Current left/right hand joint feature blocks.",
	"metric_key": "mae",
	"metric_name": "MAE",
	"metric_direction": "lower",
	"minimal_primary_metric": 0.042049407958984375,
	"neural_primary_metric": 0.042562149465084076,
	"counts": {
	"num_windows": 1161,
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "A sensor-bridge probe for how much hand configuration can be recovered from inertial motion alone.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/imu_to_hand_pose/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/imu_to_hand_pose/metrics.json"
	},
	"task_number": 18,
	"suite_label": "Task 18"
	},
	{
	"task_id": "camera_view_sync_retrieval",
	"task_display_name": "Camera-View Synchronization Retrieval",
	"research_name": "Camera-View Synchronization Retrieval",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "retrieval",
	"architecture_family": "minimal_ridge_projection_cosine_retrieval",
	"primary_direction": "sample-supported extension",
	"input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.",
	"input_short": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "The synchronized held-out camera-3 window.",
	"output_short": "The synchronized held-out camera-3 window.",
	"metric_key": "mrr",
	"metric_name": "MRR",
	"metric_direction": "higher",
	"minimal_primary_metric": 0.4943004846572876,
	"neural_primary_metric": 0.24086658656597137,
	"counts": {
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Stress-tests multi-camera time alignment beyond the core cross-modal retrieval task.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/camera_view_sync_retrieval/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/camera_view_sync_retrieval/metrics.json"
	},
	"task_number": 19,
	"suite_label": "Task 19"
	},
	{
	"task_id": "time_to_transition",
	"task_display_name": "Time-to-Next-Transition Regression",
	"research_name": "Time-to-Next-Transition Regression",
	"provenance_source": "historical_result_bundle",
	"origin_count_label": "unified task",
	"family": "regression",
	"architecture_family": "minimal_ridge_regression",
	"primary_direction": "sample-supported extension",
	"input": "Current 20-frame non-caption multimodal window.",
	"input_short": "Current 20-frame non-caption multimodal window.",
	"process": "shared window features -> task-specific target builder -> minimal/neural head",
	"output": "Frames until the next action-label boundary, capped at 200 frames.",
	"output_short": "Frames until the next action-label boundary, capped at 200 frames.",
	"metric_key": "mae",
	"metric_name": "MAE frames",
	"metric_direction": "lower",
	"minimal_primary_metric": 10.53735637664795,
	"neural_primary_metric": 10.55449390411377,
	"counts": {
	"num_windows": 1161,
	"num_train_windows": 813,
	"num_test_windows": 348
	},
	"meaning": "Turns boundary detection into a continuous timing estimate for procedural control.",
	"artifact_sources": {
	"legacy_result_directory": "results/episode_task_suite/tier2_task_suite/",
	"minimal_metrics": "results/episode_task_suite/tier2_task_suite/time_to_transition/metrics.json",
	"neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/time_to_transition/metrics.json"
	},
	"task_number": 20,
	"suite_label": "Task 20"
	}
	]
	}