Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Ropedia Xperience-10M Unified 20-Task Suite", | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-21T15:21:12+00:00", | |
| "task_count": 20, | |
| "task_count_summary": { | |
| "total_unified_tasks": 20, | |
| "public_framing": "all 20 task contracts are presented as one suite", | |
| "legacy_provenance_rows": 8 | |
| }, | |
| "unification_policy": { | |
| "public_framing": "The suite is presented as one 20-task benchmark surface. All task contracts share the same window, split, feature, baseline, and leakage-control language.", | |
| "legacy_path_note": "The directory and file name tier2_task_suite are retained only for backward-compatible artifact links; they are not a separate public benchmark tier." | |
| }, | |
| "dataset_scope": { | |
| "sample_episode_count": 1, | |
| "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "window_frames": 20, | |
| "stride_frames": 5, | |
| "split_policy": "single_episode_chronological_70_30", | |
| "raw_hdf5_required_for_full_public_regeneration": true, | |
| "raw_data_redistributed": false | |
| }, | |
| "setup_alignment": { | |
| "same_window_unit": "20-frame aligned windows", | |
| "same_stride": "5 frames", | |
| "same_feature_manifest": "results/episode_task_suite/feature_manifest.json", | |
| "same_shared_tensor": "results/episode_task_suite/shared_windows.npz", | |
| "same_split": "chronological 70/30 train/test split within the public sample episode", | |
| "same_baseline_pattern": "minimal interpretable heads plus compact neural MLP heads", | |
| "same_leakage_policy": "Target-side future, contact, object, caption, relation, and interaction signals are excluded from inputs unless language is explicitly the query." | |
| }, | |
| "source_files": [ | |
| "docs/data/summary_metrics.json", | |
| "docs/data/task_walkthroughs.json", | |
| "docs/data/tier2_task_suite.json", | |
| "results/episode_task_suite/summary_report.json", | |
| "results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json", | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/feature_manifest.json" | |
| ], | |
| "tasks": [ | |
| { | |
| "task_id": "timeline_action", | |
| "task_display_name": "Action Recognition", | |
| "research_name": "Egocentric Action Recognition", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "supervised", | |
| "architecture_family": "multiclass classifier", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", | |
| "input_short": "20-frame multimodal window", | |
| "process": "window features -> action label builder -> classifier", | |
| "output": "A single action class for the current window.", | |
| "output_short": "current action class", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.05, | |
| "neural_primary_metric": 0.014814814814814814, | |
| "counts": { | |
| "num_windows": 1144, | |
| "num_eval_windows": 343, | |
| "num_train_windows": 801, | |
| "num_test_windows": 343, | |
| "num_classes": 18 | |
| }, | |
| "meaning": "Recognize the current manipulation action from synchronized visual, motion, inertial, pose, and annotation context.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/timeline_action.md", | |
| "minimal_metrics": "results/episode_task_suite/timeline_action/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json" | |
| }, | |
| "task_number": 1, | |
| "suite_label": "Task 01" | |
| }, | |
| { | |
| "task_id": "timeline_subtask", | |
| "task_display_name": "Procedure Step Recognition", | |
| "research_name": "Temporal Subtask Recognition", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "supervised", | |
| "architecture_family": "multiclass classifier", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "The same all-modality window vector used by action recognition.", | |
| "input_short": "20-frame multimodal window", | |
| "process": "window features -> subtask label builder -> classifier", | |
| "output": "A single subtask label for the current window.", | |
| "output_short": "current procedure step", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.05056355513846935, | |
| "neural_primary_metric": 0.02810810810810811, | |
| "counts": { | |
| "num_windows": 1147, | |
| "num_eval_windows": 344, | |
| "num_train_windows": 803, | |
| "num_test_windows": 344, | |
| "num_classes": 14 | |
| }, | |
| "meaning": "Recognize the broader activity stage so fine actions become a readable procedure timeline.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/timeline_subtask.md", | |
| "minimal_metrics": "results/episode_task_suite/timeline_subtask/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json" | |
| }, | |
| "task_number": 2, | |
| "suite_label": "Task 02" | |
| }, | |
| { | |
| "task_id": "transition_detection", | |
| "task_display_name": "Action Boundary Detection", | |
| "research_name": "Temporal Action Segmentation", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "diagnostic", | |
| "architecture_family": "binary classifier", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "One all-modality window vector; the boundary/steady target labels are derived from action-change timestamps.", | |
| "input_short": "current window with boundary target", | |
| "process": "action changes -> boundary labels -> binary classifier", | |
| "output": "A binary label: boundary or steady.", | |
| "output_short": "boundary or steady", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.6118237590630229, | |
| "neural_primary_metric": 0.5862068965517241, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_eval_windows": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "num_classes": 2 | |
| }, | |
| "meaning": "Detect the local moment where the episode changes from one action segment to the next.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/transition_detection.md", | |
| "minimal_metrics": "results/episode_task_suite/transition_detection/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json" | |
| }, | |
| "task_number": 3, | |
| "suite_label": "Task 03" | |
| }, | |
| { | |
| "task_id": "next_action", | |
| "task_display_name": "Next-Action Prediction", | |
| "research_name": "Short-Horizon Intention Prediction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "supervised", | |
| "architecture_family": "future-label classifier", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "The current all-modality window vector at time t.", | |
| "input_short": "current window at time t", | |
| "process": "current features -> future label shift -> classifier", | |
| "output": "A single action class for t+20 frames.", | |
| "output_short": "action at t+20 frames", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.05925925925925927, | |
| "neural_primary_metric": 0.04186046511627907, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_eval_windows": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "num_classes": 18 | |
| }, | |
| "meaning": "Forecast the near-future action from the current observations only.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/next_action.md", | |
| "minimal_metrics": "results/episode_task_suite/next_action/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/next_action/metrics.json" | |
| }, | |
| "task_number": 4, | |
| "suite_label": "Task 04" | |
| }, | |
| { | |
| "task_id": "hand_trajectory_forecast", | |
| "task_display_name": "Hand Trajectory Forecasting", | |
| "research_name": "3D Hand Motion Forecasting", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "forecast", | |
| "architecture_family": "continuous regressor", | |
| "primary_direction": "A. Human Modeling & Motion Understanding", | |
| "input": "The current all-modality window vector at time t.", | |
| "input_short": "current multimodal window", | |
| "process": "current features -> future mocap target -> regression head", | |
| "output": "A future trajectory vector for left and right hand joints.", | |
| "output_short": "future hand-joint trajectory", | |
| "metric_key": "mpjpe", | |
| "metric_name": "MPJPE", | |
| "metric_direction": "lower", | |
| "minimal_primary_metric": 0.8646570444107056, | |
| "neural_primary_metric": 0.10785018652677536, | |
| "counts": { | |
| "num_windows": 1159, | |
| "num_train_windows": 811, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Predict the future 3D left/right hand path from the current multimodal state.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/hand_trajectory_forecast.md", | |
| "minimal_metrics": "results/episode_task_suite/hand_trajectory_forecast/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json" | |
| }, | |
| "task_number": 5, | |
| "suite_label": "Task 05" | |
| }, | |
| { | |
| "task_id": "contact_prediction", | |
| "task_display_name": "Contact State Prediction", | |
| "research_name": "Human-Object Contact Prediction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "supervised", | |
| "architecture_family": "binary classifier", | |
| "primary_direction": "A. Human Modeling & Motion Understanding", | |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", | |
| "input_short": "non-contact, non-caption features", | |
| "process": "feature filter -> contact target -> binary classifier", | |
| "output": "A binary contact label.", | |
| "output_short": "contact or no contact", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 1.0, | |
| "neural_primary_metric": 1.0, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_eval_windows": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "num_classes": 1 | |
| }, | |
| "meaning": "Predict whether body or hand contact with the scene is occurring without leaking contact labels.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/contact_prediction.md", | |
| "minimal_metrics": "results/episode_task_suite/contact_prediction/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json" | |
| }, | |
| "task_number": 6, | |
| "suite_label": "Task 06" | |
| }, | |
| { | |
| "task_id": "object_relevance", | |
| "task_display_name": "Object Relevance Prediction", | |
| "research_name": "Object-Centric Interaction Recognition", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "supervised", | |
| "architecture_family": "multi-label classifier", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", | |
| "input_short": "non-caption multimodal features", | |
| "process": "object vocabulary -> multi-hot labels -> sigmoid heads", | |
| "output": "A multi-label object set for the current window.", | |
| "output_short": "relevant object set", | |
| "metric_key": "micro_f1", | |
| "metric_name": "micro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.18034382095361662, | |
| "neural_primary_metric": 0.1679279279279279, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Infer which objects are relevant to the current manipulation window from non-caption features.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/object_relevance.md", | |
| "minimal_metrics": "results/episode_task_suite/object_relevance/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json" | |
| }, | |
| "task_number": 7, | |
| "suite_label": "Task 07" | |
| }, | |
| { | |
| "task_id": "caption_grounding", | |
| "task_display_name": "Language Grounding", | |
| "research_name": "Language-to-Moment Grounding", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "retrieval", | |
| "architecture_family": "retrieval ranker", | |
| "primary_direction": "C. Egocentric Vision & Interaction", | |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", | |
| "input_short": "text-like query and candidate windows", | |
| "process": "query features -> candidate index -> cosine ranker", | |
| "output": "A ranked list of windows, with the correct matching window ideally near rank 1.", | |
| "output_short": "ranked matching moments", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.016023479050338015, | |
| "neural_primary_metric": 0.01684125567132316, | |
| "counts": { | |
| "num_queries": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Retrieve the matching time window for an annotation-derived text query.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/caption_grounding.md", | |
| "minimal_metrics": "results/episode_task_suite/caption_grounding/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json" | |
| }, | |
| "task_number": 8, | |
| "suite_label": "Task 08" | |
| }, | |
| { | |
| "task_id": "cross_modal_retrieval", | |
| "task_display_name": "Cross-Modal Retrieval", | |
| "research_name": "Multimodal Representation Retrieval", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "retrieval", | |
| "architecture_family": "two-tower retrieval head", | |
| "primary_direction": "D. Scene Reconstruction & World Modeling", | |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", | |
| "input_short": "motion/IMU/pose query; depth/video candidates", | |
| "process": "modality split -> projection -> nearest-neighbor ranker", | |
| "output": "A ranked list of candidate depth/video windows.", | |
| "output_short": "ranked visual windows", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.26925966892956127, | |
| "neural_primary_metric": 0.1299971898648288, | |
| "counts": { | |
| "num_queries": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Use motion, IMU, and camera-pose signals to retrieve the matching depth/video window.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/cross_modal_retrieval.md", | |
| "minimal_metrics": "results/episode_task_suite/cross_modal_retrieval/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json" | |
| }, | |
| "task_number": 9, | |
| "suite_label": "Task 09" | |
| }, | |
| { | |
| "task_id": "modality_reconstruction", | |
| "task_display_name": "Cross-Modal Reconstruction", | |
| "research_name": "Modality Feature Reconstruction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "forecast", | |
| "architecture_family": "feature regressor", | |
| "primary_direction": "B. 3D/4D Reconstruction & Neural Rendering", | |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", | |
| "input_short": "motion, IMU, and camera/pose features", | |
| "process": "source-target split -> scaler -> regression head", | |
| "output": "A reconstructed depth/video feature vector.", | |
| "output_short": "reconstructed depth/video vector", | |
| "metric_key": "r2", | |
| "metric_name": "R2", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": -0.015271898913936655, | |
| "neural_primary_metric": -0.010171410134180991, | |
| "counts": { | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Predict compressed depth/video feature vectors from motion, IMU, and camera-pose features.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/modality_reconstruction.md", | |
| "minimal_metrics": "results/episode_task_suite/modality_reconstruction/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json" | |
| }, | |
| "task_number": 10, | |
| "suite_label": "Task 10" | |
| }, | |
| { | |
| "task_id": "temporal_order", | |
| "task_display_name": "Temporal Order Verification", | |
| "research_name": "Temporal Order Verification", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "diagnostic", | |
| "architecture_family": "pairwise classifier", | |
| "primary_direction": "D. Scene Reconstruction & World Modeling", | |
| "input": "A pair of adjacent window vectors, plus their difference vector.", | |
| "input_short": "two adjacent windows plus difference vector", | |
| "process": "pair builder -> feature combiner -> binary classifier", | |
| "output": "A binary label: correct order or reversed order.", | |
| "output_short": "correct or reversed", | |
| "metric_key": "f1", | |
| "metric_name": "F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.5399515738498789, | |
| "neural_primary_metric": 0.8520179372197308, | |
| "counts": { | |
| "num_samples": 2320, | |
| "num_train_samples": 1624, | |
| "num_test_samples": 696 | |
| }, | |
| "meaning": "Tell whether two neighboring windows are in chronological order or reversed.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/temporal_order.md", | |
| "minimal_metrics": "results/episode_task_suite/temporal_order/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json" | |
| }, | |
| "task_number": 11, | |
| "suite_label": "Task 11" | |
| }, | |
| { | |
| "task_id": "misalignment_detection", | |
| "task_display_name": "Multimodal Synchronization Detection", | |
| "research_name": "Cross-Modal Misalignment Detection", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "origin_count_label": "unified task", | |
| "family": "diagnostic", | |
| "architecture_family": "pairwise classifier", | |
| "primary_direction": "B. 3D/4D Reconstruction & Neural Rendering", | |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", | |
| "input_short": "motion-side and visual/depth-side feature groups", | |
| "process": "aligned/shifted pairs -> feature combiner -> binary classifier", | |
| "output": "A binary label: aligned or shifted.", | |
| "output_short": "aligned or shifted", | |
| "metric_key": "f1", | |
| "metric_name": "F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.5051698670605613, | |
| "neural_primary_metric": 0.7152682255845944, | |
| "counts": { | |
| "num_samples": 2306, | |
| "num_train_samples": 1614, | |
| "num_test_samples": 692 | |
| }, | |
| "meaning": "Detect whether motion and visual/depth streams have been artificially shifted out of sync.", | |
| "artifact_sources": { | |
| "walkthrough": "results/episode_task_suite/task_walkthroughs/misalignment_detection.md", | |
| "minimal_metrics": "results/episode_task_suite/misalignment_detection/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json" | |
| }, | |
| "task_number": 12, | |
| "suite_label": "Task 12" | |
| }, | |
| { | |
| "task_id": "long_horizon_next_action", | |
| "task_display_name": "Long-Horizon Next-Action Forecasting", | |
| "research_name": "Long-Horizon Next-Action Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "classification", | |
| "architecture_family": "minimal_softmax", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "input_short": "20-frame non-caption multimodal window", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Action label five seconds later.", | |
| "output_short": "action label five seconds later", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.07499999999999998, | |
| "neural_primary_metric": 0.06545454545454546, | |
| "counts": { | |
| "num_windows": 1073, | |
| "num_eval_windows": 322, | |
| "num_train_windows": 751, | |
| "num_test_windows": 322, | |
| "num_classes": 18 | |
| }, | |
| "meaning": "Tests whether the current state carries enough procedure context to forecast beyond the one-second core next-action task.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/long_horizon_next_action/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/long_horizon_next_action/metrics.json" | |
| }, | |
| "task_number": 13, | |
| "suite_label": "Task 13" | |
| }, | |
| { | |
| "task_id": "next_subtask_forecast", | |
| "task_display_name": "Long-Horizon Next-Subtask Forecasting", | |
| "research_name": "Long-Horizon Next-Subtask Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "classification", | |
| "architecture_family": "minimal_softmax", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "input_short": "20-frame non-caption multimodal window", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Procedure subtask label five seconds later.", | |
| "output_short": "procedure subtask five seconds later", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.04545454545454545, | |
| "neural_primary_metric": 0.050724637681159424, | |
| "counts": { | |
| "num_windows": 1141, | |
| "num_eval_windows": 342, | |
| "num_train_windows": 799, | |
| "num_test_windows": 342, | |
| "num_classes": 14 | |
| }, | |
| "meaning": "Moves from immediate action anticipation to higher-level procedure-state prediction.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/next_subtask_forecast/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/next_subtask_forecast/metrics.json" | |
| }, | |
| "task_number": 14, | |
| "suite_label": "Task 14" | |
| }, | |
| { | |
| "task_id": "interaction_text_prediction", | |
| "task_display_name": "Interaction Text Prediction", | |
| "research_name": "Interaction Text Prediction", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "classification", | |
| "architecture_family": "minimal_softmax", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "input_short": "20-frame sensor window without caption text", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Raw annotation interaction phrase for the same window.", | |
| "output_short": "interaction phrase for the same window", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.04444444444444444, | |
| "neural_primary_metric": 0.0380952380952381, | |
| "counts": { | |
| "num_windows": 192, | |
| "num_eval_windows": 58, | |
| "num_train_windows": 134, | |
| "num_test_windows": 58, | |
| "num_classes": 46 | |
| }, | |
| "meaning": "Uses the raw caption JSON interaction field as a language target instead of only the hashed text feature.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/interaction_text_prediction/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/interaction_text_prediction/metrics.json" | |
| }, | |
| "task_number": 15, | |
| "suite_label": "Task 15" | |
| }, | |
| { | |
| "task_id": "action_object_relation", | |
| "task_display_name": "Action-Object Relation Prediction", | |
| "research_name": "Action-Object Relation Prediction", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "classification", | |
| "architecture_family": "minimal_softmax", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "input_short": "20-frame sensor window without caption text", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Joint action plus active object-set relation.", | |
| "output_short": "joint action and object-set relation", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.0, | |
| "neural_primary_metric": 0.0, | |
| "counts": { | |
| "num_windows": 178, | |
| "num_eval_windows": 53, | |
| "num_train_windows": 125, | |
| "num_test_windows": 53, | |
| "num_classes": 42 | |
| }, | |
| "meaning": "Evaluates whether a model can bind what action is happening to which objects are involved.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/action_object_relation/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/action_object_relation/metrics.json" | |
| }, | |
| "task_number": 16, | |
| "suite_label": "Task 16" | |
| }, | |
| { | |
| "task_id": "object_set_forecast", | |
| "task_display_name": "Future Object-Set Forecasting", | |
| "research_name": "Future Object-Set Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "multi_label", | |
| "architecture_family": "minimal_ridge_multilabel", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "input_short": "20-frame sensor window without caption text", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Object set active five seconds later.", | |
| "output_short": "object set five seconds later", | |
| "metric_key": "micro_f1", | |
| "metric_name": "micro-F1", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.16939890710382516, | |
| "neural_primary_metric": 0.19718309859154928, | |
| "counts": { | |
| "num_windows": 188, | |
| "num_train_windows": 132, | |
| "num_test_windows": 56 | |
| }, | |
| "meaning": "Predicts which objects will become relevant soon, not only which objects are relevant now.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/object_set_forecast/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/object_set_forecast/metrics.json" | |
| }, | |
| "task_number": 17, | |
| "suite_label": "Task 17" | |
| }, | |
| { | |
| "task_id": "imu_to_hand_pose", | |
| "task_display_name": "IMU-to-Hand Pose Reconstruction", | |
| "research_name": "IMU-to-Hand Pose Reconstruction", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "regression", | |
| "architecture_family": "minimal_ridge_regression", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current IMU acceleration/gyroscope feature block only.", | |
| "input_short": "IMU acceleration/gyroscope features only", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Current left/right hand joint feature blocks.", | |
| "output_short": "left/right hand joint features", | |
| "metric_key": "mae", | |
| "metric_name": "MAE", | |
| "metric_direction": "lower", | |
| "minimal_primary_metric": 0.042049407958984375, | |
| "neural_primary_metric": 0.042562149465084076, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "A sensor-bridge probe for how much hand configuration can be recovered from inertial motion alone.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/imu_to_hand_pose/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/imu_to_hand_pose/metrics.json" | |
| }, | |
| "task_number": 18, | |
| "suite_label": "Task 18" | |
| }, | |
| { | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_display_name": "Camera-View Synchronization Retrieval", | |
| "research_name": "Camera-View Synchronization Retrieval", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "retrieval", | |
| "architecture_family": "minimal_ridge_projection_cosine_retrieval", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.", | |
| "input_short": "camera-1 query projected into camera-3 space", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "The synchronized held-out camera-3 window.", | |
| "output_short": "synchronized camera-3 window", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "minimal_primary_metric": 0.4943004846572876, | |
| "neural_primary_metric": 0.24086658656597137, | |
| "counts": { | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Stress-tests multi-camera time alignment beyond the core cross-modal retrieval task.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/camera_view_sync_retrieval/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/camera_view_sync_retrieval/metrics.json" | |
| }, | |
| "task_number": 19, | |
| "suite_label": "Task 19" | |
| }, | |
| { | |
| "task_id": "time_to_transition", | |
| "task_display_name": "Time-to-Next-Transition Regression", | |
| "research_name": "Time-to-Next-Transition Regression", | |
| "provenance_source": "historical_result_bundle", | |
| "origin_count_label": "unified task", | |
| "family": "regression", | |
| "architecture_family": "minimal_ridge_regression", | |
| "primary_direction": "sample-supported extension", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "input_short": "20-frame non-caption multimodal window", | |
| "process": "shared window features -> task-specific target builder -> minimal/neural head", | |
| "output": "Frames until the next action-label boundary, capped at 200 frames.", | |
| "output_short": "frames until next action boundary (max 200)", | |
| "metric_key": "mae", | |
| "metric_name": "MAE frames", | |
| "metric_direction": "lower", | |
| "minimal_primary_metric": 10.53735637664795, | |
| "neural_primary_metric": 10.55449390411377, | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "meaning": "Turns boundary detection into a continuous timing estimate for procedural control.", | |
| "artifact_sources": { | |
| "legacy_result_directory": "results/episode_task_suite/tier2_task_suite/", | |
| "minimal_metrics": "results/episode_task_suite/tier2_task_suite/time_to_transition/metrics.json", | |
| "neural_metrics": "results/episode_task_suite/tier2_task_suite/neural_mlp/time_to_transition/metrics.json" | |
| }, | |
| "task_number": 20, | |
| "suite_label": "Task 20" | |
| } | |
| ] | |
| } | |