Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "source": "docs/data/task_suite_20.json plus results/episode_task_suite/summary_report.json", | |
| "dataset_scope": { | |
| "sample_episode_count": 1, | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "warning": "Single public sample episode; this supports pipeline/task evidence, while cross-episode generalization requires held-out episodes." | |
| }, | |
| "baselines": { | |
| "minimal": "Interpretable softmax, logistic, ridge, and retrieval heads over the 8,546-d window feature vector.", | |
| "neural_mlp": "Small PyTorch MLP classifiers/regressors using the same features, splits, and task contracts." | |
| }, | |
| "task_count": 20, | |
| "directions": { | |
| "A": { | |
| "id": "human_motion", | |
| "name": "Human Modeling & Motion Understanding", | |
| "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.", | |
| "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.", | |
| "current_status": "partially implemented", | |
| "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.", | |
| "next_steps": [ | |
| "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.", | |
| "Train sequence models over multi-episode motion trajectories instead of isolated windows.", | |
| "Evaluate affordance prediction on held-out objects and held-out episodes." | |
| ], | |
| "tasks": [ | |
| "timeline_action", | |
| "hand_trajectory_forecast", | |
| "contact_prediction", | |
| "object_relevance", | |
| "interaction_text_prediction", | |
| "imu_to_hand_pose" | |
| ], | |
| "task_display_names": [ | |
| "Action Recognition", | |
| "Hand Trajectory Forecasting", | |
| "Contact State Prediction", | |
| "Object Relevance Prediction", | |
| "Interaction Text Prediction", | |
| "IMU-to-Hand Pose Reconstruction" | |
| ], | |
| "counts": { | |
| "direct": 3, | |
| "proxy": 3, | |
| "diagnostic": 0, | |
| "total_links": 6 | |
| } | |
| }, | |
| "B": { | |
| "id": "reconstruction_rendering", | |
| "name": "3D/4D Reconstruction & Neural Rendering", | |
| "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.", | |
| "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.", | |
| "current_status": "proxy tasks only", | |
| "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.", | |
| "next_steps": [ | |
| "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.", | |
| "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.", | |
| "Evaluate novel-view synthesis and temporal consistency across held-out views/time." | |
| ], | |
| "tasks": [ | |
| "cross_modal_retrieval", | |
| "modality_reconstruction", | |
| "misalignment_detection", | |
| "imu_to_hand_pose", | |
| "camera_view_sync_retrieval" | |
| ], | |
| "task_display_names": [ | |
| "Cross-Modal Retrieval", | |
| "Cross-Modal Reconstruction", | |
| "Multimodal Synchronization Detection", | |
| "IMU-to-Hand Pose Reconstruction", | |
| "Camera-View Synchronization Retrieval" | |
| ], | |
| "counts": { | |
| "direct": 1, | |
| "proxy": 3, | |
| "diagnostic": 1, | |
| "total_links": 5 | |
| } | |
| }, | |
| "C": { | |
| "id": "egocentric_interaction", | |
| "name": "Egocentric Vision & Interaction", | |
| "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.", | |
| "preferred_background": "Video understanding, action recognition, or egocentric vision.", | |
| "current_status": "strongest implemented track", | |
| "current_readout": "The unified 20-task suite directly targets egocentric action, task state, interaction, grounding, forecasting, and alignment.", | |
| "next_steps": [ | |
| "Move from single-episode chronological splits to held-out-episode splits.", | |
| "Use audio together with stronger multimodal backbones for action, intent, and grounding.", | |
| "Evaluate long-horizon task success prediction and action-conditioned generation." | |
| ], | |
| "tasks": [ | |
| "timeline_action", | |
| "timeline_subtask", | |
| "transition_detection", | |
| "next_action", | |
| "hand_trajectory_forecast", | |
| "contact_prediction", | |
| "object_relevance", | |
| "caption_grounding", | |
| "cross_modal_retrieval", | |
| "temporal_order", | |
| "misalignment_detection", | |
| "long_horizon_next_action", | |
| "next_subtask_forecast", | |
| "interaction_text_prediction", | |
| "action_object_relation", | |
| "object_set_forecast", | |
| "time_to_transition" | |
| ], | |
| "task_display_names": [ | |
| "Action Recognition", | |
| "Procedure Step Recognition", | |
| "Action Boundary Detection", | |
| "Next-Action Prediction", | |
| "Hand Trajectory Forecasting", | |
| "Contact State Prediction", | |
| "Object Relevance Prediction", | |
| "Language Grounding", | |
| "Cross-Modal Retrieval", | |
| "Temporal Order Verification", | |
| "Multimodal Synchronization Detection", | |
| "Long-Horizon Next-Action Forecasting", | |
| "Long-Horizon Next-Subtask Forecasting", | |
| "Interaction Text Prediction", | |
| "Action-Object Relation Prediction", | |
| "Future Object-Set Forecasting", | |
| "Time-to-Next-Transition Regression" | |
| ], | |
| "counts": { | |
| "direct": 10, | |
| "proxy": 3, | |
| "diagnostic": 4, | |
| "total_links": 17 | |
| } | |
| }, | |
| "D": { | |
| "id": "world_modeling", | |
| "name": "Scene Reconstruction & World Modeling", | |
| "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.", | |
| "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.", | |
| "current_status": "early proxy tasks", | |
| "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.", | |
| "next_steps": [ | |
| "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.", | |
| "Add map consistency, object permanence, and spatial relation prediction tasks.", | |
| "Train held-out-episode world models that predict future observations and task state." | |
| ], | |
| "tasks": [ | |
| "timeline_subtask", | |
| "transition_detection", | |
| "next_action", | |
| "object_relevance", | |
| "caption_grounding", | |
| "cross_modal_retrieval", | |
| "modality_reconstruction", | |
| "temporal_order", | |
| "misalignment_detection", | |
| "long_horizon_next_action", | |
| "next_subtask_forecast", | |
| "action_object_relation", | |
| "object_set_forecast", | |
| "camera_view_sync_retrieval", | |
| "time_to_transition" | |
| ], | |
| "task_display_names": [ | |
| "Procedure Step Recognition", | |
| "Action Boundary Detection", | |
| "Next-Action Prediction", | |
| "Object Relevance Prediction", | |
| "Language Grounding", | |
| "Cross-Modal Retrieval", | |
| "Cross-Modal Reconstruction", | |
| "Temporal Order Verification", | |
| "Multimodal Synchronization Detection", | |
| "Long-Horizon Next-Action Forecasting", | |
| "Long-Horizon Next-Subtask Forecasting", | |
| "Action-Object Relation Prediction", | |
| "Future Object-Set Forecasting", | |
| "Camera-View Synchronization Retrieval", | |
| "Time-to-Next-Transition Regression" | |
| ], | |
| "counts": { | |
| "direct": 1, | |
| "proxy": 10, | |
| "diagnostic": 4, | |
| "total_links": 15 | |
| } | |
| } | |
| }, | |
| "tasks": { | |
| "timeline_action": { | |
| "name": "Timeline action recognition", | |
| "family": "supervised", | |
| "input": "all featurized modalities", | |
| "output": "current action label", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "A": "proxy" | |
| }, | |
| "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout.", | |
| "current_limit": "Chronological single-episode split creates unseen future action classes.", | |
| "display_name": "Action Recognition", | |
| "artifact_id": "timeline_action", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.05, | |
| "neural_mlp": 0.014814814814814814, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "timeline_subtask": { | |
| "name": "Timeline subtask recognition", | |
| "family": "supervised", | |
| "input": "all featurized modalities", | |
| "output": "current subtask label", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state.", | |
| "current_limit": "Single-episode ordering makes future subtasks hard to generalize.", | |
| "display_name": "Procedure Step Recognition", | |
| "artifact_id": "timeline_subtask", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.05056355513846935, | |
| "neural_mlp": 0.02810810810810811, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "transition_detection": { | |
| "name": "Action transition detection", | |
| "family": "diagnostic", | |
| "input": "all featurized modalities", | |
| "output": "boundary vs steady state", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "diagnostic" | |
| }, | |
| "why": "Localizes egocentric task boundaries and diagnoses temporal state changes.", | |
| "current_limit": "Boundary class is sparse, so accuracy alone is misleading.", | |
| "display_name": "Action Boundary Detection", | |
| "artifact_id": "transition_detection", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.6118237590630229, | |
| "neural_mlp": 0.5862068965517241, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "next_action": { | |
| "name": "Short-horizon next action", | |
| "family": "supervised", | |
| "input": "current multimodal window", | |
| "output": "action 20 frames later", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Tests action intention/task-flow prediction from egocentric context.", | |
| "current_limit": "Unseen future labels dominate the single-episode chronological test.", | |
| "display_name": "Next-Action Prediction", | |
| "artifact_id": "next_action", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.05925925925925927, | |
| "neural_mlp": 0.04186046511627907, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "hand_trajectory_forecast": { | |
| "name": "Hand trajectory forecasting", | |
| "family": "forecast", | |
| "input": "current multimodal window", | |
| "output": "future left/right hand 3D joints", | |
| "primary_direction": "A", | |
| "direction_roles": { | |
| "A": "direct", | |
| "C": "proxy" | |
| }, | |
| "why": "Directly predicts human hand motion and supports hand-object interaction modeling.", | |
| "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.", | |
| "display_name": "Hand Trajectory Forecasting", | |
| "artifact_id": "hand_trajectory_forecast", | |
| "metric": { | |
| "key": "mpjpe", | |
| "name": "MPJPE", | |
| "direction": "lower", | |
| "minimal": 0.8646570444107056, | |
| "neural_mlp": 0.10785018652677536, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "contact_prediction": { | |
| "name": "Body/object contact prediction", | |
| "family": "supervised", | |
| "input": "non-contact/non-caption features", | |
| "output": "binary contact label", | |
| "primary_direction": "A", | |
| "direction_roles": { | |
| "A": "direct", | |
| "C": "proxy" | |
| }, | |
| "why": "Targets physical interaction state, a core affordance and manipulation signal.", | |
| "current_limit": "The public sample is degenerate for this target because one class dominates.", | |
| "display_name": "Contact State Prediction", | |
| "artifact_id": "contact_prediction", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 1.0, | |
| "neural_mlp": 1.0, | |
| "better_baseline": "tie" | |
| } | |
| }, | |
| "object_relevance": { | |
| "name": "Relevant object set prediction", | |
| "family": "supervised", | |
| "input": "non-caption feature blocks", | |
| "output": "multi-label object set", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "A": "proxy", | |
| "D": "proxy" | |
| }, | |
| "why": "Connects egocentric activity to manipulated objects and early object-centric state.", | |
| "current_limit": "Object labels are language-derived and sparse in one episode.", | |
| "display_name": "Object Relevance Prediction", | |
| "artifact_id": "object_relevance", | |
| "metric": { | |
| "key": "micro_f1", | |
| "name": "micro-F1", | |
| "direction": "higher", | |
| "minimal": 0.18034382095361662, | |
| "neural_mlp": 0.1679279279279279, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "caption_grounding": { | |
| "name": "Caption-to-window grounding", | |
| "family": "retrieval", | |
| "input": "caption objects/interaction query and candidate sensor windows", | |
| "output": "matching time window", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Grounds language annotation into egocentric sensor time and task state.", | |
| "current_limit": "Bag-of-objects language features are too weak for rich grounding.", | |
| "display_name": "Language Grounding", | |
| "artifact_id": "caption_grounding", | |
| "metric": { | |
| "key": "mrr", | |
| "name": "MRR", | |
| "direction": "higher", | |
| "minimal": 0.016023479050338015, | |
| "neural_mlp": 0.01684125567132316, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "cross_modal_retrieval": { | |
| "name": "Cross-modal retrieval", | |
| "family": "retrieval", | |
| "input": "motion/IMU/camera query", | |
| "output": "matching depth/video window", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "diagnostic", | |
| "B": "proxy", | |
| "D": "proxy" | |
| }, | |
| "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling.", | |
| "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.", | |
| "display_name": "Cross-Modal Retrieval", | |
| "artifact_id": "cross_modal_retrieval", | |
| "metric": { | |
| "key": "mrr", | |
| "name": "MRR", | |
| "direction": "higher", | |
| "minimal": 0.26925966892956127, | |
| "neural_mlp": 0.1299971898648288, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "modality_reconstruction": { | |
| "name": "Modality reconstruction", | |
| "family": "forecast", | |
| "input": "motion/IMU/camera", | |
| "output": "depth/video feature vector", | |
| "primary_direction": "B", | |
| "direction_roles": { | |
| "B": "proxy", | |
| "D": "proxy" | |
| }, | |
| "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective.", | |
| "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.", | |
| "display_name": "Cross-Modal Reconstruction", | |
| "artifact_id": "modality_reconstruction", | |
| "metric": { | |
| "key": "r2", | |
| "name": "R2", | |
| "direction": "higher", | |
| "minimal": -0.015271898913936655, | |
| "neural_mlp": -0.010171410134180991, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "temporal_order": { | |
| "name": "Temporal order verification", | |
| "family": "diagnostic", | |
| "input": "two adjacent windows", | |
| "output": "correct vs reversed order", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "diagnostic", | |
| "D": "diagnostic" | |
| }, | |
| "why": "Checks whether features encode local time direction and task progression.", | |
| "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.", | |
| "display_name": "Temporal Order Verification", | |
| "artifact_id": "temporal_order", | |
| "metric": { | |
| "key": "f1", | |
| "name": "F1", | |
| "direction": "higher", | |
| "minimal": 0.5399515738498789, | |
| "neural_mlp": 0.8520179372197308, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "misalignment_detection": { | |
| "name": "Cross-modal misalignment detection", | |
| "family": "diagnostic", | |
| "input": "motion plus visual/depth pair", | |
| "output": "aligned vs shifted", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "diagnostic", | |
| "B": "diagnostic", | |
| "D": "diagnostic" | |
| }, | |
| "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models.", | |
| "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.", | |
| "display_name": "Multimodal Synchronization Detection", | |
| "artifact_id": "misalignment_detection", | |
| "metric": { | |
| "key": "f1", | |
| "name": "F1", | |
| "direction": "higher", | |
| "minimal": 0.5051698670605613, | |
| "neural_mlp": 0.7152682255845944, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "long_horizon_next_action": { | |
| "name": "Long-horizon next-action forecasting", | |
| "family": "classification", | |
| "input": "current and historical windows", | |
| "output": "future action label", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Extends short-horizon intention prediction into longer activity futures, a key egocentric and world-model signal.", | |
| "current_limit": "Evaluated from sample-supported future labels, not full open-world action generation.", | |
| "display_name": "Long-Horizon Next-Action Forecasting", | |
| "artifact_id": "long_horizon_next_action", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.07499999999999998, | |
| "neural_mlp": 0.06545454545454546, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "next_subtask_forecast": { | |
| "name": "Long-horizon next-subtask forecasting", | |
| "family": "classification", | |
| "input": "current and historical windows", | |
| "output": "future procedure-step label", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Measures whether the model can anticipate the next procedural phase rather than only the current frame state.", | |
| "current_limit": "Subtask labels are constrained to the available annotation vocabulary.", | |
| "display_name": "Long-Horizon Next-Subtask Forecasting", | |
| "artifact_id": "next_subtask_forecast", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.04545454545454545, | |
| "neural_mlp": 0.050724637681159424, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "interaction_text_prediction": { | |
| "name": "Interaction text prediction", | |
| "family": "classification", | |
| "input": "window features without target text leakage", | |
| "output": "natural-language interaction class", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "A": "proxy" | |
| }, | |
| "why": "Connects egocentric observations to the natural-language interaction semantics carried by the annotation.", | |
| "current_limit": "Public derived features retain hashed text targets; raw full text requires the official annotation source.", | |
| "display_name": "Interaction Text Prediction", | |
| "artifact_id": "interaction_text_prediction", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.04444444444444444, | |
| "neural_mlp": 0.0380952380952381, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "action_object_relation": { | |
| "name": "Action-object relation prediction", | |
| "family": "classification", | |
| "input": "window features with target-side relation leakage excluded", | |
| "output": "action-object relation class", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Tests whether action recognition and object state are connected as a relational interaction representation.", | |
| "current_limit": "Relation labels are derived from the public-sample annotation scope.", | |
| "display_name": "Action-Object Relation Prediction", | |
| "artifact_id": "action_object_relation", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.0, | |
| "neural_mlp": 0.0, | |
| "better_baseline": "tie" | |
| } | |
| }, | |
| "object_set_forecast": { | |
| "name": "Future object-set forecasting", | |
| "family": "multi-label", | |
| "input": "current and historical windows", | |
| "output": "future object set", | |
| "primary_direction": "D", | |
| "direction_roles": { | |
| "D": "direct", | |
| "C": "proxy" | |
| }, | |
| "why": "Asks whether the current scene state supports predicting which objects will matter later.", | |
| "current_limit": "This is a set-level proxy, not a persistent 3D scene graph.", | |
| "display_name": "Future Object-Set Forecasting", | |
| "artifact_id": "object_set_forecast", | |
| "metric": { | |
| "key": "micro_f1", | |
| "name": "micro-F1", | |
| "direction": "higher", | |
| "minimal": 0.16939890710382516, | |
| "neural_mlp": 0.19718309859154928, | |
| "better_baseline": "neural_mlp" | |
| } | |
| }, | |
| "imu_to_hand_pose": { | |
| "name": "IMU-to-hand pose reconstruction", | |
| "family": "regression", | |
| "input": "IMU and motion context", | |
| "output": "hand pose target", | |
| "primary_direction": "A", | |
| "direction_roles": { | |
| "A": "direct", | |
| "B": "proxy" | |
| }, | |
| "why": "Measures human-motion reconstruction from wearable and motion cues.", | |
| "current_limit": "Pose reconstruction is window-level and does not yet fit a full parametric hand/body model.", | |
| "display_name": "IMU-to-Hand Pose Reconstruction", | |
| "artifact_id": "imu_to_hand_pose", | |
| "metric": { | |
| "key": "mae", | |
| "name": "MAE", | |
| "direction": "lower", | |
| "minimal": 0.042049407958984375, | |
| "neural_mlp": 0.042562149465084076, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "camera_view_sync_retrieval": { | |
| "name": "Camera-view synchronization retrieval", | |
| "family": "retrieval", | |
| "input": "one camera-view/window query", | |
| "output": "matching synchronized view", | |
| "primary_direction": "B", | |
| "direction_roles": { | |
| "B": "direct", | |
| "D": "proxy" | |
| }, | |
| "why": "Tests whether synchronized multi-view structure is recoverable across camera streams.", | |
| "current_limit": "Retrieval checks view consistency but does not reconstruct geometry by itself.", | |
| "display_name": "Camera-View Synchronization Retrieval", | |
| "artifact_id": "camera_view_sync_retrieval", | |
| "metric": { | |
| "key": "mrr", | |
| "name": "MRR", | |
| "direction": "higher", | |
| "minimal": 0.4943004846572876, | |
| "neural_mlp": 0.24086658656597137, | |
| "better_baseline": "minimal" | |
| } | |
| }, | |
| "time_to_transition": { | |
| "name": "Time-to-next-transition regression", | |
| "family": "regression", | |
| "input": "current temporal window state", | |
| "output": "frames/time until the next transition", | |
| "primary_direction": "C", | |
| "direction_roles": { | |
| "C": "diagnostic", | |
| "D": "diagnostic" | |
| }, | |
| "why": "Measures temporal boundary awareness as a continuous timing target.", | |
| "current_limit": "Regression is local to the annotated public sample timeline.", | |
| "display_name": "Time-to-Next-Transition Regression", | |
| "artifact_id": "time_to_transition", | |
| "metric": { | |
| "key": "mae", | |
| "name": "MAE frames", | |
| "direction": "lower", | |
| "minimal": 10.53735637664795, | |
| "neural_mlp": 10.55449390411377, | |
| "better_baseline": "minimal" | |
| } | |
| } | |
| } | |
| } | |