Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "omni_relay": { | |
| "status": "verified_full_128_episode_diagnostic_result", | |
| "dataset": "ropedia-ai/xperience-10m", | |
| "staging": "verified_public_package_and_adapter_publication", | |
| "training_target": "action_subtask_quality_and_unseen_label_error_analysis", | |
| "selection_strategy": "stratified_round_robin_by_top_level_session", | |
| "target_episodes": 128, | |
| "selected_sessions": 128, | |
| "candidate_scan_top_level_sessions": 802, | |
| "valid_candidates": 12102, | |
| "estimated_bytes": 298188841943, | |
| "exclude": [ | |
| "visualization.rrd" | |
| ], | |
| "access_status": "The gated Xperience-10M dataset is available for selected multi-episode pilot preparation.", | |
| "current_scope": "The selected-episode Qwen3-Omni diagnostic pilot is verified on the 96/16/16 split and now meets the 98% target for JSON validity; action/subtask quality remains weak, so current results are diagnostic baselines, not strong model-quality claims." | |
| }, | |
| "models": { | |
| "motion_action": { | |
| "accuracy": 0.9828178694158075, | |
| "balanced_accuracy": 0.9643518518518519, | |
| "macro_f1": 0.96884342657456, | |
| "weighted_f1": 0.9824311468352843, | |
| "num_eval_windows": 291, | |
| "num_classes": 18, | |
| "majority_baseline_accuracy": 0.13745704467353953, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.019042566418647766 | |
| }, | |
| "motion_subtask": { | |
| "accuracy": 0.9758620689655172, | |
| "balanced_accuracy": 0.9783924095954172, | |
| "macro_f1": 0.9528048001232955, | |
| "weighted_f1": 0.9778836359351952, | |
| "num_eval_windows": 290, | |
| "num_classes": 14, | |
| "majority_baseline_accuracy": 0.14482758620689656, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.02664567530155182 | |
| }, | |
| "all_modalities_action": { | |
| "accuracy": 0.9862542955326461, | |
| "balanced_accuracy": 0.9856481481481482, | |
| "macro_f1": 0.9828810433408773, | |
| "weighted_f1": 0.9862660597416385, | |
| "num_eval_windows": 291, | |
| "num_classes": 18, | |
| "majority_baseline_accuracy": 0.13745704467353953, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.014677195809781551, | |
| "feature_dim": 8546, | |
| "num_windows": 1144 | |
| }, | |
| "all_modalities_subtask": { | |
| "accuracy": 0.9827586206896551, | |
| "balanced_accuracy": 0.9505102040816327, | |
| "macro_f1": 0.9173189771658273, | |
| "weighted_f1": 0.9841228382209077, | |
| "num_eval_windows": 290, | |
| "num_classes": 14, | |
| "majority_baseline_accuracy": 0.14482758620689656, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.012834250926971436, | |
| "feature_dim": 8546, | |
| "num_windows": 1147 | |
| } | |
| }, | |
| "suite": { | |
| "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "window_frames": 20, | |
| "stride_frames": 5, | |
| "tasks": { | |
| "timeline_action": { | |
| "accuracy": 0.029154518950437316, | |
| "balanced_accuracy": 0.03125, | |
| "macro_f1": 0.05, | |
| "weighted_f1": 0.04664723032069971, | |
| "num_eval_windows": 343, | |
| "num_classes": 18, | |
| "task": "timeline_action", | |
| "input": "all modalities -> current action label", | |
| "split": "chronological", | |
| "num_windows": 1144, | |
| "num_train_windows": 801, | |
| "num_test_windows": 343, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.016824405640363693, | |
| "unseen_test_classes": [ | |
| "Place item on table", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Wait/Prepare for pouring" | |
| ], | |
| "task_display_name": "Action Recognition" | |
| }, | |
| "timeline_subtask": { | |
| "accuracy": 0.05813953488372093, | |
| "balanced_accuracy": 0.05376979652090881, | |
| "macro_f1": 0.05056355513846935, | |
| "weighted_f1": 0.06827161211620246, | |
| "num_eval_windows": 344, | |
| "num_classes": 14, | |
| "task": "timeline_subtask", | |
| "input": "all modalities -> current subtask label", | |
| "split": "chronological", | |
| "num_windows": 1147, | |
| "num_train_windows": 803, | |
| "num_test_windows": 344, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.014138756319880486, | |
| "unseen_test_classes": [ | |
| "Move bottle to coffee equipment", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Prepare for pouring" | |
| ], | |
| "task_display_name": "Procedure Step Recognition" | |
| }, | |
| "transition_detection": { | |
| "accuracy": 0.9080459770114943, | |
| "balanced_accuracy": 0.6543674698795181, | |
| "macro_f1": 0.6118237590630229, | |
| "weighted_f1": 0.9197389592989339, | |
| "num_eval_windows": 348, | |
| "num_classes": 2, | |
| "task": "transition_detection", | |
| "input": "all modalities -> action boundary/steady", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.9540229885057471, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.007154403254389763, | |
| "unseen_test_classes": [], | |
| "boundary_precision": 0.07142857142857142, | |
| "boundary_recall": 0.5, | |
| "boundary_f1": 0.125, | |
| "matched_boundaries": 2, | |
| "true_boundaries": 4, | |
| "predicted_boundaries": 28, | |
| "mean_abs_timing_error_frames": 3.5, | |
| "task_display_name": "Action Boundary Detection" | |
| }, | |
| "next_action": { | |
| "accuracy": 0.034482758620689655, | |
| "balanced_accuracy": 0.04, | |
| "macro_f1": 0.05925925925925927, | |
| "weighted_f1": 0.05108556832694764, | |
| "num_eval_windows": 348, | |
| "num_classes": 18, | |
| "task": "next_action", | |
| "input": "all modalities at t -> action at t+20 frames", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.01754833571612835, | |
| "unseen_test_classes": [ | |
| "Place item on table", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Wait/Prepare for pouring" | |
| ], | |
| "task_display_name": "Next-Action Prediction" | |
| }, | |
| "hand_trajectory_forecast": { | |
| "mse": 14.956222534179688, | |
| "mae": 0.420173317193985, | |
| "r2": -1763.3831383277447, | |
| "task": "hand_trajectory_forecast", | |
| "input": "all modalities at t -> future left/right hand 3D joints", | |
| "split": "chronological", | |
| "num_windows": 1159, | |
| "num_train_windows": 811, | |
| "num_test_windows": 348, | |
| "forecast_frames": 10, | |
| "mpjpe": 0.8646570444107056, | |
| "final_frame_mpjpe": 1.0330793857574463, | |
| "target_dim": 1260, | |
| "task_display_name": "Hand Trajectory Forecasting" | |
| }, | |
| "contact_prediction": { | |
| "accuracy": 1.0, | |
| "balanced_accuracy": 1.0, | |
| "macro_f1": 1.0, | |
| "weighted_f1": 1.0, | |
| "num_eval_windows": 348, | |
| "num_classes": 1, | |
| "task": "contact_prediction", | |
| "input": "all non-contact/non-caption-label modalities -> any body contact", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 7503, | |
| "majority_baseline_accuracy": 1.0, | |
| "train_final_accuracy": 1.0, | |
| "train_final_loss": 0.0006056802230887115, | |
| "unseen_test_classes": [], | |
| "task_display_name": "Contact State Prediction" | |
| }, | |
| "object_relevance": { | |
| "micro_f1": 0.18034382095361662, | |
| "macro_f1": 0.06329638076675959, | |
| "exact_match": 0.005747126436781609, | |
| "precision": 0.16106604866743918, | |
| "recall": 0.20486366985998525, | |
| "task": "object_relevance", | |
| "input": "all non-caption modalities -> current relevant object set", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "num_objects": 34, | |
| "task_display_name": "Object Relevance Prediction" | |
| }, | |
| "caption_grounding": { | |
| "mrr": 0.016023479050338015, | |
| "median_rank": 172.0, | |
| "mean_rank": 174.67816091954023, | |
| "num_queries": 348, | |
| "top1_accuracy": 0.0028735632183908046, | |
| "top5_accuracy": 0.011494252873563218, | |
| "top10_accuracy": 0.014367816091954023, | |
| "task": "caption_grounding", | |
| "input": "caption objects/interaction text query + candidate sensor windows", | |
| "output": "matching time window", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "task_display_name": "Language Grounding" | |
| }, | |
| "cross_modal_retrieval": { | |
| "mrr": 0.26925966892956127, | |
| "median_rank": 14.0, | |
| "mean_rank": 43.34770114942529, | |
| "num_queries": 348, | |
| "top1_accuracy": 0.16379310344827586, | |
| "top5_accuracy": 0.367816091954023, | |
| "top10_accuracy": 0.47126436781609193, | |
| "task": "cross_modal_retrieval", | |
| "input": "motion/IMU/camera/audio query", | |
| "output": "matching depth/video window", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "task_display_name": "Cross-Modal Retrieval" | |
| }, | |
| "modality_reconstruction": { | |
| "mse": 1358.1593017578125, | |
| "mae": 0.29572129249572754, | |
| "r2": -0.015271898913936655, | |
| "task": "modality_reconstruction", | |
| "input": "motion/IMU/camera/audio", | |
| "output": "depth/video feature vector", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 5096, | |
| "task_display_name": "Cross-Modal Reconstruction" | |
| }, | |
| "temporal_order": { | |
| "accuracy": 0.4540229885057471, | |
| "precision": 0.4665271966527197, | |
| "recall": 0.6408045977011494, | |
| "f1": 0.5399515738498789, | |
| "tp": 223, | |
| "tn": 93, | |
| "fp": 255, | |
| "fn": 125, | |
| "positive_rate_true": 0.5, | |
| "positive_rate_pred": 0.6867816091954023, | |
| "task": "temporal_order", | |
| "input": "two adjacent windows -> whether order is correct", | |
| "split": "chronological", | |
| "num_samples": 2320, | |
| "num_train_samples": 1624, | |
| "num_test_samples": 696, | |
| "train_final_accuracy": 0.5086206896551724, | |
| "task_display_name": "Temporal Order Verification" | |
| }, | |
| "misalignment_detection": { | |
| "accuracy": 0.5158959537572254, | |
| "precision": 0.5166163141993958, | |
| "recall": 0.49421965317919075, | |
| "f1": 0.5051698670605613, | |
| "tp": 171, | |
| "tn": 186, | |
| "fp": 160, | |
| "fn": 175, | |
| "positive_rate_true": 0.5, | |
| "positive_rate_pred": 0.47832369942196534, | |
| "task": "misalignment_detection", | |
| "input": "motion+visual/audio pair -> aligned vs shifted by 8 windows", | |
| "split": "chronological", | |
| "num_samples": 2306, | |
| "num_train_samples": 1614, | |
| "num_test_samples": 692, | |
| "train_final_accuracy": 0.49380421313506817, | |
| "task_display_name": "Multimodal Synchronization Detection" | |
| } | |
| }, | |
| "neural_model": { | |
| "name": "neural_mlp", | |
| "type": "lightweight PyTorch MLP over shared window features", | |
| "epochs": 80, | |
| "hidden_dim": 128, | |
| "batch_size": 128, | |
| "learning_rate": 0.001, | |
| "weight_decay": 0.0001, | |
| "dropout": 0.1, | |
| "device": "auto" | |
| }, | |
| "neural_tasks": { | |
| "timeline_action": { | |
| "accuracy": 0.008746355685131196, | |
| "balanced_accuracy": 0.009375, | |
| "macro_f1": 0.014814814814814814, | |
| "weighted_f1": 0.013821401576503616, | |
| "num_eval_windows": 343, | |
| "num_classes": 18, | |
| "task": "timeline_action", | |
| "input": "all modalities -> current action label", | |
| "split": "chronological", | |
| "num_windows": 1144, | |
| "num_train_windows": 801, | |
| "num_test_windows": 343, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "unseen_test_classes": [ | |
| "Place item on table", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Wait/Prepare for pouring" | |
| ], | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.04246756529782, | |
| "train_final_accuracy": 0.9875156054931336, | |
| "task_display_name": "Action Recognition" | |
| }, | |
| "timeline_subtask": { | |
| "accuracy": 0.0377906976744186, | |
| "balanced_accuracy": 0.045614035087719294, | |
| "macro_f1": 0.02810810810810811, | |
| "weighted_f1": 0.023287240729101197, | |
| "num_eval_windows": 344, | |
| "num_classes": 14, | |
| "task": "timeline_subtask", | |
| "input": "all modalities -> current subtask label", | |
| "split": "chronological", | |
| "num_windows": 1147, | |
| "num_train_windows": 803, | |
| "num_test_windows": 344, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "unseen_test_classes": [ | |
| "Move bottle to coffee equipment", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Prepare for pouring" | |
| ], | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 5.4104819144748596e-05, | |
| "train_final_accuracy": 1.0, | |
| "task_display_name": "Procedure Step Recognition" | |
| }, | |
| "transition_detection": { | |
| "accuracy": 0.8735632183908046, | |
| "balanced_accuracy": 0.666039156626506, | |
| "macro_f1": 0.5862068965517241, | |
| "weighted_f1": 0.8993261989694807, | |
| "num_eval_windows": 348, | |
| "num_classes": 2, | |
| "task": "transition_detection", | |
| "input": "all modalities -> action boundary/steady", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.9540229885057471, | |
| "unseen_test_classes": [], | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.029138497962572854, | |
| "train_final_accuracy": 0.990159901599016, | |
| "boundary_precision": 0.07142857142857142, | |
| "boundary_recall": 0.75, | |
| "boundary_f1": 0.13043478260869565, | |
| "matched_boundaries": 3, | |
| "true_boundaries": 4, | |
| "predicted_boundaries": 42, | |
| "mean_abs_timing_error_frames": 2.6666666666666665, | |
| "task_display_name": "Action Boundary Detection" | |
| }, | |
| "next_action": { | |
| "accuracy": 0.02586206896551724, | |
| "balanced_accuracy": 0.03, | |
| "macro_f1": 0.04186046511627907, | |
| "weighted_f1": 0.03608660785886127, | |
| "num_eval_windows": 348, | |
| "num_classes": 18, | |
| "task": "next_action", | |
| "input": "all modalities at t -> action at t+20 frames", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 8546, | |
| "majority_baseline_accuracy": 0.0, | |
| "unseen_test_classes": [ | |
| "Place item on table", | |
| "Pour coffee", | |
| "Pour milk into coffee", | |
| "Wait/Prepare for pouring" | |
| ], | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.000416612956025105, | |
| "train_final_accuracy": 1.0, | |
| "task_display_name": "Next-Action Prediction" | |
| }, | |
| "hand_trajectory_forecast": { | |
| "mse": 0.004775360692292452, | |
| "mae": 0.05433763191103935, | |
| "r2": 0.43665148265771614, | |
| "task": "hand_trajectory_forecast", | |
| "input": "all modalities at t -> future left/right hand 3D joints", | |
| "split": "chronological", | |
| "num_windows": 1159, | |
| "num_train_windows": 811, | |
| "num_test_windows": 348, | |
| "forecast_frames": 10, | |
| "mpjpe": 0.10785018652677536, | |
| "final_frame_mpjpe": 0.11407545953989029, | |
| "target_dim": 1260, | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP regression", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.055699273420247435, | |
| "task_display_name": "Hand Trajectory Forecasting" | |
| }, | |
| "contact_prediction": { | |
| "accuracy": 1.0, | |
| "balanced_accuracy": 1.0, | |
| "macro_f1": 1.0, | |
| "weighted_f1": 1.0, | |
| "num_eval_windows": 348, | |
| "num_classes": 1, | |
| "task": "contact_prediction", | |
| "input": "all non-contact/non-caption-label modalities -> any body contact", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "feature_dim": 7503, | |
| "majority_baseline_accuracy": 1.0, | |
| "unseen_test_classes": [], | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.0, | |
| "train_final_accuracy": 1.0, | |
| "task_display_name": "Contact State Prediction" | |
| }, | |
| "object_relevance": { | |
| "micro_f1": 0.1679279279279279, | |
| "macro_f1": 0.048883162556964774, | |
| "exact_match": 0.014367816091954023, | |
| "precision": 0.16431593794076163, | |
| "recall": 0.17170228445099484, | |
| "task": "object_relevance", | |
| "input": "all non-caption modalities -> current relevant object set", | |
| "split": "chronological", | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "num_objects": 34, | |
| "feature_dim": 7650, | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP sigmoid multilabel", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.003651880362182214, | |
| "task_display_name": "Object Relevance Prediction" | |
| }, | |
| "caption_grounding": { | |
| "mrr": 0.01684125567132316, | |
| "median_rank": 180.5, | |
| "mean_rank": 178.382183908046, | |
| "num_queries": 348, | |
| "top1_accuracy": 0.0028735632183908046, | |
| "top5_accuracy": 0.014367816091954023, | |
| "top10_accuracy": 0.020114942528735632, | |
| "task": "caption_grounding", | |
| "input": "caption objects/interaction text query + candidate sensor windows", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 896, | |
| "output": "matching time window", | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP projection/regression", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.06317874967483723, | |
| "task_display_name": "Language Grounding" | |
| }, | |
| "cross_modal_retrieval": { | |
| "mrr": 0.1299971898648288, | |
| "median_rank": 40.0, | |
| "mean_rank": 66.60057471264368, | |
| "num_queries": 348, | |
| "top1_accuracy": 0.05172413793103448, | |
| "top5_accuracy": 0.19827586206896552, | |
| "top10_accuracy": 0.2413793103448276, | |
| "task": "cross_modal_retrieval", | |
| "input": "motion/IMU/camera/audio query", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 5096, | |
| "output": "matching depth/video window", | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP projection/regression", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.21891545446596464, | |
| "task_display_name": "Cross-Modal Retrieval" | |
| }, | |
| "modality_reconstruction": { | |
| "mse": 1351.3363037109375, | |
| "mae": 0.10379635542631149, | |
| "r2": -0.010171410134180991, | |
| "task": "modality_reconstruction", | |
| "input": "motion/IMU/camera/audio", | |
| "split": "chronological", | |
| "num_train_windows": 813, | |
| "num_test_windows": 348, | |
| "target_dim": 5096, | |
| "output": "depth/video feature vector", | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP projection/regression", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.21891545446596464, | |
| "task_display_name": "Cross-Modal Reconstruction" | |
| }, | |
| "temporal_order": { | |
| "accuracy": 0.8577586206896551, | |
| "precision": 0.8878504672897196, | |
| "recall": 0.8189655172413793, | |
| "f1": 0.8520179372197308, | |
| "tp": 285, | |
| "tn": 312, | |
| "fp": 36, | |
| "fn": 63, | |
| "positive_rate_true": 0.5, | |
| "positive_rate_pred": 0.46120689655172414, | |
| "task": "temporal_order", | |
| "input": "two adjacent windows -> whether order is correct", | |
| "split": "chronological", | |
| "num_samples": 2320, | |
| "num_train_samples": 1624, | |
| "num_test_samples": 696, | |
| "feature_dim": 25638, | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP binary softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.0005108328477586757, | |
| "train_final_accuracy": 1.0, | |
| "task_display_name": "Temporal Order Verification" | |
| }, | |
| "misalignment_detection": { | |
| "accuracy": 0.7008670520231214, | |
| "precision": 0.6824146981627297, | |
| "recall": 0.7514450867052023, | |
| "f1": 0.7152682255845944, | |
| "tp": 260, | |
| "tn": 225, | |
| "fp": 121, | |
| "fn": 86, | |
| "positive_rate_true": 0.5, | |
| "positive_rate_pred": 0.5505780346820809, | |
| "task": "misalignment_detection", | |
| "input": "motion+visual/audio pair -> aligned vs shifted by 8 windows", | |
| "split": "chronological", | |
| "num_samples": 2306, | |
| "num_train_samples": 1614, | |
| "num_test_samples": 692, | |
| "feature_dim": 7511, | |
| "model": "neural_mlp", | |
| "head": "z-score -> MLP binary softmax", | |
| "neural_epochs": 80, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "neural_learning_rate": 0.001, | |
| "neural_weight_decay": 0.0001, | |
| "neural_dropout": 0.1, | |
| "neural_device": "cpu", | |
| "train_final_loss": 0.010604870708167664, | |
| "train_final_accuracy": 0.9956629491945477, | |
| "task_display_name": "Multimodal Synchronization Detection" | |
| } | |
| }, | |
| "task_display_names": { | |
| "timeline_action": "Action Recognition", | |
| "timeline_subtask": "Procedure Step Recognition", | |
| "transition_detection": "Action Boundary Detection", | |
| "next_action": "Next-Action Prediction", | |
| "hand_trajectory_forecast": "Hand Trajectory Forecasting", | |
| "contact_prediction": "Contact State Prediction", | |
| "object_relevance": "Object Relevance Prediction", | |
| "caption_grounding": "Language Grounding", | |
| "cross_modal_retrieval": "Cross-Modal Retrieval", | |
| "modality_reconstruction": "Cross-Modal Reconstruction", | |
| "temporal_order": "Temporal Order Verification", | |
| "misalignment_detection": "Multimodal Synchronization Detection" | |
| } | |
| }, | |
| "unified_task_count": 20, | |
| "feature_manifest": [ | |
| { | |
| "name": "hand left joints", | |
| "start": 0, | |
| "end": 441, | |
| "dim": 441 | |
| }, | |
| { | |
| "name": "hand right joints", | |
| "start": 441, | |
| "end": 882, | |
| "dim": 441 | |
| }, | |
| { | |
| "name": "body joints", | |
| "start": 882, | |
| "end": 1974, | |
| "dim": 1092 | |
| }, | |
| { | |
| "name": "body contacts", | |
| "start": 1974, | |
| "end": 2121, | |
| "dim": 147 | |
| }, | |
| { | |
| "name": "camera translation", | |
| "start": 2121, | |
| "end": 2142, | |
| "dim": 21 | |
| }, | |
| { | |
| "name": "camera rotation matrix", | |
| "start": 2142, | |
| "end": 2205, | |
| "dim": 63 | |
| }, | |
| { | |
| "name": "imu accel gyro", | |
| "start": 2205, | |
| "end": 2247, | |
| "dim": 42 | |
| }, | |
| { | |
| "name": "depth confidence", | |
| "start": 2247, | |
| "end": 3227, | |
| "dim": 980 | |
| }, | |
| { | |
| "name": "video fisheye cam0", | |
| "start": 3227, | |
| "end": 3913, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "video fisheye cam1", | |
| "start": 3913, | |
| "end": 4599, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "video fisheye cam2", | |
| "start": 4599, | |
| "end": 5285, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "video fisheye cam3", | |
| "start": 5285, | |
| "end": 5971, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "video stereo left", | |
| "start": 5971, | |
| "end": 6657, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "video stereo right", | |
| "start": 6657, | |
| "end": 7343, | |
| "dim": 686 | |
| }, | |
| { | |
| "name": "audio", | |
| "start": 7343, | |
| "end": 7511, | |
| "dim": 168 | |
| }, | |
| { | |
| "name": "language text", | |
| "start": 7511, | |
| "end": 8407, | |
| "dim": 896 | |
| }, | |
| { | |
| "name": "slam point cloud", | |
| "start": 8407, | |
| "end": 8429, | |
| "dim": 22 | |
| }, | |
| { | |
| "name": "calibration", | |
| "start": 8429, | |
| "end": 8546, | |
| "dim": 117 | |
| } | |
| ] | |
| } | |