{ "omni_relay": { "status": "pending_huggingface_gated_access", "dataset": "ropedia-ai/xperience-10m", "staging": "prepared_generic_host_to_host_transfer", "training_target": "external_multi_gpu_training_host", "selection_strategy": "stratified_round_robin_by_top_level_session", "target_episodes": 32, "selected_sessions": 32, "candidate_scan_top_level_sessions": 64, "valid_candidates": 680, "estimated_bytes": 72031620552, "exclude": [ "visualization.rrd" ], "access_status": "Hugging Face returns 403 pending review for the full Xperience-10M gated dataset.", "current_scope": "The 32-episode Qwen3-Omni fine-tune requires gated data staging and held-out evaluation." }, "models": { "motion_action": { "accuracy": 0.9828178694158075, "balanced_accuracy": 0.9643518518518519, "macro_f1": 0.96884342657456, "weighted_f1": 0.9824311468352843, "num_eval_windows": 291, "num_classes": 18, "majority_baseline_accuracy": 0.13745704467353953, "train_final_accuracy": 1.0, "train_final_loss": 0.019042566418647766 }, "motion_subtask": { "accuracy": 0.9758620689655172, "balanced_accuracy": 0.9783924095954172, "macro_f1": 0.9528048001232955, "weighted_f1": 0.9778836359351952, "num_eval_windows": 290, "num_classes": 14, "majority_baseline_accuracy": 0.14482758620689656, "train_final_accuracy": 1.0, "train_final_loss": 0.02664567530155182 }, "all_modalities_action": { "accuracy": 0.9828178694158075, "balanced_accuracy": 0.9800925925925925, "macro_f1": 0.9791023658779895, "weighted_f1": 0.98276563540562, "num_eval_windows": 291, "num_classes": 18, "majority_baseline_accuracy": 0.13745704467353953, "train_final_accuracy": 1.0, "train_final_loss": 0.014624637551605701, "feature_dim": 8378, "num_windows": 1144 }, "all_modalities_subtask": { "accuracy": 0.9827586206896551, "balanced_accuracy": 0.9505102040816327, "macro_f1": 0.9307645963773675, "weighted_f1": 0.9837987833808578, "num_eval_windows": 290, "num_classes": 14, "majority_baseline_accuracy": 0.14482758620689656, "train_final_accuracy": 1.0, "train_final_loss": 0.012823422439396381, "feature_dim": 8378, "num_windows": 1147 } }, "suite": { "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", "num_frames": 5821, "num_windows": 1161, "feature_dim": 8378, "window_frames": 20, "stride_frames": 5, "tasks": { "timeline_action": { "accuracy": 0.029154518950437316, "balanced_accuracy": 0.03125, "macro_f1": 0.05, "weighted_f1": 0.04664723032069971, "num_eval_windows": 343, "num_classes": 18, "task": "timeline_action", "input": "all modalities -> current action label", "split": "chronological", "num_windows": 1144, "num_train_windows": 801, "num_test_windows": 343, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.01664665900170803, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ] }, "timeline_subtask": { "accuracy": 0.05813953488372093, "balanced_accuracy": 0.05376979652090881, "macro_f1": 0.04954121121178666, "weighted_f1": 0.06731304264454903, "num_eval_windows": 344, "num_classes": 14, "task": "timeline_subtask", "input": "all modalities -> current subtask label", "split": "chronological", "num_windows": 1147, "num_train_windows": 803, "num_test_windows": 344, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.014040183275938034, "unseen_test_classes": [ "Move bottle to coffee equipment", "Pour coffee", "Pour milk into coffee", "Prepare for pouring" ] }, "transition_detection": { "accuracy": 0.9252873563218391, "balanced_accuracy": 0.6931475903614458, "macro_f1": 0.6551829268292684, "weighted_f1": 0.9323030557891787, "num_eval_windows": 348, "num_classes": 2, "task": "transition_detection", "input": "all modalities -> action boundary/steady", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8378, "majority_baseline_accuracy": 0.9540229885057471, "train_final_accuracy": 1.0, "train_final_loss": 0.007071746978908777, "unseen_test_classes": [], "boundary_precision": 0.125, "boundary_recall": 0.75, "boundary_f1": 0.21428571428571427, "matched_boundaries": 3, "true_boundaries": 4, "predicted_boundaries": 24, "mean_abs_timing_error_frames": 2.6666666666666665 }, "next_action": { "accuracy": 0.034482758620689655, "balanced_accuracy": 0.04, "macro_f1": 0.05925925925925927, "weighted_f1": 0.05108556832694764, "num_eval_windows": 348, "num_classes": 18, "task": "next_action", "input": "all modalities at t -> action at t+20 frames", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "train_final_accuracy": 1.0, "train_final_loss": 0.017629079520702362, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ] }, "hand_trajectory_forecast": { "mse": 11.323140144348145, "mae": 0.40246668457984924, "r2": -1334.788993815828, "task": "hand_trajectory_forecast", "input": "all modalities at t -> future left/right hand 3D joints", "split": "chronological", "num_windows": 1159, "num_train_windows": 811, "num_test_windows": 348, "forecast_frames": 10, "mpjpe": 0.8222644925117493, "final_frame_mpjpe": 1.0649521350860596, "target_dim": 1260 }, "contact_prediction": { "accuracy": 1.0, "balanced_accuracy": 1.0, "macro_f1": 1.0, "weighted_f1": 1.0, "num_eval_windows": 348, "num_classes": 1, "task": "contact_prediction", "input": "all non-contact/non-caption-label modalities -> any body contact", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 7335, "majority_baseline_accuracy": 1.0, "train_final_accuracy": 1.0, "train_final_loss": 0.0005947681493125856, "unseen_test_classes": [] }, "object_relevance": { "micro_f1": 0.18393030009680542, "macro_f1": 0.06427052187996415, "exact_match": 0.005747126436781609, "precision": 0.16360505166475317, "recall": 0.21002210759027265, "task": "object_relevance", "input": "all non-caption modalities -> current relevant object set", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "num_objects": 34 }, "caption_grounding": { "mrr": 0.017183946083791223, "median_rank": 167.0, "mean_rank": 174.39367816091954, "num_queries": 348, "top1_accuracy": 0.0028735632183908046, "top5_accuracy": 0.011494252873563218, "top10_accuracy": 0.017241379310344827, "task": "caption_grounding", "input": "caption objects/interaction text query + candidate sensor windows", "output": "matching time window", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348 }, "cross_modal_retrieval": { "mrr": 0.26335984006618296, "median_rank": 12.5, "mean_rank": 43.33045977011494, "num_queries": 348, "top1_accuracy": 0.14942528735632185, "top5_accuracy": 0.3764367816091954, "top10_accuracy": 0.47413793103448276, "task": "cross_modal_retrieval", "input": "motion/IMU/camera query", "output": "matching depth/video window", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348 }, "modality_reconstruction": { "mse": 1359.1639404296875, "mae": 0.31084805727005005, "r2": -0.016022846771134747, "task": "modality_reconstruction", "input": "motion/IMU/camera", "output": "depth/video feature vector", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096 }, "temporal_order": { "accuracy": 0.46120689655172414, "precision": 0.4720496894409938, "recall": 0.6551724137931034, "f1": 0.5487364620938628, "tp": 228, "tn": 93, "fp": 255, "fn": 120, "positive_rate_true": 0.5, "positive_rate_pred": 0.6939655172413793, "task": "temporal_order", "input": "two adjacent windows -> whether order is correct", "split": "chronological", "num_samples": 2320, "num_train_samples": 1624, "num_test_samples": 696, "train_final_accuracy": 0.5104679802955665 }, "misalignment_detection": { "accuracy": 0.5028901734104047, "precision": 0.5030864197530864, "recall": 0.47109826589595377, "f1": 0.4865671641791045, "tp": 163, "tn": 185, "fp": 161, "fn": 183, "positive_rate_true": 0.5, "positive_rate_pred": 0.4682080924855491, "task": "misalignment_detection", "input": "motion+visual pair -> aligned vs shifted by 8 windows", "split": "chronological", "num_samples": 2306, "num_train_samples": 1614, "num_test_samples": 692, "train_final_accuracy": 0.5018587360594795 } }, "neural_model": { "name": "neural_mlp", "type": "lightweight PyTorch MLP over shared window features", "epochs": 80, "hidden_dim": 128, "batch_size": 128, "learning_rate": 0.001, "weight_decay": 0.0001, "dropout": 0.1, "device": "auto" }, "neural_tasks": { "timeline_action": { "accuracy": 0.014577259475218658, "balanced_accuracy": 0.015625, "macro_f1": 0.02631578947368421, "weighted_f1": 0.024551173852999847, "num_eval_windows": 343, "num_classes": 18, "task": "timeline_action", "input": "all modalities -> current action label", "split": "chronological", "num_windows": 1144, "num_train_windows": 801, "num_test_windows": 343, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0001524056650931597, "train_final_accuracy": 1.0 }, "timeline_subtask": { "accuracy": 0.01744186046511628, "balanced_accuracy": 0.021052631578947368, "macro_f1": 0.017518248175182476, "weighted_f1": 0.014513664912578507, "num_eval_windows": 344, "num_classes": 14, "task": "timeline_subtask", "input": "all modalities -> current subtask label", "split": "chronological", "num_windows": 1147, "num_train_windows": 803, "num_test_windows": 344, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Move bottle to coffee equipment", "Pour coffee", "Pour milk into coffee", "Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.06660133146519678, "train_final_accuracy": 0.9912826899128269 }, "transition_detection": { "accuracy": 0.9310344827586207, "balanced_accuracy": 0.6664156626506024, "macro_f1": 0.6484848484848484, "weighted_f1": 0.9346569139672588, "num_eval_windows": 348, "num_classes": 2, "task": "transition_detection", "input": "all modalities -> action boundary/steady", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8378, "majority_baseline_accuracy": 0.9540229885057471, "unseen_test_classes": [], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.005629667796936003, "train_final_accuracy": 0.998769987699877, "boundary_precision": 0.1, "boundary_recall": 0.5, "boundary_f1": 0.16666666666666669, "matched_boundaries": 2, "true_boundaries": 4, "predicted_boundaries": 20, "mean_abs_timing_error_frames": 5.0 }, "next_action": { "accuracy": 0.011494252873563218, "balanced_accuracy": 0.013333333333333332, "macro_f1": 0.023529411764705882, "weighted_f1": 0.02028397565922921, "num_eval_windows": 348, "num_classes": 18, "task": "next_action", "input": "all modalities at t -> action at t+20 frames", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 8378, "majority_baseline_accuracy": 0.0, "unseen_test_classes": [ "Place item on table", "Pour coffee", "Pour milk into coffee", "Wait/Prepare for pouring" ], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0050763053797378156, "train_final_accuracy": 0.998769987699877 }, "hand_trajectory_forecast": { "mse": 0.005083972588181496, "mae": 0.055900074541568756, "r2": 0.40024460814419005, "task": "hand_trajectory_forecast", "input": "all modalities at t -> future left/right hand 3D joints", "split": "chronological", "num_windows": 1159, "num_train_windows": 811, "num_test_windows": 348, "forecast_frames": 10, "mpjpe": 0.11163123697042465, "final_frame_mpjpe": 0.11860372871160507, "target_dim": 1260, "model": "neural_mlp", "head": "z-score -> MLP regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.059780220692901516 }, "contact_prediction": { "accuracy": 1.0, "balanced_accuracy": 1.0, "macro_f1": 1.0, "weighted_f1": 1.0, "num_eval_windows": 348, "num_classes": 1, "task": "contact_prediction", "input": "all non-contact/non-caption-label modalities -> any body contact", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "feature_dim": 7335, "majority_baseline_accuracy": 1.0, "unseen_test_classes": [], "model": "neural_mlp", "head": "z-score -> MLP softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0, "train_final_accuracy": 1.0 }, "object_relevance": { "micro_f1": 0.1797583081570997, "macro_f1": 0.04958769134098823, "exact_match": 0.011494252873563218, "precision": 0.18435321456235476, "recall": 0.17538688282977155, "task": "object_relevance", "input": "all non-caption modalities -> current relevant object set", "split": "chronological", "num_windows": 1161, "num_train_windows": 813, "num_test_windows": 348, "num_objects": 34, "feature_dim": 7482, "model": "neural_mlp", "head": "z-score -> MLP sigmoid multilabel", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.0006806955548115865 }, "caption_grounding": { "mrr": 0.01781111161035397, "median_rank": 184.0, "mean_rank": 183.86206896551724, "num_queries": 348, "top1_accuracy": 0.005747126436781609, "top5_accuracy": 0.017241379310344827, "top10_accuracy": 0.02586206896551724, "task": "caption_grounding", "input": "caption objects/interaction text query + candidate sensor windows", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 896, "output": "matching time window", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.06571704036525254 }, "cross_modal_retrieval": { "mrr": 0.1530070022204131, "median_rank": 34.0, "mean_rank": 62.043103448275865, "num_queries": 348, "top1_accuracy": 0.07183908045977011, "top5_accuracy": 0.21551724137931033, "top10_accuracy": 0.3017241379310345, "task": "cross_modal_retrieval", "input": "motion/IMU/camera query", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096, "output": "matching depth/video window", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.2246821296537641 }, "modality_reconstruction": { "mse": 1351.3720703125, "mae": 0.10358995944261551, "r2": -0.010198171891414143, "task": "modality_reconstruction", "input": "motion/IMU/camera", "split": "chronological", "num_train_windows": 813, "num_test_windows": 348, "target_dim": 5096, "output": "depth/video feature vector", "model": "neural_mlp", "head": "z-score -> MLP projection/regression", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.2246821296537641 }, "temporal_order": { "accuracy": 0.8706896551724138, "precision": 0.864406779661017, "recall": 0.8793103448275862, "f1": 0.8717948717948718, "tp": 306, "tn": 300, "fp": 48, "fn": 42, "positive_rate_true": 0.5, "positive_rate_pred": 0.5086206896551724, "task": "temporal_order", "input": "two adjacent windows -> whether order is correct", "split": "chronological", "num_samples": 2320, "num_train_samples": 1624, "num_test_samples": 696, "feature_dim": 25134, "model": "neural_mlp", "head": "z-score -> MLP binary softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 8.5640803086261e-05, "train_final_accuracy": 1.0 }, "misalignment_detection": { "accuracy": 0.7312138728323699, "precision": 0.7272727272727273, "recall": 0.7398843930635838, "f1": 0.7335243553008597, "tp": 256, "tn": 250, "fp": 96, "fn": 90, "positive_rate_true": 0.5, "positive_rate_pred": 0.5086705202312138, "task": "misalignment_detection", "input": "motion+visual pair -> aligned vs shifted by 8 windows", "split": "chronological", "num_samples": 2306, "num_train_samples": 1614, "num_test_samples": 692, "feature_dim": 7343, "model": "neural_mlp", "head": "z-score -> MLP binary softmax", "neural_epochs": 80, "neural_hidden_dim": 128, "neural_batch_size": 128, "neural_learning_rate": 0.001, "neural_weight_decay": 0.0001, "neural_dropout": 0.1, "neural_device": "cpu", "train_final_loss": 0.01810159092443583, "train_final_accuracy": 0.993184634448575 } } }, "feature_manifest": [ { "name": "hand_left_joints", "start": 0, "end": 441, "dim": 441 }, { "name": "hand_right_joints", "start": 441, "end": 882, "dim": 441 }, { "name": "body_joints", "start": 882, "end": 1974, "dim": 1092 }, { "name": "body_contacts", "start": 1974, "end": 2121, "dim": 147 }, { "name": "camera_translation", "start": 2121, "end": 2142, "dim": 21 }, { "name": "camera_rotation_matrix", "start": 2142, "end": 2205, "dim": 63 }, { "name": "imu_accel_gyro", "start": 2205, "end": 2247, "dim": 42 }, { "name": "depth_confidence", "start": 2247, "end": 3227, "dim": 980 }, { "name": "video_fisheye_cam0", "start": 3227, "end": 3913, "dim": 686 }, { "name": "video_fisheye_cam1", "start": 3913, "end": 4599, "dim": 686 }, { "name": "video_fisheye_cam2", "start": 4599, "end": 5285, "dim": 686 }, { "name": "video_fisheye_cam3", "start": 5285, "end": 5971, "dim": 686 }, { "name": "video_stereo_left", "start": 5971, "end": 6657, "dim": 686 }, { "name": "video_stereo_right", "start": 6657, "end": 7343, "dim": 686 }, { "name": "caption_objects_interaction_text", "start": 7343, "end": 8239, "dim": 896 }, { "name": "slam_point_cloud", "start": 8239, "end": 8261, "dim": 22 }, { "name": "calibration", "start": 8261, "end": 8378, "dim": 117 } ] }