{
  "annotation": "data/sample/xperience-10m-sample/annotation.hdf5",
  "num_frames": 5821,
  "num_windows": 1161,
  "feature_dim": 8378,
  "window_frames": 20,
  "stride_frames": 5,
  "tasks": {
    "timeline_action": {
      "accuracy": 0.029154518950437316,
      "balanced_accuracy": 0.03125,
      "macro_f1": 0.05,
      "weighted_f1": 0.04664723032069971,
      "num_eval_windows": 343,
      "num_classes": 18,
      "task": "timeline_action",
      "input": "all modalities -> current action label",
      "split": "chronological",
      "num_windows": 1144,
      "num_train_windows": 801,
      "num_test_windows": 343,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.01664665900170803,
      "unseen_test_classes": [
        "Place item on table",
        "Pour coffee",
        "Pour milk into coffee",
        "Wait/Prepare for pouring"
      ]
    },
    "timeline_subtask": {
      "accuracy": 0.05813953488372093,
      "balanced_accuracy": 0.05376979652090881,
      "macro_f1": 0.04954121121178666,
      "weighted_f1": 0.06731304264454903,
      "num_eval_windows": 344,
      "num_classes": 14,
      "task": "timeline_subtask",
      "input": "all modalities -> current subtask label",
      "split": "chronological",
      "num_windows": 1147,
      "num_train_windows": 803,
      "num_test_windows": 344,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.014040183275938034,
      "unseen_test_classes": [
        "Move bottle to coffee equipment",
        "Pour coffee",
        "Pour milk into coffee",
        "Prepare for pouring"
      ]
    },
    "transition_detection": {
      "accuracy": 0.9252873563218391,
      "balanced_accuracy": 0.6931475903614458,
      "macro_f1": 0.6551829268292684,
      "weighted_f1": 0.9323030557891787,
      "num_eval_windows": 348,
      "num_classes": 2,
      "task": "transition_detection",
      "input": "all modalities -> action boundary/steady",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.9540229885057471,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.007071746978908777,
      "unseen_test_classes": [],
      "boundary_precision": 0.125,
      "boundary_recall": 0.75,
      "boundary_f1": 0.21428571428571427,
      "matched_boundaries": 3,
      "true_boundaries": 4,
      "predicted_boundaries": 24,
      "mean_abs_timing_error_frames": 2.6666666666666665
    },
    "next_action": {
      "accuracy": 0.034482758620689655,
      "balanced_accuracy": 0.04,
      "macro_f1": 0.05925925925925927,
      "weighted_f1": 0.05108556832694764,
      "num_eval_windows": 348,
      "num_classes": 18,
      "task": "next_action",
      "input": "all modalities at t -> action at t+20 frames",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.017629079520702362,
      "unseen_test_classes": [
        "Place item on table",
        "Pour coffee",
        "Pour milk into coffee",
        "Wait/Prepare for pouring"
      ]
    },
    "hand_trajectory_forecast": {
      "mse": 11.323140144348145,
      "mae": 0.40246668457984924,
      "r2": -1334.788993815828,
      "task": "hand_trajectory_forecast",
      "input": "all modalities at t -> future left/right hand 3D joints",
      "split": "chronological",
      "num_windows": 1159,
      "num_train_windows": 811,
      "num_test_windows": 348,
      "forecast_frames": 10,
      "mpjpe": 0.8222644925117493,
      "final_frame_mpjpe": 1.0649521350860596,
      "target_dim": 1260
    },
    "contact_prediction": {
      "accuracy": 1.0,
      "balanced_accuracy": 1.0,
      "macro_f1": 1.0,
      "weighted_f1": 1.0,
      "num_eval_windows": 348,
      "num_classes": 1,
      "task": "contact_prediction",
      "input": "all non-contact/non-caption-label modalities -> any body contact",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 7335,
      "majority_baseline_accuracy": 1.0,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.0005947681493125856,
      "unseen_test_classes": []
    },
    "object_relevance": {
      "micro_f1": 0.18393030009680542,
      "macro_f1": 0.06427052187996415,
      "exact_match": 0.005747126436781609,
      "precision": 0.16360505166475317,
      "recall": 0.21002210759027265,
      "task": "object_relevance",
      "input": "all non-caption modalities -> current relevant object set",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "num_objects": 34
    },
    "caption_grounding": {
      "mrr": 0.017183946083791223,
      "median_rank": 167.0,
      "mean_rank": 174.39367816091954,
      "num_queries": 348,
      "top1_accuracy": 0.0028735632183908046,
      "top5_accuracy": 0.011494252873563218,
      "top10_accuracy": 0.017241379310344827,
      "task": "caption_grounding",
      "input": "caption objects/interaction text query + candidate sensor windows",
      "output": "matching time window",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348
    },
    "cross_modal_retrieval": {
      "mrr": 0.26335984006618296,
      "median_rank": 12.5,
      "mean_rank": 43.33045977011494,
      "num_queries": 348,
      "top1_accuracy": 0.14942528735632185,
      "top5_accuracy": 0.3764367816091954,
      "top10_accuracy": 0.47413793103448276,
      "task": "cross_modal_retrieval",
      "input": "motion/IMU/camera query",
      "output": "matching depth/video window",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348
    },
    "modality_reconstruction": {
      "mse": 1359.1639404296875,
      "mae": 0.31084805727005005,
      "r2": -0.016022846771134747,
      "task": "modality_reconstruction",
      "input": "motion/IMU/camera",
      "output": "depth/video feature vector",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348,
      "target_dim": 5096
    },
    "temporal_order": {
      "accuracy": 0.46120689655172414,
      "precision": 0.4720496894409938,
      "recall": 0.6551724137931034,
      "f1": 0.5487364620938628,
      "tp": 228,
      "tn": 93,
      "fp": 255,
      "fn": 120,
      "positive_rate_true": 0.5,
      "positive_rate_pred": 0.6939655172413793,
      "task": "temporal_order",
      "input": "two adjacent windows -> whether order is correct",
      "split": "chronological",
      "num_samples": 2320,
      "num_train_samples": 1624,
      "num_test_samples": 696,
      "train_final_accuracy": 0.5104679802955665
    },
    "misalignment_detection": {
      "accuracy": 0.5028901734104047,
      "precision": 0.5030864197530864,
      "recall": 0.47109826589595377,
      "f1": 0.4865671641791045,
      "tp": 163,
      "tn": 185,
      "fp": 161,
      "fn": 183,
      "positive_rate_true": 0.5,
      "positive_rate_pred": 0.4682080924855491,
      "task": "misalignment_detection",
      "input": "motion+visual pair -> aligned vs shifted by 8 windows",
      "split": "chronological",
      "num_samples": 2306,
      "num_train_samples": 1614,
      "num_test_samples": 692,
      "train_final_accuracy": 0.5018587360594795
    }
  },
  "neural_model": {
    "name": "neural_mlp",
    "type": "lightweight PyTorch MLP over shared window features",
    "epochs": 80,
    "hidden_dim": 128,
    "batch_size": 128,
    "learning_rate": 0.001,
    "weight_decay": 0.0001,
    "dropout": 0.1,
    "device": "auto"
  },
  "neural_tasks": {
    "timeline_action": {
      "accuracy": 0.014577259475218658,
      "balanced_accuracy": 0.015625,
      "macro_f1": 0.02631578947368421,
      "weighted_f1": 0.024551173852999847,
      "num_eval_windows": 343,
      "num_classes": 18,
      "task": "timeline_action",
      "input": "all modalities -> current action label",
      "split": "chronological",
      "num_windows": 1144,
      "num_train_windows": 801,
      "num_test_windows": 343,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "unseen_test_classes": [
        "Place item on table",
        "Pour coffee",
        "Pour milk into coffee",
        "Wait/Prepare for pouring"
      ],
      "model": "neural_mlp",
      "head": "z-score -> MLP softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.0001524056650931597,
      "train_final_accuracy": 1.0
    },
    "timeline_subtask": {
      "accuracy": 0.01744186046511628,
      "balanced_accuracy": 0.021052631578947368,
      "macro_f1": 0.017518248175182476,
      "weighted_f1": 0.014513664912578507,
      "num_eval_windows": 344,
      "num_classes": 14,
      "task": "timeline_subtask",
      "input": "all modalities -> current subtask label",
      "split": "chronological",
      "num_windows": 1147,
      "num_train_windows": 803,
      "num_test_windows": 344,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "unseen_test_classes": [
        "Move bottle to coffee equipment",
        "Pour coffee",
        "Pour milk into coffee",
        "Prepare for pouring"
      ],
      "model": "neural_mlp",
      "head": "z-score -> MLP softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.06660133146519678,
      "train_final_accuracy": 0.9912826899128269
    },
    "transition_detection": {
      "accuracy": 0.9310344827586207,
      "balanced_accuracy": 0.6664156626506024,
      "macro_f1": 0.6484848484848484,
      "weighted_f1": 0.9346569139672588,
      "num_eval_windows": 348,
      "num_classes": 2,
      "task": "transition_detection",
      "input": "all modalities -> action boundary/steady",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.9540229885057471,
      "unseen_test_classes": [],
      "model": "neural_mlp",
      "head": "z-score -> MLP softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.005629667796936003,
      "train_final_accuracy": 0.998769987699877,
      "boundary_precision": 0.1,
      "boundary_recall": 0.5,
      "boundary_f1": 0.16666666666666669,
      "matched_boundaries": 2,
      "true_boundaries": 4,
      "predicted_boundaries": 20,
      "mean_abs_timing_error_frames": 5.0
    },
    "next_action": {
      "accuracy": 0.011494252873563218,
      "balanced_accuracy": 0.013333333333333332,
      "macro_f1": 0.023529411764705882,
      "weighted_f1": 0.02028397565922921,
      "num_eval_windows": 348,
      "num_classes": 18,
      "task": "next_action",
      "input": "all modalities at t -> action at t+20 frames",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 8378,
      "majority_baseline_accuracy": 0.0,
      "unseen_test_classes": [
        "Place item on table",
        "Pour coffee",
        "Pour milk into coffee",
        "Wait/Prepare for pouring"
      ],
      "model": "neural_mlp",
      "head": "z-score -> MLP softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.0050763053797378156,
      "train_final_accuracy": 0.998769987699877
    },
    "hand_trajectory_forecast": {
      "mse": 0.005083972588181496,
      "mae": 0.055900074541568756,
      "r2": 0.40024460814419005,
      "task": "hand_trajectory_forecast",
      "input": "all modalities at t -> future left/right hand 3D joints",
      "split": "chronological",
      "num_windows": 1159,
      "num_train_windows": 811,
      "num_test_windows": 348,
      "forecast_frames": 10,
      "mpjpe": 0.11163123697042465,
      "final_frame_mpjpe": 0.11860372871160507,
      "target_dim": 1260,
      "model": "neural_mlp",
      "head": "z-score -> MLP regression",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.059780220692901516
    },
    "contact_prediction": {
      "accuracy": 1.0,
      "balanced_accuracy": 1.0,
      "macro_f1": 1.0,
      "weighted_f1": 1.0,
      "num_eval_windows": 348,
      "num_classes": 1,
      "task": "contact_prediction",
      "input": "all non-contact/non-caption-label modalities -> any body contact",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "feature_dim": 7335,
      "majority_baseline_accuracy": 1.0,
      "unseen_test_classes": [],
      "model": "neural_mlp",
      "head": "z-score -> MLP softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.0,
      "train_final_accuracy": 1.0
    },
    "object_relevance": {
      "micro_f1": 0.1797583081570997,
      "macro_f1": 0.04958769134098823,
      "exact_match": 0.011494252873563218,
      "precision": 0.18435321456235476,
      "recall": 0.17538688282977155,
      "task": "object_relevance",
      "input": "all non-caption modalities -> current relevant object set",
      "split": "chronological",
      "num_windows": 1161,
      "num_train_windows": 813,
      "num_test_windows": 348,
      "num_objects": 34,
      "feature_dim": 7482,
      "model": "neural_mlp",
      "head": "z-score -> MLP sigmoid multilabel",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.0006806955548115865
    },
    "caption_grounding": {
      "mrr": 0.01781111161035397,
      "median_rank": 184.0,
      "mean_rank": 183.86206896551724,
      "num_queries": 348,
      "top1_accuracy": 0.005747126436781609,
      "top5_accuracy": 0.017241379310344827,
      "top10_accuracy": 0.02586206896551724,
      "task": "caption_grounding",
      "input": "caption objects/interaction text query + candidate sensor windows",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348,
      "target_dim": 896,
      "output": "matching time window",
      "model": "neural_mlp",
      "head": "z-score -> MLP projection/regression",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.06571704036525254
    },
    "cross_modal_retrieval": {
      "mrr": 0.1530070022204131,
      "median_rank": 34.0,
      "mean_rank": 62.043103448275865,
      "num_queries": 348,
      "top1_accuracy": 0.07183908045977011,
      "top5_accuracy": 0.21551724137931033,
      "top10_accuracy": 0.3017241379310345,
      "task": "cross_modal_retrieval",
      "input": "motion/IMU/camera query",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348,
      "target_dim": 5096,
      "output": "matching depth/video window",
      "model": "neural_mlp",
      "head": "z-score -> MLP projection/regression",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.2246821296537641
    },
    "modality_reconstruction": {
      "mse": 1351.3720703125,
      "mae": 0.10358995944261551,
      "r2": -0.010198171891414143,
      "task": "modality_reconstruction",
      "input": "motion/IMU/camera",
      "split": "chronological",
      "num_train_windows": 813,
      "num_test_windows": 348,
      "target_dim": 5096,
      "output": "depth/video feature vector",
      "model": "neural_mlp",
      "head": "z-score -> MLP projection/regression",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.2246821296537641
    },
    "temporal_order": {
      "accuracy": 0.8706896551724138,
      "precision": 0.864406779661017,
      "recall": 0.8793103448275862,
      "f1": 0.8717948717948718,
      "tp": 306,
      "tn": 300,
      "fp": 48,
      "fn": 42,
      "positive_rate_true": 0.5,
      "positive_rate_pred": 0.5086206896551724,
      "task": "temporal_order",
      "input": "two adjacent windows -> whether order is correct",
      "split": "chronological",
      "num_samples": 2320,
      "num_train_samples": 1624,
      "num_test_samples": 696,
      "feature_dim": 25134,
      "model": "neural_mlp",
      "head": "z-score -> MLP binary softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 8.5640803086261e-05,
      "train_final_accuracy": 1.0
    },
    "misalignment_detection": {
      "accuracy": 0.7312138728323699,
      "precision": 0.7272727272727273,
      "recall": 0.7398843930635838,
      "f1": 0.7335243553008597,
      "tp": 256,
      "tn": 250,
      "fp": 96,
      "fn": 90,
      "positive_rate_true": 0.5,
      "positive_rate_pred": 0.5086705202312138,
      "task": "misalignment_detection",
      "input": "motion+visual pair -> aligned vs shifted by 8 windows",
      "split": "chronological",
      "num_samples": 2306,
      "num_train_samples": 1614,
      "num_test_samples": 692,
      "feature_dim": 7343,
      "model": "neural_mlp",
      "head": "z-score -> MLP binary softmax",
      "neural_epochs": 80,
      "neural_hidden_dim": 128,
      "neural_batch_size": 128,
      "neural_learning_rate": 0.001,
      "neural_weight_decay": 0.0001,
      "neural_dropout": 0.1,
      "neural_device": "cpu",
      "train_final_loss": 0.01810159092443583,
      "train_final_accuracy": 0.993184634448575
    }
  }
}