cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
b7334ff verified
Raw
History Blame
25.1 kB
{
"omni_relay": {
"status": "pending_huggingface_gated_access",
"dataset": "ropedia-ai/xperience-10m",
"relay_server": "ANGEL-A100-80Gx4",
"training_server": "ANGEL-H20-96GX8",
"selection_strategy": "stratified_round_robin_by_top_level_session",
"target_episodes": 32,
"selected_sessions": 32,
"candidate_scan_top_level_sessions": 64,
"valid_candidates": 680,
"estimated_bytes": 72031620552,
"exclude": [
"visualization.rrd"
],
"blocker": "Hugging Face returns 403 pending review for the full Xperience-10M gated dataset.",
"claim_boundary": "No real 32-episode fine-tune is claimed until the watcher downloads data, transfers it to H20, and the held-out evaluation runs."
},
"models": {
"motion_action": {
"accuracy": 0.9828178694158075,
"balanced_accuracy": 0.9643518518518519,
"macro_f1": 0.96884342657456,
"weighted_f1": 0.9824311468352843,
"num_eval_windows": 291,
"num_classes": 18,
"majority_baseline_accuracy": 0.13745704467353953,
"train_final_accuracy": 1.0,
"train_final_loss": 0.019042566418647766
},
"motion_subtask": {
"accuracy": 0.9758620689655172,
"balanced_accuracy": 0.9783924095954172,
"macro_f1": 0.9528048001232955,
"weighted_f1": 0.9778836359351952,
"num_eval_windows": 290,
"num_classes": 14,
"majority_baseline_accuracy": 0.14482758620689656,
"train_final_accuracy": 1.0,
"train_final_loss": 0.02664567530155182
},
"all_modalities_action": {
"accuracy": 0.9828178694158075,
"balanced_accuracy": 0.9800925925925925,
"macro_f1": 0.9791023658779895,
"weighted_f1": 0.98276563540562,
"num_eval_windows": 291,
"num_classes": 18,
"majority_baseline_accuracy": 0.13745704467353953,
"train_final_accuracy": 1.0,
"train_final_loss": 0.014624637551605701,
"feature_dim": 8378,
"num_windows": 1144
},
"all_modalities_subtask": {
"accuracy": 0.9827586206896551,
"balanced_accuracy": 0.9505102040816327,
"macro_f1": 0.9307645963773675,
"weighted_f1": 0.9837987833808578,
"num_eval_windows": 290,
"num_classes": 14,
"majority_baseline_accuracy": 0.14482758620689656,
"train_final_accuracy": 1.0,
"train_final_loss": 0.012823422439396381,
"feature_dim": 8378,
"num_windows": 1147
}
},
"suite": {
"annotation": "data/sample/xperience-10m-sample/annotation.hdf5",
"num_frames": 5821,
"num_windows": 1161,
"feature_dim": 8378,
"window_frames": 20,
"stride_frames": 5,
"tasks": {
"timeline_action": {
"accuracy": 0.029154518950437316,
"balanced_accuracy": 0.03125,
"macro_f1": 0.05,
"weighted_f1": 0.04664723032069971,
"num_eval_windows": 343,
"num_classes": 18,
"task": "timeline_action",
"input": "all modalities -> current action label",
"split": "chronological",
"num_windows": 1144,
"num_train_windows": 801,
"num_test_windows": 343,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"train_final_accuracy": 1.0,
"train_final_loss": 0.01664665900170803,
"unseen_test_classes": [
"Place item on table",
"Pour coffee",
"Pour milk into coffee",
"Wait/Prepare for pouring"
]
},
"timeline_subtask": {
"accuracy": 0.05813953488372093,
"balanced_accuracy": 0.05376979652090881,
"macro_f1": 0.04954121121178666,
"weighted_f1": 0.06731304264454903,
"num_eval_windows": 344,
"num_classes": 14,
"task": "timeline_subtask",
"input": "all modalities -> current subtask label",
"split": "chronological",
"num_windows": 1147,
"num_train_windows": 803,
"num_test_windows": 344,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"train_final_accuracy": 1.0,
"train_final_loss": 0.014040183275938034,
"unseen_test_classes": [
"Move bottle to coffee equipment",
"Pour coffee",
"Pour milk into coffee",
"Prepare for pouring"
]
},
"transition_detection": {
"accuracy": 0.9252873563218391,
"balanced_accuracy": 0.6931475903614458,
"macro_f1": 0.6551829268292684,
"weighted_f1": 0.9323030557891787,
"num_eval_windows": 348,
"num_classes": 2,
"task": "transition_detection",
"input": "all modalities -> action boundary/steady",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.9540229885057471,
"train_final_accuracy": 1.0,
"train_final_loss": 0.007071746978908777,
"unseen_test_classes": [],
"boundary_precision": 0.125,
"boundary_recall": 0.75,
"boundary_f1": 0.21428571428571427,
"matched_boundaries": 3,
"true_boundaries": 4,
"predicted_boundaries": 24,
"mean_abs_timing_error_frames": 2.6666666666666665
},
"next_action": {
"accuracy": 0.034482758620689655,
"balanced_accuracy": 0.04,
"macro_f1": 0.05925925925925927,
"weighted_f1": 0.05108556832694764,
"num_eval_windows": 348,
"num_classes": 18,
"task": "next_action",
"input": "all modalities at t -> action at t+20 frames",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"train_final_accuracy": 1.0,
"train_final_loss": 0.017629079520702362,
"unseen_test_classes": [
"Place item on table",
"Pour coffee",
"Pour milk into coffee",
"Wait/Prepare for pouring"
]
},
"hand_trajectory_forecast": {
"mse": 11.323140144348145,
"mae": 0.40246668457984924,
"r2": -1334.788993815828,
"task": "hand_trajectory_forecast",
"input": "all modalities at t -> future left/right hand 3D joints",
"split": "chronological",
"num_windows": 1159,
"num_train_windows": 811,
"num_test_windows": 348,
"forecast_frames": 10,
"mpjpe": 0.8222644925117493,
"final_frame_mpjpe": 1.0649521350860596,
"target_dim": 1260
},
"contact_prediction": {
"accuracy": 1.0,
"balanced_accuracy": 1.0,
"macro_f1": 1.0,
"weighted_f1": 1.0,
"num_eval_windows": 348,
"num_classes": 1,
"task": "contact_prediction",
"input": "all non-contact/non-caption-label modalities -> any body contact",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 7335,
"majority_baseline_accuracy": 1.0,
"train_final_accuracy": 1.0,
"train_final_loss": 0.0005947681493125856,
"unseen_test_classes": []
},
"object_relevance": {
"micro_f1": 0.18393030009680542,
"macro_f1": 0.06427052187996415,
"exact_match": 0.005747126436781609,
"precision": 0.16360505166475317,
"recall": 0.21002210759027265,
"task": "object_relevance",
"input": "all non-caption modalities -> current relevant object set",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"num_objects": 34
},
"caption_grounding": {
"mrr": 0.017183946083791223,
"median_rank": 167.0,
"mean_rank": 174.39367816091954,
"num_queries": 348,
"top1_accuracy": 0.0028735632183908046,
"top5_accuracy": 0.011494252873563218,
"top10_accuracy": 0.017241379310344827,
"task": "caption_grounding",
"input": "caption objects/interaction text query + candidate sensor windows",
"output": "matching time window",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348
},
"cross_modal_retrieval": {
"mrr": 0.26335984006618296,
"median_rank": 12.5,
"mean_rank": 43.33045977011494,
"num_queries": 348,
"top1_accuracy": 0.14942528735632185,
"top5_accuracy": 0.3764367816091954,
"top10_accuracy": 0.47413793103448276,
"task": "cross_modal_retrieval",
"input": "motion/IMU/camera query",
"output": "matching depth/video window",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348
},
"modality_reconstruction": {
"mse": 1359.1639404296875,
"mae": 0.31084805727005005,
"r2": -0.016022846771134747,
"task": "modality_reconstruction",
"input": "motion/IMU/camera",
"output": "depth/video feature vector",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348,
"target_dim": 5096
},
"temporal_order": {
"accuracy": 0.46120689655172414,
"precision": 0.4720496894409938,
"recall": 0.6551724137931034,
"f1": 0.5487364620938628,
"tp": 228,
"tn": 93,
"fp": 255,
"fn": 120,
"positive_rate_true": 0.5,
"positive_rate_pred": 0.6939655172413793,
"task": "temporal_order",
"input": "two adjacent windows -> whether order is correct",
"split": "chronological",
"num_samples": 2320,
"num_train_samples": 1624,
"num_test_samples": 696,
"train_final_accuracy": 0.5104679802955665
},
"misalignment_detection": {
"accuracy": 0.5028901734104047,
"precision": 0.5030864197530864,
"recall": 0.47109826589595377,
"f1": 0.4865671641791045,
"tp": 163,
"tn": 185,
"fp": 161,
"fn": 183,
"positive_rate_true": 0.5,
"positive_rate_pred": 0.4682080924855491,
"task": "misalignment_detection",
"input": "motion+visual pair -> aligned vs shifted by 8 windows",
"split": "chronological",
"num_samples": 2306,
"num_train_samples": 1614,
"num_test_samples": 692,
"train_final_accuracy": 0.5018587360594795
}
},
"neural_model": {
"name": "neural_mlp",
"type": "lightweight PyTorch MLP over shared window features",
"epochs": 80,
"hidden_dim": 128,
"batch_size": 128,
"learning_rate": 0.001,
"weight_decay": 0.0001,
"dropout": 0.1,
"device": "auto"
},
"neural_tasks": {
"timeline_action": {
"accuracy": 0.014577259475218658,
"balanced_accuracy": 0.015625,
"macro_f1": 0.02631578947368421,
"weighted_f1": 0.024551173852999847,
"num_eval_windows": 343,
"num_classes": 18,
"task": "timeline_action",
"input": "all modalities -> current action label",
"split": "chronological",
"num_windows": 1144,
"num_train_windows": 801,
"num_test_windows": 343,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"unseen_test_classes": [
"Place item on table",
"Pour coffee",
"Pour milk into coffee",
"Wait/Prepare for pouring"
],
"model": "neural_mlp",
"head": "z-score -> MLP softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.0001524056650931597,
"train_final_accuracy": 1.0
},
"timeline_subtask": {
"accuracy": 0.01744186046511628,
"balanced_accuracy": 0.021052631578947368,
"macro_f1": 0.017518248175182476,
"weighted_f1": 0.014513664912578507,
"num_eval_windows": 344,
"num_classes": 14,
"task": "timeline_subtask",
"input": "all modalities -> current subtask label",
"split": "chronological",
"num_windows": 1147,
"num_train_windows": 803,
"num_test_windows": 344,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"unseen_test_classes": [
"Move bottle to coffee equipment",
"Pour coffee",
"Pour milk into coffee",
"Prepare for pouring"
],
"model": "neural_mlp",
"head": "z-score -> MLP softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.06660133146519678,
"train_final_accuracy": 0.9912826899128269
},
"transition_detection": {
"accuracy": 0.9310344827586207,
"balanced_accuracy": 0.6664156626506024,
"macro_f1": 0.6484848484848484,
"weighted_f1": 0.9346569139672588,
"num_eval_windows": 348,
"num_classes": 2,
"task": "transition_detection",
"input": "all modalities -> action boundary/steady",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.9540229885057471,
"unseen_test_classes": [],
"model": "neural_mlp",
"head": "z-score -> MLP softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.005629667796936003,
"train_final_accuracy": 0.998769987699877,
"boundary_precision": 0.1,
"boundary_recall": 0.5,
"boundary_f1": 0.16666666666666669,
"matched_boundaries": 2,
"true_boundaries": 4,
"predicted_boundaries": 20,
"mean_abs_timing_error_frames": 5.0
},
"next_action": {
"accuracy": 0.011494252873563218,
"balanced_accuracy": 0.013333333333333332,
"macro_f1": 0.023529411764705882,
"weighted_f1": 0.02028397565922921,
"num_eval_windows": 348,
"num_classes": 18,
"task": "next_action",
"input": "all modalities at t -> action at t+20 frames",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 8378,
"majority_baseline_accuracy": 0.0,
"unseen_test_classes": [
"Place item on table",
"Pour coffee",
"Pour milk into coffee",
"Wait/Prepare for pouring"
],
"model": "neural_mlp",
"head": "z-score -> MLP softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.0050763053797378156,
"train_final_accuracy": 0.998769987699877
},
"hand_trajectory_forecast": {
"mse": 0.005083972588181496,
"mae": 0.055900074541568756,
"r2": 0.40024460814419005,
"task": "hand_trajectory_forecast",
"input": "all modalities at t -> future left/right hand 3D joints",
"split": "chronological",
"num_windows": 1159,
"num_train_windows": 811,
"num_test_windows": 348,
"forecast_frames": 10,
"mpjpe": 0.11163123697042465,
"final_frame_mpjpe": 0.11860372871160507,
"target_dim": 1260,
"model": "neural_mlp",
"head": "z-score -> MLP regression",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.059780220692901516
},
"contact_prediction": {
"accuracy": 1.0,
"balanced_accuracy": 1.0,
"macro_f1": 1.0,
"weighted_f1": 1.0,
"num_eval_windows": 348,
"num_classes": 1,
"task": "contact_prediction",
"input": "all non-contact/non-caption-label modalities -> any body contact",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"feature_dim": 7335,
"majority_baseline_accuracy": 1.0,
"unseen_test_classes": [],
"model": "neural_mlp",
"head": "z-score -> MLP softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.0,
"train_final_accuracy": 1.0
},
"object_relevance": {
"micro_f1": 0.1797583081570997,
"macro_f1": 0.04958769134098823,
"exact_match": 0.011494252873563218,
"precision": 0.18435321456235476,
"recall": 0.17538688282977155,
"task": "object_relevance",
"input": "all non-caption modalities -> current relevant object set",
"split": "chronological",
"num_windows": 1161,
"num_train_windows": 813,
"num_test_windows": 348,
"num_objects": 34,
"feature_dim": 7482,
"model": "neural_mlp",
"head": "z-score -> MLP sigmoid multilabel",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.0006806955548115865
},
"caption_grounding": {
"mrr": 0.01781111161035397,
"median_rank": 184.0,
"mean_rank": 183.86206896551724,
"num_queries": 348,
"top1_accuracy": 0.005747126436781609,
"top5_accuracy": 0.017241379310344827,
"top10_accuracy": 0.02586206896551724,
"task": "caption_grounding",
"input": "caption objects/interaction text query + candidate sensor windows",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348,
"target_dim": 896,
"output": "matching time window",
"model": "neural_mlp",
"head": "z-score -> MLP projection/regression",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.06571704036525254
},
"cross_modal_retrieval": {
"mrr": 0.1530070022204131,
"median_rank": 34.0,
"mean_rank": 62.043103448275865,
"num_queries": 348,
"top1_accuracy": 0.07183908045977011,
"top5_accuracy": 0.21551724137931033,
"top10_accuracy": 0.3017241379310345,
"task": "cross_modal_retrieval",
"input": "motion/IMU/camera query",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348,
"target_dim": 5096,
"output": "matching depth/video window",
"model": "neural_mlp",
"head": "z-score -> MLP projection/regression",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.2246821296537641
},
"modality_reconstruction": {
"mse": 1351.3720703125,
"mae": 0.10358995944261551,
"r2": -0.010198171891414143,
"task": "modality_reconstruction",
"input": "motion/IMU/camera",
"split": "chronological",
"num_train_windows": 813,
"num_test_windows": 348,
"target_dim": 5096,
"output": "depth/video feature vector",
"model": "neural_mlp",
"head": "z-score -> MLP projection/regression",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.2246821296537641
},
"temporal_order": {
"accuracy": 0.8706896551724138,
"precision": 0.864406779661017,
"recall": 0.8793103448275862,
"f1": 0.8717948717948718,
"tp": 306,
"tn": 300,
"fp": 48,
"fn": 42,
"positive_rate_true": 0.5,
"positive_rate_pred": 0.5086206896551724,
"task": "temporal_order",
"input": "two adjacent windows -> whether order is correct",
"split": "chronological",
"num_samples": 2320,
"num_train_samples": 1624,
"num_test_samples": 696,
"feature_dim": 25134,
"model": "neural_mlp",
"head": "z-score -> MLP binary softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 8.5640803086261e-05,
"train_final_accuracy": 1.0
},
"misalignment_detection": {
"accuracy": 0.7312138728323699,
"precision": 0.7272727272727273,
"recall": 0.7398843930635838,
"f1": 0.7335243553008597,
"tp": 256,
"tn": 250,
"fp": 96,
"fn": 90,
"positive_rate_true": 0.5,
"positive_rate_pred": 0.5086705202312138,
"task": "misalignment_detection",
"input": "motion+visual pair -> aligned vs shifted by 8 windows",
"split": "chronological",
"num_samples": 2306,
"num_train_samples": 1614,
"num_test_samples": 692,
"feature_dim": 7343,
"model": "neural_mlp",
"head": "z-score -> MLP binary softmax",
"neural_epochs": 80,
"neural_hidden_dim": 128,
"neural_batch_size": 128,
"neural_learning_rate": 0.001,
"neural_weight_decay": 0.0001,
"neural_dropout": 0.1,
"neural_device": "cpu",
"train_final_loss": 0.01810159092443583,
"train_final_accuracy": 0.993184634448575
}
}
},
"feature_manifest": [
{
"name": "hand_left_joints",
"start": 0,
"end": 441,
"dim": 441
},
{
"name": "hand_right_joints",
"start": 441,
"end": 882,
"dim": 441
},
{
"name": "body_joints",
"start": 882,
"end": 1974,
"dim": 1092
},
{
"name": "body_contacts",
"start": 1974,
"end": 2121,
"dim": 147
},
{
"name": "camera_translation",
"start": 2121,
"end": 2142,
"dim": 21
},
{
"name": "camera_rotation_matrix",
"start": 2142,
"end": 2205,
"dim": 63
},
{
"name": "imu_accel_gyro",
"start": 2205,
"end": 2247,
"dim": 42
},
{
"name": "depth_confidence",
"start": 2247,
"end": 3227,
"dim": 980
},
{
"name": "video_fisheye_cam0",
"start": 3227,
"end": 3913,
"dim": 686
},
{
"name": "video_fisheye_cam1",
"start": 3913,
"end": 4599,
"dim": 686
},
{
"name": "video_fisheye_cam2",
"start": 4599,
"end": 5285,
"dim": 686
},
{
"name": "video_fisheye_cam3",
"start": 5285,
"end": 5971,
"dim": 686
},
{
"name": "video_stereo_left",
"start": 5971,
"end": 6657,
"dim": 686
},
{
"name": "video_stereo_right",
"start": 6657,
"end": 7343,
"dim": 686
},
{
"name": "caption_objects_interaction_text",
"start": 7343,
"end": 8239,
"dim": 896
},
{
"name": "slam_point_cloud",
"start": 8239,
"end": 8261,
"dim": 22
},
{
"name": "calibration",
"start": 8261,
"end": 8378,
"dim": 117
}
]
}