Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "source": { | |
| "shared_windows": "results/episode_task_suite/shared_windows.npz", | |
| "windows_csv": "results/episode_task_suite/windows.csv", | |
| "feature_manifest": "results/episode_task_suite/feature_manifest.json" | |
| }, | |
| "dataset_scope": { | |
| "sample_episode_count": 1, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "first_start_frame": 0, | |
| "last_end_frame": 5819, | |
| "warning": "Single public sample episode; these extension probes validate task design and pipeline mechanics, not cross-episode generalization." | |
| }, | |
| "baselines": { | |
| "minimal": "Ridge classifiers/regressors/projections plus cosine retrieval on the committed feature tensor.", | |
| "neural_mlp": "Small one-hidden-layer PyTorch MLP heads using the same inputs, targets, chronological split, and evaluator." | |
| }, | |
| "run_config": { | |
| "train_fraction": 0.7, | |
| "ridge_l2": 10.0, | |
| "seed": 7, | |
| "future_windows": 4, | |
| "neural_epochs": 25, | |
| "neural_hidden_dim": 128, | |
| "neural_batch_size": 128, | |
| "skip_neural": false | |
| }, | |
| "task_specs": { | |
| "body_motion_intensity": { | |
| "direction": "A", | |
| "direction_name": "Human Modeling & Motion Understanding", | |
| "name": "Body and Hand Motion Intensity", | |
| "family": "classification", | |
| "case_study": "A window with a fast reach or pour should be classified as high motion; a steady holding window should be low motion.", | |
| "input": "Current non-mocap feature blocks: video, audio, depth, camera pose/rotation, IMU, SLAM, calibration, and language context.", | |
| "middle_process": "Compute the target from hand/body joint changes between neighboring windows, binarize it into high versus low motion at the train-set median, hide the mocap blocks from the input, then classify each window as high or low motion.", | |
| "output": "Binary label: high_motion or low_motion.", | |
| "minimal_baseline": "Ridge classifier on standardized non-mocap features.", | |
| "neural_baseline": "One-hidden-layer MLP binary classifier on the same input features.", | |
| "metric_name": "macro-F1", | |
| "metric_key": "macro_f1", | |
| "metric_direction": "higher", | |
| "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior." | |
| }, | |
| "multi_view_consistency_retrieval": { | |
| "direction": "B", | |
| "direction_name": "3D/4D Reconstruction & Neural Rendering", | |
| "name": "Multi-View Consistency Retrieval", | |
| "family": "retrieval", | |
| "case_study": "Given the fisheye camera features for a pouring moment, retrieve the synchronized stereo-left view from the same time window.", | |
| "input": "Query side: fisheye_cam0 video feature block. Candidate side: stereo_left video feature block from held-out windows.", | |
| "middle_process": "Learn a projection from one camera-view feature space into another, then rank held-out candidate windows by cosine similarity.", | |
| "output": "Ranked candidate windows; the correct synchronized view should rank near the top.", | |
| "minimal_baseline": "Ridge projection followed by cosine nearest-neighbor retrieval.", | |
| "neural_baseline": "One-hidden-layer MLP projection followed by the same cosine retrieval evaluator.", | |
| "metric_name": "MRR", | |
| "metric_key": "mrr", | |
| "metric_direction": "higher", | |
| "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis." | |
| }, | |
| "action_phase_progress": { | |
| "direction": "C", | |
| "direction_name": "Egocentric Vision & Interaction", | |
| "name": "Action Phase Progress Estimation", | |
| "family": "regression", | |
| "case_study": "Inside a 'Pour coffee' action segment, estimate whether the current window is near the beginning, middle, or end of that action.", | |
| "input": "Current non-caption multimodal feature vector, so the label text cannot be copied directly from the language block.", | |
| "middle_process": "Convert contiguous action-label runs into a normalized 0-to-1 progress target, train on earlier windows, and regress progress for later windows.", | |
| "output": "A scalar progress value between 0.0 and 1.0 for the current action segment.", | |
| "minimal_baseline": "Ridge regressor on standardized non-caption features.", | |
| "neural_baseline": "One-hidden-layer MLP regressor on the same input features.", | |
| "metric_name": "MAE", | |
| "metric_key": "mae", | |
| "metric_direction": "lower", | |
| "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks." | |
| }, | |
| "ego_motion_forecast": { | |
| "direction": "D", | |
| "direction_name": "Scene Reconstruction & World Modeling", | |
| "name": "Short-Horizon Ego-Motion Forecasting", | |
| "family": "forecast", | |
| "case_study": "From the current sensors, predict how the camera translation will change over the next 20 frames while the wearer moves through the scene.", | |
| "input": "Current multimodal features excluding the camera-translation block and caption text.", | |
| "middle_process": "Build a future target from the camera-translation difference at a four-window (20-frame) horizon, then regress that future ego-motion delta from current sensors.", | |
| "output": "A future camera-translation delta vector.", | |
| "minimal_baseline": "Ridge regressor with a 20-frame forecast horizon.", | |
| "neural_baseline": "One-hidden-layer MLP regressor with the same horizon and split.", | |
| "metric_name": "MAE", | |
| "metric_key": "mae", | |
| "metric_direction": "lower", | |
| "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model." | |
| } | |
| }, | |
| "tasks": { | |
| "body_motion_intensity": { | |
| "train_windows": 812, | |
| "test_windows": 348, | |
| "target_threshold_train_median": 0.476467490196228, | |
| "input_dim": 6425, | |
| "target_source": "hand/body joint delta between neighboring windows", | |
| "minimal": { | |
| "accuracy": 0.7758620689655172, | |
| "macro_f1": 0.7658385093167701, | |
| "positive_rate_true": 0.35919540229885055, | |
| "positive_rate_pred": 0.4339080459770115, | |
| "num_test": 348 | |
| }, | |
| "neural_mlp": { | |
| "accuracy": 0.8304597701149425, | |
| "macro_f1": 0.8254423029509534, | |
| "positive_rate_true": 0.35919540229885055, | |
| "positive_rate_pred": 0.47126436781609193, | |
| "num_test": 348 | |
| }, | |
| "neural_training": { | |
| "available": true, | |
| "epochs": 25, | |
| "hidden_dim": 128, | |
| "loss_history": [ | |
| 0.37322977610996794, | |
| 0.22245765099384515, | |
| 0.1382973729242832, | |
| 0.10363741681493562, | |
| 0.0795709453523159, | |
| 0.06539858697817244, | |
| 0.055655122610735776, | |
| 0.043255199022187385, | |
| 0.03558319240001035, | |
| 0.031631215764530776, | |
| 0.029465350402711796, | |
| 0.024383640274625695, | |
| 0.02020622924740972, | |
| 0.016222001351599624, | |
| 0.018758724778523587, | |
| 0.013199950316049196, | |
| 0.014794624612432689, | |
| 0.01013119441452505, | |
| 0.009688532855040554, | |
| 0.008956241283767622, | |
| 0.006733611014469761, | |
| 0.006677041435843618, | |
| 0.0067647325489761795, | |
| 0.005346325666556511, | |
| 0.004052691048084588 | |
| ] | |
| } | |
| }, | |
| "multi_view_consistency_retrieval": { | |
| "train_windows": 813, | |
| "test_windows": 348, | |
| "query_block": "video_fisheye_cam0", | |
| "target_block": "video_stereo_left", | |
| "query_dim": 686, | |
| "target_dim": 686, | |
| "minimal": { | |
| "mrr": 0.5533982515335083, | |
| "top1": 0.41954022988505746, | |
| "top5": 0.7068965517241379, | |
| "top10": 0.8304597701149425, | |
| "median_rank": 2.0, | |
| "num_test": 348 | |
| }, | |
| "neural_mlp": { | |
| "mrr": 0.34691643714904785, | |
| "top1": 0.23275862068965517, | |
| "top5": 0.46264367816091956, | |
| "top10": 0.5890804597701149, | |
| "median_rank": 7.0, | |
| "num_test": 348 | |
| }, | |
| "neural_training": { | |
| "available": true, | |
| "epochs": 25, | |
| "hidden_dim": 128, | |
| "loss_history": [ | |
| 0.9800718805740094, | |
| 0.8296866191855803, | |
| 0.7029420470986126, | |
| 0.6089339927846948, | |
| 0.5426930648814268, | |
| 0.49323386093200644, | |
| 0.45315542230600214, | |
| 0.4240272395578551, | |
| 0.3964498403400806, | |
| 0.37567753094588696, | |
| 0.3599070675332021, | |
| 0.3417643405048023, | |
| 0.32952829051721577, | |
| 0.31516501450450657, | |
| 0.3070896395824639, | |
| 0.29752101269888553, | |
| 0.287490411878072, | |
| 0.2791558311654193, | |
| 0.2707079971921693, | |
| 0.2669465311998811, | |
| 0.2603630047442728, | |
| 0.2501040017656148, | |
| 0.24714160980920216, | |
| 0.24146720613060843, | |
| 0.23866056472173036 | |
| ] | |
| } | |
| }, | |
| "action_phase_progress": { | |
| "train_windows": 813, | |
| "test_windows": 348, | |
| "input_dim": 7650, | |
| "target_source": "normalized position inside contiguous action-label runs", | |
| "minimal": { | |
| "mse": 0.16943013668060303, | |
| "mae": 0.32674381136894226, | |
| "r2": -1.0236103208433347, | |
| "num_test": 348 | |
| }, | |
| "neural_mlp": { | |
| "mse": 0.14226463437080383, | |
| "mae": 0.301545649766922, | |
| "r2": -0.6991557278041094, | |
| "num_test": 348 | |
| }, | |
| "neural_training": { | |
| "available": true, | |
| "epochs": 25, | |
| "hidden_dim": 128, | |
| "loss_history": [ | |
| 3.236165899632162, | |
| 1.2893786148831414, | |
| 0.8036823107014429, | |
| 0.5113777590120557, | |
| 0.42643894586909153, | |
| 0.37627028047965166, | |
| 0.30304253713524504, | |
| 0.25041163572526065, | |
| 0.20774717810409096, | |
| 0.18116216590630319, | |
| 0.16409150619165191, | |
| 0.14362229159397394, | |
| 0.1277421933001991, | |
| 0.1232111468672899, | |
| 0.11924667946013724, | |
| 0.11960234325310401, | |
| 0.09951645682988572, | |
| 0.08239271907825459, | |
| 0.08965909919102401, | |
| 0.07973466079503408, | |
| 0.07939992471154765, | |
| 0.07085797808340408, | |
| 0.08331163055269188, | |
| 0.068286436959313, | |
| 0.07575550977814241 | |
| ] | |
| } | |
| }, | |
| "ego_motion_forecast": { | |
| "train_windows": 810, | |
| "test_windows": 347, | |
| "forecast_horizon_windows": 4, | |
| "forecast_horizon_frames": 20, | |
| "input_dim": 7629, | |
| "target_dim": 21, | |
| "target_source": "future minus current camera_translation feature block", | |
| "minimal": { | |
| "mse": 3.3189830780029297, | |
| "mae": 0.16999860107898712, | |
| "r2": -5674.96718626448, | |
| "num_test": 347 | |
| }, | |
| "neural_mlp": { | |
| "mse": 0.547074019908905, | |
| "mae": 0.0833469107747078, | |
| "r2": -934.5800418396838, | |
| "num_test": 347 | |
| }, | |
| "neural_training": { | |
| "available": true, | |
| "epochs": 25, | |
| "hidden_dim": 128, | |
| "loss_history": [ | |
| 1.034793225188314, | |
| 0.6070329941349265, | |
| 0.459870958622591, | |
| 0.3616479166495947, | |
| 0.2902924293353234, | |
| 0.23913239262722158, | |
| 0.1907091121982645, | |
| 0.17297036942140556, | |
| 0.15034288657300265, | |
| 0.13807891327657817, | |
| 0.13084740807980666, | |
| 0.12387588925567675, | |
| 0.12085371225336451, | |
| 0.11271689225126195, | |
| 0.10728766140010622, | |
| 0.10044601888936243, | |
| 0.09365091108613544, | |
| 0.09090288755150489, | |
| 0.08592776887946658, | |
| 0.0805281208805096, | |
| 0.08014915316929051, | |
| 0.07900124887625376, | |
| 0.07897219589830917, | |
| 0.07575527565714754, | |
| 0.0774567014273302 | |
| ] | |
| } | |
| } | |
| } | |
| } |