Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
ropedia-xperience-10m-task-baselines/artifacts/episode_task_suite/task_walkthroughs/task_walkthroughs.json
| { | |
| "source": "results/episode_task_suite/summary_report.json", | |
| "scope": { | |
| "episode_count": 1, | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8378, | |
| "window_frames": 20, | |
| "stride_frames": 5, | |
| "warning": "These walkthroughs explain task contracts on one public sample episode; they are not cross-episode performance claims." | |
| }, | |
| "shared_pipeline": [ | |
| "Read annotation.hdf5 and synchronized video-derived features.", | |
| "Slice the episode into 20-frame windows with stride 5.", | |
| "Build the current 8,378-d feature vector from the available modality blocks.", | |
| "Construct a task-specific target from labels, future frames, paired windows, or modality splits.", | |
| "Train a minimal head and, when enabled, a neural MLP head.", | |
| "Write metrics, predictions, and model artifacts for review." | |
| ], | |
| "tasks": { | |
| "timeline_action": { | |
| "plain_goal": "Look at one short multimodal window and name what action is happening now.", | |
| "case_study": "In the coffee-making sample, if the 20-frame window falls during a pouring moment, the task asks the model to output an action such as 'Pour coffee' or 'Pour milk into coffee'.", | |
| "input": "One 20-frame window represented by the current 8,378-d feature vector: video/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.", | |
| "middle_modules": [ | |
| "Window builder slices the episode into short overlapping windows.", | |
| "Feature assembler concatenates all current feature blocks.", | |
| "Label builder reads the action annotation for the center of the window.", | |
| "Classifier head maps the window vector to one action class.", | |
| "Evaluator compares predicted action labels against the held-out chronological segment." | |
| ], | |
| "output": "A single action class for the current window.", | |
| "junior_tip": "This is like asking: given this tiny movie clip plus sensor readings, what is the person doing right now?", | |
| "failure_mode": "The one-episode chronological split contains future action classes that were not present in training, so low test macro-F1 is expected.", | |
| "task": "timeline_action", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.05, | |
| "neural_mlp": 0.02631578947368421 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "timeline_subtask": { | |
| "plain_goal": "Predict the higher-level task stage for the current window.", | |
| "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.", | |
| "input": "The same all-modality 8,378-d window vector used by action recognition.", | |
| "middle_modules": [ | |
| "Window builder creates the current temporal slice.", | |
| "Feature assembler keeps all available modality blocks.", | |
| "Subtask label builder maps the current timestamp to a subtask annotation.", | |
| "Classifier head predicts the subtask class.", | |
| "Evaluator reports class-balanced scores so rare subtasks matter." | |
| ], | |
| "output": "A single subtask label for the current window.", | |
| "junior_tip": "Action is the verb; subtask is the chapter of the activity.", | |
| "failure_mode": "Single-episode ordering means some later subtasks appear only in test, so this is a pipeline check rather than a general benchmark.", | |
| "task": "timeline_subtask", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.04954121121178666, | |
| "neural_mlp": 0.017518248175182476 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "transition_detection": { | |
| "plain_goal": "Detect whether the current window is near a boundary between actions.", | |
| "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.", | |
| "input": "One all-modality window vector plus labels derived from action-change timestamps.", | |
| "middle_modules": [ | |
| "Boundary builder scans action labels over time and marks windows near a change.", | |
| "Feature assembler supplies all current modality features.", | |
| "Binary classifier predicts steady vs boundary.", | |
| "Boundary matcher checks whether predicted boundary times are close to true boundary times.", | |
| "Evaluator reports macro-F1 and timing error, not just accuracy." | |
| ], | |
| "output": "A binary label: boundary or steady.", | |
| "junior_tip": "This is the model's way of saying: something just changed here.", | |
| "failure_mode": "Boundaries are rare, so high accuracy can be misleading if the model predicts steady too often.", | |
| "task": "transition_detection", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.6551829268292684, | |
| "neural_mlp": 0.6484848484848484 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "next_action": { | |
| "plain_goal": "Use the current window to guess the action that will happen shortly after it.", | |
| "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.", | |
| "input": "The current all-modality window vector at time t.", | |
| "middle_modules": [ | |
| "Window builder picks a current time window.", | |
| "Future label builder shifts the action target by 20 frames.", | |
| "Feature assembler uses only current information, not future features.", | |
| "Classifier head predicts the future action class.", | |
| "Evaluator checks whether the future action label is correct." | |
| ], | |
| "output": "A single action class for t+20 frames.", | |
| "junior_tip": "This is short-horizon intention prediction: what will the person do next?", | |
| "failure_mode": "The public sample has unseen future classes in the chronological test split, which makes this very hard with one episode.", | |
| "task": "next_action", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 0.05925925925925927, | |
| "neural_mlp": 0.023529411764705882 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "hand_trajectory_forecast": { | |
| "plain_goal": "Predict where the hands will move over the next few frames.", | |
| "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.", | |
| "input": "The current all-modality window vector at time t.", | |
| "middle_modules": [ | |
| "Window builder chooses the current sensor window.", | |
| "Target builder extracts future left/right hand 3D joints from motion capture.", | |
| "Regression head predicts a continuous trajectory, not a class label.", | |
| "Output reshaper interprets the vector as future frames and joints.", | |
| "Evaluator computes MPJPE, the average 3D joint-position error." | |
| ], | |
| "output": "A future trajectory vector for left and right hand joints.", | |
| "junior_tip": "Instead of naming an action, this task draws the next hand path in 3D.", | |
| "failure_mode": "It is still a window-level forecast, not a full policy or long-horizon motion generator.", | |
| "task": "hand_trajectory_forecast", | |
| "metric": { | |
| "key": "mpjpe", | |
| "name": "MPJPE", | |
| "direction": "lower", | |
| "minimal": 0.8222644925117493, | |
| "neural_mlp": 0.11163123697042465 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "contact_prediction": { | |
| "plain_goal": "Predict whether the body or hand is in contact with something.", | |
| "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.", | |
| "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.", | |
| "middle_modules": [ | |
| "Feature selector removes contact-label and caption-label blocks.", | |
| "Target builder converts contact annotations into a binary label.", | |
| "Binary classifier predicts contact vs no contact.", | |
| "Evaluator reports macro-F1 and accuracy.", | |
| "Degeneracy checker records whether only one class appears." | |
| ], | |
| "output": "A binary contact label.", | |
| "junior_tip": "This is a simple physical-interaction probe: is the person touching something now?", | |
| "failure_mode": "The current public sample is degenerate for this task because one class dominates, so a perfect score does not mean the model learned contact physics.", | |
| "task": "contact_prediction", | |
| "metric": { | |
| "key": "macro_f1", | |
| "name": "macro-F1", | |
| "direction": "higher", | |
| "minimal": 1.0, | |
| "neural_mlp": 1.0 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "object_relevance": { | |
| "plain_goal": "Predict which objects matter in the current window.", | |
| "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.", | |
| "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.", | |
| "middle_modules": [ | |
| "Object vocabulary builder collects object labels from annotations.", | |
| "Feature selector removes caption-derived label blocks.", | |
| "Multi-label target builder creates a multi-hot object vector.", | |
| "Sigmoid heads predict each object's relevance independently.", | |
| "Evaluator reports micro-F1 and exact-match quality." | |
| ], | |
| "output": "A multi-label object set for the current window.", | |
| "junior_tip": "A window can involve more than one object, so this is not a one-class classifier.", | |
| "failure_mode": "Object labels are sparse and language-derived, so this is currently a weak object-centric probe.", | |
| "task": "object_relevance", | |
| "metric": { | |
| "key": "micro_f1", | |
| "name": "micro-F1", | |
| "direction": "higher", | |
| "minimal": 0.18393030009680542, | |
| "neural_mlp": 0.1797583081570997 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "caption_grounding": { | |
| "plain_goal": "Given a text-like query from annotation, find the matching time window.", | |
| "case_study": "A query like 'Pour milk into coffee' should rank the windows from the actual pouring moment higher than unrelated windows.", | |
| "input": "Caption/object/interaction query features and a set of candidate sensor-window features.", | |
| "middle_modules": [ | |
| "Query builder converts annotation words into a compact query representation.", | |
| "Candidate builder gathers held-out sensor windows.", | |
| "Projection head maps sensor windows into the query space.", | |
| "Ranker scores candidates by cosine similarity.", | |
| "Evaluator reports MRR and top-k retrieval accuracy." | |
| ], | |
| "output": "A ranked list of windows, with the correct matching window ideally near rank 1.", | |
| "junior_tip": "This is search: type a description, retrieve the matching moment.", | |
| "failure_mode": "Bag-of-objects text features are too simple for rich language grounding.", | |
| "task": "caption_grounding", | |
| "metric": { | |
| "key": "mrr", | |
| "name": "MRR", | |
| "direction": "higher", | |
| "minimal": 0.017183946083791223, | |
| "neural_mlp": 0.01781111161035397 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "cross_modal_retrieval": { | |
| "plain_goal": "Use one group of modalities to retrieve the matching window from another group.", | |
| "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.", | |
| "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.", | |
| "middle_modules": [ | |
| "Feature splitter separates query modalities from target modalities.", | |
| "Projection head maps the query vector into target-modality space.", | |
| "Candidate index stores target vectors from held-out windows.", | |
| "Ranker retrieves nearest candidates by cosine similarity.", | |
| "Evaluator reports MRR, top-1, top-5, and top-10 accuracy." | |
| ], | |
| "output": "A ranked list of candidate depth/video windows.", | |
| "junior_tip": "This checks whether different sensors agree about the same moment in time.", | |
| "failure_mode": "Good retrieval means useful alignment signal, but it is not yet 3D reconstruction or rendering.", | |
| "task": "cross_modal_retrieval", | |
| "metric": { | |
| "key": "mrr", | |
| "name": "MRR", | |
| "direction": "higher", | |
| "minimal": 0.26335984006618296, | |
| "neural_mlp": 0.1530070022204131 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "modality_reconstruction": { | |
| "plain_goal": "Predict one modality feature block from other modality blocks.", | |
| "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.", | |
| "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.", | |
| "middle_modules": [ | |
| "Feature splitter defines source and target modality blocks.", | |
| "Scaler normalizes source and target vectors using train statistics.", | |
| "Regression head predicts the target feature vector.", | |
| "Inverse scaler returns predictions to target scale.", | |
| "Evaluator reports MSE, MAE, and R2." | |
| ], | |
| "output": "A reconstructed depth/video feature vector.", | |
| "junior_tip": "This is feature-level imagination: can the model infer what another sensor would see?", | |
| "failure_mode": "This reconstructs compressed features, not raw pixels, depth maps, meshes, NeRFs, or Gaussian splats.", | |
| "task": "modality_reconstruction", | |
| "metric": { | |
| "key": "r2", | |
| "name": "R2", | |
| "direction": "higher", | |
| "minimal": -0.016022846771134747, | |
| "neural_mlp": -0.010198171891414143 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "temporal_order": { | |
| "plain_goal": "Tell whether two nearby windows are in the correct time order.", | |
| "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.", | |
| "input": "A pair of adjacent window vectors, plus their difference vector.", | |
| "middle_modules": [ | |
| "Pair builder creates correct-order and reversed-order examples.", | |
| "Feature combiner concatenates first window, second window, and their difference.", | |
| "Binary classifier predicts correct vs reversed.", | |
| "Evaluator reports F1, precision, and recall.", | |
| "Diagnostic reader interprets whether features encode local time direction." | |
| ], | |
| "output": "A binary label: correct order or reversed order.", | |
| "junior_tip": "This asks whether the representation knows which moment came first.", | |
| "failure_mode": "It only tests local ordering, not long-term planning or causality.", | |
| "task": "temporal_order", | |
| "metric": { | |
| "key": "f1", | |
| "name": "F1", | |
| "direction": "higher", | |
| "minimal": 0.5487364620938628, | |
| "neural_mlp": 0.8717948717948718 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| }, | |
| "misalignment_detection": { | |
| "plain_goal": "Detect when modalities that should match are shifted out of sync.", | |
| "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.", | |
| "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.", | |
| "middle_modules": [ | |
| "Alignment builder creates positive pairs from the same time window.", | |
| "Shift builder creates negative pairs by offsetting one modality group.", | |
| "Feature combiner joins both sides into one example.", | |
| "Binary classifier predicts aligned vs misaligned.", | |
| "Evaluator reports F1 and accuracy." | |
| ], | |
| "output": "A binary label: aligned or shifted.", | |
| "junior_tip": "This is a synchronization alarm for multimodal data.", | |
| "failure_mode": "Synthetic shifts are useful diagnostics but do not solve calibration, reconstruction, or mapping by themselves.", | |
| "task": "misalignment_detection", | |
| "metric": { | |
| "key": "f1", | |
| "name": "F1", | |
| "direction": "higher", | |
| "minimal": 0.4865671641791045, | |
| "neural_mlp": 0.7335243553008597 | |
| }, | |
| "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files" | |
| } | |
| } | |
| } | |