Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Ropedia Xperience-10M Task Suite Evaluation Protocol", | |
| "status": "pass", | |
| "version": "2026-06-01", | |
| "generated_at_utc": "2026-06-21T15:20:33+00:00", | |
| "source_files": [ | |
| "docs/data/summary_metrics.json", | |
| "results/episode_task_suite/summary_report.json", | |
| "results/episode_task_suite/windows.csv", | |
| "results/episode_task_suite/feature_manifest.json", | |
| "docs/data/task_suite_20.json", | |
| "docs/data/tier2_task_suite.json", | |
| "results/episode_task_suite/tier2_task_suite/tier2_task_suite_results.json" | |
| ], | |
| "scope": { | |
| "validated_episode_count": 1, | |
| "annotation": "data/sample/xperience-10m-sample/annotation.hdf5", | |
| "num_frames": 5821, | |
| "num_windows": 1161, | |
| "feature_dim": 8546, | |
| "window_frames": 20, | |
| "stride_frames": 5, | |
| "audio_featurized": true, | |
| "raw_data_redistributed": false | |
| }, | |
| "task_suite": { | |
| "status": "unified_public_sample_suite", | |
| "task_count": 20, | |
| "public_framing": "all 20 public-sample task contracts are presented as one suite", | |
| "legacy_provenance_rows": 8, | |
| "unified_results": "docs/data/task_suite_20.json", | |
| "legacy_additional_task_result_path": "docs/data/tier2_task_suite.json", | |
| "legacy_path_note": "The tier2_task_suite path is retained for stable links only; it is provenance inside the same 20-task suite." | |
| }, | |
| "split_policy": { | |
| "name": "single_episode_chronological", | |
| "train_fraction": 0.7, | |
| "test_fraction": 0.3, | |
| "why": "The split preserves time order so future episode segments are not mixed randomly into the train set.", | |
| "limitation": "It is still one episode; cross-episode generalization is evaluated in the multi-episode stage." | |
| }, | |
| "feature_policy": { | |
| "input_contract": "8,546-dimensional current feature vector", | |
| "source_manifest": "results/episode_task_suite/feature_manifest.json", | |
| "normalization": "Scalers are fit on train windows only for the baseline heads.", | |
| "audio_status": "Audio is represented in the current feature vector." | |
| }, | |
| "baselines": [ | |
| { | |
| "name": "minimal", | |
| "heads": [ | |
| "softmax", | |
| "binary logistic", | |
| "multi-label logistic", | |
| "ridge regression", | |
| "ridge projection plus cosine ranking" | |
| ], | |
| "purpose": "Keep each task contract interpretable and easy to inspect." | |
| }, | |
| { | |
| "name": "neural_mlp", | |
| "heads": [ | |
| "PyTorch MLP classifier", | |
| "PyTorch MLP regressor", | |
| "PyTorch MLP multi-label head" | |
| ], | |
| "purpose": "Check nonlinear gains before larger omni-model fine-tuning.", | |
| "config": { | |
| "name": "neural_mlp", | |
| "type": "lightweight PyTorch MLP over shared window features", | |
| "epochs": 80, | |
| "hidden_dim": 128, | |
| "batch_size": 128, | |
| "learning_rate": 0.001, | |
| "weight_decay": 0.0001, | |
| "dropout": 0.1, | |
| "device": "auto" | |
| } | |
| } | |
| ], | |
| "task_protocols": [ | |
| { | |
| "task": "timeline_action", | |
| "task_display_name": "Action Recognition", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "supervised classification", | |
| "unit": "single window", | |
| "input": "current 20-frame all-feature window", | |
| "target": "current action label", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "No future labels enter the input. Chronological split exposes unseen later action labels.", | |
| "counts": { | |
| "num_windows": 1144, | |
| "num_train_windows": 801, | |
| "num_test_windows": 343 | |
| }, | |
| "minimal_primary_metric": 0.05, | |
| "neural_primary_metric": 0.014814814814814814, | |
| "minimal_metric_source": "results/episode_task_suite/timeline_action/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/timeline_action/metrics.json", | |
| "task_number": 1, | |
| "suite_label": "Task 01" | |
| }, | |
| { | |
| "task": "timeline_subtask", | |
| "task_display_name": "Procedure Step Recognition", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "supervised classification", | |
| "unit": "single window", | |
| "input": "current 20-frame all-feature window", | |
| "target": "current subtask label", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "No future labels enter the input. Chronological split exposes unseen later subtask labels.", | |
| "counts": { | |
| "num_windows": 1147, | |
| "num_train_windows": 803, | |
| "num_test_windows": 344 | |
| }, | |
| "minimal_primary_metric": 0.05056355513846935, | |
| "neural_primary_metric": 0.02810810810810811, | |
| "minimal_metric_source": "results/episode_task_suite/timeline_subtask/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json", | |
| "task_number": 2, | |
| "suite_label": "Task 02" | |
| }, | |
| { | |
| "task": "transition_detection", | |
| "task_display_name": "Action Boundary Detection", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "temporal diagnostic", | |
| "unit": "single window", | |
| "input": "current 20-frame all-feature window", | |
| "target": "action boundary versus steady", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Boundary labels are targets only. Boundary timing is evaluated after prediction.", | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.6118237590630229, | |
| "neural_primary_metric": 0.5862068965517241, | |
| "minimal_metric_source": "results/episode_task_suite/transition_detection/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/transition_detection/metrics.json", | |
| "task_number": 3, | |
| "suite_label": "Task 03" | |
| }, | |
| { | |
| "task": "next_action", | |
| "task_display_name": "Next-Action Prediction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "short-horizon prediction", | |
| "unit": "single window", | |
| "input": "current 20-frame all-feature window at time t", | |
| "target": "action label at t + 20 frames", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Future labels are shifted into targets only; model inputs remain current-window features.", | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.05925925925925927, | |
| "neural_primary_metric": 0.04186046511627907, | |
| "minimal_metric_source": "results/episode_task_suite/next_action/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/next_action/metrics.json", | |
| "task_number": 4, | |
| "suite_label": "Task 04" | |
| }, | |
| { | |
| "task": "hand_trajectory_forecast", | |
| "task_display_name": "Hand Trajectory Forecasting", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "trajectory regression", | |
| "unit": "single window", | |
| "input": "current all-feature window", | |
| "target": "future left/right hand 3D joints for 10 frames", | |
| "primary_metric": "mpjpe", | |
| "higher_is_better": false, | |
| "leakage_rule": "Future mocap coordinates are targets only, not inputs.", | |
| "counts": { | |
| "num_windows": 1159, | |
| "num_train_windows": 811, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.8646570444107056, | |
| "neural_primary_metric": 0.10785018652677536, | |
| "minimal_metric_source": "results/episode_task_suite/hand_trajectory_forecast/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json", | |
| "task_number": 5, | |
| "suite_label": "Task 05" | |
| }, | |
| { | |
| "task": "contact_prediction", | |
| "task_display_name": "Contact State Prediction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "binary classification", | |
| "unit": "single window", | |
| "input": "non-contact and non-caption feature blocks", | |
| "target": "any body contact", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Contact-derived fields and caption labels are excluded from inputs.", | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 1.0, | |
| "neural_primary_metric": 1.0, | |
| "minimal_metric_source": "results/episode_task_suite/contact_prediction/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/contact_prediction/metrics.json", | |
| "task_number": 6, | |
| "suite_label": "Task 06" | |
| }, | |
| { | |
| "task": "object_relevance", | |
| "task_display_name": "Object Relevance Prediction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "multi-label classification", | |
| "unit": "single window", | |
| "input": "non-caption feature blocks", | |
| "target": "current relevant object set", | |
| "primary_metric": "micro_f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Caption/object-label fields are excluded from inputs.", | |
| "counts": { | |
| "num_windows": 1161, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.18034382095361662, | |
| "neural_primary_metric": 0.1679279279279279, | |
| "minimal_metric_source": "results/episode_task_suite/object_relevance/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/object_relevance/metrics.json", | |
| "task_number": 7, | |
| "suite_label": "Task 07" | |
| }, | |
| { | |
| "task": "caption_grounding", | |
| "task_display_name": "Language Grounding", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "retrieval", | |
| "unit": "caption query", | |
| "input": "caption object/interaction query plus candidate sensor windows", | |
| "target": "matching time window", | |
| "primary_metric": "mrr", | |
| "higher_is_better": true, | |
| "leakage_rule": "Queries are ranked against held-out candidate windows; reported ranks are computed after model scoring.", | |
| "counts": { | |
| "num_queries": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.016023479050338015, | |
| "neural_primary_metric": 0.01684125567132316, | |
| "minimal_metric_source": "results/episode_task_suite/caption_grounding/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/caption_grounding/metrics.json", | |
| "task_number": 8, | |
| "suite_label": "Task 08" | |
| }, | |
| { | |
| "task": "cross_modal_retrieval", | |
| "task_display_name": "Cross-Modal Retrieval", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "retrieval", | |
| "unit": "sensor query", | |
| "input": "motion, IMU, and camera query features", | |
| "target": "matching depth/video window", | |
| "primary_metric": "top5_accuracy", | |
| "higher_is_better": true, | |
| "leakage_rule": "Query-side and candidate-side feature blocks are split before projection/ranking.", | |
| "counts": { | |
| "num_queries": 348, | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": 0.367816091954023, | |
| "neural_primary_metric": 0.19827586206896552, | |
| "minimal_metric_source": "results/episode_task_suite/cross_modal_retrieval/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json", | |
| "task_number": 9, | |
| "suite_label": "Task 09" | |
| }, | |
| { | |
| "task": "modality_reconstruction", | |
| "task_display_name": "Cross-Modal Reconstruction", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "cross-modal regression", | |
| "unit": "single window", | |
| "input": "motion, IMU, and camera features", | |
| "target": "depth/video feature vector", | |
| "primary_metric": "r2", | |
| "higher_is_better": true, | |
| "leakage_rule": "Target feature blocks are excluded from the input side.", | |
| "counts": { | |
| "num_train_windows": 813, | |
| "num_test_windows": 348 | |
| }, | |
| "minimal_primary_metric": -0.015271898913936655, | |
| "neural_primary_metric": -0.010171410134180991, | |
| "minimal_metric_source": "results/episode_task_suite/modality_reconstruction/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json", | |
| "task_number": 10, | |
| "suite_label": "Task 10" | |
| }, | |
| { | |
| "task": "temporal_order", | |
| "task_display_name": "Temporal Order Verification", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "pairwise diagnostic", | |
| "unit": "adjacent window pair", | |
| "input": "two adjacent windows", | |
| "target": "correct versus reversed order", | |
| "primary_metric": "f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Pairs are built after windowing; labels are synthetic order labels, not input features.", | |
| "counts": { | |
| "num_samples": 2320, | |
| "num_train_samples": 1624, | |
| "num_test_samples": 696 | |
| }, | |
| "minimal_primary_metric": 0.5399515738498789, | |
| "neural_primary_metric": 0.8520179372197308, | |
| "minimal_metric_source": "results/episode_task_suite/temporal_order/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/temporal_order/metrics.json", | |
| "task_number": 11, | |
| "suite_label": "Task 11" | |
| }, | |
| { | |
| "task": "misalignment_detection", | |
| "task_display_name": "Multimodal Synchronization Detection", | |
| "provenance_source": "walkthrough_backed_task_contract", | |
| "family": "pairwise diagnostic", | |
| "unit": "paired modality window", | |
| "input": "motion side plus visual/depth side", | |
| "target": "aligned versus shifted by 8 windows", | |
| "primary_metric": "f1", | |
| "higher_is_better": true, | |
| "leakage_rule": "Shift labels are synthetic targets; shifted visual/depth blocks are generated after feature splitting.", | |
| "counts": { | |
| "num_samples": 2306, | |
| "num_train_samples": 1614, | |
| "num_test_samples": 692 | |
| }, | |
| "minimal_primary_metric": 0.5051698670605613, | |
| "neural_primary_metric": 0.7152682255845944, | |
| "minimal_metric_source": "results/episode_task_suite/misalignment_detection/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json", | |
| "task_number": 12, | |
| "suite_label": "Task 12" | |
| }, | |
| { | |
| "task": "long_horizon_next_action", | |
| "task_display_name": "Long-Horizon Next-Action Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "classification", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Action label five seconds later.", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.07499999999999998, | |
| "neural_primary_metric": 0.06545454545454546, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/long_horizon_next_action/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/long_horizon_next_action/metrics.json", | |
| "meaning": "Tests whether the current state carries enough procedure context to forecast beyond the one-second core next-action task.", | |
| "task_number": 13, | |
| "suite_label": "Task 13" | |
| }, | |
| { | |
| "task": "next_subtask_forecast", | |
| "task_display_name": "Long-Horizon Next-Subtask Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "classification", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Procedure subtask label five seconds later.", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.04545454545454545, | |
| "neural_primary_metric": 0.050724637681159424, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/next_subtask_forecast/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/next_subtask_forecast/metrics.json", | |
| "meaning": "Moves from immediate action anticipation to higher-level procedure-state prediction.", | |
| "task_number": 14, | |
| "suite_label": "Task 14" | |
| }, | |
| { | |
| "task": "interaction_text_prediction", | |
| "task_display_name": "Interaction Text Prediction", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "classification", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Raw annotation interaction phrase for the same window.", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.04444444444444444, | |
| "neural_primary_metric": 0.0380952380952381, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/interaction_text_prediction/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/interaction_text_prediction/metrics.json", | |
| "meaning": "Uses the raw caption JSON interaction field as a language target instead of only the hashed text feature.", | |
| "task_number": 15, | |
| "suite_label": "Task 15" | |
| }, | |
| { | |
| "task": "action_object_relation", | |
| "task_display_name": "Action-Object Relation Prediction", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "classification", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Joint action plus active object-set relation.", | |
| "primary_metric": "macro_f1", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.0, | |
| "neural_primary_metric": 0.0, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/action_object_relation/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/action_object_relation/metrics.json", | |
| "meaning": "Evaluates whether a model can bind what action is happening to which objects are involved.", | |
| "task_number": 16, | |
| "suite_label": "Task 16" | |
| }, | |
| { | |
| "task": "object_set_forecast", | |
| "task_display_name": "Future Object-Set Forecasting", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "multi_label", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame sensor window with caption-text features removed.", | |
| "target": "Object set active five seconds later.", | |
| "primary_metric": "micro_f1", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.16939890710382516, | |
| "neural_primary_metric": 0.19718309859154928, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/object_set_forecast/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/object_set_forecast/metrics.json", | |
| "meaning": "Predicts which objects will become relevant soon, not only which objects are relevant now.", | |
| "task_number": 17, | |
| "suite_label": "Task 17" | |
| }, | |
| { | |
| "task": "imu_to_hand_pose", | |
| "task_display_name": "IMU-to-Hand Pose Reconstruction", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "regression", | |
| "unit": "single aligned window", | |
| "input": "Current IMU acceleration/gyroscope feature block only.", | |
| "target": "Current left/right hand joint feature blocks.", | |
| "primary_metric": "mae", | |
| "higher_is_better": false, | |
| "minimal_primary_metric": 0.042049407958984375, | |
| "neural_primary_metric": 0.042562149465084076, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/imu_to_hand_pose/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/imu_to_hand_pose/metrics.json", | |
| "meaning": "A sensor-bridge probe for how much hand configuration can be recovered from inertial motion alone.", | |
| "task_number": 18, | |
| "suite_label": "Task 18" | |
| }, | |
| { | |
| "task": "camera_view_sync_retrieval", | |
| "task_display_name": "Camera-View Synchronization Retrieval", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "retrieval", | |
| "unit": "held-out query window", | |
| "input": "Fisheye camera-1 feature query projected into fisheye camera-3 feature space.", | |
| "target": "The synchronized held-out camera-3 window.", | |
| "primary_metric": "mrr", | |
| "higher_is_better": true, | |
| "minimal_primary_metric": 0.4943004846572876, | |
| "neural_primary_metric": 0.24086658656597137, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/camera_view_sync_retrieval/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/camera_view_sync_retrieval/metrics.json", | |
| "meaning": "Stress-tests multi-camera time alignment beyond the core cross-modal retrieval task.", | |
| "task_number": 19, | |
| "suite_label": "Task 19" | |
| }, | |
| { | |
| "task": "time_to_transition", | |
| "task_display_name": "Time-to-Next-Transition Regression", | |
| "provenance_source": "historical_result_bundle", | |
| "family": "regression", | |
| "unit": "single aligned window", | |
| "input": "Current 20-frame non-caption multimodal window.", | |
| "target": "Frames until the next action-label boundary, capped at 200 frames.", | |
| "primary_metric": "mae", | |
| "higher_is_better": false, | |
| "minimal_primary_metric": 10.53735637664795, | |
| "neural_primary_metric": 10.55449390411377, | |
| "minimal_metric_source": "results/episode_task_suite/tier2_task_suite/time_to_transition/metrics.json", | |
| "neural_metric_source": "results/episode_task_suite/tier2_task_suite/neural_mlp/time_to_transition/metrics.json", | |
| "meaning": "Turns boundary detection into a continuous timing estimate for procedural control.", | |
| "task_number": 20, | |
| "suite_label": "Task 20" | |
| } | |
| ], | |
| "global_leakage_controls": [ | |
| "Use chronological train/test splits instead of random window shuffling.", | |
| "Fit scalers and learned projections on train windows only.", | |
| "Keep future labels, future mocap, contact labels, object labels, and caption labels on the target side unless a task explicitly treats language as the query.", | |
| "For cross-modal tasks, split query-side and candidate-side feature blocks before training and ranking.", | |
| "Report unseen test classes when the chronological split exposes labels absent from the train segment." | |
| ], | |
| "current_limitations": [ | |
| "Cross-episode generalization for Qwen3-Omni has a first verified diagnostic pilot, but strong model quality is not yet shown.", | |
| "Feature-vector reconstruction is separate from pixel depth, mesh, NeRF, or Gaussian reconstruction.", | |
| "The final verified Qwen3-Omni diagnostic result meets the strict-JSON target, but action/subtask held-out quality remains weak and needs error analysis before larger model-quality claims.", | |
| "Full audio-visual representation learning still needs multi-episode training; the current report includes single-episode audio/no-audio ablations." | |
| ], | |
| "scale_up_gate": { | |
| "required_before_next_omni_quality_pilot": [ | |
| "selected prepared Xperience-10M episodes", | |
| "held-out episode split with no train/test episode leakage", | |
| "validation samples during training", | |
| "manifest, training metadata, progress logs, metrics, predictions, and run report", | |
| "held-out evaluation on test episodes rather than train windows" | |
| ], | |
| "current_status": "verified diagnostic result; strict-JSON quality target met, action/subtask quality still weak", | |
| "evidence": [ | |
| "docs/data/omni_finetune_verified_result.json", | |
| "results/omni_finetune/verified_public/" | |
| ] | |
| } | |
| } | |