Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Xperience-10M Foundation Model Plan", | |
| "status": "planning_artifact", | |
| "current_boundary": "Verified held-out multi-episode foundation-model diagnostics now exist for Qwen3-Omni LoRA, Cosmos3-Nano future-window compatibility, Cosmos3-Super base-weight Reasoner evaluation, and Cosmos3-Super Forward-Dynamics LoRA. Qwen remains the structured JSON baseline; the Cosmos branches answer world-model or base-reasoner questions with separate metrics.", | |
| "pipeline_tracks": { | |
| "source_document": "THREE_FOUNDATION_PIPELINES.md", | |
| "data": "docs/data/three_foundation_pipelines.json", | |
| "reader_note": "Spatial intelligence, human-video world modeling, and vision-language-action are supported pipeline directions with separate data exports, training recipes, and evaluation gates.", | |
| "track_ids": [ | |
| "spatial_intelligence", | |
| "human_video_world_models", | |
| "vision_language_action" | |
| ] | |
| }, | |
| "backbone_registry": { | |
| "config_dir": "configs/omni_backbones", | |
| "validator": "scripts/omni/backbone_registry.py --validate --json", | |
| "extension_contract": "OMNI_MODEL_EXTENSION_CONTRACT.md", | |
| "implemented_backbone": "qwen3_omni_lora", | |
| "implemented_backbones": [ | |
| "qwen3_omni_lora", | |
| "cosmos_world_model", | |
| "cosmos3_super_reasoner", | |
| "cosmos3_super_forward_dynamics" | |
| ], | |
| "planned_backbones": [ | |
| "policy_vla_branch" | |
| ] | |
| }, | |
| "decision": { | |
| "immediate_trainable_backbone": "Qwen3-Omni", | |
| "first_world_model_branch": "Cosmos 3", | |
| "first_policy_branch_candidates": [ | |
| "OpenVLA / OpenVLA-OFT", | |
| "openpi pi0/pi0.5", | |
| "NVIDIA GR00T" | |
| ], | |
| "external_reasoning_reference": "Gemini Robotics", | |
| "long_term_native_pretraining_goal": "Xperience Embodied Foundation Model" | |
| }, | |
| "future_pretraining_goal": { | |
| "name": "Xperience Embodied Foundation Model", | |
| "status": "future_planning_goal", | |
| "role": "Domain-specific embodied foundation model pretrained on full Xperience-10M if full-corpus data, storage, and compute become available.", | |
| "not_current_result": true, | |
| "document": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md", | |
| "entry_conditions": [ | |
| "Selected multi-episode Qwen3-Omni pilot trains and evaluates cleanly.", | |
| "Scaling from 128 episodes to thousands of episodes shows measurable value.", | |
| "Full-corpus storage, derived-shard storage, and fast active-cache capacity are available.", | |
| "Distributed training, checkpoint/restart, and provenance tracking are reliable.", | |
| "Evaluation covers held-out episodes, sessions, activities, objects, and missing-modality robustness." | |
| ], | |
| "target_modules": [ | |
| "multi-view video encoder", | |
| "audio encoder", | |
| "depth and geometry encoder", | |
| "pose/SLAM encoder", | |
| "hand/body mocap encoder", | |
| "IMU encoder", | |
| "language encoder/decoder", | |
| "temporal fusion transformer", | |
| "task heads and decoders" | |
| ], | |
| "pretraining_objectives": [ | |
| "masked multimodal modeling", | |
| "cross-modal contrastive alignment", | |
| "future-state prediction", | |
| "ego-motion and hand-motion forecasting", | |
| "action and procedure prediction", | |
| "language grounding and captioning", | |
| "contact and affordance prediction", | |
| "optional policy-style targets after action conversion" | |
| ], | |
| "hardware_ranges": [ | |
| { | |
| "goal": "0.3B-1B pilot", | |
| "compute": "8-32 modern 80GB-class data-center GPUs", | |
| "use": "prove objectives and data loaders" | |
| }, | |
| { | |
| "goal": "1B-3B domain model", | |
| "compute": "32-128 GPUs", | |
| "use": "research-scale Xperience representation learning" | |
| }, | |
| { | |
| "goal": "3B-7B full-corpus domain model", | |
| "compute": "128-512 GPUs", | |
| "use": "first realistic full Xperience-native foundation model" | |
| }, | |
| { | |
| "goal": "30B-class omni model from scratch", | |
| "compute": "512-2000+ GPUs", | |
| "use": "lab-scale project after scaling curves justify cost" | |
| } | |
| ] | |
| }, | |
| "model_families": [ | |
| { | |
| "priority": 1, | |
| "family": "Qwen3-Omni", | |
| "category": "omni_instruction_model", | |
| "openness": "open_weights_available_from_official_hf_repo", | |
| "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.", | |
| "xperience10m_fit": [ | |
| "RGB/fisheye video, embedded audio, and language prompts can enter directly.", | |
| "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.", | |
| "Matches current task outputs: labels, structured JSON, captions, and short decisions." | |
| ], | |
| "current_decision": "keep_as_first_pilot", | |
| "entry_condition": "Selected episodes are prepared with a held-out episode split.", | |
| "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" | |
| }, | |
| { | |
| "priority": 2, | |
| "family": "Cosmos 3", | |
| "category": "world_foundation_model", | |
| "openness": "track_official_nvidia_release_and_available_weights", | |
| "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.", | |
| "xperience10m_fit": [ | |
| "Uses video streams as visual state.", | |
| "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.", | |
| "Better aligned with prediction/generation objectives than simple label classification." | |
| ], | |
| "current_decision": "implemented_as_nano_future_window_and_super_forward_dynamics_branches", | |
| "entry_condition": "Use separate metrics for Nano future-window retrieval and Super forward-dynamics MSE; do not compare them directly to Qwen JSON-task accuracy.", | |
| "public_source": "https://www.nvidia.com/en-us/ai/cosmos/" | |
| }, | |
| { | |
| "priority": 3, | |
| "family": "NVIDIA GR00T", | |
| "category": "humanoid_policy_foundation_model", | |
| "openness": "track_official_nvidia_release_and_tooling", | |
| "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.", | |
| "xperience10m_fit": [ | |
| "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.", | |
| "Egocentric video plus human motion can support affordance and interaction tasks." | |
| ], | |
| "current_decision": "track_as_humanoid_policy_branch", | |
| "entry_condition": "Retargeting artifact and action-space definition exist.", | |
| "public_source": "https://developer.nvidia.com/isaac/gr00t" | |
| }, | |
| { | |
| "priority": 4, | |
| "family": "OpenVLA / OpenVLA-OFT", | |
| "category": "vision_language_action_policy", | |
| "openness": "open_project_and_weights", | |
| "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.", | |
| "xperience10m_fit": [ | |
| "Good candidate when each window is expressed as visual observation, instruction/context, and action token.", | |
| "Requires an explicit action target; current human egocentric labels are not robot controls by default." | |
| ], | |
| "current_decision": "candidate_after_action_space_design", | |
| "entry_condition": "Window-to-action-token conversion is implemented and checked.", | |
| "public_source": "https://openvla.github.io/" | |
| }, | |
| { | |
| "priority": 5, | |
| "family": "openpi pi0/pi0.5", | |
| "category": "robot_policy_model", | |
| "openness": "open_source_policy_training_stack", | |
| "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.", | |
| "xperience10m_fit": [ | |
| "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.", | |
| "Better for policy branch than for current structured task JSON outputs." | |
| ], | |
| "current_decision": "candidate_policy_branch", | |
| "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.", | |
| "public_source": "https://github.com/Physical-Intelligence/openpi" | |
| }, | |
| { | |
| "priority": 6, | |
| "family": "Gemini Robotics", | |
| "category": "closed_embodied_reasoning_reference", | |
| "openness": "closed_or_limited_access", | |
| "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.", | |
| "xperience10m_fit": [ | |
| "Can help reason over egocentric scenes and task descriptions.", | |
| "Not a local fine-tune target for this repo." | |
| ], | |
| "current_decision": "external_reference_only", | |
| "entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.", | |
| "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" | |
| }, | |
| { | |
| "priority": 7, | |
| "family": "Octo / SmolVLA-style lightweight policies", | |
| "category": "lightweight_robot_policy_baselines", | |
| "openness": "open_projects", | |
| "best_role": "Cheaper policy baselines for observation-to-action experiments.", | |
| "xperience10m_fit": [ | |
| "Useful after action target design.", | |
| "Less directly omni-modal than Qwen3-Omni or Cosmos 3." | |
| ], | |
| "current_decision": "optional_baseline_after_data_staging", | |
| "entry_condition": "Action labels and baseline protocol exist.", | |
| "public_source": "https://github.com/huggingface/lerobot" | |
| }, | |
| { | |
| "priority": 8, | |
| "family": "Xperience Embodied Foundation Model", | |
| "category": "xperience_native_pretraining_goal", | |
| "openness": "future project-specific model if full-corpus access and compute exist", | |
| "best_role": "Domain model over synchronized embodied experience.", | |
| "xperience10m_fit": [ | |
| "Uses the full aligned modality stack rather than treating sensors as auxiliary metadata.", | |
| "Targets temporal embodied representation learning across perception, motion, geometry, audio, and language.", | |
| "Can become the shared pretraining backbone for Qwen-style instruction tasks, Cosmos-style world modeling, and policy/action branches." | |
| ], | |
| "current_decision": "future_goal_after_scaling_evidence", | |
| "entry_condition": "Full-corpus data path, PB-scale storage, multi-node compute, and positive smaller-run scaling evidence all exist.", | |
| "public_source": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md" | |
| } | |
| ], | |
| "execution_order": [ | |
| { | |
| "step": 1, | |
| "name": "Data gate", | |
| "action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split." | |
| }, | |
| { | |
| "step": 2, | |
| "name": "First held-out baseline", | |
| "action": "Run Qwen3-Omni action/subtask error analysis and targeted reruns to improve the verified diagnostic baseline." | |
| }, | |
| { | |
| "step": 3, | |
| "name": "Model-selection dry run", | |
| "action": "Run 3-8 episode dry runs for any next backbone before scaling beyond the selected split." | |
| }, | |
| { | |
| "step": 4, | |
| "name": "World-model track", | |
| "action": "Promote Cosmos 3 beyond the current Nano compatibility and Super forward-dynamics runs only when loss metrics, preprocessing, and storage justify the added compute." | |
| }, | |
| { | |
| "step": 5, | |
| "name": "Policy branch", | |
| "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable." | |
| }, | |
| { | |
| "step": 6, | |
| "name": "Publishing threshold", | |
| "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples." | |
| }, | |
| { | |
| "step": 7, | |
| "name": "Xperience-native pretraining", | |
| "action": "Start a from-scratch Xperience Embodied Foundation Model only after smaller scaling stages, full-corpus storage, multi-node compute, and held-out evaluation protocols are in place." | |
| } | |
| ], | |
| "evaluation_additions": [ | |
| { | |
| "target": "structured_task_prediction", | |
| "metrics": [ | |
| "JSON validity", | |
| "macro-F1", | |
| "accuracy", | |
| "micro-F1" | |
| ], | |
| "model_families": [ | |
| "Qwen3-Omni", | |
| "Gemini Robotics reference" | |
| ] | |
| }, | |
| { | |
| "target": "future_state_prediction", | |
| "metrics": [ | |
| "retrieval rank", | |
| "temporal consistency", | |
| "feature reconstruction", | |
| "qualitative visual inspection" | |
| ], | |
| "model_families": [ | |
| "Cosmos 3" | |
| ] | |
| }, | |
| { | |
| "target": "action_conditioned_dynamics", | |
| "metrics": [ | |
| "transition accuracy", | |
| "contact accuracy", | |
| "next-action accuracy" | |
| ], | |
| "model_families": [ | |
| "Cosmos 3", | |
| "OpenVLA", | |
| "openpi", | |
| "GR00T" | |
| ] | |
| }, | |
| { | |
| "target": "cross_episode_generalization", | |
| "metrics": [ | |
| "held-out episode metrics", | |
| "held-out session metrics", | |
| "leakage checks" | |
| ], | |
| "model_families": [ | |
| "all trainable branches" | |
| ] | |
| } | |
| ], | |
| "source_links": [ | |
| { | |
| "label": "Qwen3-Omni official HF model", | |
| "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" | |
| }, | |
| { | |
| "label": "NVIDIA Cosmos", | |
| "url": "https://www.nvidia.com/en-us/ai/cosmos/" | |
| }, | |
| { | |
| "label": "NVIDIA Isaac GR00T", | |
| "url": "https://developer.nvidia.com/isaac/gr00t" | |
| }, | |
| { | |
| "label": "OpenVLA", | |
| "url": "https://openvla.github.io/" | |
| }, | |
| { | |
| "label": "openpi", | |
| "url": "https://github.com/Physical-Intelligence/openpi" | |
| }, | |
| { | |
| "label": "Gemini Robotics", | |
| "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" | |
| }, | |
| { | |
| "label": "Octo", | |
| "url": "https://octo-models.github.io/" | |
| }, | |
| { | |
| "label": "LeRobot / SmolVLA", | |
| "url": "https://github.com/huggingface/lerobot" | |
| }, | |
| { | |
| "label": "Xperience Embodied Foundation Model pretraining plan", | |
| "url": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md" | |
| } | |
| ] | |
| } | |