Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Xperience-10M Foundation Model Plan", | |
| "status": "planning_artifact", | |
| "current_boundary": "No held-out multi-episode foundation-model result has been completed in this repo. The current foundation-model artifacts remain at the setup stage until enough valid episodes are staged and evaluated.", | |
| "decision": { | |
| "immediate_trainable_backbone": "Qwen3-Omni", | |
| "first_world_model_branch": "Cosmos 3", | |
| "first_policy_branch_candidates": [ | |
| "OpenVLA / OpenVLA-OFT", | |
| "openpi pi0/pi0.5", | |
| "NVIDIA GR00T" | |
| ], | |
| "external_reasoning_reference": "Gemini Robotics" | |
| }, | |
| "model_families": [ | |
| { | |
| "priority": 1, | |
| "family": "Qwen3-Omni", | |
| "category": "omni_instruction_model", | |
| "openness": "open_weights_available_from_official_hf_repo", | |
| "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.", | |
| "xperience10m_fit": [ | |
| "RGB/fisheye video, embedded audio, and language prompts can enter directly.", | |
| "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.", | |
| "Matches current task outputs: labels, structured JSON, captions, and short decisions." | |
| ], | |
| "current_decision": "keep_as_first_pilot", | |
| "entry_condition": "Selected episodes staged with held-out episode split.", | |
| "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" | |
| }, | |
| { | |
| "priority": 2, | |
| "family": "Cosmos 3", | |
| "category": "world_foundation_model", | |
| "openness": "track_official_nvidia_release_and_available_weights", | |
| "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.", | |
| "xperience10m_fit": [ | |
| "Uses video streams as visual state.", | |
| "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.", | |
| "Better aligned with prediction/generation objectives than simple label classification." | |
| ], | |
| "current_decision": "add_as_first_world_model_branch_after_data_gate", | |
| "entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.", | |
| "public_source": "https://www.nvidia.com/en-us/ai/cosmos/" | |
| }, | |
| { | |
| "priority": 3, | |
| "family": "NVIDIA GR00T", | |
| "category": "humanoid_policy_foundation_model", | |
| "openness": "track_official_nvidia_release_and_tooling", | |
| "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.", | |
| "xperience10m_fit": [ | |
| "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.", | |
| "Egocentric video plus human motion can support affordance and interaction tasks." | |
| ], | |
| "current_decision": "track_as_humanoid_policy_branch", | |
| "entry_condition": "Retargeting artifact and action-space definition exist.", | |
| "public_source": "https://developer.nvidia.com/isaac/gr00t" | |
| }, | |
| { | |
| "priority": 4, | |
| "family": "OpenVLA / OpenVLA-OFT", | |
| "category": "vision_language_action_policy", | |
| "openness": "open_project_and_weights", | |
| "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.", | |
| "xperience10m_fit": [ | |
| "Good candidate when each window is expressed as a visual observation, an instruction/context, and an action token.", | |
| "Requires an explicit action target; current human egocentric labels are not robot controls by default." | |
| ], | |
| "current_decision": "candidate_after_action_space_design", | |
| "entry_condition": "Window-to-action-token conversion is implemented and audited.", | |
| "public_source": "https://openvla.github.io/" | |
| }, | |
| { | |
| "priority": 5, | |
| "family": "openpi pi0/pi0.5", | |
| "category": "robot_policy_model", | |
| "openness": "open_source_policy_training_stack", | |
| "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.", | |
| "xperience10m_fit": [ | |
| "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.", | |
| "Better suited to the policy branch than to the current structured task JSON outputs." | |
| ], | |
| "current_decision": "candidate_policy_branch", | |
| "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.", | |
| "public_source": "https://github.com/Physical-Intelligence/openpi" | |
| }, | |
| { | |
| "priority": 6, | |
| "family": "Gemini Robotics", | |
| "category": "closed_embodied_reasoning_reference", | |
| "openness": "closed_or_limited_access", | |
| "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.", | |
| "xperience10m_fit": [ | |
| "Can help reason over egocentric scenes and task descriptions.", | |
| "Not a local fine-tune target for this repo." | |
| ], | |
| "current_decision": "external_reference_only", | |
| "entry_condition": "API access exists and outputs are logged separately from trainable-model metrics.", | |
| "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" | |
| }, | |
| { | |
| "priority": 7, | |
| "family": "Octo / SmolVLA-style lightweight policies", | |
| "category": "lightweight_robot_policy_baselines", | |
| "openness": "open_projects", | |
| "best_role": "Cheaper policy baselines for observation-to-action experiments.", | |
| "xperience10m_fit": [ | |
| "Useful after action target design.", | |
| "Less directly omni-modal than Qwen3-Omni or Cosmos 3." | |
| ], | |
| "current_decision": "optional_baseline_after_data_staging", | |
| "entry_condition": "Action labels and baseline protocol exist.", | |
| "public_source": "https://github.com/huggingface/lerobot" | |
| } | |
| ], | |
| "execution_order": [ | |
| { | |
| "step": 1, | |
| "name": "Data gate", | |
| "action": "Stage at least 32 valid Xperience-10M episodes with held-out episode split." | |
| }, | |
| { | |
| "step": 2, | |
| "name": "First held-out baseline", | |
| "action": "Run Qwen3-Omni LoRA to establish the full train/eval loop." | |
| }, | |
| { | |
| "step": 3, | |
| "name": "Model-selection dry run", | |
| "action": "Run dry runs of 3-8 episodes each for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate." | |
| }, | |
| { | |
| "step": 4, | |
| "name": "World-model branch", | |
| "action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits within available storage and compute." | |
| }, | |
| { | |
| "step": 5, | |
| "name": "Policy branch", | |
| "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable." | |
| }, | |
| { | |
| "step": 6, | |
| "name": "Publication rule", | |
| "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples." | |
| } | |
| ], | |
| "evaluation_additions": [ | |
| { | |
| "target": "structured_task_prediction", | |
| "metrics": [ | |
| "JSON validity", | |
| "macro-F1", | |
| "accuracy", | |
| "micro-F1" | |
| ], | |
| "model_families": [ | |
| "Qwen3-Omni", | |
| "Gemini Robotics reference" | |
| ] | |
| }, | |
| { | |
| "target": "future_state_prediction", | |
| "metrics": [ | |
| "retrieval rank", | |
| "temporal consistency", | |
| "feature reconstruction", | |
| "qualitative visual inspection" | |
| ], | |
| "model_families": [ | |
| "Cosmos 3" | |
| ] | |
| }, | |
| { | |
| "target": "action_conditioned_dynamics", | |
| "metrics": [ | |
| "transition accuracy", | |
| "contact accuracy", | |
| "next-action accuracy" | |
| ], | |
| "model_families": [ | |
| "Cosmos 3", | |
| "OpenVLA / OpenVLA-OFT", | |
| "openpi pi0/pi0.5", | |
| "NVIDIA GR00T" | |
| ] | |
| }, | |
| { | |
| "target": "cross_episode_generalization", | |
| "metrics": [ | |
| "held-out episode metrics", | |
| "held-out session metrics", | |
| "leakage audit" | |
| ], | |
| "model_families": [ | |
| "all trainable branches" | |
| ] | |
| } | |
| ], | |
| "source_links": [ | |
| { | |
| "label": "Qwen3-Omni official HF model", | |
| "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" | |
| }, | |
| { | |
| "label": "NVIDIA Cosmos", | |
| "url": "https://www.nvidia.com/en-us/ai/cosmos/" | |
| }, | |
| { | |
| "label": "NVIDIA Isaac GR00T", | |
| "url": "https://developer.nvidia.com/isaac/gr00t" | |
| }, | |
| { | |
| "label": "OpenVLA", | |
| "url": "https://openvla.github.io/" | |
| }, | |
| { | |
| "label": "openpi", | |
| "url": "https://github.com/Physical-Intelligence/openpi" | |
| }, | |
| { | |
| "label": "Gemini Robotics", | |
| "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" | |
| }, | |
| { | |
| "label": "Octo", | |
| "url": "https://octo-models.github.io/" | |
| }, | |
| { | |
| "label": "LeRobot / SmolVLA", | |
| "url": "https://github.com/huggingface/lerobot" | |
| } | |
| ] | |
| } | |