{
  "title": "Xperience-10M Foundation Model Plan",
  "status": "planning_artifact",
  "current_boundary": "No held-out multi-episode foundation-model result has been completed in this repo. The current foundation-model artifacts are setup-stage until enough valid episodes are staged and evaluated.",
  "decision": {
    "immediate_trainable_backbone": "Qwen3-Omni",
    "first_world_model_branch": "Cosmos 3",
    "first_policy_branch_candidates": [
      "OpenVLA / OpenVLA-OFT",
      "openpi pi0/pi0.5",
      "NVIDIA GR00T"
    ],
    "external_reasoning_reference": "Gemini Robotics"
  },
  "model_families": [
    {
      "priority": 1,
      "family": "Qwen3-Omni",
      "category": "omni_instruction_model",
      "openness": "open_weights_available_from_official_hf_repo",
      "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
      "xperience10m_fit": [
        "RGB/fisheye video, embedded audio, and language prompts can enter directly.",
        "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
        "Matches current task outputs: labels, structured JSON, captions, and short decisions."
      ],
      "current_decision": "keep_as_first_pilot",
      "entry_condition": "Selected episodes are staged with a held-out episode split.",
      "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
    },
    {
      "priority": 2,
      "family": "Cosmos 3",
      "category": "world_foundation_model",
      "openness": "track_official_nvidia_release_and_available_weights",
      "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
      "xperience10m_fit": [
        "Uses video streams as visual state.",
        "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
        "Better aligned with prediction/generation objectives than simple label classification."
      ],
      "current_decision": "add_as_first_world_model_branch_after_data_gate",
      "entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.",
      "public_source": "https://www.nvidia.com/en-us/ai/cosmos/"
    },
    {
      "priority": 3,
      "family": "NVIDIA GR00T",
      "category": "humanoid_policy_foundation_model",
      "openness": "track_official_nvidia_release_and_tooling",
      "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
      "xperience10m_fit": [
        "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
        "Egocentric video plus human motion can support affordance and interaction tasks."
      ],
      "current_decision": "track_as_humanoid_policy_branch",
      "entry_condition": "Retargeting artifact and action-space definition exist.",
      "public_source": "https://developer.nvidia.com/isaac/gr00t"
    },
    {
      "priority": 4,
      "family": "OpenVLA / OpenVLA-OFT",
      "category": "vision_language_action_policy",
      "openness": "open_project_and_weights",
      "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
      "xperience10m_fit": [
        "Good candidate when each window is expressed as a visual observation, an instruction/context, and an action token.",
        "Requires an explicit action target; current human egocentric labels are not robot controls by default."
      ],
      "current_decision": "candidate_after_action_space_design",
      "entry_condition": "Window-to-action-token conversion is implemented and audited.",
      "public_source": "https://openvla.github.io/"
    },
    {
      "priority": 5,
      "family": "openpi pi0/pi0.5",
      "category": "robot_policy_model",
      "openness": "open_source_policy_training_stack",
      "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
      "xperience10m_fit": [
        "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
        "Better for policy branch than for current structured task JSON outputs."
      ],
      "current_decision": "candidate_policy_branch",
      "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
      "public_source": "https://github.com/Physical-Intelligence/openpi"
    },
    {
      "priority": 6,
      "family": "Gemini Robotics",
      "category": "closed_embodied_reasoning_reference",
      "openness": "closed_or_limited_access",
      "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
      "xperience10m_fit": [
        "Can help reason over egocentric scenes and task descriptions.",
        "Not a local fine-tune target for this repo."
      ],
      "current_decision": "external_reference_only",
      "entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
      "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
    },
    {
      "priority": 7,
      "family": "Octo / SmolVLA-style lightweight policies",
      "category": "lightweight_robot_policy_baselines",
      "openness": "open_projects",
      "best_role": "Cheaper policy baselines for observation-to-action experiments.",
      "xperience10m_fit": [
        "Useful after action target design.",
        "Less directly omni-modal than Qwen3-Omni or Cosmos 3."
      ],
      "current_decision": "optional_baseline_after_data_staging",
      "entry_condition": "Action labels and baseline protocol exist.",
      "public_source": "https://github.com/huggingface/lerobot"
    }
  ],
  "execution_order": [
    {
      "step": 1,
      "name": "Data gate",
      "action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split."
    },
    {
      "step": 2,
      "name": "First held-out baseline",
      "action": "Run Qwen3-Omni LoRA to establish the full train/eval loop."
    },
    {
      "step": 3,
      "name": "Model-selection dry run",
      "action": "Run 3-8 episode dry runs for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate."
    },
    {
      "step": 4,
      "name": "World-model branch",
      "action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits storage and compute."
    },
    {
      "step": 5,
      "name": "Policy branch",
      "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable."
    },
    {
      "step": 6,
      "name": "Publication rule",
      "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples."
    }
  ],
  "evaluation_additions": [
    {
      "target": "structured_task_prediction",
      "metrics": [
        "JSON validity",
        "macro-F1",
        "accuracy",
        "micro-F1"
      ],
      "model_families": [
        "Qwen3-Omni",
        "Gemini Robotics reference"
      ]
    },
    {
      "target": "future_state_prediction",
      "metrics": [
        "retrieval rank",
        "temporal consistency",
        "feature reconstruction",
        "qualitative visual inspection"
      ],
      "model_families": [
        "Cosmos 3"
      ]
    },
    {
      "target": "action_conditioned_dynamics",
      "metrics": [
        "transition accuracy",
        "contact accuracy",
        "next-action accuracy"
      ],
      "model_families": [
        "Cosmos 3",
        "OpenVLA",
        "openpi",
        "GR00T"
      ]
    },
    {
      "target": "cross_episode_generalization",
      "metrics": [
        "held-out episode metrics",
        "held-out session metrics",
        "leakage audit"
      ],
      "model_families": [
        "all trainable branches"
      ]
    }
  ],
  "source_links": [
    {
      "label": "Qwen3-Omni official HF model",
      "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
    },
    {
      "label": "NVIDIA Cosmos",
      "url": "https://www.nvidia.com/en-us/ai/cosmos/"
    },
    {
      "label": "NVIDIA Isaac GR00T",
      "url": "https://developer.nvidia.com/isaac/gr00t"
    },
    {
      "label": "OpenVLA",
      "url": "https://openvla.github.io/"
    },
    {
      "label": "openpi",
      "url": "https://github.com/Physical-Intelligence/openpi"
    },
    {
      "label": "Gemini Robotics",
      "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
    },
    {
      "label": "Octo",
      "url": "https://octo-models.github.io/"
    },
    {
      "label": "LeRobot / SmolVLA",
      "url": "https://github.com/huggingface/lerobot"
    }
  ]
}