# rSkill manifest — OpenRAL packaging format V1 (CLAUDE.md §6.4)
# Wraps: lerobot/smolvla_metaworld (Apache-2.0)
# Base:  lerobot/smolvla_base (arxiv:2506.01844)
#
# NOTE(review): line structure and nesting were restored from a
# whitespace-collapsed copy of this file — validate against the rSkill
# manifest schema (in particular `control_mode_semantics`,
# `image_preprocessing` and `state_contract` placement) before merging.

# ── Identity ───────────────────────────────────────────────────────────────
schema_version: "0.1"
name: "OpenRAL/rskill-smolvla-metaworld"
# ADR-0060 task-data gate. A *family* entry: the matcher
# (openral_sim.benchmark._task_matches) covers every "metaworld/<task>-v3"
# task id and the bare "metaworld" scene id, so this multi-task MT50
# checkpoint is gated to the whole MetaWorld family (MT10 + MT50 suites, all
# 5 demo scenes) and blocked elsewhere — mirroring how the LIBERO rSkills
# declare "libero_spatial".
evaluated_tasks: ["metaworld"]
version: "0.1.0"
license: "apache-2.0"
role: "s1"
# ADR-00XX: rSkill kind discriminator. "vla" = learnable
# Vision-Language-Action policy.
# TODO: "ADR-00XX" is a placeholder — replace with the real ADR number.
kind: "vla"

# ── Policy identity ────────────────────────────────────────────────────────
model_family: "smolvla"

# ── Compatibility contract ─────────────────────────────────────────────────
# MetaWorld MT50 targets 50 diverse manipulation tasks on a Rethink Sawyer
# arm (NOT a Franka — the underlying lerobot MetaworldEnv wraps a Sawyer URDF).
embodiment_tags:
  - "sawyer"

# MetaWorld renders one corner-view RGB camera (480x480 native, mapped to
# observation.images.camera1 by the metaworld scene adapter).
sensors_required:
  - modality: "rgb"
    vla_feature_key: "observation.images.camera1"
    min_width: 224
    min_height: 224

# Output side (ADR-0013). For the canonical sawyer embodiment the loader
# auto-fills n_dof + vla_action_key from robots/sawyer/robot.yaml.
actuators_required:
  - kind: "joint_position"
    control_mode_semantics:
      mode: "absolute"

# ── Runtime / weights ──────────────────────────────────────────────────────
runtime: "pytorch"
quantization:
  dtype: "bf16"
  backend: "pytorch"

# Pin to the HEAD SHA at packaging time for reproducibility (principle 8).
# NOTE(review): the URI below carries no revision, so it is not pinned yet —
# TODO append the SHA in whatever revision syntax the loader accepts.
weights_uri: "hf://lerobot/smolvla_metaworld"

# ── Preprocessing (all knobs needed to interpret IO) ───────────────────────
processors:
  preprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_preprocessor.json"
  postprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_postprocessor.json"

# MetaWorld uses a single 4-D proprio state (eef_pos + gripper). NO
# adapter-level flip: lerobot's MetaworldEnv already corrects the corner2
# camera's 180° inversion at the source (lerobot/envs/metaworld.py — "corner2
# outputs images with both axes inverted ... correct them",
# `np.flip(image, (0, 1))`), and our backend passes that already-corrected
# frame straight through as camera1. An adapter flip_180 here would
# DOUBLE-flip → upside-down input → near-random success (was 0.052 on
# metaworld_mt50 with flip_180: true). Unlike LIBERO, whose lerobot env does
# NOT pre-flip (so those rSkills need flip_180: true).
image_preprocessing:
  flip_180: false
state_contract:
  dim: 4

# ── Execution semantics ────────────────────────────────────────────────────
chunk_size: 16
# n_action_steps omitted — equals chunk_size, so apply_chunk_replay falls
# through to the SmolVLA family default (full chunk).
latency_budget:
  per_chunk_ms: 150.0  # SmolVLA inference at bf16 on a desktop GPU ≈ 80–120 ms

# ── Provenance ─────────────────────────────────────────────────────────────
# Headline success rate from rskills/smolvla-metaworld/eval/metaworld_mt50.json,
# a locally reproduced `openral benchmark run --suite metaworld_mt50` rollout
# (50 tasks, 1 episode/seed-0; 16/50 solved → avg 0.30). Raise n_episodes for a
# paper-equivalent number.
# NOTE(review): 16/50 is 0.32, not 0.30 — presumably the reported average is
# not a plain per-task mean; confirm against eval/metaworld_mt50.json.
benchmarks:
  metaworld_mt50: 0.3
paper_url: "https://arxiv.org/abs/2506.01844"
dataset_uri: "hf://lerobot/metaworld_mt50"
source_repo: "hf://lerobot/smolvla_metaworld"
description: >
  SmolVLA (0.45 B) finetuned on MetaWorld MT50 — 50 manipulation tasks on a
  Rethink Sawyer arm (MuJoCo via lerobot).
  Runs the MT10/MT50 suites (benchmarks/metaworld_mt{10,50}.yaml) and 5 demo
  scenes (scenes/benchmark/metaworld_*.yaml). Locally reproduced on MT50:
  16/50 solved at 1 ep/seed-0 (avg 0.30); see eval/metaworld_mt50.json.
  4-D proprio / camera1 contract verified against the checkpoint (no adapter
  flip — lerobot's MetaworldEnv already corrects the corner camera's 180°
  inversion).

# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool
# palette so it can pick this skill by what it does (action verb +
# object + scene), not just by its slug.
actions:
  - "generalist"
  - "reach"
  - "push"
  - "pick"
  - "place"
  - "open"
  - "close"
  - "insert"
  - "slide"
objects: []
scenes:
  - "tabletop"

# ADR-0019 — per-checkpoint action contract (consumed by the dataset bridge
# to bind the LeRobot v3 `action` feature shape).
action_contract:
  dim: 4