# Web file-viewer header captured with this copy (not part of the manifest):
#   author: AdrianLlopart
#   commit: 0e22d27 (verified) — "chore: publish rSkill OpenRAL/rskill-smolvla-metaworld v0.1.0"
#   size: 5.55 kB
# rSkill manifest — OpenRAL packaging format V1 (CLAUDE.md §6.4)
# Wraps: lerobot/smolvla_metaworld (Apache-2.0)
# Base: lerobot/smolvla_base (arxiv:2506.01844)
# ── Identity ───────────────────────────────────────────────────────────────
schema_version: "0.1"
name: "OpenRAL/rskill-smolvla-metaworld"
# ADR-0060 task-data gate. "metaworld" is a *family* entry: the matcher
# (openral_sim.benchmark._task_matches) covers every "metaworld/<task>-v3"
# task id and the bare "metaworld" scene id, so this multi-task MT50
# checkpoint is gated to the whole MetaWorld family (MT10 + MT50 suites, all
# 5 demo scenes) and blocked elsewhere — mirroring how the LIBERO rSkills
# declare "libero_spatial".
evaluated_tasks:
  - "metaworld"
version: "0.1.0"
license: "apache-2.0"
role: "s1"
# ADR-00XX: rSkill kind discriminator.
# "vla" = learnable Vision-Language-Action policy.
# NOTE(review): the ADR number is still the placeholder "00XX" — fill it in
# once the ADR is assigned.
kind: "vla"
# ── Policy identity ────────────────────────────────────────────────────────
model_family: "smolvla"
# ── Compatibility contract ─────────────────────────────────────────────────
# MetaWorld MT50 targets 50 diverse manipulation tasks on a Rethink Sawyer
# arm (NOT a Franka — the underlying lerobot MetaworldEnv wraps a Sawyer URDF).
embodiment_tags:
  - "sawyer"
# MetaWorld renders one corner-view RGB camera (480x480 native, mapped to
# observation.images.camera1 by the metaworld scene adapter).
# The four keys below describe that single camera requirement, so they are
# nested inside one list entry.
sensors_required:
  - modality: "rgb"
    vla_feature_key: "observation.images.camera1"
    min_width: 224
    min_height: 224
# Output side (ADR-0013). For the canonical sawyer embodiment the loader
# auto-fills n_dof + vla_action_key from robots/sawyer/robot.yaml.
actuators_required:
  - kind: "joint_position"
# NOTE(review): this copy of the manifest lost its indentation, so it is not
# visible whether control_mode_semantics belongs to the actuator entry above
# or is a top-level key; it is kept at top level here — confirm against the
# manifest schema.
control_mode_semantics:
  mode: "absolute"
# ── Runtime / weights ──────────────────────────────────────────────────────
runtime: "pytorch"
# dtype and backend are grouped under quantization.
quantization:
  dtype: "bf16"
  backend: "pytorch"
# Pin to the HEAD SHA at packaging time for reproducibility (principle 8).
# NOTE(review): the URI below carries no revision/SHA, so it is not actually
# pinned as the line above asks — TODO add the packaged revision.
weights_uri: "hf://lerobot/smolvla_metaworld"
# ── Preprocessing (all knobs needed to interpret IO) ───────────────────────
# Locations of the pre- and post-processor definitions for this policy.
processors:
  preprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_preprocessor.json"
  postprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_postprocessor.json"
# MetaWorld uses a single 4-D proprio state (eef_pos + gripper). NO adapter-level
# flip: lerobot's MetaworldEnv already corrects the corner2 camera's 180°
# inversion at the source (lerobot/envs/metaworld.py — "corner2 outputs images
# with both axes inverted ... correct them", `np.flip(image, (0, 1))`), and our
# backend passes that already-corrected frame straight through as camera1. An
# adapter flip_180 here would DOUBLE-flip → upside-down input → near-random
# success (was 0.052 on metaworld_mt50 with flip_180: true). Unlike LIBERO,
# whose lerobot env does NOT pre-flip (so those rSkills need flip_180: true).
image_preprocessing:
  flip_180: false
# Width of the proprio state vector described above.
state_contract:
  dim: 4
# ── Execution semantics ────────────────────────────────────────────────────
chunk_size: 16
# n_action_steps omitted — equals chunk_size, so apply_chunk_replay falls
# through to the SmolVLA family default (full chunk).
latency_budget:
  per_chunk_ms: 150.0  # SmolVLA inference at bf16 on a desktop GPU ≈ 80–120 ms
# ── Provenance ─────────────────────────────────────────────────────────────
# Headline success rate from rskills/smolvla-metaworld/eval/metaworld_mt50.json,
# a locally reproduced `openral benchmark run --suite metaworld_mt50` rollout
# (50 tasks, 1 episode/seed-0; 16/50 solved → avg 0.30). Raise n_episodes for a
# paper-equivalent number.
# NOTE(review): 16/50 is 0.32, not 0.30 — either the solved count or the
# average (and the value recorded below) looks off; confirm against the eval
# JSON before relying on it.
benchmarks:
  metaworld_mt50: 0.3
paper_url: "https://arxiv.org/abs/2506.01844"
dataset_uri: "hf://lerobot/metaworld_mt50"
source_repo: "hf://lerobot/smolvla_metaworld"
# Human-readable summary. Folded scalar: the indented lines below are joined
# with spaces into a single paragraph.
# NOTE(review): the text states "16/50 solved ... (avg 0.30)", but 16/50 is
# 0.32 — confirm the figures against eval/metaworld_mt50.json.
description: >
  SmolVLA (0.45 B) finetuned on MetaWorld MT50 — 50 manipulation tasks on a
  Rethink Sawyer arm (MuJoCo via lerobot). Runs the MT10/MT50 suites
  (benchmarks/metaworld_mt{10,50}.yaml) and 5 demo scenes
  (scenes/benchmark/metaworld_*.yaml). Locally reproduced on MT50: 16/50
  solved at 1 ep/seed-0 (avg 0.30); see eval/metaworld_mt50.json. 4-D
  proprio / camera1 contract verified against the checkpoint (no adapter flip —
  lerobot's MetaworldEnv already corrects the corner camera's 180° inversion).
# ADR-0022 — vocabulary exposed to the reasoner LLM's tool palette, so the
# skill can be selected by what it does (action verb + object + scene)
# and not only by its slug.
actions:
  - "generalist"
  - "reach"
  - "push"
  - "pick"
  - "place"
  - "open"
  - "close"
  - "insert"
  - "slide"
# Explicitly empty: no object vocabulary is declared for this skill.
objects: []
scenes:
  - "tabletop"
# ADR-0019 — per-checkpoint action contract (consumed by the dataset bridge
# to bind the LeRobot v3 `action` feature shape).
action_contract:
  dim: 4