chore: publish rSkill OpenRAL/rskill-smolvla-metaworld v0.1.0

0e22d27 verified 5 days ago

5.55 kB

	# rSkill manifest — OpenRAL packaging format V1 (CLAUDE.md §6.4)
	# Wraps: lerobot/smolvla_metaworld (Apache-2.0)
	# Base: lerobot/smolvla_base (arxiv:2506.01844)

	# ── Identity ───────────────────────────────────────────────────────────────
	schema_version: "0.1"
	name: "OpenRAL/rskill-smolvla-metaworld"
	# ADR-0060 task-data gate. "metaworld" is a family entry: the matcher
	# (openral_sim.benchmark._task_matches) covers every "metaworld/<task>-v3"
	# task id and the bare "metaworld" scene id, so this multi-task MT50
	# checkpoint is gated to the whole MetaWorld family (MT10 + MT50 suites, all
	# 5 demo scenes) and blocked elsewhere — mirroring how the LIBERO rSkills
	# declare "libero_spatial".
	evaluated_tasks:
	  - "metaworld"
	version: "0.1.0"
	license: "apache-2.0"
	role: "s1"
	# ADR-00XX: rSkill kind discriminator.
	# "vla" = learnable Vision-Language-Action policy.
	kind: "vla"

	# ── Policy identity ────────────────────────────────────────────────────────
	model_family: "smolvla"

	# ── Compatibility contract ─────────────────────────────────────────────────
	# MetaWorld MT50 is 50 diverse manipulation tasks on a Rethink Sawyer arm.
	# It is NOT a Franka: the underlying lerobot MetaworldEnv wraps a Sawyer URDF.
	embodiment_tags:
	  - "sawyer"

	# MetaWorld renders one corner-view RGB camera (480x480 native); the metaworld
	# scene adapter maps it to observation.images.camera1.
	# All four fields below belong to the single camera entry.
	sensors_required:
	  - modality: "rgb"
	    vla_feature_key: "observation.images.camera1"
	    min_width: 224
	    min_height: 224

	# Output side (ADR-0013). For the canonical sawyer embodiment the loader
	# auto-fills n_dof + vla_action_key from robots/sawyer/robot.yaml, so only the
	# actuator kind and its control-mode semantics are declared here.
	actuators_required:
	  - kind: "joint_position"
	    control_mode_semantics:
	      mode: "absolute"

	# ── Runtime / weights ──────────────────────────────────────────────────────
	runtime: "pytorch"
	# NOTE(review): `backend` is restored as a child of `quantization` (next to
	# `dtype`) — confirm against the manifest schema.
	quantization:
	  dtype: "bf16"
	  backend: "pytorch"
	# Pin to the HEAD SHA at packaging time for reproducibility (principle 8).
	# NOTE(review): the URI below carries no revision/SHA, so it is not pinned
	# yet — confirm the pinning syntax the loader accepts and add the SHA.
	weights_uri: "hf://lerobot/smolvla_metaworld"

	# ── Preprocessing (all knobs needed to interpret IO) ───────────────────────
	processors:
	  preprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_preprocessor.json"
	  postprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_postprocessor.json"
	# NOTE(review): `image_preprocessing` and `state_contract` are restored as
	# top-level keys (like `action_contract`) — confirm against the manifest
	# schema that they are not children of `processors`.
	#
	# NO adapter-level flip: lerobot's MetaworldEnv already corrects the corner2
	# camera's 180° inversion at the source (lerobot/envs/metaworld.py — "corner2
	# outputs images with both axes inverted ... correct them",
	# `np.flip(image, (0, 1))`), and our backend passes that already-corrected
	# frame straight through as camera1. An adapter flip_180 here would
	# DOUBLE-flip → upside-down input → near-random success (was 0.052 on
	# metaworld_mt50 with flip_180: true). This differs from LIBERO, whose lerobot
	# env does NOT pre-flip (so those rSkills need flip_180: true).
	image_preprocessing:
	  flip_180: false
	# MetaWorld uses a single 4-D proprio state (eef_pos + gripper).
	state_contract:
	  dim: 4

	# ── Execution semantics ────────────────────────────────────────────────────
	chunk_size: 16
	# n_action_steps omitted — equals chunk_size, so apply_chunk_replay falls
	# through to the SmolVLA family default (full chunk).
	latency_budget:
	  # SmolVLA inference at bf16 on a desktop GPU ≈ 80–120 ms, so 150 ms leaves
	  # headroom above the typical per-chunk latency.
	  per_chunk_ms: 150.0

	# ── Provenance ─────────────────────────────────────────────────────────────
	# Headline success rate from rskills/smolvla-metaworld/eval/metaworld_mt50.json,
	# a locally reproduced `openral benchmark run --suite metaworld_mt50` rollout
	# (50 tasks, 1 episode/seed-0; 16/50 solved → avg 0.30). Raise n_episodes for a
	# paper-equivalent number.
	# NOTE(review): a flat per-task mean of 16/50 is 0.32, not 0.30 — confirm
	# against eval/metaworld_mt50.json whether the solved count is off or the
	# average uses a different aggregation, then align this comment, the value
	# below and the description.
	benchmarks:
	  metaworld_mt50: 0.3

	paper_url: "https://arxiv.org/abs/2506.01844"
	dataset_uri: "hf://lerobot/metaworld_mt50"
	source_repo: "hf://lerobot/smolvla_metaworld"

	# Folded scalar: the indented lines below are joined into one paragraph.
	description: >
	  SmolVLA (0.45 B) finetuned on MetaWorld MT50 — 50 manipulation tasks on a
	  Rethink Sawyer arm (MuJoCo via lerobot). Runs the MT10/MT50 suites
	  (benchmarks/metaworld_mt{10,50}.yaml) and 5 demo scenes
	  (scenes/benchmark/metaworld_*.yaml). Locally reproduced on MT50: 16/50
	  solved at 1 ep/seed-0 (avg 0.30); see eval/metaworld_mt50.json. 4-D
	  proprio / camera1 contract verified against the checkpoint (no adapter flip —
	  lerobot's MetaworldEnv already corrects the corner camera's 180° inversion).

	# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool palette, so
	# the reasoner can pick this skill by what it does (action verb + object +
	# scene) rather than only by its slug.
	actions:
	  - "generalist"
	  - "reach"
	  - "push"
	  - "pick"
	  - "place"
	  - "open"
	  - "close"
	  - "insert"
	  - "slide"
	# Explicitly empty list (a bare `objects:` would parse as null).
	objects: []
	scenes:
	  - "tabletop"

	# ADR-0019 — per-checkpoint action contract (consumed by the dataset bridge
	# to bind the LeRobot v3 `action` feature shape).
	# `dim` is nested so it does not collide with any other `dim` key in the file.
	action_contract:
	  dim: 4