# rSkill manifest — OpenRAL packaging format V1 (CLAUDE.md §6.4)
# Wraps: lerobot/smolvla_metaworld (Apache-2.0)
# Base:  lerobot/smolvla_base  (arxiv:2506.01844)

# ── Identity ───────────────────────────────────────────────────────────────
schema_version: "0.1"
name: "OpenRAL/rskill-smolvla-metaworld"
# ADR-0060 task-data gate. "metaworld" is a *family* entry: the matcher
# (openral_sim.benchmark._task_matches) covers every "metaworld/<task>-v3"
# task id and the bare "metaworld" scene id, so this multi-task MT50 checkpoint
# is gated to the whole MetaWorld family (MT10 + MT50 suites, all 5 demo
# scenes) and blocked elsewhere — mirroring how the LIBERO rSkills declare
# "libero_spatial".
evaluated_tasks: ["metaworld"]
version: "0.1.0"
license: "apache-2.0"
role: "s1"
# ADR-00XX: rSkill kind discriminator. "vla" = learnable Vision-Language-Action
# policy.
# TODO: "ADR-00XX" is a placeholder — replace it with the real ADR number.
kind: "vla"

# ── Policy identity ────────────────────────────────────────────────────────
# Model family of the wrapped checkpoint (lerobot/smolvla_metaworld is a
# finetune of lerobot/smolvla_base).
model_family: "smolvla"

# ── Compatibility contract ─────────────────────────────────────────────────
# MetaWorld MT50 is 50 diverse manipulation tasks performed by a Rethink Sawyer
# arm. It is NOT a Franka: the underlying lerobot MetaworldEnv wraps a Sawyer
# URDF, so "sawyer" is the only embodiment this checkpoint is tagged for.
embodiment_tags:
  - "sawyer"

# MetaWorld renders one corner-view RGB camera (480x480 native, mapped to
# observation.images.camera1 by the metaworld scene adapter). The 224x224
# minimum below is therefore satisfied by the native render.
sensors_required:
  - modality: "rgb"
    vla_feature_key: "observation.images.camera1"
    min_width: 224
    min_height: 224

# Output side (ADR-0013). For the canonical sawyer embodiment the loader
# auto-fills n_dof + vla_action_key from robots/sawyer/robot.yaml.
# NOTE(review): action_contract.dim in this manifest is 4 and state_contract
# describes an end-effector state (eef_pos + gripper). Confirm that
# "joint_position" / "absolute" and the auto-filled n_dof really describe this
# checkpoint's 4-D action rather than an end-effector command.
actuators_required:
  - kind: "joint_position"
    control_mode_semantics:
      mode: "absolute"

# ── Runtime / weights ──────────────────────────────────────────────────────
runtime: "pytorch"
quantization:
  dtype: "bf16"
  backend: "pytorch"
# Pin to the HEAD SHA at packaging time for reproducibility (principle 8).
# TODO: the URI below carries no revision, so it is not pinned yet — add the
# commit SHA when packaging.
weights_uri: "hf://lerobot/smolvla_metaworld"

# ── Preprocessing (all knobs needed to interpret IO) ───────────────────────
# Both processor configs live in the same hf://lerobot/smolvla_metaworld repo
# as weights_uri.
processors:
  preprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_preprocessor.json"
  postprocessor_uri: "hf://lerobot/smolvla_metaworld/policy_postprocessor.json"
# NO adapter-level flip: lerobot's MetaworldEnv already corrects the corner2
# camera's 180° inversion at the source (lerobot/envs/metaworld.py — "corner2
# outputs images with both axes inverted ... correct them",
# `np.flip(image, (0, 1))`), and our backend passes that already-corrected
# frame straight through as camera1. An adapter flip_180 here would DOUBLE-flip
# → upside-down input → near-random success (was 0.052 on metaworld_mt50 with
# flip_180: true). This differs from LIBERO, whose lerobot env does NOT
# pre-flip, so those rSkills need flip_180: true.
image_preprocessing:
  flip_180: false
# MetaWorld uses a single 4-D proprio state (eef_pos + gripper).
state_contract:
  dim: 4

# ── Execution semantics ────────────────────────────────────────────────────
chunk_size: 16
# n_action_steps is deliberately omitted: it would equal chunk_size, and
# without it apply_chunk_replay falls through to the SmolVLA family default
# (full chunk).
latency_budget:
  # The expected ≈ 80–120 ms per inference leaves headroom under this budget.
  per_chunk_ms: 150.0   # SmolVLA inference at bf16 on a desktop GPU ≈ 80–120 ms

# ── Provenance ─────────────────────────────────────────────────────────────
# Headline success rate from rskills/smolvla-metaworld/eval/metaworld_mt50.json,
# a locally reproduced `openral benchmark run --suite metaworld_mt50` rollout
# (50 tasks, 1 episode/seed-0; 16/50 solved → avg 0.30). Raise n_episodes for a
# paper-equivalent number.
# NOTE(review): 16/50 is 0.32, not 0.30. Either the solved count or the average
# is off, or "avg" is not a plain per-task mean (e.g. a per-group average) —
# check eval/metaworld_mt50.json and reconcile this comment, `benchmarks`, and
# `description` together.
benchmarks:
  metaworld_mt50: 0.3

paper_url: "https://arxiv.org/abs/2506.01844"
dataset_uri: "hf://lerobot/metaworld_mt50"
source_repo: "hf://lerobot/smolvla_metaworld"

# Human-readable summary. Folded scalar: the line breaks below become spaces.
description: >
  SmolVLA (0.45 B) finetuned on MetaWorld MT50 — 50 manipulation tasks on a
  Rethink Sawyer arm (MuJoCo via lerobot). Runs the MT10/MT50 suites
  (benchmarks/metaworld_mt{10,50}.yaml) and 5 demo scenes
  (scenes/benchmark/metaworld_*.yaml). Locally reproduced on MT50: 16/50
  solved at 1 ep/seed-0 (avg 0.30); see eval/metaworld_mt50.json. 4-D
  proprio / camera1 contract verified against the checkpoint (no adapter flip —
  lerobot's MetaworldEnv already corrects the corner camera's 180° inversion).

# ADR-0022 — action vocabulary surfaced to the reasoner LLM tool
# palette so it can pick this skill by what it does (action verb +
# object + scene), not just by its slug.
actions:
  - "generalist"
  - "reach"
  - "push"
  - "pick"
  - "place"
  - "open"
  - "close"
  - "insert"
  - "slide"
# NOTE(review): the object vocabulary is empty, so the reasoner gets no object
# terms for this skill — confirm that is intended for a generalist checkpoint.
objects: []
scenes:
  - "tabletop"

# ADR-0019 — per-checkpoint action contract (consumed by the dataset bridge
# to bind the LeRobot v3 `action` feature shape).
# dim is the width of that `action` feature; presumably (dx, dy, dz, gripper)
# as in MetaWorld — TODO confirm against the checkpoint.
action_contract:
  dim: 4