Duplicate from AdrianLlopart/rskill-molmoact2-libero-nf4

Browse files

Files changed (14) hide show

.gitattributes +36 -0
README.md +207 -0
config.json +153 -0
configuration_molmoact2.py +543 -0
generation_config.json +6 -0
inference.py +768 -0
model.safetensors +3 -0
modeling_molmoact2.py +0 -0
norm_stats.json +238 -0
processing_molmoact2.py +418 -0
processor_config.json +85 -0
quantization_metadata.json +14 -0
tokenizer.json +3 -0
tokenizer_config.json +34 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+library_name: transformers
+tags:
+- molmoact2
+- robotics
+- image-text-to-text
+- libero
+- bitsandbytes
+- nf4
+- 4-bit
+base_model: allenai/MolmoAct2-LIBERO
+base_model_relation: quantized
+---
+> ⚠️ **NF4-quantized fork for OpenRAL.** This repository is a **4-bit bitsandbytes NF4**
+> quantization (compute dtype bf16) of [`allenai/MolmoAct2-LIBERO`](https://hf.co/allenai/MolmoAct2-LIBERO), produced for OpenRAL's
+> ≤8 GiB-VRAM robot deployment. The authoritative scheme is in
+> [`quantization_metadata.json`](./quantization_metadata.json) (`scheme: nf4`). The Hub's
+> auto-detected **8-bit** tag is approximate (it reflects the packed bitsandbytes uint8 storage);
+> the applied scheme is **NF4 (4-bit)**. Load via OpenRAL's `load_prequantized_state_for_rskill`.
+<img src="assets/MolmoAct2.svg" alt="MolmoAct Logo" height="50">
+# **MolmoAct2-LIBERO**
+MolmoAct2 is an open vision-language-action model for robot control. It builds on Molmo2-ER and attaches a flow-matching continuous action expert that conditions on the VLM key-value cache through a per-layer connection.
+This checkpoint is fine-tuned on the full LIBERO training mixture, combining Spatial, Object, Goal, and Long suites. It is intended for both further fine-tuning and LIBERO policy inference.
+## Quick Links
+- 📂 Models: [Models](https://huggingface.co/collections/allenai/molmoact2-models), [Finetuned Models](https://huggingface.co/collections/allenai/molmoact2-finetuned-models)
+- 📂 Datasets: [MolmoAct2-BimanualYAM Dataset](https://huggingface.co/collections/allenai/molmoact2-datasets), [MolmoAct2 Datasets](https://huggingface.co/collections/allenai/molmoact2-datasets), [Molmo2-ER Datasets](https://huggingface.co/collections/allenai/molmo2-er-datasets)
+- 📄 Paper: [arXiv:2605.02881](https://arxiv.org/abs/2605.02881)
+- 💻 Code: [allenai/molmoact2](https://github.com/allenai/molmoact2)
+- 🎥 Blog Post: [MolmoAct2](https://allenai.org/blog/molmoact2)
+## Intended Use
+Use this checkpoint for LIBERO inference or for further fine-tuning. Dataset normalization metadata is stored in `norm_stats.json`; pass `norm_tag="libero"` at inference time.
+Continuous action prediction is the intended and recommended inference mode. Discrete action prediction is exposed for parity and debugging, but we use continuous actions by default.
+## Install
+```bash
+pip install torch transformers pillow numpy huggingface_hub
+```
+## Sample Input
+This sample comes from `libero_10`, episode 0, frame 0. The LIBERO camera order is front/agent view followed by wrist view.
+| Agentview RGB | Wrist RGB |
+| --- | --- |
+| ![Sample agentview RGB](assets/sample_agentview_rgb.png) | ![Sample wrist RGB](assets/sample_wrist_rgb.png) |
+```python
+from huggingface_hub import hf_hub_download
+from PIL import Image
+import numpy as np
+repo_id = "allenai/MolmoAct2-LIBERO"
+agentview_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_agentview_rgb.png")
+).convert("RGB")
+wrist_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_wrist_rgb.png")
+).convert("RGB")
+task = "put the white mug on the left plate and put the yellow and white mug on the right plate"
+robot_state = np.array(
+    [
+        -0.05338004603981972,
+        0.007029631175100803,
+        0.6783280968666077,
+        3.1407692432403564,
+        0.0017593271331861615,
+        -0.08994418382644653,
+        0.03878866136074066,
+        -0.03878721222281456,
+    ],
+    dtype=np.float32,
+)
+```
+## Continuous Actions
+```python
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+repo_id = "allenai/MolmoAct2-LIBERO"
+agentview_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_agentview_rgb.png")
+).convert("RGB")
+wrist_rgb = Image.open(
+    hf_hub_download(repo_id, "assets/sample_wrist_rgb.png")
+).convert("RGB")
+task = "put the white mug on the left plate and put the yellow and white mug on the right plate"
+robot_state = np.array(
+    [
+        -0.05338004603981972,
+        0.007029631175100803,
+        0.6783280968666077,
+        3.1407692432403564,
+        0.0017593271331861615,
+        -0.08994418382644653,
+        0.03878866136074066,
+        -0.03878721222281456,
+    ],
+    dtype=np.float32,
+)
+processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    repo_id,
+    trust_remote_code=True,
+    dtype=torch.float32,
+).to("cuda").eval()
+out = model.predict_action(
+    processor=processor,
+    images=[agentview_rgb, wrist_rgb],
+    task=task,
+    state=robot_state,
+    norm_tag="libero",
+    inference_action_mode="continuous",
+    enable_depth_reasoning=False,
+    num_steps=10,
+    normalize_language=True,
+    enable_cuda_graph=True,
+)
+actions = out.actions
+```
+MolmoAct2 was trained with mixed precision. For our reported experiments, we ran inference in `float32`. This path uses the most GPU memory: roughly 26GB with CUDA graph enabled, or around 24GB without CUDA graph.
+If you have a GPU with less memory, you can run inference with `bfloat16` instead:
+```python
+model = AutoModelForImageTextToText.from_pretrained(
+    repo_id,
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+).to("cuda").eval()
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+    out = model.predict_action(...)
+```
+Using `bfloat16` is much more memory efficient and can run under 16GB of GPU memory in our tests. It usually does not hurt performance much.
+`images` should preserve camera order, for example `[agentview_rgb, wrist_rgb]`. Images may be PIL images or RGB arrays. `state` is the raw robot state, and actions are returned in robot scale.
+`normalize_language=True` is the default. It lowercases the task string and removes trailing sentence punctuation to match training preprocessing. Set it to `False` if you need to preserve the task text exactly.
+`enable_cuda_graph=True` is the default. The first few calls can be slow because the model warms up and captures CUDA graphs. Run several random warm-up calls before measuring deployment latency. `num_steps` controls the continuous flow solver and defaults to the checkpoint config value, 10.
+Depth reasoning is disabled for this checkpoint. Passing `enable_depth_reasoning=True` will raise an error.
+## Discrete Actions
+Discrete action inference requires a caller-provided action tokenizer. It is not saved in this repository. Discrete mode decodes action tokens directly; the continuous action expert is not used.
+```python
+action_tokenizer = AutoProcessor.from_pretrained(
+    "allenai/MolmoAct2-FAST-Tokenizer",
+    trust_remote_code=True,
+)
+out = model.predict_action(
+    processor=processor,
+    images=[agentview_rgb, wrist_rgb],
+    task=task,
+    state=robot_state,
+    norm_tag="libero",
+    inference_action_mode="discrete",
+    action_tokenizer=action_tokenizer,
+    enable_depth_reasoning=False,
+)
+```
+## Model and Hardware Safety
+MolmoAct2 models generate robot actions from visual observations and language instructions, but their behavior may vary across embodiments, environments, and hardware configurations. Users should carefully validate model outputs before deployment, especially when operating physical robots or other actuated systems. Where possible, actions should be monitored through interpretable intermediate outputs (adaptive depth map), simulation rollouts, action limits, or other safety checks before execution on hardware. The model’s action space should be bounded by the training data, robot controller limits, and task-specific safety constraints, including limits on speed, workspace, torque, and contact force. Users should follow the hardware manufacturer’s safety guidelines, use appropriate emergency-stop mechanisms, and operate the system only in a safely configured environment with human supervision.
+## Citation
+```bibtex
+@misc{fang2026molmoact2actionreasoningmodels,
+      title={MolmoAct2: Action Reasoning Models for Real-world Deployment},
+      author={Haoquan Fang and Jiafei Duan and Donovan Clay and Sam Wang and Shuo Liu and Weikai Huang and Xiang Fan and Wei-Chuan Tsai and Shirui Chen and Yi Ru Wang and Shanli Xing and Jaemin Cho and Jae Sung Park and Ainaz Eftekhar and Peter Sushko and Karen Farley and Angad Wadhwa and Cole Harrison and Winson Han and Ying-Chun Lee and Eli VanderBilt and Rose Hendrix and Suveen Ellawela and Lucas Ngoo and Joyce Chai and Zhongzheng Ren and Ali Farhadi and Dieter Fox and Ranjay Krishna},
+      year={2026},
+      eprint={2605.02881},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2605.02881},
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,153 @@

+{
+  "action_end_token_id": 151933,
+  "action_expert_config": {
+    "attn_dropout": 0.0,
+    "causal_attn": false,
+    "context_layer_norm": true,
+    "dropout": 0.0,
+    "ffn_multiple_of": 256,
+    "hidden_size": 768,
+    "mlp_ratio": 4.0,
+    "model_type": "molmoact2_action_expert",
+    "num_heads": 8,
+    "num_layers": 36,
+    "qk_norm": true,
+    "qk_norm_eps": 1e-06,
+    "rope": true,
+    "timestep_embed_dim": 256
+  },
+  "action_expert_depth_gate": false,
+  "action_expert_depth_gate_init_bias": -4.0,
+  "action_expert_depth_gate_per_layer": false,
+  "action_mode": "both",
+  "max_action_horizon": 10,
+  "action_output_token_id": 151931,
+  "action_start_token_id": 151932,
+  "action_token_start_id": 151934,
+  "adapter_config": {
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "float32_attention": true,
+    "head_dim": 72,
+    "hidden_act": "silu",
+    "hidden_size": 1152,
+    "image_feature_dropout": 0.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "model_type": "molmoact2",
+    "num_attention_heads": 16,
+    "num_key_value_heads": 16,
+    "pooling_attention_mask": true,
+    "residual_dropout": 0.0,
+    "text_hidden_size": 2560,
+    "vit_layers": [
+      -3,
+      -9
+    ]
+  },
+  "add_action_expert": true,
+  "add_control_tokens": true,
+  "add_setup_tokens": true,
+  "architectures": [
+    "MolmoAct2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_molmoact2.MolmoAct2Config",
+    "AutoModelForImageTextToText": "modeling_molmoact2.MolmoAct2ForConditionalGeneration"
+  },
+  "depth_end_token_id": null,
+  "depth_mode": 2,
+  "depth_output_token_id": null,
+  "depth_start_token_id": null,
+  "depth_token_start_id": null,
+  "dtype": "float32",
+  "enable_depth_reasoning": false,
+  "flow_matching_beta_alpha": 1.0,
+  "flow_matching_beta_beta": 1.5,
+  "flow_matching_cutoff": 1.0,
+  "flow_matching_num_steps": 10,
+  "flow_matching_time_offset": 0.001,
+  "flow_matching_time_scale": 0.999,
+  "frame_end_token_id": 154632,
+  "frame_start_token_id": 154631,
+  "image_col_id": 154627,
+  "image_end_token_id": 154625,
+  "image_high_res_id": 154626,
+  "image_low_res_id": 154630,
+  "image_patch_id": 154626,
+  "image_start_token_id": 154624,
+  "initializer_range": 0.02,
+  "low_res_image_start_token_id": 154628,
+  "mask_action_dim_padding": true,
+  "max_action_dim": 32,
+  "model_type": "molmoact2",
+  "n_obs_steps": 1,
+  "norm_stats_filename": "norm_stats.json",
+  "num_action_tokens": 2048,
+  "num_depth_codes": 100,
+  "num_depth_tokens": 0,
+  "num_state_tokens": 256,
+  "state_end_token_id": 151674,
+  "state_format": "discrete",
+  "state_start_token_id": 151673,
+  "state_token_start_id": 151675,
+  "text_config": {
+    "additional_vocab_size": 128,
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "embedding_dropout": 0.0,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_norm_eps": 1e-06,
+    "max_position_embeddings": 16384,
+    "model_type": "molmoact2_text",
+    "norm_after": false,
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "qk_norm_type": "qwen3",
+    "qkv_bias": false,
+    "residual_dropout": 0.0,
+    "rope_parameters": {
+      "rope_theta": 5000000.0,
+      "rope_type": "default"
+    },
+    "rope_scaling_layers": null,
+    "rope_theta": 5000000.0,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_qk_norm": true,
+    "vocab_size": 154624
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.3.0",
+  "use_frame_special_tokens": true,
+  "vit_config": {
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "float32_attention": true,
+    "head_dim": 72,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_default_input_size": [
+      378,
+      378
+    ],
+    "image_num_pos": 729,
+    "image_patch_size": 14,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "molmoact2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "num_key_value_heads": 16,
+    "residual_dropout": 0.0
+  },
+  "bos_token_id": 151645,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643
+}

configuration_molmoact2.py ADDED Viewed

	@@ -0,0 +1,543 @@

+"""
+MolmoAct2 configuration
+"""
+from typing import Optional, Any
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+class MolmoAct2VitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2VisionTransformer`].
+    It is used to instantiate a `MolmoAct2VisionTransformer` according to the specified arguments,
+    defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Every constructor argument is stored unchanged as an attribute of the same name.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> configuration = MolmoAct2VitConfig()
+    >>> # Initializing a MolmoAct2VisionTransformer (with random weights)
+    >>> model = MolmoAct2VisionTransformer(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    # Same `model_type` string as the adapter config and the top-level MolmoAct2Config.
+    model_type = "molmoact2"
+    # Matches the "vit_config" entry of `MolmoAct2Config.sub_configs`.
+    base_config_key = "vit_config"
+    def __init__(
+        self,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        num_hidden_layers: int = 27,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        hidden_act: str = "gelu_pytorch_tanh",
+        layer_norm_eps: float = 1e-6,
+        image_default_input_size: tuple[int, int] = (378, 378),
+        image_patch_size: int = 14,
+        # NOTE(review): the default 577 differs from (378 // 14) ** 2 == 729 implied by the
+        # default input size and patch size; presumably checkpoints override it — confirm.
+        image_num_pos: int = 577,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        float32_attention: bool = True,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # The attention backend is stored directly on the instance and also forwarded
+        # to the base class below.
+        # NOTE(review): presumably assigned before super().__init__() on purpose — confirm
+        # against PretrainedConfig before reordering.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.image_default_input_size = image_default_input_size
+        self.image_patch_size = image_patch_size
+        self.image_num_pos = image_num_pos
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.initializer_range = initializer_range
+        self.float32_attention = float32_attention
+    @property
+    def image_num_patch(self) -> tuple[int, int]:
+        """Return the patch grid `(rows, cols)`: each side of `image_default_input_size` floor-divided by `image_patch_size`."""
+        h, w = self.image_default_input_size
+        return h // self.image_patch_size, w // self.image_patch_size
+class MolmoAct2AdapterConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a MolmoAct2Adapter. Together with
+    [`MolmoAct2VitConfig`], it is used to instantiate a `MolmoAct2VisionBackbone` according to the
+    specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Every constructor argument is stored unchanged as an attribute of the same name.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone
+    >>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> adapter_config = MolmoAct2AdapterConfig()
+    >>> # Initializing a MolmoAct2VisionBackbone (with random weights)
+    >>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)
+    >>> # Accessing the model configuration
+    >>> vit_configuration = model.vit_config
+    >>> adapter_configuration = model.adapter_config
+    ```"""
+    # Same `model_type` string as the ViT config and the top-level MolmoAct2Config.
+    model_type = "molmoact2"
+    # Matches the "adapter_config" entry of `MolmoAct2Config.sub_configs`.
+    base_config_key = "adapter_config"
+    def __init__(
+        self,
+        # NOTE(review): presumably indices of the ViT layers whose features feed the
+        # adapter (negative = counted from the end) — confirm against the modeling code.
+        vit_layers: tuple = (-3, -9),
+        pooling_attention_mask: bool = False,
+        hidden_size: int = 1152,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        float32_attention: bool = True,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        hidden_act: str = "silu",
+        intermediate_size: int = 18944,
+        # Default equals the default `hidden_size` of MolmoAct2TextConfig (3584).
+        text_hidden_size: int = 3584,
+        image_feature_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # The attention backend is stored directly on the instance and also forwarded
+        # to the base class below.
+        # NOTE(review): presumably assigned before super().__init__() on purpose — confirm
+        # against PretrainedConfig before reordering.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.vit_layers = vit_layers
+        self.pooling_attention_mask = pooling_attention_mask
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.float32_attention = float32_attention
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.text_hidden_size = text_hidden_size
+        self.image_feature_dropout = image_feature_dropout
+        self.initializer_range = initializer_range
+class MolmoAct2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
+    `MolmoAct2TextModel` according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Constructor arguments are stored as attributes of the same name; `num_key_value_heads=None`
+    is replaced by `num_attention_heads`. The rotary-embedding settings are validated with
+    `rope_config_validation` at the end of construction.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> configuration = MolmoAct2TextConfig()
+    >>> # Initializing a MolmoAct2TextModel (with random weights)
+    >>> model = MolmoAct2TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "molmoact2_text"
+    # Matches the "text_config" entry of `MolmoAct2Config.sub_configs`.
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Tensor-parallel plan: module-name patterns mapped to a sharding style.
+    base_model_tp_plan = {
+        "blocks.*.self_attn.att_proj": "colwise",
+        "blocks.*.self_attn.attn_out": "rowwise",
+        "blocks.*.mlp.ff_proj": "colwise",
+        "blocks.*.mlp.ff_out": "rowwise",
+    }
+    # Pipeline-parallel plan: module name -> (input names, output names).
+    base_model_pp_plan = {
+        "wte": (["input_ids"], ["inputs_embeds"]),
+        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "ln_f": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        hidden_size: int = 3584,
+        num_attention_heads: int = 28,
+        num_key_value_heads: Optional[int] = 4,
+        head_dim: int = 128,
+        vocab_size: int = 152064,
+        additional_vocab_size: int = 128,
+        qkv_bias: bool = True,
+        num_hidden_layers: int = 48,
+        intermediate_size: int = 18944,
+        hidden_act: str = "silu",
+        embedding_dropout: float=0.0,
+        attention_dropout: float=0.0,
+        residual_dropout: float = 0.0,
+        max_position_embeddings: int = 4096,
+        rope_theta: float = 1000000.0,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling_layers: Optional[list[int]] = None,
+        use_qk_norm: bool = False,
+        qk_norm_type: str = "olmo",
+        layer_norm_eps: float = 1e-6,
+        norm_after: bool = False,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # The attention backend is stored directly on the instance and also forwarded
+        # to the base class below.
+        # NOTE(review): presumably assigned before super().__init__() on purpose — confirm
+        # against PretrainedConfig before reordering.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            attn_implementation=attn_implementation,
+            **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        # None means "one key/value head per attention head".
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.qkv_bias = qkv_bias
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_scaling_layers = rope_scaling_layers
+        self.use_qk_norm = use_qk_norm
+        self.qk_norm_type = qk_norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.norm_after = norm_after
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+class MolmoAct2ActionExpertConfig(PretrainedConfig):
+    r"""Configuration for the MolmoAct2 modern action expert.
+
+    Every constructor argument is stored unchanged as an attribute of the same name.
+    `max_action_horizon` and `max_action_dim` are kept on the instance but are removed
+    from the dictionary returned by `to_dict()`.
+    """
+    model_type = "molmoact2_action_expert"
+    # Matches the "action_expert_config" entry of `MolmoAct2Config.sub_configs`.
+    base_config_key = "action_expert_config"
+    def __init__(
+        self,
+        max_action_horizon: int = 32,
+        max_action_dim: int = 32,
+        hidden_size: int = 1024,
+        num_layers: int = 32,
+        num_heads: int = 16,
+        mlp_ratio: float = 8.0 / 3.0,
+        ffn_multiple_of: int = 256,
+        timestep_embed_dim: int = 256,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0,
+        context_layer_norm: bool = True,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        rope: bool = True,
+        causal_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_action_horizon = max_action_horizon
+        self.max_action_dim = max_action_dim
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.ffn_multiple_of = ffn_multiple_of
+        self.timestep_embed_dim = timestep_embed_dim
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.context_layer_norm = context_layer_norm
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.rope = rope
+        self.causal_attn = causal_attn
+    def to_dict(self) -> dict[str, Any]:
+        """Return the base-class dictionary with `max_action_horizon` and `max_action_dim` removed."""
+        output = super().to_dict()
+        # These are derived from the parent MolmoAct2Config for HF exports. Keeping
+        # them out of the public nested config avoids duplicated sources of truth.
+        output.pop("max_action_horizon", None)
+        output.pop("max_action_dim", None)
+        return output
+class MolmoAct2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
+    It is used to instantiate an MolmoAct2 model according to the specified arguments, defining the model architecture.
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> # Initializing a MolmoAct2AdapterConfig
+    >>> adapter_config = MolmoAct2AdapterConfig()
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> text_config = MolmoAct2TextConfig()
+    >>> # Initializing a MolmoAct2Config
+    >>> configuration = MolmoAct2Config(
+    >>>     vit_config=vit_config,
+    >>>     adapter_config=adapter_config,
+    >>>     text_config=text_config,
+    >>>     image_start_token_id=151936,
+    >>>     image_end_token_id=151937,
+    >>>     image_patch_id=151938,
+    >>>     image_col_id=151939,
+    >>>     low_res_image_start_token_id=151940,
+    >>>     image_low_res_id=151942,
+    >>>     frame_start_token_id=151943,
+    >>>     frame_end_token_id=151944,
+    >>> )
+    >>> # Initializing a model
+    >>> model = MolmoAct2ForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "molmoact2"
+    sub_configs = {
+        "text_config": MolmoAct2TextConfig,
+        "vit_config": MolmoAct2VitConfig,
+        "adapter_config": MolmoAct2AdapterConfig,
+        "action_expert_config": MolmoAct2ActionExpertConfig,
+    }
+    def __init__(
+        self,
+        vit_config: MolmoAct2VitConfig = None,
+        adapter_config: MolmoAct2AdapterConfig = None,
+        text_config: MolmoAct2TextConfig = None,
+        action_expert_config: MolmoAct2ActionExpertConfig = None,
+        image_start_token_id: int = None,
+        low_res_image_start_token_id: int = None,
+        image_end_token_id: int = None,
+        image_low_res_id: int = None,
+        image_patch_id: int = None,
+        image_col_id: int = None,
+        frame_start_token_id: int = None,
+        frame_end_token_id: int = None,
+        use_frame_special_tokens: bool = True,
+        initializer_range: float = 0.02,
+        add_action_expert: bool = True,
+        max_action_dim: int = 32,
+        max_action_horizon: int = 30,
+        n_obs_steps: int = 30,
+        action_mode: str = "both",
+        state_format: str = "discrete",
+        flow_matching_num_steps: int = 10,
+        flow_matching_cutoff: float = 1.0,
+        flow_matching_time_offset: float = 0.001,
+        flow_matching_time_scale: float = 0.999,
+        flow_matching_beta_alpha: float = 1.0,
+        flow_matching_beta_beta: float = 1.5,
+        mask_action_dim_padding: bool = True,
+        enable_depth_reasoning: bool = False,
+        depth_mode: int = 2,
+        num_depth_codes: int = 100,
+        action_expert_depth_gate: bool = False,
+        action_expert_depth_gate_per_layer: bool = False,
+        action_expert_depth_gate_init_bias: float = -4.0,
+        action_output_token_id: int = None,
+        action_start_token_id: int = None,
+        action_end_token_id: int = None,
+        action_token_start_id: int = None,
+        num_action_tokens: int = 0,
+        depth_output_token_id: int = None,
+        depth_start_token_id: int = None,
+        depth_end_token_id: int = None,
+        depth_token_start_id: int = None,
+        num_depth_tokens: int = 0,
+        state_start_token_id: int = None,
+        state_end_token_id: int = None,
+        state_token_start_id: int = None,
+        num_state_tokens: int = 0,
+        add_setup_tokens: bool = True,
+        add_control_tokens: bool = True,
+        norm_stats_filename: str = "norm_stats.json",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if vit_config is None:
+            self.vit_config = MolmoAct2VitConfig()
+        elif isinstance(vit_config, dict):
+            self.vit_config = MolmoAct2VitConfig(**vit_config)
+        else:
+            self.vit_config = vit_config
+        if adapter_config is None:
+            self.adapter_config = MolmoAct2AdapterConfig()
+        elif isinstance(adapter_config, dict):
+            self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
+        else:
+            self.adapter_config = adapter_config
+        if text_config is None:
+            self.text_config = MolmoAct2TextConfig()
+        elif isinstance(text_config, dict):
+            self.text_config = MolmoAct2TextConfig(**text_config)
+        else:
+            self.text_config = text_config
+        self.add_action_expert = bool(add_action_expert)
+        if not self.add_action_expert:
+            self.action_expert_config = None
+        elif action_expert_config is None:
+            self.action_expert_config = MolmoAct2ActionExpertConfig(
+                max_action_horizon=max_action_horizon,
+                max_action_dim=max_action_dim,
+                num_layers=self.text_config.num_hidden_layers,
+            )
+        elif isinstance(action_expert_config, dict):
+            self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
+        else:
+            self.action_expert_config = action_expert_config
+        if self.add_action_expert:
+            self.action_expert_config.max_action_dim = int(max_action_dim)
+            self.action_expert_config.max_action_horizon = int(max_action_horizon)
+            self._validate_release_action_config(
+                state_format=state_format,
+            )
+        self.image_start_token_id = image_start_token_id
+        self.low_res_image_start_token_id = low_res_image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.image_low_res_id = image_low_res_id
+        self.image_high_res_id = image_patch_id
+        self.image_patch_id = image_patch_id
+        self.image_col_id = image_col_id
+        self.frame_start_token_id = frame_start_token_id
+        self.frame_end_token_id = frame_end_token_id
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.initializer_range = initializer_range
+        self.max_action_dim = max_action_dim
+        self.max_action_horizon = max_action_horizon
+        self.n_obs_steps = n_obs_steps
+        self.action_mode = action_mode
+        self.state_format = state_format
+        self.flow_matching_num_steps = flow_matching_num_steps
+        self.flow_matching_cutoff = flow_matching_cutoff
+        self.flow_matching_time_offset = flow_matching_time_offset
+        self.flow_matching_time_scale = flow_matching_time_scale
+        self.flow_matching_beta_alpha = flow_matching_beta_alpha
+        self.flow_matching_beta_beta = flow_matching_beta_beta
+        self.mask_action_dim_padding = mask_action_dim_padding
+        self.enable_depth_reasoning = enable_depth_reasoning
+        self.depth_mode = depth_mode
+        self.num_depth_codes = num_depth_codes
+        self.action_expert_depth_gate = action_expert_depth_gate
+        self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
+        self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
+        self.action_output_token_id = action_output_token_id
+        self.action_start_token_id = action_start_token_id
+        self.action_end_token_id = action_end_token_id
+        self.action_token_start_id = action_token_start_id
+        self.num_action_tokens = num_action_tokens
+        self.depth_output_token_id = depth_output_token_id
+        self.depth_start_token_id = depth_start_token_id
+        self.depth_end_token_id = depth_end_token_id
+        self.depth_token_start_id = depth_token_start_id
+        self.num_depth_tokens = num_depth_tokens
+        self.state_start_token_id = state_start_token_id
+        self.state_end_token_id = state_end_token_id
+        self.state_token_start_id = state_token_start_id
+        self.num_state_tokens = num_state_tokens
+        self.add_setup_tokens = add_setup_tokens
+        self.add_control_tokens = add_control_tokens
+        self.norm_stats_filename = norm_stats_filename
+    @staticmethod
+    def _validate_release_action_config(
+        *,
+        state_format: str,
+    ) -> None:
+        if state_format != "discrete":
+            raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")
+    @property
+    def image_num_patch(self):
+        assert self.vit_config is not None
+        return self.vit_config.image_num_patch
+    @property
+    def num_attention_heads(self):
+        return self.text_config.num_attention_heads
+    @property
+    def num_key_value_heads(self):
+        return self.text_config.num_key_value_heads
+    @property
+    def head_dim(self):
+        return self.text_config.head_dim
+    @property
+    def num_hidden_layers(self):
+        return self.text_config.num_hidden_layers
+    @property
+    def hidden_size(self):
+        return self.text_config.hidden_size
+    @property
+    def vocab_size(self):
+        return self.text_config.vocab_size
+    @property
+    def max_position_embeddings(self):
+        return self.text_config.max_position_embeddings
+MolmoAct2VitConfig.register_for_auto_class()
+MolmoAct2AdapterConfig.register_for_auto_class()
+MolmoAct2TextConfig.register_for_auto_class()
+MolmoAct2ActionExpertConfig.register_for_auto_class()
+MolmoAct2Config.register_for_auto_class()

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151645,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643,
+  "transformers_version": "5.3.0"
+}

inference.py ADDED Viewed

	@@ -0,0 +1,768 @@

+"""Inference utilities for MolmoAct2"""
+from dataclasses import dataclass
+from typing import Any, Iterable, Optional, Sequence, Tuple
+import torch
+from torch.nn import functional as F
+from transformers.cache_utils import Cache
+from transformers.configuration_utils import PretrainedConfig
+@dataclass
+class _ActionFlowInputs:
+    """Bundle of everything the action flow loop reads.
+    The helpers in this module clone these tensors into static buffers before a
+    CUDA graph is captured and derive the graph cache key from their
+    shape/stride/dtype/device.
+    """
+    # Trajectory tensor; its signature is the first element of the graph key.
+    trajectory: torch.Tensor
+    # Object exposing kv_contexts, cross_mask, self_mask, valid_action and
+    # rope_cache (the attributes read by the helpers in this module).
+    context: Any
+    # One entry per step; each exposes conditioning, block_modulations and
+    # final_modulation.
+    modulations: Sequence[Any]
+    # Optional tensor; None is allowed and handled by the helpers.
+    action_dim_is_pad: Optional[torch.Tensor]
+@dataclass
+class _ActionFlowCudaGraph:
+    """A captured action-flow CUDA graph together with its static buffers."""
+    # Signature tuple produced by _cuda_graph_key(); a mismatch forces re-capture.
+    key: Tuple[Any, ...]
+    # The captured graph that is replayed on every call.
+    graph: torch.cuda.CUDAGraph
+    # Static copies of the inputs that the captured graph reads from.
+    static_inputs: _ActionFlowInputs
+    # Tensor returned by the captured call; callers clone it after each replay.
+    output: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphLayerStage:
+    """Static outputs of one layer's pre-attention graph.
+    Instances are built positionally from the 4-tuple returned by the
+    pre-attention step, so the field order (residual, query, key, value) must
+    not change.
+    """
+    # Hidden states before the attention norm, re-added after attention.
+    residual: torch.Tensor
+    # Query states after the rotary embedding.
+    query: torch.Tensor
+    # Key states after the rotary embedding (written into the KV cache).
+    key: torch.Tensor
+    # Value states (written into the KV cache).
+    value: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphPostStage:
+    """A captured post-attention graph plus the buffer it reads attention from."""
+    # Captured graph for the post-attention part of one layer.
+    graph: torch.cuda.CUDAGraph
+    # Static input buffer; the eager attention result is copied in before replay.
+    attn_context: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraph:
+    """All graphs and static buffers needed for one single-token decode step."""
+    # Key from DepthDecodeCudaGraphManager._depth_decode_key(); mismatch => rebuild.
+    cache_key: Tuple[Any, ...]
+    # Graph for token embedding + layer-0 pre-attention work.
+    pre_graph: torch.cuda.CUDAGraph
+    # Static (1, 1) long buffer holding the token id to decode.
+    token_ids: torch.Tensor
+    # Static rotary cos/sin buffers for the current position.
+    cos: torch.Tensor
+    sin: torch.Tensor
+    # arange(max_cache_len); sliced to obtain the cache position of the new token.
+    positions: torch.Tensor
+    # One pre-attention stage per layer (index == layer index).
+    stages: Sequence[_DepthDecodeCudaGraphLayerStage]
+    # One post-attention graph per layer (index == layer index).
+    post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
+    # Output tensor of the last captured graph.
+    output: torch.Tensor
+@dataclass
+class _DepthDecodeCudaGraphSpec:
+    """Config-derived facts about the text backbone, computed once and reused."""
+    # True when the backbone configuration is supported by the graph decode path.
+    eligible: bool
+    # Config values that become the first element of the graph cache key.
+    cache_key_prefix: Tuple[Any, ...]
+    # Copied from the transformer config for use while building the graphs.
+    num_hidden_layers: int
+    head_dim: int
+    num_attention_heads: int
+def _cache_seq_len_int(past_key_values: Optional[Cache]) -> int:
+    if past_key_values is None:
+        return 0
+    seq_len = past_key_values.get_seq_length()
+    if torch.is_tensor(seq_len):
+        return int(seq_len.item())
+    return int(seq_len)
+def _cache_max_len_int(past_key_values: Optional[Cache]) -> int:
+    if past_key_values is None:
+        return -1
+    max_len = past_key_values.get_max_cache_shape()
+    if torch.is_tensor(max_len):
+        return int(max_len.item())
+    return int(max_len)
+def _iter_cache_key_values(
+    past_key_values: Cache,
+) -> Iterable[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]:
+    layers = getattr(past_key_values, "layers", None)
+    if layers is not None:
+        for layer in layers:
+            yield getattr(layer, "keys", None), getattr(layer, "values", None)
+        return
+    for layer in past_key_values:
+        yield layer[0], layer[1]
+class _DepthDecodeStaticLayerCache:
+    is_compileable = False
+    is_sliding = False
+    def __init__(self, max_cache_len: int) -> None:
+        self.max_cache_len = int(max_cache_len)
+        self.cumulative_length = 0
+        self.keys: Optional[torch.Tensor] = None
+        self.values: Optional[torch.Tensor] = None
+    def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        bsz, n_heads = key_states.shape[:2]
+        self.keys = torch.empty(
+            (bsz, n_heads, self.max_cache_len, key_states.shape[-1]),
+            dtype=key_states.dtype,
+            device=key_states.device,
+        )
+        self.values = torch.empty(
+            (bsz, n_heads, self.max_cache_len, value_states.shape[-1]),
+            dtype=value_states.dtype,
+            device=value_states.device,
+        )
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.keys is None:
+            self._allocate(key_states, value_states)
+        start = self.cumulative_length
+        end = start + key_states.shape[-2]
+        if end > self.max_cache_len:
+            raise RuntimeError(
+                f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}."
+            )
+        self.keys[:, :, start:end, :].copy_(key_states)
+        self.values[:, :, start:end, :].copy_(value_states)
+        self.cumulative_length = end
+        return self.keys[:, :, :end, :], self.values[:, :, :end, :]
+    def get_seq_length(self) -> int:
+        return self.cumulative_length
+    def get_max_cache_shape(self) -> int:
+        return -1
+    def reset(self) -> None:
+        self.cumulative_length = 0
+class _DepthDecodeStaticCache(Cache):
+    def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
+        text_config = config.get_text_config(decoder=True)
+        super().__init__(
+            layers=[
+                _DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
+                for _ in range(text_config.num_hidden_layers)
+            ]
+        )
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        return self.layers[layer_idx].get_seq_length()
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        return self.layers[layer_idx].get_max_cache_shape()
+    def reset(self) -> None:
+        for layer in self.layers:
+            layer.reset()
+class ActionCudaGraphManager:
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.enabled = True
+        self.action_flow_graph: Optional[_ActionFlowCudaGraph] = None
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)
+    def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
+        action_model = self.model
+        if not self.enabled:
+            return False
+        if action_model.training or action_model._require_action_expert().training:
+            return False
+        if inputs.trajectory.device.type != "cuda":
+            return False
+        def all_on_cuda():
+            yield inputs.trajectory
+            for k, v in inputs.context.kv_contexts:
+                yield k
+                yield v
+            for t in (
+                inputs.context.cross_mask,
+                inputs.context.self_mask,
+                inputs.context.valid_action,
+                inputs.action_dim_is_pad,
+            ):
+                if t is not None:
+                    yield t
+            if inputs.context.rope_cache is not None:
+                yield from inputs.context.rope_cache
+            for step in inputs.modulations:
+                yield step.conditioning
+                for block_modulation in step.block_modulations:
+                    yield from block_modulation
+                yield from step.final_modulation
+        return all(t.device.type == "cuda" for t in all_on_cuda())
+    def run_action_flow(
+        self,
+        inputs: _ActionFlowInputs,
+        steps: int,
+        run_loop,
+    ) -> torch.Tensor:
+        key = _cuda_graph_key(inputs, steps)
+        cache = self.action_flow_graph
+        if cache is None or cache.key != key:
+            static_inputs = _clone_static_inputs(inputs)
+            graph, output = _capture_cuda_graph(
+                lambda: run_loop(static_inputs, steps),
+                inputs.trajectory.device,
+                after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
+            )
+            cache = _ActionFlowCudaGraph(
+                key=key,
+                graph=graph,
+                static_inputs=static_inputs,
+                output=output,
+            )
+            self.action_flow_graph = cache
+        else:
+            _copy_inputs_(cache.static_inputs, inputs)
+        cache.graph.replay()
+        return cache.output.clone()
+class DepthDecodeCudaGraphManager:
+    """Single-token decode of the text backbone driven by a chain of CUDA graphs.
+    Per decode step the manager replays one graph for the token embedding plus
+    layer-0 pre-attention work. Then, for every layer, it runs attention eagerly
+    (KV-cache update + ``F.scaled_dot_product_attention``) and replays a graph
+    holding that layer's post-attention work, fused with the next layer's
+    pre-attention work or, for the last layer, with ``ln_f``.
+    """
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.backbone = model.model
+        self.enabled = True
+        # Most recently captured graph bundle; rebuilt when the cache key changes.
+        self.graph: Optional[_DepthDecodeCudaGraph] = None
+        # Config-derived spec, computed on first use by _depth_decode_spec().
+        self.graph_spec: Optional[_DepthDecodeCudaGraphSpec] = None
+    def set_enabled(self, enabled: bool) -> None:
+        """Switch the CUDA-graph decode path on or off."""
+        self.enabled = bool(enabled)
+    def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
+        """Create the fixed-capacity cache type that ``can_use`` requires."""
+        return _DepthDecodeStaticCache(
+            config=self.model.config.text_config,
+            max_cache_len=max_cache_len,
+        )
+    def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
+        """Return the config-derived spec, building and memoising it on first use.
+        The path is eligible only when the transformer config has
+        ``norm_after`` falsy, ``rope_scaling_layers`` set to None, a rotary
+        embedding whose ``rope_type`` is ``"default"`` and the ``"sdpa"``
+        attention implementation.
+        """
+        static = self.graph_spec
+        if static is None:
+            cfg = self.backbone.transformer.config
+            rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
+            static = _DepthDecodeCudaGraphSpec(
+                eligible=(
+                    not cfg.norm_after
+                    and cfg.rope_scaling_layers is None
+                    and getattr(rotary_emb, "rope_type", None) == "default"
+                    and cfg._attn_implementation == "sdpa"
+                ),
+                cache_key_prefix=(
+                    cfg.hidden_size,
+                    cfg.num_attention_heads,
+                    cfg.num_key_value_heads,
+                    cfg.head_dim,
+                    cfg.num_hidden_layers,
+                    cfg.use_qk_norm,
+                    cfg.qk_norm_type,
+                    cfg._attn_implementation,
+                ),
+                num_hidden_layers=cfg.num_hidden_layers,
+                head_dim=cfg.head_dim,
+                num_attention_heads=cfg.num_attention_heads,
+            )
+            self.graph_spec = static
+        return static
+    def can_use(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> bool:
+        """Return True when this decode step can take the CUDA-graph path.
+        Requires: manager enabled, model and transformer in eval mode, a CUDA
+        ``next_input_ids`` of shape ``(1, 1)``, a ``_DepthDecodeStaticCache``,
+        a tensor ``attention_bias`` on the same device, and an eligible spec.
+        """
+        if (
+            not self.enabled
+            or self.model.training
+            or self.backbone.transformer.training
+        ):
+            return False
+        if next_input_ids.device.type != "cuda":
+            return False
+        # Only batch size 1 with exactly one new token is supported.
+        if (
+            next_input_ids.ndim != 2
+            or next_input_ids.shape[0] != 1
+            or next_input_ids.shape[1] != 1
+        ):
+            return False
+        if not isinstance(past_key_values, _DepthDecodeStaticCache):
+            return False
+        if (
+            not torch.is_tensor(attention_bias)
+            or attention_bias.device != next_input_ids.device
+        ):
+            return False
+        return self._depth_decode_spec().eligible
+    def _depth_decode_key(
+        self,
+        next_input_ids: torch.Tensor,
+        attention_bias: torch.Tensor,
+    ) -> Tuple[Any, ...]:
+        """Build the graph cache key.
+        The key combines the config prefix, the device type/index, the
+        ``lm_head`` weight dtype and the last dimension of ``attention_bias``.
+        """
+        device = next_input_ids.device
+        return (
+            self._depth_decode_spec().cache_key_prefix,
+            device.type,
+            device.index,
+            self.model.lm_head.weight.dtype,
+            attention_bias.shape[-1],
+        )
+    def _select_depth_decode_rope(
+        self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int
+    ) -> None:
+        """Copy the rotary cos/sin entries for position ``past_length`` in place.
+        Writing into the existing ``cos`` / ``sin`` tensors keeps their
+        addresses stable for the captured graphs.
+        """
+        emb = self.backbone.transformer.rotary_emb
+        cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
+        sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
+    def _depth_decode_pre_layer(
+        self,
+        layer_idx: int,
+        hidden_states: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Run the pre-attention half of block ``layer_idx``.
+        Steps: attention norm, fused QKV projection, optional q/k norm, reshape
+        into heads, transpose to put heads on dim 1, rotary embedding.
+        Returns:
+            ``(residual, query_states, key_states, value_states)`` where
+            ``residual`` is the un-normed input.
+        """
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        residual = hidden_states
+        hidden_states = block.attn_norm(hidden_states)
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, attention.head_dim)
+        qkv = attention.att_proj(hidden_states)
+        query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
+        value_states = value_states.view(hidden_shape)
+        apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
+        # "qwen3" q/k norm is applied after the per-head reshape; any other
+        # norm type is applied to the flat projection before it.
+        norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"
+        if apply_qk_norm and not norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+        query_states = query_states.view(hidden_shape)
+        key_states = key_states.view(hidden_shape)
+        if norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        query_states, key_states = _apply_rotary_pos_emb(
+            query_states, key_states, cos, sin
+        )
+        return residual, query_states, key_states, value_states
+    def _depth_decode_pre0(
+        self,
+        token_ids: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Embed ``token_ids`` and run layer 0's pre-attention half."""
+        inputs_embeds = self.model._embed_base_tokens(token_ids)
+        return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)
+    def _depth_decode_post_layer(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run the post-attention half of block ``layer_idx``.
+        Steps: flatten heads, attention output projection, residual add, then
+        feed-forward norm + MLP with a second residual add.
+        """
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        input_shape = residual.shape[:-1]
+        # Merge the head dimensions back into one feature dimension.
+        attn_output = attn_context.reshape(*input_shape, -1).contiguous()
+        attn_output = attention.attn_out(attn_output)
+        hidden_states = residual + block.dropout(attn_output)
+        residual = hidden_states
+        hidden_states = block.ff_norm(hidden_states)
+        hidden_states = block.mlp(hidden_states)
+        hidden_states = residual + block.dropout(hidden_states)
+        return hidden_states
+    def _depth_decode_post_and_pre_next(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Finish block ``layer_idx`` and start block ``layer_idx + 1``.
+        Both halves are captured in one graph so only attention runs eagerly.
+        """
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)
+    def _depth_decode_last_post(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        """Finish the last block and apply the transformer's final norm ``ln_f``."""
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self.backbone.transformer.ln_f(hidden_states)
+    def _build_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        """Allocate the static buffers and capture every graph of the chain.
+        Args:
+            next_input_ids: Token to decode; copied into the static id buffer.
+            past_length: Position used to select the rotary cos/sin entries.
+            attention_bias: Its last dimension is taken as the maximum cache length.
+        Returns:
+            The bundle of captured graphs and the buffers they read and write.
+        """
+        text_config = self.backbone.transformer.config
+        device = next_input_ids.device
+        dtype = self.model.lm_head.weight.dtype
+        static = self._depth_decode_spec()
+        num_layers = static.num_hidden_layers
+        head_dim = static.head_dim
+        max_cache_len = int(attention_bias.shape[-1])
+        # Make sure the rotary tables cover every position the cache can hold.
+        max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
+        self.backbone.transformer.prepare_rope_cache(
+            device=device, max_seq_len=max_rope_len
+        )
+        # Static input buffers: later calls overwrite them in place before replay.
+        token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
+        cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
+        sin = torch.empty_like(cos)
+        positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
+        context_shape = (1, 1, static.num_attention_heads, head_dim)
+        token_ids.copy_(next_input_ids)
+        self._select_depth_decode_rope(cos, sin, past_length=past_length)
+        pre_graph, pre_output = _capture_cuda_graph(
+            lambda: self._depth_decode_pre0(token_ids, cos, sin),
+            device,
+        )
+        stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
+        post_graphs = []
+        for layer_idx in range(num_layers - 1):
+            stage = stages[-1]
+            # Uninitialised placeholder during capture; run() copies the real
+            # attention result into it before every replay.
+            attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+            graph, output = _capture_cuda_graph(
+                # Loop variables are bound as lambda defaults so each graph
+                # keeps its own layer index and buffers.
+                lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
+                    self._depth_decode_post_and_pre_next(
+                        layer_idx,
+                        stage.residual,
+                        attn_context,
+                        cos,
+                        sin,
+                    )
+                ),
+                device,
+            )
+            post_graphs.append(
+                _DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context)
+            )
+            stages.append(_DepthDecodeCudaGraphLayerStage(*output))
+        # The last layer has no following block: its graph ends with ln_f.
+        last_stage = stages[-1]
+        last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+        last_graph, last_output = _capture_cuda_graph(
+            lambda: self._depth_decode_last_post(
+                num_layers - 1,
+                last_stage.residual,
+                last_attn_context,
+            ),
+            device,
+        )
+        post_graphs.append(
+            _DepthDecodeCudaGraphPostStage(
+                graph=last_graph, attn_context=last_attn_context
+            )
+        )
+        return _DepthDecodeCudaGraph(
+            cache_key=self._depth_decode_key(next_input_ids, attention_bias),
+            pre_graph=pre_graph,
+            token_ids=token_ids,
+            cos=cos,
+            sin=sin,
+            positions=positions,
+            stages=tuple(stages),
+            post_graphs=tuple(post_graphs),
+            output=last_output,
+        )
+    def _get_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        """Return a graph bundle whose static inputs are ready for this step.
+        On a key mismatch (or first use) the bundle is rebuilt; building
+        already loads the token id and rotary entries. On a hit only those
+        static buffers are refreshed in place.
+        """
+        key = self._depth_decode_key(next_input_ids, attention_bias)
+        decode_graph = self.graph
+        if decode_graph is None or decode_graph.cache_key != key:
+            decode_graph = self._build_depth_decode_graph(
+                next_input_ids,
+                past_length=past_length,
+                attention_bias=attention_bias,
+            )
+            self.graph = decode_graph
+        else:
+            decode_graph.token_ids.copy_(next_input_ids)
+            self._select_depth_decode_rope(
+                decode_graph.cos, decode_graph.sin, past_length=past_length
+            )
+        return decode_graph
+    def _run_depth_decode_attention_core(
+        self,
+        layer_idx: int,
+        stage: _DepthDecodeCudaGraphLayerStage,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        cache_position: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        """Eagerly update the KV cache and run attention for one layer.
+        Returns:
+            The attention output with dims 1 and 2 swapped back, ready to be
+            copied into the layer's ``attn_context`` buffer.
+        """
+        attention = self.backbone.transformer.blocks[layer_idx].self_attn
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_values.update(
+            stage.key,
+            stage.value,
+            layer_idx,
+            cache_kwargs,
+        )
+        # Expand the key/value heads so they match the number of query heads.
+        key_states = _repeat_kv(key_states, attention.num_key_value_groups)
+        value_states = _repeat_kv(value_states, attention.num_key_value_groups)
+        attn_output = F.scaled_dot_product_attention(
+            stage.query,
+            key_states,
+            value_states,
+            attn_mask=attention_bias,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        return attn_output.transpose(1, 2)
+    def run(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        past_length: int,
+    ) -> Tuple[torch.Tensor, Cache]:
+        """Decode one token at position ``past_length``.
+        Returns:
+            ``(output, past_key_values)``. ``output`` is the graph bundle's own
+            output tensor and is not cloned here.
+            NOTE(review): a later replay presumably overwrites it, so callers
+            should consume or copy it before the next step - confirm.
+        """
+        end = past_length + 1
+        decode_graph = self._get_depth_decode_graph(
+            next_input_ids,
+            past_length=past_length,
+            attention_bias=attention_bias,
+        )
+        cache_position = decode_graph.positions[past_length:end]
+        # Bias row of the new token, restricted to the positions filled so far.
+        attention_bias_q = attention_bias[:, :, past_length:end, :end]
+        decode_graph.pre_graph.replay()
+        for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
+            attn_context = self._run_depth_decode_attention_core(
+                layer_idx,
+                decode_graph.stages[layer_idx],
+                past_key_values=past_key_values,
+                attention_bias=attention_bias_q,
+                cache_position=cache_position,
+                cos=decode_graph.cos,
+                sin=decode_graph.sin,
+            )
+            # Feed the eager attention result into the graph's static buffer.
+            post_graph.attn_context.copy_(attn_context)
+            post_graph.graph.replay()
+        return decode_graph.output, past_key_values
+def _cuda_graph_tensor_signature(
+    tensor: Optional[torch.Tensor],
+) -> Optional[Tuple[Any, ...]]:
+    if tensor is None:
+        return None
+    return (
+        tuple(tensor.shape),
+        tuple(tensor.stride()),
+        str(tensor.dtype),
+        str(tensor.device),
+    )
+def _cuda_graph_context_signature(context: Any) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return (
+        tuple((sig(k), sig(v)) for k, v in context.kv_contexts),
+        sig(context.cross_mask),
+        sig(context.self_mask),
+        sig(context.valid_action),
+        None
+        if context.rope_cache is None
+        else tuple(sig(t) for t in context.rope_cache),
+    )
+def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return tuple(
+        (
+            sig(step.conditioning),
+            tuple(
+                tuple(sig(t) for t in block_modulation)
+                for block_modulation in step.block_modulations
+            ),
+            tuple(sig(t) for t in step.final_modulation),
+        )
+        for step in modulations
+    )
+def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> Tuple[Any, ...]:
+    sig = _cuda_graph_tensor_signature
+    return (
+        sig(inputs.trajectory),
+        _cuda_graph_context_signature(inputs.context),
+        _cuda_graph_modulation_signature(inputs.modulations),
+        sig(inputs.action_dim_is_pad),
+        int(steps),
+    )
+def _clone_static_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+    if tensor is None:
+        return None
+    static = torch.empty_strided(
+        tuple(tensor.shape),
+        tuple(tensor.stride()),
+        device=tensor.device,
+        dtype=tensor.dtype,
+    )
+    static.copy_(tensor)
+    return static
+def _clone_static_context(context: Any) -> Any:
+    rope_cache = None
+    if context.rope_cache is not None:
+        rope_cache = tuple(_clone_static_tensor(t) for t in context.rope_cache)
+    return context.__class__(
+        kv_contexts=tuple(
+            (_clone_static_tensor(k), _clone_static_tensor(v))
+            for k, v in context.kv_contexts
+        ),
+        cross_mask=_clone_static_tensor(context.cross_mask),
+        self_mask=_clone_static_tensor(context.self_mask),
+        valid_action=_clone_static_tensor(context.valid_action),
+        rope_cache=rope_cache,
+    )
+def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
+    return tuple(
+        step.__class__(
+            conditioning=_clone_static_tensor(step.conditioning),
+            block_modulations=tuple(
+                tuple(_clone_static_tensor(t) for t in block_modulation)
+                for block_modulation in step.block_modulations
+            ),
+            final_modulation=tuple(
+                _clone_static_tensor(t) for t in step.final_modulation
+            ),
+        )
+        for step in modulations
+    )
+def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
+    return _ActionFlowInputs(
+        trajectory=_clone_static_tensor(inputs.trajectory),
+        context=_clone_static_context(inputs.context),
+        modulations=_clone_static_modulations(inputs.modulations),
+        action_dim_is_pad=_clone_static_tensor(inputs.action_dim_is_pad),
+    )
+def _copy_context_(dst: Any, src: Any) -> None:
+    for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
+        dst_k.copy_(src_k)
+        dst_v.copy_(src_v)
+    if src.cross_mask is not None:
+        dst.cross_mask.copy_(src.cross_mask)
+    if src.self_mask is not None:
+        dst.self_mask.copy_(src.self_mask)
+    if src.valid_action is not None:
+        dst.valid_action.copy_(src.valid_action)
+    if src.rope_cache is not None:
+        for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
+            dst_tensor.copy_(src_tensor)
+def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
+    dst.trajectory.copy_(src.trajectory)
+    _copy_context_(dst.context, src.context)
+    if src.action_dim_is_pad is not None:
+        dst.action_dim_is_pad.copy_(src.action_dim_is_pad)
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def _apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (_rotate_half(q) * sin)
+    k_embed = (k * cos) + (_rotate_half(k) * sin)
+    return q_embed, k_embed
+def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def _capture_cuda_graph(
+    fn,
+    device: torch.device,
+    *,
+    after_warmup=None,
+) -> Tuple[torch.cuda.CUDAGraph, Any]:
+    """Warm ``fn`` up once, then capture it into a new CUDA graph.
+    Args:
+        fn: Zero-argument callable to capture; it is called twice (once for
+            warm-up, once during capture).
+        device: CUDA device whose streams are used for the warm-up.
+        after_warmup: Optional zero-argument callable run between warm-up and
+            capture, e.g. to restore buffers the warm-up run modified.
+    Returns:
+        ``(graph, output)`` where ``output`` is what ``fn`` returned during
+        capture.
+    """
+    # Warm-up runs on a side stream that first waits for work already queued
+    # on the current stream.
+    warmup_stream = torch.cuda.Stream(device=device)
+    warmup_stream.wait_stream(torch.cuda.current_stream(device))
+    with torch.cuda.stream(warmup_stream):
+        fn()
+    # Make the current stream wait for the warm-up before continuing.
+    torch.cuda.current_stream(device).wait_stream(warmup_stream)
+    if after_warmup is not None:
+        after_warmup()
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        output = fn()
+    return graph, output

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07fe33e9759258a1c37b54e7a5f0d78c53a270dc8ddc7ecb139736ae1e9315da
+size 4183452356

modeling_molmoact2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

norm_stats.json ADDED Viewed

	@@ -0,0 +1,238 @@

+{
+  "format": "molmoact2_norm_stats.v1",
+  "norm_mode": "q01_q99",
+  "metadata_by_tag": {
+    "libero": {
+      "action_key": "action",
+      "state_key": "observation.state",
+      "camera_keys": [
+        "observation.images.image",
+        "observation.images.wrist_image"
+      ],
+      "normalize_gripper": false,
+      "action_horizon": 10,
+      "n_action_steps": 10,
+      "setup_type": "single franka robotic arm in libero",
+      "control_mode": "delta end-effector pose",
+      "action_stats": {
+        "min": [
+          -0.9375,
+          -0.9375,
+          -0.9375,
+          -0.2582142949104309,
+          -0.375,
+          -0.3675000071525574,
+          -1.0
+        ],
+        "max": [
+          0.9375,
+          0.9375,
+          0.9375,
+          0.3557142913341522,
+          0.375,
+          0.375,
+          1.0
+        ],
+        "mean": [
+          0.06278156570450202,
+          0.08684081017968912,
+          -0.09037305936952836,
+          0.0005407430783705141,
+          0.0056433796450358715,
+          -0.005229098518603562,
+          -0.04964072167678376
+        ],
+        "std": [
+          0.3355237114945633,
+          0.3784469867268323,
+          0.44472859911256607,
+          0.03924354049229973,
+          0.06339296407444922,
+          0.07797027713976648,
+          0.9987671529022402
+        ],
+        "count": [
+          273465.0
+        ],
+        "q01": [
+          -0.6792031928846481,
+          -0.7736573115323259,
+          -0.8728073904104404,
+          -0.10277447185825356,
+          -0.15509810617083444,
+          -0.20289961475228455,
+          -1.0
+        ],
+        "q10": [
+          -0.328718721971874,
+          -0.3626162647358338,
+          -0.6610056625361599,
+          -0.03907064459203904,
+          -0.06428551162168497,
+          -0.07928202560631951,
+          -1.0
+        ],
+        "q50": [
+          0.015333975787982875,
+          0.006437010746251905,
+          -0.07265095199149316,
+          -1.701317418858285e-05,
+          0.00021801956089207239,
+          -5.852172701796134e-05,
+          -0.12287333595187695
+        ],
+        "q90": [
+          0.5238177265233007,
+          0.671417970219526,
+          0.5384412174699407,
+          0.040331002487738146,
+          0.08240652401791884,
+          0.0690125677722944,
+          0.9999141552827842
+        ],
+        "q99": [
+          0.8536542808794264,
+          0.8637811051429717,
+          0.9363295547540081,
+          0.13045695485814487,
+          0.18015313802054606,
+          0.24129727661704234,
+          0.9999914155282784
+        ],
+        "names": [
+          "x",
+          "y",
+          "z",
+          "roll",
+          "pitch",
+          "yaw",
+          "gripper"
+        ],
+        "mask": [
+          true,
+          true,
+          true,
+          true,
+          true,
+          true,
+          false
+        ]
+      },
+      "state_stats": {
+        "min": [
+          -0.4828203022480011,
+          -0.3255046010017395,
+          0.008128180168569088,
+          0.35277295112609863,
+          -3.641430377960205,
+          -1.842738389968872,
+          -0.0013586411951109767,
+          -0.042040832340717316
+        ],
+        "max": [
+          0.21031762659549713,
+          0.39128610491752625,
+          1.3660105466842651,
+          3.6714255809783936,
+          3.560650587081909,
+          1.386339545249939,
+          0.04233968257904053,
+          0.0013633022317662835
+        ],
+        "mean": [
+          -0.04651878279191748,
+          0.034409066787269356,
+          0.7645525031210381,
+          2.9722094975655056,
+          -0.22046978549041713,
+          -0.1255794031738752,
+          0.026914253269017054,
+          -0.027190783616938205
+        ],
+        "std": [
+          0.10494395508556839,
+          0.1517661933220375,
+          0.378516707505034,
+          0.34427344187858827,
+          0.9069468516043042,
+          0.32539190149967406,
+          0.01417590382231912,
+          0.014058894296088888
+        ],
+        "count": [
+          273465.0
+        ],
+        "q01": [
+          -0.31479429659059555,
+          -0.26691552643710226,
+          0.5194626050191016,
+          2.159994551314992,
+          -1.801294177865994,
+          -0.8949778881389838,
+          0.003382730811955442,
+          -0.04008920533069468
+        ],
+        "q10": [
+          -0.18409729127502492,
+          -0.158759498072202,
+          0.5694822295083012,
+          2.501970046458546,
+          -1.1889107640062022,
+          -0.5297043790093273,
+          0.007573322430226042,
+          -0.039827946964434036
+        ],
+        "q50": [
+          -0.02822545357081922,
+          0.029718887641213443,
+          0.7185643731428462,
+          3.0915725099012166,
+          -0.12491069931831773,
+          -0.08338984738533357,
+          0.030648370056451133,
+          -0.031519123023466586
+        ],
+        "q90": [
+          0.06725052913150302,
+          0.23387160335018267,
+          0.9599947530498419,
+          3.1743361507512997,
+          0.5456820212337484,
+          0.20414514594693875,
+          0.03985537019679712,
+          -0.008040434619037518
+        ],
+        "q99": [
+          0.1222615490116252,
+          0.3140223876046953,
+          1.042961724319958,
+          3.277638017923068,
+          1.724488202195691,
+          0.5659922739094448,
+          0.04009682017699841,
+          -0.003493522538066522
+        ],
+        "names": [
+          "x",
+          "y",
+          "z",
+          "rx",
+          "ry",
+          "rz",
+          "rw",
+          "gripper"
+        ],
+        "mask": [
+          true,
+          true,
+          true,
+          true,
+          true,
+          true,
+          true,
+          false
+        ]
+      }
+    }
+  }
+}

processing_molmoact2.py ADDED Viewed

	@@ -0,0 +1,418 @@

+"""
+Processor class for MolmoAct2.
+"""
+from typing import Optional, Union
+import dataclasses
+import numpy as np
+from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
+from transformers.processing_utils import (
+    Unpack,
+    ProcessingKwargs,
+    ProcessorMixin,
+)
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
+from transformers.utils import logging
+from transformers import AutoTokenizer
+from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
+from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
+logger = logging.get_logger(__name__)
+# Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
+IMAGE_PATCH_TOKEN = f"<im_patch>"  # Where to insert high-res tokens
+IMAGE_LOW_RES_TOKEN = f"<im_low>"  # Where to insert low-res tokens
+IM_START_TOKEN = f"<im_start>"
+LOW_RES_IMAGE_START_TOKEN = f"<low_res_im_start>"
+FRAME_START_TOKEN = f"<frame_start>"
+IM_END_TOKEN = f"<im_end>"
+FRAME_END_TOKEN= f"<frame_end>"
+IM_COL_TOKEN = f"<im_col>"
+IMAGE_PROMPT = "<|image|>"
+VIDEO_PROMPT = "<|video|>"
+IMAGE_TOKENS = [
+    IMAGE_PATCH_TOKEN,
+    IM_COL_TOKEN,
+    IM_START_TOKEN,
+    LOW_RES_IMAGE_START_TOKEN,
+    FRAME_START_TOKEN,
+    IM_END_TOKEN,
+    FRAME_END_TOKEN,
+    IMAGE_LOW_RES_TOKEN,
+]
+class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema and per-modality defaults for `MolmoAct2Processor.__call__`."""
+    images_kwargs: MolmoAct2ImagesKwargs
+    videos_kwargs: MolmoAct2VideoProcessorKwargs
+    # Defaults handed to `_merge_kwargs` together with this class by the processor.
+    _defaults = {
+        "text_kwargs": {
+            # No padding unless the caller asks for it.
+            "padding": False,
+            # Ask for the image-token mask (`token_type_ids`) by default.
+            "return_mm_token_type_ids": True,
+        },
+        # The processor reads frame timestamps from the returned video metadata.
+        "videos_kwargs": {"return_metadata": True},
+    }
+class MolmoAct2Processor(ProcessorMixin):
+    attributes = ["image_processor", "video_processor", "tokenizer"]
+    optional_attributes = [
+        "chat_template",
+        "time_mode",
+        "image_use_col_tokens",
+        "use_single_crop_col_tokens",
+        "use_single_crop_start_token",
+        "video_use_col_tokens",
+        "use_frame_special_tokens",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        image_processor: MolmoAct2ImageProcessor = None,
+        video_processor: MolmoAct2VideoProcessor = None,
+        tokenizer: AutoTokenizer = None,
+        chat_template: Optional[str] = None,
+        image_use_col_tokens: Optional[bool] = True,
+        use_single_crop_col_tokens: Optional[bool] = None,
+        use_single_crop_start_token: Optional[bool] = True,
+        video_use_col_tokens: Optional[bool] = False,
+        use_frame_special_tokens: Optional[bool] = True,
+        **kwargs
+    ) -> None:
+        super().__init__(
+            image_processor,
+            video_processor,
+            tokenizer,
+            chat_template=chat_template,
+        )
+        self.image_use_col_tokens = image_use_col_tokens
+        self.use_single_crop_col_tokens = use_single_crop_col_tokens
+        self.use_single_crop_start_token = use_single_crop_start_token
+        self.video_use_col_tokens = video_use_col_tokens
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.image_placeholder_token = IMAGE_PROMPT
+        self.video_placeholder_token = VIDEO_PROMPT
+        self.image_token_ids = [
+            tokenizer.convert_tokens_to_ids(token)
+            for token in IMAGE_TOKENS
+        ]
+    def get_image_tokens(self, image_grid: np.ndarray):
+        resized_h, resized_w, height, width = image_grid
+        if int(height) == 0 or int(width) == 0:
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+            use_single_crop_col_tokens = (
+                self.image_use_col_tokens
+                if self.use_single_crop_col_tokens is None
+                else self.use_single_crop_col_tokens
+            )
+            if use_single_crop_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            joint = [
+                [IM_START_TOKEN],
+                np.tile(per_row, [resized_h]),
+                [IM_END_TOKEN],
+            ]
+            return np.concatenate(joint)
+        per_row = np.full(width, IMAGE_PATCH_TOKEN)
+        if self.image_use_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [IM_START_TOKEN],
+            np.tile(per_row, [height]),
+            [IM_END_TOKEN],
+        ]
+        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+        use_single_crop_col_tokens = (
+            self.image_use_col_tokens
+            if self.use_single_crop_col_tokens is None
+            else self.use_single_crop_col_tokens
+        )
+        image_start_token = (
+            LOW_RES_IMAGE_START_TOKEN
+            if self.use_single_crop_start_token
+            else IM_START_TOKEN
+        )
+        if use_single_crop_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        joint = [
+            [image_start_token],
+            np.tile(per_row, [resized_h]),
+            [IM_END_TOKEN],
+        ] + joint
+        return np.concatenate(joint)
+    def get_video_string(
+        self,
+        video_grid: np.ndarray,
+        timestamps: np.ndarray,
+    ):
+        if self.use_frame_special_tokens:
+            start_token_id = FRAME_START_TOKEN
+            end_token_id = FRAME_END_TOKEN
+        else:
+            start_token_id = IM_START_TOKEN
+            end_token_id = IM_END_TOKEN
+        num_frames, h, w = video_grid
+        video_string: str = ""
+        for frame_idx, frame_time in enumerate(timestamps):
+            # `per-frame-compact` time mode
+            prev_space = " " if frame_idx > 0 else ""
+            frame_prefix = prev_space + f"{frame_time:.1f} " # explicit whitespace before/after image tokens
+            video_string += frame_prefix
+            per_row = np.full(w, IMAGE_PATCH_TOKEN)
+            if self.video_use_col_tokens:
+                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+            extra_tokens = np.tile(per_row, [h])
+            video_tokens = [
+                [start_token_id],
+                extra_tokens,
+                [end_token_id],
+            ]
+            video_string += "".join(np.concatenate(video_tokens, 0))
+        return video_string
+    def insert_bos(
+        self,
+        input_ids: np.ndarray,
+        attention_mask: np.ndarray,
+        bos_token_id: int,
+        pad_token_id: int,
+    ):
+        """
+        Args:
+            input_ids: [B, S] array with left padding
+            attention_mask: [B, S] array (0 for pad, 1 for valid)
+            bos_token_id: int
+            pad_token_id: int
+        Returns:
+            input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
+            attention_mask_out: same shape as input_ids_out
+        """
+        need_to_expand = len(input_ids.shape) == 1
+        if need_to_expand:
+            input_ids = input_ids[None, :]
+            attention_mask = attention_mask[None, :]
+        B, S = input_ids.shape
+        # Handle zero-length sequence
+        if S == 0:
+            new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+            return new_input_ids, new_attention_mask
+        first_valid_index = (attention_mask == 1).argmax(axis=-1)  # [B]
+        bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)
+        if bos_already_present:
+            if need_to_expand:
+                input_ids = input_ids[0]
+                attention_mask = attention_mask[0]
+            return input_ids, attention_mask
+        else:
+            new_input_ids = np.full((B, S+1), pad_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.zeros((B, S+1), dtype=attention_mask.dtype)
+            src_idx = np.tile(np.arange(S), (B, 1))  # [B, S]
+            valid_mask = src_idx >= first_valid_index[:, None]  # [B, S]
+            tgt_idx = src_idx + 1  # shit right
+            batch_idx = np.tile(np.arange(B)[:, None], (1, S))  # [B, S]
+            # flatten valid_positions
+            flat_vals = input_ids[valid_mask]
+            flat_batch = batch_idx[valid_mask]
+            flat_tgt = tgt_idx[valid_mask]
+            new_input_ids[flat_batch, flat_tgt] = flat_vals
+            new_attention_mask[flat_batch, flat_tgt] = 1
+            insert_pos = first_valid_index
+            new_input_ids[np.arange(B), insert_pos] = bos_token_id
+            new_attention_mask[np.arange(B), insert_pos] = 1
+            if need_to_expand:
+                new_input_ids = new_input_ids[0]
+                new_attention_mask = new_attention_mask[0]
+            return new_input_ids, new_attention_mask
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[MolmoAct2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            videos (`dict[str, Any]` or `list[dict[str, Any]]`):
+                The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
+                - `"frames"`: `np.ndarray` of shape (T, H, W, 3)
+                - `"timestamps"`: `np.ndarray` of shape (T,)
+                - `"sampled_fps"`: `float` (optional)
+                - `"sampling_augmentation"`: `str` (optional)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            `BatchFeature`: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
+              Returned when `images` is not `None`.
+            - **image_grids** -- Grids of images. Returned when `images` is not `None`.
+            - **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
+              Returned when `videos` is not `None`.
+            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            MolmoAct2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_grids = image_inputs["image_grids"]
+        else:
+            image_inputs = {}
+            image_grids = None
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grids = videos_inputs["video_grids"]
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+        else:
+            videos_inputs = {}
+            video_grids = None
+        if not isinstance(text, list):
+            text = [text]
+        text = text.copy() # below lines change text in-place
+        if image_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_images = text[i].count(self.image_placeholder_token)
+                image_grids_i = image_grids[index:index+num_images]
+                for image_grid in image_grids_i:
+                    image_tokens = self.get_image_tokens(image_grid)
+                    image_string = "".join(image_tokens)
+                    text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
+                index += num_images
+        if video_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_videos = text[i].count(self.video_placeholder_token)
+                assert num_videos in {0, 1}, "At most one video is supported for now"
+                video_grids_i = video_grids[index:index+num_videos]
+                metadata_i = video_metadata[index:index+num_videos]
+                for video_grid, metadata in zip(video_grids_i, metadata_i):
+                    video_string = self.get_video_string(
+                        video_grid,
+                        metadata.timestamps,
+                    )
+                    text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
+                index += num_videos
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        input_ids = text_inputs["input_ids"]
+        attention_mask = text_inputs["attention_mask"]
+        input_ids = np.array(input_ids)
+        attention_mask = np.array(attention_mask)
+        bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+        input_ids, attention_mask = self.insert_bos(
+            input_ids, attention_mask, bos, self.tokenizer.pad_token_id
+        )
+        if return_mm_token_type_ids:
+            image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
+            token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)
+            text_inputs["token_type_ids"] = token_type_ids.tolist()
+        text_inputs["input_ids"] = input_ids.tolist()
+        text_inputs["attention_mask"] = attention_mask.tolist()
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+MolmoAct2Processor.register_for_auto_class()

processor_config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+  },
+  "image_processor": {
+    "auto_map": {
+      "AutoImageProcessor": "image_processing_molmoact2.MolmoAct2ImageProcessor",
+      "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+    },
+    "crop_mode": "resize",
+    "do_convert_rgb": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "MolmoAct2ImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_crops": 8,
+    "overlap_margins": [
+      4,
+      4
+    ],
+    "patch_size": 14,
+    "pooling_size": [
+      2,
+      2
+    ],
+    "resample": 2,
+    "size": {
+      "height": 378,
+      "width": 378
+    }
+  },
+  "image_use_col_tokens": true,
+  "processor_class": "MolmoAct2Processor",
+  "use_frame_special_tokens": true,
+  "use_single_crop_col_tokens": false,
+  "use_single_crop_start_token": true,
+  "video_processor": {
+    "auto_map": {
+      "AutoProcessor": "processing_molmoact2.MolmoAct2Processor",
+      "AutoVideoProcessor": "video_processing_molmoact2.MolmoAct2VideoProcessor"
+    },
+    "data_format": "channels_first",
+    "default_to_square": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "frame_sample_mode": "uniform_last_frame",
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_fps": 2.0,
+    "num_frames": 8,
+    "patch_size": 14,
+    "pooling_size": [
+      3,
+      3
+    ],
+    "resample": 2,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "sampling_fps": 2,
+    "size": {
+      "height": 378,
+      "width": 378
+    },
+    "video_processor_type": "MolmoAct2VideoProcessor"
+  },
+  "video_use_col_tokens": false
+}

quantization_metadata.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "source_repo": "allenai/MolmoAct2-LIBERO",
+  "source_revision": "0d24a92bd1faf321ef497c3bbd5681af97c65aa2",
+  "policy_class": "transformers:AutoModelForImageTextToText",
+  "quantization": {
+    "scheme": "nf4",
+    "backend": "bitsandbytes",
+    "compute_dtype": "bfloat16",
+    "min_params_to_quantize": 4000000,
+    "rule": "Linear modules with >=4_000_000 weight elements rewritten to bnb.nn.Linear4bit; smaller heads kept in compute_dtype (bfloat16).",
+    "runtime_status": "loader-backed (install_prequantized_linears)"
+  },
+  "dropped_state_entries": []
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5395aefc9b1b7f0385d8c86a2f1775e5af81bdfbf9f2d97827ea37921d9f862
+size 11983605

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "add_prefix_space": false,
+  "auto_map": {
+    "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
+  },
+  "backend": "tokenizers",
+  "bos_token": "<|im_end|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<im_start>",
+    "<im_end>",
+    "<im_patch>",
+    "<im_col>",
+    "<low_res_im_start>",
+    "<|image|>",
+    "<im_low>",
+    "<frame_start>",
+    "<frame_end>",
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
+  ],
+  "is_local": false,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "MolmoAct2Processor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}