"""HuggingFace configs for the MiniMax VL family (M2 VL / M3 VL).

This file is bundled into every converted HF checkpoint so that loading via
``AutoConfig.from_pretrained(..., trust_remote_code=True)`` works without any
runtime dependency on sglang or other internal packages — only stock
``transformers`` is required.

The class definitions intentionally mirror ``sglang.srt.configs.minimax_vl``;
if either side changes, keep them in sync.

The file is named ``configuration_minimax_m3_vl.py`` (matching the legacy
``model_type="minimax_m3_vl"`` and the converter's ``auto_map`` entry) so that
ckpts produced by this converter remain loadable by older sglang versions that
only know the ``MiniMaxM3VL*`` names. ``MiniMaxM3VLConfig`` is both the
canonical M3 VL class and the name referenced from ``auto_map``.
"""

from typing import Optional, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING

# Model types assumed when a sub-config dict carries no ``model_type`` key.
_DEFAULT_VISION_MODEL_TYPE = "clip_vision_model"
_DEFAULT_TEXT_MODEL_TYPE = "mixtral"


def _coerce_sub_config(
    sub_config: Optional[Union[dict, PretrainedConfig]], default_model_type: str
) -> Optional[PretrainedConfig]:
    """Convert a config dict to a ``PretrainedConfig`` instance.

    If ``model_type`` is registered in HF ``CONFIG_MAPPING`` the corresponding
    config class is used; otherwise we fall back to a generic
    ``PretrainedConfig`` so all dict keys still become real attributes (M3's
    text backbone uses ``model_type="minimax_m2"`` which is not in
    ``CONFIG_MAPPING``).

    Args:
        sub_config: A plain dict of config fields, or any non-dict value
            (``None``, an already-built config object, ...).
        default_model_type: Model type to look up when the dict has no
            ``"model_type"`` key.

    Returns:
        The instantiated config for dict input; any non-dict input is returned
        unchanged.
    """
    if not isinstance(sub_config, dict):
        return sub_config

    model_type = sub_config.get("model_type", default_model_type)
    # Use ``in`` + indexing rather than ``CONFIG_MAPPING.get(...)``: the lazy
    # mapping resolves entries through its overridden ``__contains__`` /
    # ``__getitem__``, while the inherited ``dict.get`` does not go through
    # those overrides and would hand back the fallback even for registered
    # model types.
    # NOTE(review): confirm against the pinned transformers release, and apply
    # the same change to the sglang mirror of this helper.
    if model_type in CONFIG_MAPPING:
        config_cls = CONFIG_MAPPING[model_type]
    else:
        config_cls = PretrainedConfig
    return config_cls(**sub_config)


class MiniMaxVLBaseConfig(PretrainedConfig):
    """Base config shared by every MiniMax VL variant.

    Handles vision/text sub-config coercion. Concrete subclasses only need to
    declare a unique ``model_type`` string.
    """

    def __init__(
        self,
        vision_config: Optional[Union[dict, PretrainedConfig]] = None,
        text_config: Optional[Union[dict, PretrainedConfig]] = None,
        image_token_index: int = 200025,
        video_token_index: int = 200026,
        image_seq_length: int = 576,
        process_image_mode: str = "dynamic_res",
        projector_hidden_act: str = "gelu",
        multimodal_projector_bias: bool = True,
        vision_feature_layer: int = -1,
        vision_feature_select_strategy: str = "full",
        img_token_compression_config: Optional[dict] = None,
        image_grid_pinpoints: Optional[str] = None,
        **kwargs,
    ):
        """Store the VL-specific fields, then defer to ``PretrainedConfig``.

        Args:
            vision_config: Vision sub-config; a dict is turned into a config
                object (default model type ``"clip_vision_model"``), anything
                else is stored unchanged.
            text_config: Text sub-config; a dict is turned into a config
                object (default model type ``"mixtral"``), anything else is
                stored unchanged.
            img_token_compression_config: Stored as given; ``None`` (or any
                other falsy value) is replaced by a fresh empty dict.
            **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.

        All remaining parameters are stored verbatim as attributes of the
        same name.
        """
        self.vision_config = _coerce_sub_config(
            vision_config, _DEFAULT_VISION_MODEL_TYPE
        )
        self.text_config = _coerce_sub_config(text_config, _DEFAULT_TEXT_MODEL_TYPE)
        self.image_token_index = image_token_index
        self.video_token_index = video_token_index
        self.image_seq_length = image_seq_length
        self.process_image_mode = process_image_mode
        self.projector_hidden_act = projector_hidden_act
        self.multimodal_projector_bias = multimodal_projector_bias
        self.vision_feature_layer = vision_feature_layer
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.img_token_compression_config = img_token_compression_config or {}
        self.image_grid_pinpoints = image_grid_pinpoints
        # The base initializer runs last, after every VL attribute is set.
        super().__init__(**kwargs)

    def __post_init__(self, **kwargs):
        """Re-coerce sub-configs that are still plain dicts.

        NOTE(review): presumably this hook is only invoked by transformers
        releases whose ``PretrainedConfig`` defines ``__post_init__``; on
        releases without it nothing calls this method — confirm against the
        supported transformers versions.
        """
        super().__post_init__(**kwargs)
        # ``hasattr`` guards: the attributes may not exist yet if this hook
        # runs without ``__init__`` above having populated them.
        if hasattr(self, "vision_config"):
            self.vision_config = _coerce_sub_config(
                self.vision_config, _DEFAULT_VISION_MODEL_TYPE
            )
        if hasattr(self, "text_config"):
            self.text_config = _coerce_sub_config(
                self.text_config, _DEFAULT_TEXT_MODEL_TYPE
            )


class MiniMaxM2VLConfig(MiniMaxVLBaseConfig):
    """MiniMax M2 VL: vision tower + M2 (Mixtral-style MoE) text backbone."""

    model_type = "minimax_m2_vl"


class MiniMaxM3VLConfig(MiniMaxVLBaseConfig):
    """MiniMax M3 VL: vision tower + M3 (mixed sparse/dense MoE) text backbone.

    Also the class that ``model_type="minimax_m3_vl"`` ckpts load through; a
    former same-named subclass that merely re-declared this ``model_type`` was
    folded into this definition.
    """

    model_type = "minimax_m3_vl"


class MiniMaxM2MiniVLConfig(MiniMaxM2VLConfig):
    """Legacy alias kept so old ``model_type="minimax_m2_mini_vl"`` ckpts load."""

    model_type = "minimax_m2_mini_vl"