qovaryx-350m-scratch-base / configuration_qovaryx.py
tjarvis91's picture
Initial release: Qovaryx random-init scratch base, Apache-2.0
9cdb2ab verified
"""HF PretrainedConfig wrapper around Qovaryx DecoderConfig."""
from __future__ import annotations
from transformers import PretrainedConfig
class QovaryxConfig(PretrainedConfig):
    """Hugging Face configuration class for Qovaryx models.

    Apart from ``tie_word_embeddings`` and ``**kwargs`` (both forwarded to
    ``PretrainedConfig.__init__``), every constructor argument is stored
    unchanged as an instance attribute of the same name.

    Six of those values are additionally copied to the attribute names that
    Hugging Face conventions use:

    * ``d_model``     -> ``hidden_size``
    * ``n_layer``     -> ``num_hidden_layers``
    * ``n_head``      -> ``num_attention_heads``
    * ``n_kv_head``   -> ``num_key_value_heads``
    * ``d_ff``        -> ``intermediate_size``
    * ``max_seq_len`` -> ``max_position_embeddings``

    The aliases are plain copies made at construction time; this class does
    nothing to keep them in sync if one side is reassigned later.
    """

    model_type = "qovaryx"

    def __init__(
        self,
        vocab_size: int = 20242,
        d_model: int = 512,
        n_layer: int = 12,
        n_head: int = 8,
        n_kv_head: int = 2,
        d_ff: int = 1408,
        max_seq_len: int = 2048,
        rope_base: float = 10000.0,
        rms_eps: float = 1e-5,
        dropout: float = 0.0,
        decision_head_classes: int = 4,
        decision_head_dropout: float = 0.0,
        decision_head_enabled: bool = False,
        mtp_k: int = 4,
        mtp_head_kind: str = "mlp",
        init_std: float = 0.02,
        tie_word_embeddings: bool = True,
        ffn_kind: str = "swiglu",
        ffn_rank: int = 128,
        ffn_experts: int = 4,
        ffn_top_k: int = 1,
        chart_patch_encoder_enabled: bool = False,
        chart_image_size: int = 224,
        chart_patch_size: int = 32,
        chart_channels: int = 3,
        chart_embed_dropout: float = 0.0,
        **kwargs,
    ):
        """Record the given hyper-parameters on the config instance.

        No validation or derivation is performed here: values are stored
        exactly as passed. ``tie_word_embeddings`` and any extra keyword
        arguments are handed to the base-class constructor.
        """
        # The base class is initialised before anything is stored, so the
        # explicit assignments below win over same-named entries in kwargs.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        self.vocab_size = vocab_size

        # Native name first, HF-convention alias second. Chained assignment
        # binds targets left to right, so the native attribute is written
        # before its alias.
        self.d_model = self.hidden_size = d_model
        self.n_layer = self.num_hidden_layers = n_layer
        self.n_head = self.num_attention_heads = n_head
        self.n_kv_head = self.num_key_value_heads = n_kv_head
        self.d_ff = self.intermediate_size = d_ff
        self.max_seq_len = self.max_position_embeddings = max_seq_len

        # Remaining scalar settings, stored as given.
        self.rope_base = rope_base
        self.rms_eps = rms_eps
        self.dropout = dropout

        # Decision-head settings.
        self.decision_head_classes = decision_head_classes
        self.decision_head_dropout = decision_head_dropout
        self.decision_head_enabled = decision_head_enabled

        # "mtp_*" settings and the initialisation std.
        self.mtp_k = mtp_k
        self.mtp_head_kind = mtp_head_kind
        self.init_std = init_std

        # Feed-forward variant settings.
        self.ffn_kind = ffn_kind
        self.ffn_rank = ffn_rank
        self.ffn_experts = ffn_experts
        self.ffn_top_k = ffn_top_k

        # Chart patch-encoder settings.
        self.chart_patch_encoder_enabled = chart_patch_encoder_enabled
        self.chart_image_size = chart_image_size
        self.chart_patch_size = chart_patch_size
        self.chart_channels = chart_channels
        self.chart_embed_dropout = chart_embed_dropout