bibproj commited on
Commit
638c857
·
verified ·
1 Parent(s): ceda88d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +2 -5
  2. chat_template.jinja +112 -0
  3. config.json +0 -0
  4. configuration_deepseek.py +214 -0
  5. configuration_kimi_k25.py +123 -0
  6. generation_config.json +4 -0
  7. kimi_k25_processor.py +165 -0
  8. kimi_k25_vision_processing.py +251 -0
  9. media_utils.py +368 -0
  10. model-00001-of-00099.safetensors +3 -0
  11. model-00002-of-00099.safetensors +3 -0
  12. model-00003-of-00099.safetensors +3 -0
  13. model-00004-of-00099.safetensors +3 -0
  14. model-00005-of-00099.safetensors +3 -0
  15. model-00006-of-00099.safetensors +3 -0
  16. model-00007-of-00099.safetensors +3 -0
  17. model-00008-of-00099.safetensors +3 -0
  18. model-00009-of-00099.safetensors +3 -0
  19. model-00010-of-00099.safetensors +3 -0
  20. model-00011-of-00099.safetensors +3 -0
  21. model-00012-of-00099.safetensors +3 -0
  22. model-00013-of-00099.safetensors +3 -0
  23. model-00014-of-00099.safetensors +3 -0
  24. model-00015-of-00099.safetensors +3 -0
  25. model-00016-of-00099.safetensors +3 -0
  26. model-00017-of-00099.safetensors +3 -0
  27. model-00018-of-00099.safetensors +3 -0
  28. model-00019-of-00099.safetensors +3 -0
  29. model-00020-of-00099.safetensors +3 -0
  30. model-00021-of-00099.safetensors +3 -0
  31. model-00022-of-00099.safetensors +3 -0
  32. model-00023-of-00099.safetensors +3 -0
  33. model-00024-of-00099.safetensors +3 -0
  34. model-00025-of-00099.safetensors +3 -0
  35. model-00026-of-00099.safetensors +3 -0
  36. model-00027-of-00099.safetensors +3 -0
  37. model-00028-of-00099.safetensors +3 -0
  38. model-00029-of-00099.safetensors +3 -0
  39. model-00030-of-00099.safetensors +3 -0
  40. model-00031-of-00099.safetensors +3 -0
  41. model-00032-of-00099.safetensors +3 -0
  42. model-00033-of-00099.safetensors +3 -0
  43. model-00034-of-00099.safetensors +3 -0
  44. model-00035-of-00099.safetensors +3 -0
  45. model-00036-of-00099.safetensors +3 -0
  46. model-00037-of-00099.safetensors +3 -0
  47. model-00038-of-00099.safetensors +3 -0
  48. model-00039-of-00099.safetensors +3 -0
  49. model-00040-of-00099.safetensors +3 -0
  50. model-00041-of-00099.safetensors +3 -0
README.md CHANGED
@@ -1,10 +1,7 @@
1
  ---
2
- license: other
3
- license_name: modified-mit
4
  library_name: mlx
 
5
  tags:
6
  - mlx
7
- - transformers
8
- pipeline_tag: text-generation
9
- base_model: moonshotai/Kimi-K2.6
10
  ---
 
1
  ---
2
+ language: en
 
3
  library_name: mlx
4
+ pipeline_tag: text-generation
5
  tags:
6
  - mlx
 
 
 
7
  ---
chat_template.jinja ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
8
+ <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
10
+ <|kimi_k25_video_placeholder|>
11
+ {% else -%}
12
+ {{ content['text'] }}
13
+ {%- endif -%}
14
+ {%- endfor -%}
15
+ {%- endif -%}
16
+ {%- endmacro -%}
17
+
18
+ {% macro set_roles(message) -%}
19
+ {%- set role_name = message.get('name') or message['role'] -%}
20
+ {%- if message['role'] == 'user' -%}
21
+ <|im_user|>{{role_name}}<|im_middle|>
22
+ {%- elif message['role'] == 'assistant' -%}
23
+ <|im_assistant|>{{role_name}}<|im_middle|>
24
+ {%- else -%}
25
+ <|im_system|>{{role_name}}<|im_middle|>
26
+ {%- endif -%}
27
+ {%- endmacro -%}
28
+
29
+
30
+ {%- macro render_toolcalls(message) -%}
31
+ <|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- endmacro -%}
38
+
39
+
40
+ {%- set preserve_thinking = preserve_thinking | default(false) -%}
41
+ {# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
42
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
43
+ {%- if not preserve_thinking -%}
44
+ {%- for idx in range(messages|length-1, -1, -1) -%}
45
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
46
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
47
+ {%- break -%}
48
+ {%- endif -%}
49
+ {%- endfor -%}
50
+ {%- endif -%}
51
+
52
+ {# split all messages into history & suffix, reasoning_content in suffix should be reserved.#}
53
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
54
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
55
+
56
+ {%- if tools -%}
57
+ {%- if tools_ts_str -%}
58
+ <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
59
+ {%- else -%}
60
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
61
+ {%- endif -%}
62
+ {%- endif -%}
63
+
64
+
65
+ {%- for message in hist_msgs -%}
66
+ {{set_roles(message)}}
67
+ {%- if message['role'] == 'assistant' -%}
68
+ <think></think>{{render_content(message)}}
69
+ {%- if message.get('tool_calls') -%}
70
+ {{render_toolcalls(message)}}
71
+ {%- endif -%}
72
+ {%- elif message['role'] == 'tool' -%}
73
+ {%- set tool_call_id = message.tool_call_id -%}
74
+ ## Return of {{ tool_call_id }}
75
+ {{render_content(message)}}
76
+ {%- elif message['content'] is not none -%}
77
+ {{render_content(message)}}
78
+ {%- endif -%}
79
+ <|im_end|>
80
+ {%- endfor -%}
81
+
82
+ {%- for message in suffix_msgs -%}
83
+ {{set_roles(message)}}
84
+ {%- if message['role'] == 'assistant' -%}
85
+ {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
86
+ <think></think>{{render_content(message)}}
87
+ {%- else -%}
88
+ {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
89
+ <think>{{rc}}</think>{{render_content(message)}}
90
+ {%- endif -%}
91
+ {%- if message.get('tool_calls') -%}
92
+ {{render_toolcalls(message)}}
93
+ {%- endif -%}
94
+ {%- elif message['role'] == 'tool' -%}
95
+ {%- set tool_call_id = message.tool_call_id -%}
96
+ ## Return of {{ tool_call_id }}
97
+ {{render_content(message)}}
98
+ {%- elif message['content'] is not none -%}
99
+ {{render_content(message)}}
100
+ {%- endif -%}
101
+ <|im_end|>
102
+ {%- endfor -%}
103
+
104
+
105
+ {%- if add_generation_prompt -%}
106
+ <|im_assistant|>assistant<|im_middle|>
107
+ {%- if thinking is defined and thinking is false -%}
108
+ <think></think>
109
+ {%- else -%}
110
+ <think>
111
+ {%- endif -%}
112
+ {%- endif -%}
config.json ADDED
The diff for this file is too large to render. See raw diff
 
configuration_deepseek.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__)
7
+
8
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
9
+
10
+
11
+ class DeepseekV3Config(PretrainedConfig):
12
+ r"""
13
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
14
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
15
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
16
+
17
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
18
+ documentation from [`PretrainedConfig`] for more information.
19
+
20
+
21
+ Args:
22
+ vocab_size (`int`, *optional*, defaults to 129280):
23
+ Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
24
+ `inputs_ids` passed when calling [`DeepseekV3Model`]
25
+ hidden_size (`int`, *optional*, defaults to 4096):
26
+ Dimension of the hidden representations.
27
+ intermediate_size (`int`, *optional*, defaults to 11008):
28
+ Dimension of the MLP representations.
29
+ moe_intermediate_size (`int`, *optional*, defaults to 1407):
30
+ Dimension of the MoE representations.
31
+ num_hidden_layers (`int`, *optional*, defaults to 32):
32
+ Number of hidden layers in the Transformer decoder.
33
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
34
+ Number of nextn predict layers in the DeepSeekV3 Model.
35
+ num_attention_heads (`int`, *optional*, defaults to 32):
36
+ Number of attention heads for each attention layer in the Transformer decoder.
37
+ n_shared_experts (`int`, *optional*, defaults to None):
38
+ Number of shared experts, None means dense model.
39
+ n_routed_experts (`int`, *optional*, defaults to None):
40
+ Number of routed experts, None means dense model.
41
+ routed_scaling_factor (`float`, *optional*, defaults to 1.0):
42
+ Scaling factor or routed experts.
43
+ topk_method (`str`, *optional*, defaults to `gready`):
44
+ Topk method used in routed gate.
45
+ n_group (`int`, *optional*, defaults to None):
46
+ Number of groups for routed experts.
47
+ topk_group (`int`, *optional*, defaults to None):
48
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
49
+ num_experts_per_tok (`int`, *optional*, defaults to None):
50
+ Number of selected experts, None means dense model.
51
+ moe_layer_freq (`int`, *optional*, defaults to 1):
52
+ The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
53
+ first_k_dense_replace (`int`, *optional*, defaults to 0):
54
+ Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
55
+ \--k dense layers--/
56
+ norm_topk_prob (`bool`, *optional*, defaults to False):
57
+ Whether to normalize the weights of the routed experts.
58
+ scoring_func (`str`, *optional*, defaults to 'softmax'):
59
+ Method of computing expert weights.
60
+ aux_loss_alpha (`float`, *optional*, defaults to 0.001):
61
+ Auxiliary loss weight coefficient.
62
+ seq_aux = (`bool`, *optional*, defaults to True):
63
+ Whether to compute the auxiliary loss for each individual sample.
64
+ num_key_value_heads (`int`, *optional*):
65
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
66
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
67
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
68
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
69
+ by meanpooling all the original heads within that group. For more details checkout [this
70
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
71
+ `num_attention_heads`.
72
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
73
+ The non-linear activation function (function or string) in the decoder.
74
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
75
+ The maximum sequence length that this model might ever be used with.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
79
+ The epsilon used by the rms normalization layers.
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
82
+ relevant if `config.is_decoder=True`.
83
+ pad_token_id (`int`, *optional*):
84
+ Padding token id.
85
+ bos_token_id (`int`, *optional*, defaults to 1):
86
+ Beginning of stream token id.
87
+ eos_token_id (`int`, *optional*, defaults to 2):
88
+ End of stream token id.
89
+ pretraining_tp (`int`, *optional*, defaults to 1):
90
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
91
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
92
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
93
+ issue](https://github.com/pytorch/pytorch/issues/76232).
94
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
95
+ Whether to tie weight embeddings
96
+ rope_theta (`float`, *optional*, defaults to 10000.0):
97
+ The base period of the RoPE embeddings.
98
+ rope_scaling (`Dict`, *optional*):
99
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
100
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
101
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
102
+ `max_position_embeddings` to the expected new maximum.
103
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
104
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
105
+ attention_dropout (`float`, *optional*, defaults to 0.0):
106
+ The dropout ratio for the attention probabilities.
107
+
108
+ ```python
109
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
110
+
111
+ >>> # Initializing a Deepseek-V3 style configuration
112
+ >>> configuration = DeepseekV3Config()
113
+
114
+ >>> # Accessing the model configuration
115
+ >>> configuration = model.config
116
+ ```"""
117
+
118
+ model_type = "deepseek_v3"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=129280,
124
+ hidden_size=7168,
125
+ intermediate_size=18432,
126
+ moe_intermediate_size=2048,
127
+ num_hidden_layers=61,
128
+ num_nextn_predict_layers=1,
129
+ num_attention_heads=128,
130
+ num_key_value_heads=128,
131
+ n_shared_experts=1,
132
+ n_routed_experts=256,
133
+ ep_size=1,
134
+ routed_scaling_factor=2.5,
135
+ kv_lora_rank=512,
136
+ q_lora_rank=1536,
137
+ qk_rope_head_dim=64,
138
+ v_head_dim=128,
139
+ qk_nope_head_dim=128,
140
+ topk_method='noaux_tc',
141
+ n_group=8,
142
+ topk_group=4,
143
+ num_experts_per_tok=8,
144
+ moe_layer_freq=1,
145
+ first_k_dense_replace=3,
146
+ norm_topk_prob=True,
147
+ scoring_func='sigmoid',
148
+ aux_loss_alpha=0.001,
149
+ seq_aux=True,
150
+ hidden_act="silu",
151
+ max_position_embeddings=4096,
152
+ initializer_range=0.02,
153
+ rms_norm_eps=1e-6,
154
+ use_cache=True,
155
+ pad_token_id=None,
156
+ bos_token_id=0,
157
+ eos_token_id=1,
158
+ pretraining_tp=1,
159
+ tie_word_embeddings=False,
160
+ rope_theta=10000.0,
161
+ rope_scaling=None,
162
+ attention_bias=False,
163
+ attention_dropout=0.0,
164
+ **kwargs,
165
+ ):
166
+ self.vocab_size = vocab_size
167
+ self.max_position_embeddings = max_position_embeddings
168
+ self.hidden_size = hidden_size
169
+ self.intermediate_size = intermediate_size
170
+ self.moe_intermediate_size = moe_intermediate_size
171
+ self.num_hidden_layers = num_hidden_layers
172
+ self.num_nextn_predict_layers = num_nextn_predict_layers
173
+ self.num_attention_heads = num_attention_heads
174
+ self.n_shared_experts = n_shared_experts
175
+ self.n_routed_experts = n_routed_experts
176
+ self.ep_size = ep_size
177
+ self.routed_scaling_factor = routed_scaling_factor
178
+ self.kv_lora_rank = kv_lora_rank
179
+ self.q_lora_rank = q_lora_rank
180
+ self.qk_rope_head_dim = qk_rope_head_dim
181
+ self.v_head_dim = v_head_dim
182
+ self.qk_nope_head_dim = qk_nope_head_dim
183
+ self.topk_method = topk_method
184
+ self.n_group = n_group
185
+ self.topk_group = topk_group
186
+ self.num_experts_per_tok = num_experts_per_tok
187
+ self.moe_layer_freq = moe_layer_freq
188
+ self.first_k_dense_replace = first_k_dense_replace
189
+ self.norm_topk_prob = norm_topk_prob
190
+ self.scoring_func = scoring_func
191
+ self.aux_loss_alpha = aux_loss_alpha
192
+ self.seq_aux = seq_aux
193
+ # for backward compatibility
194
+ if num_key_value_heads is None:
195
+ num_key_value_heads = num_attention_heads
196
+
197
+ self.num_key_value_heads = num_key_value_heads
198
+ self.hidden_act = hidden_act
199
+ self.initializer_range = initializer_range
200
+ self.rms_norm_eps = rms_norm_eps
201
+ self.pretraining_tp = pretraining_tp
202
+ self.use_cache = use_cache
203
+ self.rope_theta = rope_theta
204
+ self.rope_scaling = rope_scaling
205
+ self.attention_bias = attention_bias
206
+ self.attention_dropout = attention_dropout
207
+
208
+ super().__init__(
209
+ pad_token_id=pad_token_id,
210
+ bos_token_id=bos_token_id,
211
+ eos_token_id=eos_token_id,
212
+ tie_word_embeddings=tie_word_embeddings,
213
+ **kwargs,
214
+ )
configuration_kimi_k25.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+
3
+ try:
4
+ from configuration_deepseek import DeepseekV3Config
5
+ except ImportError:
6
+ from .configuration_deepseek import DeepseekV3Config
7
+
8
+
9
+ class KimiK25VisionConfig(PretrainedConfig):
10
+
11
+ def __init__(
12
+ self,
13
+ patch_size: int = 14,
14
+ init_pos_emb_height: int = 64,
15
+ init_pos_emb_width: int = 64,
16
+ init_pos_emb_time: int = 4,
17
+ pos_emb_type: str = 'divided_fixed',
18
+ vt_num_attention_heads: int = 16,
19
+ vt_num_hidden_layers: int = 27,
20
+ vt_hidden_size: int = 1152,
21
+ vt_intermediate_size: int = 4304,
22
+ merge_kernel_size: tuple = (2, 2),
23
+ video_attn_type: str = 'spatial_temporal',
24
+ merge_type: str = 'sd2_tpool',
25
+ _attn_implementation: str = 'flash_attention_2',
26
+ # MM Projector parameters
27
+ mm_projector_type: str = 'patchmerger',
28
+ mm_hidden_size: int | None = None,
29
+ projector_hidden_act: str = "gelu",
30
+ projector_ln_eps: float = 1e-5,
31
+ # Other parameters
32
+ ignore_index: int = -100,
33
+ media_placeholder_token_id: int = 163605,
34
+ pad_token_id: int = 0,
35
+ use_unified_vision_chunk: bool = True,
36
+ video_placeholder="<|kimi_k25_video_placeholder|>",
37
+ text_hidden_size=7168,
38
+ **vision_config_kwargs):
39
+
40
+ self.patch_size = patch_size
41
+ self.init_pos_emb_height = init_pos_emb_height
42
+ self.init_pos_emb_width = init_pos_emb_width
43
+ self.init_pos_emb_time = init_pos_emb_time
44
+ self.pos_emb_type = pos_emb_type
45
+ self.vt_num_attention_heads = vt_num_attention_heads
46
+ self.vt_num_hidden_layers = vt_num_hidden_layers
47
+ self.vt_hidden_size = vt_hidden_size
48
+ self.vt_intermediate_size = vt_intermediate_size
49
+ self.merge_kernel_size = merge_kernel_size
50
+ self.video_attn_type = video_attn_type
51
+ self.merge_type = merge_type
52
+ self._attn_implementation = _attn_implementation
53
+
54
+ # MM Projector config
55
+ self.mm_projector_type = mm_projector_type
56
+ self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else vt_hidden_size
57
+ self.projector_hidden_act = projector_hidden_act
58
+ self.projector_ln_eps = projector_ln_eps
59
+ self.text_hidden_size = text_hidden_size
60
+
61
+
62
+ class KimiK25Config(PretrainedConfig):
63
+ """Kimi-K2.5 model configuration.
64
+
65
+ Args:
66
+ text_config (dict | DeepseekV3Config): Configuration for the text model.
67
+
68
+ Vision Tower Parameters (from MoonViT3dConfig):
69
+ patch_size (int): Patch size for vision tower.
70
+ init_pos_emb_height (int): Initial position embedding height.
71
+ init_pos_emb_width (int): Initial position embedding width.
72
+ init_pos_emb_time (int): Initial position embedding time dimension.
73
+ pos_emb_type (str): Type of position embedding.
74
+ vt_num_attention_heads (int): Number of attention heads in vision tower.
75
+ vt_num_hidden_layers (int): Number of hidden layers in vision tower.
76
+ vt_hidden_size (int): Hidden size of vision tower.
77
+ vt_intermediate_size (int): Intermediate size in vision tower FFN.
78
+ merge_kernel_size (tuple): Kernel size for patch merging.
79
+ video_attn_type (str): Type of video attention.
80
+ merge_type (str): Type of merge operation.
81
+ _attn_implementation (str): Attention implementation type.
82
+
83
+ MM Projector Parameters (from MultiModalProjectorConfig):
84
+ mm_projector_type (str): Type of multimodal projector.
85
+ mm_hidden_size (int): Hidden size from vision tower (should match vt_hidden_size).
86
+ projector_hidden_act (str): Activation function for projector.
87
+ projector_ln_eps (float): Layer norm epsilon for projector.
88
+
89
+ Other Parameters:
90
+ ignore_index (int): The ignore index for the loss function.
91
+ media_placeholder_token_id (int): The token ID to use for media placeholders.
92
+ pad_token_id (int): The token ID to use for padding.
93
+ """
94
+
95
+ model_type = "kimi_k25"
96
+
97
+ def __init__(
98
+ self,
99
+ text_config: dict | DeepseekV3Config = None,
100
+ vision_config: dict | KimiK25VisionConfig = None,
101
+ # Other parameters
102
+ ignore_index: int = -100,
103
+ media_placeholder_token_id: int = 163605,
104
+ pad_token_id: int = 0,
105
+ use_unified_vision_chunk: bool = True,
106
+ video_placeholder="<|kimi_k25_video_placeholder|>",
107
+ **kwargs,
108
+ ):
109
+ if isinstance(text_config, dict):
110
+ text_config = DeepseekV3Config(**text_config)
111
+ if isinstance(vision_config, dict):
112
+ vision_config = KimiK25VisionConfig(**vision_config)
113
+ self.text_config = text_config
114
+ self.vision_config = vision_config
115
+ # Other config
116
+ self.ignore_index = ignore_index
117
+ self.media_placeholder_token_id = media_placeholder_token_id
118
+ self.use_unified_vision_chunk = use_unified_vision_chunk
119
+ self.video_placeholder = video_placeholder
120
+ if getattr(self.text_config, "quantization_config", None) is not None:
121
+ self.quantization_config = self.text_config.quantization_config
122
+
123
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_length": 262144,
3
+ "eos_token_id": 163586
4
+ }
kimi_k25_processor.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.feature_extraction_utils import BatchFeature
2
+ from transformers.processing_utils import ProcessorMixin
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+
8
+ class KimiK25Processor(ProcessorMixin):
9
+ r"""
10
+ Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.
11
+
12
+ [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
13
+ [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.
14
+
15
+ Args:
16
+ image_processor ([`KimiK25ImageProcessor`], *optional*):
17
+ The image processor is a required input.
18
+ tokenizer ([`TikTokenTokenizer`], *optional*):
19
+ The tokenizer is a required input.
20
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
21
+ in a chat into a tokenizable string.
22
+ """
23
+
24
+ attributes = ["image_processor", "tokenizer"]
25
+ valid_kwargs = ["chat_template"]
26
+ image_processor_class = "AutoImageProcessor"
27
+ tokenizer_class = "AutoTokenizer"
28
+
29
+ def __init__(
30
+ self,
31
+ image_processor=None,
32
+ tokenizer=None,
33
+ chat_template=None,
34
+ **kwargs,
35
+ ):
36
+ super().__init__(image_processor,
37
+ tokenizer,
38
+ chat_template=chat_template)
39
+ self.media_processor = image_processor
40
+ # A special temporal placeholder to be replaced by actual video placeholders
41
+ self.video_placeholder = "<|kimi_k25_video_placeholder|>"
42
+
43
+ def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
44
+ # replace video prompt in text with video chunk prompts
45
+ video_count = text.count(self.video_placeholder)
46
+ if video_count == 0:
47
+ return text
48
+ assert video_count == len(video_prompts)
49
+ text_parts = text.split(self.video_placeholder)
50
+ assert len(text_parts) == len(video_prompts) + 1
51
+ text = "".join([
52
+ text_parts[i] + video_prompts[i] for i in range(len(video_prompts))
53
+ ])
54
+ text += text_parts[-1]
55
+ return text
56
+
57
+ def preprocess_medias(self, medias: list[dict]) -> list[dict]:
58
+ updated_medias = []
59
+ video_prompts = []
60
+ for media in medias:
61
+ if media['type'] == 'image':
62
+ updated_medias.append(media)
63
+ elif media['type'] == 'video':
64
+ video_chunks = self.media_processor.split_video_chunks(
65
+ media['video'])
66
+ updated_medias.extend(video_chunks)
67
+ video_prompts.append("".join(
68
+ [vc['prompt'] for vc in video_chunks]))
69
+ else:
70
+ raise ValueError(f"unsupported media type: {media['type']}")
71
+ return updated_medias, video_prompts
72
+
73
+ def __call__(self,
74
+ messages: list[dict] = None,
75
+ medias: list[dict] = None,
76
+ text: str = None,
77
+ return_tensors: str = "pt",
78
+ **kwargs) -> BatchFeature:
79
+ """
80
+ Process multimodal inputs for Kimi-K2.5 model.
81
+
82
+ This processor accepts ordered messages and extracts both media and text in a single pass.
83
+ text will be automatically updated if video input detected in messages
84
+
85
+ Args:
86
+ messages: List of message dicts with 'role' and 'content' fields.
87
+ If provided, medias and text will be extracted automatically.
88
+ medias: Pre-extracted list of media dicts. If None, extracted from messages.
89
+ text: Pre-formatted text string. If None, generated via apply_chat_template.
90
+ return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
91
+ **kwargs: Additional arguments passed to tokenizer.apply_chat_template.
92
+
93
+ Returns:
94
+ BatchFeature with fields: input_ids, attention_mask, pixel_values, grid_thws.
95
+ """
96
+ if messages is None and (medias is None or text is None):
97
+ raise ValueError(
98
+ "Provide either 'messages' or both 'medias' and 'text'")
99
+
100
+ if medias is not None and text is not None:
101
+ updated_medias, video_prompts = self.preprocess_medias(medias)
102
+ preprocessed = self.media_processor.preprocess(
103
+ updated_medias, return_tensors=return_tensors)
104
+ text = self.update_raw_text(text, video_prompts)
105
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
106
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
107
+
108
+ if medias is None:
109
+ medias = self._extract_medias_from_messages(messages)
110
+ updated_medias, video_prompts = self.preprocess_medias(medias)
111
+ preprocessed = self.media_processor.preprocess(
112
+ updated_medias, return_tensors=return_tensors)
113
+
114
+ # Generate text if not provided
115
+ if text is None:
116
+ text = self.tokenizer.apply_chat_template(messages, **kwargs)
117
+
118
+ text = self.update_raw_text(text, video_prompts)
119
+
120
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
121
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
122
+
123
+ @staticmethod
124
+ def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
125
+ """
126
+ Extract media items from messages in a single pass.
127
+
128
+ This is an optimized version that processes messages only once.
129
+ Kept as internal method since external callers should use __call__.
130
+ """
131
+ medias = []
132
+ for msg in messages:
133
+ if msg['role'] != 'user' or not msg.get('content'):
134
+ continue
135
+
136
+ for content_part in msg['content']:
137
+ if not isinstance(content_part, dict):
138
+ continue
139
+
140
+ content_type = content_part.get('type')
141
+ if content_type in ['video_url', 'video']:
142
+ medias.append({
143
+ 'type': 'video',
144
+ 'video': content_part['video_url']['url'],
145
+ 'first_frame_timestamp': 0.0
146
+ })
147
+ elif content_type in ['image_url', 'image']:
148
+ medias.append({
149
+ 'type': 'image',
150
+ 'image': content_part['image_url'],
151
+ })
152
+ return medias
153
+
154
+ def apply_chat_template(self, messages, **kwargs):
155
+ return self.tokenizer.apply_chat_template(messages, **kwargs)
156
+
157
+ def batch_decode(self, *args, **kwargs):
158
+ return self.tokenizer.batch_decode(*args, **kwargs)
159
+
160
+ def decode(self, *args, **kwargs):
161
+ return self.tokenizer.decode(*args, **kwargs)
162
+
163
+ @property
164
+ def model_input_names(self):
165
+ return ['input_ids', 'attention_mask', 'pixel_values', 'grid_thws']
kimi_k25_vision_processing.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image processor class for Kimi-K2.5.
2
+ """
3
+
4
+ import json
5
+ from typing import Any, Dict, Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image
10
+ from transformers.image_processing_utils import (BaseImageProcessor,
11
+ BatchFeature)
12
+ from transformers.utils import TensorType
13
+
14
+ from .media_utils import (MediaInput, VideoChunkInput, _to_tensor,
15
+ ensure_media_type, get_video_meta, image_to_np,
16
+ navit_patchify, navit_resize_image,
17
+ navit_resize_video, normalize,
18
+ real_sample_fps_and_max_num_frames, timestamp_as_str)
19
+
20
+ try:
21
+ from mecord import VideoReader
22
+ except ImportError:
23
+ VideoReader = None
24
+
25
+
26
+ def resampling(video_bytes: bytes,
27
+ sample_indices: list[int],
28
+ key_indices=None,
29
+ frame_time_info=None,
30
+ num_threads=4) -> str:
31
+ video = VideoReader(video_bytes,
32
+ num_threads=num_threads,
33
+ frame_time_info=frame_time_info,
34
+ key_indices=key_indices)
35
+ # extract target frames
36
+ frames = video[sample_indices]
37
+ frames = [Image.fromarray(frame) for frame in frames]
38
+ return frames
39
+
40
+
41
+ class KimiK25VisionProcessor(BaseImageProcessor):
42
+ model_type = "kimi_k25"
43
+
44
+ def __init__(
45
+ self,
46
+ media_proc_cfg: dict,
47
+ **kwargs,
48
+ ):
49
+ super().__init__(**kwargs)
50
+ self.media_proc_cfg = media_proc_cfg
51
+ self.num_frames_per_chunk = media_proc_cfg[
52
+ 'temporal_merge_kernel_size']
53
+
54
+ def media_tokens_calculator(self, media: MediaInput):
55
+ media = ensure_media_type(media)
56
+ ret = self.get_resize_config(media)
57
+ return ret['num_tokens']
58
+
59
+ @classmethod
60
+ def make_chunk_prompt(cls, timestamp_text: str) -> str:
61
+ return f"{timestamp_text}<|media_begin|>video<|media_content|><|media_pad|><|media_end|>"
62
+
63
+ def split_video_chunks(self,
64
+ video_url: str | bytes) -> list[list[Image.Image]]:
65
+ # video_url should be base64 str or bytes
66
+ video_spec = get_video_meta(video_url)
67
+ sample_fps = min(self.media_proc_cfg['sample_fps'], video_spec.fps)
68
+ sampled_nframes = max(
69
+ round(video_spec.num_frames * sample_fps / video_spec.fps), 1)
70
+ frame_inds = np.linspace(0, video_spec.num_frames - 1,
71
+ sampled_nframes).round().astype(int)
72
+ frame_inds = frame_inds.tolist()
73
+ sampled_frame_ids = []
74
+ temporal_merge_kernel_size = self.media_proc_cfg[
75
+ "temporal_merge_kernel_size"]
76
+ num_chunks = 0
77
+ chunk_timestamp = []
78
+ for i in range(0, len(frame_inds), temporal_merge_kernel_size):
79
+ sampled_frame_ids.extend(frame_inds[i:i +
80
+ temporal_merge_kernel_size])
81
+ start_time = frame_inds[i] / float(video_spec.fps)
82
+ timestamp_text = timestamp_as_str(
83
+ start_time, self.media_proc_cfg["timestamp_mode"])
84
+ chunk_timestamp.append(timestamp_text)
85
+ num_chunks += 1
86
+
87
+ sampled_frames = resampling(video_url, sampled_frame_ids)
88
+ chunks = []
89
+ for chunk_id in range(num_chunks):
90
+ chunk = sampled_frames[chunk_id *
91
+ temporal_merge_kernel_size:(chunk_id + 1) *
92
+ temporal_merge_kernel_size]
93
+ chunks.append(
94
+ VideoChunkInput(type="video_chunk",
95
+ video_chunk=chunk,
96
+ prompt=self.make_chunk_prompt(
97
+ chunk_timestamp[chunk_id])))
98
+ return chunks
99
+
100
+ def get_resize_config(self, media_input: MediaInput) -> dict:
101
+ if media_input['type'] == 'image':
102
+ w, h = media_input['image'].size
103
+ ret = navit_resize_image(
104
+ w, h, self.media_proc_cfg['patch_size'],
105
+ self.media_proc_cfg['merge_kernel_size'],
106
+ self.media_proc_cfg['in_patch_limit'],
107
+ self.media_proc_cfg['patch_limit_on_one_side'],
108
+ self.media_proc_cfg['fixed_output_tokens'])
109
+ return ret
110
+ elif media_input['type'] == 'video_chunk':
111
+ frame = media_input['video_chunk'][0]
112
+ width, height = frame.size
113
+ num_frames = len(media_input["video_chunk"])
114
+ fps = 1.0
115
+
116
+ sample_fps, max_num_frames_each_video = real_sample_fps_and_max_num_frames(
117
+ media_input["type"],
118
+ self.media_proc_cfg['sample_fps'],
119
+ self.media_proc_cfg['max_num_frames_each_video'],
120
+ )
121
+
122
+ in_patch_limit_each_frame = self.media_proc_cfg[
123
+ 'in_patch_limit_each_frame']
124
+ if in_patch_limit_each_frame is None:
125
+ in_patch_limit_each_frame = self.media_proc_cfg[
126
+ 'in_patch_limit']
127
+
128
+ ret = navit_resize_video(
129
+ width,
130
+ height,
131
+ num_frames,
132
+ fps,
133
+ sample_fps,
134
+ self.media_proc_cfg['patch_size'],
135
+ self.media_proc_cfg['merge_kernel_size'],
136
+ in_patch_limit_each_frame,
137
+ self.media_proc_cfg['patch_limit_on_one_side'],
138
+ self.media_proc_cfg['in_patch_limit_video'],
139
+ max_num_frames_each_video,
140
+ self.media_proc_cfg['fixed_output_tokens'],
141
+ )
142
+ return ret
143
+ else:
144
+ raise ValueError("Unsupported type: {}".format(
145
+ media_input['type']))
146
+
147
+ def resize_image(self, image: Image.Image, new_width: int, new_height: int,
148
+ pad_width: int, pad_height: int) -> np.ndarray:
149
+ image_np = image_to_np(image, (new_width, new_height), "resize")
150
+ image_np = np.pad(
151
+ image_np,
152
+ ((0, pad_height), (0, pad_width), (0, 0)),
153
+ mode="constant",
154
+ constant_values=0,
155
+ )
156
+ return image_np
157
+
158
+ def preprocess(
159
+ self,
160
+ medias: list[MediaInput],
161
+ return_tensors: Optional[Union[str, TensorType]] = None,
162
+ ) -> BatchFeature:
163
+ """
164
+ Preprocess a atom vision input (images/video_chunk) into model-ready tensors.
165
+
166
+ Args:
167
+ medias: List of MediaInput.
168
+ return_tensors: Desired output format ('pt', 'np', 'tf', or None).
169
+
170
+ Returns:
171
+ BatchFeature containing 'pixel_values' and 'grid_thws' tensors.
172
+ """
173
+ if not isinstance(medias, list):
174
+ medias = [medias]
175
+ if medias:
176
+ pixel_values = []
177
+ for item in medias:
178
+ item = ensure_media_type(item)
179
+ resize_config = self.get_resize_config(item)
180
+ new_width, new_height, pad_width, pad_height = resize_config[
181
+ 'new_width'], resize_config['new_height'], resize_config[
182
+ 'pad_width'], resize_config['pad_height']
183
+ if item['type'] == 'image':
184
+ image = item['image']
185
+ image_np = self.resize_image(image, new_width, new_height,
186
+ pad_width, pad_height)
187
+ pixel_values.append(np.expand_dims(image_np, axis=0))
188
+ elif item['type'] == 'video_chunk':
189
+ pixels = []
190
+ for frame in item['video_chunk']:
191
+ frame_np = self.resize_image(frame, new_width,
192
+ new_height, pad_width,
193
+ pad_height)
194
+ pixels.append(frame_np)
195
+ pixel_values.append(np.stack(pixels, axis=0))
196
+ else:
197
+ raise ValueError("Unsupported type: {}".format(
198
+ item['type']))
199
+ normalized_pixel_values = []
200
+ image_std_inv = 1.0 / np.array(self.media_proc_cfg['image_std'])
201
+ image_mean = np.array(self.media_proc_cfg['image_mean'])
202
+ for pixels in pixel_values:
203
+ pixels = normalize(pixels, image_mean, image_std_inv)
204
+ pixels_and_thw = navit_patchify(
205
+ pixels,
206
+ self.media_proc_cfg['patch_size'],
207
+ )
208
+ normalized_pixel_values.append(pixels_and_thw)
209
+
210
+ pixel_values = torch.cat([
211
+ _to_tensor(pixel_value['pixel_values'])
212
+ for pixel_value in normalized_pixel_values
213
+ ])
214
+ grid_thws = torch.cat([
215
+ _to_tensor(pixel_value['grid_thw'],
216
+ dtype=torch.int64).unsqueeze(0)
217
+ for pixel_value in normalized_pixel_values
218
+ ])
219
+
220
+ data = {
221
+ 'pixel_values': pixel_values,
222
+ 'grid_thws': grid_thws,
223
+ }
224
+
225
+ else:
226
+ data = {}
227
+
228
+ return BatchFeature(data=data, tensor_type=return_tensors)
229
+
230
+ def __repr__(self):
231
+ return f"KimiK25VisionProcessor(media_proc_cfg={self.media_proc_cfg})"
232
+
233
+ def to_dict(self) -> Dict[str, Any]:
234
+ output = super().to_dict()
235
+ output["media_proc_cfg"] = self.media_proc_cfg
236
+ if "media_processor" in output:
237
+ del output["media_processor"]
238
+ return output
239
+
240
+ @classmethod
241
+ def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
242
+ config = config_dict.copy()
243
+ media_proc_cfg = config.pop("media_proc_cfg", {})
244
+ return cls(media_proc_cfg=media_proc_cfg, **config, **kwargs)
245
+
246
+ def to_json_string(self):
247
+ dictionary = self.to_dict()
248
+ for key, value in dictionary.items():
249
+ if hasattr(value, 'tolist'):
250
+ dictionary[key] = value.tolist()
251
+ return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
media_utils.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import math
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from typing import List, Literal, Optional, TypedDict
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from pydantic import BaseModel, Field
11
+
12
+ try:
13
+ from mecord import VideoReader
14
+ except ImportError:
15
+ VideoReader = None
16
+
17
+
18
+ class VideoSpec(BaseModel):
19
+ media_type: str = Literal['video']
20
+ height: int = Field(..., gt=0, description="video frame height")
21
+ width: int = Field(..., gt=0, description="video frame width")
22
+ num_frames: int = Field(..., gt=0, description="num frames")
23
+ fps: float = Field(..., gt=0, description="average fps")
24
+
25
+ # optional, help to accelerate video reading
26
+ key_indices: list[int] = Field(None, description="key indices")
27
+ frame_time_info: dict = Field(None, description="frame time info")
28
+
29
+
30
+ class ImageInput(TypedDict):
31
+ type: Literal['image']
32
+ image: Image.Image
33
+
34
+
35
+ class VideoChunkInput(TypedDict):
36
+ type: Literal['video_chunk']
37
+ video_chunk: List[Image.Image]
38
+ prompt: Optional[str] = None
39
+
40
+
41
+ MediaInput = ImageInput | VideoChunkInput
42
+
43
+
44
+ def get_video_meta(video_src: bytes | str | os.PathLike,
45
+ accurate: bool = True) -> dict:
46
+ """Get the dimensions of a video."""
47
+ if isinstance(video_src, os.PathLike):
48
+ video_src = str(video_src)
49
+ # if b64 string, decode to bytes
50
+ if isinstance(video_src,
51
+ str) and video_src.startswith('data:video/mp4;base64,'):
52
+ video_src = base64.b64decode(video_src.split(',')[1])
53
+ video = VideoReader(video_src, auto_init=accurate, num_threads=1)
54
+ assert video.num_frames > 0, "Invalid video format."
55
+ assert video.original_width > 0 and video.original_height > 0, (
56
+ "Invalid video format.")
57
+ assert video.avg_fps > 0, "Invalid video format."
58
+ return VideoSpec(media_type='video',
59
+ height=video.original_height,
60
+ width=video.original_width,
61
+ num_frames=video.num_frames,
62
+ fps=video.avg_fps,
63
+ key_indices=video.key_indices,
64
+ frame_time_info=video.frame_time_info)
65
+
66
+
67
+ def timestamp_as_str(timestamp: float,
68
+ timestamp_mode: str = "hh:mm:ss.fff") -> str:
69
+ """Convert a timestamp to a string in the format of HH:MM:SS.mmm."""
70
+ if timestamp_mode == "hh:mm:ss.fff":
71
+ return (datetime.fromtimestamp(timestamp,
72
+ tz=timezone.utc).strftime("%H:%M:%S") +
73
+ f".{int((timestamp % 1) * 1000):03d}")
74
+ elif timestamp_mode == "mm:ss.fff":
75
+ return (datetime.fromtimestamp(timestamp,
76
+ tz=timezone.utc).strftime("%M:%S") +
77
+ f".{int((timestamp % 1) * 1000):03d}")
78
+ elif timestamp_mode == "mm:ss":
79
+ return datetime.fromtimestamp(timestamp,
80
+ tz=timezone.utc).strftime("%M:%S")
81
+ else:
82
+ raise ValueError(f"Invalid timestamp mode: {timestamp_mode}")
83
+
84
+
85
+ def navit_resize_image(
86
+ width: int,
87
+ height: int,
88
+ patch_size: int,
89
+ merge_kernel_size: int,
90
+ in_patch_limit: int,
91
+ patch_limit_on_one_side: int,
92
+ fixed_output_tokens: int | None,
93
+ ):
94
+ # Apply the patch limits.
95
+ s1 = math.sqrt(
96
+ in_patch_limit /
97
+ (max(1.0, width // patch_size) * max(1.0, height // patch_size)))
98
+ s2 = patch_limit_on_one_side * patch_size / width
99
+ s3 = patch_limit_on_one_side * patch_size / height
100
+ scale = min(1.0, s1, s2, s3)
101
+ new_w, new_h = max(1, int(width * scale)), max(1, int(height * scale))
102
+ new_w = min(new_w, patch_limit_on_one_side * patch_size)
103
+ new_h = min(new_h, patch_limit_on_one_side * patch_size)
104
+
105
+ # Calculate the padding to make the height and width divisible by the merge kernel size and patch size.
106
+ factor = merge_kernel_size * patch_size
107
+
108
+ pad_height = (factor - new_h % factor) % factor
109
+ pad_width = (factor - new_w % factor) % factor
110
+
111
+ if fixed_output_tokens is not None:
112
+ num_tokens = fixed_output_tokens
113
+ else:
114
+ # Calculate new dimensions after padding and patching
115
+ token_height = (new_h + pad_height) // factor
116
+ token_width = (new_w + pad_width) // factor
117
+
118
+ assert token_height * merge_kernel_size <= patch_limit_on_one_side, (
119
+ f"token_height {token_height} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
120
+ )
121
+ assert token_width * merge_kernel_size <= patch_limit_on_one_side, (
122
+ f"token_width {token_width} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
123
+ )
124
+
125
+ num_tokens = token_height * token_width
126
+ return {
127
+ "num_tokens": num_tokens,
128
+ "new_width": new_w,
129
+ "new_height": new_h,
130
+ "pad_width": pad_width,
131
+ "pad_height": pad_height,
132
+ "sampled_nframes": 1,
133
+ }
134
+
135
+
136
+ def navit_resize_video(
137
+ width: int,
138
+ height: int,
139
+ nframes: int,
140
+ avg_fps: float,
141
+ sample_fps: float,
142
+ patch_size: int,
143
+ merge_kernel_size: int,
144
+ in_patch_limit_each_frame: int,
145
+ patch_limit_on_one_side: int,
146
+ in_patch_limit_total: int | None,
147
+ max_num_frames_each_video: int | None,
148
+ fixed_output_tokens_each_frame: int | None,
149
+ ):
150
+ sample_fps = min(sample_fps, avg_fps)
151
+ # Calculate the number of frames to sample based on target FPS
152
+ sampled_nframes = max(round(nframes * sample_fps / avg_fps), 1)
153
+ if max_num_frames_each_video is not None:
154
+ sampled_nframes = min(sampled_nframes, max_num_frames_each_video)
155
+
156
+ if in_patch_limit_total is not None:
157
+ in_patch_limit_each_frame = min(
158
+ round(in_patch_limit_total / sampled_nframes),
159
+ in_patch_limit_each_frame)
160
+
161
+ ret = navit_resize_image(
162
+ width,
163
+ height,
164
+ patch_size,
165
+ merge_kernel_size,
166
+ in_patch_limit_each_frame,
167
+ patch_limit_on_one_side,
168
+ fixed_output_tokens_each_frame,
169
+ )
170
+ ret["sampled_nframes"] = sampled_nframes
171
+ return ret
172
+
173
+
174
+ def real_sample_fps_and_max_num_frames(
175
+ type_name: Literal["video", "video_chunk"],
176
+ sample_fps: float,
177
+ max_num_frames_each_video: int | None,
178
+ ) -> tuple[int, int | None]:
179
+ if type_name == "video":
180
+ return sample_fps, max_num_frames_each_video
181
+ elif type_name == "video_chunk":
182
+ max_num_frames_each_video = None
183
+ sample_fps = math.inf
184
+ return sample_fps, max_num_frames_each_video
185
+ else:
186
+ return math.inf, None
187
+
188
+
189
+ def _to_pil(data: str | bytes):
190
+ if isinstance(data, Image.Image):
191
+
192
+ return data.convert("RGB")
193
+ elif isinstance(data, str):
194
+ if data.startswith("data:"):
195
+ raw_base64 = data.split(",")[1]
196
+ return Image.open(io.BytesIO(
197
+ base64.b64decode(raw_base64))).convert("RGB")
198
+ else:
199
+ return Image.open(data).convert("RGB")
200
+ elif isinstance(data, bytes):
201
+ return Image.open(io.BytesIO(data)).convert("RGB")
202
+ else:
203
+ raise ValueError(f"Unsupported data type: {type(data)}")
204
+
205
+
206
+ def ensure_media_type(media: MediaInput) -> MediaInput:
207
+ if media['type'] == 'image':
208
+ media['image'] = _to_pil(media['image'])
209
+ return media
210
+ elif media['type'] == 'video_chunk':
211
+ media['video_chunk'] = [
212
+ _to_pil(frame) for frame in media['video_chunk']
213
+ ]
214
+ return media
215
+ else:
216
+ raise ValueError(f"Unsupported media type: {media['type']}")
217
+
218
+
219
+ def image_to_np(
220
+ image: Image.Image,
221
+ resize_to: tuple[int, int] | None = None,
222
+ mode: str = "resize",
223
+ raise_error_for_ill_resize: bool = True,
224
+ ) -> np.ndarray:
225
+ """Convert an image to a numpy array.
226
+
227
+ Args:
228
+ content: The image to convert.
229
+ resize_to: The size to resize the image to.
230
+ mode: The mode to resize the image to.
231
+ raise_error_for_ill_resize: Whether to raise an error for ill-sized resize.
232
+
233
+ Returns:
234
+ A numpy array.
235
+ """
236
+ assert isinstance(image, Image.Image), "image must be a PIL Image"
237
+ if resize_to is not None:
238
+ if mode == "resize":
239
+ image = image.resize(resize_to, resample=Image.Resampling.BICUBIC)
240
+
241
+ elif mode == "rescale_and_pad_to_center":
242
+ scale = min(resize_to[0] / image.width,
243
+ resize_to[1] / image.height, 1.0)
244
+ new_width = round(image.width * scale)
245
+ new_height = round(image.height * scale)
246
+ if new_width == 0 or new_height == 0:
247
+ if raise_error_for_ill_resize:
248
+ raise ValueError(
249
+ f"Invalid resize to: {resize_to}, from image size: {image.size}"
250
+ )
251
+ else:
252
+ return np.zeros((resize_to[1], resize_to[0], 3),
253
+ dtype=np.uint8)
254
+
255
+ image = image.resize((new_width, new_height),
256
+ resample=Image.Resampling.BICUBIC)
257
+ padding_left = (resize_to[0] - new_width) // 2
258
+ padding_right = resize_to[0] - new_width - padding_left
259
+ padding_top = (resize_to[1] - new_height) // 2
260
+ padding_bottom = resize_to[1] - new_height - padding_top
261
+ image = np.asarray(image)
262
+ image = np.pad(
263
+ image,
264
+ ((padding_top, padding_bottom), (padding_left, padding_right),
265
+ (0, 0)),
266
+ mode="constant",
267
+ constant_values=0,
268
+ )
269
+ assert image.shape == (resize_to[1], resize_to[0], 3)
270
+
271
+ elif mode == "rescale_and_pad_to_rightbottom":
272
+ scale = min(resize_to[0] / image.width,
273
+ resize_to[1] / image.height, 1.0)
274
+ new_width = round(image.width * scale)
275
+ new_height = round(image.height * scale)
276
+ if new_width == 0 or new_height == 0:
277
+ if raise_error_for_ill_resize:
278
+ raise ValueError(
279
+ f"Invalid resize to: {resize_to}, from image size: {image.size}"
280
+ )
281
+ else:
282
+ return np.zeros((resize_to[1], resize_to[0], 3),
283
+ dtype=np.uint8)
284
+
285
+ image = image.resize((new_width, new_height),
286
+ resample=Image.Resampling.BICUBIC)
287
+ padding_right = resize_to[0] - new_width
288
+ padding_bottom = resize_to[1] - new_height
289
+ image = np.asarray(image)
290
+ image = np.pad(
291
+ image,
292
+ ((0, padding_bottom), (0, padding_right), (0, 0)),
293
+ mode="constant",
294
+ constant_values=0,
295
+ )
296
+ assert image.shape == (resize_to[1], resize_to[0], 3)
297
+
298
+ else:
299
+ raise ValueError(f"Invalid mode: {mode}")
300
+
301
+ if isinstance(image, Image.Image):
302
+ return np.asarray(image)
303
+ else:
304
+ return image
305
+
306
+
307
+ def navit_patchify(pixel_values: np.ndarray,
308
+ patch_size: int) -> dict[str, np.ndarray]:
309
+ """Reshape the pixel values to a navit shape.
310
+
311
+ Args:
312
+ pixel_values: np.ndarray, shape (t, h, w, c)
313
+ patch_size: int
314
+
315
+ Returns:
316
+ dict[str, np.ndarray]
317
+ - patches: np.ndarray, shape (t * h//patch_size * w//patch_size, c, patch_size, patch_size)
318
+ - grid_thw: np.ndarray, (t, h//patch_size, w//patch_size)
319
+ """
320
+ T, H, W, C = pixel_values.shape
321
+ assert C == 3, "pixel_values must have 3 channels"
322
+
323
+ patches = pixel_values.reshape(T, H // patch_size, patch_size,
324
+ W // patch_size, patch_size, C)
325
+ # (T, H//patch_size, W//patch_size, C, patch_size, patch_size)
326
+ patches = patches.transpose(0, 1, 3, 5, 2, 4)
327
+ patches = patches.reshape(-1, C, patch_size, patch_size)
328
+ grid_thw = np.array([T, H // patch_size, W // patch_size])
329
+ return {"pixel_values": patches, "grid_thw": grid_thw}
330
+
331
+
332
+ def normalize(x: np.ndarray,
333
+ mean,
334
+ std_inv,
335
+ pixels_dtype: np.dtype = np.float32) -> np.ndarray:
336
+ """Normalize the image.
337
+
338
+ Args:
339
+ x: The image to normalize. The shape is (..., 3). The dtype is uint8. The range is [0, 255].
340
+ mean: The mean of the image.
341
+ std_inv: The inverse of the std of the image.
342
+ pixels_dtype: The dtype of the image.
343
+ Returns:
344
+ The normalized image. The shape is (..., 3). The dtype is determined by the pixels_dtype.
345
+ """
346
+ x = (x / 255.0).astype(pixels_dtype)
347
+ x -= mean
348
+ x *= std_inv
349
+ return x
350
+
351
+
352
+ def _to_tensor(data, **kwargs):
353
+ import torch
354
+
355
+ if isinstance(data, np.ndarray):
356
+ return torch.from_numpy(data).to(**kwargs)
357
+ elif isinstance(data, torch.Tensor):
358
+ return data.to(**kwargs)
359
+ elif isinstance(data, list):
360
+ return [_to_tensor(item, **kwargs) for item in data]
361
+ elif isinstance(data, tuple):
362
+ return tuple(_to_tensor(item, **kwargs) for item in data)
363
+ elif isinstance(data, dict):
364
+ return {k: _to_tensor(v, **kwargs) for k, v in data.items()}
365
+ elif data is None:
366
+ return None
367
+ else:
368
+ raise ValueError(f"Unsupported data type: {type(data)}")
model-00001-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e3c7c63e9844392d4f2814cc6bf792d0292192636a18a8b224fabaaa432914b
3
+ size 4350119341
model-00002-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8797a70a9267bff812e854c0367181fcce26608383732050dbbf1ba6fe1df9b
3
+ size 2466251202
model-00003-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be777eb2d16a2a09f5e3890c4ea81279a525d58de3930e6078ccad8fbc3787b3
3
+ size 4035315958
model-00004-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d12bcfff7558c1d58c394ab890964e63f49f6f31c78631428b2acb27a4b2e29
3
+ size 4932502382
model-00005-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62c8cd69b55516200c7f54b48e2c64e7e94d8c0618394fc62f85fee14e5e0ad
3
+ size 4035315950
model-00006-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230cb8c1b19b1e49740490440c7f98db622127454814ea7ab58f8a89575fd64a
3
+ size 4932502382
model-00007-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f297f5c2be2d9d4fe529ce11bcf483befda7934585681b7afc3b98e9193f4189
3
+ size 4035315934
model-00008-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4ad01004ac8e025c5fcccecfd69f4cf084299b11be61418011d64e57a1875a6
3
+ size 4932502384
model-00009-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72b9552a96a596564c6c159608fc8389fc8f4a3d14f774f87eeef7a198ca15e1
3
+ size 4035315938
model-00010-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c60fc9b15d727869163b25cdd00a095ec9c2d73c4204757fed56328e4ea38f91
3
+ size 4932502380
model-00011-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74599d6501b0a5830a6916ee1baf1d5ee9014ee6423408e8e8278a85edaf8e89
3
+ size 3330672874
model-00012-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda002283e914e0bb89a9396426b7a0b81cbb6016779f5f7788a89e033bbd689
3
+ size 4932502382
model-00013-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eab84b7ede5e1f782ca9b0b79a00f4a7d6b96bc2b86dbad7881001102c45c952
3
+ size 5092281030
model-00014-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35f7d3ead2e01942c21e017c48f511eea21c562b54a353ad8efc90c47b75e33
3
+ size 5092280968
model-00015-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:248de93ef2908d82a3135af66aad7d09da73941e04835eca6259bb1d90b3ed32
3
+ size 4932502380
model-00016-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:053cce7db0f2daf9791432fd747349ba1358353b8531fd9fdb72429c8a939399
3
+ size 5092281014
model-00017-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c55ec356edafc99b5bcbe7dd80bf36a9bc39fb0c828371809cceb600822b2a09
3
+ size 5092281024
model-00018-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab6df7b082ffd9466ea90c0c1a2bf0ab56af761b7026af25d75e8b77a16676bb
3
+ size 4932502390
model-00019-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbfaad78bd417a7ef80a610658a7f672e8b5f22b3a0997ab4b9374a8179ed1e5
3
+ size 3330672884
model-00020-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd782a2b838b86919584578c1d9e47ab4660c1228c1c3f7ce2668fa46ee45688
3
+ size 4932502386
model-00021-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3f702af3dab2158203d5a45f4aad9aca6374836c3b0762700c368198485fc60
3
+ size 5092281079
model-00022-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bc10882ee17da2438085eadec7257b0ee961cbabb309051323386c6234ccdad
3
+ size 5092281021
model-00023-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3c04107f12f5dab2c9af3abfc806dce829456c8c8be7e267088da76ccbecf3b
3
+ size 4932502392
model-00024-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12170615118b99cf83e85f6fe1fb4d1e0668e802f139d441e501e6262a610aa8
3
+ size 5092281121
model-00025-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc2146f0082dc30b44fb8a3e589d05fc0b193ec686e9fbec3212480bd116ac
3
+ size 5092281003
model-00026-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac542e6f5af3aa80367b61049684dfe760186f267cb8bef256b030aaade27879
3
+ size 4932502386
model-00027-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44a8605b29172e1cf2fe919e0274e6b443151da5ec2cc745ada373033a391746
3
+ size 3330672884
model-00028-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ece93112fa7e0ba58129f26492cb6f105088a28c5b94ad2a52c74ce9a1ba2f
3
+ size 4932502390
model-00029-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3e18c9c6382824be198f7273314a501067162a12c18fd4d042b519c009aa288
3
+ size 5092281117
model-00030-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb0b2f78d20ddb84ebecdf22b57f26c6bc8b11bedf7c0599a7869c76642f7a57
3
+ size 5092281035
model-00031-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a1bd462941b2c5c43beaf61be46d95e3302590aac46e4cef9e7ba9f69f13eaa
3
+ size 4932502384
model-00032-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7750dafd5e5f4b70595077ae2972239da73098da90f6d7238b0554190823bfb2
3
+ size 5092281103
model-00033-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adabcc076dcc760c3ad9e4a2938ad54c95cb6fb165d5da8bdf5afbc39cedb764
3
+ size 5092281065
model-00034-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a178d90e2bb6147ab946aa0016b1fc7944e11fba89ec1e55281351bfb7044c76
3
+ size 4932502390
model-00035-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a22910cf76a0f091b95fdcfaa1b9ce7fa8c0cb6ca72e1a64e77436f39430baf
3
+ size 3330672900
model-00036-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d3892ec1ad33fc4e6446bd554bbf0114f0d3843972d74fd5a8fee02d77f188e
3
+ size 4932502390
model-00037-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb11d0f23514a25aa6b7abae026a2731b670d32c3da70f71d40c5cda086f9b56
3
+ size 5092281089
model-00038-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cfde028b485f485615524e9def1122b0f3dec22f1746946882923fda084e5e7
3
+ size 5092280989
model-00039-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfea827a601728387af6c520a935965825d6bca5e998ae95502a9011f9104d55
3
+ size 4932502384
model-00040-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d856bdb522b6a337388aef5a064e6df874cf0cb30765ed8e8615c3337f5846a5
3
+ size 5092281123
model-00041-of-00099.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9129ff8a6c5213cc752353a3e0e9dde46e8bab4eb172c8261bac7bdf6c822f9
3
+ size 5092281019