Abdelkareem committed on
Commit
4a8c0a7
·
verified ·
1 Parent(s): c843f35

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 1_SpladePooling/config.json +5 -0
  2. README.md +71 -0
  3. checkpoint-1000/1_SpladePooling/config.json +5 -0
  4. checkpoint-1000/README.md +363 -0
  5. checkpoint-1000/config.json +28 -0
  6. checkpoint-1000/config_sentence_transformers.json +14 -0
  7. checkpoint-1000/model.safetensors +3 -0
  8. checkpoint-1000/modules.json +14 -0
  9. checkpoint-1000/rng_state.pth +3 -0
  10. checkpoint-1000/sentence_bert_config.json +10 -0
  11. checkpoint-1000/tokenizer.json +0 -0
  12. checkpoint-1000/tokenizer_config.json +14 -0
  13. checkpoint-1000/trainer_state.json +54 -0
  14. checkpoint-1500/1_SpladePooling/config.json +5 -0
  15. checkpoint-1500/README.md +364 -0
  16. checkpoint-1500/config.json +28 -0
  17. checkpoint-1500/config_sentence_transformers.json +14 -0
  18. checkpoint-1500/model.safetensors +3 -0
  19. checkpoint-1500/modules.json +14 -0
  20. checkpoint-1500/rng_state.pth +3 -0
  21. checkpoint-1500/sentence_bert_config.json +10 -0
  22. checkpoint-1500/tokenizer.json +0 -0
  23. checkpoint-1500/tokenizer_config.json +14 -0
  24. checkpoint-1500/trainer_state.json +64 -0
  25. checkpoint-1634/1_SpladePooling/config.json +5 -0
  26. checkpoint-1634/README.md +364 -0
  27. checkpoint-1634/config.json +28 -0
  28. checkpoint-1634/config_sentence_transformers.json +14 -0
  29. checkpoint-1634/model.safetensors +3 -0
  30. checkpoint-1634/modules.json +14 -0
  31. checkpoint-1634/rng_state.pth +3 -0
  32. checkpoint-1634/sentence_bert_config.json +10 -0
  33. checkpoint-1634/tokenizer.json +0 -0
  34. checkpoint-1634/tokenizer_config.json +14 -0
  35. checkpoint-1634/trainer_state.json +64 -0
  36. checkpoint-500/1_SpladePooling/config.json +5 -0
  37. checkpoint-500/README.md +362 -0
  38. checkpoint-500/config.json +28 -0
  39. checkpoint-500/config_sentence_transformers.json +14 -0
  40. checkpoint-500/model.safetensors +3 -0
  41. checkpoint-500/modules.json +14 -0
  42. checkpoint-500/rng_state.pth +3 -0
  43. checkpoint-500/sentence_bert_config.json +10 -0
  44. checkpoint-500/tokenizer.json +0 -0
  45. checkpoint-500/tokenizer_config.json +14 -0
  46. checkpoint-500/trainer_state.json +44 -0
  47. config.json +28 -0
  48. config_sentence_transformers.json +14 -0
  49. model.safetensors +3 -0
  50. modules.json +14 -0
1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "pooling_strategy": "max",
3
+ "activation_function": "relu",
4
+ "embedding_dimension": 119547
5
+ }
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: ar
3
+ license: apache-2.0
4
+ library_name: sentence-transformers
5
+ tags:
6
+ - sentence-transformers
7
+ - sparse-encoder
8
+ - splade
9
+ - arabic
10
+ - retrieval
11
+ datasets:
12
+ - oddadmix/arabic-triplets-large
13
+ base_model: distilbert/distilbert-base-multilingual-cased
14
+ metrics:
15
+ - ndcg@10
16
+ - mrr@10
17
+ ---
18
+
19
+ # Arabic SPLADE — Phase 3
20
+
21
+ Efficient symmetric SPLADE using DistilBERT multilingual for faster inference.
22
+
23
+ ## Architecture
24
+
25
+ Symmetric: a single encoder shared by queries and documents (`MLMTransformer` followed by `SpladePooling`, applied sequentially).
26
+
27
+ **Base model:** distilbert-base-multilingual-cased
28
+
29
+ ## Training
30
+
31
+ - **Dataset:** `oddadmix/arabic-triplets-large` (104K triplets, 92K unique passages)
32
+ - **Loss:** `SpladeLoss(SparseMultipleNegativesRankingLoss, q_reg=5e-5, d_reg=3e-5)`
33
+ - **Batch:** 16 per GPU, grad accum 4
34
+ - **Learning rate:** 2e-5
35
+ - **Epochs:** 1
36
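+ - **AMP:** fp16 (note: the trainer-generated checkpoint cards log `fp16: False` and `bf16: False` — verify which setting was actually used)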
37
+ - **Sampler:** NO_DUPLICATES
38
+
39
+ ## Evaluation on Arabic NanoBEIR (13 datasets)
40
+
41
+ | Metric | Score |
42
+ |--------|-------|
43
+ | NDCG@10 | 0.2528 |
44
+ | MRR@10 | 0.3052 |
45
+
46
+ For reference: BM25 scores 0.3824 NDCG@10, 0.4483 MRR@10 on the same benchmark.
47
+
48
+ ## Training Details
49
+
50
+ The base model is DistilBERT multilingual (6 layers, 119K-token vocabulary), which is roughly 2× faster at inference than AraBERT.
51
+
52
+ ### Hardware
53
+ - 2× NVIDIA TITAN RTX (23.5 GB each)
54
+ - DDP via `torchrun`
55
+
56
+ ## Usage
57
+
58
+ ```python
59
+ from sentence_transformers.sparse_encoder import SparseEncoder
60
+
61
+ model = SparseEncoder("Abdelkareem/arabic-splade-efficient")
62
+ embeddings = model.encode([
63
+ "ما هي عاصمة مصر؟",
64
+ "القاهرة هي عاصمة مصر وأكبر مدنها.",
65
+ ])
66
+ print(embeddings.shape)
67
+ # Decode top tokens
68
+ decoded = model.decode(embeddings, top_k=10)
69
+ for d in decoded:
70
+ print(d)
71
+ ```
checkpoint-1000/1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "pooling_strategy": "max",
3
+ "activation_function": "relu",
4
+ "embedding_dimension": 119547
5
+ }
checkpoint-1000/README.md ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - sparse-encoder
5
+ - sparse
6
+ - splade
7
+ - generated_from_trainer
8
+ - dataset_size:104550
9
+ - loss:SpladeLoss
10
+ - loss:SparseMultipleNegativesRankingLoss
11
+ - loss:FlopsLoss
12
+ base_model: distilbert/distilbert-base-multilingual-cased
13
+ widget:
14
+ - text: يُعتبر التأثير الأوروبي على الثقافة اليابانية في القرن التاسع عشر أمرًا هامًا
15
+ في فهم تاريخ البلاد.
16
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يعكس التغيرات\
17
+ \ الاجتماعية والثقافية التي حدثت عبر العصور."
18
+ - text: لا أعتقد أن هناك أي تأثير لصالح المصممة الداخلية الإيطالية إيلينا فرونتزي
19
+ على هذا النوع من التصاميم.
20
+ - text: كيف يؤثر النقد الأدبي على التفاعل الاجتماعي؟
21
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل\
22
+ \ وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي."
23
+ datasets:
24
+ - oddadmix/arabic-triplets-large
25
+ pipeline_tag: feature-extraction
26
+ library_name: sentence-transformers
27
+ ---
28
+
29
+ # SPLADE Sparse Encoder
30
+
31
+ This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) on the [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) dataset using the [sentence-transformers](https://www.SBERT.net) library. It maps sentences & paragraphs to a 119547-dimensional sparse vector space and can be used for semantic search and sparse retrieval.
32
+ ## Model Details
33
+
34
+ ### Model Description
35
+ - **Model Type:** SPLADE Sparse Encoder
36
+ - **Base model:** [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) <!-- at revision 45c032ab32cc946ad88a166f7cb282f58c753c2e -->
37
+ - **Maximum Sequence Length:** 512 tokens
38
+ - **Output Dimensionality:** 119547 dimensions
39
+ - **Similarity Function:** Dot Product
40
+ - **Supported Modality:** Text
41
+ - **Training Dataset:**
42
+ - [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large)
43
+ <!-- - **Language:** Unknown -->
44
+ <!-- - **License:** Unknown -->
45
+
46
+ ### Model Sources
47
+
48
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
49
+ - **Documentation:** [Sparse Encoder Documentation](https://www.sbert.net/docs/sparse_encoder/usage/usage.html)
50
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
51
+ - **Hugging Face:** [Sparse Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=sparse-encoder)
52
+
53
+ ### Full Model Architecture
54
+
55
+ ```
56
+ SparseEncoder(
57
+ (0): Transformer({'transformer_task': 'fill-mask', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'token_embeddings', 'architecture': 'DistilBertForMaskedLM'})
58
+ (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'embedding_dimension': 119547})
59
+ )
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ ### Direct Usage (Sentence Transformers)
65
+
66
+ First install the Sentence Transformers library:
67
+
68
+ ```bash
69
+ pip install -U sentence-transformers
70
+ ```
71
+ Then you can load this model and run inference.
72
+ ```python
73
+ from sentence_transformers import SparseEncoder
74
+
75
+ # Download from the 🤗 Hub
76
+ model = SparseEncoder("sparse_encoder_model_id")
77
+ # Run inference
78
+ sentences = [
79
+ 'ما هي أهمية النقد الأدبي في فهم التاريخ الثقافي؟',
80
+ '\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي.',
81
+ 'تُعتبر اللغة العربية لغة قديمة ومتنوعة، وتمثل جزءًا هامًا من تراث البشرية.',
82
+ ]
83
+ embeddings = model.encode(sentences)
84
+ print(embeddings.shape)
85
+ # [3, 119547]
86
+
87
+ # Get the similarity scores for the embeddings
88
+ similarities = model.similarity(embeddings, embeddings)
89
+ print(similarities)
90
+ # tensor([[30.0560, 33.2675, 8.7584],
91
+ # [33.2675, 72.6030, 13.4705],
92
+ # [ 8.7584, 13.4704, 75.8345]])
93
+ ```
94
+ <!--
95
+ ### Direct Usage (Transformers)
96
+
97
+ <details><summary>Click to see the direct usage in Transformers</summary>
98
+
99
+ </details>
100
+ -->
101
+
102
+ <!--
103
+ ### Downstream Usage (Sentence Transformers)
104
+
105
+ You can finetune this model on your own dataset.
106
+
107
+ <details><summary>Click to expand</summary>
108
+
109
+ </details>
110
+ -->
111
+
112
+ <!--
113
+ ### Out-of-Scope Use
114
+
115
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
116
+ -->
117
+
118
+ <!--
119
+ ## Bias, Risks and Limitations
120
+
121
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
122
+ -->
123
+
124
+ <!--
125
+ ### Recommendations
126
+
127
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
128
+ -->
129
+
130
+ ## Training Details
131
+
132
+ ### Training Dataset
133
+
134
+ #### arabic-triplets-large
135
+
136
+ * Dataset: [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) at [fa99ede](https://huggingface.co/datasets/oddadmix/arabic-triplets-large/tree/fa99ede10602ff5cffb7591ff1f25289414c4b13)
137
+ * Size: 104,550 training samples
138
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
139
+ * Approximate statistics based on the first 100 samples:
140
+ | | anchor | positive | negative |
141
+ |:---------|:----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
142
+ | type | string | string | string |
143
+ | modality | text | text | text |
144
+ | details | <ul><li>min: 12 tokens</li><li>mean: 18.6 tokens</li><li>max: 33 tokens</li></ul> | <ul><li>min: 26 tokens</li><li>mean: 57.26 tokens</li><li>max: 142 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 47.38 tokens</li><li>max: 120 tokens</li></ul> |
145
+ * Samples:
146
+ | anchor | positive | negative |
147
+ |:----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
148
+ | <code>ما هي أهمية التلال والمناطق الجبلية في البيئة؟</code> | <code><br><br> تعتبر التلال والمناطق الجبلية من أهم عناصر البيئة التي تؤثر بشكل كبير على توازن النظام الإيكولوجي.</code> | <code>يعتبر النشاط السياسي في البلدان الصغيرة من الأمور التي تتطلب إدارة شاملة ومتكاملة.</code> |
149
+ | <code>كيف تؤثر التلال على الرياح والهطول المطر؟</code> | <code><br><br> يؤثر التلال على الرياح والهطول المطر من خلال تأثيرهم على توزيع الضغط الجوي، مما يؤدي إلى تغييرات في اتجاه وسرعة الرياح وتواتر الهطول.</code> | <code>إنّ الأنشطة الزراعية في المناطق الجبلية تعتبر من أهمّ العوامل التي تساهم في تحسين جودة الحياة في هذه المناطق، ولكنها لا تؤثر بشكل مباشر على الرياح والهطول المطر.</code> |
150
+ | <code>ما هي أنواع التلال المختلفة؟ (جبال، هضاب، منحدرات)</code> | <code><br><br> هناك ثلاثة أنواع رئيسية للتلاءم هي الجبال، الهضاب، والمنحدرات.</code> | <code>الإدارة البيئية تعتبر من الأنشطة التي لها تأثير كبير على البيئة.</code> |
151
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
152
+ ```json
153
+ {
154
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False, directions=('query_to_doc',), partition_mode='joint', hardness_mode=None, hardness_strength=0.0)",
155
+ "document_regularizer_weight": 3e-05,
156
+ "query_regularizer_weight": 5e-05
157
+ }
158
+ ```
159
+
160
+ ### Training Hyperparameters
161
+ #### Non-Default Hyperparameters
162
+
163
+ - `per_device_train_batch_size`: 16
164
+ - `num_train_epochs`: 1
165
+ - `learning_rate`: 2e-05
166
+ - `gradient_accumulation_steps`: 4
167
+ - `batch_sampler`: no_duplicates
168
+
169
+ #### All Hyperparameters
170
+ <details><summary>Click to expand</summary>
171
+
172
+ - `per_device_train_batch_size`: 16
173
+ - `num_train_epochs`: 1
174
+ - `max_steps`: -1
175
+ - `learning_rate`: 2e-05
176
+ - `lr_scheduler_type`: linear
177
+ - `lr_scheduler_kwargs`: None
178
+ - `warmup_steps`: 0
179
+ - `optim`: adamw_torch_fused
180
+ - `optim_args`: None
181
+ - `weight_decay`: 0.0
182
+ - `adam_beta1`: 0.9
183
+ - `adam_beta2`: 0.999
184
+ - `adam_epsilon`: 1e-08
185
+ - `optim_target_modules`: None
186
+ - `gradient_accumulation_steps`: 4
187
+ - `average_tokens_across_devices`: True
188
+ - `max_grad_norm`: 1.0
189
+ - `label_smoothing_factor`: 0.0
190
+ - `bf16`: False
191
+ - `fp16`: False
192
+ - `bf16_full_eval`: False
193
+ - `fp16_full_eval`: False
194
+ - `tf32`: None
195
+ - `gradient_checkpointing`: False
196
+ - `gradient_checkpointing_kwargs`: None
197
+ - `torch_compile`: False
198
+ - `torch_compile_backend`: None
199
+ - `torch_compile_mode`: None
200
+ - `use_liger_kernel`: False
201
+ - `liger_kernel_config`: None
202
+ - `use_cache`: False
203
+ - `neftune_noise_alpha`: None
204
+ - `torch_empty_cache_steps`: None
205
+ - `auto_find_batch_size`: False
206
+ - `log_on_each_node`: True
207
+ - `logging_nan_inf_filter`: True
208
+ - `include_num_input_tokens_seen`: no
209
+ - `log_level`: passive
210
+ - `log_level_replica`: warning
211
+ - `disable_tqdm`: False
212
+ - `project`: huggingface
213
+ - `trackio_space_id`: trackio
214
+ - `per_device_eval_batch_size`: 8
215
+ - `prediction_loss_only`: True
216
+ - `eval_on_start`: False
217
+ - `eval_do_concat_batches`: True
218
+ - `eval_use_gather_object`: False
219
+ - `eval_accumulation_steps`: None
220
+ - `include_for_metrics`: []
221
+ - `batch_eval_metrics`: False
222
+ - `save_only_model`: False
223
+ - `save_on_each_node`: False
224
+ - `enable_jit_checkpoint`: False
225
+ - `push_to_hub`: False
226
+ - `hub_private_repo`: None
227
+ - `hub_model_id`: None
228
+ - `hub_strategy`: every_save
229
+ - `hub_always_push`: False
230
+ - `hub_revision`: None
231
+ - `load_best_model_at_end`: False
232
+ - `ignore_data_skip`: False
233
+ - `restore_callback_states_from_checkpoint`: False
234
+ - `full_determinism`: False
235
+ - `seed`: 42
236
+ - `data_seed`: None
237
+ - `use_cpu`: False
238
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
239
+ - `parallelism_config`: None
240
+ - `dataloader_drop_last`: False
241
+ - `dataloader_num_workers`: 0
242
+ - `dataloader_pin_memory`: True
243
+ - `dataloader_persistent_workers`: False
244
+ - `dataloader_prefetch_factor`: None
245
+ - `remove_unused_columns`: True
246
+ - `label_names`: None
247
+ - `train_sampling_strategy`: random
248
+ - `length_column_name`: length
249
+ - `ddp_find_unused_parameters`: None
250
+ - `ddp_bucket_cap_mb`: None
251
+ - `ddp_broadcast_buffers`: False
252
+ - `ddp_backend`: None
253
+ - `ddp_timeout`: 1800
254
+ - `fsdp`: []
255
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
256
+ - `deepspeed`: None
257
+ - `debug`: []
258
+ - `skip_memory_metrics`: True
259
+ - `do_predict`: False
260
+ - `resume_from_checkpoint`: None
261
+ - `warmup_ratio`: None
262
+ - `local_rank`: -1
263
+ - `prompts`: None
264
+ - `batch_sampler`: no_duplicates
265
+ - `multi_dataset_batch_sampler`: proportional
266
+ - `router_mapping`: {}
267
+ - `learning_rate_mapping`: {}
268
+
269
+ </details>
270
+
271
+ ### Training Logs
272
+ | Epoch | Step | Training Loss |
273
+ |:------:|:----:|:-------------:|
274
+ | 0.3060 | 500 | 7.9967 |
275
+ | 0.6121 | 1000 | 0.0165 |
276
+
277
+
278
+ ### Training Time
279
+ - **Training**: 16.7 minutes
280
+
281
+ ### Framework Versions
282
+ - Python: 3.12.3
283
+ - Sentence Transformers: 5.5.1
284
+ - Transformers: 5.5.0
285
+ - PyTorch: 2.12.0+cu130
286
+ - Accelerate: 1.14.0
287
+ - Datasets: 4.3.0
288
+ - Tokenizers: 0.22.2
289
+
290
+ ## Additional Resources
291
+
292
+ - [Training and Finetuning Sparse Embedding Models with Sentence Transformers](https://huggingface.co/blog/train-sparse-encoder): the end-to-end guide for training or finetuning SPLADE and other sparse encoder models.
293
+
294
+ ## Citation
295
+
296
+ ### BibTeX
297
+
298
+ #### Sentence Transformers
299
+ ```bibtex
300
+ @inproceedings{reimers-2019-sentence-bert,
301
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
302
+ author = "Reimers, Nils and Gurevych, Iryna",
303
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
304
+ month = "11",
305
+ year = "2019",
306
+ publisher = "Association for Computational Linguistics",
307
+ url = "https://arxiv.org/abs/1908.10084",
308
+ }
309
+ ```
310
+
311
+ #### SpladeLoss
312
+ ```bibtex
313
+ @misc{formal2022distillationhardnegativesampling,
314
+ title={From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective},
315
+ author={Thibault Formal and Carlos Lassance and Benjamin Piwowarski and Stéphane Clinchant},
316
+ year={2022},
317
+ eprint={2205.04733},
318
+ archivePrefix={arXiv},
319
+ primaryClass={cs.IR},
320
+ url={https://arxiv.org/abs/2205.04733},
321
+ }
322
+ ```
323
+
324
+ #### SparseMultipleNegativesRankingLoss
325
+ ```bibtex
326
+ @misc{oord2019representationlearningcontrastivepredictive,
327
+ title={Representation Learning with Contrastive Predictive Coding},
328
+ author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
329
+ year={2019},
330
+ eprint={1807.03748},
331
+ archivePrefix={arXiv},
332
+ primaryClass={cs.LG},
333
+ url={https://arxiv.org/abs/1807.03748},
334
+ }
335
+ ```
336
+
337
+ #### FlopsLoss
338
+ ```bibtex
339
+ @article{paria2020minimizing,
340
+ title={Minimizing flops to learn efficient sparse representations},
341
+ author={Paria, Biswajit and Yeh, Chih-Kuan and Yen, Ian EH and Xu, Ning and Ravikumar, Pradeep and P{\'o}czos, Barnab{\'a}s},
342
+ journal={arXiv preprint arXiv:2004.05665},
343
+ year={2020}
344
+ }
345
+ ```
346
+
347
+ <!--
348
+ ## Glossary
349
+
350
+ *Clearly define terms in order to be accessible across audiences.*
351
+ -->
352
+
353
+ <!--
354
+ ## Model Card Authors
355
+
356
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
357
+ -->
358
+
359
+ <!--
360
+ ## Model Card Contact
361
+
362
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
363
+ -->
checkpoint-1000/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": null,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_dim": 3072,
13
+ "initializer_range": 0.02,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "distilbert",
16
+ "n_heads": 12,
17
+ "n_layers": 6,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": false,
23
+ "tie_weights_": true,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.5.0",
26
+ "use_cache": false,
27
+ "vocab_size": 119547
28
+ }
checkpoint-1000/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.12.0+cu130",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SparseEncoder",
9
+ "prompts": {
10
+ "document": "",
11
+ "query": ""
12
+ },
13
+ "similarity_fn_name": "dot"
14
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f10e804a1ba5f8a240061be63903e378cb8788c4780db15d03f63e470fa57723
3
+ size 541795684
checkpoint-1000/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.modules.mlm_transformer.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.modules.splade_pooling.SpladePooling"
13
+ }
14
+ ]
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb0bde69327634c3c236128934dbae6563b5f2902463b49d5c460f69d8bddda
3
+ size 14645
checkpoint-1000/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "fill-mask",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.612088752869166,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "base_loss": 7.996,
14
+ "document_regularizer_loss": 0.0003,
15
+ "epoch": 0.306044376434583,
16
+ "grad_norm": 0.008376441895961761,
17
+ "learning_rate": 1.3892288861689107e-05,
18
+ "loss": 7.99669580078125,
19
+ "query_regularizer_loss": 0.0004,
20
+ "step": 500
21
+ },
22
+ {
23
+ "base_loss": 0.0149,
24
+ "document_regularizer_loss": 0.0007,
25
+ "epoch": 0.612088752869166,
26
+ "grad_norm": 0.10282868891954422,
27
+ "learning_rate": 7.77233782129743e-06,
28
+ "loss": 0.016533720016479493,
29
+ "query_regularizer_loss": 0.0009,
30
+ "step": 1000
31
+ }
32
+ ],
33
+ "logging_steps": 500,
34
+ "max_steps": 1634,
35
+ "num_input_tokens_seen": 0,
36
+ "num_train_epochs": 1,
37
+ "save_steps": 500,
38
+ "stateful_callbacks": {
39
+ "TrainerControl": {
40
+ "args": {
41
+ "should_epoch_stop": false,
42
+ "should_evaluate": false,
43
+ "should_log": false,
44
+ "should_save": true,
45
+ "should_training_stop": false
46
+ },
47
+ "attributes": {}
48
+ }
49
+ },
50
+ "total_flos": 0.0,
51
+ "train_batch_size": 16,
52
+ "trial_name": null,
53
+ "trial_params": null
54
+ }
checkpoint-1500/1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "pooling_strategy": "max",
3
+ "activation_function": "relu",
4
+ "embedding_dimension": 119547
5
+ }
checkpoint-1500/README.md ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - sparse-encoder
5
+ - sparse
6
+ - splade
7
+ - generated_from_trainer
8
+ - dataset_size:104550
9
+ - loss:SpladeLoss
10
+ - loss:SparseMultipleNegativesRankingLoss
11
+ - loss:FlopsLoss
12
+ base_model: distilbert/distilbert-base-multilingual-cased
13
+ widget:
14
+ - text: يُعتبر التأثير الأوروبي على الثقافة اليابانية في القرن التاسع عشر أمرًا هامًا
15
+ في فهم تاريخ البلاد.
16
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يعكس التغيرات\
17
+ \ الاجتماعية والثقافية التي حدثت عبر العصور."
18
+ - text: لا أعتقد أن هناك أي تأثير لصالح المصممة الداخلية الإيطالية إيلينا فرونتزي
19
+ على هذا النوع من التصاميم.
20
+ - text: كيف يؤثر النقد الأدبي على التفاعل الاجتماعي؟
21
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل\
22
+ \ وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي."
23
+ datasets:
24
+ - oddadmix/arabic-triplets-large
25
+ pipeline_tag: feature-extraction
26
+ library_name: sentence-transformers
27
+ ---
28
+
29
+ # SPLADE Sparse Encoder
30
+
31
+ This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) on the [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) dataset using the [sentence-transformers](https://www.SBERT.net) library. It maps sentences & paragraphs to a 119547-dimensional sparse vector space and can be used for semantic search and sparse retrieval.
32
+ ## Model Details
33
+
34
+ ### Model Description
35
+ - **Model Type:** SPLADE Sparse Encoder
36
+ - **Base model:** [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) <!-- at revision 45c032ab32cc946ad88a166f7cb282f58c753c2e -->
37
+ - **Maximum Sequence Length:** 512 tokens
38
+ - **Output Dimensionality:** 119547 dimensions
39
+ - **Similarity Function:** Dot Product
40
+ - **Supported Modality:** Text
41
+ - **Training Dataset:**
42
+ - [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large)
43
+ <!-- - **Language:** Unknown -->
44
+ <!-- - **License:** Unknown -->
45
+
46
+ ### Model Sources
47
+
48
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
49
+ - **Documentation:** [Sparse Encoder Documentation](https://www.sbert.net/docs/sparse_encoder/usage/usage.html)
50
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
51
+ - **Hugging Face:** [Sparse Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=sparse-encoder)
52
+
53
+ ### Full Model Architecture
54
+
55
+ ```
56
+ SparseEncoder(
57
+ (0): Transformer({'transformer_task': 'fill-mask', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'token_embeddings', 'architecture': 'DistilBertForMaskedLM'})
58
+ (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'embedding_dimension': 119547})
59
+ )
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ ### Direct Usage (Sentence Transformers)
65
+
66
+ First install the Sentence Transformers library:
67
+
68
+ ```bash
69
+ pip install -U sentence-transformers
70
+ ```
71
+ Then you can load this model and run inference.
72
+ ```python
73
+ from sentence_transformers import SparseEncoder
74
+
75
+ # Download from the 🤗 Hub
76
+ model = SparseEncoder("sparse_encoder_model_id")
77
+ # Run inference
78
+ sentences = [
79
+ 'ما هي أهمية النقد الأدبي في فهم التاريخ الثقافي؟',
80
+ '\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي.',
81
+ 'تُعتبر اللغة العربية لغة قديمة ومتنوعة، وتمثل جزءًا هامًا من تراث البشرية.',
82
+ ]
83
+ embeddings = model.encode(sentences)
84
+ print(embeddings.shape)
85
+ # [3, 119547]
86
+
87
+ # Get the similarity scores for the embeddings
88
+ similarities = model.similarity(embeddings, embeddings)
89
+ print(similarities)
90
+ # tensor([[38.0133, 42.8860, 6.6607],
91
+ # [42.8860, 93.9162, 12.8581],
92
+ # [ 6.6607, 12.8581, 96.7075]])
93
+ ```
94
+ <!--
95
+ ### Direct Usage (Transformers)
96
+
97
+ <details><summary>Click to see the direct usage in Transformers</summary>
98
+
99
+ </details>
100
+ -->
101
+
102
+ <!--
103
+ ### Downstream Usage (Sentence Transformers)
104
+
105
+ You can finetune this model on your own dataset.
106
+
107
+ <details><summary>Click to expand</summary>
108
+
109
+ </details>
110
+ -->
111
+
112
+ <!--
113
+ ### Out-of-Scope Use
114
+
115
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
116
+ -->
117
+
118
+ <!--
119
+ ## Bias, Risks and Limitations
120
+
121
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
122
+ -->
123
+
124
+ <!--
125
+ ### Recommendations
126
+
127
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
128
+ -->
129
+
130
+ ## Training Details
131
+
132
+ ### Training Dataset
133
+
134
+ #### arabic-triplets-large
135
+
136
+ * Dataset: [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) at [fa99ede](https://huggingface.co/datasets/oddadmix/arabic-triplets-large/tree/fa99ede10602ff5cffb7591ff1f25289414c4b13)
137
+ * Size: 104,550 training samples
138
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
139
+ * Approximate statistics based on the first 100 samples:
140
+ | | anchor | positive | negative |
141
+ |:---------|:----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
142
+ | type | string | string | string |
143
+ | modality | text | text | text |
144
+ | details | <ul><li>min: 12 tokens</li><li>mean: 18.6 tokens</li><li>max: 33 tokens</li></ul> | <ul><li>min: 26 tokens</li><li>mean: 57.26 tokens</li><li>max: 142 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 47.38 tokens</li><li>max: 120 tokens</li></ul> |
145
+ * Samples:
146
+ | anchor | positive | negative |
147
+ |:----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
148
+ | <code>ما هي أهمية التلال والمناطق الجبلية في البيئة؟</code> | <code><br><br> تعتبر التلال والمناطق الجبلية من أهم عناصر البيئة التي تؤثر بشكل كبير على توازن النظام الإيكولوجي.</code> | <code>يعتبر النشاط السياسي في البلدان الصغيرة من الأمور التي تتطلب إدارة شاملة ومتكاملة.</code> |
149
+ | <code>كيف تؤثر التلال على الرياح والهطول المطر؟</code> | <code><br><br> يؤثر التلال على الرياح والهطول المطر من خلال تأثيرهم على توزيع الضغط الجوي، مما يؤدي إلى تغييرات في اتجاه وسرعة الرياح وتواتر الهطول.</code> | <code>إنّ الأنشطة الزراعية في المناطق الجبلية تعتبر من أهمّ العوامل التي تساهم في تحسين جودة الحياة في هذه المناطق، ولكنها لا تؤثر بشكل مباشر على الرياح والهطول المطر.</code> |
150
+ | <code>ما هي أنواع التلال المختلفة؟ (جبال، هضاب، منحدرات)</code> | <code><br><br> هناك ثلاثة أنواع رئيسية للتلاءم هي الجبال، الهضاب، والمنحدرات.</code> | <code>الإدارة البيئية تعتبر من الأنشطة التي لها تأثير كبير على البيئة.</code> |
151
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
152
+ ```json
153
+ {
154
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False, directions=('query_to_doc',), partition_mode='joint', hardness_mode=None, hardness_strength=0.0)",
155
+ "document_regularizer_weight": 3e-05,
156
+ "query_regularizer_weight": 5e-05
157
+ }
158
+ ```
159
+
160
+ ### Training Hyperparameters
161
+ #### Non-Default Hyperparameters
162
+
163
+ - `per_device_train_batch_size`: 16
164
+ - `num_train_epochs`: 1
165
+ - `learning_rate`: 2e-05
166
+ - `gradient_accumulation_steps`: 4
167
+ - `batch_sampler`: no_duplicates
168
+
169
+ #### All Hyperparameters
170
+ <details><summary>Click to expand</summary>
171
+
172
+ - `per_device_train_batch_size`: 16
173
+ - `num_train_epochs`: 1
174
+ - `max_steps`: -1
175
+ - `learning_rate`: 2e-05
176
+ - `lr_scheduler_type`: linear
177
+ - `lr_scheduler_kwargs`: None
178
+ - `warmup_steps`: 0
179
+ - `optim`: adamw_torch_fused
180
+ - `optim_args`: None
181
+ - `weight_decay`: 0.0
182
+ - `adam_beta1`: 0.9
183
+ - `adam_beta2`: 0.999
184
+ - `adam_epsilon`: 1e-08
185
+ - `optim_target_modules`: None
186
+ - `gradient_accumulation_steps`: 4
187
+ - `average_tokens_across_devices`: True
188
+ - `max_grad_norm`: 1.0
189
+ - `label_smoothing_factor`: 0.0
190
+ - `bf16`: False
191
+ - `fp16`: False
192
+ - `bf16_full_eval`: False
193
+ - `fp16_full_eval`: False
194
+ - `tf32`: None
195
+ - `gradient_checkpointing`: False
196
+ - `gradient_checkpointing_kwargs`: None
197
+ - `torch_compile`: False
198
+ - `torch_compile_backend`: None
199
+ - `torch_compile_mode`: None
200
+ - `use_liger_kernel`: False
201
+ - `liger_kernel_config`: None
202
+ - `use_cache`: False
203
+ - `neftune_noise_alpha`: None
204
+ - `torch_empty_cache_steps`: None
205
+ - `auto_find_batch_size`: False
206
+ - `log_on_each_node`: True
207
+ - `logging_nan_inf_filter`: True
208
+ - `include_num_input_tokens_seen`: no
209
+ - `log_level`: passive
210
+ - `log_level_replica`: warning
211
+ - `disable_tqdm`: False
212
+ - `project`: huggingface
213
+ - `trackio_space_id`: trackio
214
+ - `per_device_eval_batch_size`: 8
215
+ - `prediction_loss_only`: True
216
+ - `eval_on_start`: False
217
+ - `eval_do_concat_batches`: True
218
+ - `eval_use_gather_object`: False
219
+ - `eval_accumulation_steps`: None
220
+ - `include_for_metrics`: []
221
+ - `batch_eval_metrics`: False
222
+ - `save_only_model`: False
223
+ - `save_on_each_node`: False
224
+ - `enable_jit_checkpoint`: False
225
+ - `push_to_hub`: False
226
+ - `hub_private_repo`: None
227
+ - `hub_model_id`: None
228
+ - `hub_strategy`: every_save
229
+ - `hub_always_push`: False
230
+ - `hub_revision`: None
231
+ - `load_best_model_at_end`: False
232
+ - `ignore_data_skip`: False
233
+ - `restore_callback_states_from_checkpoint`: False
234
+ - `full_determinism`: False
235
+ - `seed`: 42
236
+ - `data_seed`: None
237
+ - `use_cpu`: False
238
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
239
+ - `parallelism_config`: None
240
+ - `dataloader_drop_last`: False
241
+ - `dataloader_num_workers`: 0
242
+ - `dataloader_pin_memory`: True
243
+ - `dataloader_persistent_workers`: False
244
+ - `dataloader_prefetch_factor`: None
245
+ - `remove_unused_columns`: True
246
+ - `label_names`: None
247
+ - `train_sampling_strategy`: random
248
+ - `length_column_name`: length
249
+ - `ddp_find_unused_parameters`: None
250
+ - `ddp_bucket_cap_mb`: None
251
+ - `ddp_broadcast_buffers`: False
252
+ - `ddp_backend`: None
253
+ - `ddp_timeout`: 1800
254
+ - `fsdp`: []
255
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
256
+ - `deepspeed`: None
257
+ - `debug`: []
258
+ - `skip_memory_metrics`: True
259
+ - `do_predict`: False
260
+ - `resume_from_checkpoint`: None
261
+ - `warmup_ratio`: None
262
+ - `local_rank`: -1
263
+ - `prompts`: None
264
+ - `batch_sampler`: no_duplicates
265
+ - `multi_dataset_batch_sampler`: proportional
266
+ - `router_mapping`: {}
267
+ - `learning_rate_mapping`: {}
268
+
269
+ </details>
270
+
271
+ ### Training Logs
272
+ | Epoch | Step | Training Loss |
273
+ |:------:|:----:|:-------------:|
274
+ | 0.3060 | 500 | 7.9967 |
275
+ | 0.6121 | 1000 | 0.0165 |
276
+ | 0.9181 | 1500 | 0.0108 |
277
+
278
+
279
+ ### Training Time
280
+ - **Training**: 25.0 minutes
281
+
282
+ ### Framework Versions
283
+ - Python: 3.12.3
284
+ - Sentence Transformers: 5.5.1
285
+ - Transformers: 5.5.0
286
+ - PyTorch: 2.12.0+cu130
287
+ - Accelerate: 1.14.0
288
+ - Datasets: 4.3.0
289
+ - Tokenizers: 0.22.2
290
+
291
+ ## Additional Resources
292
+
293
+ - [Training and Finetuning Sparse Embedding Models with Sentence Transformers](https://huggingface.co/blog/train-sparse-encoder): the end-to-end guide for training or finetuning SPLADE and other sparse encoder models.
294
+
295
+ ## Citation
296
+
297
+ ### BibTeX
298
+
299
+ #### Sentence Transformers
300
+ ```bibtex
301
+ @inproceedings{reimers-2019-sentence-bert,
302
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
303
+ author = "Reimers, Nils and Gurevych, Iryna",
304
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
305
+ month = "11",
306
+ year = "2019",
307
+ publisher = "Association for Computational Linguistics",
308
+ url = "https://arxiv.org/abs/1908.10084",
309
+ }
310
+ ```
311
+
312
+ #### SpladeLoss
313
+ ```bibtex
314
+ @misc{formal2022distillationhardnegativesampling,
315
+ title={From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective},
316
+ author={Thibault Formal and Carlos Lassance and Benjamin Piwowarski and Stéphane Clinchant},
317
+ year={2022},
318
+ eprint={2205.04733},
319
+ archivePrefix={arXiv},
320
+ primaryClass={cs.IR},
321
+ url={https://arxiv.org/abs/2205.04733},
322
+ }
323
+ ```
324
+
325
+ #### SparseMultipleNegativesRankingLoss
326
+ ```bibtex
327
+ @misc{oord2019representationlearningcontrastivepredictive,
328
+ title={Representation Learning with Contrastive Predictive Coding},
329
+ author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
330
+ year={2019},
331
+ eprint={1807.03748},
332
+ archivePrefix={arXiv},
333
+ primaryClass={cs.LG},
334
+ url={https://arxiv.org/abs/1807.03748},
335
+ }
336
+ ```
337
+
338
+ #### FlopsLoss
339
+ ```bibtex
340
+ @article{paria2020minimizing,
341
+ title={Minimizing flops to learn efficient sparse representations},
342
+ author={Paria, Biswajit and Yeh, Chih-Kuan and Yen, Ian EH and Xu, Ning and Ravikumar, Pradeep and P{\'o}czos, Barnab{\'a}s},
343
+ journal={arXiv preprint arXiv:2004.05665},
344
+ year={2020}
345
+ }
346
+ ```
347
+
348
+ <!--
349
+ ## Glossary
350
+
351
+ *Clearly define terms in order to be accessible across audiences.*
352
+ -->
353
+
354
+ <!--
355
+ ## Model Card Authors
356
+
357
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
358
+ -->
359
+
360
+ <!--
361
+ ## Model Card Contact
362
+
363
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
364
+ -->
checkpoint-1500/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": null,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_dim": 3072,
13
+ "initializer_range": 0.02,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "distilbert",
16
+ "n_heads": 12,
17
+ "n_layers": 6,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": false,
23
+ "tie_weights_": true,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.5.0",
26
+ "use_cache": false,
27
+ "vocab_size": 119547
28
+ }
checkpoint-1500/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.12.0+cu130",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SparseEncoder",
9
+ "prompts": {
10
+ "document": "",
11
+ "query": ""
12
+ },
13
+ "similarity_fn_name": "dot"
14
+ }
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8fc2c723506062dba5c714426ffa9660ca2e834c3aece741d041c04596a13f2
3
+ size 541795684
checkpoint-1500/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.modules.mlm_transformer.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.modules.splade_pooling.SpladePooling"
13
+ }
14
+ ]
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aeaa9ce743caf565e4de7fb5688b43c158078eea780ad6925e3e3561770cc00
3
+ size 14645
checkpoint-1500/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "fill-mask",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.918133129303749,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "base_loss": 7.996,
14
+ "document_regularizer_loss": 0.0003,
15
+ "epoch": 0.306044376434583,
16
+ "grad_norm": 0.008376441895961761,
17
+ "learning_rate": 1.3892288861689107e-05,
18
+ "loss": 7.99669580078125,
19
+ "query_regularizer_loss": 0.0004,
20
+ "step": 500
21
+ },
22
+ {
23
+ "base_loss": 0.0149,
24
+ "document_regularizer_loss": 0.0007,
25
+ "epoch": 0.612088752869166,
26
+ "grad_norm": 0.10282868891954422,
27
+ "learning_rate": 7.77233782129743e-06,
28
+ "loss": 0.016533720016479493,
29
+ "query_regularizer_loss": 0.0009,
30
+ "step": 1000
31
+ },
32
+ {
33
+ "base_loss": 0.0096,
34
+ "document_regularizer_loss": 0.0005,
35
+ "epoch": 0.918133129303749,
36
+ "grad_norm": 0.007446900941431522,
37
+ "learning_rate": 1.6523867809057528e-06,
38
+ "loss": 0.010761536598205567,
39
+ "query_regularizer_loss": 0.0007,
40
+ "step": 1500
41
+ }
42
+ ],
43
+ "logging_steps": 500,
44
+ "max_steps": 1634,
45
+ "num_input_tokens_seen": 0,
46
+ "num_train_epochs": 1,
47
+ "save_steps": 500,
48
+ "stateful_callbacks": {
49
+ "TrainerControl": {
50
+ "args": {
51
+ "should_epoch_stop": false,
52
+ "should_evaluate": false,
53
+ "should_log": false,
54
+ "should_save": true,
55
+ "should_training_stop": false
56
+ },
57
+ "attributes": {}
58
+ }
59
+ },
60
+ "total_flos": 0.0,
61
+ "train_batch_size": 16,
62
+ "trial_name": null,
63
+ "trial_params": null
64
+ }
checkpoint-1634/1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "pooling_strategy": "max",
3
+ "activation_function": "relu",
4
+ "embedding_dimension": 119547
5
+ }
checkpoint-1634/README.md ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - sparse-encoder
5
+ - sparse
6
+ - splade
7
+ - generated_from_trainer
8
+ - dataset_size:104550
9
+ - loss:SpladeLoss
10
+ - loss:SparseMultipleNegativesRankingLoss
11
+ - loss:FlopsLoss
12
+ base_model: distilbert/distilbert-base-multilingual-cased
13
+ widget:
14
+ - text: يُعتبر التأثير الأوروبي على الثقافة اليابانية في القرن التاسع عشر أمرًا هامًا
15
+ في فهم تاريخ البلاد.
16
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يعكس التغيرات\
17
+ \ الاجتماعية والثقافية التي حدثت عبر العصور."
18
+ - text: لا أعتقد أن هناك أي تأثير لصالح المصممة الداخلية الإيطالية إيلينا فرونتزي
19
+ على هذا النوع من التصاميم.
20
+ - text: كيف يؤثر النقد الأدبي على التفاعل الاجتماعي؟
21
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل\
22
+ \ وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي."
23
+ datasets:
24
+ - oddadmix/arabic-triplets-large
25
+ pipeline_tag: feature-extraction
26
+ library_name: sentence-transformers
27
+ ---
28
+
29
+ # SPLADE Sparse Encoder
30
+
31
+ This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) on the [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) dataset using the [sentence-transformers](https://www.SBERT.net) library. It maps sentences & paragraphs to a 119547-dimensional sparse vector space and can be used for semantic search and sparse retrieval.
32
+ ## Model Details
33
+
34
+ ### Model Description
35
+ - **Model Type:** SPLADE Sparse Encoder
36
+ - **Base model:** [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) <!-- at revision 45c032ab32cc946ad88a166f7cb282f58c753c2e -->
37
+ - **Maximum Sequence Length:** 512 tokens
38
+ - **Output Dimensionality:** 119547 dimensions
39
+ - **Similarity Function:** Dot Product
40
+ - **Supported Modality:** Text
41
+ - **Training Dataset:**
42
+ - [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large)
43
+ <!-- - **Language:** Unknown -->
44
+ <!-- - **License:** Unknown -->
45
+
46
+ ### Model Sources
47
+
48
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
49
+ - **Documentation:** [Sparse Encoder Documentation](https://www.sbert.net/docs/sparse_encoder/usage/usage.html)
50
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
51
+ - **Hugging Face:** [Sparse Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=sparse-encoder)
52
+
53
+ ### Full Model Architecture
54
+
55
+ ```
56
+ SparseEncoder(
57
+ (0): Transformer({'transformer_task': 'fill-mask', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'token_embeddings', 'architecture': 'DistilBertForMaskedLM'})
58
+ (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'embedding_dimension': 119547})
59
+ )
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ ### Direct Usage (Sentence Transformers)
65
+
66
+ First install the Sentence Transformers library:
67
+
68
+ ```bash
69
+ pip install -U sentence-transformers
70
+ ```
71
+ Then you can load this model and run inference.
72
+ ```python
73
+ from sentence_transformers import SparseEncoder
74
+
75
+ # Download from the 🤗 Hub
76
+ model = SparseEncoder("sparse_encoder_model_id")
77
+ # Run inference
78
+ sentences = [
79
+ 'ما هي أهمية النقد الأدبي في فهم التاريخ الثقافي؟',
80
+ '\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي.',
81
+ 'تُعتبر اللغة العربية لغة قديمة ومتنوعة، وتمثل جزءًا هامًا من تراث البشرية.',
82
+ ]
83
+ embeddings = model.encode(sentences)
84
+ print(embeddings.shape)
85
+ # [3, 119547]
86
+
87
+ # Get the similarity scores for the embeddings
88
+ similarities = model.similarity(embeddings, embeddings)
89
+ print(similarities)
90
+ # tensor([[34.2188, 38.5133, 6.4171],
91
+ # [38.5133, 83.0456, 11.4468],
92
+ # [ 6.4171, 11.4468, 90.0776]])
93
+ ```
94
+ <!--
95
+ ### Direct Usage (Transformers)
96
+
97
+ <details><summary>Click to see the direct usage in Transformers</summary>
98
+
99
+ </details>
100
+ -->
101
+
102
+ <!--
103
+ ### Downstream Usage (Sentence Transformers)
104
+
105
+ You can finetune this model on your own dataset.
106
+
107
+ <details><summary>Click to expand</summary>
108
+
109
+ </details>
110
+ -->
111
+
112
+ <!--
113
+ ### Out-of-Scope Use
114
+
115
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
116
+ -->
117
+
118
+ <!--
119
+ ## Bias, Risks and Limitations
120
+
121
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
122
+ -->
123
+
124
+ <!--
125
+ ### Recommendations
126
+
127
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
128
+ -->
129
+
130
+ ## Training Details
131
+
132
+ ### Training Dataset
133
+
134
+ #### arabic-triplets-large
135
+
136
+ * Dataset: [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) at [fa99ede](https://huggingface.co/datasets/oddadmix/arabic-triplets-large/tree/fa99ede10602ff5cffb7591ff1f25289414c4b13)
137
+ * Size: 104,550 training samples
138
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
139
+ * Approximate statistics based on the first 100 samples:
140
+ | | anchor | positive | negative |
141
+ |:---------|:----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
142
+ | type | string | string | string |
143
+ | modality | text | text | text |
144
+ | details | <ul><li>min: 12 tokens</li><li>mean: 18.6 tokens</li><li>max: 33 tokens</li></ul> | <ul><li>min: 26 tokens</li><li>mean: 57.26 tokens</li><li>max: 142 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 47.38 tokens</li><li>max: 120 tokens</li></ul> |
145
+ * Samples:
146
+ | anchor | positive | negative |
147
+ |:----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
148
+ | <code>ما هي أهمية التلال والمناطق الجبلية في البيئة؟</code> | <code><br><br> تعتبر التلال والمناطق الجبلية من أهم عناصر البيئة التي تؤثر بشكل كبير على توازن النظام الإيكولوجي.</code> | <code>يعتبر النشاط السياسي في البلدان الصغيرة من الأمور التي تتطلب إدارة شاملة ومتكاملة.</code> |
149
+ | <code>كيف تؤثر التلال على الرياح والهطول المطر؟</code> | <code><br><br> يؤثر التلال على الرياح والهطول المطر من خلال تأثيرهم على توزيع الضغط الجوي، مما يؤدي إلى تغييرات في اتجاه وسرعة الرياح وتواتر الهطول.</code> | <code>إنّ الأنشطة الزراعية في المناطق الجبلية تعتبر من أهمّ العوامل التي تساهم في تحسين جودة الحياة في هذه المناطق، ولكنها لا تؤثر بشكل مباشر على الرياح والهطول المطر.</code> |
150
+ | <code>ما هي أنواع التلال المختلفة؟ (جبال، هضاب، منحدرات)</code> | <code><br><br> هناك ثلاثة أنواع رئيسية للتلاءم هي الجبال، الهضاب، والمنحدرات.</code> | <code>الإدارة البيئية تعتبر من الأنشطة التي لها تأثير كبير على البيئة.</code> |
151
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
152
+ ```json
153
+ {
154
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False, directions=('query_to_doc',), partition_mode='joint', hardness_mode=None, hardness_strength=0.0)",
155
+ "document_regularizer_weight": 3e-05,
156
+ "query_regularizer_weight": 5e-05
157
+ }
158
+ ```
159
+
160
+ ### Training Hyperparameters
161
+ #### Non-Default Hyperparameters
162
+
163
+ - `per_device_train_batch_size`: 16
164
+ - `num_train_epochs`: 1
165
+ - `learning_rate`: 2e-05
166
+ - `gradient_accumulation_steps`: 4
167
+ - `batch_sampler`: no_duplicates
168
+
169
+ #### All Hyperparameters
170
+ <details><summary>Click to expand</summary>
171
+
172
+ - `per_device_train_batch_size`: 16
173
+ - `num_train_epochs`: 1
174
+ - `max_steps`: -1
175
+ - `learning_rate`: 2e-05
176
+ - `lr_scheduler_type`: linear
177
+ - `lr_scheduler_kwargs`: None
178
+ - `warmup_steps`: 0
179
+ - `optim`: adamw_torch_fused
180
+ - `optim_args`: None
181
+ - `weight_decay`: 0.0
182
+ - `adam_beta1`: 0.9
183
+ - `adam_beta2`: 0.999
184
+ - `adam_epsilon`: 1e-08
185
+ - `optim_target_modules`: None
186
+ - `gradient_accumulation_steps`: 4
187
+ - `average_tokens_across_devices`: True
188
+ - `max_grad_norm`: 1.0
189
+ - `label_smoothing_factor`: 0.0
190
+ - `bf16`: False
191
+ - `fp16`: False
192
+ - `bf16_full_eval`: False
193
+ - `fp16_full_eval`: False
194
+ - `tf32`: None
195
+ - `gradient_checkpointing`: False
196
+ - `gradient_checkpointing_kwargs`: None
197
+ - `torch_compile`: False
198
+ - `torch_compile_backend`: None
199
+ - `torch_compile_mode`: None
200
+ - `use_liger_kernel`: False
201
+ - `liger_kernel_config`: None
202
+ - `use_cache`: False
203
+ - `neftune_noise_alpha`: None
204
+ - `torch_empty_cache_steps`: None
205
+ - `auto_find_batch_size`: False
206
+ - `log_on_each_node`: True
207
+ - `logging_nan_inf_filter`: True
208
+ - `include_num_input_tokens_seen`: no
209
+ - `log_level`: passive
210
+ - `log_level_replica`: warning
211
+ - `disable_tqdm`: False
212
+ - `project`: huggingface
213
+ - `trackio_space_id`: trackio
214
+ - `per_device_eval_batch_size`: 8
215
+ - `prediction_loss_only`: True
216
+ - `eval_on_start`: False
217
+ - `eval_do_concat_batches`: True
218
+ - `eval_use_gather_object`: False
219
+ - `eval_accumulation_steps`: None
220
+ - `include_for_metrics`: []
221
+ - `batch_eval_metrics`: False
222
+ - `save_only_model`: False
223
+ - `save_on_each_node`: False
224
+ - `enable_jit_checkpoint`: False
225
+ - `push_to_hub`: False
226
+ - `hub_private_repo`: None
227
+ - `hub_model_id`: None
228
+ - `hub_strategy`: every_save
229
+ - `hub_always_push`: False
230
+ - `hub_revision`: None
231
+ - `load_best_model_at_end`: False
232
+ - `ignore_data_skip`: False
233
+ - `restore_callback_states_from_checkpoint`: False
234
+ - `full_determinism`: False
235
+ - `seed`: 42
236
+ - `data_seed`: None
237
+ - `use_cpu`: False
238
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
239
+ - `parallelism_config`: None
240
+ - `dataloader_drop_last`: False
241
+ - `dataloader_num_workers`: 0
242
+ - `dataloader_pin_memory`: True
243
+ - `dataloader_persistent_workers`: False
244
+ - `dataloader_prefetch_factor`: None
245
+ - `remove_unused_columns`: True
246
+ - `label_names`: None
247
+ - `train_sampling_strategy`: random
248
+ - `length_column_name`: length
249
+ - `ddp_find_unused_parameters`: None
250
+ - `ddp_bucket_cap_mb`: None
251
+ - `ddp_broadcast_buffers`: False
252
+ - `ddp_backend`: None
253
+ - `ddp_timeout`: 1800
254
+ - `fsdp`: []
255
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
256
+ - `deepspeed`: None
257
+ - `debug`: []
258
+ - `skip_memory_metrics`: True
259
+ - `do_predict`: False
260
+ - `resume_from_checkpoint`: None
261
+ - `warmup_ratio`: None
262
+ - `local_rank`: -1
263
+ - `prompts`: None
264
+ - `batch_sampler`: no_duplicates
265
+ - `multi_dataset_batch_sampler`: proportional
266
+ - `router_mapping`: {}
267
+ - `learning_rate_mapping`: {}
268
+
269
+ </details>
270
+
271
+ ### Training Logs
272
+ | Epoch | Step | Training Loss |
273
+ |:------:|:----:|:-------------:|
274
+ | 0.3060 | 500 | 7.9967 |
275
+ | 0.6121 | 1000 | 0.0165 |
276
+ | 0.9181 | 1500 | 0.0108 |
277
+
278
+
279
+ ### Training Time
280
+ - **Training**: 27.2 minutes
281
+
282
+ ### Framework Versions
283
+ - Python: 3.12.3
284
+ - Sentence Transformers: 5.5.1
285
+ - Transformers: 5.5.0
286
+ - PyTorch: 2.12.0+cu130
287
+ - Accelerate: 1.14.0
288
+ - Datasets: 4.3.0
289
+ - Tokenizers: 0.22.2
290
+
291
+ ## Additional Resources
292
+
293
+ - [Training and Finetuning Sparse Embedding Models with Sentence Transformers](https://huggingface.co/blog/train-sparse-encoder): the end-to-end guide for training or finetuning SPLADE and other sparse encoder models.
294
+
295
+ ## Citation
296
+
297
+ ### BibTeX
298
+
299
+ #### Sentence Transformers
300
+ ```bibtex
301
+ @inproceedings{reimers-2019-sentence-bert,
302
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
303
+ author = "Reimers, Nils and Gurevych, Iryna",
304
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
305
+ month = "11",
306
+ year = "2019",
307
+ publisher = "Association for Computational Linguistics",
308
+ url = "https://arxiv.org/abs/1908.10084",
309
+ }
310
+ ```
311
+
312
+ #### SpladeLoss
313
+ ```bibtex
314
+ @misc{formal2022distillationhardnegativesampling,
315
+ title={From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective},
316
+ author={Thibault Formal and Carlos Lassance and Benjamin Piwowarski and Stéphane Clinchant},
317
+ year={2022},
318
+ eprint={2205.04733},
319
+ archivePrefix={arXiv},
320
+ primaryClass={cs.IR},
321
+ url={https://arxiv.org/abs/2205.04733},
322
+ }
323
+ ```
324
+
325
+ #### SparseMultipleNegativesRankingLoss
326
+ ```bibtex
327
+ @misc{oord2019representationlearningcontrastivepredictive,
328
+ title={Representation Learning with Contrastive Predictive Coding},
329
+ author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
330
+ year={2019},
331
+ eprint={1807.03748},
332
+ archivePrefix={arXiv},
333
+ primaryClass={cs.LG},
334
+ url={https://arxiv.org/abs/1807.03748},
335
+ }
336
+ ```
337
+
338
+ #### FlopsLoss
339
+ ```bibtex
340
+ @article{paria2020minimizing,
341
+ title={Minimizing flops to learn efficient sparse representations},
342
+ author={Paria, Biswajit and Yeh, Chih-Kuan and Yen, Ian EH and Xu, Ning and Ravikumar, Pradeep and P{\'o}czos, Barnab{\'a}s},
343
+ journal={arXiv preprint arXiv:2004.05665},
344
+ year={2020}
345
+ }
346
+ ```
347
+
348
+ <!--
349
+ ## Glossary
350
+
351
+ *Clearly define terms in order to be accessible across audiences.*
352
+ -->
353
+
354
+ <!--
355
+ ## Model Card Authors
356
+
357
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
358
+ -->
359
+
360
+ <!--
361
+ ## Model Card Contact
362
+
363
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
364
+ -->
checkpoint-1634/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": null,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_dim": 3072,
13
+ "initializer_range": 0.02,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "distilbert",
16
+ "n_heads": 12,
17
+ "n_layers": 6,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": false,
23
+ "tie_weights_": true,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.5.0",
26
+ "use_cache": false,
27
+ "vocab_size": 119547
28
+ }
checkpoint-1634/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.12.0+cu130",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SparseEncoder",
9
+ "prompts": {
10
+ "document": "",
11
+ "query": ""
12
+ },
13
+ "similarity_fn_name": "dot"
14
+ }
checkpoint-1634/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85bf3f38130384945446d2f0f8fe6095507e785587ff4bdf69803aa9f599d222
3
+ size 541795684
checkpoint-1634/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.modules.mlm_transformer.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.modules.splade_pooling.SpladePooling"
13
+ }
14
+ ]
checkpoint-1634/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a8442eaf667a783024cfcf302ffa8a55b62a5478329defd61c4f44d7349b96
3
+ size 14645
checkpoint-1634/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "fill-mask",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
checkpoint-1634/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1634/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1634/trainer_state.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1634,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "base_loss": 7.996,
14
+ "document_regularizer_loss": 0.0003,
15
+ "epoch": 0.306044376434583,
16
+ "grad_norm": 0.008376441895961761,
17
+ "learning_rate": 1.3892288861689107e-05,
18
+ "loss": 7.99669580078125,
19
+ "query_regularizer_loss": 0.0004,
20
+ "step": 500
21
+ },
22
+ {
23
+ "base_loss": 0.0149,
24
+ "document_regularizer_loss": 0.0007,
25
+ "epoch": 0.612088752869166,
26
+ "grad_norm": 0.10282868891954422,
27
+ "learning_rate": 7.77233782129743e-06,
28
+ "loss": 0.016533720016479493,
29
+ "query_regularizer_loss": 0.0009,
30
+ "step": 1000
31
+ },
32
+ {
33
+ "base_loss": 0.0096,
34
+ "document_regularizer_loss": 0.0005,
35
+ "epoch": 0.918133129303749,
36
+ "grad_norm": 0.007446900941431522,
37
+ "learning_rate": 1.6523867809057528e-06,
38
+ "loss": 0.010761536598205567,
39
+ "query_regularizer_loss": 0.0007,
40
+ "step": 1500
41
+ }
42
+ ],
43
+ "logging_steps": 500,
44
+ "max_steps": 1634,
45
+ "num_input_tokens_seen": 0,
46
+ "num_train_epochs": 1,
47
+ "save_steps": 500,
48
+ "stateful_callbacks": {
49
+ "TrainerControl": {
50
+ "args": {
51
+ "should_epoch_stop": false,
52
+ "should_evaluate": false,
53
+ "should_log": false,
54
+ "should_save": true,
55
+ "should_training_stop": true
56
+ },
57
+ "attributes": {}
58
+ }
59
+ },
60
+ "total_flos": 0.0,
61
+ "train_batch_size": 16,
62
+ "trial_name": null,
63
+ "trial_params": null
64
+ }
checkpoint-500/1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "pooling_strategy": "max",
3
+ "activation_function": "relu",
4
+ "embedding_dimension": 119547
5
+ }
checkpoint-500/README.md ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - sparse-encoder
5
+ - sparse
6
+ - splade
7
+ - generated_from_trainer
8
+ - dataset_size:104550
9
+ - loss:SpladeLoss
10
+ - loss:SparseMultipleNegativesRankingLoss
11
+ - loss:FlopsLoss
12
+ base_model: distilbert/distilbert-base-multilingual-cased
13
+ widget:
14
+ - text: يُعتبر التأثير الأوروبي على الثقافة اليابانية في القرن التاسع عشر أمرًا هامًا
15
+ في فهم تاريخ البلاد.
16
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يعكس التغيرات\
17
+ \ الاجتماعية والثقافية التي حدثت عبر العصور."
18
+ - text: لا أعتقد أن هناك أي تأثير لصالح المصممة الداخلية الإيطالية إيلينا فرونتزي
19
+ على هذا النوع من التصاميم.
20
+ - text: كيف يؤثر النقد الأدبي على التفاعل الاجتماعي؟
21
+ - text: "\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل\
22
+ \ وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي."
23
+ datasets:
24
+ - oddadmix/arabic-triplets-large
25
+ pipeline_tag: feature-extraction
26
+ library_name: sentence-transformers
27
+ ---
28
+
29
+ # SPLADE Sparse Encoder
30
+
31
+ This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) on the [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) dataset using the [sentence-transformers](https://www.SBERT.net) library. It maps sentences & paragraphs to a 119547-dimensional sparse vector space and can be used for semantic search and sparse retrieval.
32
+ ## Model Details
33
+
34
+ ### Model Description
35
+ - **Model Type:** SPLADE Sparse Encoder
36
+ - **Base model:** [distilbert/distilbert-base-multilingual-cased](https://huggingface.co/distilbert/distilbert-base-multilingual-cased) <!-- at revision 45c032ab32cc946ad88a166f7cb282f58c753c2e -->
37
+ - **Maximum Sequence Length:** 512 tokens
38
+ - **Output Dimensionality:** 119547 dimensions
39
+ - **Similarity Function:** Dot Product
40
+ - **Supported Modality:** Text
41
+ - **Training Dataset:**
42
+ - [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large)
43
+ <!-- - **Language:** Unknown -->
44
+ <!-- - **License:** Unknown -->
45
+
46
+ ### Model Sources
47
+
48
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
49
+ - **Documentation:** [Sparse Encoder Documentation](https://www.sbert.net/docs/sparse_encoder/usage/usage.html)
50
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
51
+ - **Hugging Face:** [Sparse Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=sparse-encoder)
52
+
53
+ ### Full Model Architecture
54
+
55
+ ```
56
+ SparseEncoder(
57
+ (0): Transformer({'transformer_task': 'fill-mask', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'token_embeddings', 'architecture': 'DistilBertForMaskedLM'})
58
+ (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'embedding_dimension': 119547})
59
+ )
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ ### Direct Usage (Sentence Transformers)
65
+
66
+ First install the Sentence Transformers library:
67
+
68
+ ```bash
69
+ pip install -U sentence-transformers
70
+ ```
71
+ Then you can load this model and run inference.
72
+ ```python
73
+ from sentence_transformers import SparseEncoder
74
+
75
+ # Download from the 🤗 Hub
76
+ model = SparseEncoder("sparse_encoder_model_id")
77
+ # Run inference
78
+ sentences = [
79
+ 'ما هي أهمية النقد الأدبي في فهم التاريخ الثقافي؟',
80
+ '\n\n يعد النقد الأدبي أداة أساسية في فهم التاريخ الثقافي، حيث يساعد على تحليل وتفسير الأعمال الفنية والثقافية من خلال منظور تاريخي.',
81
+ 'تُعتبر اللغة العربية لغة قديمة ومتنوعة، وتمثل جزءًا هامًا من تراث البشرية.',
82
+ ]
83
+ embeddings = model.encode(sentences)
84
+ print(embeddings.shape)
85
+ # [3, 119547]
86
+
87
+ # Get the similarity scores for the embeddings
88
+ similarities = model.similarity(embeddings, embeddings)
89
+ print(similarities)
90
+ # tensor([[ 43.2637, 49.1410, 12.4760],
91
+ # [ 49.1410, 113.4840, 30.3306],
92
+ # [ 12.4760, 30.3306, 109.0792]])
93
+ ```
94
+ <!--
95
+ ### Direct Usage (Transformers)
96
+
97
+ <details><summary>Click to see the direct usage in Transformers</summary>
98
+
99
+ </details>
100
+ -->
101
+
102
+ <!--
103
+ ### Downstream Usage (Sentence Transformers)
104
+
105
+ You can finetune this model on your own dataset.
106
+
107
+ <details><summary>Click to expand</summary>
108
+
109
+ </details>
110
+ -->
111
+
112
+ <!--
113
+ ### Out-of-Scope Use
114
+
115
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
116
+ -->
117
+
118
+ <!--
119
+ ## Bias, Risks and Limitations
120
+
121
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
122
+ -->
123
+
124
+ <!--
125
+ ### Recommendations
126
+
127
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
128
+ -->
129
+
130
+ ## Training Details
131
+
132
+ ### Training Dataset
133
+
134
+ #### arabic-triplets-large
135
+
136
+ * Dataset: [arabic-triplets-large](https://huggingface.co/datasets/oddadmix/arabic-triplets-large) at [fa99ede](https://huggingface.co/datasets/oddadmix/arabic-triplets-large/tree/fa99ede10602ff5cffb7591ff1f25289414c4b13)
137
+ * Size: 104,550 training samples
138
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
139
+ * Approximate statistics based on the first 100 samples:
140
+ | | anchor | positive | negative |
141
+ |:---------|:----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
142
+ | type | string | string | string |
143
+ | modality | text | text | text |
144
+ | details | <ul><li>min: 12 tokens</li><li>mean: 18.6 tokens</li><li>max: 33 tokens</li></ul> | <ul><li>min: 26 tokens</li><li>mean: 57.26 tokens</li><li>max: 142 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 47.38 tokens</li><li>max: 120 tokens</li></ul> |
145
+ * Samples:
146
+ | anchor | positive | negative |
147
+ |:----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
148
+ | <code>ما هي أهمية التلال والمناطق الجبلية في البيئة؟</code> | <code><br><br> تعتبر التلال والمناطق الجبلية من أهم عناصر البيئة التي تؤثر بشكل كبير على توازن النظام الإيكولوجي.</code> | <code>يعتبر النشاط السياسي في البلدان الصغيرة من الأمور التي تتطلب إدارة شاملة ومتكاملة.</code> |
149
+ | <code>كيف تؤثر التلال على الرياح والهطول المطر؟</code> | <code><br><br> يؤثر التلال على الرياح والهطول المطر من خلال تأثيرهم على توزيع الضغط الجوي، مما يؤدي إلى تغييرات في اتجاه وسرعة الرياح وتواتر الهطول.</code> | <code>إنّ الأنشطة الزراعية في المناطق الجبلية تعتبر من أهمّ العوامل التي تساهم في تحسين جودة الحياة في هذه المناطق، ولكنها لا تؤثر بشكل مباشر على الرياح والهطول المطر.</code> |
150
+ | <code>ما هي أنواع التلال المختلفة؟ (جبال، هضاب، منحدرات)</code> | <code><br><br> هناك ثلاثة أنواع رئيسية للتلاءم هي الجبال، الهضاب، والمنحدرات.</code> | <code>الإدارة البيئية تعتبر من الأنشطة التي لها تأثير كبير على البيئة.</code> |
151
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
152
+ ```json
153
+ {
154
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False, directions=('query_to_doc',), partition_mode='joint', hardness_mode=None, hardness_strength=0.0)",
155
+ "document_regularizer_weight": 3e-05,
156
+ "query_regularizer_weight": 5e-05
157
+ }
158
+ ```
159
+
160
+ ### Training Hyperparameters
161
+ #### Non-Default Hyperparameters
162
+
163
+ - `per_device_train_batch_size`: 16
164
+ - `num_train_epochs`: 1
165
+ - `learning_rate`: 2e-05
166
+ - `gradient_accumulation_steps`: 4
167
+ - `batch_sampler`: no_duplicates
168
+
169
+ #### All Hyperparameters
170
+ <details><summary>Click to expand</summary>
171
+
172
+ - `per_device_train_batch_size`: 16
173
+ - `num_train_epochs`: 1
174
+ - `max_steps`: -1
175
+ - `learning_rate`: 2e-05
176
+ - `lr_scheduler_type`: linear
177
+ - `lr_scheduler_kwargs`: None
178
+ - `warmup_steps`: 0
179
+ - `optim`: adamw_torch_fused
180
+ - `optim_args`: None
181
+ - `weight_decay`: 0.0
182
+ - `adam_beta1`: 0.9
183
+ - `adam_beta2`: 0.999
184
+ - `adam_epsilon`: 1e-08
185
+ - `optim_target_modules`: None
186
+ - `gradient_accumulation_steps`: 4
187
+ - `average_tokens_across_devices`: True
188
+ - `max_grad_norm`: 1.0
189
+ - `label_smoothing_factor`: 0.0
190
+ - `bf16`: False
191
+ - `fp16`: False
192
+ - `bf16_full_eval`: False
193
+ - `fp16_full_eval`: False
194
+ - `tf32`: None
195
+ - `gradient_checkpointing`: False
196
+ - `gradient_checkpointing_kwargs`: None
197
+ - `torch_compile`: False
198
+ - `torch_compile_backend`: None
199
+ - `torch_compile_mode`: None
200
+ - `use_liger_kernel`: False
201
+ - `liger_kernel_config`: None
202
+ - `use_cache`: False
203
+ - `neftune_noise_alpha`: None
204
+ - `torch_empty_cache_steps`: None
205
+ - `auto_find_batch_size`: False
206
+ - `log_on_each_node`: True
207
+ - `logging_nan_inf_filter`: True
208
+ - `include_num_input_tokens_seen`: no
209
+ - `log_level`: passive
210
+ - `log_level_replica`: warning
211
+ - `disable_tqdm`: False
212
+ - `project`: huggingface
213
+ - `trackio_space_id`: trackio
214
+ - `per_device_eval_batch_size`: 8
215
+ - `prediction_loss_only`: True
216
+ - `eval_on_start`: False
217
+ - `eval_do_concat_batches`: True
218
+ - `eval_use_gather_object`: False
219
+ - `eval_accumulation_steps`: None
220
+ - `include_for_metrics`: []
221
+ - `batch_eval_metrics`: False
222
+ - `save_only_model`: False
223
+ - `save_on_each_node`: False
224
+ - `enable_jit_checkpoint`: False
225
+ - `push_to_hub`: False
226
+ - `hub_private_repo`: None
227
+ - `hub_model_id`: None
228
+ - `hub_strategy`: every_save
229
+ - `hub_always_push`: False
230
+ - `hub_revision`: None
231
+ - `load_best_model_at_end`: False
232
+ - `ignore_data_skip`: False
233
+ - `restore_callback_states_from_checkpoint`: False
234
+ - `full_determinism`: False
235
+ - `seed`: 42
236
+ - `data_seed`: None
237
+ - `use_cpu`: False
238
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
239
+ - `parallelism_config`: None
240
+ - `dataloader_drop_last`: False
241
+ - `dataloader_num_workers`: 0
242
+ - `dataloader_pin_memory`: True
243
+ - `dataloader_persistent_workers`: False
244
+ - `dataloader_prefetch_factor`: None
245
+ - `remove_unused_columns`: True
246
+ - `label_names`: None
247
+ - `train_sampling_strategy`: random
248
+ - `length_column_name`: length
249
+ - `ddp_find_unused_parameters`: None
250
+ - `ddp_bucket_cap_mb`: None
251
+ - `ddp_broadcast_buffers`: False
252
+ - `ddp_backend`: None
253
+ - `ddp_timeout`: 1800
254
+ - `fsdp`: []
255
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
256
+ - `deepspeed`: None
257
+ - `debug`: []
258
+ - `skip_memory_metrics`: True
259
+ - `do_predict`: False
260
+ - `resume_from_checkpoint`: None
261
+ - `warmup_ratio`: None
262
+ - `local_rank`: -1
263
+ - `prompts`: None
264
+ - `batch_sampler`: no_duplicates
265
+ - `multi_dataset_batch_sampler`: proportional
266
+ - `router_mapping`: {}
267
+ - `learning_rate_mapping`: {}
268
+
269
+ </details>
270
+
271
+ ### Training Logs
272
+ | Epoch | Step | Training Loss |
273
+ |:------:|:----:|:-------------:|
274
+ | 0.3060 | 500 | 7.9967 |
275
+
276
+
277
+ ### Training Time
278
+ - **Training**: 8.3 minutes
279
+
280
+ ### Framework Versions
281
+ - Python: 3.12.3
282
+ - Sentence Transformers: 5.5.1
283
+ - Transformers: 5.5.0
284
+ - PyTorch: 2.12.0+cu130
285
+ - Accelerate: 1.14.0
286
+ - Datasets: 4.3.0
287
+ - Tokenizers: 0.22.2
288
+
289
+ ## Additional Resources
290
+
291
+ - [Training and Finetuning Sparse Embedding Models with Sentence Transformers](https://huggingface.co/blog/train-sparse-encoder): the end-to-end guide for training or finetuning SPLADE and other sparse encoder models.
292
+
293
+ ## Citation
294
+
295
+ ### BibTeX
296
+
297
+ #### Sentence Transformers
298
+ ```bibtex
299
+ @inproceedings{reimers-2019-sentence-bert,
300
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
301
+ author = "Reimers, Nils and Gurevych, Iryna",
302
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
303
+ month = "11",
304
+ year = "2019",
305
+ publisher = "Association for Computational Linguistics",
306
+ url = "https://arxiv.org/abs/1908.10084",
307
+ }
308
+ ```
309
+
310
+ #### SpladeLoss
311
+ ```bibtex
312
+ @misc{formal2022distillationhardnegativesampling,
313
+ title={From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective},
314
+ author={Thibault Formal and Carlos Lassance and Benjamin Piwowarski and Stéphane Clinchant},
315
+ year={2022},
316
+ eprint={2205.04733},
317
+ archivePrefix={arXiv},
318
+ primaryClass={cs.IR},
319
+ url={https://arxiv.org/abs/2205.04733},
320
+ }
321
+ ```
322
+
323
+ #### SparseMultipleNegativesRankingLoss
324
+ ```bibtex
325
+ @misc{oord2019representationlearningcontrastivepredictive,
326
+ title={Representation Learning with Contrastive Predictive Coding},
327
+ author={Aaron van den Oord and Yazhe Li and Oriol Vinyals},
328
+ year={2019},
329
+ eprint={1807.03748},
330
+ archivePrefix={arXiv},
331
+ primaryClass={cs.LG},
332
+ url={https://arxiv.org/abs/1807.03748},
333
+ }
334
+ ```
335
+
336
+ #### FlopsLoss
337
+ ```bibtex
338
+ @article{paria2020minimizing,
339
+ title={Minimizing flops to learn efficient sparse representations},
340
+ author={Paria, Biswajit and Yeh, Chih-Kuan and Yen, Ian EH and Xu, Ning and Ravikumar, Pradeep and P{\'o}czos, Barnab{\'a}s},
341
+ journal={arXiv preprint arXiv:2004.05665},
342
+ year={2020}
343
+ }
344
+ ```
345
+
346
+ <!--
347
+ ## Glossary
348
+
349
+ *Clearly define terms in order to be accessible across audiences.*
350
+ -->
351
+
352
+ <!--
353
+ ## Model Card Authors
354
+
355
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
356
+ -->
357
+
358
+ <!--
359
+ ## Model Card Contact
360
+
361
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
362
+ -->
checkpoint-500/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": null,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_dim": 3072,
13
+ "initializer_range": 0.02,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "distilbert",
16
+ "n_heads": 12,
17
+ "n_layers": 6,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": false,
23
+ "tie_weights_": true,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.5.0",
26
+ "use_cache": false,
27
+ "vocab_size": 119547
28
+ }
checkpoint-500/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.12.0+cu130",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SparseEncoder",
9
+ "prompts": {
10
+ "document": "",
11
+ "query": ""
12
+ },
13
+ "similarity_fn_name": null
14
+ }
checkpoint-500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bf20d781958ab9a8cfefc71d09618f6b87eb330bb5742f1e710923f93506e03
3
+ size 541795684
checkpoint-500/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.modules.mlm_transformer.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.modules.splade_pooling.SpladePooling"
13
+ }
14
+ ]
checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f65115d8bc830dca49d58477f93e24794a895430738c5a06f0fc82cbfcc5b9cc
3
+ size 14645
checkpoint-500/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "fill-mask",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ }
8
+ },
9
+ "module_output_name": "token_embeddings"
10
+ }
checkpoint-500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.306044376434583,
6
+ "eval_steps": 500,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "base_loss": 7.996,
14
+ "document_regularizer_loss": 0.0003,
15
+ "epoch": 0.306044376434583,
16
+ "grad_norm": 0.008376441895961761,
17
+ "learning_rate": 1.3892288861689107e-05,
18
+ "loss": 7.99669580078125,
19
+ "query_regularizer_loss": 0.0004,
20
+ "step": 500
21
+ }
22
+ ],
23
+ "logging_steps": 500,
24
+ "max_steps": 1634,
25
+ "num_input_tokens_seen": 0,
26
+ "num_train_epochs": 1,
27
+ "save_steps": 500,
28
+ "stateful_callbacks": {
29
+ "TrainerControl": {
30
+ "args": {
31
+ "should_epoch_stop": false,
32
+ "should_evaluate": false,
33
+ "should_log": false,
34
+ "should_save": true,
35
+ "should_training_stop": false
36
+ },
37
+ "attributes": {}
38
+ }
39
+ },
40
+ "total_flos": 0.0,
41
+ "train_batch_size": 16,
42
+ "trial_name": null,
43
+ "trial_params": null
44
+ }
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "bos_token_id": null,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_dim": 3072,
13
+ "initializer_range": 0.02,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "distilbert",
16
+ "n_heads": 12,
17
+ "n_layers": 6,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": false,
23
+ "tie_weights_": true,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.5.0",
26
+ "use_cache": false,
27
+ "vocab_size": 119547
28
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.12.0+cu130",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": null,
8
+ "model_type": "SparseEncoder",
9
+ "prompts": {
10
+ "document": "",
11
+ "query": ""
12
+ },
13
+ "similarity_fn_name": "dot"
14
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85bf3f38130384945446d2f0f8fe6095507e785587ff4bdf69803aa9f599d222
3
+ size 541795684
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.modules.mlm_transformer.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.modules.splade_pooling.SpladePooling"
13
+ }
14
+ ]