| --- |
| license: cc-by-nc-4.0 |
| language: |
| - ace |
| - acm |
| - acq |
| - aeb |
| - af |
| - ajp |
| - ak |
| - am |
| - apc |
| - ar |
| - ars |
| - ary |
| - arz |
| - as |
| - ast |
| - awa |
| - ay |
| - azb |
| - azj |
| - ba |
| - bm |
| - ban |
| - be |
| - bem |
| - bn |
| - bho |
| - bjn |
| - bo |
| - bs |
| - bug |
| - bg |
| - ca |
| - ceb |
| - cs |
| - cjk |
| - ckb |
| - crh |
| - cy |
| - da |
| - de |
| - dik |
| - dyu |
| - dz |
| - el |
| - en |
| - eo |
| - et |
| - eu |
| - ee |
| - fo |
| - fa |
| - fj |
| - fi |
| - fon |
| - fr |
| - fur |
| - ff |
| - gd |
| - ga |
| - gl |
| - gn |
| - gu |
| - ht |
| - ha |
| - he |
| - hi |
| - hne |
| - hr |
| - hu |
| - hy |
| - ig |
| - ilo |
| - id |
| - is |
| - it |
| - jv |
| - ja |
| - kab |
| - kac |
| - kam |
| - kn |
| - ks |
| - ka |
| - kr |
| - kk |
| - kbp |
| - kea |
| - km |
| - ki |
| - rw |
| - ky |
| - kmb |
| - kg |
| - ko |
| - kmr |
| - lo |
| - lv |
| - lij |
| - li |
| - ln |
| - lt |
| - lmo |
| - ltg |
| - lb |
| - lua |
| - lg |
| - luo |
| - lus |
| - mag |
| - mai |
| - ml |
| - mr |
| - min |
| - mk |
| - plt |
| - mt |
| - mni |
| - mn |
| - mos |
| - mi |
| - ms |
| - my |
| - nl |
| - nn |
| - nb |
| - ne |
| - nso |
| - nus |
| - ny |
| - oc |
| - gaz |
| - ory |
| - pag |
| - pa |
| - pap |
| - pl |
| - pt |
| - prs |
| - pbt |
| - qu |
| - ro |
| - rn |
| - ru |
| - sg |
| - sa |
| - sat |
| - scn |
| - shn |
| - si |
| - sk |
| - sl |
| - sm |
| - sn |
| - sd |
| - so |
| - st |
| - es |
| - als |
| - sc |
| - sr |
| - ss |
| - su |
| - sv |
| - sw |
| - szl |
| - ta |
| - tt |
| - te |
| - tg |
| - tl |
| - th |
| - ti |
| - taq |
| - tpi |
| - tn |
| - ts |
| - tk |
| - tum |
| - tr |
| - tw |
| - tzm |
| - ug |
| - uk |
| - umb |
| - ur |
| - uz |
| - vec |
| - vi |
| - war |
| - wo |
| - xh |
| - yi |
| - yo |
| - yue |
| - zh |
| - zu |
| language_details: >- |
| ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab, |
| aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab, |
| asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl, |
| bam_Latn, ban_Latn, bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab, |
| bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn, |
| cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn, |
| dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn, |
| ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn, |
| fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr, |
| hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn, |
| hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn, |
| jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva, |
| kat_Geor, knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, kea_Latn, khm_Khmr, |
| kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn, |
| lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn, |
| ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva, |
| mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn, |
| mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn, |
| nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn, |
| gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn, |
| prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn, |
| san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn, |
| smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn, |
| srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn, |
| tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi, |
| taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn, |
| tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab, |
| uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr, |
| yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn |
| pipeline_tag: sentence-similarity |
| --- |
| |
| # BLASER Ref (Ported) |
|
|
| This is a **ported version of the BLASER 2.0 reference-based (REF) model**, originally released as [facebook/blaser-2.0-ref](https://huggingface.co/facebook/blaser-2.0-ref). |
|
|
| - **Ported to Hugging Face Transformers**: no dependency on Fairseq. |
| - **Uses embeddings from the ported SONAR 200 multilingual text encoder** ([cointegrated/SONAR_200_text_encoder](https://huggingface.co/cointegrated/SONAR_200_text_encoder)). |
| - **Supports the same 202 languages** as SONAR / NLLB-200. |
| - **Outputs BLASER scores on a 1–5 scale** for a source–MT–REF triplet. |
|
|
| > ⚠️ This is **not the original implementation**. Attribution goes to the original BLASER authors. |
|
|
| --- |
|
|
| ## How to compute Ref scores |
|
|
| ```python |
| # !pip install transformers sentencepiece torch -q |
| import torch |
| from transformers import AutoTokenizer, AutoModel |
| from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder |
| |
| # 1. Load SONAR encoder |
| sonar_model_name = "cointegrated/SONAR_200_text_encoder" |
| encoder = M2M100Encoder.from_pretrained(sonar_model_name) |
| tokenizer = AutoTokenizer.from_pretrained(sonar_model_name) |
| |
| def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False): |
| tokenizer.src_lang = lang |
| with torch.inference_mode(): |
| batch = tokenizer(texts, return_tensors='pt', padding=True) |
| seq_embs = encoder(**batch).last_hidden_state |
| mask = batch.attention_mask |
| mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1) |
| if norm: |
| mean_emb = torch.nn.functional.normalize(mean_emb) |
| return mean_emb |
| |
| # Example sentences |
| src_sentences = ["Le chat s'assit sur le tapis."] |
| mt_sentences = ["The cat sat down on the carpet."] # Example MT output |
| ref_sentences = ["The cat sat on the mat."] # Example reference translation |
| |
| # Encode source, MT, and reference sentences |
| src_embs = encode_mean_pool(src_sentences, tokenizer, encoder, lang="fra_Latn") |
| mt_embs = encode_mean_pool(mt_sentences, tokenizer, encoder, lang="eng_Latn") |
| ref_embs = encode_mean_pool(ref_sentences, tokenizer, encoder, lang="eng_Latn") |
| |
| # 2. Load BLASER Ref model (ported) |
| ref_model_name = "oist/blaser_2_0_ref_ported" |
| ref_model = AutoModel.from_pretrained(ref_model_name, trust_remote_code=True) |
| ref_model.eval() # set to evaluation mode |
| |
| # 3. Compute Ref scores |
| with torch.inference_mode(): |
| ref_scores = ref_model(src_embs, mt_embs, ref_embs) # expects source and MT embeddings, and ref embeddings |
| print("Blaser score shape:", ref_scores.shape) |
| print("Blaser scores:", ref_scores[0]) |