Text Generation
Transformers
Safetensors
Portuguese
qwen3
text-generation-inference
conversational
Eval Results (legacy)
nicholasKluge's picture
Upload evals.yaml with huggingface_hub
77f86d4 verified
raw
history blame
10.5 kB
evaluations:
arc_challenge_poly_pt_acc: 0.352991452991453
arc_challenge_poly_pt_acc_norm: 0.38632478632478634
arc_challenge_poly_pt_acc_norm_stderr: 0.014240929032392555
arc_challenge_poly_pt_acc_stderr: 0.013977511178298731
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.8929738562091504
assin2_rte_acc_stderr,all: 0.004411530421702684
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.8929452735255472
assin2_rte_f1_macro_stderr,all: 0.004413430512294061
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 0.8459722222222222
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.5030690041913517
assin2_sts_pearson_stderr,all: 0.011288134306476946
assin_entailment_acc: 0.70525
assin_entailment_acc_stderr: 0.007209792275174285
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.69825
assin_paraphrase_acc_stderr: 0.007258609111018215
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.6233333333333333
belebele_por_Latn_acc_norm: 0.6233333333333333
belebele_por_Latn_acc_norm_stderr: 0.016160653713997004
belebele_por_Latn_acc_stderr: 0.016160653713997004
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.40333796940194716
bluex_acc,exam_id__UNICAMP_2018: 0.2962962962962963
bluex_acc,exam_id__UNICAMP_2019: 0.36
bluex_acc,exam_id__UNICAMP_2020: 0.4
bluex_acc,exam_id__UNICAMP_2021_1: 0.41304347826086957
bluex_acc,exam_id__UNICAMP_2021_2: 0.3333333333333333
bluex_acc,exam_id__UNICAMP_2022: 0.46153846153846156
bluex_acc,exam_id__UNICAMP_2023: 0.4883720930232558
bluex_acc,exam_id__UNICAMP_2024: 0.4666666666666667
bluex_acc,exam_id__USP_2018: 0.3333333333333333
bluex_acc,exam_id__USP_2019: 0.45
bluex_acc,exam_id__USP_2020: 0.4107142857142857
bluex_acc,exam_id__USP_2021: 0.4230769230769231
bluex_acc,exam_id__USP_2022: 0.3673469387755102
bluex_acc,exam_id__USP_2023: 0.4090909090909091
bluex_acc,exam_id__USP_2024: 0.5121951219512195
bluex_acc_stderr,all: 0.010516220290583876
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.0358436954826324
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0394709635049564
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.038090964110030476
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.041990517186998776
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.03818899745793331
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.046109614865865844
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04411333155530449
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.042938801698654536
bluex_acc_stderr,exam_id__USP_2018: 0.036947905283078404
bluex_acc_stderr,exam_id__USP_2019: 0.045372630608773945
bluex_acc_stderr,exam_id__USP_2020: 0.037832930915152975
bluex_acc_stderr,exam_id__USP_2021: 0.039528870694831665
bluex_acc_stderr,exam_id__USP_2022: 0.039697954399404815
bluex_acc_stderr,exam_id__USP_2023: 0.04283372427132068
bluex_acc_stderr,exam_id__USP_2024: 0.04496015455368913
bluex_alias: bluex
calame_pt_acc: 0.30009633911368017
calame_pt_acc_stderr: 0.010060983361463095
calame_pt_alias: calame_pt
calame_pt_perplexity: 40.330510797408564
calame_pt_perplexity_stderr: 2.812293830784164
enem_challenge_acc,all: 0.5360391882435269
enem_challenge_acc,exam_id__2009: 0.4956521739130435
enem_challenge_acc,exam_id__2010: 0.5555555555555556
enem_challenge_acc,exam_id__2011: 0.5555555555555556
enem_challenge_acc,exam_id__2012: 0.5344827586206896
enem_challenge_acc,exam_id__2013: 0.6296296296296297
enem_challenge_acc,exam_id__2014: 0.5504587155963303
enem_challenge_acc,exam_id__2015: 0.453781512605042
enem_challenge_acc,exam_id__2016: 0.5041322314049587
enem_challenge_acc,exam_id__2016_2: 0.5528455284552846
enem_challenge_acc,exam_id__2017: 0.4827586206896552
enem_challenge_acc,exam_id__2022: 0.5263157894736842
enem_challenge_acc,exam_id__2023: 0.5925925925925926
enem_challenge_acc_stderr,all: 0.0076181405479343375
enem_challenge_acc_stderr,exam_id__2009: 0.02694086035194982
enem_challenge_acc_stderr,exam_id__2010: 0.026490028965817942
enem_challenge_acc_stderr,exam_id__2011: 0.026490390134681466
enem_challenge_acc_stderr,exam_id__2012: 0.026706675890852467
enem_challenge_acc_stderr,exam_id__2013: 0.026844859183194552
enem_challenge_acc_stderr,exam_id__2014: 0.027471406425669677
enem_challenge_acc_stderr,exam_id__2015: 0.02625010092250836
enem_challenge_acc_stderr,exam_id__2016: 0.026323579775844996
enem_challenge_acc_stderr,exam_id__2016_2: 0.025854198995497416
enem_challenge_acc_stderr,exam_id__2017: 0.02675016875866561
enem_challenge_acc_stderr,exam_id__2022: 0.02494321437851413
enem_challenge_acc_stderr,exam_id__2023: 0.024375802527465587
enem_challenge_alias: enem
faquad_nli_acc,all: 0.5461538461538461
faquad_nli_acc_stderr,all: 0.013824178717859308
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.5378749719832936
faquad_nli_f1_macro_stderr,all: 0.013886770545945497
global_piqa_completions_por_latn_braz_acc: 0.66
global_piqa_completions_por_latn_braz_acc_bytes: 0.63
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.048523658709390974
global_piqa_completions_por_latn_braz_acc_norm: 0.62
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.04878317312145634
global_piqa_completions_por_latn_braz_acc_stderr: 0.04760952285695234
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
gsm8k_pt_alias: gsm8k_pt
gsm8k_pt_exact_match,flexible-extract: 0.18493150684931506
gsm8k_pt_exact_match,strict-match: 0.0
gsm8k_pt_exact_match_stderr,flexible-extract: 0.010714456449924532
gsm8k_pt_exact_match_stderr,strict-match: 0.0
hatebr_offensive_acc,all: 0.8514285714285714
hatebr_offensive_acc_stderr,all: 0.0067137369109427445
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.8513918743199851
hatebr_offensive_f1_macro_stderr,all: 0.006717339873214156
hellaswag_poly_pt_acc: 0.38129808213240873
hellaswag_poly_pt_acc_norm: 0.47827500270885254
hellaswag_poly_pt_acc_norm_stderr: 0.005200030264123482
hellaswag_poly_pt_acc_stderr: 0.005056141839024339
hellaswag_poly_pt_alias: hellaswag_poly_pt
ifeval_pt_alias: ifeval_pt
ifeval_pt_inst_level_loose_acc: 0.4186046511627907
ifeval_pt_inst_level_loose_acc_stderr: N/A
ifeval_pt_inst_level_strict_acc: 0.37906976744186044
ifeval_pt_inst_level_strict_acc_stderr: N/A
ifeval_pt_prompt_level_loose_acc: 0.3
ifeval_pt_prompt_level_loose_acc_stderr: 0.02650171951258532
ifeval_pt_prompt_level_strict_acc: 0.26666666666666666
ifeval_pt_prompt_level_strict_acc_stderr: 0.02557404853322572
lambada_poly_pt_acc: 0.36425383271880457
lambada_poly_pt_acc_stderr: 0.006704339729529026
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 34.54764793589513
lambada_poly_pt_perplexity_stderr: 1.5172274553216532
mmlu_poly_pt_acc: 0.41459021314920447
mmlu_poly_pt_acc_stderr: 0.004268138446999436
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.40728929384965834
oab_exams_acc,exam_id__2010-01: 0.4
oab_exams_acc,exam_id__2010-02: 0.5
oab_exams_acc,exam_id__2011-03: 0.3434343434343434
oab_exams_acc,exam_id__2011-04: 0.35
oab_exams_acc,exam_id__2011-05: 0.45
oab_exams_acc,exam_id__2012-06: 0.3625
oab_exams_acc,exam_id__2012-06a: 0.525
oab_exams_acc,exam_id__2012-07: 0.425
oab_exams_acc,exam_id__2012-08: 0.425
oab_exams_acc,exam_id__2012-09: 0.33766233766233766
oab_exams_acc,exam_id__2013-10: 0.45
oab_exams_acc,exam_id__2013-11: 0.3875
oab_exams_acc,exam_id__2013-12: 0.4125
oab_exams_acc,exam_id__2014-13: 0.4
oab_exams_acc,exam_id__2014-14: 0.375
oab_exams_acc,exam_id__2014-15: 0.4230769230769231
oab_exams_acc,exam_id__2015-16: 0.475
oab_exams_acc,exam_id__2015-17: 0.41025641025641024
oab_exams_acc,exam_id__2015-18: 0.3875
oab_exams_acc,exam_id__2016-19: 0.4358974358974359
oab_exams_acc,exam_id__2016-20: 0.4125
oab_exams_acc,exam_id__2016-20a: 0.2875
oab_exams_acc,exam_id__2016-21: 0.3625
oab_exams_acc,exam_id__2017-22: 0.4625
oab_exams_acc,exam_id__2017-23: 0.375
oab_exams_acc,exam_id__2017-24: 0.3625
oab_exams_acc,exam_id__2018-25: 0.45
oab_exams_acc_stderr,all: 0.006075587781828954
oab_exams_acc_stderr,exam_id__2010-01: 0.030623794065745072
oab_exams_acc_stderr,exam_id__2010-02: 0.028916493865834454
oab_exams_acc_stderr,exam_id__2011-03: 0.02741417034369018
oab_exams_acc_stderr,exam_id__2011-04: 0.030903722521126956
oab_exams_acc_stderr,exam_id__2011-05: 0.032119480960382765
oab_exams_acc_stderr,exam_id__2012-06: 0.03092116447159791
oab_exams_acc_stderr,exam_id__2012-06a: 0.0323716859938887
oab_exams_acc_stderr,exam_id__2012-07: 0.03197065291082094
oab_exams_acc_stderr,exam_id__2012-08: 0.03199897400280611
oab_exams_acc_stderr,exam_id__2012-09: 0.03113015227112767
oab_exams_acc_stderr,exam_id__2013-10: 0.03220496656457049
oab_exams_acc_stderr,exam_id__2013-11: 0.03141807801888442
oab_exams_acc_stderr,exam_id__2013-12: 0.031705964116907834
oab_exams_acc_stderr,exam_id__2014-13: 0.031523570709171646
oab_exams_acc_stderr,exam_id__2014-14: 0.031348698849557335
oab_exams_acc_stderr,exam_id__2014-15: 0.032308173836725206
oab_exams_acc_stderr,exam_id__2015-16: 0.032275444416125405
oab_exams_acc_stderr,exam_id__2015-17: 0.032242261750832044
oab_exams_acc_stderr,exam_id__2015-18: 0.031327883513865735
oab_exams_acc_stderr,exam_id__2016-19: 0.03247625408703544
oab_exams_acc_stderr,exam_id__2016-20: 0.031762853683104365
oab_exams_acc_stderr,exam_id__2016-20a: 0.02914121034414819
oab_exams_acc_stderr,exam_id__2016-21: 0.030958650868779976
oab_exams_acc_stderr,exam_id__2017-22: 0.032214831966249774
oab_exams_acc_stderr,exam_id__2017-23: 0.03125268775237781
oab_exams_acc_stderr,exam_id__2017-24: 0.030936224842605275
oab_exams_acc_stderr,exam_id__2018-25: 0.03208477618412235
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.7297297297297297
portuguese_hate_speech_acc_stderr,all: 0.010726825349374063
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.6842412627123019
portuguese_hate_speech_f1_macro_stderr,all: 0.012055600214659706
tweetsentbr_acc,all: 0.5796019900497512
tweetsentbr_acc_stderr,all: 0.007813148658084126
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.5404009095352119
tweetsentbr_f1_macro_stderr,all: 0.008133081854048629
step: 69750