evaluations: arc_challenge_poly_pt_acc: 0.352991452991453 arc_challenge_poly_pt_acc_norm: 0.38632478632478634 arc_challenge_poly_pt_acc_norm_stderr: 0.014240929032392555 arc_challenge_poly_pt_acc_stderr: 0.013977511178298731 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.8929738562091504 assin2_rte_acc_stderr,all: 0.004411530421702684 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.8929452735255472 assin2_rte_f1_macro_stderr,all: 0.004413430512294061 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 0.8459722222222222 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.5030690041913517 assin2_sts_pearson_stderr,all: 0.011288134306476946 assin_entailment_acc: 0.70525 assin_entailment_acc_stderr: 0.007209792275174285 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.69825 assin_paraphrase_acc_stderr: 0.007258609111018215 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.6233333333333333 belebele_por_Latn_acc_norm: 0.6233333333333333 belebele_por_Latn_acc_norm_stderr: 0.016160653713997004 belebele_por_Latn_acc_stderr: 0.016160653713997004 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.40333796940194716 bluex_acc,exam_id__UNICAMP_2018: 0.2962962962962963 bluex_acc,exam_id__UNICAMP_2019: 0.36 bluex_acc,exam_id__UNICAMP_2020: 0.4 bluex_acc,exam_id__UNICAMP_2021_1: 0.41304347826086957 bluex_acc,exam_id__UNICAMP_2021_2: 0.3333333333333333 bluex_acc,exam_id__UNICAMP_2022: 0.46153846153846156 bluex_acc,exam_id__UNICAMP_2023: 0.4883720930232558 bluex_acc,exam_id__UNICAMP_2024: 0.4666666666666667 bluex_acc,exam_id__USP_2018: 0.3333333333333333 bluex_acc,exam_id__USP_2019: 0.45 bluex_acc,exam_id__USP_2020: 0.4107142857142857 bluex_acc,exam_id__USP_2021: 0.4230769230769231 bluex_acc,exam_id__USP_2022: 0.3673469387755102 bluex_acc,exam_id__USP_2023: 0.4090909090909091 bluex_acc,exam_id__USP_2024: 0.5121951219512195 bluex_acc_stderr,all: 0.010516220290583876 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.0358436954826324 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.0394709635049564 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.038090964110030476 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.041990517186998776 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.03818899745793331 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.046109614865865844 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04411333155530449 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.042938801698654536 bluex_acc_stderr,exam_id__USP_2018: 0.036947905283078404 bluex_acc_stderr,exam_id__USP_2019: 0.045372630608773945 bluex_acc_stderr,exam_id__USP_2020: 0.037832930915152975 bluex_acc_stderr,exam_id__USP_2021: 0.039528870694831665 bluex_acc_stderr,exam_id__USP_2022: 0.039697954399404815 bluex_acc_stderr,exam_id__USP_2023: 0.04283372427132068 bluex_acc_stderr,exam_id__USP_2024: 0.04496015455368913 bluex_alias: bluex calame_pt_acc: 0.30009633911368017 calame_pt_acc_stderr: 0.010060983361463095 calame_pt_alias: calame_pt calame_pt_perplexity: 40.330510797408564 calame_pt_perplexity_stderr: 2.812293830784164 enem_challenge_acc,all: 0.5360391882435269 enem_challenge_acc,exam_id__2009: 0.4956521739130435 enem_challenge_acc,exam_id__2010: 0.5555555555555556 enem_challenge_acc,exam_id__2011: 0.5555555555555556 enem_challenge_acc,exam_id__2012: 0.5344827586206896 enem_challenge_acc,exam_id__2013: 0.6296296296296297 enem_challenge_acc,exam_id__2014: 0.5504587155963303 enem_challenge_acc,exam_id__2015: 0.453781512605042 enem_challenge_acc,exam_id__2016: 0.5041322314049587 enem_challenge_acc,exam_id__2016_2: 0.5528455284552846 enem_challenge_acc,exam_id__2017: 0.4827586206896552 enem_challenge_acc,exam_id__2022: 0.5263157894736842 enem_challenge_acc,exam_id__2023: 0.5925925925925926 enem_challenge_acc_stderr,all: 0.0076181405479343375 enem_challenge_acc_stderr,exam_id__2009: 0.02694086035194982 enem_challenge_acc_stderr,exam_id__2010: 0.026490028965817942 enem_challenge_acc_stderr,exam_id__2011: 0.026490390134681466 enem_challenge_acc_stderr,exam_id__2012: 0.026706675890852467 enem_challenge_acc_stderr,exam_id__2013: 0.026844859183194552 enem_challenge_acc_stderr,exam_id__2014: 0.027471406425669677 enem_challenge_acc_stderr,exam_id__2015: 0.02625010092250836 enem_challenge_acc_stderr,exam_id__2016: 0.026323579775844996 enem_challenge_acc_stderr,exam_id__2016_2: 0.025854198995497416 enem_challenge_acc_stderr,exam_id__2017: 0.02675016875866561 enem_challenge_acc_stderr,exam_id__2022: 0.02494321437851413 enem_challenge_acc_stderr,exam_id__2023: 0.024375802527465587 enem_challenge_alias: enem faquad_nli_acc,all: 0.5461538461538461 faquad_nli_acc_stderr,all: 0.013824178717859308 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.5378749719832936 faquad_nli_f1_macro_stderr,all: 0.013886770545945497 global_piqa_completions_por_latn_braz_acc: 0.66 global_piqa_completions_por_latn_braz_acc_bytes: 0.63 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.048523658709390974 global_piqa_completions_por_latn_braz_acc_norm: 0.62 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.04878317312145634 global_piqa_completions_por_latn_braz_acc_stderr: 0.04760952285695234 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz gsm8k_pt_alias: gsm8k_pt gsm8k_pt_exact_match,flexible-extract: 0.18493150684931506 gsm8k_pt_exact_match,strict-match: 0.0 gsm8k_pt_exact_match_stderr,flexible-extract: 0.010714456449924532 gsm8k_pt_exact_match_stderr,strict-match: 0.0 hatebr_offensive_acc,all: 0.8514285714285714 hatebr_offensive_acc_stderr,all: 0.0067137369109427445 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.8513918743199851 hatebr_offensive_f1_macro_stderr,all: 0.006717339873214156 hellaswag_poly_pt_acc: 0.38129808213240873 hellaswag_poly_pt_acc_norm: 0.47827500270885254 hellaswag_poly_pt_acc_norm_stderr: 0.005200030264123482 hellaswag_poly_pt_acc_stderr: 0.005056141839024339 hellaswag_poly_pt_alias: hellaswag_poly_pt humaneval_instruct_alias: humaneval_instruct humaneval_instruct_pass@1,create_test: 0.10365853658536585 humaneval_instruct_pass@1_stderr,create_test: 0.023875115311878508 ifeval_pt_alias: ifeval_pt ifeval_pt_inst_level_loose_acc: 0.4186046511627907 ifeval_pt_inst_level_loose_acc_stderr: N/A ifeval_pt_inst_level_strict_acc: 0.37906976744186044 ifeval_pt_inst_level_strict_acc_stderr: N/A ifeval_pt_prompt_level_loose_acc: 0.3 ifeval_pt_prompt_level_loose_acc_stderr: 0.02650171951258532 ifeval_pt_prompt_level_strict_acc: 0.26666666666666666 ifeval_pt_prompt_level_strict_acc_stderr: 0.02557404853322572 lambada_poly_pt_acc: 0.36425383271880457 lambada_poly_pt_acc_stderr: 0.006704339729529026 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 34.54764793589513 lambada_poly_pt_perplexity_stderr: 1.5172274553216532 mmlu_poly_pt_acc: 0.41459021314920447 mmlu_poly_pt_acc_stderr: 0.004268138446999436 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.40728929384965834 oab_exams_acc,exam_id__2010-01: 0.4 oab_exams_acc,exam_id__2010-02: 0.5 oab_exams_acc,exam_id__2011-03: 0.3434343434343434 oab_exams_acc,exam_id__2011-04: 0.35 oab_exams_acc,exam_id__2011-05: 0.45 oab_exams_acc,exam_id__2012-06: 0.3625 oab_exams_acc,exam_id__2012-06a: 0.525 oab_exams_acc,exam_id__2012-07: 0.425 oab_exams_acc,exam_id__2012-08: 0.425 oab_exams_acc,exam_id__2012-09: 0.33766233766233766 oab_exams_acc,exam_id__2013-10: 0.45 oab_exams_acc,exam_id__2013-11: 0.3875 oab_exams_acc,exam_id__2013-12: 0.4125 oab_exams_acc,exam_id__2014-13: 0.4 oab_exams_acc,exam_id__2014-14: 0.375 oab_exams_acc,exam_id__2014-15: 0.4230769230769231 oab_exams_acc,exam_id__2015-16: 0.475 oab_exams_acc,exam_id__2015-17: 0.41025641025641024 oab_exams_acc,exam_id__2015-18: 0.3875 oab_exams_acc,exam_id__2016-19: 0.4358974358974359 oab_exams_acc,exam_id__2016-20: 0.4125 oab_exams_acc,exam_id__2016-20a: 0.2875 oab_exams_acc,exam_id__2016-21: 0.3625 oab_exams_acc,exam_id__2017-22: 0.4625 oab_exams_acc,exam_id__2017-23: 0.375 oab_exams_acc,exam_id__2017-24: 0.3625 oab_exams_acc,exam_id__2018-25: 0.45 oab_exams_acc_stderr,all: 0.006075587781828954 oab_exams_acc_stderr,exam_id__2010-01: 0.030623794065745072 oab_exams_acc_stderr,exam_id__2010-02: 0.028916493865834454 oab_exams_acc_stderr,exam_id__2011-03: 0.02741417034369018 oab_exams_acc_stderr,exam_id__2011-04: 0.030903722521126956 oab_exams_acc_stderr,exam_id__2011-05: 0.032119480960382765 oab_exams_acc_stderr,exam_id__2012-06: 0.03092116447159791 oab_exams_acc_stderr,exam_id__2012-06a: 0.0323716859938887 oab_exams_acc_stderr,exam_id__2012-07: 0.03197065291082094 oab_exams_acc_stderr,exam_id__2012-08: 0.03199897400280611 oab_exams_acc_stderr,exam_id__2012-09: 0.03113015227112767 oab_exams_acc_stderr,exam_id__2013-10: 0.03220496656457049 oab_exams_acc_stderr,exam_id__2013-11: 0.03141807801888442 oab_exams_acc_stderr,exam_id__2013-12: 0.031705964116907834 oab_exams_acc_stderr,exam_id__2014-13: 0.031523570709171646 oab_exams_acc_stderr,exam_id__2014-14: 0.031348698849557335 oab_exams_acc_stderr,exam_id__2014-15: 0.032308173836725206 oab_exams_acc_stderr,exam_id__2015-16: 0.032275444416125405 oab_exams_acc_stderr,exam_id__2015-17: 0.032242261750832044 oab_exams_acc_stderr,exam_id__2015-18: 0.031327883513865735 oab_exams_acc_stderr,exam_id__2016-19: 0.03247625408703544 oab_exams_acc_stderr,exam_id__2016-20: 0.031762853683104365 oab_exams_acc_stderr,exam_id__2016-20a: 0.02914121034414819 oab_exams_acc_stderr,exam_id__2016-21: 0.030958650868779976 oab_exams_acc_stderr,exam_id__2017-22: 0.032214831966249774 oab_exams_acc_stderr,exam_id__2017-23: 0.03125268775237781 oab_exams_acc_stderr,exam_id__2017-24: 0.030936224842605275 oab_exams_acc_stderr,exam_id__2018-25: 0.03208477618412235 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.7297297297297297 portuguese_hate_speech_acc_stderr,all: 0.010726825349374063 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.6842412627123019 portuguese_hate_speech_f1_macro_stderr,all: 0.012055600214659706 tweetsentbr_acc,all: 0.5796019900497512 tweetsentbr_acc_stderr,all: 0.007813148658084126 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.5404009095352119 tweetsentbr_f1_macro_stderr,all: 0.008133081854048629 step: 69750