diff --git "a/sample_rate=0.05/checkpoint-17600/trainer_state.json" "b/sample_rate=0.05/checkpoint-17600/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sample_rate=0.05/checkpoint-17600/trainer_state.json" @@ -0,0 +1,4266 @@ +{ + "best_metric": 0.5727406094475693, + "best_model_checkpoint": "histv4_ftis_pretrain_tssp-smlm_0329/checkpoint-17600", + "epoch": 517.0001641791044, + "eval_steps": 100, + "global_step": 17600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.0001194029850746, + "grad_norm": 268.6054992675781, + "learning_rate": 7.462686567164179e-07, + "loss": 45.623, + "step": 50 + }, + { + "epoch": 2.0002388059701492, + "grad_norm": 213.74673461914062, + "learning_rate": 1.4925373134328358e-06, + "loss": 41.9959, + "step": 100 + }, + { + "epoch": 2.0002388059701492, + "eval_accuracy": 0.0611996064128176, + "eval_loss": 29.715543746948242, + "eval_macro_f1": 0.02837569459935811, + "eval_runtime": 100.1332, + "eval_samples_per_second": 6.711, + "eval_steps_per_second": 1.678, + "step": 100 + }, + { + "epoch": 4.0001044776119405, + "grad_norm": 132.3323516845703, + "learning_rate": 2.238805970149254e-06, + "loss": 33.1467, + "step": 150 + }, + { + "epoch": 5.000223880597015, + "grad_norm": 74.35905456542969, + "learning_rate": 2.9850746268656716e-06, + "loss": 24.9796, + "step": 200 + }, + { + "epoch": 5.000223880597015, + "eval_accuracy": 0.09009597318617783, + "eval_loss": 15.895302772521973, + "eval_macro_f1": 0.035076320344672444, + "eval_runtime": 77.8967, + "eval_samples_per_second": 8.627, + "eval_steps_per_second": 2.157, + "step": 200 + }, + { + "epoch": 7.000089552238806, + "grad_norm": 48.132564544677734, + "learning_rate": 3.7313432835820893e-06, + "loss": 17.191, + "step": 250 + }, + { + "epoch": 8.000208955223881, + "grad_norm": 16.860376358032227, + "learning_rate": 4.477611940298508e-06, + "loss": 11.448, + "step": 300 + }, + { + "epoch": 8.000208955223881, + "eval_accuracy": 0.31287949697368334, + "eval_loss": 9.061956405639648, + "eval_macro_f1": 0.07962701269903324, + "eval_runtime": 54.1275, + "eval_samples_per_second": 12.415, + "eval_steps_per_second": 3.104, + "step": 300 + }, + { + "epoch": 10.000074626865672, + "grad_norm": 9.546243667602539, + "learning_rate": 5.2238805970149255e-06, + "loss": 8.9628, + "step": 350 + }, + { + "epoch": 11.000194029850746, + "grad_norm": 28.206951141357422, + "learning_rate": 5.970149253731343e-06, + "loss": 7.4769, + "step": 400 + }, + { + "epoch": 11.000194029850746, + "eval_accuracy": 0.5129615599861176, + "eval_loss": 7.141111850738525, + "eval_macro_f1": 0.13085286125347126, + "eval_runtime": 48.1657, + "eval_samples_per_second": 13.952, + "eval_steps_per_second": 3.488, + "step": 400 + }, + { + "epoch": 13.000059701492537, + "grad_norm": 8.034100532531738, + "learning_rate": 6.716417910447762e-06, + "loss": 6.9324, + "step": 450 + }, + { + "epoch": 14.000179104477612, + "grad_norm": 13.441235542297363, + "learning_rate": 7.4626865671641785e-06, + "loss": 6.3711, + "step": 500 + }, + { + "epoch": 14.000179104477612, + "eval_accuracy": 0.5551171622832922, + "eval_loss": 6.244740962982178, + "eval_macro_f1": 0.1420517647298635, + "eval_runtime": 47.0783, + "eval_samples_per_second": 14.274, + "eval_steps_per_second": 3.569, + "step": 500 + }, + { + "epoch": 16.000044776119402, + "grad_norm": 28.859664916992188, + "learning_rate": 8.208955223880597e-06, + "loss": 6.2194, + "step": 550 + }, + { + "epoch": 17.000164179104477, + "grad_norm": 11.290647506713867, + "learning_rate": 8.955223880597016e-06, + "loss": 5.9458, + "step": 600 + }, + { + "epoch": 17.000164179104477, + "eval_accuracy": 0.5776206330621622, + "eval_loss": 5.946147441864014, + "eval_macro_f1": 0.1511808879291274, + "eval_runtime": 50.0902, + "eval_samples_per_second": 13.416, + "eval_steps_per_second": 3.354, + "step": 600 + }, + { + "epoch": 19.00002985074627, + "grad_norm": 8.279501914978027, + "learning_rate": 9.701492537313434e-06, + "loss": 5.8337, + "step": 650 + }, + { + "epoch": 20.000149253731344, + "grad_norm": 7.2510457038879395, + "learning_rate": 1.0447761194029851e-05, + "loss": 5.6989, + "step": 700 + }, + { + "epoch": 20.000149253731344, + "eval_accuracy": 0.5918539560475244, + "eval_loss": 5.41770601272583, + "eval_macro_f1": 0.1601436746174448, + "eval_runtime": 47.8725, + "eval_samples_per_second": 14.037, + "eval_steps_per_second": 3.509, + "step": 700 + }, + { + "epoch": 22.000014925373133, + "grad_norm": 15.262322425842285, + "learning_rate": 1.119402985074627e-05, + "loss": 5.268, + "step": 750 + }, + { + "epoch": 23.000134328358207, + "grad_norm": 11.269179344177246, + "learning_rate": 1.1940298507462686e-05, + "loss": 4.9652, + "step": 800 + }, + { + "epoch": 23.000134328358207, + "eval_accuracy": 0.6051046689177835, + "eval_loss": 4.733717918395996, + "eval_macro_f1": 0.16606311305728727, + "eval_runtime": 47.5619, + "eval_samples_per_second": 14.129, + "eval_steps_per_second": 3.532, + "step": 800 + }, + { + "epoch": 24.000253731343285, + "grad_norm": 7.085338115692139, + "learning_rate": 1.2686567164179105e-05, + "loss": 4.8364, + "step": 850 + }, + { + "epoch": 26.000119402985074, + "grad_norm": 7.979522705078125, + "learning_rate": 1.3432835820895523e-05, + "loss": 4.6759, + "step": 900 + }, + { + "epoch": 26.000119402985074, + "eval_accuracy": 0.6152476401929597, + "eval_loss": 4.226262092590332, + "eval_macro_f1": 0.16957818492254834, + "eval_runtime": 48.1918, + "eval_samples_per_second": 13.944, + "eval_steps_per_second": 3.486, + "step": 900 + }, + { + "epoch": 27.00023880597015, + "grad_norm": 7.781890392303467, + "learning_rate": 1.417910447761194e-05, + "loss": 4.4026, + "step": 950 + }, + { + "epoch": 29.00010447761194, + "grad_norm": 6.397170543670654, + "learning_rate": 1.4925373134328357e-05, + "loss": 4.2499, + "step": 1000 + }, + { + "epoch": 29.00010447761194, + "eval_accuracy": 0.6169249397171458, + "eval_loss": 3.9199275970458984, + "eval_macro_f1": 0.17197436046412404, + "eval_runtime": 47.5814, + "eval_samples_per_second": 14.123, + "eval_steps_per_second": 3.531, + "step": 1000 + }, + { + "epoch": 30.000223880597016, + "grad_norm": 7.5811662673950195, + "learning_rate": 1.5671641791044777e-05, + "loss": 4.1405, + "step": 1050 + }, + { + "epoch": 32.000089552238805, + "grad_norm": 7.1128668785095215, + "learning_rate": 1.6417910447761194e-05, + "loss": 4.0352, + "step": 1100 + }, + { + "epoch": 32.000089552238805, + "eval_accuracy": 0.6282504577988647, + "eval_loss": 3.6091878414154053, + "eval_macro_f1": 0.17308567573172656, + "eval_runtime": 47.6781, + "eval_samples_per_second": 14.095, + "eval_steps_per_second": 3.524, + "step": 1100 + }, + { + "epoch": 33.00020895522388, + "grad_norm": 7.023083209991455, + "learning_rate": 1.716417910447761e-05, + "loss": 3.834, + "step": 1150 + }, + { + "epoch": 35.00007462686567, + "grad_norm": 6.3213210105896, + "learning_rate": 1.791044776119403e-05, + "loss": 3.5153, + "step": 1200 + }, + { + "epoch": 35.00007462686567, + "eval_accuracy": 0.6340241852202595, + "eval_loss": 3.1947221755981445, + "eval_macro_f1": 0.17862905980558652, + "eval_runtime": 47.9366, + "eval_samples_per_second": 14.019, + "eval_steps_per_second": 3.505, + "step": 1200 + }, + { + "epoch": 36.000194029850746, + "grad_norm": 6.348028659820557, + "learning_rate": 1.865671641791045e-05, + "loss": 3.4825, + "step": 1250 + }, + { + "epoch": 38.00005970149254, + "grad_norm": 8.697322845458984, + "learning_rate": 1.9402985074626868e-05, + "loss": 3.2449, + "step": 1300 + }, + { + "epoch": 38.00005970149254, + "eval_accuracy": 0.6406358374136831, + "eval_loss": 3.046095132827759, + "eval_macro_f1": 0.17843054291839705, + "eval_runtime": 47.8837, + "eval_samples_per_second": 14.034, + "eval_steps_per_second": 3.509, + "step": 1300 + }, + { + "epoch": 39.00017910447761, + "grad_norm": 6.332484722137451, + "learning_rate": 2.0149253731343285e-05, + "loss": 3.1494, + "step": 1350 + }, + { + "epoch": 41.0000447761194, + "grad_norm": 8.93017864227295, + "learning_rate": 2.0895522388059702e-05, + "loss": 2.9193, + "step": 1400 + }, + { + "epoch": 41.0000447761194, + "eval_accuracy": 0.643553186486495, + "eval_loss": 2.703385829925537, + "eval_macro_f1": 0.1936226471515171, + "eval_runtime": 47.6235, + "eval_samples_per_second": 14.111, + "eval_steps_per_second": 3.528, + "step": 1400 + }, + { + "epoch": 42.00016417910448, + "grad_norm": 4.086354732513428, + "learning_rate": 2.164179104477612e-05, + "loss": 2.8356, + "step": 1450 + }, + { + "epoch": 44.000029850746266, + "grad_norm": 8.437738418579102, + "learning_rate": 2.238805970149254e-05, + "loss": 2.6946, + "step": 1500 + }, + { + "epoch": 44.000029850746266, + "eval_accuracy": 0.6532593768317203, + "eval_loss": 2.6700782775878906, + "eval_macro_f1": 0.20852622108881103, + "eval_runtime": 47.2348, + "eval_samples_per_second": 14.227, + "eval_steps_per_second": 3.557, + "step": 1500 + }, + { + "epoch": 45.000149253731344, + "grad_norm": 5.0795745849609375, + "learning_rate": 2.3134328358208956e-05, + "loss": 2.5071, + "step": 1550 + }, + { + "epoch": 47.00001492537314, + "grad_norm": 5.083272457122803, + "learning_rate": 2.3880597014925373e-05, + "loss": 2.5026, + "step": 1600 + }, + { + "epoch": 47.00001492537314, + "eval_accuracy": 0.6609667891358143, + "eval_loss": 2.535517930984497, + "eval_macro_f1": 0.22542057336605137, + "eval_runtime": 47.5436, + "eval_samples_per_second": 14.134, + "eval_steps_per_second": 3.534, + "step": 1600 + }, + { + "epoch": 48.00013432835821, + "grad_norm": 6.944649696350098, + "learning_rate": 2.4626865671641793e-05, + "loss": 2.3382, + "step": 1650 + }, + { + "epoch": 49.000253731343285, + "grad_norm": 3.319343328475952, + "learning_rate": 2.537313432835821e-05, + "loss": 2.2656, + "step": 1700 + }, + { + "epoch": 49.000253731343285, + "eval_accuracy": 0.6666688493875962, + "eval_loss": 2.5731303691864014, + "eval_macro_f1": 0.22558865006421702, + "eval_runtime": 47.795, + "eval_samples_per_second": 14.06, + "eval_steps_per_second": 3.515, + "step": 1700 + }, + { + "epoch": 51.00011940298508, + "grad_norm": 5.969847679138184, + "learning_rate": 2.6119402985074626e-05, + "loss": 2.2414, + "step": 1750 + }, + { + "epoch": 52.00023880597015, + "grad_norm": 4.999436855316162, + "learning_rate": 2.6865671641791047e-05, + "loss": 2.1395, + "step": 1800 + }, + { + "epoch": 52.00023880597015, + "eval_accuracy": 0.6694238725599323, + "eval_loss": 2.4594123363494873, + "eval_macro_f1": 0.23246645748504666, + "eval_runtime": 46.9918, + "eval_samples_per_second": 14.3, + "eval_steps_per_second": 3.575, + "step": 1800 + }, + { + "epoch": 54.00010447761194, + "grad_norm": 6.551823616027832, + "learning_rate": 2.7611940298507467e-05, + "loss": 2.0293, + "step": 1850 + }, + { + "epoch": 55.00022388059701, + "grad_norm": 4.601879596710205, + "learning_rate": 2.835820895522388e-05, + "loss": 2.0593, + "step": 1900 + }, + { + "epoch": 55.00022388059701, + "eval_accuracy": 0.6852335838781272, + "eval_loss": 2.3419840335845947, + "eval_macro_f1": 0.247011885458689, + "eval_runtime": 46.7709, + "eval_samples_per_second": 14.368, + "eval_steps_per_second": 3.592, + "step": 1900 + }, + { + "epoch": 57.000089552238805, + "grad_norm": 12.511756896972656, + "learning_rate": 2.91044776119403e-05, + "loss": 1.9773, + "step": 1950 + }, + { + "epoch": 58.00020895522388, + "grad_norm": 5.317741394042969, + "learning_rate": 2.9850746268656714e-05, + "loss": 1.8986, + "step": 2000 + }, + { + "epoch": 58.00020895522388, + "eval_accuracy": 0.6923152771085135, + "eval_loss": 2.2253482341766357, + "eval_macro_f1": 0.26193646209976773, + "eval_runtime": 47.6348, + "eval_samples_per_second": 14.107, + "eval_steps_per_second": 3.527, + "step": 2000 + }, + { + "epoch": 60.00007462686567, + "grad_norm": 3.723234176635742, + "learning_rate": 3.059701492537314e-05, + "loss": 1.918, + "step": 2050 + }, + { + "epoch": 61.000194029850746, + "grad_norm": 3.894078493118286, + "learning_rate": 3.1343283582089554e-05, + "loss": 1.7397, + "step": 2100 + }, + { + "epoch": 61.000194029850746, + "eval_accuracy": 0.6947100535232662, + "eval_loss": 2.2664544582366943, + "eval_macro_f1": 0.27990330207953806, + "eval_runtime": 47.3036, + "eval_samples_per_second": 14.206, + "eval_steps_per_second": 3.552, + "step": 2100 + }, + { + "epoch": 63.00005970149254, + "grad_norm": 6.1484575271606445, + "learning_rate": 3.208955223880597e-05, + "loss": 1.7487, + "step": 2150 + }, + { + "epoch": 64.00017910447761, + "grad_norm": 4.968579292297363, + "learning_rate": 3.283582089552239e-05, + "loss": 1.7003, + "step": 2200 + }, + { + "epoch": 64.00017910447761, + "eval_accuracy": 0.6917887593379874, + "eval_loss": 2.3647570610046387, + "eval_macro_f1": 0.28330040134890344, + "eval_runtime": 47.2133, + "eval_samples_per_second": 14.233, + "eval_steps_per_second": 3.558, + "step": 2200 + }, + { + "epoch": 66.00004477611941, + "grad_norm": 6.8174614906311035, + "learning_rate": 3.358208955223881e-05, + "loss": 1.6537, + "step": 2250 + }, + { + "epoch": 67.00016417910447, + "grad_norm": 4.2402215003967285, + "learning_rate": 3.432835820895522e-05, + "loss": 1.6067, + "step": 2300 + }, + { + "epoch": 67.00016417910447, + "eval_accuracy": 0.6980081086146106, + "eval_loss": 2.22788143157959, + "eval_macro_f1": 0.3110811321870333, + "eval_runtime": 47.0984, + "eval_samples_per_second": 14.268, + "eval_steps_per_second": 3.567, + "step": 2300 + }, + { + "epoch": 69.00002985074627, + "grad_norm": 9.1964111328125, + "learning_rate": 3.5074626865671645e-05, + "loss": 1.5946, + "step": 2350 + }, + { + "epoch": 70.00014925373134, + "grad_norm": 6.169743537902832, + "learning_rate": 3.582089552238806e-05, + "loss": 1.4953, + "step": 2400 + }, + { + "epoch": 70.00014925373134, + "eval_accuracy": 0.704833960903283, + "eval_loss": 2.312061071395874, + "eval_macro_f1": 0.3110782741352652, + "eval_runtime": 47.0156, + "eval_samples_per_second": 14.293, + "eval_steps_per_second": 3.573, + "step": 2400 + }, + { + "epoch": 72.00001492537314, + "grad_norm": 6.59517240524292, + "learning_rate": 3.656716417910448e-05, + "loss": 1.5211, + "step": 2450 + }, + { + "epoch": 73.00013432835821, + "grad_norm": 4.096273899078369, + "learning_rate": 3.73134328358209e-05, + "loss": 1.4503, + "step": 2500 + }, + { + "epoch": 73.00013432835821, + "eval_accuracy": 0.7087711570949992, + "eval_loss": 2.2480435371398926, + "eval_macro_f1": 0.3411433098308081, + "eval_runtime": 47.405, + "eval_samples_per_second": 14.176, + "eval_steps_per_second": 3.544, + "step": 2500 + }, + { + "epoch": 74.00025373134328, + "grad_norm": 6.388035297393799, + "learning_rate": 3.805970149253731e-05, + "loss": 1.4198, + "step": 2550 + }, + { + "epoch": 76.00011940298508, + "grad_norm": 4.234487056732178, + "learning_rate": 3.8805970149253736e-05, + "loss": 1.4005, + "step": 2600 + }, + { + "epoch": 76.00011940298508, + "eval_accuracy": 0.7116982054147312, + "eval_loss": 2.191664934158325, + "eval_macro_f1": 0.3470692590809922, + "eval_runtime": 47.0056, + "eval_samples_per_second": 14.296, + "eval_steps_per_second": 3.574, + "step": 2600 + }, + { + "epoch": 77.00023880597016, + "grad_norm": 5.115391254425049, + "learning_rate": 3.9552238805970146e-05, + "loss": 1.3576, + "step": 2650 + }, + { + "epoch": 79.00010447761194, + "grad_norm": 7.692559719085693, + "learning_rate": 4.029850746268657e-05, + "loss": 1.2953, + "step": 2700 + }, + { + "epoch": 79.00010447761194, + "eval_accuracy": 0.7055090567142044, + "eval_loss": 2.1772665977478027, + "eval_macro_f1": 0.36332081559320717, + "eval_runtime": 47.6118, + "eval_samples_per_second": 14.114, + "eval_steps_per_second": 3.529, + "step": 2700 + }, + { + "epoch": 80.00022388059702, + "grad_norm": 3.995386838912964, + "learning_rate": 4.104477611940299e-05, + "loss": 1.2939, + "step": 2750 + }, + { + "epoch": 82.0000895522388, + "grad_norm": 7.2733378410339355, + "learning_rate": 4.1791044776119404e-05, + "loss": 1.2172, + "step": 2800 + }, + { + "epoch": 82.0000895522388, + "eval_accuracy": 0.7130192262183377, + "eval_loss": 2.209099292755127, + "eval_macro_f1": 0.36709028385955894, + "eval_runtime": 46.3992, + "eval_samples_per_second": 14.483, + "eval_steps_per_second": 3.621, + "step": 2800 + }, + { + "epoch": 83.00020895522388, + "grad_norm": 3.299356460571289, + "learning_rate": 4.253731343283582e-05, + "loss": 1.2169, + "step": 2850 + }, + { + "epoch": 85.00007462686567, + "grad_norm": 3.3234920501708984, + "learning_rate": 4.328358208955224e-05, + "loss": 1.2013, + "step": 2900 + }, + { + "epoch": 85.00007462686567, + "eval_accuracy": 0.7135873464849347, + "eval_loss": 2.246211528778076, + "eval_macro_f1": 0.3726116519941989, + "eval_runtime": 47.8493, + "eval_samples_per_second": 14.044, + "eval_steps_per_second": 3.511, + "step": 2900 + }, + { + "epoch": 86.00019402985075, + "grad_norm": 7.3292155265808105, + "learning_rate": 4.402985074626866e-05, + "loss": 1.1738, + "step": 2950 + }, + { + "epoch": 88.00005970149253, + "grad_norm": 3.3872430324554443, + "learning_rate": 4.477611940298508e-05, + "loss": 1.1332, + "step": 3000 + }, + { + "epoch": 88.00005970149253, + "eval_accuracy": 0.7253864172041365, + "eval_loss": 2.1158246994018555, + "eval_macro_f1": 0.38775289553895753, + "eval_runtime": 46.487, + "eval_samples_per_second": 14.456, + "eval_steps_per_second": 3.614, + "step": 3000 + }, + { + "epoch": 89.00017910447761, + "grad_norm": 9.818177223205566, + "learning_rate": 4.5522388059701495e-05, + "loss": 1.1261, + "step": 3050 + }, + { + "epoch": 91.00004477611941, + "grad_norm": 4.810808181762695, + "learning_rate": 4.626865671641791e-05, + "loss": 1.0813, + "step": 3100 + }, + { + "epoch": 91.00004477611941, + "eval_accuracy": 0.7211373567993602, + "eval_loss": 2.2004384994506836, + "eval_macro_f1": 0.39903164515389333, + "eval_runtime": 46.4612, + "eval_samples_per_second": 14.464, + "eval_steps_per_second": 3.616, + "step": 3100 + }, + { + "epoch": 92.00016417910447, + "grad_norm": 6.049989700317383, + "learning_rate": 4.7014925373134335e-05, + "loss": 1.0726, + "step": 3150 + }, + { + "epoch": 94.00002985074627, + "grad_norm": 3.0761356353759766, + "learning_rate": 4.7761194029850745e-05, + "loss": 1.0448, + "step": 3200 + }, + { + "epoch": 94.00002985074627, + "eval_accuracy": 0.7230205989727704, + "eval_loss": 2.1663625240325928, + "eval_macro_f1": 0.4098778730791762, + "eval_runtime": 46.9691, + "eval_samples_per_second": 14.307, + "eval_steps_per_second": 3.577, + "step": 3200 + }, + { + "epoch": 95.00014925373134, + "grad_norm": 8.696450233459473, + "learning_rate": 4.850746268656717e-05, + "loss": 1.0176, + "step": 3250 + }, + { + "epoch": 97.00001492537314, + "grad_norm": 4.8120551109313965, + "learning_rate": 4.9253731343283586e-05, + "loss": 1.027, + "step": 3300 + }, + { + "epoch": 97.00001492537314, + "eval_accuracy": 0.7209191849492744, + "eval_loss": 2.2609217166900635, + "eval_macro_f1": 0.41785881652162904, + "eval_runtime": 46.2098, + "eval_samples_per_second": 14.542, + "eval_steps_per_second": 3.636, + "step": 3300 + }, + { + "epoch": 98.00013432835821, + "grad_norm": 7.96643590927124, + "learning_rate": 5e-05, + "loss": 0.9618, + "step": 3350 + }, + { + "epoch": 99.00025373134328, + "grad_norm": 8.369465827941895, + "learning_rate": 5.074626865671642e-05, + "loss": 0.9676, + "step": 3400 + }, + { + "epoch": 99.00025373134328, + "eval_accuracy": 0.7181855960238737, + "eval_loss": 2.2553093433380127, + "eval_macro_f1": 0.4205316699917775, + "eval_runtime": 45.9499, + "eval_samples_per_second": 14.625, + "eval_steps_per_second": 3.656, + "step": 3400 + }, + { + "epoch": 101.00011940298508, + "grad_norm": 3.9557459354400635, + "learning_rate": 5.149253731343284e-05, + "loss": 0.936, + "step": 3450 + }, + { + "epoch": 102.00023880597016, + "grad_norm": 4.022943019866943, + "learning_rate": 5.223880597014925e-05, + "loss": 0.9236, + "step": 3500 + }, + { + "epoch": 102.00023880597016, + "eval_accuracy": 0.7268069494664281, + "eval_loss": 2.2654848098754883, + "eval_macro_f1": 0.4296574394962598, + "eval_runtime": 46.0809, + "eval_samples_per_second": 14.583, + "eval_steps_per_second": 3.646, + "step": 3500 + }, + { + "epoch": 104.00010447761194, + "grad_norm": 7.525932312011719, + "learning_rate": 5.298507462686567e-05, + "loss": 0.9, + "step": 3550 + }, + { + "epoch": 105.00022388059702, + "grad_norm": 2.630303144454956, + "learning_rate": 5.373134328358209e-05, + "loss": 0.8823, + "step": 3600 + }, + { + "epoch": 105.00022388059702, + "eval_accuracy": 0.7255630101947479, + "eval_loss": 2.241246223449707, + "eval_macro_f1": 0.4397888033563203, + "eval_runtime": 46.7755, + "eval_samples_per_second": 14.366, + "eval_steps_per_second": 3.592, + "step": 3600 + }, + { + "epoch": 107.0000895522388, + "grad_norm": 5.248049259185791, + "learning_rate": 5.447761194029851e-05, + "loss": 0.872, + "step": 3650 + }, + { + "epoch": 108.00020895522388, + "grad_norm": 7.748460292816162, + "learning_rate": 5.5223880597014934e-05, + "loss": 0.8613, + "step": 3700 + }, + { + "epoch": 108.00020895522388, + "eval_accuracy": 0.7381425686487191, + "eval_loss": 2.257265090942383, + "eval_macro_f1": 0.43952388838271156, + "eval_runtime": 46.301, + "eval_samples_per_second": 14.514, + "eval_steps_per_second": 3.628, + "step": 3700 + }, + { + "epoch": 110.00007462686567, + "grad_norm": 8.066431999206543, + "learning_rate": 5.5970149253731344e-05, + "loss": 0.8451, + "step": 3750 + }, + { + "epoch": 111.00019402985075, + "grad_norm": 3.731245994567871, + "learning_rate": 5.671641791044776e-05, + "loss": 0.8107, + "step": 3800 + }, + { + "epoch": 111.00019402985075, + "eval_accuracy": 0.7315238054629816, + "eval_loss": 2.282832145690918, + "eval_macro_f1": 0.44124856697106224, + "eval_runtime": 46.6091, + "eval_samples_per_second": 14.418, + "eval_steps_per_second": 3.604, + "step": 3800 + }, + { + "epoch": 113.00005970149253, + "grad_norm": 3.3243250846862793, + "learning_rate": 5.7462686567164184e-05, + "loss": 0.8036, + "step": 3850 + }, + { + "epoch": 114.00017910447761, + "grad_norm": 2.9174063205718994, + "learning_rate": 5.82089552238806e-05, + "loss": 0.7601, + "step": 3900 + }, + { + "epoch": 114.00017910447761, + "eval_accuracy": 0.7360909709174372, + "eval_loss": 2.1535251140594482, + "eval_macro_f1": 0.4602722724724575, + "eval_runtime": 46.3111, + "eval_samples_per_second": 14.511, + "eval_steps_per_second": 3.628, + "step": 3900 + }, + { + "epoch": 116.00004477611941, + "grad_norm": 4.0545196533203125, + "learning_rate": 5.8955223880597025e-05, + "loss": 0.7558, + "step": 3950 + }, + { + "epoch": 117.00016417910447, + "grad_norm": 3.647667646408081, + "learning_rate": 5.970149253731343e-05, + "loss": 0.7313, + "step": 4000 + }, + { + "epoch": 117.00016417910447, + "eval_accuracy": 0.7445398264806605, + "eval_loss": 2.2534518241882324, + "eval_macro_f1": 0.45641158191286085, + "eval_runtime": 46.5445, + "eval_samples_per_second": 14.438, + "eval_steps_per_second": 3.609, + "step": 4000 + }, + { + "epoch": 119.00002985074627, + "grad_norm": 3.51348876953125, + "learning_rate": 6.044776119402985e-05, + "loss": 0.7509, + "step": 4050 + }, + { + "epoch": 120.00014925373134, + "grad_norm": 3.04976749420166, + "learning_rate": 6.119402985074628e-05, + "loss": 0.7175, + "step": 4100 + }, + { + "epoch": 120.00014925373134, + "eval_accuracy": 0.7449346489099626, + "eval_loss": 2.1976943016052246, + "eval_macro_f1": 0.46263712421355374, + "eval_runtime": 46.7545, + "eval_samples_per_second": 14.373, + "eval_steps_per_second": 3.593, + "step": 4100 + }, + { + "epoch": 122.00001492537314, + "grad_norm": 4.575667858123779, + "learning_rate": 6.194029850746269e-05, + "loss": 0.7203, + "step": 4150 + }, + { + "epoch": 123.00013432835821, + "grad_norm": 2.9554224014282227, + "learning_rate": 6.268656716417911e-05, + "loss": 0.7015, + "step": 4200 + }, + { + "epoch": 123.00013432835821, + "eval_accuracy": 0.7403036193685996, + "eval_loss": 2.2720584869384766, + "eval_macro_f1": 0.4688793326712362, + "eval_runtime": 47.2409, + "eval_samples_per_second": 14.225, + "eval_steps_per_second": 3.556, + "step": 4200 + }, + { + "epoch": 124.00025373134328, + "grad_norm": 5.688920497894287, + "learning_rate": 6.343283582089553e-05, + "loss": 0.6811, + "step": 4250 + }, + { + "epoch": 126.00011940298508, + "grad_norm": 3.802302360534668, + "learning_rate": 6.417910447761194e-05, + "loss": 0.6565, + "step": 4300 + }, + { + "epoch": 126.00011940298508, + "eval_accuracy": 0.7484118349990647, + "eval_loss": 2.255232095718384, + "eval_macro_f1": 0.47394248942462014, + "eval_runtime": 46.5503, + "eval_samples_per_second": 14.436, + "eval_steps_per_second": 3.609, + "step": 4300 + }, + { + "epoch": 127.00023880597016, + "grad_norm": 2.821439027786255, + "learning_rate": 6.492537313432836e-05, + "loss": 0.6653, + "step": 4350 + }, + { + "epoch": 129.00010447761193, + "grad_norm": 4.886723041534424, + "learning_rate": 6.567164179104478e-05, + "loss": 0.6527, + "step": 4400 + }, + { + "epoch": 129.00010447761193, + "eval_accuracy": 0.7483261319003174, + "eval_loss": 2.2446300983428955, + "eval_macro_f1": 0.48141887048714205, + "eval_runtime": 45.8818, + "eval_samples_per_second": 14.646, + "eval_steps_per_second": 3.662, + "step": 4400 + }, + { + "epoch": 130.00022388059702, + "grad_norm": 4.213831424713135, + "learning_rate": 6.64179104477612e-05, + "loss": 0.6342, + "step": 4450 + }, + { + "epoch": 132.00008955223882, + "grad_norm": 3.068998098373413, + "learning_rate": 6.716417910447762e-05, + "loss": 0.6271, + "step": 4500 + }, + { + "epoch": 132.00008955223882, + "eval_accuracy": 0.7540568593236725, + "eval_loss": 2.189602851867676, + "eval_macro_f1": 0.48183266241558287, + "eval_runtime": 46.8697, + "eval_samples_per_second": 14.338, + "eval_steps_per_second": 3.584, + "step": 4500 + }, + { + "epoch": 133.00020895522388, + "grad_norm": 5.4233174324035645, + "learning_rate": 6.791044776119403e-05, + "loss": 0.6185, + "step": 4550 + }, + { + "epoch": 135.00007462686568, + "grad_norm": 3.4873721599578857, + "learning_rate": 6.865671641791044e-05, + "loss": 0.6076, + "step": 4600 + }, + { + "epoch": 135.00007462686568, + "eval_accuracy": 0.7513138192883572, + "eval_loss": 2.2669498920440674, + "eval_macro_f1": 0.49261674718050974, + "eval_runtime": 47.5124, + "eval_samples_per_second": 14.144, + "eval_steps_per_second": 3.536, + "step": 4600 + }, + { + "epoch": 136.00019402985075, + "grad_norm": 2.7700037956237793, + "learning_rate": 6.940298507462687e-05, + "loss": 0.5877, + "step": 4650 + }, + { + "epoch": 138.00005970149255, + "grad_norm": 5.472187519073486, + "learning_rate": 7.014925373134329e-05, + "loss": 0.5832, + "step": 4700 + }, + { + "epoch": 138.00005970149255, + "eval_accuracy": 0.7553558527032176, + "eval_loss": 2.175410270690918, + "eval_macro_f1": 0.4888273575190917, + "eval_runtime": 47.1882, + "eval_samples_per_second": 14.241, + "eval_steps_per_second": 3.56, + "step": 4700 + }, + { + "epoch": 139.0001791044776, + "grad_norm": 3.7895753383636475, + "learning_rate": 7.089552238805971e-05, + "loss": 0.5608, + "step": 4750 + }, + { + "epoch": 141.0000447761194, + "grad_norm": 3.398712635040283, + "learning_rate": 7.164179104477612e-05, + "loss": 0.5743, + "step": 4800 + }, + { + "epoch": 141.0000447761194, + "eval_accuracy": 0.7574649967319431, + "eval_loss": 2.251237630844116, + "eval_macro_f1": 0.49457084397782713, + "eval_runtime": 47.113, + "eval_samples_per_second": 14.264, + "eval_steps_per_second": 3.566, + "step": 4800 + }, + { + "epoch": 142.00016417910447, + "grad_norm": 2.730689287185669, + "learning_rate": 7.238805970149254e-05, + "loss": 0.5554, + "step": 4850 + }, + { + "epoch": 144.00002985074627, + "grad_norm": 3.479031801223755, + "learning_rate": 7.313432835820896e-05, + "loss": 0.5416, + "step": 4900 + }, + { + "epoch": 144.00002985074627, + "eval_accuracy": 0.7575385083311772, + "eval_loss": 2.2671725749969482, + "eval_macro_f1": 0.49923179535556805, + "eval_runtime": 47.3726, + "eval_samples_per_second": 14.185, + "eval_steps_per_second": 3.546, + "step": 4900 + }, + { + "epoch": 145.00014925373134, + "grad_norm": 2.8180696964263916, + "learning_rate": 7.388059701492537e-05, + "loss": 0.5335, + "step": 4950 + }, + { + "epoch": 147.00001492537314, + "grad_norm": 2.1443684101104736, + "learning_rate": 7.46268656716418e-05, + "loss": 0.5245, + "step": 5000 + }, + { + "epoch": 147.00001492537314, + "eval_accuracy": 0.7573249759776639, + "eval_loss": 2.2584011554718018, + "eval_macro_f1": 0.4983590299409551, + "eval_runtime": 47.157, + "eval_samples_per_second": 14.25, + "eval_steps_per_second": 3.563, + "step": 5000 + }, + { + "epoch": 148.0001343283582, + "grad_norm": 2.9612479209899902, + "learning_rate": 7.537313432835821e-05, + "loss": 0.5293, + "step": 5050 + }, + { + "epoch": 149.0002537313433, + "grad_norm": 4.128423690795898, + "learning_rate": 7.611940298507463e-05, + "loss": 0.5345, + "step": 5100 + }, + { + "epoch": 149.0002537313433, + "eval_accuracy": 0.7455607202704149, + "eval_loss": 2.4137463569641113, + "eval_macro_f1": 0.48703216348283473, + "eval_runtime": 47.0623, + "eval_samples_per_second": 14.279, + "eval_steps_per_second": 3.57, + "step": 5100 + }, + { + "epoch": 151.00011940298506, + "grad_norm": 2.6632611751556396, + "learning_rate": 7.686567164179104e-05, + "loss": 0.5147, + "step": 5150 + }, + { + "epoch": 152.00023880597016, + "grad_norm": 2.2527358531951904, + "learning_rate": 7.761194029850747e-05, + "loss": 0.5114, + "step": 5200 + }, + { + "epoch": 152.00023880597016, + "eval_accuracy": 0.7670437360079503, + "eval_loss": 2.27756929397583, + "eval_macro_f1": 0.5094941993503374, + "eval_runtime": 47.395, + "eval_samples_per_second": 14.179, + "eval_steps_per_second": 3.545, + "step": 5200 + }, + { + "epoch": 154.00010447761193, + "grad_norm": 3.50247859954834, + "learning_rate": 7.835820895522389e-05, + "loss": 0.4988, + "step": 5250 + }, + { + "epoch": 155.00022388059702, + "grad_norm": 3.8334178924560547, + "learning_rate": 7.910447761194029e-05, + "loss": 0.4835, + "step": 5300 + }, + { + "epoch": 155.00022388059702, + "eval_accuracy": 0.7553154408967699, + "eval_loss": 2.3553121089935303, + "eval_macro_f1": 0.5093009078138683, + "eval_runtime": 46.9366, + "eval_samples_per_second": 14.317, + "eval_steps_per_second": 3.579, + "step": 5300 + }, + { + "epoch": 157.00008955223882, + "grad_norm": 2.1791555881500244, + "learning_rate": 7.985074626865672e-05, + "loss": 0.4799, + "step": 5350 + }, + { + "epoch": 158.00020895522388, + "grad_norm": 1.898905634880066, + "learning_rate": 8.059701492537314e-05, + "loss": 0.4722, + "step": 5400 + }, + { + "epoch": 158.00020895522388, + "eval_accuracy": 0.7619931760744374, + "eval_loss": 2.383127212524414, + "eval_macro_f1": 0.5017490765762928, + "eval_runtime": 46.9051, + "eval_samples_per_second": 14.327, + "eval_steps_per_second": 3.582, + "step": 5400 + }, + { + "epoch": 160.00007462686568, + "grad_norm": 2.5130512714385986, + "learning_rate": 8.134328358208956e-05, + "loss": 0.4706, + "step": 5450 + }, + { + "epoch": 161.00019402985075, + "grad_norm": 2.128020763397217, + "learning_rate": 8.208955223880597e-05, + "loss": 0.4657, + "step": 5500 + }, + { + "epoch": 161.00019402985075, + "eval_accuracy": 0.7575642260386002, + "eval_loss": 2.412935495376587, + "eval_macro_f1": 0.5069068740307354, + "eval_runtime": 47.1989, + "eval_samples_per_second": 14.238, + "eval_steps_per_second": 3.559, + "step": 5500 + }, + { + "epoch": 163.00005970149255, + "grad_norm": 3.768815755844116, + "learning_rate": 8.283582089552239e-05, + "loss": 0.4516, + "step": 5550 + }, + { + "epoch": 164.0001791044776, + "grad_norm": 2.9657809734344482, + "learning_rate": 8.358208955223881e-05, + "loss": 0.462, + "step": 5600 + }, + { + "epoch": 164.0001791044776, + "eval_accuracy": 0.7643664162602013, + "eval_loss": 2.280319929122925, + "eval_macro_f1": 0.5170303573937555, + "eval_runtime": 47.2955, + "eval_samples_per_second": 14.209, + "eval_steps_per_second": 3.552, + "step": 5600 + }, + { + "epoch": 166.0000447761194, + "grad_norm": 2.8042163848876953, + "learning_rate": 8.432835820895522e-05, + "loss": 0.4534, + "step": 5650 + }, + { + "epoch": 167.00016417910447, + "grad_norm": 2.0025835037231445, + "learning_rate": 8.507462686567164e-05, + "loss": 0.4458, + "step": 5700 + }, + { + "epoch": 167.00016417910447, + "eval_accuracy": 0.7653914066483692, + "eval_loss": 2.3632283210754395, + "eval_macro_f1": 0.5151352515015158, + "eval_runtime": 47.1398, + "eval_samples_per_second": 14.255, + "eval_steps_per_second": 3.564, + "step": 5700 + }, + { + "epoch": 169.00002985074627, + "grad_norm": 2.303767442703247, + "learning_rate": 8.582089552238807e-05, + "loss": 0.432, + "step": 5750 + }, + { + "epoch": 170.00014925373134, + "grad_norm": 3.017186403274536, + "learning_rate": 8.656716417910447e-05, + "loss": 0.4238, + "step": 5800 + }, + { + "epoch": 170.00014925373134, + "eval_accuracy": 0.7617022194425912, + "eval_loss": 2.488990306854248, + "eval_macro_f1": 0.5118568233662518, + "eval_runtime": 47.6464, + "eval_samples_per_second": 14.104, + "eval_steps_per_second": 3.526, + "step": 5800 + }, + { + "epoch": 172.00001492537314, + "grad_norm": 1.9521256685256958, + "learning_rate": 8.731343283582089e-05, + "loss": 0.4344, + "step": 5850 + }, + { + "epoch": 173.0001343283582, + "grad_norm": 2.8024206161499023, + "learning_rate": 8.805970149253732e-05, + "loss": 0.4148, + "step": 5900 + }, + { + "epoch": 173.0001343283582, + "eval_accuracy": 0.7707440768706695, + "eval_loss": 2.2938289642333984, + "eval_macro_f1": 0.507194240182011, + "eval_runtime": 46.5858, + "eval_samples_per_second": 14.425, + "eval_steps_per_second": 3.606, + "step": 5900 + }, + { + "epoch": 174.0002537313433, + "grad_norm": 3.1070268154144287, + "learning_rate": 8.880597014925374e-05, + "loss": 0.4314, + "step": 5950 + }, + { + "epoch": 176.00011940298506, + "grad_norm": 2.034417152404785, + "learning_rate": 8.955223880597016e-05, + "loss": 0.4151, + "step": 6000 + }, + { + "epoch": 176.00011940298506, + "eval_accuracy": 0.7681048052066777, + "eval_loss": 2.318552255630493, + "eval_macro_f1": 0.5191110512151402, + "eval_runtime": 47.3283, + "eval_samples_per_second": 14.199, + "eval_steps_per_second": 3.55, + "step": 6000 + }, + { + "epoch": 177.00023880597016, + "grad_norm": 1.605844259262085, + "learning_rate": 9.029850746268657e-05, + "loss": 0.4222, + "step": 6050 + }, + { + "epoch": 179.00010447761193, + "grad_norm": 3.6462390422821045, + "learning_rate": 9.104477611940299e-05, + "loss": 0.4182, + "step": 6100 + }, + { + "epoch": 179.00010447761193, + "eval_accuracy": 0.7765176233155091, + "eval_loss": 2.291574001312256, + "eval_macro_f1": 0.519695376836496, + "eval_runtime": 47.1743, + "eval_samples_per_second": 14.245, + "eval_steps_per_second": 3.561, + "step": 6100 + }, + { + "epoch": 180.00022388059702, + "grad_norm": 3.532557249069214, + "learning_rate": 9.17910447761194e-05, + "loss": 0.403, + "step": 6150 + }, + { + "epoch": 182.00008955223882, + "grad_norm": 2.6417500972747803, + "learning_rate": 9.253731343283582e-05, + "loss": 0.4018, + "step": 6200 + }, + { + "epoch": 182.00008955223882, + "eval_accuracy": 0.7722665493071009, + "eval_loss": 2.3301753997802734, + "eval_macro_f1": 0.5230308745674221, + "eval_runtime": 47.5072, + "eval_samples_per_second": 14.145, + "eval_steps_per_second": 3.536, + "step": 6200 + }, + { + "epoch": 183.00020895522388, + "grad_norm": 3.4290788173675537, + "learning_rate": 9.328358208955224e-05, + "loss": 0.3992, + "step": 6250 + }, + { + "epoch": 185.00007462686568, + "grad_norm": 2.7040562629699707, + "learning_rate": 9.402985074626867e-05, + "loss": 0.3936, + "step": 6300 + }, + { + "epoch": 185.00007462686568, + "eval_accuracy": 0.7740435124879784, + "eval_loss": 2.372661828994751, + "eval_macro_f1": 0.5289905031979247, + "eval_runtime": 47.0118, + "eval_samples_per_second": 14.294, + "eval_steps_per_second": 3.574, + "step": 6300 + }, + { + "epoch": 186.00019402985075, + "grad_norm": 2.4109134674072266, + "learning_rate": 9.477611940298507e-05, + "loss": 0.3862, + "step": 6350 + }, + { + "epoch": 188.00005970149255, + "grad_norm": 2.0369298458099365, + "learning_rate": 9.552238805970149e-05, + "loss": 0.3874, + "step": 6400 + }, + { + "epoch": 188.00005970149255, + "eval_accuracy": 0.7735755440988787, + "eval_loss": 2.3309173583984375, + "eval_macro_f1": 0.5287986553000295, + "eval_runtime": 46.2652, + "eval_samples_per_second": 14.525, + "eval_steps_per_second": 3.631, + "step": 6400 + }, + { + "epoch": 189.0001791044776, + "grad_norm": 1.5051206350326538, + "learning_rate": 9.626865671641792e-05, + "loss": 0.3659, + "step": 6450 + }, + { + "epoch": 191.0000447761194, + "grad_norm": 1.7932085990905762, + "learning_rate": 9.701492537313434e-05, + "loss": 0.3854, + "step": 6500 + }, + { + "epoch": 191.0000447761194, + "eval_accuracy": 0.770976351066773, + "eval_loss": 2.4564998149871826, + "eval_macro_f1": 0.5181360035331812, + "eval_runtime": 47.0727, + "eval_samples_per_second": 14.276, + "eval_steps_per_second": 3.569, + "step": 6500 + }, + { + "epoch": 192.00016417910447, + "grad_norm": 1.6276061534881592, + "learning_rate": 9.776119402985075e-05, + "loss": 0.3733, + "step": 6550 + }, + { + "epoch": 194.00002985074627, + "grad_norm": 2.4511585235595703, + "learning_rate": 9.850746268656717e-05, + "loss": 0.367, + "step": 6600 + }, + { + "epoch": 194.00002985074627, + "eval_accuracy": 0.7743297172942802, + "eval_loss": 2.6141209602355957, + "eval_macro_f1": 0.5268045320391473, + "eval_runtime": 47.3123, + "eval_samples_per_second": 14.203, + "eval_steps_per_second": 3.551, + "step": 6600 + }, + { + "epoch": 195.00014925373134, + "grad_norm": 1.7644522190093994, + "learning_rate": 9.925373134328359e-05, + "loss": 0.3717, + "step": 6650 + }, + { + "epoch": 197.00001492537314, + "grad_norm": 1.2266058921813965, + "learning_rate": 0.0001, + "loss": 0.3591, + "step": 6700 + }, + { + "epoch": 197.00001492537314, + "eval_accuracy": 0.7754298583860919, + "eval_loss": 2.470690965652466, + "eval_macro_f1": 0.5219236195098906, + "eval_runtime": 47.3851, + "eval_samples_per_second": 14.182, + "eval_steps_per_second": 3.545, + "step": 6700 + }, + { + "epoch": 198.0001343283582, + "grad_norm": 2.6765027046203613, + "learning_rate": 9.996072270227809e-05, + "loss": 0.3567, + "step": 6750 + }, + { + "epoch": 199.0002537313433, + "grad_norm": 1.1221798658370972, + "learning_rate": 9.992144540455617e-05, + "loss": 0.3566, + "step": 6800 + }, + { + "epoch": 199.0002537313433, + "eval_accuracy": 0.7733212232720099, + "eval_loss": 2.512014389038086, + "eval_macro_f1": 0.5369477216595325, + "eval_runtime": 46.892, + "eval_samples_per_second": 14.331, + "eval_steps_per_second": 3.583, + "step": 6800 + }, + { + "epoch": 201.00011940298506, + "grad_norm": 1.1312484741210938, + "learning_rate": 9.988216810683426e-05, + "loss": 0.3507, + "step": 6850 + }, + { + "epoch": 202.00023880597016, + "grad_norm": 1.0992048978805542, + "learning_rate": 9.984289080911233e-05, + "loss": 0.3535, + "step": 6900 + }, + { + "epoch": 202.00023880597016, + "eval_accuracy": 0.7756422871607025, + "eval_loss": 2.6135613918304443, + "eval_macro_f1": 0.5341636903665778, + "eval_runtime": 47.2453, + "eval_samples_per_second": 14.224, + "eval_steps_per_second": 3.556, + "step": 6900 + }, + { + "epoch": 204.00010447761193, + "grad_norm": 1.356156826019287, + "learning_rate": 9.980361351139042e-05, + "loss": 0.3452, + "step": 6950 + }, + { + "epoch": 205.00022388059702, + "grad_norm": 1.1029301881790161, + "learning_rate": 9.976433621366851e-05, + "loss": 0.3477, + "step": 7000 + }, + { + "epoch": 205.00022388059702, + "eval_accuracy": 0.7719194889852874, + "eval_loss": 2.531036853790283, + "eval_macro_f1": 0.5256331763670041, + "eval_runtime": 47.3112, + "eval_samples_per_second": 14.204, + "eval_steps_per_second": 3.551, + "step": 7000 + }, + { + "epoch": 207.00008955223882, + "grad_norm": 1.2718098163604736, + "learning_rate": 9.97250589159466e-05, + "loss": 0.3384, + "step": 7050 + }, + { + "epoch": 208.00020895522388, + "grad_norm": 0.9557269215583801, + "learning_rate": 9.968578161822467e-05, + "loss": 0.3423, + "step": 7100 + }, + { + "epoch": 208.00020895522388, + "eval_accuracy": 0.7719196566721647, + "eval_loss": 2.560142993927002, + "eval_macro_f1": 0.521366954897175, + "eval_runtime": 47.2449, + "eval_samples_per_second": 14.224, + "eval_steps_per_second": 3.556, + "step": 7100 + }, + { + "epoch": 210.00007462686568, + "grad_norm": 3.1388516426086426, + "learning_rate": 9.964650432050276e-05, + "loss": 0.3413, + "step": 7150 + }, + { + "epoch": 211.00019402985075, + "grad_norm": 0.9060555696487427, + "learning_rate": 9.960722702278083e-05, + "loss": 0.3287, + "step": 7200 + }, + { + "epoch": 211.00019402985075, + "eval_accuracy": 0.7799261899003544, + "eval_loss": 2.662402629852295, + "eval_macro_f1": 0.5288108834293783, + "eval_runtime": 47.1445, + "eval_samples_per_second": 14.254, + "eval_steps_per_second": 3.564, + "step": 7200 + }, + { + "epoch": 213.00005970149255, + "grad_norm": 0.8901938199996948, + "learning_rate": 9.956794972505892e-05, + "loss": 0.3332, + "step": 7250 + }, + { + "epoch": 214.0001791044776, + "grad_norm": 1.2578210830688477, + "learning_rate": 9.9528672427337e-05, + "loss": 0.3203, + "step": 7300 + }, + { + "epoch": 214.0001791044776, + "eval_accuracy": 0.7781472360718895, + "eval_loss": 2.471846103668213, + "eval_macro_f1": 0.5329869779591574, + "eval_runtime": 47.3422, + "eval_samples_per_second": 14.195, + "eval_steps_per_second": 3.549, + "step": 7300 + }, + { + "epoch": 216.0000447761194, + "grad_norm": 2.1007184982299805, + "learning_rate": 9.94893951296151e-05, + "loss": 0.3181, + "step": 7350 + }, + { + "epoch": 217.00016417910447, + "grad_norm": 1.5407541990280151, + "learning_rate": 9.945011783189317e-05, + "loss": 0.3214, + "step": 7400 + }, + { + "epoch": 217.00016417910447, + "eval_accuracy": 0.7810534381168457, + "eval_loss": 2.6066505908966064, + "eval_macro_f1": 0.5346224113824365, + "eval_runtime": 47.2487, + "eval_samples_per_second": 14.223, + "eval_steps_per_second": 3.556, + "step": 7400 + }, + { + "epoch": 219.00002985074627, + "grad_norm": 1.9181113243103027, + "learning_rate": 9.941084053417126e-05, + "loss": 0.3286, + "step": 7450 + }, + { + "epoch": 220.00014925373134, + "grad_norm": 1.2538775205612183, + "learning_rate": 9.937156323644934e-05, + "loss": 0.3226, + "step": 7500 + }, + { + "epoch": 220.00014925373134, + "eval_accuracy": 0.7828189085491943, + "eval_loss": 2.5737223625183105, + "eval_macro_f1": 0.5382420540380082, + "eval_runtime": 47.4333, + "eval_samples_per_second": 14.167, + "eval_steps_per_second": 3.542, + "step": 7500 + }, + { + "epoch": 222.00001492537314, + "grad_norm": 1.6934894323349, + "learning_rate": 9.933228593872742e-05, + "loss": 0.3215, + "step": 7550 + }, + { + "epoch": 223.0001343283582, + "grad_norm": 1.494821310043335, + "learning_rate": 9.92930086410055e-05, + "loss": 0.3176, + "step": 7600 + }, + { + "epoch": 223.0001343283582, + "eval_accuracy": 0.7782904056985046, + "eval_loss": 2.745771884918213, + "eval_macro_f1": 0.5361177192936347, + "eval_runtime": 47.1236, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 3.565, + "step": 7600 + }, + { + "epoch": 224.0002537313433, + "grad_norm": 1.815902590751648, + "learning_rate": 9.925373134328359e-05, + "loss": 0.3092, + "step": 7650 + }, + { + "epoch": 226.00011940298506, + "grad_norm": 4.263820171356201, + "learning_rate": 9.921445404556166e-05, + "loss": 0.3108, + "step": 7700 + }, + { + "epoch": 226.00011940298506, + "eval_accuracy": 0.7823510782933494, + "eval_loss": 2.6822726726531982, + "eval_macro_f1": 0.5297143170595022, + "eval_runtime": 47.3755, + "eval_samples_per_second": 14.185, + "eval_steps_per_second": 3.546, + "step": 7700 + }, + { + "epoch": 227.00023880597016, + "grad_norm": 1.1835607290267944, + "learning_rate": 9.917517674783975e-05, + "loss": 0.3089, + "step": 7750 + }, + { + "epoch": 229.00010447761193, + "grad_norm": 0.9950078725814819, + "learning_rate": 9.913589945011784e-05, + "loss": 0.3068, + "step": 7800 + }, + { + "epoch": 229.00010447761193, + "eval_accuracy": 0.7785920542828279, + "eval_loss": 2.6610989570617676, + "eval_macro_f1": 0.5329689150544215, + "eval_runtime": 47.0936, + "eval_samples_per_second": 14.269, + "eval_steps_per_second": 3.567, + "step": 7800 + }, + { + "epoch": 230.00022388059702, + "grad_norm": 2.3747150897979736, + "learning_rate": 9.909662215239593e-05, + "loss": 0.3072, + "step": 7850 + }, + { + "epoch": 232.00008955223882, + "grad_norm": 1.1120485067367554, + "learning_rate": 9.9057344854674e-05, + "loss": 0.3083, + "step": 7900 + }, + { + "epoch": 232.00008955223882, + "eval_accuracy": 0.7874200846277464, + "eval_loss": 2.534055233001709, + "eval_macro_f1": 0.5490044570794441, + "eval_runtime": 46.9375, + "eval_samples_per_second": 14.317, + "eval_steps_per_second": 3.579, + "step": 7900 + }, + { + "epoch": 233.00020895522388, + "grad_norm": 0.8811401128768921, + "learning_rate": 9.901806755695209e-05, + "loss": 0.2979, + "step": 7950 + }, + { + "epoch": 235.00007462686568, + "grad_norm": 1.5326483249664307, + "learning_rate": 9.897879025923017e-05, + "loss": 0.2983, + "step": 8000 + }, + { + "epoch": 235.00007462686568, + "eval_accuracy": 0.7819593035294035, + "eval_loss": 2.5583415031433105, + "eval_macro_f1": 0.5366010121275766, + "eval_runtime": 47.8201, + "eval_samples_per_second": 14.053, + "eval_steps_per_second": 3.513, + "step": 8000 + }, + { + "epoch": 236.00019402985075, + "grad_norm": 0.7597693800926208, + "learning_rate": 9.893951296150825e-05, + "loss": 0.2996, + "step": 8050 + }, + { + "epoch": 238.00005970149255, + "grad_norm": 1.6218539476394653, + "learning_rate": 9.890023566378633e-05, + "loss": 0.2932, + "step": 8100 + }, + { + "epoch": 238.00005970149255, + "eval_accuracy": 0.7801646614092237, + "eval_loss": 2.741665840148926, + "eval_macro_f1": 0.5375982188261168, + "eval_runtime": 47.4351, + "eval_samples_per_second": 14.167, + "eval_steps_per_second": 3.542, + "step": 8100 + }, + { + "epoch": 239.0001791044776, + "grad_norm": 1.2092583179473877, + "learning_rate": 9.886095836606442e-05, + "loss": 0.2975, + "step": 8150 + }, + { + "epoch": 241.0000447761194, + "grad_norm": 1.0931410789489746, + "learning_rate": 9.882168106834249e-05, + "loss": 0.2907, + "step": 8200 + }, + { + "epoch": 241.0000447761194, + "eval_accuracy": 0.789435718757186, + "eval_loss": 2.558954954147339, + "eval_macro_f1": 0.5485659040637261, + "eval_runtime": 47.1646, + "eval_samples_per_second": 14.248, + "eval_steps_per_second": 3.562, + "step": 8200 + }, + { + "epoch": 242.00016417910447, + "grad_norm": 1.3182649612426758, + "learning_rate": 9.87824037706206e-05, + "loss": 0.2968, + "step": 8250 + }, + { + "epoch": 244.00002985074627, + "grad_norm": 0.8145655989646912, + "learning_rate": 9.874312647289867e-05, + "loss": 0.296, + "step": 8300 + }, + { + "epoch": 244.00002985074627, + "eval_accuracy": 0.7870276295759537, + "eval_loss": 2.4877355098724365, + "eval_macro_f1": 0.5542794816493364, + "eval_runtime": 47.7574, + "eval_samples_per_second": 14.071, + "eval_steps_per_second": 3.518, + "step": 8300 + }, + { + "epoch": 245.00014925373134, + "grad_norm": 0.9134150147438049, + "learning_rate": 9.870384917517676e-05, + "loss": 0.2901, + "step": 8350 + }, + { + "epoch": 247.00001492537314, + "grad_norm": 0.8917508721351624, + "learning_rate": 9.866457187745483e-05, + "loss": 0.2894, + "step": 8400 + }, + { + "epoch": 247.00001492537314, + "eval_accuracy": 0.7826655211261566, + "eval_loss": 2.680570602416992, + "eval_macro_f1": 0.5301892902132199, + "eval_runtime": 47.6207, + "eval_samples_per_second": 14.112, + "eval_steps_per_second": 3.528, + "step": 8400 + }, + { + "epoch": 248.0001343283582, + "grad_norm": 0.8445788621902466, + "learning_rate": 9.862529457973292e-05, + "loss": 0.2939, + "step": 8450 + }, + { + "epoch": 249.0002537313433, + "grad_norm": 1.1051580905914307, + "learning_rate": 9.8586017282011e-05, + "loss": 0.2879, + "step": 8500 + }, + { + "epoch": 249.0002537313433, + "eval_accuracy": 0.7866092293105559, + "eval_loss": 2.5236289501190186, + "eval_macro_f1": 0.5414185827620794, + "eval_runtime": 47.2209, + "eval_samples_per_second": 14.231, + "eval_steps_per_second": 3.558, + "step": 8500 + }, + { + "epoch": 251.00011940298506, + "grad_norm": 1.4551324844360352, + "learning_rate": 9.854673998428908e-05, + "loss": 0.2838, + "step": 8550 + }, + { + "epoch": 252.00023880597016, + "grad_norm": 0.5893757939338684, + "learning_rate": 9.850746268656717e-05, + "loss": 0.2804, + "step": 8600 + }, + { + "epoch": 252.00023880597016, + "eval_accuracy": 0.7909514517262591, + "eval_loss": 2.443542718887329, + "eval_macro_f1": 0.5508984177646505, + "eval_runtime": 47.3031, + "eval_samples_per_second": 14.206, + "eval_steps_per_second": 3.552, + "step": 8600 + }, + { + "epoch": 254.00010447761193, + "grad_norm": 0.7699234485626221, + "learning_rate": 9.846818538884526e-05, + "loss": 0.2765, + "step": 8650 + }, + { + "epoch": 255.00022388059702, + "grad_norm": 0.7053253054618835, + "learning_rate": 9.842890809112333e-05, + "loss": 0.2774, + "step": 8700 + }, + { + "epoch": 255.00022388059702, + "eval_accuracy": 0.7837870225808468, + "eval_loss": 2.6435728073120117, + "eval_macro_f1": 0.5450153883267421, + "eval_runtime": 47.3017, + "eval_samples_per_second": 14.207, + "eval_steps_per_second": 3.552, + "step": 8700 + }, + { + "epoch": 257.0000895522388, + "grad_norm": 1.1492245197296143, + "learning_rate": 9.838963079340142e-05, + "loss": 0.2802, + "step": 8750 + }, + { + "epoch": 258.00020895522385, + "grad_norm": 0.9404826760292053, + "learning_rate": 9.83503534956795e-05, + "loss": 0.2721, + "step": 8800 + }, + { + "epoch": 258.00020895522385, + "eval_accuracy": 0.7896500252013665, + "eval_loss": 2.6450061798095703, + "eval_macro_f1": 0.5485306090805963, + "eval_runtime": 47.2949, + "eval_samples_per_second": 14.209, + "eval_steps_per_second": 3.552, + "step": 8800 + }, + { + "epoch": 260.0000746268657, + "grad_norm": 1.018480896949768, + "learning_rate": 9.831107619795759e-05, + "loss": 0.2805, + "step": 8850 + }, + { + "epoch": 261.0001940298507, + "grad_norm": 1.232864260673523, + "learning_rate": 9.827179890023567e-05, + "loss": 0.2751, + "step": 8900 + }, + { + "epoch": 261.0001940298507, + "eval_accuracy": 0.7847322693614396, + "eval_loss": 2.6299021244049072, + "eval_macro_f1": 0.5501171750049827, + "eval_runtime": 46.8857, + "eval_samples_per_second": 14.333, + "eval_steps_per_second": 3.583, + "step": 8900 + }, + { + "epoch": 263.00005970149255, + "grad_norm": 1.1408989429473877, + "learning_rate": 9.823252160251375e-05, + "loss": 0.2784, + "step": 8950 + }, + { + "epoch": 264.00017910447764, + "grad_norm": 0.9467275738716125, + "learning_rate": 9.819324430479184e-05, + "loss": 0.2746, + "step": 9000 + }, + { + "epoch": 264.00017910447764, + "eval_accuracy": 0.790590638402413, + "eval_loss": 2.6440627574920654, + "eval_macro_f1": 0.5521120943014799, + "eval_runtime": 47.2847, + "eval_samples_per_second": 14.212, + "eval_steps_per_second": 3.553, + "step": 9000 + }, + { + "epoch": 266.0000447761194, + "grad_norm": 0.7433880567550659, + "learning_rate": 9.815396700706991e-05, + "loss": 0.2799, + "step": 9050 + }, + { + "epoch": 267.0001641791045, + "grad_norm": 0.869040310382843, + "learning_rate": 9.8114689709348e-05, + "loss": 0.2745, + "step": 9100 + }, + { + "epoch": 267.0001641791045, + "eval_accuracy": 0.7878080907313669, + "eval_loss": 2.6341586112976074, + "eval_macro_f1": 0.5487867464362897, + "eval_runtime": 47.3665, + "eval_samples_per_second": 14.187, + "eval_steps_per_second": 3.547, + "step": 9100 + }, + { + "epoch": 269.0000298507463, + "grad_norm": 0.7101911306381226, + "learning_rate": 9.807541241162609e-05, + "loss": 0.2686, + "step": 9150 + }, + { + "epoch": 270.00014925373137, + "grad_norm": 1.284479022026062, + "learning_rate": 9.803613511390418e-05, + "loss": 0.2726, + "step": 9200 + }, + { + "epoch": 270.00014925373137, + "eval_accuracy": 0.7859600705340068, + "eval_loss": 2.816183090209961, + "eval_macro_f1": 0.5476339169452163, + "eval_runtime": 47.068, + "eval_samples_per_second": 14.277, + "eval_steps_per_second": 3.569, + "step": 9200 + }, + { + "epoch": 272.00001492537314, + "grad_norm": 0.7183237671852112, + "learning_rate": 9.799685781618225e-05, + "loss": 0.2699, + "step": 9250 + }, + { + "epoch": 273.00013432835823, + "grad_norm": 0.8878548741340637, + "learning_rate": 9.795758051846034e-05, + "loss": 0.2688, + "step": 9300 + }, + { + "epoch": 273.00013432835823, + "eval_accuracy": 0.7869160847012808, + "eval_loss": 2.6518239974975586, + "eval_macro_f1": 0.5489229413615944, + "eval_runtime": 47.5312, + "eval_samples_per_second": 14.138, + "eval_steps_per_second": 3.535, + "step": 9300 + }, + { + "epoch": 274.00025373134326, + "grad_norm": 1.2390247583389282, + "learning_rate": 9.791830322073841e-05, + "loss": 0.2695, + "step": 9350 + }, + { + "epoch": 276.0001194029851, + "grad_norm": 0.8914899230003357, + "learning_rate": 9.78790259230165e-05, + "loss": 0.2662, + "step": 9400 + }, + { + "epoch": 276.0001194029851, + "eval_accuracy": 0.7898089466253276, + "eval_loss": 2.6125519275665283, + "eval_macro_f1": 0.5504761711012163, + "eval_runtime": 47.2712, + "eval_samples_per_second": 14.216, + "eval_steps_per_second": 3.554, + "step": 9400 + }, + { + "epoch": 277.0002388059701, + "grad_norm": 1.1537991762161255, + "learning_rate": 9.783974862529458e-05, + "loss": 0.2683, + "step": 9450 + }, + { + "epoch": 279.00010447761196, + "grad_norm": 0.7840927839279175, + "learning_rate": 9.780047132757267e-05, + "loss": 0.2747, + "step": 9500 + }, + { + "epoch": 279.00010447761196, + "eval_accuracy": 0.7919364602596279, + "eval_loss": 2.8076581954956055, + "eval_macro_f1": 0.5537148016540232, + "eval_runtime": 47.5398, + "eval_samples_per_second": 14.136, + "eval_steps_per_second": 3.534, + "step": 9500 + }, + { + "epoch": 280.000223880597, + "grad_norm": 0.9824500679969788, + "learning_rate": 9.776119402985075e-05, + "loss": 0.2648, + "step": 9550 + }, + { + "epoch": 282.0000895522388, + "grad_norm": 1.3994109630584717, + "learning_rate": 9.772191673212884e-05, + "loss": 0.2625, + "step": 9600 + }, + { + "epoch": 282.0000895522388, + "eval_accuracy": 0.7825381372488958, + "eval_loss": 2.6892857551574707, + "eval_macro_f1": 0.5523475136970821, + "eval_runtime": 47.7211, + "eval_samples_per_second": 14.082, + "eval_steps_per_second": 3.52, + "step": 9600 + }, + { + "epoch": 283.00020895522385, + "grad_norm": 0.7064620852470398, + "learning_rate": 9.768263943440692e-05, + "loss": 0.2635, + "step": 9650 + }, + { + "epoch": 285.0000746268657, + "grad_norm": 0.5888859629631042, + "learning_rate": 9.7643362136685e-05, + "loss": 0.2624, + "step": 9700 + }, + { + "epoch": 285.0000746268657, + "eval_accuracy": 0.7912032116214447, + "eval_loss": 2.604067802429199, + "eval_macro_f1": 0.5566263038451964, + "eval_runtime": 46.7455, + "eval_samples_per_second": 14.376, + "eval_steps_per_second": 3.594, + "step": 9700 + }, + { + "epoch": 286.0001940298507, + "grad_norm": 0.8754225969314575, + "learning_rate": 9.760408483896308e-05, + "loss": 0.2631, + "step": 9750 + }, + { + "epoch": 288.00005970149255, + "grad_norm": 0.6127424836158752, + "learning_rate": 9.756480754124117e-05, + "loss": 0.2597, + "step": 9800 + }, + { + "epoch": 288.00005970149255, + "eval_accuracy": 0.7925750474028623, + "eval_loss": 2.7328107357025146, + "eval_macro_f1": 0.5571776302491236, + "eval_runtime": 46.9793, + "eval_samples_per_second": 14.304, + "eval_steps_per_second": 3.576, + "step": 9800 + }, + { + "epoch": 289.00017910447764, + "grad_norm": 0.7649573087692261, + "learning_rate": 9.752553024351924e-05, + "loss": 0.2641, + "step": 9850 + }, + { + "epoch": 291.0000447761194, + "grad_norm": 0.8664366006851196, + "learning_rate": 9.748625294579733e-05, + "loss": 0.2606, + "step": 9900 + }, + { + "epoch": 291.0000447761194, + "eval_accuracy": 0.788393101101295, + "eval_loss": 2.7174975872039795, + "eval_macro_f1": 0.5526578061565084, + "eval_runtime": 47.2915, + "eval_samples_per_second": 14.21, + "eval_steps_per_second": 3.552, + "step": 9900 + }, + { + "epoch": 292.0001641791045, + "grad_norm": 1.0027039051055908, + "learning_rate": 9.744697564807542e-05, + "loss": 0.2557, + "step": 9950 + }, + { + "epoch": 294.0000298507463, + "grad_norm": 1.417030692100525, + "learning_rate": 9.740769835035351e-05, + "loss": 0.2647, + "step": 10000 + }, + { + "epoch": 294.0000298507463, + "eval_accuracy": 0.7970415469232248, + "eval_loss": 2.6527485847473145, + "eval_macro_f1": 0.5472134608756383, + "eval_runtime": 46.9383, + "eval_samples_per_second": 14.317, + "eval_steps_per_second": 3.579, + "step": 10000 + }, + { + "epoch": 295.00014925373137, + "grad_norm": 1.033422589302063, + "learning_rate": 9.736842105263158e-05, + "loss": 0.2588, + "step": 10050 + }, + { + "epoch": 297.00001492537314, + "grad_norm": 0.8456308245658875, + "learning_rate": 9.732914375490967e-05, + "loss": 0.2635, + "step": 10100 + }, + { + "epoch": 297.00001492537314, + "eval_accuracy": 0.7895013356070728, + "eval_loss": 2.677187919616699, + "eval_macro_f1": 0.5481504454768104, + "eval_runtime": 47.407, + "eval_samples_per_second": 14.175, + "eval_steps_per_second": 3.544, + "step": 10100 + }, + { + "epoch": 298.00013432835823, + "grad_norm": 6.6318817138671875, + "learning_rate": 9.728986645718775e-05, + "loss": 0.2742, + "step": 10150 + }, + { + "epoch": 299.00025373134326, + "grad_norm": 1.197939395904541, + "learning_rate": 9.725058915946583e-05, + "loss": 0.2741, + "step": 10200 + }, + { + "epoch": 299.00025373134326, + "eval_accuracy": 0.7847721190000101, + "eval_loss": 2.836459159851074, + "eval_macro_f1": 0.5451334373587171, + "eval_runtime": 47.2669, + "eval_samples_per_second": 14.217, + "eval_steps_per_second": 3.554, + "step": 10200 + }, + { + "epoch": 301.0001194029851, + "grad_norm": 1.0314828157424927, + "learning_rate": 9.721131186174391e-05, + "loss": 0.2652, + "step": 10250 + }, + { + "epoch": 302.0002388059701, + "grad_norm": 0.7985377907752991, + "learning_rate": 9.7172034564022e-05, + "loss": 0.2573, + "step": 10300 + }, + { + "epoch": 302.0002388059701, + "eval_accuracy": 0.7957066815139581, + "eval_loss": 2.69494366645813, + "eval_macro_f1": 0.5571223461117464, + "eval_runtime": 47.6066, + "eval_samples_per_second": 14.116, + "eval_steps_per_second": 3.529, + "step": 10300 + }, + { + "epoch": 304.00010447761196, + "grad_norm": 0.5978965163230896, + "learning_rate": 9.713275726630007e-05, + "loss": 0.2547, + "step": 10350 + }, + { + "epoch": 305.000223880597, + "grad_norm": 0.6941676735877991, + "learning_rate": 9.709347996857817e-05, + "loss": 0.2542, + "step": 10400 + }, + { + "epoch": 305.000223880597, + "eval_accuracy": 0.7939871198996913, + "eval_loss": 2.6679141521453857, + "eval_macro_f1": 0.5564603934809988, + "eval_runtime": 47.5802, + "eval_samples_per_second": 14.124, + "eval_steps_per_second": 3.531, + "step": 10400 + }, + { + "epoch": 307.0000895522388, + "grad_norm": 0.8010326623916626, + "learning_rate": 9.705420267085625e-05, + "loss": 0.2543, + "step": 10450 + }, + { + "epoch": 308.00020895522385, + "grad_norm": 0.731414258480072, + "learning_rate": 9.701492537313434e-05, + "loss": 0.257, + "step": 10500 + }, + { + "epoch": 308.00020895522385, + "eval_accuracy": 0.7923619205893271, + "eval_loss": 2.598283052444458, + "eval_macro_f1": 0.5543807824016451, + "eval_runtime": 47.3167, + "eval_samples_per_second": 14.202, + "eval_steps_per_second": 3.551, + "step": 10500 + }, + { + "epoch": 310.0000746268657, + "grad_norm": 0.9406434297561646, + "learning_rate": 9.697564807541241e-05, + "loss": 0.2532, + "step": 10550 + }, + { + "epoch": 311.0001940298507, + "grad_norm": 0.745418131351471, + "learning_rate": 9.69363707776905e-05, + "loss": 0.2544, + "step": 10600 + }, + { + "epoch": 311.0001940298507, + "eval_accuracy": 0.786760543342164, + "eval_loss": 2.7960455417633057, + "eval_macro_f1": 0.5421744876163888, + "eval_runtime": 47.3001, + "eval_samples_per_second": 14.207, + "eval_steps_per_second": 3.552, + "step": 10600 + }, + { + "epoch": 313.00005970149255, + "grad_norm": 0.6219696402549744, + "learning_rate": 9.689709347996858e-05, + "loss": 0.2499, + "step": 10650 + }, + { + "epoch": 314.00017910447764, + "grad_norm": 0.5295397043228149, + "learning_rate": 9.685781618224666e-05, + "loss": 0.2499, + "step": 10700 + }, + { + "epoch": 314.00017910447764, + "eval_accuracy": 0.791158556318065, + "eval_loss": 2.8693225383758545, + "eval_macro_f1": 0.5504348291506833, + "eval_runtime": 47.0711, + "eval_samples_per_second": 14.276, + "eval_steps_per_second": 3.569, + "step": 10700 + }, + { + "epoch": 316.0000447761194, + "grad_norm": 0.7811539173126221, + "learning_rate": 9.681853888452475e-05, + "loss": 0.2489, + "step": 10750 + }, + { + "epoch": 317.0001641791045, + "grad_norm": 0.8086901903152466, + "learning_rate": 9.677926158680283e-05, + "loss": 0.2476, + "step": 10800 + }, + { + "epoch": 317.0001641791045, + "eval_accuracy": 0.7919355369452209, + "eval_loss": 2.7042899131774902, + "eval_macro_f1": 0.5501142646737022, + "eval_runtime": 47.4393, + "eval_samples_per_second": 14.165, + "eval_steps_per_second": 3.541, + "step": 10800 + }, + { + "epoch": 319.0000298507463, + "grad_norm": 0.9459484219551086, + "learning_rate": 9.673998428908092e-05, + "loss": 0.2471, + "step": 10850 + }, + { + "epoch": 320.00014925373137, + "grad_norm": 0.8346372246742249, + "learning_rate": 9.6700706991359e-05, + "loss": 0.2472, + "step": 10900 + }, + { + "epoch": 320.00014925373137, + "eval_accuracy": 0.7884674237489783, + "eval_loss": 2.732384443283081, + "eval_macro_f1": 0.5547433723908648, + "eval_runtime": 47.126, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 3.565, + "step": 10900 + }, + { + "epoch": 322.00001492537314, + "grad_norm": 0.642440140247345, + "learning_rate": 9.666142969363708e-05, + "loss": 0.2478, + "step": 10950 + }, + { + "epoch": 323.00013432835823, + "grad_norm": 0.636252760887146, + "learning_rate": 9.662215239591517e-05, + "loss": 0.2497, + "step": 11000 + }, + { + "epoch": 323.00013432835823, + "eval_accuracy": 0.7925603378464738, + "eval_loss": 2.6631999015808105, + "eval_macro_f1": 0.5618251107149661, + "eval_runtime": 47.565, + "eval_samples_per_second": 14.128, + "eval_steps_per_second": 3.532, + "step": 11000 + }, + { + "epoch": 324.00025373134326, + "grad_norm": 1.3599799871444702, + "learning_rate": 9.658287509819325e-05, + "loss": 0.2503, + "step": 11050 + }, + { + "epoch": 326.0001194029851, + "grad_norm": 1.2774139642715454, + "learning_rate": 9.654359780047133e-05, + "loss": 0.2473, + "step": 11100 + }, + { + "epoch": 326.0001194029851, + "eval_accuracy": 0.7879109320704504, + "eval_loss": 2.9122538566589355, + "eval_macro_f1": 0.5541569137459901, + "eval_runtime": 47.7, + "eval_samples_per_second": 14.088, + "eval_steps_per_second": 3.522, + "step": 11100 + }, + { + "epoch": 327.0002388059701, + "grad_norm": 0.9216606616973877, + "learning_rate": 9.650432050274942e-05, + "loss": 0.2484, + "step": 11150 + }, + { + "epoch": 329.00010447761196, + "grad_norm": 0.8243388533592224, + "learning_rate": 9.646504320502749e-05, + "loss": 0.2459, + "step": 11200 + }, + { + "epoch": 329.00010447761196, + "eval_accuracy": 0.7940223167520113, + "eval_loss": 2.8099050521850586, + "eval_macro_f1": 0.5574769707590426, + "eval_runtime": 46.71, + "eval_samples_per_second": 14.387, + "eval_steps_per_second": 3.597, + "step": 11200 + }, + { + "epoch": 330.000223880597, + "grad_norm": 0.7237550616264343, + "learning_rate": 9.642576590730558e-05, + "loss": 0.2428, + "step": 11250 + }, + { + "epoch": 332.0000895522388, + "grad_norm": 0.6807400584220886, + "learning_rate": 9.638648860958367e-05, + "loss": 0.2464, + "step": 11300 + }, + { + "epoch": 332.0000895522388, + "eval_accuracy": 0.7913753483227388, + "eval_loss": 2.902190685272217, + "eval_macro_f1": 0.5599005022452939, + "eval_runtime": 47.2182, + "eval_samples_per_second": 14.232, + "eval_steps_per_second": 3.558, + "step": 11300 + }, + { + "epoch": 333.00020895522385, + "grad_norm": 0.7416890859603882, + "learning_rate": 9.634721131186176e-05, + "loss": 0.2403, + "step": 11350 + }, + { + "epoch": 335.0000746268657, + "grad_norm": 0.5639162063598633, + "learning_rate": 9.630793401413983e-05, + "loss": 0.244, + "step": 11400 + }, + { + "epoch": 335.0000746268657, + "eval_accuracy": 0.7934693108067071, + "eval_loss": 2.7419607639312744, + "eval_macro_f1": 0.5625603653542266, + "eval_runtime": 47.8488, + "eval_samples_per_second": 14.044, + "eval_steps_per_second": 3.511, + "step": 11400 + }, + { + "epoch": 336.0001940298507, + "grad_norm": 0.6209690570831299, + "learning_rate": 9.626865671641792e-05, + "loss": 0.2489, + "step": 11450 + }, + { + "epoch": 338.00005970149255, + "grad_norm": 0.6081991195678711, + "learning_rate": 9.6229379418696e-05, + "loss": 0.2414, + "step": 11500 + }, + { + "epoch": 338.00005970149255, + "eval_accuracy": 0.789204515580589, + "eval_loss": 2.7637839317321777, + "eval_macro_f1": 0.5528043280025945, + "eval_runtime": 47.1355, + "eval_samples_per_second": 14.257, + "eval_steps_per_second": 3.564, + "step": 11500 + }, + { + "epoch": 339.00017910447764, + "grad_norm": 0.5305143594741821, + "learning_rate": 9.619010212097408e-05, + "loss": 0.242, + "step": 11550 + }, + { + "epoch": 341.0000447761194, + "grad_norm": 0.8079513907432556, + "learning_rate": 9.615082482325216e-05, + "loss": 0.2438, + "step": 11600 + }, + { + "epoch": 341.0000447761194, + "eval_accuracy": 0.7923774212438941, + "eval_loss": 2.830327272415161, + "eval_macro_f1": 0.5523658759650758, + "eval_runtime": 47.3422, + "eval_samples_per_second": 14.195, + "eval_steps_per_second": 3.549, + "step": 11600 + }, + { + "epoch": 342.0001641791045, + "grad_norm": 0.6239765882492065, + "learning_rate": 9.611154752553025e-05, + "loss": 0.2393, + "step": 11650 + }, + { + "epoch": 344.0000298507463, + "grad_norm": 0.6470430493354797, + "learning_rate": 9.607227022780834e-05, + "loss": 0.2439, + "step": 11700 + }, + { + "epoch": 344.0000298507463, + "eval_accuracy": 0.7927595063327648, + "eval_loss": 2.9781057834625244, + "eval_macro_f1": 0.5551366068357508, + "eval_runtime": 48.0732, + "eval_samples_per_second": 13.979, + "eval_steps_per_second": 3.495, + "step": 11700 + }, + { + "epoch": 345.00014925373137, + "grad_norm": 0.7119585275650024, + "learning_rate": 9.603299293008642e-05, + "loss": 0.2429, + "step": 11750 + }, + { + "epoch": 347.00001492537314, + "grad_norm": 0.8385061025619507, + "learning_rate": 9.59937156323645e-05, + "loss": 0.2425, + "step": 11800 + }, + { + "epoch": 347.00001492537314, + "eval_accuracy": 0.7887448208699516, + "eval_loss": 2.865709066390991, + "eval_macro_f1": 0.5585402127206076, + "eval_runtime": 47.1321, + "eval_samples_per_second": 14.258, + "eval_steps_per_second": 3.564, + "step": 11800 + }, + { + "epoch": 348.00013432835823, + "grad_norm": 0.8892908692359924, + "learning_rate": 9.595443833464259e-05, + "loss": 0.2418, + "step": 11850 + }, + { + "epoch": 349.00025373134326, + "grad_norm": 1.0612225532531738, + "learning_rate": 9.591516103692066e-05, + "loss": 0.2423, + "step": 11900 + }, + { + "epoch": 349.00025373134326, + "eval_accuracy": 0.7931865688090918, + "eval_loss": 2.877469778060913, + "eval_macro_f1": 0.5569887814639958, + "eval_runtime": 47.3519, + "eval_samples_per_second": 14.192, + "eval_steps_per_second": 3.548, + "step": 11900 + }, + { + "epoch": 351.0001194029851, + "grad_norm": 1.1128082275390625, + "learning_rate": 9.587588373919875e-05, + "loss": 0.2409, + "step": 11950 + }, + { + "epoch": 352.0002388059701, + "grad_norm": 0.7269352674484253, + "learning_rate": 9.583660644147682e-05, + "loss": 0.2423, + "step": 12000 + }, + { + "epoch": 352.0002388059701, + "eval_accuracy": 0.7936902924690483, + "eval_loss": 2.79756760597229, + "eval_macro_f1": 0.5654473783948293, + "eval_runtime": 47.8729, + "eval_samples_per_second": 14.037, + "eval_steps_per_second": 3.509, + "step": 12000 + }, + { + "epoch": 354.00010447761196, + "grad_norm": 0.6745342016220093, + "learning_rate": 9.579732914375491e-05, + "loss": 0.2378, + "step": 12050 + }, + { + "epoch": 355.000223880597, + "grad_norm": 0.5909808874130249, + "learning_rate": 9.575805184603299e-05, + "loss": 0.2358, + "step": 12100 + }, + { + "epoch": 355.000223880597, + "eval_accuracy": 0.7920293615244239, + "eval_loss": 2.834315776824951, + "eval_macro_f1": 0.5570038580742129, + "eval_runtime": 47.5692, + "eval_samples_per_second": 14.127, + "eval_steps_per_second": 3.532, + "step": 12100 + }, + { + "epoch": 357.0000895522388, + "grad_norm": 0.554900586605072, + "learning_rate": 9.571877454831109e-05, + "loss": 0.2391, + "step": 12150 + }, + { + "epoch": 358.00020895522385, + "grad_norm": 0.6298791766166687, + "learning_rate": 9.567949725058916e-05, + "loss": 0.234, + "step": 12200 + }, + { + "epoch": 358.00020895522385, + "eval_accuracy": 0.7900353596259864, + "eval_loss": 2.894160270690918, + "eval_macro_f1": 0.554601086611762, + "eval_runtime": 47.5324, + "eval_samples_per_second": 14.138, + "eval_steps_per_second": 3.534, + "step": 12200 + }, + { + "epoch": 360.0000746268657, + "grad_norm": 0.5867542624473572, + "learning_rate": 9.564021995286725e-05, + "loss": 0.2406, + "step": 12250 + }, + { + "epoch": 361.0001940298507, + "grad_norm": 1.3311362266540527, + "learning_rate": 9.560094265514533e-05, + "loss": 0.233, + "step": 12300 + }, + { + "epoch": 361.0001940298507, + "eval_accuracy": 0.7906419268557261, + "eval_loss": 2.8335020542144775, + "eval_macro_f1": 0.5615948233950084, + "eval_runtime": 47.5727, + "eval_samples_per_second": 14.126, + "eval_steps_per_second": 3.531, + "step": 12300 + }, + { + "epoch": 363.00005970149255, + "grad_norm": 0.7499290704727173, + "learning_rate": 9.556166535742342e-05, + "loss": 0.2342, + "step": 12350 + }, + { + "epoch": 364.00017910447764, + "grad_norm": 0.7394700050354004, + "learning_rate": 9.552238805970149e-05, + "loss": 0.239, + "step": 12400 + }, + { + "epoch": 364.00017910447764, + "eval_accuracy": 0.7907973817846747, + "eval_loss": 2.9565868377685547, + "eval_macro_f1": 0.5572538384336851, + "eval_runtime": 47.6072, + "eval_samples_per_second": 14.115, + "eval_steps_per_second": 3.529, + "step": 12400 + }, + { + "epoch": 366.0000447761194, + "grad_norm": 0.8146505951881409, + "learning_rate": 9.548311076197958e-05, + "loss": 0.2418, + "step": 12450 + }, + { + "epoch": 367.0001641791045, + "grad_norm": 0.9361321926116943, + "learning_rate": 9.544383346425765e-05, + "loss": 0.2356, + "step": 12500 + }, + { + "epoch": 367.0001641791045, + "eval_accuracy": 0.7946960808288022, + "eval_loss": 2.767129421234131, + "eval_macro_f1": 0.5664574567175766, + "eval_runtime": 46.9666, + "eval_samples_per_second": 14.308, + "eval_steps_per_second": 3.577, + "step": 12500 + }, + { + "epoch": 369.0000298507463, + "grad_norm": 0.696810781955719, + "learning_rate": 9.540455616653576e-05, + "loss": 0.2412, + "step": 12550 + }, + { + "epoch": 370.00014925373137, + "grad_norm": 0.5871305465698242, + "learning_rate": 9.536527886881383e-05, + "loss": 0.2342, + "step": 12600 + }, + { + "epoch": 370.00014925373137, + "eval_accuracy": 0.7839926483509609, + "eval_loss": 2.864996910095215, + "eval_macro_f1": 0.5583051459410961, + "eval_runtime": 47.3437, + "eval_samples_per_second": 14.194, + "eval_steps_per_second": 3.549, + "step": 12600 + }, + { + "epoch": 372.00001492537314, + "grad_norm": 0.7368417978286743, + "learning_rate": 9.532600157109192e-05, + "loss": 0.2325, + "step": 12650 + }, + { + "epoch": 373.00013432835823, + "grad_norm": 0.7407382130622864, + "learning_rate": 9.528672427336999e-05, + "loss": 0.2349, + "step": 12700 + }, + { + "epoch": 373.00013432835823, + "eval_accuracy": 0.790613991706593, + "eval_loss": 2.9068164825439453, + "eval_macro_f1": 0.556631032697489, + "eval_runtime": 47.1608, + "eval_samples_per_second": 14.249, + "eval_steps_per_second": 3.562, + "step": 12700 + }, + { + "epoch": 374.00025373134326, + "grad_norm": 0.7086721062660217, + "learning_rate": 9.524744697564808e-05, + "loss": 0.2274, + "step": 12750 + }, + { + "epoch": 376.0001194029851, + "grad_norm": 0.7247728109359741, + "learning_rate": 9.520816967792616e-05, + "loss": 0.2316, + "step": 12800 + }, + { + "epoch": 376.0001194029851, + "eval_accuracy": 0.789248660513699, + "eval_loss": 2.8360400199890137, + "eval_macro_f1": 0.5595493770627399, + "eval_runtime": 48.0404, + "eval_samples_per_second": 13.988, + "eval_steps_per_second": 3.497, + "step": 12800 + }, + { + "epoch": 377.0002388059701, + "grad_norm": 0.7251562476158142, + "learning_rate": 9.516889238020424e-05, + "loss": 0.2343, + "step": 12850 + }, + { + "epoch": 379.00010447761196, + "grad_norm": 0.5742783546447754, + "learning_rate": 9.512961508248233e-05, + "loss": 0.235, + "step": 12900 + }, + { + "epoch": 379.00010447761196, + "eval_accuracy": 0.7934846895919643, + "eval_loss": 2.896634817123413, + "eval_macro_f1": 0.5570722955247589, + "eval_runtime": 47.4409, + "eval_samples_per_second": 14.165, + "eval_steps_per_second": 3.541, + "step": 12900 + }, + { + "epoch": 380.000223880597, + "grad_norm": 0.6099568009376526, + "learning_rate": 9.509033778476041e-05, + "loss": 0.2316, + "step": 12950 + }, + { + "epoch": 382.0000895522388, + "grad_norm": 0.9198169112205505, + "learning_rate": 9.50510604870385e-05, + "loss": 0.2299, + "step": 13000 + }, + { + "epoch": 382.0000895522388, + "eval_accuracy": 0.7860391176212975, + "eval_loss": 3.1634085178375244, + "eval_macro_f1": 0.5521063801230666, + "eval_runtime": 47.3753, + "eval_samples_per_second": 14.185, + "eval_steps_per_second": 3.546, + "step": 13000 + }, + { + "epoch": 383.00020895522385, + "grad_norm": 0.8543654084205627, + "learning_rate": 9.501178318931658e-05, + "loss": 0.2472, + "step": 13050 + }, + { + "epoch": 385.0000746268657, + "grad_norm": 0.7041712999343872, + "learning_rate": 9.497250589159466e-05, + "loss": 0.2445, + "step": 13100 + }, + { + "epoch": 385.0000746268657, + "eval_accuracy": 0.7892383976432757, + "eval_loss": 2.8832931518554688, + "eval_macro_f1": 0.5587841270333588, + "eval_runtime": 47.5048, + "eval_samples_per_second": 14.146, + "eval_steps_per_second": 3.536, + "step": 13100 + }, + { + "epoch": 386.0001940298507, + "grad_norm": 1.4760349988937378, + "learning_rate": 9.493322859387275e-05, + "loss": 0.2336, + "step": 13150 + }, + { + "epoch": 388.00005970149255, + "grad_norm": 1.284757137298584, + "learning_rate": 9.489395129615084e-05, + "loss": 0.2423, + "step": 13200 + }, + { + "epoch": 388.00005970149255, + "eval_accuracy": 0.7912648604343278, + "eval_loss": 2.917445182800293, + "eval_macro_f1": 0.562716706641199, + "eval_runtime": 47.9414, + "eval_samples_per_second": 14.017, + "eval_steps_per_second": 3.504, + "step": 13200 + }, + { + "epoch": 389.00017910447764, + "grad_norm": 0.6278606653213501, + "learning_rate": 9.485467399842891e-05, + "loss": 0.2361, + "step": 13250 + }, + { + "epoch": 391.0000447761194, + "grad_norm": 0.7342977523803711, + "learning_rate": 9.4815396700707e-05, + "loss": 0.2309, + "step": 13300 + }, + { + "epoch": 391.0000447761194, + "eval_accuracy": 0.7926036133674561, + "eval_loss": 2.8917057514190674, + "eval_macro_f1": 0.5649533534714449, + "eval_runtime": 47.8135, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 3.514, + "step": 13300 + }, + { + "epoch": 392.0001641791045, + "grad_norm": 0.5351797342300415, + "learning_rate": 9.477611940298507e-05, + "loss": 0.2299, + "step": 13350 + }, + { + "epoch": 394.0000298507463, + "grad_norm": 0.6577014923095703, + "learning_rate": 9.473684210526316e-05, + "loss": 0.2256, + "step": 13400 + }, + { + "epoch": 394.0000298507463, + "eval_accuracy": 0.794105689399524, + "eval_loss": 2.81776762008667, + "eval_macro_f1": 0.5570123615986171, + "eval_runtime": 47.7412, + "eval_samples_per_second": 14.076, + "eval_steps_per_second": 3.519, + "step": 13400 + }, + { + "epoch": 395.00014925373137, + "grad_norm": 0.6138007640838623, + "learning_rate": 9.469756480754125e-05, + "loss": 0.2265, + "step": 13450 + }, + { + "epoch": 397.00001492537314, + "grad_norm": 0.7050625681877136, + "learning_rate": 9.465828750981934e-05, + "loss": 0.2272, + "step": 13500 + }, + { + "epoch": 397.00001492537314, + "eval_accuracy": 0.7960931460653297, + "eval_loss": 3.073035717010498, + "eval_macro_f1": 0.5627541624009209, + "eval_runtime": 47.7148, + "eval_samples_per_second": 14.084, + "eval_steps_per_second": 3.521, + "step": 13500 + }, + { + "epoch": 398.00013432835823, + "grad_norm": 0.740837812423706, + "learning_rate": 9.461901021209741e-05, + "loss": 0.2286, + "step": 13550 + }, + { + "epoch": 399.00025373134326, + "grad_norm": 1.2674388885498047, + "learning_rate": 9.45797329143755e-05, + "loss": 0.2235, + "step": 13600 + }, + { + "epoch": 399.00025373134326, + "eval_accuracy": 0.7954822451591492, + "eval_loss": 2.7562060356140137, + "eval_macro_f1": 0.5633614160024021, + "eval_runtime": 47.4176, + "eval_samples_per_second": 14.172, + "eval_steps_per_second": 3.543, + "step": 13600 + }, + { + "epoch": 401.0001194029851, + "grad_norm": 0.6072332859039307, + "learning_rate": 9.454045561665358e-05, + "loss": 0.2277, + "step": 13650 + }, + { + "epoch": 402.0002388059701, + "grad_norm": 0.6308974027633667, + "learning_rate": 9.450117831893166e-05, + "loss": 0.2247, + "step": 13700 + }, + { + "epoch": 402.0002388059701, + "eval_accuracy": 0.7957475615909412, + "eval_loss": 3.088440418243408, + "eval_macro_f1": 0.5583174708681726, + "eval_runtime": 47.3848, + "eval_samples_per_second": 14.182, + "eval_steps_per_second": 3.545, + "step": 13700 + }, + { + "epoch": 404.00010447761196, + "grad_norm": 0.5017272233963013, + "learning_rate": 9.446190102120974e-05, + "loss": 0.2293, + "step": 13750 + }, + { + "epoch": 405.000223880597, + "grad_norm": 0.6886133551597595, + "learning_rate": 9.442262372348783e-05, + "loss": 0.2281, + "step": 13800 + }, + { + "epoch": 405.000223880597, + "eval_accuracy": 0.7960166887197784, + "eval_loss": 2.7946841716766357, + "eval_macro_f1": 0.5657979052528688, + "eval_runtime": 47.8335, + "eval_samples_per_second": 14.049, + "eval_steps_per_second": 3.512, + "step": 13800 + }, + { + "epoch": 407.0000895522388, + "grad_norm": 1.1009780168533325, + "learning_rate": 9.438334642576592e-05, + "loss": 0.2269, + "step": 13850 + }, + { + "epoch": 408.00020895522385, + "grad_norm": 0.6575526595115662, + "learning_rate": 9.4344069128044e-05, + "loss": 0.2286, + "step": 13900 + }, + { + "epoch": 408.00020895522385, + "eval_accuracy": 0.7951492968182124, + "eval_loss": 2.8645179271698, + "eval_macro_f1": 0.5634899829818933, + "eval_runtime": 47.8181, + "eval_samples_per_second": 14.053, + "eval_steps_per_second": 3.513, + "step": 13900 + }, + { + "epoch": 410.0000746268657, + "grad_norm": 0.6905514597892761, + "learning_rate": 9.430479183032208e-05, + "loss": 0.2244, + "step": 13950 + }, + { + "epoch": 411.0001940298507, + "grad_norm": 0.7627719640731812, + "learning_rate": 9.426551453260017e-05, + "loss": 0.2261, + "step": 14000 + }, + { + "epoch": 411.0001940298507, + "eval_accuracy": 0.7967005627569268, + "eval_loss": 2.7301526069641113, + "eval_macro_f1": 0.5609372658644085, + "eval_runtime": 47.6283, + "eval_samples_per_second": 14.109, + "eval_steps_per_second": 3.527, + "step": 14000 + }, + { + "epoch": 413.00005970149255, + "grad_norm": 0.5656907558441162, + "learning_rate": 9.422623723487824e-05, + "loss": 0.2282, + "step": 14050 + }, + { + "epoch": 414.00017910447764, + "grad_norm": 0.5546941161155701, + "learning_rate": 9.418695993715633e-05, + "loss": 0.2223, + "step": 14100 + }, + { + "epoch": 414.00017910447764, + "eval_accuracy": 0.794729354225492, + "eval_loss": 2.8526339530944824, + "eval_macro_f1": 0.5661770349565185, + "eval_runtime": 47.3682, + "eval_samples_per_second": 14.187, + "eval_steps_per_second": 3.547, + "step": 14100 + }, + { + "epoch": 416.0000447761194, + "grad_norm": 0.5283498764038086, + "learning_rate": 9.41476826394344e-05, + "loss": 0.2212, + "step": 14150 + }, + { + "epoch": 417.0001641791045, + "grad_norm": 0.40664082765579224, + "learning_rate": 9.41084053417125e-05, + "loss": 0.2256, + "step": 14200 + }, + { + "epoch": 417.0001641791045, + "eval_accuracy": 0.7948865782443788, + "eval_loss": 3.010794162750244, + "eval_macro_f1": 0.561869748710315, + "eval_runtime": 48.0057, + "eval_samples_per_second": 13.998, + "eval_steps_per_second": 3.5, + "step": 14200 + }, + { + "epoch": 419.0000298507463, + "grad_norm": 0.6011142730712891, + "learning_rate": 9.406912804399057e-05, + "loss": 0.2229, + "step": 14250 + }, + { + "epoch": 420.00014925373137, + "grad_norm": 0.6723529100418091, + "learning_rate": 9.402985074626867e-05, + "loss": 0.2204, + "step": 14300 + }, + { + "epoch": 420.00014925373137, + "eval_accuracy": 0.7954629404366225, + "eval_loss": 3.007234811782837, + "eval_macro_f1": 0.567117889403982, + "eval_runtime": 47.9111, + "eval_samples_per_second": 14.026, + "eval_steps_per_second": 3.506, + "step": 14300 + }, + { + "epoch": 422.00001492537314, + "grad_norm": 0.6365390419960022, + "learning_rate": 9.399057344854674e-05, + "loss": 0.2218, + "step": 14350 + }, + { + "epoch": 423.00013432835823, + "grad_norm": 0.5715621709823608, + "learning_rate": 9.395129615082483e-05, + "loss": 0.2229, + "step": 14400 + }, + { + "epoch": 423.00013432835823, + "eval_accuracy": 0.7928832663435474, + "eval_loss": 3.068387746810913, + "eval_macro_f1": 0.5609785945133656, + "eval_runtime": 48.0832, + "eval_samples_per_second": 13.976, + "eval_steps_per_second": 3.494, + "step": 14400 + }, + { + "epoch": 424.00025373134326, + "grad_norm": 1.438861608505249, + "learning_rate": 9.391201885310291e-05, + "loss": 0.2205, + "step": 14450 + }, + { + "epoch": 426.0001194029851, + "grad_norm": 1.0318790674209595, + "learning_rate": 9.3872741555381e-05, + "loss": 0.2197, + "step": 14500 + }, + { + "epoch": 426.0001194029851, + "eval_accuracy": 0.7950800140336203, + "eval_loss": 3.013770580291748, + "eval_macro_f1": 0.5577260098361676, + "eval_runtime": 48.0198, + "eval_samples_per_second": 13.994, + "eval_steps_per_second": 3.499, + "step": 14500 + }, + { + "epoch": 427.0002388059701, + "grad_norm": 0.647678017616272, + "learning_rate": 9.383346425765907e-05, + "loss": 0.2233, + "step": 14550 + }, + { + "epoch": 429.00010447761196, + "grad_norm": 0.7036944031715393, + "learning_rate": 9.379418695993716e-05, + "loss": 0.2233, + "step": 14600 + }, + { + "epoch": 429.00010447761196, + "eval_accuracy": 0.7950184922635065, + "eval_loss": 2.963728666305542, + "eval_macro_f1": 0.5639592906339586, + "eval_runtime": 47.6497, + "eval_samples_per_second": 14.103, + "eval_steps_per_second": 3.526, + "step": 14600 + }, + { + "epoch": 430.000223880597, + "grad_norm": 0.6998516321182251, + "learning_rate": 9.375490966221523e-05, + "loss": 0.2226, + "step": 14650 + }, + { + "epoch": 432.0000895522388, + "grad_norm": 0.5658679008483887, + "learning_rate": 9.371563236449332e-05, + "loss": 0.221, + "step": 14700 + }, + { + "epoch": 432.0000895522388, + "eval_accuracy": 0.7930678936723867, + "eval_loss": 2.9764347076416016, + "eval_macro_f1": 0.5615296744702556, + "eval_runtime": 47.8172, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 3.513, + "step": 14700 + }, + { + "epoch": 433.00020895522385, + "grad_norm": 0.8156617283821106, + "learning_rate": 9.367635506677141e-05, + "loss": 0.2191, + "step": 14750 + }, + { + "epoch": 435.0000746268657, + "grad_norm": 0.5961963534355164, + "learning_rate": 9.36370777690495e-05, + "loss": 0.2186, + "step": 14800 + }, + { + "epoch": 435.0000746268657, + "eval_accuracy": 0.793859945809042, + "eval_loss": 2.8554935455322266, + "eval_macro_f1": 0.5626270574118399, + "eval_runtime": 47.8659, + "eval_samples_per_second": 14.039, + "eval_steps_per_second": 3.51, + "step": 14800 + }, + { + "epoch": 436.0001940298507, + "grad_norm": 0.6928916573524475, + "learning_rate": 9.359780047132757e-05, + "loss": 0.2211, + "step": 14850 + }, + { + "epoch": 438.00005970149255, + "grad_norm": 0.8021474480628967, + "learning_rate": 9.355852317360566e-05, + "loss": 0.2245, + "step": 14900 + }, + { + "epoch": 438.00005970149255, + "eval_accuracy": 0.7921318379676372, + "eval_loss": 3.074869155883789, + "eval_macro_f1": 0.5639749685434595, + "eval_runtime": 47.5539, + "eval_samples_per_second": 14.131, + "eval_steps_per_second": 3.533, + "step": 14900 + }, + { + "epoch": 439.00017910447764, + "grad_norm": 0.5534613132476807, + "learning_rate": 9.351924587588374e-05, + "loss": 0.2212, + "step": 14950 + }, + { + "epoch": 441.0000447761194, + "grad_norm": 0.7129846215248108, + "learning_rate": 9.347996857816183e-05, + "loss": 0.2201, + "step": 15000 + }, + { + "epoch": 441.0000447761194, + "eval_accuracy": 0.7965219894341397, + "eval_loss": 2.940898895263672, + "eval_macro_f1": 0.5625690990279388, + "eval_runtime": 47.8976, + "eval_samples_per_second": 14.03, + "eval_steps_per_second": 3.507, + "step": 15000 + }, + { + "epoch": 442.0001641791045, + "grad_norm": 0.6185797452926636, + "learning_rate": 9.344069128043991e-05, + "loss": 0.221, + "step": 15050 + }, + { + "epoch": 444.0000298507463, + "grad_norm": 0.7621497511863708, + "learning_rate": 9.340141398271799e-05, + "loss": 0.2251, + "step": 15100 + }, + { + "epoch": 444.0000298507463, + "eval_accuracy": 0.7949607863638973, + "eval_loss": 2.880343198776245, + "eval_macro_f1": 0.5618121412432042, + "eval_runtime": 47.2403, + "eval_samples_per_second": 14.225, + "eval_steps_per_second": 3.556, + "step": 15100 + }, + { + "epoch": 445.00014925373137, + "grad_norm": 0.5727177262306213, + "learning_rate": 9.336213668499608e-05, + "loss": 0.2218, + "step": 15150 + }, + { + "epoch": 447.00001492537314, + "grad_norm": 0.6924324631690979, + "learning_rate": 9.332285938727416e-05, + "loss": 0.2182, + "step": 15200 + }, + { + "epoch": 447.00001492537314, + "eval_accuracy": 0.7924063705674953, + "eval_loss": 2.917879104614258, + "eval_macro_f1": 0.561150916227215, + "eval_runtime": 48.1679, + "eval_samples_per_second": 13.951, + "eval_steps_per_second": 3.488, + "step": 15200 + }, + { + "epoch": 448.00013432835823, + "grad_norm": 0.5482611656188965, + "learning_rate": 9.328358208955224e-05, + "loss": 0.2179, + "step": 15250 + }, + { + "epoch": 449.00025373134326, + "grad_norm": 1.4591031074523926, + "learning_rate": 9.324430479183033e-05, + "loss": 0.2151, + "step": 15300 + }, + { + "epoch": 449.00025373134326, + "eval_accuracy": 0.7929598234089452, + "eval_loss": 2.956183433532715, + "eval_macro_f1": 0.5682145114779232, + "eval_runtime": 46.6949, + "eval_samples_per_second": 14.391, + "eval_steps_per_second": 3.598, + "step": 15300 + }, + { + "epoch": 451.0001194029851, + "grad_norm": 0.7271115779876709, + "learning_rate": 9.320502749410842e-05, + "loss": 0.2181, + "step": 15350 + }, + { + "epoch": 452.0002388059701, + "grad_norm": 0.8536895513534546, + "learning_rate": 9.316575019638649e-05, + "loss": 0.2162, + "step": 15400 + }, + { + "epoch": 452.0002388059701, + "eval_accuracy": 0.7948513688144403, + "eval_loss": 3.019801139831543, + "eval_macro_f1": 0.5627273972303852, + "eval_runtime": 47.6775, + "eval_samples_per_second": 14.095, + "eval_steps_per_second": 3.524, + "step": 15400 + }, + { + "epoch": 454.00010447761196, + "grad_norm": 0.44485169649124146, + "learning_rate": 9.312647289866458e-05, + "loss": 0.2174, + "step": 15450 + }, + { + "epoch": 455.000223880597, + "grad_norm": 0.5385164618492126, + "learning_rate": 9.308719560094265e-05, + "loss": 0.2199, + "step": 15500 + }, + { + "epoch": 455.000223880597, + "eval_accuracy": 0.7951205646538749, + "eval_loss": 3.033013105392456, + "eval_macro_f1": 0.564159594880836, + "eval_runtime": 47.414, + "eval_samples_per_second": 14.173, + "eval_steps_per_second": 3.543, + "step": 15500 + }, + { + "epoch": 457.0000895522388, + "grad_norm": 0.642227053642273, + "learning_rate": 9.304791830322074e-05, + "loss": 0.2172, + "step": 15550 + }, + { + "epoch": 458.00020895522385, + "grad_norm": 0.637978732585907, + "learning_rate": 9.300864100549883e-05, + "loss": 0.2159, + "step": 15600 + }, + { + "epoch": 458.00020895522385, + "eval_accuracy": 0.7946547752930774, + "eval_loss": 2.8947529792785645, + "eval_macro_f1": 0.561161898536591, + "eval_runtime": 46.9256, + "eval_samples_per_second": 14.321, + "eval_steps_per_second": 3.58, + "step": 15600 + }, + { + "epoch": 460.0000746268657, + "grad_norm": 0.5700401663780212, + "learning_rate": 9.296936370777692e-05, + "loss": 0.2148, + "step": 15650 + }, + { + "epoch": 461.0001940298507, + "grad_norm": 0.6818207502365112, + "learning_rate": 9.2930086410055e-05, + "loss": 0.215, + "step": 15700 + }, + { + "epoch": 461.0001940298507, + "eval_accuracy": 0.7985236570886091, + "eval_loss": 2.9393157958984375, + "eval_macro_f1": 0.5688867696823432, + "eval_runtime": 47.5037, + "eval_samples_per_second": 14.146, + "eval_steps_per_second": 3.537, + "step": 15700 + }, + { + "epoch": 463.00005970149255, + "grad_norm": 0.5460227727890015, + "learning_rate": 9.289080911233308e-05, + "loss": 0.2141, + "step": 15750 + }, + { + "epoch": 464.00017910447764, + "grad_norm": 0.8175423741340637, + "learning_rate": 9.285153181461116e-05, + "loss": 0.2156, + "step": 15800 + }, + { + "epoch": 464.00017910447764, + "eval_accuracy": 0.7968792394858863, + "eval_loss": 3.0779905319213867, + "eval_macro_f1": 0.5693835454244838, + "eval_runtime": 47.6328, + "eval_samples_per_second": 14.108, + "eval_steps_per_second": 3.527, + "step": 15800 + }, + { + "epoch": 466.0000447761194, + "grad_norm": 0.6272607445716858, + "learning_rate": 9.281225451688925e-05, + "loss": 0.2128, + "step": 15850 + }, + { + "epoch": 467.0001641791045, + "grad_norm": 0.8056233525276184, + "learning_rate": 9.277297721916732e-05, + "loss": 0.2143, + "step": 15900 + }, + { + "epoch": 467.0001641791045, + "eval_accuracy": 0.7923319356538541, + "eval_loss": 2.9888763427734375, + "eval_macro_f1": 0.5615331116391947, + "eval_runtime": 47.3811, + "eval_samples_per_second": 14.183, + "eval_steps_per_second": 3.546, + "step": 15900 + }, + { + "epoch": 469.0000298507463, + "grad_norm": 0.5488153696060181, + "learning_rate": 9.273369992144541e-05, + "loss": 0.2112, + "step": 15950 + }, + { + "epoch": 470.00014925373137, + "grad_norm": 0.6871160864830017, + "learning_rate": 9.269442262372348e-05, + "loss": 0.2099, + "step": 16000 + }, + { + "epoch": 470.00014925373137, + "eval_accuracy": 0.7914931991213165, + "eval_loss": 3.1400673389434814, + "eval_macro_f1": 0.5512689895212705, + "eval_runtime": 47.6936, + "eval_samples_per_second": 14.09, + "eval_steps_per_second": 3.522, + "step": 16000 + }, + { + "epoch": 472.00001492537314, + "grad_norm": 0.8534285426139832, + "learning_rate": 9.265514532600159e-05, + "loss": 0.2127, + "step": 16050 + }, + { + "epoch": 473.00013432835823, + "grad_norm": 0.5599453449249268, + "learning_rate": 9.261586802827966e-05, + "loss": 0.2132, + "step": 16100 + }, + { + "epoch": 473.00013432835823, + "eval_accuracy": 0.7915744370394541, + "eval_loss": 3.1408255100250244, + "eval_macro_f1": 0.5636676273040631, + "eval_runtime": 47.5385, + "eval_samples_per_second": 14.136, + "eval_steps_per_second": 3.534, + "step": 16100 + }, + { + "epoch": 474.00025373134326, + "grad_norm": 0.8912002444267273, + "learning_rate": 9.257659073055775e-05, + "loss": 0.2118, + "step": 16150 + }, + { + "epoch": 476.0001194029851, + "grad_norm": 0.8097513318061829, + "learning_rate": 9.253731343283582e-05, + "loss": 0.2118, + "step": 16200 + }, + { + "epoch": 476.0001194029851, + "eval_accuracy": 0.7998531000193437, + "eval_loss": 3.046705961227417, + "eval_macro_f1": 0.5684223632571548, + "eval_runtime": 47.8219, + "eval_samples_per_second": 14.052, + "eval_steps_per_second": 3.513, + "step": 16200 + }, + { + "epoch": 477.0002388059701, + "grad_norm": 0.537408173084259, + "learning_rate": 9.249803613511391e-05, + "loss": 0.21, + "step": 16250 + }, + { + "epoch": 479.00010447761196, + "grad_norm": 0.5733367204666138, + "learning_rate": 9.245875883739199e-05, + "loss": 0.2128, + "step": 16300 + }, + { + "epoch": 479.00010447761196, + "eval_accuracy": 0.794795492599148, + "eval_loss": 3.1236205101013184, + "eval_macro_f1": 0.5547241398329374, + "eval_runtime": 47.9826, + "eval_samples_per_second": 14.005, + "eval_steps_per_second": 3.501, + "step": 16300 + }, + { + "epoch": 480.000223880597, + "grad_norm": 0.6852474212646484, + "learning_rate": 9.241948153967007e-05, + "loss": 0.2168, + "step": 16350 + }, + { + "epoch": 482.0000895522388, + "grad_norm": 0.8730658292770386, + "learning_rate": 9.238020424194815e-05, + "loss": 0.2158, + "step": 16400 + }, + { + "epoch": 482.0000895522388, + "eval_accuracy": 0.7932279319334217, + "eval_loss": 2.8690903186798096, + "eval_macro_f1": 0.5652858484107643, + "eval_runtime": 47.7669, + "eval_samples_per_second": 14.068, + "eval_steps_per_second": 3.517, + "step": 16400 + }, + { + "epoch": 483.00020895522385, + "grad_norm": 0.6937837600708008, + "learning_rate": 9.234092694422625e-05, + "loss": 0.2145, + "step": 16450 + }, + { + "epoch": 485.0000746268657, + "grad_norm": 0.5635214447975159, + "learning_rate": 9.230164964650433e-05, + "loss": 0.2161, + "step": 16500 + }, + { + "epoch": 485.0000746268657, + "eval_accuracy": 0.7996595290711143, + "eval_loss": 3.0240049362182617, + "eval_macro_f1": 0.5701663220447313, + "eval_runtime": 47.992, + "eval_samples_per_second": 14.002, + "eval_steps_per_second": 3.501, + "step": 16500 + }, + { + "epoch": 486.0001940298507, + "grad_norm": 0.6714308857917786, + "learning_rate": 9.226237234878241e-05, + "loss": 0.2127, + "step": 16550 + }, + { + "epoch": 488.00005970149255, + "grad_norm": 0.6512472629547119, + "learning_rate": 9.222309505106049e-05, + "loss": 0.2165, + "step": 16600 + }, + { + "epoch": 488.00005970149255, + "eval_accuracy": 0.7904966476234659, + "eval_loss": 3.054967164993286, + "eval_macro_f1": 0.5638221398368151, + "eval_runtime": 47.1066, + "eval_samples_per_second": 14.266, + "eval_steps_per_second": 3.566, + "step": 16600 + }, + { + "epoch": 489.00017910447764, + "grad_norm": 0.7864246368408203, + "learning_rate": 9.218381775333858e-05, + "loss": 0.213, + "step": 16650 + }, + { + "epoch": 491.0000447761194, + "grad_norm": 0.5250495672225952, + "learning_rate": 9.214454045561665e-05, + "loss": 0.2111, + "step": 16700 + }, + { + "epoch": 491.0000447761194, + "eval_accuracy": 0.7889951418754939, + "eval_loss": 3.060694456100464, + "eval_macro_f1": 0.5567643969024851, + "eval_runtime": 47.5212, + "eval_samples_per_second": 14.141, + "eval_steps_per_second": 3.535, + "step": 16700 + }, + { + "epoch": 492.0001641791045, + "grad_norm": 0.893277645111084, + "learning_rate": 9.210526315789474e-05, + "loss": 0.2141, + "step": 16750 + }, + { + "epoch": 494.0000298507463, + "grad_norm": 0.5823943614959717, + "learning_rate": 9.206598586017281e-05, + "loss": 0.2187, + "step": 16800 + }, + { + "epoch": 494.0000298507463, + "eval_accuracy": 0.7938742431963668, + "eval_loss": 3.092411518096924, + "eval_macro_f1": 0.5642921212237816, + "eval_runtime": 47.3543, + "eval_samples_per_second": 14.191, + "eval_steps_per_second": 3.548, + "step": 16800 + }, + { + "epoch": 495.00014925373137, + "grad_norm": 0.5595600605010986, + "learning_rate": 9.20267085624509e-05, + "loss": 0.2166, + "step": 16850 + }, + { + "epoch": 497.00001492537314, + "grad_norm": 0.6087270379066467, + "learning_rate": 9.198743126472899e-05, + "loss": 0.2165, + "step": 16900 + }, + { + "epoch": 497.00001492537314, + "eval_accuracy": 0.7991794186863951, + "eval_loss": 2.908203363418579, + "eval_macro_f1": 0.561715854424601, + "eval_runtime": 47.7231, + "eval_samples_per_second": 14.081, + "eval_steps_per_second": 3.52, + "step": 16900 + }, + { + "epoch": 498.00013432835823, + "grad_norm": 0.8621202707290649, + "learning_rate": 9.194815396700708e-05, + "loss": 0.2109, + "step": 16950 + }, + { + "epoch": 499.00025373134326, + "grad_norm": 0.5766103863716125, + "learning_rate": 9.190887666928515e-05, + "loss": 0.2105, + "step": 17000 + }, + { + "epoch": 499.00025373134326, + "eval_accuracy": 0.7937506833688435, + "eval_loss": 3.1744072437286377, + "eval_macro_f1": 0.5625111939896529, + "eval_runtime": 47.7694, + "eval_samples_per_second": 14.068, + "eval_steps_per_second": 3.517, + "step": 17000 + }, + { + "epoch": 501.0001194029851, + "grad_norm": 0.8057187795639038, + "learning_rate": 9.186959937156324e-05, + "loss": 0.2074, + "step": 17050 + }, + { + "epoch": 502.0002388059701, + "grad_norm": 0.7628244161605835, + "learning_rate": 9.183032207384132e-05, + "loss": 0.2086, + "step": 17100 + }, + { + "epoch": 502.0002388059701, + "eval_accuracy": 0.7982410465636556, + "eval_loss": 2.847623825073242, + "eval_macro_f1": 0.5609675697015797, + "eval_runtime": 47.2148, + "eval_samples_per_second": 14.233, + "eval_steps_per_second": 3.558, + "step": 17100 + }, + { + "epoch": 504.00010447761196, + "grad_norm": 0.6015555262565613, + "learning_rate": 9.17910447761194e-05, + "loss": 0.2062, + "step": 17150 + }, + { + "epoch": 505.000223880597, + "grad_norm": 0.6851004362106323, + "learning_rate": 9.17517674783975e-05, + "loss": 0.2075, + "step": 17200 + }, + { + "epoch": 505.000223880597, + "eval_accuracy": 0.7961091047065276, + "eval_loss": 3.093022346496582, + "eval_macro_f1": 0.5701477045377411, + "eval_runtime": 47.4639, + "eval_samples_per_second": 14.158, + "eval_steps_per_second": 3.54, + "step": 17200 + }, + { + "epoch": 507.0000895522388, + "grad_norm": 1.1948143243789673, + "learning_rate": 9.171249018067557e-05, + "loss": 0.2105, + "step": 17250 + }, + { + "epoch": 508.00020895522385, + "grad_norm": 0.6976267695426941, + "learning_rate": 9.167321288295366e-05, + "loss": 0.2095, + "step": 17300 + }, + { + "epoch": 508.00020895522385, + "eval_accuracy": 0.7985576728546575, + "eval_loss": 2.9823520183563232, + "eval_macro_f1": 0.5714790901432028, + "eval_runtime": 47.6906, + "eval_samples_per_second": 14.091, + "eval_steps_per_second": 3.523, + "step": 17300 + }, + { + "epoch": 510.0000746268657, + "grad_norm": 0.47454121708869934, + "learning_rate": 9.163393558523175e-05, + "loss": 0.2086, + "step": 17350 + }, + { + "epoch": 511.0001940298507, + "grad_norm": 0.5543599724769592, + "learning_rate": 9.159465828750982e-05, + "loss": 0.2048, + "step": 17400 + }, + { + "epoch": 511.0001940298507, + "eval_accuracy": 0.8004991697428618, + "eval_loss": 3.0314083099365234, + "eval_macro_f1": 0.5710308271247934, + "eval_runtime": 47.7418, + "eval_samples_per_second": 14.076, + "eval_steps_per_second": 3.519, + "step": 17400 + }, + { + "epoch": 513.0000597014925, + "grad_norm": 0.6959546804428101, + "learning_rate": 9.155538098978791e-05, + "loss": 0.2058, + "step": 17450 + }, + { + "epoch": 514.0001791044776, + "grad_norm": 0.38391655683517456, + "learning_rate": 9.1516103692066e-05, + "loss": 0.2086, + "step": 17500 + }, + { + "epoch": 514.0001791044776, + "eval_accuracy": 0.7983069152998881, + "eval_loss": 2.9809422492980957, + "eval_macro_f1": 0.5658929218664657, + "eval_runtime": 47.8502, + "eval_samples_per_second": 14.044, + "eval_steps_per_second": 3.511, + "step": 17500 + }, + { + "epoch": 516.0000447761194, + "grad_norm": 0.7131857872009277, + "learning_rate": 9.147682639434407e-05, + "loss": 0.2075, + "step": 17550 + }, + { + "epoch": 517.0001641791044, + "grad_norm": 0.6387264132499695, + "learning_rate": 9.143754909662216e-05, + "loss": 0.2052, + "step": 17600 + }, + { + "epoch": 517.0001641791044, + "eval_accuracy": 0.8010795346991915, + "eval_loss": 2.841973304748535, + "eval_macro_f1": 0.5727406094475693, + "eval_runtime": 48.7117, + "eval_samples_per_second": 13.795, + "eval_steps_per_second": 3.449, + "step": 17600 + } + ], + "logging_steps": 50, + "max_steps": 134000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}