{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 12875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007766990291262136, "grad_norm": 70.24808964324559, "learning_rate": 1.3975155279503106e-07, "loss": 3.4855, "step": 10 }, { "epoch": 0.0015533980582524273, "grad_norm": 61.22885123024937, "learning_rate": 2.9503105590062115e-07, "loss": 3.4301, "step": 20 }, { "epoch": 0.002330097087378641, "grad_norm": 104.70242683134062, "learning_rate": 4.503105590062112e-07, "loss": 3.3594, "step": 30 }, { "epoch": 0.0031067961165048546, "grad_norm": 66.05918812656306, "learning_rate": 6.055900621118013e-07, "loss": 3.2662, "step": 40 }, { "epoch": 0.003883495145631068, "grad_norm": 49.16891869237666, "learning_rate": 7.608695652173914e-07, "loss": 3.1542, "step": 50 }, { "epoch": 0.004660194174757282, "grad_norm": 43.39651533084213, "learning_rate": 9.161490683229815e-07, "loss": 2.9594, "step": 60 }, { "epoch": 0.005436893203883495, "grad_norm": 31.08084100300487, "learning_rate": 1.0714285714285714e-06, "loss": 2.5454, "step": 70 }, { "epoch": 0.006213592233009709, "grad_norm": 17.454975955575122, "learning_rate": 1.2267080745341615e-06, "loss": 2.3462, "step": 80 }, { "epoch": 0.006990291262135922, "grad_norm": 33.701249352229354, "learning_rate": 1.3819875776397517e-06, "loss": 2.012, "step": 90 }, { "epoch": 0.007766990291262136, "grad_norm": 18.814461079986906, "learning_rate": 1.5372670807453419e-06, "loss": 1.6105, "step": 100 }, { "epoch": 0.00854368932038835, "grad_norm": 13.975798160473579, "learning_rate": 1.6925465838509316e-06, "loss": 1.3364, "step": 110 }, { "epoch": 0.009320388349514564, "grad_norm": 47.239036452258084, "learning_rate": 1.8478260869565218e-06, "loss": 1.248, "step": 120 }, { "epoch": 0.010097087378640776, "grad_norm": 33.71815681697253, "learning_rate": 2.0031055900621117e-06, "loss": 1.0993, "step": 130 }, { "epoch": 0.01087378640776699, "grad_norm": 27.469068625916506, "learning_rate": 2.158385093167702e-06, "loss": 0.8871, "step": 140 }, { "epoch": 0.011650485436893204, "grad_norm": 65.93015103817278, "learning_rate": 2.313664596273292e-06, "loss": 0.8017, "step": 150 }, { "epoch": 0.012427184466019418, "grad_norm": 69.25006711184193, "learning_rate": 2.4689440993788823e-06, "loss": 0.7106, "step": 160 }, { "epoch": 0.01320388349514563, "grad_norm": 16.463863970460764, "learning_rate": 2.6242236024844724e-06, "loss": 0.6067, "step": 170 }, { "epoch": 0.013980582524271845, "grad_norm": 11.823712167409878, "learning_rate": 2.779503105590062e-06, "loss": 0.5591, "step": 180 }, { "epoch": 0.014757281553398059, "grad_norm": 4.566411871356504, "learning_rate": 2.9347826086956528e-06, "loss": 0.5422, "step": 190 }, { "epoch": 0.015533980582524271, "grad_norm": 9.776861863203822, "learning_rate": 3.0900621118012425e-06, "loss": 0.464, "step": 200 }, { "epoch": 0.016310679611650485, "grad_norm": 6.803944594954418, "learning_rate": 3.2453416149068323e-06, "loss": 0.4998, "step": 210 }, { "epoch": 0.0170873786407767, "grad_norm": 22.222704907553663, "learning_rate": 3.400621118012423e-06, "loss": 0.453, "step": 220 }, { "epoch": 0.017864077669902913, "grad_norm": 27.374938965928017, "learning_rate": 3.5559006211180126e-06, "loss": 0.4059, "step": 230 }, { "epoch": 0.018640776699029128, "grad_norm": 8.371486315452819, "learning_rate": 3.7111801242236028e-06, "loss": 0.3865, "step": 240 }, { "epoch": 0.019417475728155338, "grad_norm": 3.0060232538624754, "learning_rate": 3.866459627329193e-06, "loss": 0.3454, "step": 250 }, { "epoch": 0.020194174757281552, "grad_norm": 9.074184238211613, "learning_rate": 4.021739130434783e-06, "loss": 0.359, "step": 260 }, { "epoch": 0.020970873786407766, "grad_norm": 63.70299601273966, "learning_rate": 4.1770186335403724e-06, "loss": 0.3222, "step": 270 }, { "epoch": 0.02174757281553398, "grad_norm": 8.763470676454308, "learning_rate": 4.3322981366459634e-06, "loss": 0.2917, "step": 280 }, { "epoch": 0.022524271844660194, "grad_norm": 12.347208364997751, "learning_rate": 4.487577639751553e-06, "loss": 0.2951, "step": 290 }, { "epoch": 0.02330097087378641, "grad_norm": 118.0372641901289, "learning_rate": 4.642857142857144e-06, "loss": 0.2949, "step": 300 }, { "epoch": 0.024077669902912623, "grad_norm": 13.102112429803539, "learning_rate": 4.798136645962733e-06, "loss": 0.2641, "step": 310 }, { "epoch": 0.024854368932038837, "grad_norm": 10.153430948659, "learning_rate": 4.953416149068323e-06, "loss": 0.2386, "step": 320 }, { "epoch": 0.025631067961165047, "grad_norm": 6.975203379028509, "learning_rate": 5.108695652173914e-06, "loss": 0.2391, "step": 330 }, { "epoch": 0.02640776699029126, "grad_norm": 7.231975999500197, "learning_rate": 5.263975155279504e-06, "loss": 0.2231, "step": 340 }, { "epoch": 0.027184466019417475, "grad_norm": 0.9518445876355756, "learning_rate": 5.419254658385094e-06, "loss": 0.2137, "step": 350 }, { "epoch": 0.02796116504854369, "grad_norm": 1.3358576832965936, "learning_rate": 5.574534161490683e-06, "loss": 0.2022, "step": 360 }, { "epoch": 0.028737864077669904, "grad_norm": 8.641566914219792, "learning_rate": 5.729813664596274e-06, "loss": 0.194, "step": 370 }, { "epoch": 0.029514563106796118, "grad_norm": 0.5631896851524963, "learning_rate": 5.885093167701864e-06, "loss": 0.1846, "step": 380 }, { "epoch": 0.030291262135922332, "grad_norm": 35.29448469947501, "learning_rate": 6.040372670807454e-06, "loss": 0.1759, "step": 390 }, { "epoch": 0.031067961165048542, "grad_norm": 2.384630629167573, "learning_rate": 6.195652173913044e-06, "loss": 0.1784, "step": 400 }, { "epoch": 0.031844660194174756, "grad_norm": 4.682365113300592, "learning_rate": 6.350931677018634e-06, "loss": 0.1694, "step": 410 }, { "epoch": 0.03262135922330097, "grad_norm": 2.7469674226861462, "learning_rate": 6.506211180124224e-06, "loss": 0.1641, "step": 420 }, { "epoch": 0.033398058252427185, "grad_norm": 0.6640191532852394, "learning_rate": 6.661490683229814e-06, "loss": 0.1455, "step": 430 }, { "epoch": 0.0341747572815534, "grad_norm": 3.317119652870753, "learning_rate": 6.8167701863354045e-06, "loss": 0.1595, "step": 440 }, { "epoch": 0.03495145631067961, "grad_norm": 2.28282614275576, "learning_rate": 6.972049689440994e-06, "loss": 0.1434, "step": 450 }, { "epoch": 0.03572815533980583, "grad_norm": 10.315379527541852, "learning_rate": 7.127329192546585e-06, "loss": 0.1346, "step": 460 }, { "epoch": 0.03650485436893204, "grad_norm": 62.84139887237369, "learning_rate": 7.282608695652175e-06, "loss": 0.1464, "step": 470 }, { "epoch": 0.037281553398058255, "grad_norm": 3.9249761290217355, "learning_rate": 7.437888198757764e-06, "loss": 0.1395, "step": 480 }, { "epoch": 0.03805825242718447, "grad_norm": 1.3909150617567019, "learning_rate": 7.5931677018633545e-06, "loss": 0.1401, "step": 490 }, { "epoch": 0.038834951456310676, "grad_norm": 0.6147076661983238, "learning_rate": 7.748447204968945e-06, "loss": 0.1281, "step": 500 }, { "epoch": 0.03961165048543689, "grad_norm": 1.495698926028656, "learning_rate": 7.903726708074536e-06, "loss": 0.1221, "step": 510 }, { "epoch": 0.040388349514563104, "grad_norm": 0.6598829993056254, "learning_rate": 8.059006211180125e-06, "loss": 0.1169, "step": 520 }, { "epoch": 0.04116504854368932, "grad_norm": 3.730832609479397, "learning_rate": 8.214285714285714e-06, "loss": 0.1208, "step": 530 }, { "epoch": 0.04194174757281553, "grad_norm": 0.6069890232725306, "learning_rate": 8.369565217391305e-06, "loss": 0.1087, "step": 540 }, { "epoch": 0.04271844660194175, "grad_norm": 0.8185154746248435, "learning_rate": 8.524844720496895e-06, "loss": 0.1113, "step": 550 }, { "epoch": 0.04349514563106796, "grad_norm": 2.2275221891863897, "learning_rate": 8.680124223602486e-06, "loss": 0.1057, "step": 560 }, { "epoch": 0.044271844660194175, "grad_norm": 3.23873700407478, "learning_rate": 8.835403726708075e-06, "loss": 0.1065, "step": 570 }, { "epoch": 0.04504854368932039, "grad_norm": 0.6128857772362971, "learning_rate": 8.990683229813664e-06, "loss": 0.1035, "step": 580 }, { "epoch": 0.0458252427184466, "grad_norm": 3.8825792643318295, "learning_rate": 9.145962732919255e-06, "loss": 0.1005, "step": 590 }, { "epoch": 0.04660194174757282, "grad_norm": 1.9864661779386465, "learning_rate": 9.301242236024846e-06, "loss": 0.106, "step": 600 }, { "epoch": 0.04737864077669903, "grad_norm": 2.7283859064614604, "learning_rate": 9.456521739130436e-06, "loss": 0.0994, "step": 610 }, { "epoch": 0.048155339805825245, "grad_norm": 0.8197783147434852, "learning_rate": 9.611801242236025e-06, "loss": 0.0961, "step": 620 }, { "epoch": 0.04893203883495146, "grad_norm": 1.1537648759232986, "learning_rate": 9.767080745341616e-06, "loss": 0.0894, "step": 630 }, { "epoch": 0.04970873786407767, "grad_norm": 0.7390411383126917, "learning_rate": 9.922360248447205e-06, "loss": 0.0958, "step": 640 }, { "epoch": 0.05048543689320388, "grad_norm": 2.1358549198388697, "learning_rate": 9.999995876597113e-06, "loss": 0.088, "step": 650 }, { "epoch": 0.051262135922330095, "grad_norm": 0.8598750433484502, "learning_rate": 9.99996288941482e-06, "loss": 0.0845, "step": 660 }, { "epoch": 0.05203883495145631, "grad_norm": 0.5995983201056767, "learning_rate": 9.999896915267866e-06, "loss": 0.0845, "step": 670 }, { "epoch": 0.05281553398058252, "grad_norm": 0.6177007313555738, "learning_rate": 9.99979795459151e-06, "loss": 0.0795, "step": 680 }, { "epoch": 0.05359223300970874, "grad_norm": 0.6592368970384508, "learning_rate": 9.999666008038641e-06, "loss": 0.0846, "step": 690 }, { "epoch": 0.05436893203883495, "grad_norm": 0.9922453969914385, "learning_rate": 9.999501076479768e-06, "loss": 0.0808, "step": 700 }, { "epoch": 0.055145631067961165, "grad_norm": 0.4842295437980666, "learning_rate": 9.999303161003017e-06, "loss": 0.0794, "step": 710 }, { "epoch": 0.05592233009708738, "grad_norm": 1.5129602893569967, "learning_rate": 9.999072262914123e-06, "loss": 0.0762, "step": 720 }, { "epoch": 0.05669902912621359, "grad_norm": 0.341717590956545, "learning_rate": 9.998808383736425e-06, "loss": 0.0706, "step": 730 }, { "epoch": 0.05747572815533981, "grad_norm": 1.8027410161310318, "learning_rate": 9.99851152521085e-06, "loss": 0.0809, "step": 740 }, { "epoch": 0.05825242718446602, "grad_norm": 0.3861808733778894, "learning_rate": 9.998181689295904e-06, "loss": 0.0758, "step": 750 }, { "epoch": 0.059029126213592235, "grad_norm": 0.5969002346877048, "learning_rate": 9.99781887816766e-06, "loss": 0.0738, "step": 760 }, { "epoch": 0.05980582524271845, "grad_norm": 0.4223411460564809, "learning_rate": 9.997423094219742e-06, "loss": 0.0736, "step": 770 }, { "epoch": 0.060582524271844664, "grad_norm": 0.6964586521571462, "learning_rate": 9.996994340063314e-06, "loss": 0.0718, "step": 780 }, { "epoch": 0.06135922330097087, "grad_norm": 0.44921006331468555, "learning_rate": 9.996532618527059e-06, "loss": 0.0757, "step": 790 }, { "epoch": 0.062135922330097085, "grad_norm": 0.5318120679150641, "learning_rate": 9.996037932657155e-06, "loss": 0.0707, "step": 800 }, { "epoch": 0.0629126213592233, "grad_norm": 0.5118570803958712, "learning_rate": 9.995510285717261e-06, "loss": 0.0713, "step": 810 }, { "epoch": 0.06368932038834951, "grad_norm": 0.7128549673987672, "learning_rate": 9.994949681188501e-06, "loss": 0.0723, "step": 820 }, { "epoch": 0.06446601941747573, "grad_norm": 8.453390979024276, "learning_rate": 9.994356122769428e-06, "loss": 0.0686, "step": 830 }, { "epoch": 0.06524271844660194, "grad_norm": 0.574659264495718, "learning_rate": 9.993729614376012e-06, "loss": 0.0631, "step": 840 }, { "epoch": 0.06601941747572816, "grad_norm": 0.27535387416432516, "learning_rate": 9.993070160141603e-06, "loss": 0.0757, "step": 850 }, { "epoch": 0.06679611650485437, "grad_norm": 0.845674797004825, "learning_rate": 9.992377764416914e-06, "loss": 0.0643, "step": 860 }, { "epoch": 0.06757281553398058, "grad_norm": 0.22797380673774062, "learning_rate": 9.991652431769985e-06, "loss": 0.0629, "step": 870 }, { "epoch": 0.0683495145631068, "grad_norm": 0.35871439577697967, "learning_rate": 9.990894166986155e-06, "loss": 0.07, "step": 880 }, { "epoch": 0.069126213592233, "grad_norm": 0.41340392444949753, "learning_rate": 9.990102975068032e-06, "loss": 0.0672, "step": 890 }, { "epoch": 0.06990291262135923, "grad_norm": 4.891822689625701, "learning_rate": 9.989278861235461e-06, "loss": 0.0666, "step": 900 }, { "epoch": 0.07067961165048543, "grad_norm": 0.23531370825348855, "learning_rate": 9.988421830925482e-06, "loss": 0.0656, "step": 910 }, { "epoch": 0.07145631067961165, "grad_norm": 4.298743273695105, "learning_rate": 9.987531889792304e-06, "loss": 0.0596, "step": 920 }, { "epoch": 0.07223300970873786, "grad_norm": 0.46627467036351156, "learning_rate": 9.986609043707262e-06, "loss": 0.06, "step": 930 }, { "epoch": 0.07300970873786408, "grad_norm": 0.26902606576465943, "learning_rate": 9.985653298758781e-06, "loss": 0.0626, "step": 940 }, { "epoch": 0.07378640776699029, "grad_norm": 0.49811490209964726, "learning_rate": 9.98466466125233e-06, "loss": 0.0709, "step": 950 }, { "epoch": 0.07456310679611651, "grad_norm": 0.3698380119373003, "learning_rate": 9.983643137710391e-06, "loss": 0.0654, "step": 960 }, { "epoch": 0.07533980582524272, "grad_norm": 0.4447165908845746, "learning_rate": 9.9825887348724e-06, "loss": 0.0616, "step": 970 }, { "epoch": 0.07611650485436894, "grad_norm": 0.7018820741892676, "learning_rate": 9.981501459694725e-06, "loss": 0.0614, "step": 980 }, { "epoch": 0.07689320388349515, "grad_norm": 0.21472084270664246, "learning_rate": 9.980381319350598e-06, "loss": 0.06, "step": 990 }, { "epoch": 0.07766990291262135, "grad_norm": 0.3538884976710125, "learning_rate": 9.979228321230079e-06, "loss": 0.0609, "step": 1000 }, { "epoch": 0.07766990291262135, "eval_loss": 0.06220540776848793, "eval_runtime": 1429.5458, "eval_samples_per_second": 3.926, "eval_steps_per_second": 0.281, "step": 1000 }, { "epoch": 0.07844660194174757, "grad_norm": 0.3381532273634951, "learning_rate": 9.978042472940007e-06, "loss": 0.0582, "step": 1010 }, { "epoch": 0.07922330097087378, "grad_norm": 1.4129042741600022, "learning_rate": 9.97682378230395e-06, "loss": 0.0672, "step": 1020 }, { "epoch": 0.08, "grad_norm": 0.5431497250848125, "learning_rate": 9.975572257362143e-06, "loss": 0.061, "step": 1030 }, { "epoch": 0.08077669902912621, "grad_norm": 0.2210077595016109, "learning_rate": 9.974287906371454e-06, "loss": 0.058, "step": 1040 }, { "epoch": 0.08155339805825243, "grad_norm": 1.4962400488679997, "learning_rate": 9.972970737805312e-06, "loss": 0.0615, "step": 1050 }, { "epoch": 0.08233009708737864, "grad_norm": 0.3680208782614156, "learning_rate": 9.97162076035366e-06, "loss": 0.0594, "step": 1060 }, { "epoch": 0.08310679611650486, "grad_norm": 0.44558761975003036, "learning_rate": 9.970237982922896e-06, "loss": 0.0608, "step": 1070 }, { "epoch": 0.08388349514563107, "grad_norm": 0.21136714919531008, "learning_rate": 9.968822414635813e-06, "loss": 0.0586, "step": 1080 }, { "epoch": 0.08466019417475729, "grad_norm": 0.3083264990446477, "learning_rate": 9.967374064831544e-06, "loss": 0.0569, "step": 1090 }, { "epoch": 0.0854368932038835, "grad_norm": 0.3576495691188122, "learning_rate": 9.965892943065488e-06, "loss": 0.057, "step": 1100 }, { "epoch": 0.08621359223300971, "grad_norm": 0.38633162487430783, "learning_rate": 9.96437905910926e-06, "loss": 0.0573, "step": 1110 }, { "epoch": 0.08699029126213592, "grad_norm": 0.1925178847716028, "learning_rate": 9.962832422950625e-06, "loss": 0.0519, "step": 1120 }, { "epoch": 0.08776699029126214, "grad_norm": 9.425757870610274, "learning_rate": 9.961253044793424e-06, "loss": 0.0553, "step": 1130 }, { "epoch": 0.08854368932038835, "grad_norm": 0.22925760324280509, "learning_rate": 9.959640935057512e-06, "loss": 0.0547, "step": 1140 }, { "epoch": 0.08932038834951456, "grad_norm": 0.2549603399706302, "learning_rate": 9.957996104378689e-06, "loss": 0.0592, "step": 1150 }, { "epoch": 0.09009708737864078, "grad_norm": 0.3194903250179675, "learning_rate": 9.95631856360863e-06, "loss": 0.0547, "step": 1160 }, { "epoch": 0.09087378640776698, "grad_norm": 0.19315323229460435, "learning_rate": 9.954608323814814e-06, "loss": 0.0558, "step": 1170 }, { "epoch": 0.0916504854368932, "grad_norm": 0.13314533180115556, "learning_rate": 9.952865396280446e-06, "loss": 0.0594, "step": 1180 }, { "epoch": 0.09242718446601941, "grad_norm": 0.1934360080534723, "learning_rate": 9.951089792504392e-06, "loss": 0.0552, "step": 1190 }, { "epoch": 0.09320388349514563, "grad_norm": 0.15602120206623987, "learning_rate": 9.949281524201092e-06, "loss": 0.0511, "step": 1200 }, { "epoch": 0.09398058252427184, "grad_norm": 0.30815149339783743, "learning_rate": 9.947440603300496e-06, "loss": 0.0529, "step": 1210 }, { "epoch": 0.09475728155339806, "grad_norm": 0.3243014509824919, "learning_rate": 9.945567041947968e-06, "loss": 0.0545, "step": 1220 }, { "epoch": 0.09553398058252427, "grad_norm": 0.3524662321600662, "learning_rate": 9.94366085250422e-06, "loss": 0.0509, "step": 1230 }, { "epoch": 0.09631067961165049, "grad_norm": 0.18529919856132182, "learning_rate": 9.941722047545228e-06, "loss": 0.0521, "step": 1240 }, { "epoch": 0.0970873786407767, "grad_norm": 0.18380390302432503, "learning_rate": 9.939750639862145e-06, "loss": 0.0532, "step": 1250 }, { "epoch": 0.09786407766990292, "grad_norm": 0.19759282580977539, "learning_rate": 9.93774664246122e-06, "loss": 0.0549, "step": 1260 }, { "epoch": 0.09864077669902913, "grad_norm": 0.3480300487962872, "learning_rate": 9.935710068563706e-06, "loss": 0.0552, "step": 1270 }, { "epoch": 0.09941747572815535, "grad_norm": 0.20288730362157759, "learning_rate": 9.933640931605782e-06, "loss": 0.0706, "step": 1280 }, { "epoch": 0.10019417475728155, "grad_norm": 0.18140047010199514, "learning_rate": 9.931539245238462e-06, "loss": 0.0538, "step": 1290 }, { "epoch": 0.10097087378640776, "grad_norm": 0.2323000754894511, "learning_rate": 9.929405023327495e-06, "loss": 0.0526, "step": 1300 }, { "epoch": 0.10174757281553398, "grad_norm": 0.21182026015111252, "learning_rate": 9.927238279953289e-06, "loss": 0.0578, "step": 1310 }, { "epoch": 0.10252427184466019, "grad_norm": 0.3212376314417802, "learning_rate": 9.925039029410807e-06, "loss": 0.0514, "step": 1320 }, { "epoch": 0.10330097087378641, "grad_norm": 0.4325231074654262, "learning_rate": 9.922807286209477e-06, "loss": 0.0566, "step": 1330 }, { "epoch": 0.10407766990291262, "grad_norm": 0.23344705529820273, "learning_rate": 9.920543065073095e-06, "loss": 0.0516, "step": 1340 }, { "epoch": 0.10485436893203884, "grad_norm": 0.3546872971866052, "learning_rate": 9.918246380939726e-06, "loss": 0.0524, "step": 1350 }, { "epoch": 0.10563106796116505, "grad_norm": 0.2963446843170135, "learning_rate": 9.915917248961614e-06, "loss": 0.0493, "step": 1360 }, { "epoch": 0.10640776699029127, "grad_norm": 0.24869863542592696, "learning_rate": 9.913555684505068e-06, "loss": 0.0494, "step": 1370 }, { "epoch": 0.10718446601941747, "grad_norm": 2.545188093519763, "learning_rate": 9.911161703150375e-06, "loss": 0.0552, "step": 1380 }, { "epoch": 0.1079611650485437, "grad_norm": 0.29165390515860456, "learning_rate": 9.90873532069169e-06, "loss": 0.0523, "step": 1390 }, { "epoch": 0.1087378640776699, "grad_norm": 0.30777454700925216, "learning_rate": 9.906276553136924e-06, "loss": 0.0536, "step": 1400 }, { "epoch": 0.10951456310679612, "grad_norm": 1.2096747952256435, "learning_rate": 9.90378541670766e-06, "loss": 0.0512, "step": 1410 }, { "epoch": 0.11029126213592233, "grad_norm": 0.12895136921523498, "learning_rate": 9.90126192783902e-06, "loss": 0.0502, "step": 1420 }, { "epoch": 0.11106796116504854, "grad_norm": 0.20451158916087384, "learning_rate": 9.898706103179577e-06, "loss": 0.0528, "step": 1430 }, { "epoch": 0.11184466019417476, "grad_norm": 0.2608112327745393, "learning_rate": 9.896117959591239e-06, "loss": 0.0523, "step": 1440 }, { "epoch": 0.11262135922330097, "grad_norm": 0.43185557163527577, "learning_rate": 9.893497514149127e-06, "loss": 0.0497, "step": 1450 }, { "epoch": 0.11339805825242719, "grad_norm": 0.38878175870908804, "learning_rate": 9.890844784141483e-06, "loss": 0.0536, "step": 1460 }, { "epoch": 0.1141747572815534, "grad_norm": 0.2131451506025931, "learning_rate": 9.888159787069538e-06, "loss": 0.0523, "step": 1470 }, { "epoch": 0.11495145631067961, "grad_norm": 0.13358297297993682, "learning_rate": 9.885442540647402e-06, "loss": 0.0504, "step": 1480 }, { "epoch": 0.11572815533980582, "grad_norm": 0.22307091649658065, "learning_rate": 9.88269306280195e-06, "loss": 0.0472, "step": 1490 }, { "epoch": 0.11650485436893204, "grad_norm": 0.2238456005691382, "learning_rate": 9.879911371672706e-06, "loss": 0.0486, "step": 1500 }, { "epoch": 0.11728155339805825, "grad_norm": 0.20198760410996, "learning_rate": 9.877097485611713e-06, "loss": 0.0501, "step": 1510 }, { "epoch": 0.11805825242718447, "grad_norm": 0.24084666698281773, "learning_rate": 9.874251423183421e-06, "loss": 0.0483, "step": 1520 }, { "epoch": 0.11883495145631068, "grad_norm": 0.24304108612481995, "learning_rate": 9.871373203164562e-06, "loss": 0.048, "step": 1530 }, { "epoch": 0.1196116504854369, "grad_norm": 0.15080555781968732, "learning_rate": 9.868462844544024e-06, "loss": 0.0477, "step": 1540 }, { "epoch": 0.1203883495145631, "grad_norm": 0.234277753040414, "learning_rate": 9.865520366522732e-06, "loss": 0.0507, "step": 1550 }, { "epoch": 0.12116504854368933, "grad_norm": 0.20104770907475544, "learning_rate": 9.862545788513512e-06, "loss": 0.0496, "step": 1560 }, { "epoch": 0.12194174757281553, "grad_norm": 0.2070561545904625, "learning_rate": 9.859539130140967e-06, "loss": 0.0502, "step": 1570 }, { "epoch": 0.12271844660194174, "grad_norm": 0.2157258562554082, "learning_rate": 9.856500411241355e-06, "loss": 0.0495, "step": 1580 }, { "epoch": 0.12349514563106796, "grad_norm": 0.32425100564201415, "learning_rate": 9.853429651862445e-06, "loss": 0.048, "step": 1590 }, { "epoch": 0.12427184466019417, "grad_norm": 0.30785358784987626, "learning_rate": 9.850326872263396e-06, "loss": 0.0455, "step": 1600 }, { "epoch": 0.1250485436893204, "grad_norm": 0.14682875251927274, "learning_rate": 9.847192092914613e-06, "loss": 0.0484, "step": 1610 }, { "epoch": 0.1258252427184466, "grad_norm": 0.24131128421429165, "learning_rate": 9.844025334497622e-06, "loss": 0.0454, "step": 1620 }, { "epoch": 0.1266019417475728, "grad_norm": 0.29937617542436845, "learning_rate": 9.840826617904928e-06, "loss": 0.0488, "step": 1630 }, { "epoch": 0.12737864077669903, "grad_norm": 0.3314224818444492, "learning_rate": 9.83759596423988e-06, "loss": 0.0493, "step": 1640 }, { "epoch": 0.12815533980582525, "grad_norm": 0.20415878417149871, "learning_rate": 9.834333394816523e-06, "loss": 0.047, "step": 1650 }, { "epoch": 0.12893203883495147, "grad_norm": 0.21326299908161517, "learning_rate": 9.83103893115947e-06, "loss": 0.0472, "step": 1660 }, { "epoch": 0.12970873786407766, "grad_norm": 0.23498506840287625, "learning_rate": 9.827712595003759e-06, "loss": 0.0471, "step": 1670 }, { "epoch": 0.13048543689320388, "grad_norm": 0.23911397923405153, "learning_rate": 9.824354408294695e-06, "loss": 0.0485, "step": 1680 }, { "epoch": 0.1312621359223301, "grad_norm": 0.2084272987235491, "learning_rate": 9.820964393187718e-06, "loss": 0.0457, "step": 1690 }, { "epoch": 0.13203883495145632, "grad_norm": 0.15662671593834604, "learning_rate": 9.81754257204826e-06, "loss": 0.0477, "step": 1700 }, { "epoch": 0.13281553398058252, "grad_norm": 0.24920901131716153, "learning_rate": 9.814088967451585e-06, "loss": 0.0527, "step": 1710 }, { "epoch": 0.13359223300970874, "grad_norm": 0.35551886820543205, "learning_rate": 9.810603602182648e-06, "loss": 0.049, "step": 1720 }, { "epoch": 0.13436893203883496, "grad_norm": 0.3279479785662138, "learning_rate": 9.807086499235947e-06, "loss": 0.0505, "step": 1730 }, { "epoch": 0.13514563106796115, "grad_norm": 0.1897498325370596, "learning_rate": 9.803537681815362e-06, "loss": 0.0508, "step": 1740 }, { "epoch": 0.13592233009708737, "grad_norm": 0.12562068545168395, "learning_rate": 9.799957173334009e-06, "loss": 0.0494, "step": 1750 }, { "epoch": 0.1366990291262136, "grad_norm": 0.3003479984638896, "learning_rate": 9.796344997414087e-06, "loss": 0.0466, "step": 1760 }, { "epoch": 0.13747572815533982, "grad_norm": 0.16517254973702666, "learning_rate": 9.792701177886714e-06, "loss": 0.0515, "step": 1770 }, { "epoch": 0.138252427184466, "grad_norm": 0.1373737081002768, "learning_rate": 9.789025738791778e-06, "loss": 0.0444, "step": 1780 }, { "epoch": 0.13902912621359223, "grad_norm": 0.24047523488644235, "learning_rate": 9.785318704377778e-06, "loss": 0.0487, "step": 1790 }, { "epoch": 0.13980582524271845, "grad_norm": 0.19083851367523677, "learning_rate": 9.781580099101655e-06, "loss": 0.0465, "step": 1800 }, { "epoch": 0.14058252427184467, "grad_norm": 0.2625116271212534, "learning_rate": 9.777809947628641e-06, "loss": 0.0478, "step": 1810 }, { "epoch": 0.14135922330097087, "grad_norm": 0.2338826470351427, "learning_rate": 9.774008274832091e-06, "loss": 0.0437, "step": 1820 }, { "epoch": 0.1421359223300971, "grad_norm": 0.34474787083713043, "learning_rate": 9.770175105793324e-06, "loss": 0.0503, "step": 1830 }, { "epoch": 0.1429126213592233, "grad_norm": 0.15399661933342382, "learning_rate": 9.766310465801445e-06, "loss": 0.0472, "step": 1840 }, { "epoch": 0.1436893203883495, "grad_norm": 0.15399430421277074, "learning_rate": 9.762414380353194e-06, "loss": 0.045, "step": 1850 }, { "epoch": 0.14446601941747572, "grad_norm": 0.1425156823861072, "learning_rate": 9.758486875152766e-06, "loss": 0.045, "step": 1860 }, { "epoch": 0.14524271844660194, "grad_norm": 0.31305062154523244, "learning_rate": 9.754527976111651e-06, "loss": 0.0446, "step": 1870 }, { "epoch": 0.14601941747572816, "grad_norm": 0.27957991791629455, "learning_rate": 9.750537709348457e-06, "loss": 0.0464, "step": 1880 }, { "epoch": 0.14679611650485436, "grad_norm": 0.14181084807995473, "learning_rate": 9.746516101188734e-06, "loss": 0.0429, "step": 1890 }, { "epoch": 0.14757281553398058, "grad_norm": 0.17327103194523136, "learning_rate": 9.74246317816481e-06, "loss": 0.0447, "step": 1900 }, { "epoch": 0.1483495145631068, "grad_norm": 0.12773663959198345, "learning_rate": 9.738378967015606e-06, "loss": 0.0414, "step": 1910 }, { "epoch": 0.14912621359223302, "grad_norm": 0.38175314019843926, "learning_rate": 9.734263494686472e-06, "loss": 0.0439, "step": 1920 }, { "epoch": 0.1499029126213592, "grad_norm": 0.15835370820850259, "learning_rate": 9.730116788328994e-06, "loss": 0.0441, "step": 1930 }, { "epoch": 0.15067961165048543, "grad_norm": 0.21686211705775543, "learning_rate": 9.725938875300829e-06, "loss": 0.0434, "step": 1940 }, { "epoch": 0.15145631067961166, "grad_norm": 0.11718406348895274, "learning_rate": 9.721729783165514e-06, "loss": 0.045, "step": 1950 }, { "epoch": 0.15223300970873788, "grad_norm": 0.2465972132632146, "learning_rate": 9.71748953969229e-06, "loss": 0.0419, "step": 1960 }, { "epoch": 0.15300970873786407, "grad_norm": 0.918853139684945, "learning_rate": 9.713218172855919e-06, "loss": 0.0429, "step": 1970 }, { "epoch": 0.1537864077669903, "grad_norm": 0.1460114717637408, "learning_rate": 9.708915710836492e-06, "loss": 0.0417, "step": 1980 }, { "epoch": 0.1545631067961165, "grad_norm": 0.15285612850375802, "learning_rate": 9.704582182019255e-06, "loss": 0.0441, "step": 1990 }, { "epoch": 0.1553398058252427, "grad_norm": 0.14577220665014945, "learning_rate": 9.700217614994411e-06, "loss": 0.0428, "step": 2000 }, { "epoch": 0.1553398058252427, "eval_loss": 0.045544762164354324, "eval_runtime": 1428.209, "eval_samples_per_second": 3.93, "eval_steps_per_second": 0.281, "step": 2000 }, { "epoch": 0.15611650485436893, "grad_norm": 0.39483996549331163, "learning_rate": 9.695822038556938e-06, "loss": 0.0476, "step": 2010 }, { "epoch": 0.15689320388349515, "grad_norm": 0.17008310552276865, "learning_rate": 9.691395481706397e-06, "loss": 0.0425, "step": 2020 }, { "epoch": 0.15766990291262137, "grad_norm": 0.19588181773428445, "learning_rate": 9.686937973646736e-06, "loss": 0.0445, "step": 2030 }, { "epoch": 0.15844660194174756, "grad_norm": 0.24307625431650168, "learning_rate": 9.68244954378611e-06, "loss": 0.0459, "step": 2040 }, { "epoch": 0.15922330097087378, "grad_norm": 0.20395398207234372, "learning_rate": 9.677930221736671e-06, "loss": 0.0441, "step": 2050 }, { "epoch": 0.16, "grad_norm": 0.1607702539431335, "learning_rate": 9.673380037314386e-06, "loss": 0.0429, "step": 2060 }, { "epoch": 0.16077669902912622, "grad_norm": 0.1209049083294209, "learning_rate": 9.668799020538831e-06, "loss": 0.0426, "step": 2070 }, { "epoch": 0.16155339805825242, "grad_norm": 0.14172446153158563, "learning_rate": 9.664187201632999e-06, "loss": 0.041, "step": 2080 }, { "epoch": 0.16233009708737864, "grad_norm": 0.19674267778098548, "learning_rate": 9.659544611023097e-06, "loss": 0.0467, "step": 2090 }, { "epoch": 0.16310679611650486, "grad_norm": 0.18247172496992206, "learning_rate": 9.654871279338344e-06, "loss": 0.0419, "step": 2100 }, { "epoch": 0.16388349514563108, "grad_norm": 0.21381187543590863, "learning_rate": 9.65016723741078e-06, "loss": 0.0454, "step": 2110 }, { "epoch": 0.16466019417475727, "grad_norm": 0.1844303564045429, "learning_rate": 9.645432516275042e-06, "loss": 0.0437, "step": 2120 }, { "epoch": 0.1654368932038835, "grad_norm": 0.12898404337238012, "learning_rate": 9.640667147168182e-06, "loss": 0.0453, "step": 2130 }, { "epoch": 0.16621359223300972, "grad_norm": 0.1512785037521986, "learning_rate": 9.635871161529445e-06, "loss": 0.0448, "step": 2140 }, { "epoch": 0.1669902912621359, "grad_norm": 0.18452005504086405, "learning_rate": 9.631044591000065e-06, "loss": 0.0431, "step": 2150 }, { "epoch": 0.16776699029126213, "grad_norm": 0.18866698349715472, "learning_rate": 9.626187467423066e-06, "loss": 0.0424, "step": 2160 }, { "epoch": 0.16854368932038835, "grad_norm": 0.1668717475167489, "learning_rate": 9.62129982284303e-06, "loss": 0.0449, "step": 2170 }, { "epoch": 0.16932038834951457, "grad_norm": 0.17010427418245216, "learning_rate": 9.616381689505918e-06, "loss": 0.0435, "step": 2180 }, { "epoch": 0.17009708737864077, "grad_norm": 0.15143205535959225, "learning_rate": 9.611433099858823e-06, "loss": 0.0419, "step": 2190 }, { "epoch": 0.170873786407767, "grad_norm": 0.18406797971348265, "learning_rate": 9.606454086549779e-06, "loss": 0.041, "step": 2200 }, { "epoch": 0.1716504854368932, "grad_norm": 0.14100628061667672, "learning_rate": 9.601444682427537e-06, "loss": 0.0419, "step": 2210 }, { "epoch": 0.17242718446601943, "grad_norm": 0.18663647409928352, "learning_rate": 9.59640492054135e-06, "loss": 0.0412, "step": 2220 }, { "epoch": 0.17320388349514562, "grad_norm": 0.12798005716251423, "learning_rate": 9.591334834140752e-06, "loss": 0.0433, "step": 2230 }, { "epoch": 0.17398058252427184, "grad_norm": 0.16964659715487737, "learning_rate": 9.586234456675348e-06, "loss": 0.0428, "step": 2240 }, { "epoch": 0.17475728155339806, "grad_norm": 0.1163947125377324, "learning_rate": 9.58110382179458e-06, "loss": 0.041, "step": 2250 }, { "epoch": 0.17553398058252428, "grad_norm": 0.1492961232935505, "learning_rate": 9.57594296334751e-06, "loss": 0.0414, "step": 2260 }, { "epoch": 0.17631067961165048, "grad_norm": 0.2347434884858462, "learning_rate": 9.570751915382605e-06, "loss": 0.0435, "step": 2270 }, { "epoch": 0.1770873786407767, "grad_norm": 0.32362390609943836, "learning_rate": 9.565530712147505e-06, "loss": 0.0412, "step": 2280 }, { "epoch": 0.17786407766990292, "grad_norm": 0.2180939652540125, "learning_rate": 9.560279388088788e-06, "loss": 0.0415, "step": 2290 }, { "epoch": 0.1786407766990291, "grad_norm": 0.22823478710750025, "learning_rate": 9.554997977851766e-06, "loss": 0.0407, "step": 2300 }, { "epoch": 0.17941747572815533, "grad_norm": 0.35066238612897815, "learning_rate": 9.549686516280233e-06, "loss": 0.0427, "step": 2310 }, { "epoch": 0.18019417475728156, "grad_norm": 0.14517907284634626, "learning_rate": 9.544345038416248e-06, "loss": 0.041, "step": 2320 }, { "epoch": 0.18097087378640778, "grad_norm": 0.11721105412186893, "learning_rate": 9.538973579499902e-06, "loss": 0.04, "step": 2330 }, { "epoch": 0.18174757281553397, "grad_norm": 0.09396284367677303, "learning_rate": 9.533572174969083e-06, "loss": 0.0395, "step": 2340 }, { "epoch": 0.1825242718446602, "grad_norm": 0.16835437638033476, "learning_rate": 9.528140860459243e-06, "loss": 0.0426, "step": 2350 }, { "epoch": 0.1833009708737864, "grad_norm": 0.130224491897812, "learning_rate": 9.522679671803163e-06, "loss": 0.0428, "step": 2360 }, { "epoch": 0.18407766990291263, "grad_norm": 0.11762091536139932, "learning_rate": 9.517188645030722e-06, "loss": 0.0391, "step": 2370 }, { "epoch": 0.18485436893203883, "grad_norm": 0.13470966701382048, "learning_rate": 9.511667816368644e-06, "loss": 0.0399, "step": 2380 }, { "epoch": 0.18563106796116505, "grad_norm": 0.2518098197985445, "learning_rate": 9.50611722224028e-06, "loss": 0.0617, "step": 2390 }, { "epoch": 0.18640776699029127, "grad_norm": 0.24593386008354953, "learning_rate": 9.50053689926535e-06, "loss": 0.0422, "step": 2400 }, { "epoch": 0.1871844660194175, "grad_norm": 0.1623148714462029, "learning_rate": 9.494926884259712e-06, "loss": 0.0413, "step": 2410 }, { "epoch": 0.18796116504854368, "grad_norm": 0.147708085428062, "learning_rate": 9.489287214235112e-06, "loss": 0.0396, "step": 2420 }, { "epoch": 0.1887378640776699, "grad_norm": 0.21027234660728417, "learning_rate": 9.483617926398947e-06, "loss": 0.04, "step": 2430 }, { "epoch": 0.18951456310679612, "grad_norm": 0.2506469917389465, "learning_rate": 9.477919058154014e-06, "loss": 0.0401, "step": 2440 }, { "epoch": 0.19029126213592232, "grad_norm": 0.13951039793931203, "learning_rate": 9.472190647098265e-06, "loss": 0.0384, "step": 2450 }, { "epoch": 0.19106796116504854, "grad_norm": 0.23646865904763428, "learning_rate": 9.466432731024559e-06, "loss": 0.0419, "step": 2460 }, { "epoch": 0.19184466019417476, "grad_norm": 0.2009902332369228, "learning_rate": 9.460645347920415e-06, "loss": 0.0448, "step": 2470 }, { "epoch": 0.19262135922330098, "grad_norm": 0.2558601409951373, "learning_rate": 9.454828535967752e-06, "loss": 0.0408, "step": 2480 }, { "epoch": 0.19339805825242717, "grad_norm": 0.16355060873497454, "learning_rate": 9.448982333542651e-06, "loss": 0.042, "step": 2490 }, { "epoch": 0.1941747572815534, "grad_norm": 0.1685548686857409, "learning_rate": 9.443106779215096e-06, "loss": 0.0392, "step": 2500 }, { "epoch": 0.19495145631067962, "grad_norm": 0.16568346354115016, "learning_rate": 9.43720191174871e-06, "loss": 0.0411, "step": 2510 }, { "epoch": 0.19572815533980584, "grad_norm": 0.13553460197922218, "learning_rate": 9.431267770100518e-06, "loss": 0.0418, "step": 2520 }, { "epoch": 0.19650485436893203, "grad_norm": 0.24390290239067297, "learning_rate": 9.42530439342067e-06, "loss": 0.0384, "step": 2530 }, { "epoch": 0.19728155339805825, "grad_norm": 0.2682863979283329, "learning_rate": 9.4193118210522e-06, "loss": 0.0417, "step": 2540 }, { "epoch": 0.19805825242718447, "grad_norm": 0.108842339601181, "learning_rate": 9.413290092530756e-06, "loss": 0.0463, "step": 2550 }, { "epoch": 0.1988349514563107, "grad_norm": 0.12501492222930413, "learning_rate": 9.407239247584343e-06, "loss": 0.0421, "step": 2560 }, { "epoch": 0.1996116504854369, "grad_norm": 0.16994619127780208, "learning_rate": 9.401159326133055e-06, "loss": 0.0407, "step": 2570 }, { "epoch": 0.2003883495145631, "grad_norm": 0.17961088560638136, "learning_rate": 9.395050368288825e-06, "loss": 0.0428, "step": 2580 }, { "epoch": 0.20116504854368933, "grad_norm": 0.2824268929785853, "learning_rate": 9.388912414355145e-06, "loss": 0.0405, "step": 2590 }, { "epoch": 0.20194174757281552, "grad_norm": 0.1716278298012604, "learning_rate": 9.38274550482681e-06, "loss": 0.0411, "step": 2600 }, { "epoch": 0.20271844660194174, "grad_norm": 0.18878689803869975, "learning_rate": 9.376549680389648e-06, "loss": 0.0398, "step": 2610 }, { "epoch": 0.20349514563106796, "grad_norm": 0.2501598696457349, "learning_rate": 9.370324981920249e-06, "loss": 0.0409, "step": 2620 }, { "epoch": 0.20427184466019419, "grad_norm": 0.15585706757633877, "learning_rate": 9.3640714504857e-06, "loss": 0.0396, "step": 2630 }, { "epoch": 0.20504854368932038, "grad_norm": 0.09627525971748133, "learning_rate": 9.357789127343315e-06, "loss": 0.0385, "step": 2640 }, { "epoch": 0.2058252427184466, "grad_norm": 0.46842001489464064, "learning_rate": 9.35147805394035e-06, "loss": 0.0407, "step": 2650 }, { "epoch": 0.20660194174757282, "grad_norm": 0.16264009156933132, "learning_rate": 9.34513827191375e-06, "loss": 0.0436, "step": 2660 }, { "epoch": 0.20737864077669904, "grad_norm": 0.14048476899301615, "learning_rate": 9.338769823089853e-06, "loss": 0.0434, "step": 2670 }, { "epoch": 0.20815533980582523, "grad_norm": 0.15037619784510675, "learning_rate": 9.332372749484138e-06, "loss": 0.0424, "step": 2680 }, { "epoch": 0.20893203883495146, "grad_norm": 0.2599159849048256, "learning_rate": 9.325947093300918e-06, "loss": 0.0391, "step": 2690 }, { "epoch": 0.20970873786407768, "grad_norm": 0.2885238376852637, "learning_rate": 9.319492896933094e-06, "loss": 0.0401, "step": 2700 }, { "epoch": 0.21048543689320387, "grad_norm": 0.17588375800121311, "learning_rate": 9.313010202961845e-06, "loss": 0.0392, "step": 2710 }, { "epoch": 0.2112621359223301, "grad_norm": 0.16320789876382566, "learning_rate": 9.306499054156373e-06, "loss": 0.0424, "step": 2720 }, { "epoch": 0.2120388349514563, "grad_norm": 0.21701776773889916, "learning_rate": 9.2999594934736e-06, "loss": 0.0405, "step": 2730 }, { "epoch": 0.21281553398058253, "grad_norm": 0.22061658248470453, "learning_rate": 9.293391564057898e-06, "loss": 0.0409, "step": 2740 }, { "epoch": 0.21359223300970873, "grad_norm": 0.14695030110390778, "learning_rate": 9.286795309240804e-06, "loss": 0.0373, "step": 2750 }, { "epoch": 0.21436893203883495, "grad_norm": 0.13794042961616945, "learning_rate": 9.280170772540722e-06, "loss": 0.0399, "step": 2760 }, { "epoch": 0.21514563106796117, "grad_norm": 0.18829191950820665, "learning_rate": 9.27351799766265e-06, "loss": 0.0406, "step": 2770 }, { "epoch": 0.2159223300970874, "grad_norm": 0.18301585587769434, "learning_rate": 9.26683702849788e-06, "loss": 0.039, "step": 2780 }, { "epoch": 0.21669902912621358, "grad_norm": 0.1962904858545287, "learning_rate": 9.260127909123722e-06, "loss": 0.0428, "step": 2790 }, { "epoch": 0.2174757281553398, "grad_norm": 0.11720933833658112, "learning_rate": 9.253390683803201e-06, "loss": 0.0385, "step": 2800 }, { "epoch": 0.21825242718446602, "grad_norm": 0.11623092934180192, "learning_rate": 9.246625396984766e-06, "loss": 0.0393, "step": 2810 }, { "epoch": 0.21902912621359225, "grad_norm": 0.21669742674675474, "learning_rate": 9.239832093302007e-06, "loss": 0.0383, "step": 2820 }, { "epoch": 0.21980582524271844, "grad_norm": 0.13726628107618066, "learning_rate": 9.233010817573352e-06, "loss": 0.041, "step": 2830 }, { "epoch": 0.22058252427184466, "grad_norm": 0.1821251489179769, "learning_rate": 9.226161614801765e-06, "loss": 0.0396, "step": 2840 }, { "epoch": 0.22135922330097088, "grad_norm": 0.16309665159266318, "learning_rate": 9.219284530174469e-06, "loss": 0.039, "step": 2850 }, { "epoch": 0.22213592233009707, "grad_norm": 0.19698537682582234, "learning_rate": 9.21237960906263e-06, "loss": 0.0388, "step": 2860 }, { "epoch": 0.2229126213592233, "grad_norm": 0.14524800205718985, "learning_rate": 9.20544689702106e-06, "loss": 0.0401, "step": 2870 }, { "epoch": 0.22368932038834952, "grad_norm": 0.19390470448510735, "learning_rate": 9.198486439787926e-06, "loss": 0.0416, "step": 2880 }, { "epoch": 0.22446601941747574, "grad_norm": 0.10805763937835576, "learning_rate": 9.191498283284441e-06, "loss": 0.0394, "step": 2890 }, { "epoch": 0.22524271844660193, "grad_norm": 0.15618881648758767, "learning_rate": 9.184482473614562e-06, "loss": 0.0368, "step": 2900 }, { "epoch": 0.22601941747572815, "grad_norm": 0.15561605470147402, "learning_rate": 9.177439057064684e-06, "loss": 0.0383, "step": 2910 }, { "epoch": 0.22679611650485437, "grad_norm": 0.109158075852671, "learning_rate": 9.170368080103339e-06, "loss": 0.038, "step": 2920 }, { "epoch": 0.2275728155339806, "grad_norm": 0.1825877834992463, "learning_rate": 9.163269589380886e-06, "loss": 0.0522, "step": 2930 }, { "epoch": 0.2283495145631068, "grad_norm": 0.27823570376024215, "learning_rate": 9.156143631729205e-06, "loss": 0.0394, "step": 2940 }, { "epoch": 0.229126213592233, "grad_norm": 0.37658349854044304, "learning_rate": 9.14899025416139e-06, "loss": 0.0379, "step": 2950 }, { "epoch": 0.22990291262135923, "grad_norm": 0.2352698339146687, "learning_rate": 9.14180950387143e-06, "loss": 0.0395, "step": 2960 }, { "epoch": 0.23067961165048545, "grad_norm": 0.17496572999151425, "learning_rate": 9.134601428233914e-06, "loss": 0.0372, "step": 2970 }, { "epoch": 0.23145631067961164, "grad_norm": 0.15246180879353766, "learning_rate": 9.127366074803696e-06, "loss": 0.04, "step": 2980 }, { "epoch": 0.23223300970873786, "grad_norm": 0.13478314802598934, "learning_rate": 9.120103491315602e-06, "loss": 0.0398, "step": 2990 }, { "epoch": 0.23300970873786409, "grad_norm": 0.20997620620386132, "learning_rate": 9.112813725684104e-06, "loss": 0.04, "step": 3000 }, { "epoch": 0.23300970873786409, "eval_loss": 0.040163930505514145, "eval_runtime": 1425.5289, "eval_samples_per_second": 3.937, "eval_steps_per_second": 0.281, "step": 3000 }, { "epoch": 0.23378640776699028, "grad_norm": 0.20702308668717662, "learning_rate": 9.10549682600301e-06, "loss": 0.0388, "step": 3010 }, { "epoch": 0.2345631067961165, "grad_norm": 0.21303578756374209, "learning_rate": 9.098152840545138e-06, "loss": 0.0413, "step": 3020 }, { "epoch": 0.23533980582524272, "grad_norm": 0.1197762790397557, "learning_rate": 9.090781817762007e-06, "loss": 0.0379, "step": 3030 }, { "epoch": 0.23611650485436894, "grad_norm": 0.13687990676668624, "learning_rate": 9.083383806283507e-06, "loss": 0.0389, "step": 3040 }, { "epoch": 0.23689320388349513, "grad_norm": 2.7535579511167345, "learning_rate": 9.075958854917595e-06, "loss": 0.0395, "step": 3050 }, { "epoch": 0.23766990291262136, "grad_norm": 0.11729659843794633, "learning_rate": 9.068507012649955e-06, "loss": 0.0396, "step": 3060 }, { "epoch": 0.23844660194174758, "grad_norm": 0.1986330675743394, "learning_rate": 9.06102832864368e-06, "loss": 0.0401, "step": 3070 }, { "epoch": 0.2392233009708738, "grad_norm": 0.12807847521879387, "learning_rate": 9.053522852238953e-06, "loss": 0.0396, "step": 3080 }, { "epoch": 0.24, "grad_norm": 0.2621452626867742, "learning_rate": 9.045990632952724e-06, "loss": 0.0394, "step": 3090 }, { "epoch": 0.2407766990291262, "grad_norm": 0.21819286539899255, "learning_rate": 9.038431720478369e-06, "loss": 0.038, "step": 3100 }, { "epoch": 0.24155339805825243, "grad_norm": 0.14753576767210358, "learning_rate": 9.030846164685371e-06, "loss": 0.039, "step": 3110 }, { "epoch": 0.24233009708737865, "grad_norm": 0.2930531269972848, "learning_rate": 9.023234015619e-06, "loss": 0.0378, "step": 3120 }, { "epoch": 0.24310679611650485, "grad_norm": 0.2883522659866763, "learning_rate": 9.015595323499962e-06, "loss": 0.0375, "step": 3130 }, { "epoch": 0.24388349514563107, "grad_norm": 0.09652813018972847, "learning_rate": 9.007930138724084e-06, "loss": 0.0385, "step": 3140 }, { "epoch": 0.2446601941747573, "grad_norm": 0.15341104314350587, "learning_rate": 9.000238511861983e-06, "loss": 0.0385, "step": 3150 }, { "epoch": 0.24543689320388348, "grad_norm": 0.12952934565430457, "learning_rate": 8.992520493658713e-06, "loss": 0.0362, "step": 3160 }, { "epoch": 0.2462135922330097, "grad_norm": 0.1427496468010825, "learning_rate": 8.984776135033456e-06, "loss": 0.0401, "step": 3170 }, { "epoch": 0.24699029126213592, "grad_norm": 0.15748175721479005, "learning_rate": 8.977005487079165e-06, "loss": 0.0364, "step": 3180 }, { "epoch": 0.24776699029126215, "grad_norm": 0.1144589972583889, "learning_rate": 8.96920860106224e-06, "loss": 0.0393, "step": 3190 }, { "epoch": 0.24854368932038834, "grad_norm": 0.12853589001322274, "learning_rate": 8.961385528422184e-06, "loss": 0.0365, "step": 3200 }, { "epoch": 0.24932038834951456, "grad_norm": 0.12227227584193531, "learning_rate": 8.953536320771264e-06, "loss": 0.0364, "step": 3210 }, { "epoch": 0.2500970873786408, "grad_norm": 0.2790272087091698, "learning_rate": 8.94566102989417e-06, "loss": 0.0372, "step": 3220 }, { "epoch": 0.250873786407767, "grad_norm": 0.13724560347263398, "learning_rate": 8.937759707747675e-06, "loss": 0.0376, "step": 3230 }, { "epoch": 0.2516504854368932, "grad_norm": 0.12745557672861768, "learning_rate": 8.929832406460298e-06, "loss": 0.0402, "step": 3240 }, { "epoch": 0.2524271844660194, "grad_norm": 0.12319243451568349, "learning_rate": 8.921879178331943e-06, "loss": 0.0383, "step": 3250 }, { "epoch": 0.2532038834951456, "grad_norm": 0.12992064370185671, "learning_rate": 8.913900075833572e-06, "loss": 0.0382, "step": 3260 }, { "epoch": 0.25398058252427186, "grad_norm": 0.15867900046302788, "learning_rate": 8.905895151606853e-06, "loss": 0.0392, "step": 3270 }, { "epoch": 0.25475728155339805, "grad_norm": 0.20211481490667213, "learning_rate": 8.897864458463806e-06, "loss": 0.0364, "step": 3280 }, { "epoch": 0.25553398058252424, "grad_norm": 0.14709332621948268, "learning_rate": 8.889808049386462e-06, "loss": 0.0395, "step": 3290 }, { "epoch": 0.2563106796116505, "grad_norm": 0.16790614888557276, "learning_rate": 8.881725977526511e-06, "loss": 0.0394, "step": 3300 }, { "epoch": 0.2570873786407767, "grad_norm": 0.11681705717810983, "learning_rate": 8.873618296204957e-06, "loss": 0.0368, "step": 3310 }, { "epoch": 0.25786407766990294, "grad_norm": 0.14116203389718643, "learning_rate": 8.865485058911754e-06, "loss": 0.0342, "step": 3320 }, { "epoch": 0.25864077669902913, "grad_norm": 0.12358903026259321, "learning_rate": 8.85732631930546e-06, "loss": 0.0362, "step": 3330 }, { "epoch": 0.2594174757281553, "grad_norm": 0.18721878853833843, "learning_rate": 8.84914213121289e-06, "loss": 0.0388, "step": 3340 }, { "epoch": 0.26019417475728157, "grad_norm": 0.165026512287157, "learning_rate": 8.840932548628745e-06, "loss": 0.038, "step": 3350 }, { "epoch": 0.26097087378640776, "grad_norm": 0.19342676294549033, "learning_rate": 8.83269762571527e-06, "loss": 0.0367, "step": 3360 }, { "epoch": 0.26174757281553396, "grad_norm": 0.1323157648567206, "learning_rate": 8.824437416801894e-06, "loss": 0.0358, "step": 3370 }, { "epoch": 0.2625242718446602, "grad_norm": 0.17204698807201935, "learning_rate": 8.816151976384863e-06, "loss": 0.0419, "step": 3380 }, { "epoch": 0.2633009708737864, "grad_norm": 0.15964222423073549, "learning_rate": 8.807841359126887e-06, "loss": 0.0363, "step": 3390 }, { "epoch": 0.26407766990291265, "grad_norm": 0.14330155792081253, "learning_rate": 8.799505619856783e-06, "loss": 0.0376, "step": 3400 }, { "epoch": 0.26485436893203884, "grad_norm": 0.1161131444019064, "learning_rate": 8.791144813569106e-06, "loss": 0.0363, "step": 3410 }, { "epoch": 0.26563106796116503, "grad_norm": 0.15306711242258175, "learning_rate": 8.78275899542379e-06, "loss": 0.0385, "step": 3420 }, { "epoch": 0.2664077669902913, "grad_norm": 0.263646730975128, "learning_rate": 8.774348220745783e-06, "loss": 0.0377, "step": 3430 }, { "epoch": 0.2671844660194175, "grad_norm": 0.13595994628467736, "learning_rate": 8.765912545024681e-06, "loss": 0.0371, "step": 3440 }, { "epoch": 0.26796116504854367, "grad_norm": 0.14980346415101303, "learning_rate": 8.757452023914365e-06, "loss": 0.0365, "step": 3450 }, { "epoch": 0.2687378640776699, "grad_norm": 0.19942911796599366, "learning_rate": 8.748966713232632e-06, "loss": 0.0388, "step": 3460 }, { "epoch": 0.2695145631067961, "grad_norm": 0.1460655031371832, "learning_rate": 8.740456668960826e-06, "loss": 0.0393, "step": 3470 }, { "epoch": 0.2702912621359223, "grad_norm": 0.21984360028356562, "learning_rate": 8.73192194724347e-06, "loss": 0.0447, "step": 3480 }, { "epoch": 0.27106796116504855, "grad_norm": 0.25412850126515635, "learning_rate": 8.723362604387892e-06, "loss": 0.0365, "step": 3490 }, { "epoch": 0.27184466019417475, "grad_norm": 0.18244816724170185, "learning_rate": 8.714778696863863e-06, "loss": 0.0377, "step": 3500 }, { "epoch": 0.272621359223301, "grad_norm": 0.1473706189067493, "learning_rate": 8.706170281303214e-06, "loss": 0.0371, "step": 3510 }, { "epoch": 0.2733980582524272, "grad_norm": 0.13657398193839615, "learning_rate": 8.697537414499465e-06, "loss": 0.038, "step": 3520 }, { "epoch": 0.2741747572815534, "grad_norm": 0.1758824334642281, "learning_rate": 8.68888015340745e-06, "loss": 0.0361, "step": 3530 }, { "epoch": 0.27495145631067963, "grad_norm": 0.1406201920396608, "learning_rate": 8.68019855514295e-06, "loss": 0.0419, "step": 3540 }, { "epoch": 0.2757281553398058, "grad_norm": 0.22082625330612943, "learning_rate": 8.671492676982308e-06, "loss": 0.037, "step": 3550 }, { "epoch": 0.276504854368932, "grad_norm": 0.19912693856274002, "learning_rate": 8.662762576362043e-06, "loss": 0.0387, "step": 3560 }, { "epoch": 0.27728155339805827, "grad_norm": 0.17722658308090192, "learning_rate": 8.654008310878489e-06, "loss": 0.0359, "step": 3570 }, { "epoch": 0.27805825242718446, "grad_norm": 0.2960438689059768, "learning_rate": 8.645229938287406e-06, "loss": 0.0379, "step": 3580 }, { "epoch": 0.27883495145631065, "grad_norm": 0.11565548528260658, "learning_rate": 8.636427516503594e-06, "loss": 0.0365, "step": 3590 }, { "epoch": 0.2796116504854369, "grad_norm": 0.1047074003726959, "learning_rate": 8.62760110360052e-06, "loss": 0.0371, "step": 3600 }, { "epoch": 0.2803883495145631, "grad_norm": 0.13265287315026214, "learning_rate": 8.618750757809933e-06, "loss": 0.0384, "step": 3610 }, { "epoch": 0.28116504854368934, "grad_norm": 0.10761593196384778, "learning_rate": 8.609876537521474e-06, "loss": 0.0386, "step": 3620 }, { "epoch": 0.28194174757281554, "grad_norm": 0.1396007484029645, "learning_rate": 8.600978501282292e-06, "loss": 0.0399, "step": 3630 }, { "epoch": 0.28271844660194173, "grad_norm": 0.20153111572855906, "learning_rate": 8.592056707796668e-06, "loss": 0.037, "step": 3640 }, { "epoch": 0.283495145631068, "grad_norm": 0.2268046706982376, "learning_rate": 8.583111215925616e-06, "loss": 0.0367, "step": 3650 }, { "epoch": 0.2842718446601942, "grad_norm": 0.10470900403408513, "learning_rate": 8.5741420846865e-06, "loss": 0.0369, "step": 3660 }, { "epoch": 0.28504854368932037, "grad_norm": 0.176872394060829, "learning_rate": 8.565149373252637e-06, "loss": 0.0365, "step": 3670 }, { "epoch": 0.2858252427184466, "grad_norm": 0.11377720023349941, "learning_rate": 8.556133140952923e-06, "loss": 0.0372, "step": 3680 }, { "epoch": 0.2866019417475728, "grad_norm": 0.1306112036855491, "learning_rate": 8.547093447271424e-06, "loss": 0.0407, "step": 3690 }, { "epoch": 0.287378640776699, "grad_norm": 0.12407158658791907, "learning_rate": 8.538030351846996e-06, "loss": 0.0369, "step": 3700 }, { "epoch": 0.28815533980582525, "grad_norm": 0.2047381512365336, "learning_rate": 8.528943914472882e-06, "loss": 0.0378, "step": 3710 }, { "epoch": 0.28893203883495144, "grad_norm": 0.14220139202161663, "learning_rate": 8.51983419509633e-06, "loss": 0.0336, "step": 3720 }, { "epoch": 0.2897087378640777, "grad_norm": 0.20291559159085565, "learning_rate": 8.510701253818177e-06, "loss": 0.035, "step": 3730 }, { "epoch": 0.2904854368932039, "grad_norm": 0.1324819782285439, "learning_rate": 8.501545150892478e-06, "loss": 0.048, "step": 3740 }, { "epoch": 0.2912621359223301, "grad_norm": 0.12652425999142733, "learning_rate": 8.492365946726087e-06, "loss": 0.0372, "step": 3750 }, { "epoch": 0.2920388349514563, "grad_norm": 0.1173929063214391, "learning_rate": 8.483163701878274e-06, "loss": 0.0361, "step": 3760 }, { "epoch": 0.2928155339805825, "grad_norm": 0.1503099202113078, "learning_rate": 8.473938477060309e-06, "loss": 0.0376, "step": 3770 }, { "epoch": 0.2935922330097087, "grad_norm": 0.1040720726339797, "learning_rate": 8.46469033313508e-06, "loss": 0.0366, "step": 3780 }, { "epoch": 0.29436893203883496, "grad_norm": 0.10487252783389965, "learning_rate": 8.455419331116679e-06, "loss": 0.0374, "step": 3790 }, { "epoch": 0.29514563106796116, "grad_norm": 0.12066469701655427, "learning_rate": 8.446125532170005e-06, "loss": 0.0347, "step": 3800 }, { "epoch": 0.2959223300970874, "grad_norm": 0.5058234892981236, "learning_rate": 8.436808997610352e-06, "loss": 0.0377, "step": 3810 }, { "epoch": 0.2966990291262136, "grad_norm": 0.15329432259269354, "learning_rate": 8.42746978890302e-06, "loss": 0.0388, "step": 3820 }, { "epoch": 0.2974757281553398, "grad_norm": 0.29166576655403853, "learning_rate": 8.418107967662894e-06, "loss": 0.0363, "step": 3830 }, { "epoch": 0.29825242718446604, "grad_norm": 0.12606930898393662, "learning_rate": 8.408723595654046e-06, "loss": 0.036, "step": 3840 }, { "epoch": 0.29902912621359223, "grad_norm": 0.14758829275889757, "learning_rate": 8.399316734789323e-06, "loss": 0.0395, "step": 3850 }, { "epoch": 0.2998058252427184, "grad_norm": 0.1431397416755665, "learning_rate": 8.389887447129947e-06, "loss": 0.036, "step": 3860 }, { "epoch": 0.3005825242718447, "grad_norm": 0.14268861166172467, "learning_rate": 8.38043579488509e-06, "loss": 0.0363, "step": 3870 }, { "epoch": 0.30135922330097087, "grad_norm": 1.2279802746257964, "learning_rate": 8.370961840411486e-06, "loss": 0.0358, "step": 3880 }, { "epoch": 0.30213592233009706, "grad_norm": 0.11610002995374469, "learning_rate": 8.361465646212993e-06, "loss": 0.0385, "step": 3890 }, { "epoch": 0.3029126213592233, "grad_norm": 0.19499660283806397, "learning_rate": 8.351947274940202e-06, "loss": 0.0361, "step": 3900 }, { "epoch": 0.3036893203883495, "grad_norm": 0.09908423287530024, "learning_rate": 8.342406789390017e-06, "loss": 0.0349, "step": 3910 }, { "epoch": 0.30446601941747575, "grad_norm": 0.13228393476703243, "learning_rate": 8.332844252505232e-06, "loss": 0.037, "step": 3920 }, { "epoch": 0.30524271844660195, "grad_norm": 0.09121730852763026, "learning_rate": 8.323259727374134e-06, "loss": 0.0334, "step": 3930 }, { "epoch": 0.30601941747572814, "grad_norm": 0.09638895093625847, "learning_rate": 8.313653277230066e-06, "loss": 0.0337, "step": 3940 }, { "epoch": 0.3067961165048544, "grad_norm": 0.11400078717538539, "learning_rate": 8.30402496545103e-06, "loss": 0.0365, "step": 3950 }, { "epoch": 0.3075728155339806, "grad_norm": 0.22453116099119588, "learning_rate": 8.294374855559247e-06, "loss": 0.0358, "step": 3960 }, { "epoch": 0.3083495145631068, "grad_norm": 0.17317858903654368, "learning_rate": 8.284703011220763e-06, "loss": 0.0368, "step": 3970 }, { "epoch": 0.309126213592233, "grad_norm": 0.1232074053067404, "learning_rate": 8.275009496245004e-06, "loss": 0.0355, "step": 3980 }, { "epoch": 0.3099029126213592, "grad_norm": 0.1231145253475705, "learning_rate": 8.265294374584374e-06, "loss": 0.0376, "step": 3990 }, { "epoch": 0.3106796116504854, "grad_norm": 0.1276412881059381, "learning_rate": 8.255557710333824e-06, "loss": 0.0349, "step": 4000 }, { "epoch": 0.3106796116504854, "eval_loss": 0.03705970197916031, "eval_runtime": 1428.3921, "eval_samples_per_second": 3.93, "eval_steps_per_second": 0.281, "step": 4000 }, { "epoch": 0.31145631067961166, "grad_norm": 0.09612031825937363, "learning_rate": 8.24579956773043e-06, "loss": 0.0393, "step": 4010 }, { "epoch": 0.31223300970873785, "grad_norm": 0.12934622409523255, "learning_rate": 8.23602001115297e-06, "loss": 0.0365, "step": 4020 }, { "epoch": 0.3130097087378641, "grad_norm": 0.2603991125750957, "learning_rate": 8.226219105121503e-06, "loss": 0.0345, "step": 4030 }, { "epoch": 0.3137864077669903, "grad_norm": 0.1931987312888327, "learning_rate": 8.216396914296935e-06, "loss": 0.0341, "step": 4040 }, { "epoch": 0.3145631067961165, "grad_norm": 0.17131513876388083, "learning_rate": 8.206553503480599e-06, "loss": 0.0396, "step": 4050 }, { "epoch": 0.31533980582524274, "grad_norm": 0.09412854076073641, "learning_rate": 8.196688937613827e-06, "loss": 0.0441, "step": 4060 }, { "epoch": 0.31611650485436893, "grad_norm": 0.10184613301965349, "learning_rate": 8.18680328177752e-06, "loss": 0.035, "step": 4070 }, { "epoch": 0.3168932038834951, "grad_norm": 0.09064844629103995, "learning_rate": 8.176896601191714e-06, "loss": 0.0361, "step": 4080 }, { "epoch": 0.31766990291262137, "grad_norm": 0.11876904609837882, "learning_rate": 8.166968961215164e-06, "loss": 0.035, "step": 4090 }, { "epoch": 0.31844660194174756, "grad_norm": 0.13785677945573346, "learning_rate": 8.157020427344895e-06, "loss": 0.0438, "step": 4100 }, { "epoch": 0.3192233009708738, "grad_norm": 0.11572898368301965, "learning_rate": 8.147051065215782e-06, "loss": 0.0352, "step": 4110 }, { "epoch": 0.32, "grad_norm": 0.4222173120755279, "learning_rate": 8.13706094060011e-06, "loss": 0.0346, "step": 4120 }, { "epoch": 0.3207766990291262, "grad_norm": 0.10385678826587273, "learning_rate": 8.12705011940715e-06, "loss": 0.036, "step": 4130 }, { "epoch": 0.32155339805825245, "grad_norm": 0.18815469979194366, "learning_rate": 8.11701866768271e-06, "loss": 0.0372, "step": 4140 }, { "epoch": 0.32233009708737864, "grad_norm": 0.09964969700889972, "learning_rate": 8.106966651608712e-06, "loss": 0.0569, "step": 4150 }, { "epoch": 0.32310679611650484, "grad_norm": 0.12258252018565215, "learning_rate": 8.096894137502745e-06, "loss": 0.0347, "step": 4160 }, { "epoch": 0.3238834951456311, "grad_norm": 0.146088707069061, "learning_rate": 8.086801191817638e-06, "loss": 0.0373, "step": 4170 }, { "epoch": 0.3246601941747573, "grad_norm": 0.14602672392114585, "learning_rate": 8.076687881141012e-06, "loss": 0.0368, "step": 4180 }, { "epoch": 0.32543689320388347, "grad_norm": 0.11683776160225515, "learning_rate": 8.06655427219485e-06, "loss": 0.0344, "step": 4190 }, { "epoch": 0.3262135922330097, "grad_norm": 0.5961848175267904, "learning_rate": 8.056400431835045e-06, "loss": 0.0345, "step": 4200 }, { "epoch": 0.3269902912621359, "grad_norm": 0.14260822647042384, "learning_rate": 8.04622642705097e-06, "loss": 0.0331, "step": 4210 }, { "epoch": 0.32776699029126216, "grad_norm": 0.12826167416298445, "learning_rate": 8.03603232496503e-06, "loss": 0.0341, "step": 4220 }, { "epoch": 0.32854368932038835, "grad_norm": 0.14099136509320068, "learning_rate": 8.025818192832222e-06, "loss": 0.0341, "step": 4230 }, { "epoch": 0.32932038834951455, "grad_norm": 0.152836198846249, "learning_rate": 8.015584098039687e-06, "loss": 0.0361, "step": 4240 }, { "epoch": 0.3300970873786408, "grad_norm": 0.18601757154249116, "learning_rate": 8.005330108106274e-06, "loss": 0.0357, "step": 4250 }, { "epoch": 0.330873786407767, "grad_norm": 0.17082536536688353, "learning_rate": 7.995056290682086e-06, "loss": 0.0379, "step": 4260 }, { "epoch": 0.3316504854368932, "grad_norm": 0.08244058354648247, "learning_rate": 7.984762713548034e-06, "loss": 0.0351, "step": 4270 }, { "epoch": 0.33242718446601943, "grad_norm": 0.08405274278111631, "learning_rate": 7.974449444615397e-06, "loss": 0.0335, "step": 4280 }, { "epoch": 0.3332038834951456, "grad_norm": 0.13237459610692495, "learning_rate": 7.964116551925365e-06, "loss": 0.0345, "step": 4290 }, { "epoch": 0.3339805825242718, "grad_norm": 0.12542664454934851, "learning_rate": 7.9537641036486e-06, "loss": 0.0332, "step": 4300 }, { "epoch": 0.33475728155339807, "grad_norm": 0.1153082853017327, "learning_rate": 7.943392168084775e-06, "loss": 0.034, "step": 4310 }, { "epoch": 0.33553398058252426, "grad_norm": 0.11155039439823741, "learning_rate": 7.933000813662135e-06, "loss": 0.0333, "step": 4320 }, { "epoch": 0.3363106796116505, "grad_norm": 0.12974611511359335, "learning_rate": 7.922590108937036e-06, "loss": 0.035, "step": 4330 }, { "epoch": 0.3370873786407767, "grad_norm": 0.08839479803283527, "learning_rate": 7.9121601225935e-06, "loss": 0.0347, "step": 4340 }, { "epoch": 0.3378640776699029, "grad_norm": 0.11963439717628098, "learning_rate": 7.901710923442751e-06, "loss": 0.0348, "step": 4350 }, { "epoch": 0.33864077669902914, "grad_norm": 0.18763115349868922, "learning_rate": 7.891242580422776e-06, "loss": 0.0332, "step": 4360 }, { "epoch": 0.33941747572815534, "grad_norm": 0.13078588206019445, "learning_rate": 7.880755162597862e-06, "loss": 0.0354, "step": 4370 }, { "epoch": 0.34019417475728153, "grad_norm": 0.13855739737403094, "learning_rate": 7.870248739158135e-06, "loss": 0.036, "step": 4380 }, { "epoch": 0.3409708737864078, "grad_norm": 0.23600179500498844, "learning_rate": 7.859723379419113e-06, "loss": 0.0361, "step": 4390 }, { "epoch": 0.341747572815534, "grad_norm": 0.11629139809071777, "learning_rate": 7.849179152821251e-06, "loss": 0.0349, "step": 4400 }, { "epoch": 0.3425242718446602, "grad_norm": 0.17355486146636037, "learning_rate": 7.838616128929464e-06, "loss": 0.0353, "step": 4410 }, { "epoch": 0.3433009708737864, "grad_norm": 0.17041254313375334, "learning_rate": 7.828034377432694e-06, "loss": 0.0371, "step": 4420 }, { "epoch": 0.3440776699029126, "grad_norm": 0.19969442002907345, "learning_rate": 7.817433968143428e-06, "loss": 0.0334, "step": 4430 }, { "epoch": 0.34485436893203886, "grad_norm": 0.10334445855038885, "learning_rate": 7.806814970997254e-06, "loss": 0.0338, "step": 4440 }, { "epoch": 0.34563106796116505, "grad_norm": 0.07522817494865176, "learning_rate": 7.796177456052385e-06, "loss": 0.032, "step": 4450 }, { "epoch": 0.34640776699029124, "grad_norm": 0.18295227419233565, "learning_rate": 7.785521493489209e-06, "loss": 0.0353, "step": 4460 }, { "epoch": 0.3471844660194175, "grad_norm": 0.14294003401572528, "learning_rate": 7.774847153609822e-06, "loss": 0.0406, "step": 4470 }, { "epoch": 0.3479611650485437, "grad_norm": 0.09240897219622517, "learning_rate": 7.76415450683756e-06, "loss": 0.0354, "step": 4480 }, { "epoch": 0.3487378640776699, "grad_norm": 0.12004613378390809, "learning_rate": 7.753443623716536e-06, "loss": 0.0347, "step": 4490 }, { "epoch": 0.34951456310679613, "grad_norm": 0.09475350224602722, "learning_rate": 7.742714574911183e-06, "loss": 0.0355, "step": 4500 }, { "epoch": 0.3502912621359223, "grad_norm": 0.13702338582316537, "learning_rate": 7.731967431205776e-06, "loss": 0.0351, "step": 4510 }, { "epoch": 0.35106796116504857, "grad_norm": 0.11475977536282522, "learning_rate": 7.721202263503971e-06, "loss": 0.0345, "step": 4520 }, { "epoch": 0.35184466019417476, "grad_norm": 0.09448090755242017, "learning_rate": 7.710419142828334e-06, "loss": 0.0362, "step": 4530 }, { "epoch": 0.35262135922330096, "grad_norm": 0.09275862380468582, "learning_rate": 7.69961814031988e-06, "loss": 0.0351, "step": 4540 }, { "epoch": 0.3533980582524272, "grad_norm": 0.18037400256566363, "learning_rate": 7.688799327237597e-06, "loss": 0.0343, "step": 4550 }, { "epoch": 0.3541747572815534, "grad_norm": 0.12242889685590429, "learning_rate": 7.677962774957971e-06, "loss": 0.0344, "step": 4560 }, { "epoch": 0.3549514563106796, "grad_norm": 0.1310038485347947, "learning_rate": 7.66710855497453e-06, "loss": 0.0359, "step": 4570 }, { "epoch": 0.35572815533980584, "grad_norm": 0.1421289643849169, "learning_rate": 7.656236738897358e-06, "loss": 0.0402, "step": 4580 }, { "epoch": 0.35650485436893203, "grad_norm": 0.15342315619364819, "learning_rate": 7.645347398452631e-06, "loss": 0.0335, "step": 4590 }, { "epoch": 0.3572815533980582, "grad_norm": 0.13428882425225164, "learning_rate": 7.634440605482138e-06, "loss": 0.0371, "step": 4600 }, { "epoch": 0.3580582524271845, "grad_norm": 0.08709091143738386, "learning_rate": 7.623516431942814e-06, "loss": 0.0348, "step": 4610 }, { "epoch": 0.35883495145631067, "grad_norm": 0.16076637030430035, "learning_rate": 7.612574949906258e-06, "loss": 0.0321, "step": 4620 }, { "epoch": 0.3596116504854369, "grad_norm": 0.14313492840662945, "learning_rate": 7.601616231558261e-06, "loss": 0.0329, "step": 4630 }, { "epoch": 0.3603883495145631, "grad_norm": 0.10886658012286689, "learning_rate": 7.590640349198332e-06, "loss": 0.0326, "step": 4640 }, { "epoch": 0.3611650485436893, "grad_norm": 0.11756264595025138, "learning_rate": 7.5796473752392206e-06, "loss": 0.0319, "step": 4650 }, { "epoch": 0.36194174757281555, "grad_norm": 0.25701421872157165, "learning_rate": 7.568637382206426e-06, "loss": 0.0348, "step": 4660 }, { "epoch": 0.36271844660194175, "grad_norm": 0.10912413706255278, "learning_rate": 7.557610442737745e-06, "loss": 0.0337, "step": 4670 }, { "epoch": 0.36349514563106794, "grad_norm": 0.2438126349085161, "learning_rate": 7.546566629582765e-06, "loss": 0.0359, "step": 4680 }, { "epoch": 0.3642718446601942, "grad_norm": 0.10250332556177472, "learning_rate": 7.535506015602405e-06, "loss": 0.0372, "step": 4690 }, { "epoch": 0.3650485436893204, "grad_norm": 0.07809680150883937, "learning_rate": 7.52442867376842e-06, "loss": 0.0345, "step": 4700 }, { "epoch": 0.3658252427184466, "grad_norm": 0.16149331130534295, "learning_rate": 7.513334677162934e-06, "loss": 0.0344, "step": 4710 }, { "epoch": 0.3666019417475728, "grad_norm": 0.12002237435416307, "learning_rate": 7.5022240989779375e-06, "loss": 0.0355, "step": 4720 }, { "epoch": 0.367378640776699, "grad_norm": 0.16137299011530135, "learning_rate": 7.491097012514832e-06, "loss": 0.0356, "step": 4730 }, { "epoch": 0.36815533980582527, "grad_norm": 0.12359558338243708, "learning_rate": 7.479953491183919e-06, "loss": 0.035, "step": 4740 }, { "epoch": 0.36893203883495146, "grad_norm": 0.1076191993813983, "learning_rate": 7.4687936085039355e-06, "loss": 0.0334, "step": 4750 }, { "epoch": 0.36970873786407765, "grad_norm": 0.10075288501871599, "learning_rate": 7.457617438101559e-06, "loss": 0.0352, "step": 4760 }, { "epoch": 0.3704854368932039, "grad_norm": 0.08501045227266753, "learning_rate": 7.446425053710923e-06, "loss": 0.0361, "step": 4770 }, { "epoch": 0.3712621359223301, "grad_norm": 0.14420107074842833, "learning_rate": 7.435216529173135e-06, "loss": 0.0337, "step": 4780 }, { "epoch": 0.3720388349514563, "grad_norm": 0.10082690896416076, "learning_rate": 7.423991938435783e-06, "loss": 0.0356, "step": 4790 }, { "epoch": 0.37281553398058254, "grad_norm": 0.20704917133576548, "learning_rate": 7.412751355552452e-06, "loss": 0.0383, "step": 4800 }, { "epoch": 0.37359223300970873, "grad_norm": 0.12554863180467118, "learning_rate": 7.401494854682236e-06, "loss": 0.0352, "step": 4810 }, { "epoch": 0.374368932038835, "grad_norm": 0.0759064608904525, "learning_rate": 7.390222510089245e-06, "loss": 0.0342, "step": 4820 }, { "epoch": 0.37514563106796117, "grad_norm": 0.135858299564365, "learning_rate": 7.378934396142116e-06, "loss": 0.0329, "step": 4830 }, { "epoch": 0.37592233009708736, "grad_norm": 0.09067316009040932, "learning_rate": 7.367630587313528e-06, "loss": 0.0325, "step": 4840 }, { "epoch": 0.3766990291262136, "grad_norm": 0.11450693789361709, "learning_rate": 7.356311158179698e-06, "loss": 0.0344, "step": 4850 }, { "epoch": 0.3774757281553398, "grad_norm": 0.11466192313267366, "learning_rate": 7.344976183419909e-06, "loss": 0.0335, "step": 4860 }, { "epoch": 0.378252427184466, "grad_norm": 0.21501153283274668, "learning_rate": 7.333625737815993e-06, "loss": 0.0338, "step": 4870 }, { "epoch": 0.37902912621359225, "grad_norm": 0.10782775970702096, "learning_rate": 7.322259896251856e-06, "loss": 0.0323, "step": 4880 }, { "epoch": 0.37980582524271844, "grad_norm": 0.10129295460340167, "learning_rate": 7.3108787337129785e-06, "loss": 0.034, "step": 4890 }, { "epoch": 0.38058252427184464, "grad_norm": 0.12997030314488892, "learning_rate": 7.2994823252859205e-06, "loss": 0.0345, "step": 4900 }, { "epoch": 0.3813592233009709, "grad_norm": 0.1399490229671598, "learning_rate": 7.288070746157822e-06, "loss": 0.0335, "step": 4910 }, { "epoch": 0.3821359223300971, "grad_norm": 0.12480202898871891, "learning_rate": 7.2766440716159135e-06, "loss": 0.0328, "step": 4920 }, { "epoch": 0.3829126213592233, "grad_norm": 0.09648426146275403, "learning_rate": 7.265202377047017e-06, "loss": 0.0346, "step": 4930 }, { "epoch": 0.3836893203883495, "grad_norm": 0.13778383997890792, "learning_rate": 7.253745737937048e-06, "loss": 0.0351, "step": 4940 }, { "epoch": 0.3844660194174757, "grad_norm": 0.1696264671047826, "learning_rate": 7.242274229870518e-06, "loss": 0.0325, "step": 4950 }, { "epoch": 0.38524271844660196, "grad_norm": 0.14921626404439897, "learning_rate": 7.230787928530034e-06, "loss": 0.0358, "step": 4960 }, { "epoch": 0.38601941747572815, "grad_norm": 0.16035405015573642, "learning_rate": 7.219286909695801e-06, "loss": 0.0352, "step": 4970 }, { "epoch": 0.38679611650485435, "grad_norm": 0.10918507194970713, "learning_rate": 7.207771249245124e-06, "loss": 0.0349, "step": 4980 }, { "epoch": 0.3875728155339806, "grad_norm": 0.0979315338798473, "learning_rate": 7.196241023151902e-06, "loss": 0.0335, "step": 4990 }, { "epoch": 0.3883495145631068, "grad_norm": 0.06855490009831777, "learning_rate": 7.1846963074861345e-06, "loss": 0.0384, "step": 5000 }, { "epoch": 0.3883495145631068, "eval_loss": 0.03518132120370865, "eval_runtime": 1429.3423, "eval_samples_per_second": 3.927, "eval_steps_per_second": 0.281, "step": 5000 }, { "epoch": 0.389126213592233, "grad_norm": 0.09429078779259148, "learning_rate": 7.173137178413409e-06, "loss": 0.0342, "step": 5010 }, { "epoch": 0.38990291262135923, "grad_norm": 0.17162973647689572, "learning_rate": 7.16156371219441e-06, "loss": 0.034, "step": 5020 }, { "epoch": 0.3906796116504854, "grad_norm": 0.11140713720447118, "learning_rate": 7.149975985184409e-06, "loss": 0.0341, "step": 5030 }, { "epoch": 0.3914563106796117, "grad_norm": 0.11677013280867167, "learning_rate": 7.13837407383276e-06, "loss": 0.0339, "step": 5040 }, { "epoch": 0.39223300970873787, "grad_norm": 0.10000131541825925, "learning_rate": 7.1267580546824e-06, "loss": 0.0332, "step": 5050 }, { "epoch": 0.39300970873786406, "grad_norm": 0.1240934512903845, "learning_rate": 7.1151280043693405e-06, "loss": 0.0329, "step": 5060 }, { "epoch": 0.3937864077669903, "grad_norm": 0.08319625233828501, "learning_rate": 7.1034839996221626e-06, "loss": 0.0366, "step": 5070 }, { "epoch": 0.3945631067961165, "grad_norm": 0.13963795271553764, "learning_rate": 7.09182611726151e-06, "loss": 0.0326, "step": 5080 }, { "epoch": 0.3953398058252427, "grad_norm": 0.13798386150695127, "learning_rate": 7.0801544341995865e-06, "loss": 0.0331, "step": 5090 }, { "epoch": 0.39611650485436894, "grad_norm": 0.12226196134074617, "learning_rate": 7.068469027439642e-06, "loss": 0.0346, "step": 5100 }, { "epoch": 0.39689320388349514, "grad_norm": 0.1667454429882751, "learning_rate": 7.056769974075466e-06, "loss": 0.0345, "step": 5110 }, { "epoch": 0.3976699029126214, "grad_norm": 0.123964069029111, "learning_rate": 7.045057351290887e-06, "loss": 0.0344, "step": 5120 }, { "epoch": 0.3984466019417476, "grad_norm": 0.10574571317719644, "learning_rate": 7.033331236359254e-06, "loss": 0.0318, "step": 5130 }, { "epoch": 0.3992233009708738, "grad_norm": 0.08910380037622273, "learning_rate": 7.021591706642924e-06, "loss": 0.0336, "step": 5140 }, { "epoch": 0.4, "grad_norm": 0.0951484988485021, "learning_rate": 7.009838839592768e-06, "loss": 0.0358, "step": 5150 }, { "epoch": 0.4007766990291262, "grad_norm": 0.08273264925010433, "learning_rate": 6.998072712747639e-06, "loss": 0.0341, "step": 5160 }, { "epoch": 0.4015533980582524, "grad_norm": 0.1213889926812037, "learning_rate": 6.986293403733877e-06, "loss": 0.0335, "step": 5170 }, { "epoch": 0.40233009708737866, "grad_norm": 0.1552648291290222, "learning_rate": 6.9745009902647896e-06, "loss": 0.0335, "step": 5180 }, { "epoch": 0.40310679611650485, "grad_norm": 0.09817973172700438, "learning_rate": 6.9626955501401395e-06, "loss": 0.0341, "step": 5190 }, { "epoch": 0.40388349514563104, "grad_norm": 1.1366502078441443, "learning_rate": 6.95087716124563e-06, "loss": 0.0337, "step": 5200 }, { "epoch": 0.4046601941747573, "grad_norm": 0.12558671513976358, "learning_rate": 6.939045901552397e-06, "loss": 0.0358, "step": 5210 }, { "epoch": 0.4054368932038835, "grad_norm": 0.13940369927779223, "learning_rate": 6.927201849116488e-06, "loss": 0.0341, "step": 5220 }, { "epoch": 0.40621359223300973, "grad_norm": 0.1170314799221389, "learning_rate": 6.915345082078351e-06, "loss": 0.0351, "step": 5230 }, { "epoch": 0.40699029126213593, "grad_norm": 0.13974395325681005, "learning_rate": 6.903475678662318e-06, "loss": 0.0335, "step": 5240 }, { "epoch": 0.4077669902912621, "grad_norm": 0.10014243880246401, "learning_rate": 6.891593717176088e-06, "loss": 0.0328, "step": 5250 }, { "epoch": 0.40854368932038837, "grad_norm": 0.14938616367290383, "learning_rate": 6.8796992760102126e-06, "loss": 0.0348, "step": 5260 }, { "epoch": 0.40932038834951456, "grad_norm": 0.1242270133896958, "learning_rate": 6.867792433637576e-06, "loss": 0.0334, "step": 5270 }, { "epoch": 0.41009708737864076, "grad_norm": 0.11837162581782518, "learning_rate": 6.855873268612876e-06, "loss": 0.0336, "step": 5280 }, { "epoch": 0.410873786407767, "grad_norm": 0.13150480263934636, "learning_rate": 6.843941859572117e-06, "loss": 0.0325, "step": 5290 }, { "epoch": 0.4116504854368932, "grad_norm": 0.13509419569234965, "learning_rate": 6.831998285232071e-06, "loss": 0.0359, "step": 5300 }, { "epoch": 0.4124271844660194, "grad_norm": 0.11673174701761573, "learning_rate": 6.8200426243897795e-06, "loss": 0.0395, "step": 5310 }, { "epoch": 0.41320388349514564, "grad_norm": 0.1165906353254538, "learning_rate": 6.808074955922019e-06, "loss": 0.035, "step": 5320 }, { "epoch": 0.41398058252427183, "grad_norm": 0.0996672386026295, "learning_rate": 6.796095358784789e-06, "loss": 0.0362, "step": 5330 }, { "epoch": 0.4147572815533981, "grad_norm": 0.10995961566335488, "learning_rate": 6.784103912012781e-06, "loss": 0.0336, "step": 5340 }, { "epoch": 0.4155339805825243, "grad_norm": 0.16087163877259772, "learning_rate": 6.772100694718873e-06, "loss": 0.0337, "step": 5350 }, { "epoch": 0.41631067961165047, "grad_norm": 0.11077953848791751, "learning_rate": 6.760085786093591e-06, "loss": 0.0337, "step": 5360 }, { "epoch": 0.4170873786407767, "grad_norm": 0.10420252843726117, "learning_rate": 6.748059265404598e-06, "loss": 0.0333, "step": 5370 }, { "epoch": 0.4178640776699029, "grad_norm": 0.0987777986530611, "learning_rate": 6.736021211996164e-06, "loss": 0.0323, "step": 5380 }, { "epoch": 0.4186407766990291, "grad_norm": 0.12979235880142231, "learning_rate": 6.723971705288647e-06, "loss": 0.0334, "step": 5390 }, { "epoch": 0.41941747572815535, "grad_norm": 0.10448085643024156, "learning_rate": 6.71191082477797e-06, "loss": 0.0329, "step": 5400 }, { "epoch": 0.42019417475728155, "grad_norm": 0.10186519696044918, "learning_rate": 6.699838650035088e-06, "loss": 0.032, "step": 5410 }, { "epoch": 0.42097087378640774, "grad_norm": 0.14701263509094134, "learning_rate": 6.6877552607054764e-06, "loss": 0.0334, "step": 5420 }, { "epoch": 0.421747572815534, "grad_norm": 0.15165917627530096, "learning_rate": 6.675660736508591e-06, "loss": 0.0341, "step": 5430 }, { "epoch": 0.4225242718446602, "grad_norm": 0.10683335287074466, "learning_rate": 6.663555157237355e-06, "loss": 0.0335, "step": 5440 }, { "epoch": 0.42330097087378643, "grad_norm": 0.12285935229314345, "learning_rate": 6.65143860275762e-06, "loss": 0.0346, "step": 5450 }, { "epoch": 0.4240776699029126, "grad_norm": 0.09647691961730581, "learning_rate": 6.639311153007655e-06, "loss": 0.0338, "step": 5460 }, { "epoch": 0.4248543689320388, "grad_norm": 0.08230836247172503, "learning_rate": 6.627172887997602e-06, "loss": 0.0317, "step": 5470 }, { "epoch": 0.42563106796116507, "grad_norm": 0.11203383218750483, "learning_rate": 6.61502388780896e-06, "loss": 0.0312, "step": 5480 }, { "epoch": 0.42640776699029126, "grad_norm": 0.09944250560022817, "learning_rate": 6.602864232594053e-06, "loss": 0.0345, "step": 5490 }, { "epoch": 0.42718446601941745, "grad_norm": 0.18512929114250126, "learning_rate": 6.5906940025755e-06, "loss": 0.0312, "step": 5500 }, { "epoch": 0.4279611650485437, "grad_norm": 0.13120432725149225, "learning_rate": 6.578513278045683e-06, "loss": 0.0343, "step": 5510 }, { "epoch": 0.4287378640776699, "grad_norm": 0.0916398697335342, "learning_rate": 6.566322139366229e-06, "loss": 0.0396, "step": 5520 }, { "epoch": 0.42951456310679614, "grad_norm": 0.11778336968931845, "learning_rate": 6.554120666967464e-06, "loss": 0.033, "step": 5530 }, { "epoch": 0.43029126213592234, "grad_norm": 0.09695552322879869, "learning_rate": 6.5419089413478935e-06, "loss": 0.0329, "step": 5540 }, { "epoch": 0.43106796116504853, "grad_norm": 0.09147725308360351, "learning_rate": 6.529687043073669e-06, "loss": 0.0336, "step": 5550 }, { "epoch": 0.4318446601941748, "grad_norm": 0.12728215592428643, "learning_rate": 6.517455052778053e-06, "loss": 0.0365, "step": 5560 }, { "epoch": 0.43262135922330097, "grad_norm": 0.08436120168716021, "learning_rate": 6.505213051160892e-06, "loss": 0.0315, "step": 5570 }, { "epoch": 0.43339805825242717, "grad_norm": 0.14559568695389727, "learning_rate": 6.49296111898808e-06, "loss": 0.0359, "step": 5580 }, { "epoch": 0.4341747572815534, "grad_norm": 0.09344598759998686, "learning_rate": 6.4806993370910265e-06, "loss": 0.0506, "step": 5590 }, { "epoch": 0.4349514563106796, "grad_norm": 0.13855735948875303, "learning_rate": 6.468427786366128e-06, "loss": 0.0328, "step": 5600 }, { "epoch": 0.4357281553398058, "grad_norm": 0.16612031869017035, "learning_rate": 6.456146547774225e-06, "loss": 0.0328, "step": 5610 }, { "epoch": 0.43650485436893205, "grad_norm": 0.16932451850715438, "learning_rate": 6.443855702340075e-06, "loss": 0.0403, "step": 5620 }, { "epoch": 0.43728155339805824, "grad_norm": 0.09559316896556012, "learning_rate": 6.431555331151819e-06, "loss": 0.033, "step": 5630 }, { "epoch": 0.4380582524271845, "grad_norm": 0.09112270318821118, "learning_rate": 6.419245515360441e-06, "loss": 0.0341, "step": 5640 }, { "epoch": 0.4388349514563107, "grad_norm": 0.0967669929863555, "learning_rate": 6.406926336179231e-06, "loss": 0.0338, "step": 5650 }, { "epoch": 0.4396116504854369, "grad_norm": 0.08111974047418405, "learning_rate": 6.394597874883265e-06, "loss": 0.0312, "step": 5660 }, { "epoch": 0.4403883495145631, "grad_norm": 0.15987801309060234, "learning_rate": 6.382260212808844e-06, "loss": 0.0392, "step": 5670 }, { "epoch": 0.4411650485436893, "grad_norm": 0.09023165197185735, "learning_rate": 6.3699134313529806e-06, "loss": 0.0345, "step": 5680 }, { "epoch": 0.4419417475728155, "grad_norm": 0.10708774457361471, "learning_rate": 6.3575576119728466e-06, "loss": 0.0336, "step": 5690 }, { "epoch": 0.44271844660194176, "grad_norm": 0.07866710592470307, "learning_rate": 6.345192836185242e-06, "loss": 0.0323, "step": 5700 }, { "epoch": 0.44349514563106796, "grad_norm": 0.0849752630464463, "learning_rate": 6.332819185566057e-06, "loss": 0.0315, "step": 5710 }, { "epoch": 0.44427184466019415, "grad_norm": 0.12262336839274321, "learning_rate": 6.320436741749733e-06, "loss": 0.0333, "step": 5720 }, { "epoch": 0.4450485436893204, "grad_norm": 0.105701561586313, "learning_rate": 6.308045586428725e-06, "loss": 0.0347, "step": 5730 }, { "epoch": 0.4458252427184466, "grad_norm": 0.08754118657255959, "learning_rate": 6.295645801352958e-06, "loss": 0.0328, "step": 5740 }, { "epoch": 0.44660194174757284, "grad_norm": 0.09147016252337913, "learning_rate": 6.283237468329295e-06, "loss": 0.0373, "step": 5750 }, { "epoch": 0.44737864077669903, "grad_norm": 0.11237578640836912, "learning_rate": 6.2708206692209905e-06, "loss": 0.0335, "step": 5760 }, { "epoch": 0.4481553398058252, "grad_norm": 0.32175351855265244, "learning_rate": 6.258395485947157e-06, "loss": 0.0344, "step": 5770 }, { "epoch": 0.4489320388349515, "grad_norm": 0.08316864266322528, "learning_rate": 6.245962000482219e-06, "loss": 0.0327, "step": 5780 }, { "epoch": 0.44970873786407767, "grad_norm": 0.1641909968950528, "learning_rate": 6.233520294855373e-06, "loss": 0.0303, "step": 5790 }, { "epoch": 0.45048543689320386, "grad_norm": 0.15884668153942239, "learning_rate": 6.221070451150051e-06, "loss": 0.0331, "step": 5800 }, { "epoch": 0.4512621359223301, "grad_norm": 0.10293415049531084, "learning_rate": 6.2086125515033735e-06, "loss": 0.031, "step": 5810 }, { "epoch": 0.4520388349514563, "grad_norm": 0.15146295085367623, "learning_rate": 6.1961466781056055e-06, "loss": 0.0322, "step": 5820 }, { "epoch": 0.45281553398058255, "grad_norm": 0.13931737126334184, "learning_rate": 6.183672913199629e-06, "loss": 0.0335, "step": 5830 }, { "epoch": 0.45359223300970875, "grad_norm": 0.092190992939084, "learning_rate": 6.171191339080378e-06, "loss": 0.0337, "step": 5840 }, { "epoch": 0.45436893203883494, "grad_norm": 0.085610611260247, "learning_rate": 6.158702038094314e-06, "loss": 0.0312, "step": 5850 }, { "epoch": 0.4551456310679612, "grad_norm": 0.14924937642644345, "learning_rate": 6.146205092638876e-06, "loss": 0.0325, "step": 5860 }, { "epoch": 0.4559223300970874, "grad_norm": 0.14345355684383154, "learning_rate": 6.133700585161935e-06, "loss": 0.0321, "step": 5870 }, { "epoch": 0.4566990291262136, "grad_norm": 0.09078213339426157, "learning_rate": 6.121188598161251e-06, "loss": 0.0324, "step": 5880 }, { "epoch": 0.4574757281553398, "grad_norm": 0.09194794293953938, "learning_rate": 6.108669214183933e-06, "loss": 0.0311, "step": 5890 }, { "epoch": 0.458252427184466, "grad_norm": 0.13651432794557464, "learning_rate": 6.096142515825888e-06, "loss": 0.0321, "step": 5900 }, { "epoch": 0.4590291262135922, "grad_norm": 0.11620716319166027, "learning_rate": 6.083608585731283e-06, "loss": 0.0308, "step": 5910 }, { "epoch": 0.45980582524271846, "grad_norm": 0.12297839003951416, "learning_rate": 6.071067506591991e-06, "loss": 0.0414, "step": 5920 }, { "epoch": 0.46058252427184465, "grad_norm": 0.10782400464378368, "learning_rate": 6.058519361147055e-06, "loss": 0.0337, "step": 5930 }, { "epoch": 0.4613592233009709, "grad_norm": 0.144791365050361, "learning_rate": 6.045964232182133e-06, "loss": 0.0347, "step": 5940 }, { "epoch": 0.4621359223300971, "grad_norm": 0.15203248769469757, "learning_rate": 6.033402202528962e-06, "loss": 0.0351, "step": 5950 }, { "epoch": 0.4629126213592233, "grad_norm": 0.11162846688337043, "learning_rate": 6.020833355064799e-06, "loss": 0.031, "step": 5960 }, { "epoch": 0.46368932038834954, "grad_norm": 0.12706166919554365, "learning_rate": 6.008257772711888e-06, "loss": 0.0329, "step": 5970 }, { "epoch": 0.46446601941747573, "grad_norm": 0.08935538074442269, "learning_rate": 5.995675538436905e-06, "loss": 0.0317, "step": 5980 }, { "epoch": 0.4652427184466019, "grad_norm": 0.09490758366976129, "learning_rate": 5.983086735250402e-06, "loss": 0.0303, "step": 5990 }, { "epoch": 0.46601941747572817, "grad_norm": 0.11374713366999266, "learning_rate": 5.970491446206283e-06, "loss": 0.0319, "step": 6000 }, { "epoch": 0.46601941747572817, "eval_loss": 0.03369936719536781, "eval_runtime": 1432.7372, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.28, "step": 6000 }, { "epoch": 0.46679611650485436, "grad_norm": 0.15236285527040203, "learning_rate": 5.9578897544012345e-06, "loss": 0.0342, "step": 6010 }, { "epoch": 0.46757281553398056, "grad_norm": 0.11208407796843485, "learning_rate": 5.945281742974185e-06, "loss": 0.0344, "step": 6020 }, { "epoch": 0.4683495145631068, "grad_norm": 0.1721040197843031, "learning_rate": 5.9326674951057575e-06, "loss": 0.0306, "step": 6030 }, { "epoch": 0.469126213592233, "grad_norm": 0.10369274763217506, "learning_rate": 5.92004709401772e-06, "loss": 0.0343, "step": 6040 }, { "epoch": 0.46990291262135925, "grad_norm": 0.0976893087517415, "learning_rate": 5.907420622972434e-06, "loss": 0.0347, "step": 6050 }, { "epoch": 0.47067961165048544, "grad_norm": 0.08278954961096646, "learning_rate": 5.894788165272312e-06, "loss": 0.0334, "step": 6060 }, { "epoch": 0.47145631067961163, "grad_norm": 0.11744486073692043, "learning_rate": 5.882149804259254e-06, "loss": 0.0318, "step": 6070 }, { "epoch": 0.4722330097087379, "grad_norm": 0.17068434799134144, "learning_rate": 5.869505623314119e-06, "loss": 0.0325, "step": 6080 }, { "epoch": 0.4730097087378641, "grad_norm": 0.21091533855599123, "learning_rate": 5.8568557058561525e-06, "loss": 0.034, "step": 6090 }, { "epoch": 0.47378640776699027, "grad_norm": 0.10087997906773126, "learning_rate": 5.8442001353424495e-06, "loss": 0.0334, "step": 6100 }, { "epoch": 0.4745631067961165, "grad_norm": 0.11064572320995678, "learning_rate": 5.831538995267402e-06, "loss": 0.033, "step": 6110 }, { "epoch": 0.4753398058252427, "grad_norm": 0.27142650561514914, "learning_rate": 5.818872369162146e-06, "loss": 0.0342, "step": 6120 }, { "epoch": 0.4761165048543689, "grad_norm": 0.10080230400892518, "learning_rate": 5.806200340594013e-06, "loss": 0.032, "step": 6130 }, { "epoch": 0.47689320388349515, "grad_norm": 0.09226829775827713, "learning_rate": 5.793522993165974e-06, "loss": 0.0333, "step": 6140 }, { "epoch": 0.47766990291262135, "grad_norm": 0.18865746109033485, "learning_rate": 5.78084041051609e-06, "loss": 0.0347, "step": 6150 }, { "epoch": 0.4784466019417476, "grad_norm": 0.10578492067172693, "learning_rate": 5.768152676316966e-06, "loss": 0.0334, "step": 6160 }, { "epoch": 0.4792233009708738, "grad_norm": 0.08679688388567215, "learning_rate": 5.75545987427519e-06, "loss": 0.0317, "step": 6170 }, { "epoch": 0.48, "grad_norm": 0.10116616061038274, "learning_rate": 5.742762088130785e-06, "loss": 0.0383, "step": 6180 }, { "epoch": 0.48077669902912623, "grad_norm": 0.3297939479400083, "learning_rate": 5.730059401656661e-06, "loss": 0.0346, "step": 6190 }, { "epoch": 0.4815533980582524, "grad_norm": 0.08176311547606052, "learning_rate": 5.71735189865805e-06, "loss": 0.0324, "step": 6200 }, { "epoch": 0.4823300970873786, "grad_norm": 0.08071930267697436, "learning_rate": 5.704639662971965e-06, "loss": 0.0341, "step": 6210 }, { "epoch": 0.48310679611650487, "grad_norm": 0.12179351888414967, "learning_rate": 5.691922778466646e-06, "loss": 0.0328, "step": 6220 }, { "epoch": 0.48388349514563106, "grad_norm": 0.1125298117198277, "learning_rate": 5.679201329040995e-06, "loss": 0.0315, "step": 6230 }, { "epoch": 0.4846601941747573, "grad_norm": 0.09279098075812245, "learning_rate": 5.6664753986240394e-06, "loss": 0.0322, "step": 6240 }, { "epoch": 0.4854368932038835, "grad_norm": 0.07120729102857828, "learning_rate": 5.653745071174364e-06, "loss": 0.031, "step": 6250 }, { "epoch": 0.4862135922330097, "grad_norm": 0.3291125238431995, "learning_rate": 5.641010430679563e-06, "loss": 0.0344, "step": 6260 }, { "epoch": 0.48699029126213594, "grad_norm": 0.08281527634439739, "learning_rate": 5.62827156115569e-06, "loss": 0.0306, "step": 6270 }, { "epoch": 0.48776699029126214, "grad_norm": 0.10666966489225792, "learning_rate": 5.615528546646694e-06, "loss": 0.0313, "step": 6280 }, { "epoch": 0.48854368932038833, "grad_norm": 0.09594788873826923, "learning_rate": 5.602781471223876e-06, "loss": 0.0318, "step": 6290 }, { "epoch": 0.4893203883495146, "grad_norm": 0.18348663153693262, "learning_rate": 5.590030418985323e-06, "loss": 0.033, "step": 6300 }, { "epoch": 0.49009708737864077, "grad_norm": 0.14907921810469854, "learning_rate": 5.577275474055363e-06, "loss": 0.0325, "step": 6310 }, { "epoch": 0.49087378640776697, "grad_norm": 0.1716644562687576, "learning_rate": 5.564516720584001e-06, "loss": 0.0329, "step": 6320 }, { "epoch": 0.4916504854368932, "grad_norm": 0.15715855144391658, "learning_rate": 5.551754242746375e-06, "loss": 0.0352, "step": 6330 }, { "epoch": 0.4924271844660194, "grad_norm": 0.09242267296232623, "learning_rate": 5.538988124742188e-06, "loss": 0.0334, "step": 6340 }, { "epoch": 0.49320388349514566, "grad_norm": 0.15043664434093285, "learning_rate": 5.526218450795164e-06, "loss": 0.0307, "step": 6350 }, { "epoch": 0.49398058252427185, "grad_norm": 0.08308270653166681, "learning_rate": 5.513445305152486e-06, "loss": 0.0331, "step": 6360 }, { "epoch": 0.49475728155339804, "grad_norm": 0.1418458162678893, "learning_rate": 5.500668772084239e-06, "loss": 0.0317, "step": 6370 }, { "epoch": 0.4955339805825243, "grad_norm": 0.09959937918897721, "learning_rate": 5.487888935882855e-06, "loss": 0.0313, "step": 6380 }, { "epoch": 0.4963106796116505, "grad_norm": 0.19315880796165136, "learning_rate": 5.4751058808625655e-06, "loss": 0.0328, "step": 6390 }, { "epoch": 0.4970873786407767, "grad_norm": 0.10163503201064937, "learning_rate": 5.462319691358831e-06, "loss": 0.0309, "step": 6400 }, { "epoch": 0.4978640776699029, "grad_norm": 0.12646545290710698, "learning_rate": 5.449530451727792e-06, "loss": 0.0322, "step": 6410 }, { "epoch": 0.4986407766990291, "grad_norm": 0.08909974364691342, "learning_rate": 5.4367382463457165e-06, "loss": 0.0351, "step": 6420 }, { "epoch": 0.4994174757281553, "grad_norm": 0.17850107365474066, "learning_rate": 5.423943159608436e-06, "loss": 0.0328, "step": 6430 }, { "epoch": 0.5001941747572816, "grad_norm": 0.11300138462109507, "learning_rate": 5.411145275930791e-06, "loss": 0.0337, "step": 6440 }, { "epoch": 0.5009708737864078, "grad_norm": 0.1623674289110957, "learning_rate": 5.398344679746077e-06, "loss": 0.0333, "step": 6450 }, { "epoch": 0.501747572815534, "grad_norm": 0.11015667653237636, "learning_rate": 5.385541455505481e-06, "loss": 0.0316, "step": 6460 }, { "epoch": 0.5025242718446602, "grad_norm": 0.08166982795292617, "learning_rate": 5.372735687677533e-06, "loss": 0.0321, "step": 6470 }, { "epoch": 0.5033009708737864, "grad_norm": 0.08662394803991076, "learning_rate": 5.3599274607475415e-06, "loss": 0.0317, "step": 6480 }, { "epoch": 0.5040776699029126, "grad_norm": 0.08027087237780195, "learning_rate": 5.34711685921704e-06, "loss": 0.0304, "step": 6490 }, { "epoch": 0.5048543689320388, "grad_norm": 0.08548373533834593, "learning_rate": 5.334303967603227e-06, "loss": 0.0321, "step": 6500 }, { "epoch": 0.5056310679611651, "grad_norm": 0.0780149242416534, "learning_rate": 5.321488870438412e-06, "loss": 0.0313, "step": 6510 }, { "epoch": 0.5064077669902912, "grad_norm": 0.21674172842292297, "learning_rate": 5.308671652269451e-06, "loss": 0.0325, "step": 6520 }, { "epoch": 0.5071844660194175, "grad_norm": 0.07455376506865757, "learning_rate": 5.2958523976572006e-06, "loss": 0.0316, "step": 6530 }, { "epoch": 0.5079611650485437, "grad_norm": 0.1079559774918328, "learning_rate": 5.283031191175944e-06, "loss": 0.0306, "step": 6540 }, { "epoch": 0.5087378640776699, "grad_norm": 0.11590869164318615, "learning_rate": 5.270208117412849e-06, "loss": 0.0339, "step": 6550 }, { "epoch": 0.5095145631067961, "grad_norm": 0.08794482833494133, "learning_rate": 5.257383260967399e-06, "loss": 0.0363, "step": 6560 }, { "epoch": 0.5102912621359224, "grad_norm": 0.17575740280836727, "learning_rate": 5.244556706450838e-06, "loss": 0.0314, "step": 6570 }, { "epoch": 0.5110679611650485, "grad_norm": 0.11074019218788225, "learning_rate": 5.231728538485615e-06, "loss": 0.0319, "step": 6580 }, { "epoch": 0.5118446601941747, "grad_norm": 0.12619366643621954, "learning_rate": 5.218898841704823e-06, "loss": 0.0321, "step": 6590 }, { "epoch": 0.512621359223301, "grad_norm": 0.19730691929146507, "learning_rate": 5.206067700751643e-06, "loss": 0.0306, "step": 6600 }, { "epoch": 0.5133980582524272, "grad_norm": 0.0939695265340132, "learning_rate": 5.1932352002787775e-06, "loss": 0.0309, "step": 6610 }, { "epoch": 0.5141747572815534, "grad_norm": 0.1569880693689268, "learning_rate": 5.1804014249479074e-06, "loss": 0.0331, "step": 6620 }, { "epoch": 0.5149514563106796, "grad_norm": 0.09871364572260666, "learning_rate": 5.167566459429116e-06, "loss": 0.0308, "step": 6630 }, { "epoch": 0.5157281553398059, "grad_norm": 0.11955146750051933, "learning_rate": 5.154730388400344e-06, "loss": 0.0371, "step": 6640 }, { "epoch": 0.516504854368932, "grad_norm": 0.0800089626991889, "learning_rate": 5.141893296546826e-06, "loss": 0.0315, "step": 6650 }, { "epoch": 0.5172815533980583, "grad_norm": 0.1505130721203152, "learning_rate": 5.129055268560526e-06, "loss": 0.0331, "step": 6660 }, { "epoch": 0.5180582524271845, "grad_norm": 0.07008446021241277, "learning_rate": 5.116216389139592e-06, "loss": 0.0306, "step": 6670 }, { "epoch": 0.5188349514563106, "grad_norm": 0.1112591394264733, "learning_rate": 5.1033767429877825e-06, "loss": 0.0307, "step": 6680 }, { "epoch": 0.5196116504854369, "grad_norm": 0.09274452491274066, "learning_rate": 5.090536414813916e-06, "loss": 0.0343, "step": 6690 }, { "epoch": 0.5203883495145631, "grad_norm": 0.09260836432470411, "learning_rate": 5.077695489331315e-06, "loss": 0.0323, "step": 6700 }, { "epoch": 0.5211650485436893, "grad_norm": 0.10197498113439939, "learning_rate": 5.064854051257235e-06, "loss": 0.0343, "step": 6710 }, { "epoch": 0.5219417475728155, "grad_norm": 0.13066175072302594, "learning_rate": 5.052012185312322e-06, "loss": 0.0301, "step": 6720 }, { "epoch": 0.5227184466019418, "grad_norm": 0.10221245605425895, "learning_rate": 5.0391699762200375e-06, "loss": 0.0367, "step": 6730 }, { "epoch": 0.5234951456310679, "grad_norm": 0.07182319498294976, "learning_rate": 5.02632750870611e-06, "loss": 0.0321, "step": 6740 }, { "epoch": 0.5242718446601942, "grad_norm": 0.11527764509737048, "learning_rate": 5.013484867497974e-06, "loss": 0.0402, "step": 6750 }, { "epoch": 0.5250485436893204, "grad_norm": 0.1298196307763526, "learning_rate": 5.0006421373242085e-06, "loss": 0.0339, "step": 6760 }, { "epoch": 0.5258252427184466, "grad_norm": 0.19266703158170723, "learning_rate": 4.987799402913979e-06, "loss": 0.0311, "step": 6770 }, { "epoch": 0.5266019417475728, "grad_norm": 0.1676448059901746, "learning_rate": 4.974956748996479e-06, "loss": 0.0315, "step": 6780 }, { "epoch": 0.527378640776699, "grad_norm": 0.12039718340075856, "learning_rate": 4.962114260300375e-06, "loss": 0.0416, "step": 6790 }, { "epoch": 0.5281553398058253, "grad_norm": 0.13615958680423465, "learning_rate": 4.949272021553236e-06, "loss": 0.0292, "step": 6800 }, { "epoch": 0.5289320388349514, "grad_norm": 0.08030106550395978, "learning_rate": 4.936430117480987e-06, "loss": 0.0319, "step": 6810 }, { "epoch": 0.5297087378640777, "grad_norm": 0.11717038315680303, "learning_rate": 4.923588632807344e-06, "loss": 0.0312, "step": 6820 }, { "epoch": 0.5304854368932039, "grad_norm": 0.0729256859301243, "learning_rate": 4.910747652253258e-06, "loss": 0.0316, "step": 6830 }, { "epoch": 0.5312621359223301, "grad_norm": 0.09914927642004312, "learning_rate": 4.89790726053635e-06, "loss": 0.0315, "step": 6840 }, { "epoch": 0.5320388349514563, "grad_norm": 0.13845225023638497, "learning_rate": 4.885067542370358e-06, "loss": 0.0319, "step": 6850 }, { "epoch": 0.5328155339805826, "grad_norm": 0.10873896256337931, "learning_rate": 4.872228582464578e-06, "loss": 0.0337, "step": 6860 }, { "epoch": 0.5335922330097087, "grad_norm": 0.06860578652243003, "learning_rate": 4.859390465523304e-06, "loss": 0.0329, "step": 6870 }, { "epoch": 0.534368932038835, "grad_norm": 0.11417514166149667, "learning_rate": 4.846553276245262e-06, "loss": 0.0315, "step": 6880 }, { "epoch": 0.5351456310679612, "grad_norm": 0.20325437006513952, "learning_rate": 4.833717099323063e-06, "loss": 0.0311, "step": 6890 }, { "epoch": 0.5359223300970873, "grad_norm": 0.09954824986699067, "learning_rate": 4.820882019442643e-06, "loss": 0.0329, "step": 6900 }, { "epoch": 0.5366990291262136, "grad_norm": 0.10353238017063876, "learning_rate": 4.808048121282692e-06, "loss": 0.03, "step": 6910 }, { "epoch": 0.5374757281553398, "grad_norm": 0.07558471833125073, "learning_rate": 4.795215489514109e-06, "loss": 0.031, "step": 6920 }, { "epoch": 0.538252427184466, "grad_norm": 0.11553026153675787, "learning_rate": 4.7823842087994336e-06, "loss": 0.0317, "step": 6930 }, { "epoch": 0.5390291262135922, "grad_norm": 0.11118458186876524, "learning_rate": 4.769554363792298e-06, "loss": 0.0339, "step": 6940 }, { "epoch": 0.5398058252427185, "grad_norm": 0.10912837255703754, "learning_rate": 4.75672603913686e-06, "loss": 0.0306, "step": 6950 }, { "epoch": 0.5405825242718446, "grad_norm": 0.11018903960456042, "learning_rate": 4.743899319467244e-06, "loss": 0.0307, "step": 6960 }, { "epoch": 0.5413592233009709, "grad_norm": 0.09044399775794357, "learning_rate": 4.731074289406986e-06, "loss": 0.0322, "step": 6970 }, { "epoch": 0.5421359223300971, "grad_norm": 0.07013312617600766, "learning_rate": 4.71825103356848e-06, "loss": 0.0306, "step": 6980 }, { "epoch": 0.5429126213592232, "grad_norm": 0.18095777904176227, "learning_rate": 4.705429636552411e-06, "loss": 0.0317, "step": 6990 }, { "epoch": 0.5436893203883495, "grad_norm": 0.08281115011404426, "learning_rate": 4.692610182947199e-06, "loss": 0.0313, "step": 7000 }, { "epoch": 0.5436893203883495, "eval_loss": 0.032582711428403854, "eval_runtime": 1433.4244, "eval_samples_per_second": 3.916, "eval_steps_per_second": 0.28, "step": 7000 }, { "epoch": 0.5444660194174757, "grad_norm": 0.1562171607820171, "learning_rate": 4.679792757328445e-06, "loss": 0.0295, "step": 7010 }, { "epoch": 0.545242718446602, "grad_norm": 0.1294890397465028, "learning_rate": 4.6669774442583724e-06, "loss": 0.0312, "step": 7020 }, { "epoch": 0.5460194174757281, "grad_norm": 0.07211035994235415, "learning_rate": 4.654164328285261e-06, "loss": 0.0305, "step": 7030 }, { "epoch": 0.5467961165048544, "grad_norm": 0.17439539368346013, "learning_rate": 4.641353493942902e-06, "loss": 0.0323, "step": 7040 }, { "epoch": 0.5475728155339806, "grad_norm": 0.09540977778055347, "learning_rate": 4.6285450257500265e-06, "loss": 0.0331, "step": 7050 }, { "epoch": 0.5483495145631068, "grad_norm": 0.20440351969812895, "learning_rate": 4.615739008209762e-06, "loss": 0.0327, "step": 7060 }, { "epoch": 0.549126213592233, "grad_norm": 0.13407423871189023, "learning_rate": 4.602935525809068e-06, "loss": 0.0333, "step": 7070 }, { "epoch": 0.5499029126213593, "grad_norm": 0.10272879483124647, "learning_rate": 4.59013466301817e-06, "loss": 0.0314, "step": 7080 }, { "epoch": 0.5506796116504854, "grad_norm": 0.13656667800563416, "learning_rate": 4.5773365042900195e-06, "loss": 0.0329, "step": 7090 }, { "epoch": 0.5514563106796116, "grad_norm": 0.09685652266822328, "learning_rate": 4.5645411340597264e-06, "loss": 0.0323, "step": 7100 }, { "epoch": 0.5522330097087379, "grad_norm": 0.08272763310005903, "learning_rate": 4.551748636744e-06, "loss": 0.0332, "step": 7110 }, { "epoch": 0.553009708737864, "grad_norm": 0.11555877738923265, "learning_rate": 4.5389590967406e-06, "loss": 0.0318, "step": 7120 }, { "epoch": 0.5537864077669903, "grad_norm": 0.09891801587754706, "learning_rate": 4.52617259842777e-06, "loss": 0.0334, "step": 7130 }, { "epoch": 0.5545631067961165, "grad_norm": 0.16238965864871066, "learning_rate": 4.513389226163694e-06, "loss": 0.0337, "step": 7140 }, { "epoch": 0.5553398058252427, "grad_norm": 0.12084543198382287, "learning_rate": 4.5006090642859266e-06, "loss": 0.0311, "step": 7150 }, { "epoch": 0.5561165048543689, "grad_norm": 0.09514754393879454, "learning_rate": 4.4878321971108405e-06, "loss": 0.0316, "step": 7160 }, { "epoch": 0.5568932038834952, "grad_norm": 0.1790760951422049, "learning_rate": 4.475058708933077e-06, "loss": 0.0321, "step": 7170 }, { "epoch": 0.5576699029126213, "grad_norm": 0.09428532492193505, "learning_rate": 4.4622886840249846e-06, "loss": 0.0296, "step": 7180 }, { "epoch": 0.5584466019417476, "grad_norm": 0.12498750036823862, "learning_rate": 4.449522206636056e-06, "loss": 0.0339, "step": 7190 }, { "epoch": 0.5592233009708738, "grad_norm": 0.10782930273955517, "learning_rate": 4.436759360992385e-06, "loss": 0.0322, "step": 7200 }, { "epoch": 0.56, "grad_norm": 0.1493644022475598, "learning_rate": 4.4240002312961075e-06, "loss": 0.0313, "step": 7210 }, { "epoch": 0.5607766990291262, "grad_norm": 0.10109157748584013, "learning_rate": 4.411244901724836e-06, "loss": 0.0321, "step": 7220 }, { "epoch": 0.5615533980582524, "grad_norm": 0.11597997117274964, "learning_rate": 4.398493456431121e-06, "loss": 0.0291, "step": 7230 }, { "epoch": 0.5623300970873787, "grad_norm": 0.09121470758313256, "learning_rate": 4.385745979541875e-06, "loss": 0.0326, "step": 7240 }, { "epoch": 0.5631067961165048, "grad_norm": 0.11539314193515492, "learning_rate": 4.373002555157843e-06, "loss": 0.0317, "step": 7250 }, { "epoch": 0.5638834951456311, "grad_norm": 0.11744503689861098, "learning_rate": 4.360263267353026e-06, "loss": 0.0305, "step": 7260 }, { "epoch": 0.5646601941747573, "grad_norm": 0.10869820485829182, "learning_rate": 4.347528200174132e-06, "loss": 0.0315, "step": 7270 }, { "epoch": 0.5654368932038835, "grad_norm": 0.11150851768445619, "learning_rate": 4.334797437640027e-06, "loss": 0.0316, "step": 7280 }, { "epoch": 0.5662135922330097, "grad_norm": 0.09537281756326985, "learning_rate": 4.3220710637411795e-06, "loss": 0.0353, "step": 7290 }, { "epoch": 0.566990291262136, "grad_norm": 0.07565325447805013, "learning_rate": 4.309349162439102e-06, "loss": 0.0444, "step": 7300 }, { "epoch": 0.5677669902912621, "grad_norm": 0.07763359666431095, "learning_rate": 4.296631817665796e-06, "loss": 0.0297, "step": 7310 }, { "epoch": 0.5685436893203883, "grad_norm": 0.12921360171171, "learning_rate": 4.2839191133232066e-06, "loss": 0.0317, "step": 7320 }, { "epoch": 0.5693203883495146, "grad_norm": 0.12152788999477347, "learning_rate": 4.2712111332826645e-06, "loss": 0.0334, "step": 7330 }, { "epoch": 0.5700970873786407, "grad_norm": 0.11929683204988667, "learning_rate": 4.258507961384326e-06, "loss": 0.034, "step": 7340 }, { "epoch": 0.570873786407767, "grad_norm": 0.14360112272110442, "learning_rate": 4.245809681436633e-06, "loss": 0.0311, "step": 7350 }, { "epoch": 0.5716504854368932, "grad_norm": 0.11212690735305433, "learning_rate": 4.233116377215744e-06, "loss": 0.033, "step": 7360 }, { "epoch": 0.5724271844660194, "grad_norm": 0.1305497545489697, "learning_rate": 4.220428132465002e-06, "loss": 0.0314, "step": 7370 }, { "epoch": 0.5732038834951456, "grad_norm": 0.08842132680005162, "learning_rate": 4.207745030894363e-06, "loss": 0.0313, "step": 7380 }, { "epoch": 0.5739805825242719, "grad_norm": 0.16913964120815525, "learning_rate": 4.195067156179852e-06, "loss": 0.0347, "step": 7390 }, { "epoch": 0.574757281553398, "grad_norm": 0.11338560832468836, "learning_rate": 4.182394591963009e-06, "loss": 0.0323, "step": 7400 }, { "epoch": 0.5755339805825243, "grad_norm": 0.11678790666937965, "learning_rate": 4.169727421850344e-06, "loss": 0.0316, "step": 7410 }, { "epoch": 0.5763106796116505, "grad_norm": 0.10737711704081114, "learning_rate": 4.1570657294127745e-06, "loss": 0.035, "step": 7420 }, { "epoch": 0.5770873786407767, "grad_norm": 0.14808927621962115, "learning_rate": 4.1444095981850775e-06, "loss": 0.0315, "step": 7430 }, { "epoch": 0.5778640776699029, "grad_norm": 0.21852375566229668, "learning_rate": 4.131759111665349e-06, "loss": 0.0315, "step": 7440 }, { "epoch": 0.5786407766990291, "grad_norm": 0.15380688900370174, "learning_rate": 4.119114353314435e-06, "loss": 0.0325, "step": 7450 }, { "epoch": 0.5794174757281554, "grad_norm": 0.09213442154964646, "learning_rate": 4.106475406555396e-06, "loss": 0.0282, "step": 7460 }, { "epoch": 0.5801941747572815, "grad_norm": 0.11534208819671539, "learning_rate": 4.0938423547729444e-06, "loss": 0.031, "step": 7470 }, { "epoch": 0.5809708737864078, "grad_norm": 0.06438883403634209, "learning_rate": 4.081215281312911e-06, "loss": 0.0289, "step": 7480 }, { "epoch": 0.581747572815534, "grad_norm": 0.07996174389766027, "learning_rate": 4.068594269481678e-06, "loss": 0.031, "step": 7490 }, { "epoch": 0.5825242718446602, "grad_norm": 0.1608611844255694, "learning_rate": 4.055979402545636e-06, "loss": 0.0339, "step": 7500 }, { "epoch": 0.5833009708737864, "grad_norm": 0.1649225565180829, "learning_rate": 4.043370763730635e-06, "loss": 0.0318, "step": 7510 }, { "epoch": 0.5840776699029127, "grad_norm": 0.17391801647995628, "learning_rate": 4.030768436221444e-06, "loss": 0.0328, "step": 7520 }, { "epoch": 0.5848543689320388, "grad_norm": 0.08835080844114253, "learning_rate": 4.018172503161179e-06, "loss": 0.0329, "step": 7530 }, { "epoch": 0.585631067961165, "grad_norm": 0.0796932990417311, "learning_rate": 4.005583047650782e-06, "loss": 0.0292, "step": 7540 }, { "epoch": 0.5864077669902913, "grad_norm": 0.12462544705836986, "learning_rate": 3.993000152748449e-06, "loss": 0.0297, "step": 7550 }, { "epoch": 0.5871844660194174, "grad_norm": 0.0809611453351118, "learning_rate": 3.980423901469102e-06, "loss": 0.0315, "step": 7560 }, { "epoch": 0.5879611650485437, "grad_norm": 0.17970150833600457, "learning_rate": 3.967854376783828e-06, "loss": 0.033, "step": 7570 }, { "epoch": 0.5887378640776699, "grad_norm": 0.08588893780262588, "learning_rate": 3.955291661619335e-06, "loss": 0.0338, "step": 7580 }, { "epoch": 0.5895145631067961, "grad_norm": 0.08142653033169137, "learning_rate": 3.942735838857403e-06, "loss": 0.0323, "step": 7590 }, { "epoch": 0.5902912621359223, "grad_norm": 0.13093449750789612, "learning_rate": 3.930186991334349e-06, "loss": 0.0311, "step": 7600 }, { "epoch": 0.5910679611650486, "grad_norm": 0.1341015751248369, "learning_rate": 3.917645201840464e-06, "loss": 0.0297, "step": 7610 }, { "epoch": 0.5918446601941748, "grad_norm": 0.08716218707945846, "learning_rate": 3.905110553119472e-06, "loss": 0.0296, "step": 7620 }, { "epoch": 0.592621359223301, "grad_norm": 0.12045975153691053, "learning_rate": 3.892583127867992e-06, "loss": 0.0299, "step": 7630 }, { "epoch": 0.5933980582524272, "grad_norm": 0.07911713025647142, "learning_rate": 3.880063008734986e-06, "loss": 0.031, "step": 7640 }, { "epoch": 0.5941747572815534, "grad_norm": 0.14012401089345614, "learning_rate": 3.86755027832121e-06, "loss": 0.0287, "step": 7650 }, { "epoch": 0.5949514563106796, "grad_norm": 0.09006047465797659, "learning_rate": 3.855045019178677e-06, "loss": 0.0337, "step": 7660 }, { "epoch": 0.5957281553398058, "grad_norm": 0.10070043924923365, "learning_rate": 3.842547313810106e-06, "loss": 0.0294, "step": 7670 }, { "epoch": 0.5965048543689321, "grad_norm": 0.11585193655846163, "learning_rate": 3.830057244668384e-06, "loss": 0.032, "step": 7680 }, { "epoch": 0.5972815533980582, "grad_norm": 0.14205459752752966, "learning_rate": 3.817574894156016e-06, "loss": 0.0302, "step": 7690 }, { "epoch": 0.5980582524271845, "grad_norm": 0.0954133004961122, "learning_rate": 3.8051003446245832e-06, "loss": 0.0295, "step": 7700 }, { "epoch": 0.5988349514563107, "grad_norm": 0.0883119082151053, "learning_rate": 3.792633678374203e-06, "loss": 0.031, "step": 7710 }, { "epoch": 0.5996116504854369, "grad_norm": 0.1154775218284604, "learning_rate": 3.7801749776529824e-06, "loss": 0.03, "step": 7720 }, { "epoch": 0.6003883495145631, "grad_norm": 0.1160082716385863, "learning_rate": 3.7677243246564722e-06, "loss": 0.0314, "step": 7730 }, { "epoch": 0.6011650485436894, "grad_norm": 0.08324606663981451, "learning_rate": 3.755281801527134e-06, "loss": 0.0309, "step": 7740 }, { "epoch": 0.6019417475728155, "grad_norm": 0.11306212579448019, "learning_rate": 3.7428474903537926e-06, "loss": 0.0304, "step": 7750 }, { "epoch": 0.6027184466019417, "grad_norm": 0.12678618925013363, "learning_rate": 3.7304214731710907e-06, "loss": 0.0318, "step": 7760 }, { "epoch": 0.603495145631068, "grad_norm": 0.08423628151726723, "learning_rate": 3.7180038319589584e-06, "loss": 0.03, "step": 7770 }, { "epoch": 0.6042718446601941, "grad_norm": 0.07919034717740288, "learning_rate": 3.7055946486420567e-06, "loss": 0.0328, "step": 7780 }, { "epoch": 0.6050485436893204, "grad_norm": 0.09670185828926932, "learning_rate": 3.6931940050892568e-06, "loss": 0.0309, "step": 7790 }, { "epoch": 0.6058252427184466, "grad_norm": 0.09773901193419676, "learning_rate": 3.6808019831130824e-06, "loss": 0.0302, "step": 7800 }, { "epoch": 0.6066019417475729, "grad_norm": 0.12087114576920742, "learning_rate": 3.6684186644691777e-06, "loss": 0.0298, "step": 7810 }, { "epoch": 0.607378640776699, "grad_norm": 0.16007749776393776, "learning_rate": 3.656044130855767e-06, "loss": 0.0321, "step": 7820 }, { "epoch": 0.6081553398058253, "grad_norm": 0.0911463801606877, "learning_rate": 3.6436784639131204e-06, "loss": 0.0296, "step": 7830 }, { "epoch": 0.6089320388349515, "grad_norm": 0.10673427848363125, "learning_rate": 3.6313217452230037e-06, "loss": 0.03, "step": 7840 }, { "epoch": 0.6097087378640776, "grad_norm": 0.11423662964779305, "learning_rate": 3.618974056308153e-06, "loss": 0.0304, "step": 7850 }, { "epoch": 0.6104854368932039, "grad_norm": 0.11688888552687565, "learning_rate": 3.6066354786317253e-06, "loss": 0.0322, "step": 7860 }, { "epoch": 0.6112621359223301, "grad_norm": 0.13407557875078321, "learning_rate": 3.594306093596773e-06, "loss": 0.03, "step": 7870 }, { "epoch": 0.6120388349514563, "grad_norm": 0.11165343678605566, "learning_rate": 3.581985982545697e-06, "loss": 0.0301, "step": 7880 }, { "epoch": 0.6128155339805825, "grad_norm": 0.09828241394715977, "learning_rate": 3.569675226759714e-06, "loss": 0.0299, "step": 7890 }, { "epoch": 0.6135922330097088, "grad_norm": 0.11927418894720826, "learning_rate": 3.5573739074583163e-06, "loss": 0.0298, "step": 7900 }, { "epoch": 0.6143689320388349, "grad_norm": 0.11812596114972819, "learning_rate": 3.54508210579875e-06, "loss": 0.0311, "step": 7910 }, { "epoch": 0.6151456310679612, "grad_norm": 0.10474374745008709, "learning_rate": 3.532799902875459e-06, "loss": 0.0376, "step": 7920 }, { "epoch": 0.6159223300970874, "grad_norm": 0.16721993915588432, "learning_rate": 3.520527379719562e-06, "loss": 0.0308, "step": 7930 }, { "epoch": 0.6166990291262135, "grad_norm": 0.08934542266834772, "learning_rate": 3.508264617298318e-06, "loss": 0.0297, "step": 7940 }, { "epoch": 0.6174757281553398, "grad_norm": 0.1501297661672405, "learning_rate": 3.496011696514593e-06, "loss": 0.0317, "step": 7950 }, { "epoch": 0.618252427184466, "grad_norm": 0.10091006283132752, "learning_rate": 3.4837686982063162e-06, "loss": 0.0311, "step": 7960 }, { "epoch": 0.6190291262135922, "grad_norm": 0.10314422085157768, "learning_rate": 3.47153570314596e-06, "loss": 0.0332, "step": 7970 }, { "epoch": 0.6198058252427184, "grad_norm": 0.07138750480883838, "learning_rate": 3.459312792039996e-06, "loss": 0.0301, "step": 7980 }, { "epoch": 0.6205825242718447, "grad_norm": 0.16584525658353458, "learning_rate": 3.447100045528372e-06, "loss": 0.0313, "step": 7990 }, { "epoch": 0.6213592233009708, "grad_norm": 0.06812764557641995, "learning_rate": 3.4348975441839738e-06, "loss": 0.0312, "step": 8000 }, { "epoch": 0.6213592233009708, "eval_loss": 0.031790632754564285, "eval_runtime": 1427.9428, "eval_samples_per_second": 3.931, "eval_steps_per_second": 0.281, "step": 8000 }, { "epoch": 0.6221359223300971, "grad_norm": 0.0879669109738688, "learning_rate": 3.4227053685120927e-06, "loss": 0.0316, "step": 8010 }, { "epoch": 0.6229126213592233, "grad_norm": 0.0720014800750367, "learning_rate": 3.4105235989499018e-06, "loss": 0.0298, "step": 8020 }, { "epoch": 0.6236893203883496, "grad_norm": 0.13889453161315435, "learning_rate": 3.3983523158659187e-06, "loss": 0.033, "step": 8030 }, { "epoch": 0.6244660194174757, "grad_norm": 0.10277620473211348, "learning_rate": 3.3861915995594737e-06, "loss": 0.03, "step": 8040 }, { "epoch": 0.625242718446602, "grad_norm": 0.15611615961369213, "learning_rate": 3.374041530260186e-06, "loss": 0.0292, "step": 8050 }, { "epoch": 0.6260194174757282, "grad_norm": 0.18129593468852587, "learning_rate": 3.361902188127435e-06, "loss": 0.0308, "step": 8060 }, { "epoch": 0.6267961165048543, "grad_norm": 0.10212129420122515, "learning_rate": 3.3497736532498228e-06, "loss": 0.0302, "step": 8070 }, { "epoch": 0.6275728155339806, "grad_norm": 0.08808626356375811, "learning_rate": 3.337656005644655e-06, "loss": 0.0304, "step": 8080 }, { "epoch": 0.6283495145631068, "grad_norm": 0.08196422440720673, "learning_rate": 3.3255493252574043e-06, "loss": 0.0363, "step": 8090 }, { "epoch": 0.629126213592233, "grad_norm": 0.08984212302443714, "learning_rate": 3.3134536919611943e-06, "loss": 0.0316, "step": 8100 }, { "epoch": 0.6299029126213592, "grad_norm": 0.1345493602466238, "learning_rate": 3.3013691855562637e-06, "loss": 0.0314, "step": 8110 }, { "epoch": 0.6306796116504855, "grad_norm": 0.09330817837603261, "learning_rate": 3.2892958857694386e-06, "loss": 0.0304, "step": 8120 }, { "epoch": 0.6314563106796116, "grad_norm": 0.11816016772297075, "learning_rate": 3.277233872253613e-06, "loss": 0.0316, "step": 8130 }, { "epoch": 0.6322330097087379, "grad_norm": 0.0872621134195598, "learning_rate": 3.2651832245872228e-06, "loss": 0.0298, "step": 8140 }, { "epoch": 0.6330097087378641, "grad_norm": 0.09508292005529252, "learning_rate": 3.253144022273714e-06, "loss": 0.0297, "step": 8150 }, { "epoch": 0.6337864077669902, "grad_norm": 0.12873824295398115, "learning_rate": 3.241116344741026e-06, "loss": 0.0307, "step": 8160 }, { "epoch": 0.6345631067961165, "grad_norm": 0.15974953620889254, "learning_rate": 3.229100271341059e-06, "loss": 0.0307, "step": 8170 }, { "epoch": 0.6353398058252427, "grad_norm": 0.08044452116385228, "learning_rate": 3.2170958813491625e-06, "loss": 0.0308, "step": 8180 }, { "epoch": 0.6361165048543689, "grad_norm": 0.23374160504578723, "learning_rate": 3.205103253963602e-06, "loss": 0.0325, "step": 8190 }, { "epoch": 0.6368932038834951, "grad_norm": 0.10142687746935287, "learning_rate": 3.1931224683050366e-06, "loss": 0.0295, "step": 8200 }, { "epoch": 0.6376699029126214, "grad_norm": 0.0724914533475222, "learning_rate": 3.1811536034160063e-06, "loss": 0.0302, "step": 8210 }, { "epoch": 0.6384466019417476, "grad_norm": 0.0853832018927383, "learning_rate": 3.1691967382604024e-06, "loss": 0.0386, "step": 8220 }, { "epoch": 0.6392233009708738, "grad_norm": 0.10022289285487375, "learning_rate": 3.157251951722947e-06, "loss": 0.0295, "step": 8230 }, { "epoch": 0.64, "grad_norm": 0.15332409995681204, "learning_rate": 3.1453193226086743e-06, "loss": 0.0298, "step": 8240 }, { "epoch": 0.6407766990291263, "grad_norm": 0.21898776981765594, "learning_rate": 3.1333989296424116e-06, "loss": 0.0295, "step": 8250 }, { "epoch": 0.6415533980582524, "grad_norm": 0.12465572203656224, "learning_rate": 3.121490851468262e-06, "loss": 0.0309, "step": 8260 }, { "epoch": 0.6423300970873786, "grad_norm": 0.11032698445012323, "learning_rate": 3.1095951666490757e-06, "loss": 0.0329, "step": 8270 }, { "epoch": 0.6431067961165049, "grad_norm": 0.08385008567004863, "learning_rate": 3.097711953665943e-06, "loss": 0.0299, "step": 8280 }, { "epoch": 0.643883495145631, "grad_norm": 0.09631556127515749, "learning_rate": 3.0858412909176736e-06, "loss": 0.0322, "step": 8290 }, { "epoch": 0.6446601941747573, "grad_norm": 0.29654874976508916, "learning_rate": 3.0739832567202727e-06, "loss": 0.0301, "step": 8300 }, { "epoch": 0.6454368932038835, "grad_norm": 0.1012354694292051, "learning_rate": 3.0621379293064334e-06, "loss": 0.0311, "step": 8310 }, { "epoch": 0.6462135922330097, "grad_norm": 0.08509541254785014, "learning_rate": 3.050305386825013e-06, "loss": 0.0295, "step": 8320 }, { "epoch": 0.6469902912621359, "grad_norm": 0.11025836690890176, "learning_rate": 3.0384857073405248e-06, "loss": 0.0304, "step": 8330 }, { "epoch": 0.6477669902912622, "grad_norm": 0.09082816877458712, "learning_rate": 3.0266789688326187e-06, "loss": 0.0306, "step": 8340 }, { "epoch": 0.6485436893203883, "grad_norm": 0.10057451725650152, "learning_rate": 3.0148852491955633e-06, "loss": 0.0287, "step": 8350 }, { "epoch": 0.6493203883495146, "grad_norm": 0.10807613495515916, "learning_rate": 3.003104626237738e-06, "loss": 0.03, "step": 8360 }, { "epoch": 0.6500970873786408, "grad_norm": 0.07081570709273605, "learning_rate": 2.9913371776811235e-06, "loss": 0.0315, "step": 8370 }, { "epoch": 0.6508737864077669, "grad_norm": 0.06906391891520963, "learning_rate": 2.979582981160773e-06, "loss": 0.0296, "step": 8380 }, { "epoch": 0.6516504854368932, "grad_norm": 0.10458418701764713, "learning_rate": 2.9678421142243175e-06, "loss": 0.0286, "step": 8390 }, { "epoch": 0.6524271844660194, "grad_norm": 0.08044442420109658, "learning_rate": 2.9561146543314423e-06, "loss": 0.028, "step": 8400 }, { "epoch": 0.6532038834951456, "grad_norm": 0.14067299742210426, "learning_rate": 2.9444006788533843e-06, "loss": 0.0306, "step": 8410 }, { "epoch": 0.6539805825242718, "grad_norm": 0.10944192873678996, "learning_rate": 2.932700265072417e-06, "loss": 0.0324, "step": 8420 }, { "epoch": 0.6547572815533981, "grad_norm": 0.15220626757904965, "learning_rate": 2.9210134901813384e-06, "loss": 0.0301, "step": 8430 }, { "epoch": 0.6555339805825243, "grad_norm": 0.12927891621174092, "learning_rate": 2.9093404312829684e-06, "loss": 0.0316, "step": 8440 }, { "epoch": 0.6563106796116505, "grad_norm": 0.07553410865029397, "learning_rate": 2.897681165389633e-06, "loss": 0.0302, "step": 8450 }, { "epoch": 0.6570873786407767, "grad_norm": 0.09236833881585226, "learning_rate": 2.8860357694226644e-06, "loss": 0.0288, "step": 8460 }, { "epoch": 0.657864077669903, "grad_norm": 0.0972569906788794, "learning_rate": 2.8744043202118844e-06, "loss": 0.0332, "step": 8470 }, { "epoch": 0.6586407766990291, "grad_norm": 0.2630128279230889, "learning_rate": 2.8627868944951014e-06, "loss": 0.0292, "step": 8480 }, { "epoch": 0.6594174757281553, "grad_norm": 0.10602801238342699, "learning_rate": 2.8511835689176103e-06, "loss": 0.0308, "step": 8490 }, { "epoch": 0.6601941747572816, "grad_norm": 0.1218936451515912, "learning_rate": 2.839594420031674e-06, "loss": 0.0282, "step": 8500 }, { "epoch": 0.6609708737864077, "grad_norm": 0.10021986527015017, "learning_rate": 2.828019524296033e-06, "loss": 0.03, "step": 8510 }, { "epoch": 0.661747572815534, "grad_norm": 0.08200888442799228, "learning_rate": 2.8164589580753843e-06, "loss": 0.0307, "step": 8520 }, { "epoch": 0.6625242718446602, "grad_norm": 0.07332988867386886, "learning_rate": 2.804912797639896e-06, "loss": 0.0298, "step": 8530 }, { "epoch": 0.6633009708737864, "grad_norm": 0.09205628853463732, "learning_rate": 2.79338111916469e-06, "loss": 0.0295, "step": 8540 }, { "epoch": 0.6640776699029126, "grad_norm": 0.06851198552916718, "learning_rate": 2.7818639987293437e-06, "loss": 0.0294, "step": 8550 }, { "epoch": 0.6648543689320389, "grad_norm": 0.11911242622185858, "learning_rate": 2.770361512317387e-06, "loss": 0.0302, "step": 8560 }, { "epoch": 0.665631067961165, "grad_norm": 0.07673891866189778, "learning_rate": 2.7588737358158135e-06, "loss": 0.0302, "step": 8570 }, { "epoch": 0.6664077669902913, "grad_norm": 0.17588796279229735, "learning_rate": 2.747400745014557e-06, "loss": 0.0304, "step": 8580 }, { "epoch": 0.6671844660194175, "grad_norm": 0.1467545391725963, "learning_rate": 2.7359426156060055e-06, "loss": 0.0285, "step": 8590 }, { "epoch": 0.6679611650485436, "grad_norm": 0.15591008642329526, "learning_rate": 2.724499423184506e-06, "loss": 0.03, "step": 8600 }, { "epoch": 0.6687378640776699, "grad_norm": 0.1289026167233618, "learning_rate": 2.7130712432458537e-06, "loss": 0.0334, "step": 8610 }, { "epoch": 0.6695145631067961, "grad_norm": 0.07272308623703849, "learning_rate": 2.7016581511868007e-06, "loss": 0.0298, "step": 8620 }, { "epoch": 0.6702912621359224, "grad_norm": 0.0841200022259912, "learning_rate": 2.69026022230456e-06, "loss": 0.0279, "step": 8630 }, { "epoch": 0.6710679611650485, "grad_norm": 0.10098670751010223, "learning_rate": 2.678877531796307e-06, "loss": 0.0391, "step": 8640 }, { "epoch": 0.6718446601941748, "grad_norm": 0.10122853637990106, "learning_rate": 2.667510154758679e-06, "loss": 0.0297, "step": 8650 }, { "epoch": 0.672621359223301, "grad_norm": 0.07892680100903056, "learning_rate": 2.656158166187286e-06, "loss": 0.0292, "step": 8660 }, { "epoch": 0.6733980582524272, "grad_norm": 0.09059695302543166, "learning_rate": 2.644821640976211e-06, "loss": 0.0304, "step": 8670 }, { "epoch": 0.6741747572815534, "grad_norm": 0.10602014538458575, "learning_rate": 2.633500653917522e-06, "loss": 0.0385, "step": 8680 }, { "epoch": 0.6749514563106797, "grad_norm": 0.08088212374502025, "learning_rate": 2.622195279700776e-06, "loss": 0.03, "step": 8690 }, { "epoch": 0.6757281553398058, "grad_norm": 0.16510997433437155, "learning_rate": 2.6109055929125194e-06, "loss": 0.0313, "step": 8700 }, { "epoch": 0.676504854368932, "grad_norm": 0.11278832266695998, "learning_rate": 2.5996316680358047e-06, "loss": 0.0315, "step": 8710 }, { "epoch": 0.6772815533980583, "grad_norm": 0.07305276497039706, "learning_rate": 2.588373579449698e-06, "loss": 0.029, "step": 8720 }, { "epoch": 0.6780582524271844, "grad_norm": 0.1292927162163381, "learning_rate": 2.5771314014287818e-06, "loss": 0.031, "step": 8730 }, { "epoch": 0.6788349514563107, "grad_norm": 0.08776348222568374, "learning_rate": 2.5659052081426773e-06, "loss": 0.0285, "step": 8740 }, { "epoch": 0.6796116504854369, "grad_norm": 0.09928867865342123, "learning_rate": 2.5546950736555367e-06, "loss": 0.029, "step": 8750 }, { "epoch": 0.6803883495145631, "grad_norm": 0.07884943842089363, "learning_rate": 2.543501071925576e-06, "loss": 0.0273, "step": 8760 }, { "epoch": 0.6811650485436893, "grad_norm": 0.10519925911505033, "learning_rate": 2.532323276804569e-06, "loss": 0.0302, "step": 8770 }, { "epoch": 0.6819417475728156, "grad_norm": 0.10491567446850888, "learning_rate": 2.5211617620373686e-06, "loss": 0.0303, "step": 8780 }, { "epoch": 0.6827184466019417, "grad_norm": 0.0924169831469643, "learning_rate": 2.5100166012614203e-06, "loss": 0.0297, "step": 8790 }, { "epoch": 0.683495145631068, "grad_norm": 0.11851783777848983, "learning_rate": 2.498887868006277e-06, "loss": 0.0294, "step": 8800 }, { "epoch": 0.6842718446601942, "grad_norm": 0.09305177948730388, "learning_rate": 2.4877756356931084e-06, "loss": 0.029, "step": 8810 }, { "epoch": 0.6850485436893204, "grad_norm": 0.11115753061698577, "learning_rate": 2.4766799776342215e-06, "loss": 0.0294, "step": 8820 }, { "epoch": 0.6858252427184466, "grad_norm": 0.27234451387500563, "learning_rate": 2.465600967032574e-06, "loss": 0.0289, "step": 8830 }, { "epoch": 0.6866019417475728, "grad_norm": 0.12000459743370118, "learning_rate": 2.454538676981296e-06, "loss": 0.0281, "step": 8840 }, { "epoch": 0.6873786407766991, "grad_norm": 0.1131683869016408, "learning_rate": 2.4434931804632057e-06, "loss": 0.0295, "step": 8850 }, { "epoch": 0.6881553398058252, "grad_norm": 0.13180260673480046, "learning_rate": 2.43246455035032e-06, "loss": 0.0288, "step": 8860 }, { "epoch": 0.6889320388349515, "grad_norm": 0.1225457714492501, "learning_rate": 2.4214528594033904e-06, "loss": 0.0308, "step": 8870 }, { "epoch": 0.6897087378640777, "grad_norm": 0.06994729027201395, "learning_rate": 2.410458180271405e-06, "loss": 0.0309, "step": 8880 }, { "epoch": 0.6904854368932039, "grad_norm": 0.13934262203296532, "learning_rate": 2.399480585491122e-06, "loss": 0.0307, "step": 8890 }, { "epoch": 0.6912621359223301, "grad_norm": 0.06569964272188344, "learning_rate": 2.3885201474865815e-06, "loss": 0.0303, "step": 8900 }, { "epoch": 0.6920388349514563, "grad_norm": 0.10224631183143378, "learning_rate": 2.37757693856864e-06, "loss": 0.0355, "step": 8910 }, { "epoch": 0.6928155339805825, "grad_norm": 0.1273739945665216, "learning_rate": 2.366651030934483e-06, "loss": 0.0293, "step": 8920 }, { "epoch": 0.6935922330097087, "grad_norm": 0.12433009387723798, "learning_rate": 2.3557424966671504e-06, "loss": 0.0317, "step": 8930 }, { "epoch": 0.694368932038835, "grad_norm": 0.1334114972588639, "learning_rate": 2.344851407735061e-06, "loss": 0.0308, "step": 8940 }, { "epoch": 0.6951456310679611, "grad_norm": 0.12074227652311909, "learning_rate": 2.333977835991545e-06, "loss": 0.0302, "step": 8950 }, { "epoch": 0.6959223300970874, "grad_norm": 0.15156153138430148, "learning_rate": 2.3231218531743564e-06, "loss": 0.0302, "step": 8960 }, { "epoch": 0.6966990291262136, "grad_norm": 0.08774528875649107, "learning_rate": 2.312283530905215e-06, "loss": 0.0316, "step": 8970 }, { "epoch": 0.6974757281553398, "grad_norm": 0.08600394541299058, "learning_rate": 2.3014629406893185e-06, "loss": 0.0297, "step": 8980 }, { "epoch": 0.698252427184466, "grad_norm": 0.11454738284688669, "learning_rate": 2.290660153914886e-06, "loss": 0.03, "step": 8990 }, { "epoch": 0.6990291262135923, "grad_norm": 0.1251780650330318, "learning_rate": 2.2798752418526736e-06, "loss": 0.0307, "step": 9000 }, { "epoch": 0.6990291262135923, "eval_loss": 0.03117072768509388, "eval_runtime": 1428.1093, "eval_samples_per_second": 3.93, "eval_steps_per_second": 0.281, "step": 9000 }, { "epoch": 0.6998058252427184, "grad_norm": 0.19527055238713387, "learning_rate": 2.26910827565551e-06, "loss": 0.0318, "step": 9010 }, { "epoch": 0.7005825242718446, "grad_norm": 0.10121378685235313, "learning_rate": 2.258359326357831e-06, "loss": 0.0285, "step": 9020 }, { "epoch": 0.7013592233009709, "grad_norm": 0.0873113866577195, "learning_rate": 2.247628464875207e-06, "loss": 0.0308, "step": 9030 }, { "epoch": 0.7021359223300971, "grad_norm": 0.0769734007087443, "learning_rate": 2.236915762003872e-06, "loss": 0.0288, "step": 9040 }, { "epoch": 0.7029126213592233, "grad_norm": 0.15923504680585968, "learning_rate": 2.22622128842026e-06, "loss": 0.0313, "step": 9050 }, { "epoch": 0.7036893203883495, "grad_norm": 0.12964740915193693, "learning_rate": 2.215545114680537e-06, "loss": 0.0292, "step": 9060 }, { "epoch": 0.7044660194174758, "grad_norm": 0.08804978334359373, "learning_rate": 2.2048873112201414e-06, "loss": 0.0297, "step": 9070 }, { "epoch": 0.7052427184466019, "grad_norm": 0.18806066922868644, "learning_rate": 2.1942479483533135e-06, "loss": 0.031, "step": 9080 }, { "epoch": 0.7060194174757282, "grad_norm": 0.11028708938020265, "learning_rate": 2.183627096272631e-06, "loss": 0.0271, "step": 9090 }, { "epoch": 0.7067961165048544, "grad_norm": 0.08975581174201842, "learning_rate": 2.173024825048545e-06, "loss": 0.0303, "step": 9100 }, { "epoch": 0.7075728155339805, "grad_norm": 0.11181543134046373, "learning_rate": 2.1624412046289294e-06, "loss": 0.0291, "step": 9110 }, { "epoch": 0.7083495145631068, "grad_norm": 0.11323857684837961, "learning_rate": 2.1518763048386e-06, "loss": 0.0296, "step": 9120 }, { "epoch": 0.709126213592233, "grad_norm": 0.1410048324104181, "learning_rate": 2.1413301953788733e-06, "loss": 0.0316, "step": 9130 }, { "epoch": 0.7099029126213592, "grad_norm": 0.10302695232538428, "learning_rate": 2.13080294582709e-06, "loss": 0.0428, "step": 9140 }, { "epoch": 0.7106796116504854, "grad_norm": 0.09548256620444136, "learning_rate": 2.120294625636171e-06, "loss": 0.0295, "step": 9150 }, { "epoch": 0.7114563106796117, "grad_norm": 0.10489023705850714, "learning_rate": 2.109805304134147e-06, "loss": 0.0318, "step": 9160 }, { "epoch": 0.7122330097087378, "grad_norm": 0.10407836962418725, "learning_rate": 2.0993350505237046e-06, "loss": 0.0299, "step": 9170 }, { "epoch": 0.7130097087378641, "grad_norm": 0.09729959137544676, "learning_rate": 2.0888839338817364e-06, "loss": 0.0299, "step": 9180 }, { "epoch": 0.7137864077669903, "grad_norm": 0.0980545918641315, "learning_rate": 2.0784520231588778e-06, "loss": 0.0284, "step": 9190 }, { "epoch": 0.7145631067961165, "grad_norm": 0.11087719216515526, "learning_rate": 2.068039387179053e-06, "loss": 0.0312, "step": 9200 }, { "epoch": 0.7153398058252427, "grad_norm": 0.12283265384106902, "learning_rate": 2.0576460946390204e-06, "loss": 0.0319, "step": 9210 }, { "epoch": 0.716116504854369, "grad_norm": 0.13671132553068388, "learning_rate": 2.047272214107927e-06, "loss": 0.0291, "step": 9220 }, { "epoch": 0.7168932038834952, "grad_norm": 0.11749938227052398, "learning_rate": 2.0369178140268462e-06, "loss": 0.0296, "step": 9230 }, { "epoch": 0.7176699029126213, "grad_norm": 0.1058660257756773, "learning_rate": 2.0265829627083284e-06, "loss": 0.0295, "step": 9240 }, { "epoch": 0.7184466019417476, "grad_norm": 0.0861269823529748, "learning_rate": 2.0162677283359565e-06, "loss": 0.0302, "step": 9250 }, { "epoch": 0.7192233009708738, "grad_norm": 0.06735702278985004, "learning_rate": 2.0059721789638918e-06, "loss": 0.0312, "step": 9260 }, { "epoch": 0.72, "grad_norm": 0.07633477740225952, "learning_rate": 1.9956963825164216e-06, "loss": 0.0297, "step": 9270 }, { "epoch": 0.7207766990291262, "grad_norm": 0.09029039120862145, "learning_rate": 1.9854404067875157e-06, "loss": 0.0314, "step": 9280 }, { "epoch": 0.7215533980582525, "grad_norm": 0.15277293489315164, "learning_rate": 1.9752043194403757e-06, "loss": 0.029, "step": 9290 }, { "epoch": 0.7223300970873786, "grad_norm": 0.10379187448566798, "learning_rate": 1.964988188006996e-06, "loss": 0.0295, "step": 9300 }, { "epoch": 0.7231067961165049, "grad_norm": 0.10493490654122671, "learning_rate": 1.9547920798877118e-06, "loss": 0.032, "step": 9310 }, { "epoch": 0.7238834951456311, "grad_norm": 0.07088158260522583, "learning_rate": 1.9446160623507517e-06, "loss": 0.0297, "step": 9320 }, { "epoch": 0.7246601941747572, "grad_norm": 0.11541160378490888, "learning_rate": 1.9344602025317983e-06, "loss": 0.0318, "step": 9330 }, { "epoch": 0.7254368932038835, "grad_norm": 0.07324557959416572, "learning_rate": 1.924324567433551e-06, "loss": 0.0295, "step": 9340 }, { "epoch": 0.7262135922330097, "grad_norm": 0.13899103849811528, "learning_rate": 1.914209223925269e-06, "loss": 0.0284, "step": 9350 }, { "epoch": 0.7269902912621359, "grad_norm": 0.07976570057082337, "learning_rate": 1.9041142387423478e-06, "loss": 0.0296, "step": 9360 }, { "epoch": 0.7277669902912621, "grad_norm": 0.08513766888444996, "learning_rate": 1.8940396784858616e-06, "loss": 0.0291, "step": 9370 }, { "epoch": 0.7285436893203884, "grad_norm": 0.10782557967723075, "learning_rate": 1.8839856096221399e-06, "loss": 0.0287, "step": 9380 }, { "epoch": 0.7293203883495145, "grad_norm": 0.13272179375877996, "learning_rate": 1.873952098482316e-06, "loss": 0.0283, "step": 9390 }, { "epoch": 0.7300970873786408, "grad_norm": 0.1260909252531801, "learning_rate": 1.863939211261896e-06, "loss": 0.0318, "step": 9400 }, { "epoch": 0.730873786407767, "grad_norm": 0.12371460068678838, "learning_rate": 1.8539470140203214e-06, "loss": 0.0291, "step": 9410 }, { "epoch": 0.7316504854368931, "grad_norm": 0.09135423639130927, "learning_rate": 1.8439755726805365e-06, "loss": 0.0309, "step": 9420 }, { "epoch": 0.7324271844660194, "grad_norm": 0.08748869338921458, "learning_rate": 1.8340249530285437e-06, "loss": 0.0302, "step": 9430 }, { "epoch": 0.7332038834951456, "grad_norm": 0.12175012692332783, "learning_rate": 1.8240952207129759e-06, "loss": 0.0313, "step": 9440 }, { "epoch": 0.7339805825242719, "grad_norm": 0.06033949185291851, "learning_rate": 1.814186441244669e-06, "loss": 0.0285, "step": 9450 }, { "epoch": 0.734757281553398, "grad_norm": 0.08798154377806218, "learning_rate": 1.8042986799962164e-06, "loss": 0.0318, "step": 9460 }, { "epoch": 0.7355339805825243, "grad_norm": 0.07862467186854985, "learning_rate": 1.794432002201552e-06, "loss": 0.03, "step": 9470 }, { "epoch": 0.7363106796116505, "grad_norm": 0.07668025720394449, "learning_rate": 1.7845864729555058e-06, "loss": 0.0293, "step": 9480 }, { "epoch": 0.7370873786407767, "grad_norm": 0.07916979854032626, "learning_rate": 1.7747621572133883e-06, "loss": 0.032, "step": 9490 }, { "epoch": 0.7378640776699029, "grad_norm": 0.11183351119175876, "learning_rate": 1.7649591197905503e-06, "loss": 0.032, "step": 9500 }, { "epoch": 0.7386407766990292, "grad_norm": 0.0677545189790146, "learning_rate": 1.7551774253619608e-06, "loss": 0.0273, "step": 9510 }, { "epoch": 0.7394174757281553, "grad_norm": 0.1222864105518296, "learning_rate": 1.745417138461778e-06, "loss": 0.0303, "step": 9520 }, { "epoch": 0.7401941747572816, "grad_norm": 0.10074916036285318, "learning_rate": 1.7356783234829343e-06, "loss": 0.0322, "step": 9530 }, { "epoch": 0.7409708737864078, "grad_norm": 0.11909280878905262, "learning_rate": 1.7259610446766923e-06, "loss": 0.0305, "step": 9540 }, { "epoch": 0.7417475728155339, "grad_norm": 0.0637500032253991, "learning_rate": 1.7162653661522338e-06, "loss": 0.0323, "step": 9550 }, { "epoch": 0.7425242718446602, "grad_norm": 0.10042150789013699, "learning_rate": 1.7065913518762333e-06, "loss": 0.029, "step": 9560 }, { "epoch": 0.7433009708737864, "grad_norm": 0.12242924728394321, "learning_rate": 1.6969390656724416e-06, "loss": 0.0287, "step": 9570 }, { "epoch": 0.7440776699029126, "grad_norm": 0.10202418959076494, "learning_rate": 1.6873085712212522e-06, "loss": 0.0292, "step": 9580 }, { "epoch": 0.7448543689320388, "grad_norm": 0.07166178840939745, "learning_rate": 1.6776999320592963e-06, "loss": 0.0289, "step": 9590 }, { "epoch": 0.7456310679611651, "grad_norm": 0.09640673497539176, "learning_rate": 1.6681132115790088e-06, "loss": 0.0296, "step": 9600 }, { "epoch": 0.7464077669902912, "grad_norm": 0.09172676290632907, "learning_rate": 1.6585484730282253e-06, "loss": 0.03, "step": 9610 }, { "epoch": 0.7471844660194175, "grad_norm": 0.09566299016697107, "learning_rate": 1.6490057795097503e-06, "loss": 0.0282, "step": 9620 }, { "epoch": 0.7479611650485437, "grad_norm": 0.09145215285562494, "learning_rate": 1.6394851939809481e-06, "loss": 0.0313, "step": 9630 }, { "epoch": 0.74873786407767, "grad_norm": 0.1280366111853625, "learning_rate": 1.6299867792533303e-06, "loss": 0.0291, "step": 9640 }, { "epoch": 0.7495145631067961, "grad_norm": 0.14993152211708946, "learning_rate": 1.6205105979921382e-06, "loss": 0.0298, "step": 9650 }, { "epoch": 0.7502912621359223, "grad_norm": 0.09160018622421008, "learning_rate": 1.6110567127159254e-06, "loss": 0.0377, "step": 9660 }, { "epoch": 0.7510679611650486, "grad_norm": 0.17406408055005052, "learning_rate": 1.60162518579615e-06, "loss": 0.0289, "step": 9670 }, { "epoch": 0.7518446601941747, "grad_norm": 0.0973060319212948, "learning_rate": 1.5922160794567627e-06, "loss": 0.0289, "step": 9680 }, { "epoch": 0.752621359223301, "grad_norm": 0.11270092225683771, "learning_rate": 1.582829455773796e-06, "loss": 0.0301, "step": 9690 }, { "epoch": 0.7533980582524272, "grad_norm": 0.2119566568949407, "learning_rate": 1.573465376674957e-06, "loss": 0.0286, "step": 9700 }, { "epoch": 0.7541747572815534, "grad_norm": 0.12728911054481812, "learning_rate": 1.5641239039392109e-06, "loss": 0.0315, "step": 9710 }, { "epoch": 0.7549514563106796, "grad_norm": 0.07606805762099554, "learning_rate": 1.5548050991963809e-06, "loss": 0.033, "step": 9720 }, { "epoch": 0.7557281553398059, "grad_norm": 0.07257317505040604, "learning_rate": 1.5455090239267418e-06, "loss": 0.0292, "step": 9730 }, { "epoch": 0.756504854368932, "grad_norm": 0.06944120902054417, "learning_rate": 1.5362357394606082e-06, "loss": 0.0313, "step": 9740 }, { "epoch": 0.7572815533980582, "grad_norm": 0.08864314265909914, "learning_rate": 1.5269853069779362e-06, "loss": 0.0297, "step": 9750 }, { "epoch": 0.7580582524271845, "grad_norm": 0.12438330860403829, "learning_rate": 1.517757787507918e-06, "loss": 0.0297, "step": 9760 }, { "epoch": 0.7588349514563106, "grad_norm": 0.11642856466570944, "learning_rate": 1.508553241928577e-06, "loss": 0.0301, "step": 9770 }, { "epoch": 0.7596116504854369, "grad_norm": 0.14500695291999435, "learning_rate": 1.4993717309663663e-06, "loss": 0.0292, "step": 9780 }, { "epoch": 0.7603883495145631, "grad_norm": 0.21992848077754557, "learning_rate": 1.4902133151957703e-06, "loss": 0.03, "step": 9790 }, { "epoch": 0.7611650485436893, "grad_norm": 0.07427022079220863, "learning_rate": 1.4810780550389064e-06, "loss": 0.0318, "step": 9800 }, { "epoch": 0.7619417475728155, "grad_norm": 0.09703195058562447, "learning_rate": 1.4719660107651246e-06, "loss": 0.0283, "step": 9810 }, { "epoch": 0.7627184466019418, "grad_norm": 0.07225891039417892, "learning_rate": 1.4628772424906051e-06, "loss": 0.0284, "step": 9820 }, { "epoch": 0.7634951456310679, "grad_norm": 0.11253818971492703, "learning_rate": 1.4538118101779674e-06, "loss": 0.0287, "step": 9830 }, { "epoch": 0.7642718446601942, "grad_norm": 0.0929048973306013, "learning_rate": 1.4447697736358785e-06, "loss": 0.0293, "step": 9840 }, { "epoch": 0.7650485436893204, "grad_norm": 0.07952157169805243, "learning_rate": 1.4357511925186462e-06, "loss": 0.029, "step": 9850 }, { "epoch": 0.7658252427184467, "grad_norm": 0.10702167194398754, "learning_rate": 1.4267561263258344e-06, "loss": 0.0278, "step": 9860 }, { "epoch": 0.7666019417475728, "grad_norm": 0.09497661302918059, "learning_rate": 1.4177846344018713e-06, "loss": 0.0294, "step": 9870 }, { "epoch": 0.767378640776699, "grad_norm": 0.10258018451948869, "learning_rate": 1.4088367759356557e-06, "loss": 0.0281, "step": 9880 }, { "epoch": 0.7681553398058253, "grad_norm": 0.08982721871770394, "learning_rate": 1.3999126099601619e-06, "loss": 0.0291, "step": 9890 }, { "epoch": 0.7689320388349514, "grad_norm": 0.08243057907328787, "learning_rate": 1.3910121953520555e-06, "loss": 0.0281, "step": 9900 }, { "epoch": 0.7697087378640777, "grad_norm": 0.07402687072327788, "learning_rate": 1.382135590831305e-06, "loss": 0.0295, "step": 9910 }, { "epoch": 0.7704854368932039, "grad_norm": 0.12191920990089877, "learning_rate": 1.3732828549607936e-06, "loss": 0.0293, "step": 9920 }, { "epoch": 0.7712621359223301, "grad_norm": 0.10525153263643547, "learning_rate": 1.3644540461459327e-06, "loss": 0.0279, "step": 9930 }, { "epoch": 0.7720388349514563, "grad_norm": 0.09818138307438501, "learning_rate": 1.3556492226342755e-06, "loss": 0.0296, "step": 9940 }, { "epoch": 0.7728155339805826, "grad_norm": 0.1245914731989837, "learning_rate": 1.3468684425151308e-06, "loss": 0.0283, "step": 9950 }, { "epoch": 0.7735922330097087, "grad_norm": 0.12570037458163955, "learning_rate": 1.3381117637191887e-06, "loss": 0.039, "step": 9960 }, { "epoch": 0.774368932038835, "grad_norm": 0.12970621795539192, "learning_rate": 1.3293792440181263e-06, "loss": 0.029, "step": 9970 }, { "epoch": 0.7751456310679612, "grad_norm": 0.15504326506540855, "learning_rate": 1.3206709410242368e-06, "loss": 0.031, "step": 9980 }, { "epoch": 0.7759223300970873, "grad_norm": 0.08162257751962226, "learning_rate": 1.3119869121900414e-06, "loss": 0.0292, "step": 9990 }, { "epoch": 0.7766990291262136, "grad_norm": 0.10091145531225423, "learning_rate": 1.303327214807918e-06, "loss": 0.0303, "step": 10000 }, { "epoch": 0.7766990291262136, "eval_loss": 0.030751362442970276, "eval_runtime": 1432.6875, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.28, "step": 10000 }, { "epoch": 0.7774757281553398, "grad_norm": 0.08458762989336281, "learning_rate": 1.2946919060097162e-06, "loss": 0.0315, "step": 10010 }, { "epoch": 0.778252427184466, "grad_norm": 0.08564299666651162, "learning_rate": 1.2860810427663818e-06, "loss": 0.0288, "step": 10020 }, { "epoch": 0.7790291262135922, "grad_norm": 0.10169282624241828, "learning_rate": 1.2774946818875867e-06, "loss": 0.028, "step": 10030 }, { "epoch": 0.7798058252427185, "grad_norm": 0.13264787729889163, "learning_rate": 1.2689328800213491e-06, "loss": 0.0387, "step": 10040 }, { "epoch": 0.7805825242718447, "grad_norm": 0.17751950436591826, "learning_rate": 1.2603956936536583e-06, "loss": 0.0286, "step": 10050 }, { "epoch": 0.7813592233009709, "grad_norm": 0.15666936060101594, "learning_rate": 1.2518831791081031e-06, "loss": 0.0313, "step": 10060 }, { "epoch": 0.7821359223300971, "grad_norm": 0.15665454495436415, "learning_rate": 1.2433953925455077e-06, "loss": 0.0324, "step": 10070 }, { "epoch": 0.7829126213592233, "grad_norm": 0.1674241483706272, "learning_rate": 1.2349323899635469e-06, "loss": 0.0299, "step": 10080 }, { "epoch": 0.7836893203883495, "grad_norm": 0.08328119793566058, "learning_rate": 1.226494227196392e-06, "loss": 0.0299, "step": 10090 }, { "epoch": 0.7844660194174757, "grad_norm": 0.07225062056766665, "learning_rate": 1.218080959914329e-06, "loss": 0.0305, "step": 10100 }, { "epoch": 0.785242718446602, "grad_norm": 0.10822791006074446, "learning_rate": 1.209692643623402e-06, "loss": 0.0297, "step": 10110 }, { "epoch": 0.7860194174757281, "grad_norm": 0.10037119677673761, "learning_rate": 1.2013293336650389e-06, "loss": 0.029, "step": 10120 }, { "epoch": 0.7867961165048544, "grad_norm": 0.08287376997331447, "learning_rate": 1.1929910852156906e-06, "loss": 0.0299, "step": 10130 }, { "epoch": 0.7875728155339806, "grad_norm": 0.06823440962329794, "learning_rate": 1.1846779532864676e-06, "loss": 0.0296, "step": 10140 }, { "epoch": 0.7883495145631068, "grad_norm": 0.2408116131496015, "learning_rate": 1.1763899927227768e-06, "loss": 0.0298, "step": 10150 }, { "epoch": 0.789126213592233, "grad_norm": 0.1684120592462139, "learning_rate": 1.1681272582039544e-06, "loss": 0.0541, "step": 10160 }, { "epoch": 0.7899029126213593, "grad_norm": 0.12401706853107786, "learning_rate": 1.1598898042429119e-06, "loss": 0.0304, "step": 10170 }, { "epoch": 0.7906796116504854, "grad_norm": 0.08514440328203628, "learning_rate": 1.1516776851857714e-06, "loss": 0.0288, "step": 10180 }, { "epoch": 0.7914563106796116, "grad_norm": 0.10531465931493261, "learning_rate": 1.1434909552115141e-06, "loss": 0.0272, "step": 10190 }, { "epoch": 0.7922330097087379, "grad_norm": 0.08384418850740256, "learning_rate": 1.1353296683316128e-06, "loss": 0.0292, "step": 10200 }, { "epoch": 0.793009708737864, "grad_norm": 0.08150358051257155, "learning_rate": 1.1271938783896862e-06, "loss": 0.0283, "step": 10210 }, { "epoch": 0.7937864077669903, "grad_norm": 0.1263227154754521, "learning_rate": 1.1190836390611325e-06, "loss": 0.0284, "step": 10220 }, { "epoch": 0.7945631067961165, "grad_norm": 0.08655660940802054, "learning_rate": 1.1109990038527878e-06, "loss": 0.0282, "step": 10230 }, { "epoch": 0.7953398058252428, "grad_norm": 0.08558854840202158, "learning_rate": 1.102940026102562e-06, "loss": 0.0296, "step": 10240 }, { "epoch": 0.7961165048543689, "grad_norm": 0.07692131719659867, "learning_rate": 1.0949067589790907e-06, "loss": 0.0283, "step": 10250 }, { "epoch": 0.7968932038834952, "grad_norm": 0.10139971527289053, "learning_rate": 1.0868992554813884e-06, "loss": 0.0301, "step": 10260 }, { "epoch": 0.7976699029126214, "grad_norm": 0.0832970601027867, "learning_rate": 1.0789175684384956e-06, "loss": 0.0286, "step": 10270 }, { "epoch": 0.7984466019417475, "grad_norm": 0.06789476435836798, "learning_rate": 1.070961750509128e-06, "loss": 0.0277, "step": 10280 }, { "epoch": 0.7992233009708738, "grad_norm": 0.11391535045135999, "learning_rate": 1.0630318541813294e-06, "loss": 0.0317, "step": 10290 }, { "epoch": 0.8, "grad_norm": 0.11978094440281316, "learning_rate": 1.0551279317721348e-06, "loss": 0.0304, "step": 10300 }, { "epoch": 0.8007766990291262, "grad_norm": 0.08468784844586731, "learning_rate": 1.0472500354272085e-06, "loss": 0.0337, "step": 10310 }, { "epoch": 0.8015533980582524, "grad_norm": 0.10367068338977112, "learning_rate": 1.0393982171205175e-06, "loss": 0.0292, "step": 10320 }, { "epoch": 0.8023300970873787, "grad_norm": 0.12079114080477278, "learning_rate": 1.031572528653974e-06, "loss": 0.0294, "step": 10330 }, { "epoch": 0.8031067961165048, "grad_norm": 0.06523473175241104, "learning_rate": 1.0237730216571063e-06, "loss": 0.0285, "step": 10340 }, { "epoch": 0.8038834951456311, "grad_norm": 0.0884836505753561, "learning_rate": 1.0159997475867062e-06, "loss": 0.0336, "step": 10350 }, { "epoch": 0.8046601941747573, "grad_norm": 0.09261397685997377, "learning_rate": 1.0082527577264972e-06, "loss": 0.0287, "step": 10360 }, { "epoch": 0.8054368932038835, "grad_norm": 0.07634871963880396, "learning_rate": 1.000532103186796e-06, "loss": 0.0303, "step": 10370 }, { "epoch": 0.8062135922330097, "grad_norm": 0.10465141214966443, "learning_rate": 9.928378349041734e-07, "loss": 0.0301, "step": 10380 }, { "epoch": 0.806990291262136, "grad_norm": 0.09227564806152772, "learning_rate": 9.851700036411165e-07, "loss": 0.0325, "step": 10390 }, { "epoch": 0.8077669902912621, "grad_norm": 0.1203406772067101, "learning_rate": 9.775286599856959e-07, "loss": 0.0287, "step": 10400 }, { "epoch": 0.8085436893203883, "grad_norm": 0.07527119506166657, "learning_rate": 9.69913854351232e-07, "loss": 0.0301, "step": 10410 }, { "epoch": 0.8093203883495146, "grad_norm": 0.139550626731937, "learning_rate": 9.623256369759631e-07, "loss": 0.0286, "step": 10420 }, { "epoch": 0.8100970873786407, "grad_norm": 0.12626316188796072, "learning_rate": 9.547640579227135e-07, "loss": 0.0269, "step": 10430 }, { "epoch": 0.810873786407767, "grad_norm": 0.0877359341073735, "learning_rate": 9.472291670785616e-07, "loss": 0.0283, "step": 10440 }, { "epoch": 0.8116504854368932, "grad_norm": 0.11711620151250532, "learning_rate": 9.397210141545094e-07, "loss": 0.0303, "step": 10450 }, { "epoch": 0.8124271844660195, "grad_norm": 0.08561784609382164, "learning_rate": 9.322396486851626e-07, "loss": 0.0302, "step": 10460 }, { "epoch": 0.8132038834951456, "grad_norm": 0.12727181819407235, "learning_rate": 9.24785120028393e-07, "loss": 0.0314, "step": 10470 }, { "epoch": 0.8139805825242719, "grad_norm": 0.08691769876726088, "learning_rate": 9.173574773650229e-07, "loss": 0.0283, "step": 10480 }, { "epoch": 0.8147572815533981, "grad_norm": 0.1314249533232593, "learning_rate": 9.099567696984896e-07, "loss": 0.0298, "step": 10490 }, { "epoch": 0.8155339805825242, "grad_norm": 0.11735735629664899, "learning_rate": 9.025830458545359e-07, "loss": 0.035, "step": 10500 }, { "epoch": 0.8163106796116505, "grad_norm": 0.21470385727650693, "learning_rate": 8.95236354480874e-07, "loss": 0.0297, "step": 10510 }, { "epoch": 0.8170873786407767, "grad_norm": 0.10759882231585681, "learning_rate": 8.879167440468745e-07, "loss": 0.0289, "step": 10520 }, { "epoch": 0.8178640776699029, "grad_norm": 0.09548973866177901, "learning_rate": 8.806242628432393e-07, "loss": 0.0287, "step": 10530 }, { "epoch": 0.8186407766990291, "grad_norm": 0.09242449704898173, "learning_rate": 8.733589589816921e-07, "loss": 0.0287, "step": 10540 }, { "epoch": 0.8194174757281554, "grad_norm": 0.1051801484812129, "learning_rate": 8.661208803946536e-07, "loss": 0.0295, "step": 10550 }, { "epoch": 0.8201941747572815, "grad_norm": 0.12831864290827705, "learning_rate": 8.589100748349255e-07, "loss": 0.0298, "step": 10560 }, { "epoch": 0.8209708737864078, "grad_norm": 0.12103383252107582, "learning_rate": 8.517265898753773e-07, "loss": 0.0347, "step": 10570 }, { "epoch": 0.821747572815534, "grad_norm": 0.07442769811812062, "learning_rate": 8.445704729086362e-07, "loss": 0.028, "step": 10580 }, { "epoch": 0.8225242718446601, "grad_norm": 0.16633200177108767, "learning_rate": 8.374417711467659e-07, "loss": 0.0317, "step": 10590 }, { "epoch": 0.8233009708737864, "grad_norm": 0.10236428728281714, "learning_rate": 8.303405316209628e-07, "loss": 0.028, "step": 10600 }, { "epoch": 0.8240776699029126, "grad_norm": 0.0827250648423402, "learning_rate": 8.232668011812433e-07, "loss": 0.0286, "step": 10610 }, { "epoch": 0.8248543689320388, "grad_norm": 0.09584895882528374, "learning_rate": 8.162206264961326e-07, "loss": 0.0286, "step": 10620 }, { "epoch": 0.825631067961165, "grad_norm": 0.11061935757986574, "learning_rate": 8.092020540523576e-07, "loss": 0.0315, "step": 10630 }, { "epoch": 0.8264077669902913, "grad_norm": 0.07680448219084145, "learning_rate": 8.022111301545415e-07, "loss": 0.0293, "step": 10640 }, { "epoch": 0.8271844660194175, "grad_norm": 0.09757130677361327, "learning_rate": 7.952479009249003e-07, "loss": 0.0297, "step": 10650 }, { "epoch": 0.8279611650485437, "grad_norm": 0.14455368949840322, "learning_rate": 7.88312412302935e-07, "loss": 0.0313, "step": 10660 }, { "epoch": 0.8287378640776699, "grad_norm": 0.06371704342006297, "learning_rate": 7.814047100451278e-07, "loss": 0.0298, "step": 10670 }, { "epoch": 0.8295145631067962, "grad_norm": 0.08105956529716732, "learning_rate": 7.745248397246418e-07, "loss": 0.031, "step": 10680 }, { "epoch": 0.8302912621359223, "grad_norm": 0.10174583655287388, "learning_rate": 7.676728467310235e-07, "loss": 0.0313, "step": 10690 }, { "epoch": 0.8310679611650486, "grad_norm": 0.09527646882334112, "learning_rate": 7.608487762698969e-07, "loss": 0.0311, "step": 10700 }, { "epoch": 0.8318446601941748, "grad_norm": 0.0637043513883016, "learning_rate": 7.540526733626724e-07, "loss": 0.03, "step": 10710 }, { "epoch": 0.8326213592233009, "grad_norm": 0.07396675409243425, "learning_rate": 7.472845828462421e-07, "loss": 0.0302, "step": 10720 }, { "epoch": 0.8333980582524272, "grad_norm": 0.0648448018549443, "learning_rate": 7.40544549372692e-07, "loss": 0.0279, "step": 10730 }, { "epoch": 0.8341747572815534, "grad_norm": 0.07369294304788454, "learning_rate": 7.338326174089999e-07, "loss": 0.0301, "step": 10740 }, { "epoch": 0.8349514563106796, "grad_norm": 0.11041412074322586, "learning_rate": 7.271488312367464e-07, "loss": 0.0321, "step": 10750 }, { "epoch": 0.8357281553398058, "grad_norm": 0.108484139283848, "learning_rate": 7.204932349518235e-07, "loss": 0.0305, "step": 10760 }, { "epoch": 0.8365048543689321, "grad_norm": 0.09727270797586987, "learning_rate": 7.138658724641417e-07, "loss": 0.0292, "step": 10770 }, { "epoch": 0.8372815533980582, "grad_norm": 0.086859894935072, "learning_rate": 7.072667874973388e-07, "loss": 0.0299, "step": 10780 }, { "epoch": 0.8380582524271845, "grad_norm": 0.07850308524713388, "learning_rate": 7.006960235884952e-07, "loss": 0.0295, "step": 10790 }, { "epoch": 0.8388349514563107, "grad_norm": 0.13522686221605362, "learning_rate": 6.941536240878422e-07, "loss": 0.0305, "step": 10800 }, { "epoch": 0.8396116504854368, "grad_norm": 0.07353865199900012, "learning_rate": 6.876396321584816e-07, "loss": 0.0309, "step": 10810 }, { "epoch": 0.8403883495145631, "grad_norm": 0.08695704157621682, "learning_rate": 6.811540907760977e-07, "loss": 0.0282, "step": 10820 }, { "epoch": 0.8411650485436893, "grad_norm": 0.127470442680291, "learning_rate": 6.746970427286725e-07, "loss": 0.0305, "step": 10830 }, { "epoch": 0.8419417475728155, "grad_norm": 0.08502360332026991, "learning_rate": 6.68268530616204e-07, "loss": 0.0282, "step": 10840 }, { "epoch": 0.8427184466019417, "grad_norm": 0.09606189842757615, "learning_rate": 6.61868596850429e-07, "loss": 0.0333, "step": 10850 }, { "epoch": 0.843495145631068, "grad_norm": 0.21955446653381133, "learning_rate": 6.554972836545381e-07, "loss": 0.0287, "step": 10860 }, { "epoch": 0.8442718446601942, "grad_norm": 0.08128253010153715, "learning_rate": 6.491546330628984e-07, "loss": 0.0287, "step": 10870 }, { "epoch": 0.8450485436893204, "grad_norm": 0.07422195573679011, "learning_rate": 6.428406869207804e-07, "loss": 0.0289, "step": 10880 }, { "epoch": 0.8458252427184466, "grad_norm": 0.08732202570367426, "learning_rate": 6.365554868840773e-07, "loss": 0.0291, "step": 10890 }, { "epoch": 0.8466019417475729, "grad_norm": 0.11085891058971796, "learning_rate": 6.302990744190307e-07, "loss": 0.0309, "step": 10900 }, { "epoch": 0.847378640776699, "grad_norm": 0.11566336739113821, "learning_rate": 6.240714908019568e-07, "loss": 0.0304, "step": 10910 }, { "epoch": 0.8481553398058252, "grad_norm": 0.12131849153586224, "learning_rate": 6.178727771189796e-07, "loss": 0.0366, "step": 10920 }, { "epoch": 0.8489320388349515, "grad_norm": 0.08794021066657202, "learning_rate": 6.117029742657493e-07, "loss": 0.0293, "step": 10930 }, { "epoch": 0.8497087378640776, "grad_norm": 0.08901016988914191, "learning_rate": 6.055621229471842e-07, "loss": 0.0317, "step": 10940 }, { "epoch": 0.8504854368932039, "grad_norm": 0.07896200704599227, "learning_rate": 5.994502636771926e-07, "loss": 0.0302, "step": 10950 }, { "epoch": 0.8512621359223301, "grad_norm": 0.112033441385661, "learning_rate": 5.93367436778412e-07, "loss": 0.0313, "step": 10960 }, { "epoch": 0.8520388349514563, "grad_norm": 0.09782985945629974, "learning_rate": 5.873136823819397e-07, "loss": 0.0294, "step": 10970 }, { "epoch": 0.8528155339805825, "grad_norm": 0.11909990994094, "learning_rate": 5.812890404270661e-07, "loss": 0.0277, "step": 10980 }, { "epoch": 0.8535922330097088, "grad_norm": 0.10623953071744352, "learning_rate": 5.752935506610185e-07, "loss": 0.0287, "step": 10990 }, { "epoch": 0.8543689320388349, "grad_norm": 0.10926727945682298, "learning_rate": 5.693272526386934e-07, "loss": 0.0283, "step": 11000 }, { "epoch": 0.8543689320388349, "eval_loss": 0.030492937192320824, "eval_runtime": 1431.4907, "eval_samples_per_second": 3.921, "eval_steps_per_second": 0.28, "step": 11000 }, { "epoch": 0.8551456310679612, "grad_norm": 0.070297704594749, "learning_rate": 5.633901857223934e-07, "loss": 0.0282, "step": 11010 }, { "epoch": 0.8559223300970874, "grad_norm": 0.08016447701616448, "learning_rate": 5.574823890815733e-07, "loss": 0.0289, "step": 11020 }, { "epoch": 0.8566990291262135, "grad_norm": 0.11411616931696593, "learning_rate": 5.516039016925772e-07, "loss": 0.0301, "step": 11030 }, { "epoch": 0.8574757281553398, "grad_norm": 0.07327199972498653, "learning_rate": 5.457547623383846e-07, "loss": 0.0283, "step": 11040 }, { "epoch": 0.858252427184466, "grad_norm": 0.0730656339298446, "learning_rate": 5.399350096083538e-07, "loss": 0.0285, "step": 11050 }, { "epoch": 0.8590291262135923, "grad_norm": 0.07301106813381375, "learning_rate": 5.341446818979645e-07, "loss": 0.0303, "step": 11060 }, { "epoch": 0.8598058252427184, "grad_norm": 0.08775394994384228, "learning_rate": 5.283838174085665e-07, "loss": 0.0298, "step": 11070 }, { "epoch": 0.8605825242718447, "grad_norm": 0.1207232826976455, "learning_rate": 5.2265245414713e-07, "loss": 0.0298, "step": 11080 }, { "epoch": 0.8613592233009709, "grad_norm": 0.10363626467415733, "learning_rate": 5.169506299259891e-07, "loss": 0.0275, "step": 11090 }, { "epoch": 0.8621359223300971, "grad_norm": 0.07250475935327676, "learning_rate": 5.112783823626005e-07, "loss": 0.0353, "step": 11100 }, { "epoch": 0.8629126213592233, "grad_norm": 0.09432928035675812, "learning_rate": 5.056357488792852e-07, "loss": 0.0291, "step": 11110 }, { "epoch": 0.8636893203883496, "grad_norm": 0.08414231355432496, "learning_rate": 5.000227667029922e-07, "loss": 0.0274, "step": 11120 }, { "epoch": 0.8644660194174757, "grad_norm": 0.06444327748353273, "learning_rate": 4.944394728650448e-07, "loss": 0.029, "step": 11130 }, { "epoch": 0.8652427184466019, "grad_norm": 0.13267626285357073, "learning_rate": 4.888859042008981e-07, "loss": 0.0303, "step": 11140 }, { "epoch": 0.8660194174757282, "grad_norm": 0.13311864986131422, "learning_rate": 4.833620973498993e-07, "loss": 0.0307, "step": 11150 }, { "epoch": 0.8667961165048543, "grad_norm": 0.0675202753942419, "learning_rate": 4.778680887550436e-07, "loss": 0.0296, "step": 11160 }, { "epoch": 0.8675728155339806, "grad_norm": 0.08712230022003857, "learning_rate": 4.7240391466273407e-07, "loss": 0.0302, "step": 11170 }, { "epoch": 0.8683495145631068, "grad_norm": 0.07893994791170128, "learning_rate": 4.669696111225408e-07, "loss": 0.0301, "step": 11180 }, { "epoch": 0.869126213592233, "grad_norm": 0.07849567407540838, "learning_rate": 4.615652139869664e-07, "loss": 0.0306, "step": 11190 }, { "epoch": 0.8699029126213592, "grad_norm": 0.0698234995045095, "learning_rate": 4.561907589112069e-07, "loss": 0.0316, "step": 11200 }, { "epoch": 0.8706796116504855, "grad_norm": 0.09678199222071401, "learning_rate": 4.5084628135291633e-07, "loss": 0.0317, "step": 11210 }, { "epoch": 0.8714563106796116, "grad_norm": 0.13879077931546185, "learning_rate": 4.4553181657197574e-07, "loss": 0.0311, "step": 11220 }, { "epoch": 0.8722330097087378, "grad_norm": 0.0820025708012411, "learning_rate": 4.402473996302581e-07, "loss": 0.0311, "step": 11230 }, { "epoch": 0.8730097087378641, "grad_norm": 0.09728763101137297, "learning_rate": 4.3499306539139694e-07, "loss": 0.0295, "step": 11240 }, { "epoch": 0.8737864077669902, "grad_norm": 0.07520911813493888, "learning_rate": 4.29768848520557e-07, "loss": 0.0307, "step": 11250 }, { "epoch": 0.8745631067961165, "grad_norm": 0.11962658121332355, "learning_rate": 4.2457478348420457e-07, "loss": 0.028, "step": 11260 }, { "epoch": 0.8753398058252427, "grad_norm": 0.1288140536379559, "learning_rate": 4.1941090454988285e-07, "loss": 0.0301, "step": 11270 }, { "epoch": 0.876116504854369, "grad_norm": 0.08793048850266288, "learning_rate": 4.1427724578598393e-07, "loss": 0.0285, "step": 11280 }, { "epoch": 0.8768932038834951, "grad_norm": 0.102799172961193, "learning_rate": 4.0917384106152256e-07, "loss": 0.0309, "step": 11290 }, { "epoch": 0.8776699029126214, "grad_norm": 0.10207463275849381, "learning_rate": 4.0410072404591327e-07, "loss": 0.0276, "step": 11300 }, { "epoch": 0.8784466019417476, "grad_norm": 0.07513814559084296, "learning_rate": 3.990579282087537e-07, "loss": 0.031, "step": 11310 }, { "epoch": 0.8792233009708738, "grad_norm": 0.0968083056957315, "learning_rate": 3.9404548681959363e-07, "loss": 0.0351, "step": 11320 }, { "epoch": 0.88, "grad_norm": 0.07272135098698597, "learning_rate": 3.8906343294772666e-07, "loss": 0.0291, "step": 11330 }, { "epoch": 0.8807766990291263, "grad_norm": 0.09431594412165197, "learning_rate": 3.841117994619614e-07, "loss": 0.0298, "step": 11340 }, { "epoch": 0.8815533980582524, "grad_norm": 0.08371412260838028, "learning_rate": 3.791906190304134e-07, "loss": 0.03, "step": 11350 }, { "epoch": 0.8823300970873786, "grad_norm": 0.10677625055612429, "learning_rate": 3.742999241202844e-07, "loss": 0.0289, "step": 11360 }, { "epoch": 0.8831067961165049, "grad_norm": 0.10568020872200641, "learning_rate": 3.6943974699764905e-07, "loss": 0.0317, "step": 11370 }, { "epoch": 0.883883495145631, "grad_norm": 0.10469131310522271, "learning_rate": 3.646101197272445e-07, "loss": 0.0287, "step": 11380 }, { "epoch": 0.8846601941747573, "grad_norm": 0.1204146631924757, "learning_rate": 3.5981107417225603e-07, "loss": 0.0286, "step": 11390 }, { "epoch": 0.8854368932038835, "grad_norm": 0.11146548786723091, "learning_rate": 3.5504264199410776e-07, "loss": 0.0292, "step": 11400 }, { "epoch": 0.8862135922330097, "grad_norm": 0.13569808006813175, "learning_rate": 3.5030485465225416e-07, "loss": 0.0294, "step": 11410 }, { "epoch": 0.8869902912621359, "grad_norm": 0.14566770964736772, "learning_rate": 3.4559774340397033e-07, "loss": 0.0283, "step": 11420 }, { "epoch": 0.8877669902912622, "grad_norm": 0.09286710791159816, "learning_rate": 3.4092133930415105e-07, "loss": 0.0281, "step": 11430 }, { "epoch": 0.8885436893203883, "grad_norm": 0.1340479277125172, "learning_rate": 3.362756732051009e-07, "loss": 0.0309, "step": 11440 }, { "epoch": 0.8893203883495145, "grad_norm": 0.09764674557881928, "learning_rate": 3.3166077575633006e-07, "loss": 0.0291, "step": 11450 }, { "epoch": 0.8900970873786408, "grad_norm": 0.10684811655238748, "learning_rate": 3.2707667740435823e-07, "loss": 0.0307, "step": 11460 }, { "epoch": 0.890873786407767, "grad_norm": 0.09715054723664016, "learning_rate": 3.2252340839250774e-07, "loss": 0.0292, "step": 11470 }, { "epoch": 0.8916504854368932, "grad_norm": 0.10364785503503922, "learning_rate": 3.1800099876070577e-07, "loss": 0.0293, "step": 11480 }, { "epoch": 0.8924271844660194, "grad_norm": 0.06497137503305275, "learning_rate": 3.1350947834528633e-07, "loss": 0.0282, "step": 11490 }, { "epoch": 0.8932038834951457, "grad_norm": 0.12450854735880267, "learning_rate": 3.0904887677879526e-07, "loss": 0.0294, "step": 11500 }, { "epoch": 0.8939805825242718, "grad_norm": 0.13345479068588603, "learning_rate": 3.046192234897932e-07, "loss": 0.0282, "step": 11510 }, { "epoch": 0.8947572815533981, "grad_norm": 0.09549858050756564, "learning_rate": 3.002205477026593e-07, "loss": 0.0311, "step": 11520 }, { "epoch": 0.8955339805825243, "grad_norm": 0.09256424330358669, "learning_rate": 2.958528784374004e-07, "loss": 0.0289, "step": 11530 }, { "epoch": 0.8963106796116505, "grad_norm": 0.12270880721038029, "learning_rate": 2.9151624450946325e-07, "loss": 0.0285, "step": 11540 }, { "epoch": 0.8970873786407767, "grad_norm": 0.0825198070641202, "learning_rate": 2.872106745295361e-07, "loss": 0.0341, "step": 11550 }, { "epoch": 0.897864077669903, "grad_norm": 0.14332965913697687, "learning_rate": 2.829361969033678e-07, "loss": 0.0296, "step": 11560 }, { "epoch": 0.8986407766990291, "grad_norm": 0.0657477127178437, "learning_rate": 2.786928398315758e-07, "loss": 0.0294, "step": 11570 }, { "epoch": 0.8994174757281553, "grad_norm": 0.12890309415026033, "learning_rate": 2.7448063130946224e-07, "loss": 0.0281, "step": 11580 }, { "epoch": 0.9001941747572816, "grad_norm": 0.08205074282042245, "learning_rate": 2.7029959912682757e-07, "loss": 0.03, "step": 11590 }, { "epoch": 0.9009708737864077, "grad_norm": 0.08906913208405066, "learning_rate": 2.661497708677885e-07, "loss": 0.0292, "step": 11600 }, { "epoch": 0.901747572815534, "grad_norm": 0.09938122325000938, "learning_rate": 2.620311739105963e-07, "loss": 0.0282, "step": 11610 }, { "epoch": 0.9025242718446602, "grad_norm": 0.07691555784407579, "learning_rate": 2.5794383542745606e-07, "loss": 0.0298, "step": 11620 }, { "epoch": 0.9033009708737864, "grad_norm": 0.08473233500557172, "learning_rate": 2.5388778238434444e-07, "loss": 0.0293, "step": 11630 }, { "epoch": 0.9040776699029126, "grad_norm": 0.07698582574079837, "learning_rate": 2.4986304154083653e-07, "loss": 0.0293, "step": 11640 }, { "epoch": 0.9048543689320389, "grad_norm": 0.09903745504452748, "learning_rate": 2.4586963944992493e-07, "loss": 0.0279, "step": 11650 }, { "epoch": 0.9056310679611651, "grad_norm": 0.08466149024342672, "learning_rate": 2.4190760245784927e-07, "loss": 0.0304, "step": 11660 }, { "epoch": 0.9064077669902912, "grad_norm": 0.104929250465645, "learning_rate": 2.3797695670391862e-07, "loss": 0.0559, "step": 11670 }, { "epoch": 0.9071844660194175, "grad_norm": 0.09804239099626756, "learning_rate": 2.3407772812033935e-07, "loss": 0.032, "step": 11680 }, { "epoch": 0.9079611650485437, "grad_norm": 0.08566201210656381, "learning_rate": 2.3020994243204474e-07, "loss": 0.0337, "step": 11690 }, { "epoch": 0.9087378640776699, "grad_norm": 0.07540418016418762, "learning_rate": 2.2637362515652793e-07, "loss": 0.0299, "step": 11700 }, { "epoch": 0.9095145631067961, "grad_norm": 0.23406616045371462, "learning_rate": 2.22568801603667e-07, "loss": 0.0302, "step": 11710 }, { "epoch": 0.9102912621359224, "grad_norm": 0.10573542524156698, "learning_rate": 2.1879549687556678e-07, "loss": 0.0297, "step": 11720 }, { "epoch": 0.9110679611650485, "grad_norm": 0.11418549109335205, "learning_rate": 2.15053735866384e-07, "loss": 0.0285, "step": 11730 }, { "epoch": 0.9118446601941748, "grad_norm": 0.08743981293565542, "learning_rate": 2.1134354326217078e-07, "loss": 0.0296, "step": 11740 }, { "epoch": 0.912621359223301, "grad_norm": 0.09522874189816316, "learning_rate": 2.0766494354070688e-07, "loss": 0.0307, "step": 11750 }, { "epoch": 0.9133980582524271, "grad_norm": 0.12486176811285318, "learning_rate": 2.040179609713394e-07, "loss": 0.0276, "step": 11760 }, { "epoch": 0.9141747572815534, "grad_norm": 0.07594893058007789, "learning_rate": 2.0040261961482455e-07, "loss": 0.0279, "step": 11770 }, { "epoch": 0.9149514563106796, "grad_norm": 0.08881919943956164, "learning_rate": 1.9681894332316765e-07, "loss": 0.0288, "step": 11780 }, { "epoch": 0.9157281553398058, "grad_norm": 0.08658461759417058, "learning_rate": 1.9326695573946453e-07, "loss": 0.0291, "step": 11790 }, { "epoch": 0.916504854368932, "grad_norm": 0.08353999310283645, "learning_rate": 1.8974668029774546e-07, "loss": 0.03, "step": 11800 }, { "epoch": 0.9172815533980583, "grad_norm": 0.07989172874936395, "learning_rate": 1.8625814022282473e-07, "loss": 0.0304, "step": 11810 }, { "epoch": 0.9180582524271844, "grad_norm": 0.09354836150512864, "learning_rate": 1.8280135853014246e-07, "loss": 0.0333, "step": 11820 }, { "epoch": 0.9188349514563107, "grad_norm": 0.09442911865500384, "learning_rate": 1.793763580256147e-07, "loss": 0.0308, "step": 11830 }, { "epoch": 0.9196116504854369, "grad_norm": 0.12031067474804001, "learning_rate": 1.7598316130548298e-07, "loss": 0.0299, "step": 11840 }, { "epoch": 0.920388349514563, "grad_norm": 0.07392088876724823, "learning_rate": 1.7262179075616614e-07, "loss": 0.0281, "step": 11850 }, { "epoch": 0.9211650485436893, "grad_norm": 0.09696547244088893, "learning_rate": 1.692922685541115e-07, "loss": 0.0271, "step": 11860 }, { "epoch": 0.9219417475728156, "grad_norm": 0.06810230910233919, "learning_rate": 1.659946166656473e-07, "loss": 0.0287, "step": 11870 }, { "epoch": 0.9227184466019418, "grad_norm": 0.07012782879495114, "learning_rate": 1.6272885684684047e-07, "loss": 0.0284, "step": 11880 }, { "epoch": 0.9234951456310679, "grad_norm": 0.059604799476640796, "learning_rate": 1.5949501064335183e-07, "loss": 0.0261, "step": 11890 }, { "epoch": 0.9242718446601942, "grad_norm": 0.08087184987593803, "learning_rate": 1.5629309939029446e-07, "loss": 0.0285, "step": 11900 }, { "epoch": 0.9250485436893204, "grad_norm": 0.14146857127790838, "learning_rate": 1.5312314421209118e-07, "loss": 0.0283, "step": 11910 }, { "epoch": 0.9258252427184466, "grad_norm": 0.1070860024448902, "learning_rate": 1.4998516602233615e-07, "loss": 0.0318, "step": 11920 }, { "epoch": 0.9266019417475728, "grad_norm": 0.12972376721653983, "learning_rate": 1.4687918552366066e-07, "loss": 0.0285, "step": 11930 }, { "epoch": 0.9273786407766991, "grad_norm": 0.14882651313768905, "learning_rate": 1.4380522320758816e-07, "loss": 0.0271, "step": 11940 }, { "epoch": 0.9281553398058252, "grad_norm": 0.09650969811363871, "learning_rate": 1.407632993544078e-07, "loss": 0.0312, "step": 11950 }, { "epoch": 0.9289320388349515, "grad_norm": 0.07961961980398696, "learning_rate": 1.3775343403303443e-07, "loss": 0.0305, "step": 11960 }, { "epoch": 0.9297087378640777, "grad_norm": 0.08714721804347199, "learning_rate": 1.3477564710088097e-07, "loss": 0.0331, "step": 11970 }, { "epoch": 0.9304854368932038, "grad_norm": 0.07518300873784578, "learning_rate": 1.31829958203723e-07, "loss": 0.0298, "step": 11980 }, { "epoch": 0.9312621359223301, "grad_norm": 0.11139699112375682, "learning_rate": 1.2891638677557107e-07, "loss": 0.0295, "step": 11990 }, { "epoch": 0.9320388349514563, "grad_norm": 0.1070546352287505, "learning_rate": 1.2603495203854454e-07, "loss": 0.0279, "step": 12000 }, { "epoch": 0.9320388349514563, "eval_loss": 0.03036821074783802, "eval_runtime": 1430.4817, "eval_samples_per_second": 3.924, "eval_steps_per_second": 0.28, "step": 12000 }, { "epoch": 0.9328155339805825, "grad_norm": 0.10969492561406734, "learning_rate": 1.2318567300274199e-07, "loss": 0.0285, "step": 12010 }, { "epoch": 0.9335922330097087, "grad_norm": 0.14784017076301442, "learning_rate": 1.2036856846611488e-07, "loss": 0.0268, "step": 12020 }, { "epoch": 0.934368932038835, "grad_norm": 0.07722486597964677, "learning_rate": 1.1758365701434793e-07, "loss": 0.0287, "step": 12030 }, { "epoch": 0.9351456310679611, "grad_norm": 0.09097829328891131, "learning_rate": 1.1483095702073177e-07, "loss": 0.0306, "step": 12040 }, { "epoch": 0.9359223300970874, "grad_norm": 0.07795057035302067, "learning_rate": 1.1211048664604485e-07, "loss": 0.0304, "step": 12050 }, { "epoch": 0.9366990291262136, "grad_norm": 0.1126363738692364, "learning_rate": 1.0942226383843346e-07, "loss": 0.0303, "step": 12060 }, { "epoch": 0.9374757281553399, "grad_norm": 0.10111267632068063, "learning_rate": 1.0676630633329021e-07, "loss": 0.0293, "step": 12070 }, { "epoch": 0.938252427184466, "grad_norm": 0.07532263305268921, "learning_rate": 1.0414263165314076e-07, "loss": 0.0297, "step": 12080 }, { "epoch": 0.9390291262135922, "grad_norm": 0.12073767719326932, "learning_rate": 1.0155125710752666e-07, "loss": 0.0331, "step": 12090 }, { "epoch": 0.9398058252427185, "grad_norm": 0.08879856703803826, "learning_rate": 9.899219979289055e-08, "loss": 0.0289, "step": 12100 }, { "epoch": 0.9405825242718446, "grad_norm": 0.13163292022073875, "learning_rate": 9.64654765924633e-08, "loss": 0.0309, "step": 12110 }, { "epoch": 0.9413592233009709, "grad_norm": 0.06477722959490646, "learning_rate": 9.397110417615707e-08, "loss": 0.0295, "step": 12120 }, { "epoch": 0.9421359223300971, "grad_norm": 0.09436331754936761, "learning_rate": 9.150909900044691e-08, "loss": 0.0277, "step": 12130 }, { "epoch": 0.9429126213592233, "grad_norm": 0.10510956176513196, "learning_rate": 8.907947730826983e-08, "loss": 0.0325, "step": 12140 }, { "epoch": 0.9436893203883495, "grad_norm": 0.08704036903576372, "learning_rate": 8.668225512891315e-08, "loss": 0.0297, "step": 12150 }, { "epoch": 0.9444660194174758, "grad_norm": 0.11153859988245503, "learning_rate": 8.431744827791134e-08, "loss": 0.0287, "step": 12160 }, { "epoch": 0.9452427184466019, "grad_norm": 0.15210421902293708, "learning_rate": 8.198507235693987e-08, "loss": 0.0305, "step": 12170 }, { "epoch": 0.9460194174757282, "grad_norm": 0.09776427737486085, "learning_rate": 7.968514275371375e-08, "loss": 0.0284, "step": 12180 }, { "epoch": 0.9467961165048544, "grad_norm": 0.10862971671329026, "learning_rate": 7.741767464188421e-08, "loss": 0.0292, "step": 12190 }, { "epoch": 0.9475728155339805, "grad_norm": 0.11472970759846661, "learning_rate": 7.518268298094155e-08, "loss": 0.0289, "step": 12200 }, { "epoch": 0.9483495145631068, "grad_norm": 0.06881938642222563, "learning_rate": 7.298018251611305e-08, "loss": 0.0302, "step": 12210 }, { "epoch": 0.949126213592233, "grad_norm": 0.1111769164603233, "learning_rate": 7.081018777826631e-08, "loss": 0.03, "step": 12220 }, { "epoch": 0.9499029126213592, "grad_norm": 0.08147769545009345, "learning_rate": 6.867271308381662e-08, "loss": 0.0301, "step": 12230 }, { "epoch": 0.9506796116504854, "grad_norm": 0.0956385777922673, "learning_rate": 6.656777253462976e-08, "loss": 0.0297, "step": 12240 }, { "epoch": 0.9514563106796117, "grad_norm": 0.09647954149383654, "learning_rate": 6.449538001792766e-08, "loss": 0.0308, "step": 12250 }, { "epoch": 0.9522330097087378, "grad_norm": 0.07483582845264357, "learning_rate": 6.245554920619956e-08, "loss": 0.0299, "step": 12260 }, { "epoch": 0.9530097087378641, "grad_norm": 0.09281923478499814, "learning_rate": 6.044829355711046e-08, "loss": 0.0282, "step": 12270 }, { "epoch": 0.9537864077669903, "grad_norm": 0.09570533621861751, "learning_rate": 5.8473626313413915e-08, "loss": 0.0285, "step": 12280 }, { "epoch": 0.9545631067961166, "grad_norm": 0.12337902266642697, "learning_rate": 5.653156050286157e-08, "loss": 0.0283, "step": 12290 }, { "epoch": 0.9553398058252427, "grad_norm": 0.07936613484305918, "learning_rate": 5.4622108938120474e-08, "loss": 0.0287, "step": 12300 }, { "epoch": 0.9561165048543689, "grad_norm": 0.097059130226132, "learning_rate": 5.2745284216685346e-08, "loss": 0.0278, "step": 12310 }, { "epoch": 0.9568932038834952, "grad_norm": 0.13632125780969306, "learning_rate": 5.090109872079918e-08, "loss": 0.0295, "step": 12320 }, { "epoch": 0.9576699029126213, "grad_norm": 0.1667463718720281, "learning_rate": 4.9089564617367803e-08, "loss": 0.0299, "step": 12330 }, { "epoch": 0.9584466019417476, "grad_norm": 0.08355517235619568, "learning_rate": 4.7310693857882676e-08, "loss": 0.0295, "step": 12340 }, { "epoch": 0.9592233009708738, "grad_norm": 0.1438661337430918, "learning_rate": 4.556449817834041e-08, "loss": 0.0305, "step": 12350 }, { "epoch": 0.96, "grad_norm": 0.11870450603668366, "learning_rate": 4.385098909916563e-08, "loss": 0.0303, "step": 12360 }, { "epoch": 0.9607766990291262, "grad_norm": 0.10465458448563018, "learning_rate": 4.217017792513489e-08, "loss": 0.0471, "step": 12370 }, { "epoch": 0.9615533980582525, "grad_norm": 0.09387100948251206, "learning_rate": 4.052207574530176e-08, "loss": 0.0278, "step": 12380 }, { "epoch": 0.9623300970873786, "grad_norm": 0.08957919562547044, "learning_rate": 3.890669343292464e-08, "loss": 0.0294, "step": 12390 }, { "epoch": 0.9631067961165048, "grad_norm": 0.10460219954590727, "learning_rate": 3.732404164539516e-08, "loss": 0.0275, "step": 12400 }, { "epoch": 0.9638834951456311, "grad_norm": 0.13719486585682125, "learning_rate": 3.577413082416603e-08, "loss": 0.0281, "step": 12410 }, { "epoch": 0.9646601941747572, "grad_norm": 0.08110335283385899, "learning_rate": 3.4256971194683854e-08, "loss": 0.0294, "step": 12420 }, { "epoch": 0.9654368932038835, "grad_norm": 0.0996482841542113, "learning_rate": 3.2772572766321395e-08, "loss": 0.0303, "step": 12430 }, { "epoch": 0.9662135922330097, "grad_norm": 0.07152177265863328, "learning_rate": 3.1320945332310446e-08, "loss": 0.028, "step": 12440 }, { "epoch": 0.9669902912621359, "grad_norm": 0.10648413552545512, "learning_rate": 2.990209846967962e-08, "loss": 0.0286, "step": 12450 }, { "epoch": 0.9677669902912621, "grad_norm": 0.12498902366865768, "learning_rate": 2.8516041539187745e-08, "loss": 0.0313, "step": 12460 }, { "epoch": 0.9685436893203884, "grad_norm": 0.09995101394091593, "learning_rate": 2.7162783685265593e-08, "loss": 0.029, "step": 12470 }, { "epoch": 0.9693203883495146, "grad_norm": 0.11509555424447614, "learning_rate": 2.5842333835952583e-08, "loss": 0.0303, "step": 12480 }, { "epoch": 0.9700970873786408, "grad_norm": 0.07164554061226856, "learning_rate": 2.4554700702840716e-08, "loss": 0.0293, "step": 12490 }, { "epoch": 0.970873786407767, "grad_norm": 0.06173970450812764, "learning_rate": 2.3299892781014077e-08, "loss": 0.0289, "step": 12500 }, { "epoch": 0.9716504854368933, "grad_norm": 0.09487327385267921, "learning_rate": 2.2077918348994976e-08, "loss": 0.0292, "step": 12510 }, { "epoch": 0.9724271844660194, "grad_norm": 0.10154752859732497, "learning_rate": 2.0888785468689e-08, "loss": 0.028, "step": 12520 }, { "epoch": 0.9732038834951456, "grad_norm": 0.07374892410223644, "learning_rate": 1.9732501985331165e-08, "loss": 0.0288, "step": 12530 }, { "epoch": 0.9739805825242719, "grad_norm": 0.08438561462038181, "learning_rate": 1.860907552743485e-08, "loss": 0.028, "step": 12540 }, { "epoch": 0.974757281553398, "grad_norm": 0.11898206968961494, "learning_rate": 1.751851350674072e-08, "loss": 0.03, "step": 12550 }, { "epoch": 0.9755339805825243, "grad_norm": 0.13628942179067444, "learning_rate": 1.646082311816788e-08, "loss": 0.0318, "step": 12560 }, { "epoch": 0.9763106796116505, "grad_norm": 0.09172626971214422, "learning_rate": 1.54360113397678e-08, "loss": 0.0294, "step": 12570 }, { "epoch": 0.9770873786407767, "grad_norm": 0.09154911832007584, "learning_rate": 1.4444084932676017e-08, "loss": 0.0445, "step": 12580 }, { "epoch": 0.9778640776699029, "grad_norm": 0.16979378997585964, "learning_rate": 1.3485050441069958e-08, "loss": 0.03, "step": 12590 }, { "epoch": 0.9786407766990292, "grad_norm": 0.12072275853218109, "learning_rate": 1.2558914192123961e-08, "loss": 0.0282, "step": 12600 }, { "epoch": 0.9794174757281553, "grad_norm": 0.10891322946689018, "learning_rate": 1.1665682295967651e-08, "loss": 0.0297, "step": 12610 }, { "epoch": 0.9801941747572815, "grad_norm": 0.08636576473926191, "learning_rate": 1.0805360645647078e-08, "loss": 0.03, "step": 12620 }, { "epoch": 0.9809708737864078, "grad_norm": 0.14396441335457474, "learning_rate": 9.977954917084199e-09, "loss": 0.03, "step": 12630 }, { "epoch": 0.9817475728155339, "grad_norm": 0.09383677788839022, "learning_rate": 9.18347056904023e-09, "loss": 0.0298, "step": 12640 }, { "epoch": 0.9825242718446602, "grad_norm": 0.1039308622644538, "learning_rate": 8.421912843079028e-09, "loss": 0.0332, "step": 12650 }, { "epoch": 0.9833009708737864, "grad_norm": 0.11786880457900845, "learning_rate": 7.693286763533758e-09, "loss": 0.0279, "step": 12660 }, { "epoch": 0.9840776699029126, "grad_norm": 0.08041898731917498, "learning_rate": 6.997597137473056e-09, "loss": 0.0295, "step": 12670 }, { "epoch": 0.9848543689320388, "grad_norm": 0.13951701427045193, "learning_rate": 6.334848554668261e-09, "loss": 0.0288, "step": 12680 }, { "epoch": 0.9856310679611651, "grad_norm": 0.10540427525950581, "learning_rate": 5.705045387564556e-09, "loss": 0.0328, "step": 12690 }, { "epoch": 0.9864077669902913, "grad_norm": 0.0604459322864256, "learning_rate": 5.108191791252659e-09, "loss": 0.0286, "step": 12700 }, { "epoch": 0.9871844660194175, "grad_norm": 0.1258188171674739, "learning_rate": 4.544291703438841e-09, "loss": 0.0309, "step": 12710 }, { "epoch": 0.9879611650485437, "grad_norm": 0.08910976175339864, "learning_rate": 4.013348844420506e-09, "loss": 0.0277, "step": 12720 }, { "epoch": 0.98873786407767, "grad_norm": 0.06505141984947507, "learning_rate": 3.5153667170634285e-09, "loss": 0.0289, "step": 12730 }, { "epoch": 0.9895145631067961, "grad_norm": 0.08190867482823701, "learning_rate": 3.050348606775666e-09, "loss": 0.0319, "step": 12740 }, { "epoch": 0.9902912621359223, "grad_norm": 0.09236924002879458, "learning_rate": 2.618297581486462e-09, "loss": 0.0308, "step": 12750 }, { "epoch": 0.9910679611650486, "grad_norm": 0.16895230246140144, "learning_rate": 2.219216491627929e-09, "loss": 0.032, "step": 12760 }, { "epoch": 0.9918446601941747, "grad_norm": 0.09304115656990171, "learning_rate": 1.8531079701139543e-09, "loss": 0.0302, "step": 12770 }, { "epoch": 0.992621359223301, "grad_norm": 0.11332737585550963, "learning_rate": 1.5199744323246556e-09, "loss": 0.0294, "step": 12780 }, { "epoch": 0.9933980582524272, "grad_norm": 0.0827305488553855, "learning_rate": 1.2198180760891742e-09, "loss": 0.0314, "step": 12790 }, { "epoch": 0.9941747572815534, "grad_norm": 0.09276853823464741, "learning_rate": 9.526408816712406e-10, "loss": 0.0268, "step": 12800 }, { "epoch": 0.9949514563106796, "grad_norm": 0.06953674051399332, "learning_rate": 7.184446117569633e-10, "loss": 0.029, "step": 12810 }, { "epoch": 0.9957281553398059, "grad_norm": 0.06676367140447365, "learning_rate": 5.172308114431701e-10, "loss": 0.0283, "step": 12820 }, { "epoch": 0.996504854368932, "grad_norm": 0.09449987776288662, "learning_rate": 3.490008082251972e-10, "loss": 0.0301, "step": 12830 }, { "epoch": 0.9972815533980582, "grad_norm": 0.09009932048476241, "learning_rate": 2.1375571199189205e-10, "loss": 0.029, "step": 12840 }, { "epoch": 0.9980582524271845, "grad_norm": 0.08021345692212206, "learning_rate": 1.1149641501395637e-10, "loss": 0.036, "step": 12850 }, { "epoch": 0.9988349514563106, "grad_norm": 0.08697559179872223, "learning_rate": 4.222359194172576e-11, "loss": 0.0293, "step": 12860 }, { "epoch": 0.9996116504854369, "grad_norm": 0.07702027556641607, "learning_rate": 5.937699798508334e-12, "loss": 0.0285, "step": 12870 }, { "epoch": 1.0, "step": 12875, "total_flos": 9727751749632000.0, "train_loss": 0.06894543814774856, "train_runtime": 212159.5801, "train_samples_per_second": 0.85, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 12875, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9727751749632000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }