{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 243750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006153846153846154, "grad_norm": 15.441011428833008, "learning_rate": 4.989764102564103e-05, "loss": 1.5402, "step": 500 }, { "epoch": 0.012307692307692308, "grad_norm": 9.19687557220459, "learning_rate": 4.979507692307693e-05, "loss": 1.2127, "step": 1000 }, { "epoch": 0.018461538461538463, "grad_norm": 11.655864715576172, "learning_rate": 4.969251282051282e-05, "loss": 1.1338, "step": 1500 }, { "epoch": 0.024615384615384615, "grad_norm": 20.931015014648438, "learning_rate": 4.958994871794872e-05, "loss": 1.0639, "step": 2000 }, { "epoch": 0.03076923076923077, "grad_norm": 12.394627571105957, "learning_rate": 4.948738461538462e-05, "loss": 1.0317, "step": 2500 }, { "epoch": 0.036923076923076927, "grad_norm": 12.100915908813477, "learning_rate": 4.9384820512820516e-05, "loss": 1.0283, "step": 3000 }, { "epoch": 0.043076923076923075, "grad_norm": 14.47819995880127, "learning_rate": 4.928225641025641e-05, "loss": 1.0115, "step": 3500 }, { "epoch": 0.04923076923076923, "grad_norm": 15.608175277709961, "learning_rate": 4.917969230769231e-05, "loss": 1.0152, "step": 4000 }, { "epoch": 0.055384615384615386, "grad_norm": 40.89103317260742, "learning_rate": 4.9077128205128206e-05, "loss": 0.9857, "step": 4500 }, { "epoch": 0.06153846153846154, "grad_norm": 5.455167293548584, "learning_rate": 4.89745641025641e-05, "loss": 1.0276, "step": 5000 }, { "epoch": 0.06769230769230769, "grad_norm": 25.55486297607422, "learning_rate": 4.8872e-05, "loss": 0.9773, "step": 5500 }, { "epoch": 0.07384615384615385, "grad_norm": 16.84791374206543, "learning_rate": 4.87694358974359e-05, "loss": 0.9584, "step": 6000 }, { "epoch": 0.08, "grad_norm": 16.22901725769043, "learning_rate": 4.86668717948718e-05, "loss": 1.0158, "step": 6500 }, { "epoch": 0.08615384615384615, "grad_norm": 22.361555099487305, "learning_rate": 4.856430769230769e-05, "loss": 0.9774, "step": 7000 }, { "epoch": 0.09230769230769231, "grad_norm": 11.770121574401855, "learning_rate": 4.8461743589743594e-05, "loss": 0.9599, "step": 7500 }, { "epoch": 0.09846153846153846, "grad_norm": 17.554147720336914, "learning_rate": 4.835917948717949e-05, "loss": 0.9438, "step": 8000 }, { "epoch": 0.10461538461538461, "grad_norm": 11.665477752685547, "learning_rate": 4.825661538461539e-05, "loss": 0.9608, "step": 8500 }, { "epoch": 0.11076923076923077, "grad_norm": 17.011043548583984, "learning_rate": 4.8154051282051285e-05, "loss": 0.9836, "step": 9000 }, { "epoch": 0.11692307692307692, "grad_norm": 12.644675254821777, "learning_rate": 4.805148717948718e-05, "loss": 0.9505, "step": 9500 }, { "epoch": 0.12307692307692308, "grad_norm": 16.7060489654541, "learning_rate": 4.794892307692308e-05, "loss": 0.9767, "step": 10000 }, { "epoch": 0.12923076923076923, "grad_norm": 15.813322067260742, "learning_rate": 4.784635897435898e-05, "loss": 0.947, "step": 10500 }, { "epoch": 0.13538461538461538, "grad_norm": 7.84370231628418, "learning_rate": 4.774379487179487e-05, "loss": 0.9553, "step": 11000 }, { "epoch": 0.14153846153846153, "grad_norm": 7.926013469696045, "learning_rate": 4.764123076923077e-05, "loss": 0.9518, "step": 11500 }, { "epoch": 0.1476923076923077, "grad_norm": 12.878284454345703, "learning_rate": 4.753866666666667e-05, "loss": 0.9307, "step": 12000 }, { "epoch": 0.15384615384615385, "grad_norm": 14.422932624816895, "learning_rate": 4.743610256410257e-05, "loss": 0.9562, "step": 12500 }, { "epoch": 0.16, "grad_norm": 8.4630708694458, "learning_rate": 4.733353846153846e-05, "loss": 0.931, "step": 13000 }, { "epoch": 0.16615384615384615, "grad_norm": 18.567655563354492, "learning_rate": 4.723097435897436e-05, "loss": 0.9471, "step": 13500 }, { "epoch": 0.1723076923076923, "grad_norm": 13.360151290893555, "learning_rate": 4.712841025641026e-05, "loss": 0.9538, "step": 14000 }, { "epoch": 0.17846153846153845, "grad_norm": 8.191686630249023, "learning_rate": 4.702584615384615e-05, "loss": 0.9378, "step": 14500 }, { "epoch": 0.18461538461538463, "grad_norm": 16.853740692138672, "learning_rate": 4.6923282051282054e-05, "loss": 0.9602, "step": 15000 }, { "epoch": 0.19076923076923077, "grad_norm": 13.758535385131836, "learning_rate": 4.682071794871795e-05, "loss": 0.9451, "step": 15500 }, { "epoch": 0.19692307692307692, "grad_norm": 11.444204330444336, "learning_rate": 4.671815384615385e-05, "loss": 0.919, "step": 16000 }, { "epoch": 0.20307692307692307, "grad_norm": 8.91698169708252, "learning_rate": 4.6615589743589744e-05, "loss": 0.9341, "step": 16500 }, { "epoch": 0.20923076923076922, "grad_norm": 47.140968322753906, "learning_rate": 4.6513025641025646e-05, "loss": 0.9455, "step": 17000 }, { "epoch": 0.2153846153846154, "grad_norm": 9.847350120544434, "learning_rate": 4.641046153846154e-05, "loss": 0.9336, "step": 17500 }, { "epoch": 0.22153846153846155, "grad_norm": 7.988911151885986, "learning_rate": 4.630789743589744e-05, "loss": 0.9189, "step": 18000 }, { "epoch": 0.2276923076923077, "grad_norm": 12.728645324707031, "learning_rate": 4.6205333333333336e-05, "loss": 0.9206, "step": 18500 }, { "epoch": 0.23384615384615384, "grad_norm": 16.50530433654785, "learning_rate": 4.610276923076923e-05, "loss": 0.9325, "step": 19000 }, { "epoch": 0.24, "grad_norm": 12.972574234008789, "learning_rate": 4.600020512820513e-05, "loss": 0.8916, "step": 19500 }, { "epoch": 0.24615384615384617, "grad_norm": 12.662789344787598, "learning_rate": 4.5897641025641034e-05, "loss": 0.9135, "step": 20000 }, { "epoch": 0.2523076923076923, "grad_norm": 6.047543525695801, "learning_rate": 4.579507692307692e-05, "loss": 0.9244, "step": 20500 }, { "epoch": 0.25846153846153846, "grad_norm": 16.157981872558594, "learning_rate": 4.569251282051282e-05, "loss": 0.9131, "step": 21000 }, { "epoch": 0.26461538461538464, "grad_norm": 5.811090469360352, "learning_rate": 4.5589948717948724e-05, "loss": 0.9179, "step": 21500 }, { "epoch": 0.27076923076923076, "grad_norm": 7.486561298370361, "learning_rate": 4.548738461538461e-05, "loss": 0.9209, "step": 22000 }, { "epoch": 0.27692307692307694, "grad_norm": 9.555835723876953, "learning_rate": 4.5384820512820513e-05, "loss": 0.9323, "step": 22500 }, { "epoch": 0.28307692307692306, "grad_norm": 7.111580848693848, "learning_rate": 4.5282256410256415e-05, "loss": 0.9645, "step": 23000 }, { "epoch": 0.28923076923076924, "grad_norm": 87.1783218383789, "learning_rate": 4.517969230769231e-05, "loss": 0.9342, "step": 23500 }, { "epoch": 0.2953846153846154, "grad_norm": 13.641851425170898, "learning_rate": 4.5077128205128204e-05, "loss": 0.9286, "step": 24000 }, { "epoch": 0.30153846153846153, "grad_norm": 13.563926696777344, "learning_rate": 4.4974564102564105e-05, "loss": 0.9342, "step": 24500 }, { "epoch": 0.3076923076923077, "grad_norm": 16.500226974487305, "learning_rate": 4.4872e-05, "loss": 0.9239, "step": 25000 }, { "epoch": 0.31384615384615383, "grad_norm": 11.090267181396484, "learning_rate": 4.47694358974359e-05, "loss": 0.9261, "step": 25500 }, { "epoch": 0.32, "grad_norm": 9.335100173950195, "learning_rate": 4.4666871794871796e-05, "loss": 0.9099, "step": 26000 }, { "epoch": 0.3261538461538461, "grad_norm": 13.137558937072754, "learning_rate": 4.45643076923077e-05, "loss": 0.9318, "step": 26500 }, { "epoch": 0.3323076923076923, "grad_norm": 8.562922477722168, "learning_rate": 4.446174358974359e-05, "loss": 0.9289, "step": 27000 }, { "epoch": 0.3384615384615385, "grad_norm": 9.399090766906738, "learning_rate": 4.435917948717949e-05, "loss": 0.9225, "step": 27500 }, { "epoch": 0.3446153846153846, "grad_norm": 12.728873252868652, "learning_rate": 4.425661538461539e-05, "loss": 0.9334, "step": 28000 }, { "epoch": 0.3507692307692308, "grad_norm": 14.883627891540527, "learning_rate": 4.415405128205128e-05, "loss": 0.9224, "step": 28500 }, { "epoch": 0.3569230769230769, "grad_norm": 7.362912654876709, "learning_rate": 4.4051487179487184e-05, "loss": 0.9306, "step": 29000 }, { "epoch": 0.3630769230769231, "grad_norm": 7.706457614898682, "learning_rate": 4.394892307692308e-05, "loss": 0.9071, "step": 29500 }, { "epoch": 0.36923076923076925, "grad_norm": 10.347256660461426, "learning_rate": 4.384635897435897e-05, "loss": 0.9291, "step": 30000 }, { "epoch": 0.37538461538461537, "grad_norm": 8.69521713256836, "learning_rate": 4.3743794871794874e-05, "loss": 0.9067, "step": 30500 }, { "epoch": 0.38153846153846155, "grad_norm": 8.357994079589844, "learning_rate": 4.3641230769230776e-05, "loss": 0.9012, "step": 31000 }, { "epoch": 0.38769230769230767, "grad_norm": 14.694459915161133, "learning_rate": 4.3538666666666664e-05, "loss": 0.9056, "step": 31500 }, { "epoch": 0.39384615384615385, "grad_norm": 12.09356689453125, "learning_rate": 4.3436102564102565e-05, "loss": 0.945, "step": 32000 }, { "epoch": 0.4, "grad_norm": 17.006885528564453, "learning_rate": 4.3333538461538466e-05, "loss": 0.9466, "step": 32500 }, { "epoch": 0.40615384615384614, "grad_norm": 10.885141372680664, "learning_rate": 4.323097435897436e-05, "loss": 0.8933, "step": 33000 }, { "epoch": 0.4123076923076923, "grad_norm": 17.807836532592773, "learning_rate": 4.3128410256410256e-05, "loss": 0.9188, "step": 33500 }, { "epoch": 0.41846153846153844, "grad_norm": 15.311736106872559, "learning_rate": 4.302584615384616e-05, "loss": 0.8805, "step": 34000 }, { "epoch": 0.4246153846153846, "grad_norm": 7.447861671447754, "learning_rate": 4.292328205128205e-05, "loss": 0.9043, "step": 34500 }, { "epoch": 0.4307692307692308, "grad_norm": 21.085975646972656, "learning_rate": 4.282071794871795e-05, "loss": 0.8851, "step": 35000 }, { "epoch": 0.4369230769230769, "grad_norm": 19.813968658447266, "learning_rate": 4.271815384615385e-05, "loss": 0.8974, "step": 35500 }, { "epoch": 0.4430769230769231, "grad_norm": 9.81804084777832, "learning_rate": 4.261558974358975e-05, "loss": 0.8886, "step": 36000 }, { "epoch": 0.4492307692307692, "grad_norm": 17.820144653320312, "learning_rate": 4.2513025641025643e-05, "loss": 0.879, "step": 36500 }, { "epoch": 0.4553846153846154, "grad_norm": 10.280338287353516, "learning_rate": 4.241046153846154e-05, "loss": 0.9004, "step": 37000 }, { "epoch": 0.46153846153846156, "grad_norm": 12.174972534179688, "learning_rate": 4.230789743589744e-05, "loss": 0.8885, "step": 37500 }, { "epoch": 0.4676923076923077, "grad_norm": 13.532380104064941, "learning_rate": 4.2205333333333334e-05, "loss": 0.893, "step": 38000 }, { "epoch": 0.47384615384615386, "grad_norm": 13.15241527557373, "learning_rate": 4.2102769230769235e-05, "loss": 0.8866, "step": 38500 }, { "epoch": 0.48, "grad_norm": 11.801546096801758, "learning_rate": 4.200020512820513e-05, "loss": 0.8872, "step": 39000 }, { "epoch": 0.48615384615384616, "grad_norm": 7.681485652923584, "learning_rate": 4.1897641025641025e-05, "loss": 0.9096, "step": 39500 }, { "epoch": 0.49230769230769234, "grad_norm": 11.140621185302734, "learning_rate": 4.1795076923076926e-05, "loss": 0.8976, "step": 40000 }, { "epoch": 0.49846153846153846, "grad_norm": 10.523269653320312, "learning_rate": 4.169251282051283e-05, "loss": 0.9242, "step": 40500 }, { "epoch": 0.5046153846153846, "grad_norm": 26.289159774780273, "learning_rate": 4.1589948717948715e-05, "loss": 0.9132, "step": 41000 }, { "epoch": 0.5107692307692308, "grad_norm": 7.7466511726379395, "learning_rate": 4.1487384615384617e-05, "loss": 0.8945, "step": 41500 }, { "epoch": 0.5169230769230769, "grad_norm": 11.930273056030273, "learning_rate": 4.138482051282052e-05, "loss": 0.8633, "step": 42000 }, { "epoch": 0.5230769230769231, "grad_norm": 10.300634384155273, "learning_rate": 4.128225641025641e-05, "loss": 0.8913, "step": 42500 }, { "epoch": 0.5292307692307693, "grad_norm": 11.300227165222168, "learning_rate": 4.117969230769231e-05, "loss": 0.887, "step": 43000 }, { "epoch": 0.5353846153846153, "grad_norm": 5.9636993408203125, "learning_rate": 4.107712820512821e-05, "loss": 0.8979, "step": 43500 }, { "epoch": 0.5415384615384615, "grad_norm": 10.92505168914795, "learning_rate": 4.09745641025641e-05, "loss": 0.8941, "step": 44000 }, { "epoch": 0.5476923076923077, "grad_norm": 8.724580764770508, "learning_rate": 4.0872000000000004e-05, "loss": 0.9351, "step": 44500 }, { "epoch": 0.5538461538461539, "grad_norm": 43.44682312011719, "learning_rate": 4.07694358974359e-05, "loss": 0.8973, "step": 45000 }, { "epoch": 0.56, "grad_norm": 5.966015338897705, "learning_rate": 4.06668717948718e-05, "loss": 0.9027, "step": 45500 }, { "epoch": 0.5661538461538461, "grad_norm": 15.022377014160156, "learning_rate": 4.0564307692307695e-05, "loss": 0.899, "step": 46000 }, { "epoch": 0.5723076923076923, "grad_norm": 17.20585823059082, "learning_rate": 4.046174358974359e-05, "loss": 0.8816, "step": 46500 }, { "epoch": 0.5784615384615385, "grad_norm": 15.906556129455566, "learning_rate": 4.035917948717949e-05, "loss": 0.9028, "step": 47000 }, { "epoch": 0.5846153846153846, "grad_norm": 9.913738250732422, "learning_rate": 4.0256615384615386e-05, "loss": 0.8919, "step": 47500 }, { "epoch": 0.5907692307692308, "grad_norm": 8.156500816345215, "learning_rate": 4.015405128205129e-05, "loss": 0.8961, "step": 48000 }, { "epoch": 0.5969230769230769, "grad_norm": 5.659274578094482, "learning_rate": 4.005148717948718e-05, "loss": 0.8822, "step": 48500 }, { "epoch": 0.6030769230769231, "grad_norm": 14.798992156982422, "learning_rate": 3.9948923076923076e-05, "loss": 0.8831, "step": 49000 }, { "epoch": 0.6092307692307692, "grad_norm": 8.390789031982422, "learning_rate": 3.984635897435898e-05, "loss": 0.8873, "step": 49500 }, { "epoch": 0.6153846153846154, "grad_norm": 16.352163314819336, "learning_rate": 3.974379487179488e-05, "loss": 0.8772, "step": 50000 }, { "epoch": 0.6215384615384615, "grad_norm": 13.056022644042969, "learning_rate": 3.964123076923077e-05, "loss": 0.8895, "step": 50500 }, { "epoch": 0.6276923076923077, "grad_norm": 9.904449462890625, "learning_rate": 3.953866666666667e-05, "loss": 0.8684, "step": 51000 }, { "epoch": 0.6338461538461538, "grad_norm": 6.988232135772705, "learning_rate": 3.943610256410257e-05, "loss": 0.8763, "step": 51500 }, { "epoch": 0.64, "grad_norm": 7.327888488769531, "learning_rate": 3.9333538461538464e-05, "loss": 0.8991, "step": 52000 }, { "epoch": 0.6461538461538462, "grad_norm": 13.140373229980469, "learning_rate": 3.923097435897436e-05, "loss": 0.8447, "step": 52500 }, { "epoch": 0.6523076923076923, "grad_norm": 8.636076927185059, "learning_rate": 3.912841025641026e-05, "loss": 0.8935, "step": 53000 }, { "epoch": 0.6584615384615384, "grad_norm": 7.659938335418701, "learning_rate": 3.9025846153846155e-05, "loss": 0.8545, "step": 53500 }, { "epoch": 0.6646153846153846, "grad_norm": 16.80904197692871, "learning_rate": 3.892328205128205e-05, "loss": 0.8742, "step": 54000 }, { "epoch": 0.6707692307692308, "grad_norm": 46.774444580078125, "learning_rate": 3.882071794871795e-05, "loss": 0.9116, "step": 54500 }, { "epoch": 0.676923076923077, "grad_norm": 13.705849647521973, "learning_rate": 3.8718153846153845e-05, "loss": 0.8751, "step": 55000 }, { "epoch": 0.683076923076923, "grad_norm": 10.969395637512207, "learning_rate": 3.8615589743589746e-05, "loss": 0.8746, "step": 55500 }, { "epoch": 0.6892307692307692, "grad_norm": 8.494479179382324, "learning_rate": 3.851302564102564e-05, "loss": 0.8965, "step": 56000 }, { "epoch": 0.6953846153846154, "grad_norm": 8.132564544677734, "learning_rate": 3.841046153846154e-05, "loss": 0.8956, "step": 56500 }, { "epoch": 0.7015384615384616, "grad_norm": 9.6904935836792, "learning_rate": 3.830789743589744e-05, "loss": 0.8873, "step": 57000 }, { "epoch": 0.7076923076923077, "grad_norm": 13.937239646911621, "learning_rate": 3.820533333333334e-05, "loss": 0.8742, "step": 57500 }, { "epoch": 0.7138461538461538, "grad_norm": 14.830101013183594, "learning_rate": 3.810276923076923e-05, "loss": 0.8829, "step": 58000 }, { "epoch": 0.72, "grad_norm": 19.47818946838379, "learning_rate": 3.800020512820513e-05, "loss": 0.8977, "step": 58500 }, { "epoch": 0.7261538461538461, "grad_norm": 7.47867488861084, "learning_rate": 3.789764102564103e-05, "loss": 0.8792, "step": 59000 }, { "epoch": 0.7323076923076923, "grad_norm": 21.68954849243164, "learning_rate": 3.779507692307693e-05, "loss": 0.8681, "step": 59500 }, { "epoch": 0.7384615384615385, "grad_norm": 10.632262229919434, "learning_rate": 3.769251282051282e-05, "loss": 0.9502, "step": 60000 }, { "epoch": 0.7446153846153846, "grad_norm": 6.99915885925293, "learning_rate": 3.758994871794872e-05, "loss": 0.9691, "step": 60500 }, { "epoch": 0.7507692307692307, "grad_norm": 8.757305145263672, "learning_rate": 3.748738461538462e-05, "loss": 0.9446, "step": 61000 }, { "epoch": 0.7569230769230769, "grad_norm": 60.0024299621582, "learning_rate": 3.738482051282051e-05, "loss": 0.8887, "step": 61500 }, { "epoch": 0.7630769230769231, "grad_norm": 9.167777061462402, "learning_rate": 3.728225641025641e-05, "loss": 0.8668, "step": 62000 }, { "epoch": 0.7692307692307693, "grad_norm": 14.931575775146484, "learning_rate": 3.717969230769231e-05, "loss": 0.8832, "step": 62500 }, { "epoch": 0.7753846153846153, "grad_norm": 9.063596725463867, "learning_rate": 3.7077128205128206e-05, "loss": 0.8853, "step": 63000 }, { "epoch": 0.7815384615384615, "grad_norm": 8.906396865844727, "learning_rate": 3.69745641025641e-05, "loss": 0.8824, "step": 63500 }, { "epoch": 0.7876923076923077, "grad_norm": 12.211151123046875, "learning_rate": 3.6872e-05, "loss": 0.8747, "step": 64000 }, { "epoch": 0.7938461538461539, "grad_norm": 14.395017623901367, "learning_rate": 3.67694358974359e-05, "loss": 0.8858, "step": 64500 }, { "epoch": 0.8, "grad_norm": 6.887247085571289, "learning_rate": 3.66668717948718e-05, "loss": 0.8726, "step": 65000 }, { "epoch": 0.8061538461538461, "grad_norm": 9.119629859924316, "learning_rate": 3.656430769230769e-05, "loss": 0.901, "step": 65500 }, { "epoch": 0.8123076923076923, "grad_norm": 12.743978500366211, "learning_rate": 3.6461743589743594e-05, "loss": 0.8833, "step": 66000 }, { "epoch": 0.8184615384615385, "grad_norm": 22.06646156311035, "learning_rate": 3.635917948717949e-05, "loss": 0.9003, "step": 66500 }, { "epoch": 0.8246153846153846, "grad_norm": 10.94338607788086, "learning_rate": 3.625661538461539e-05, "loss": 0.8665, "step": 67000 }, { "epoch": 0.8307692307692308, "grad_norm": 6.3582892417907715, "learning_rate": 3.6154051282051285e-05, "loss": 0.875, "step": 67500 }, { "epoch": 0.8369230769230769, "grad_norm": 9.930424690246582, "learning_rate": 3.605148717948718e-05, "loss": 0.8544, "step": 68000 }, { "epoch": 0.8430769230769231, "grad_norm": 5.821447372436523, "learning_rate": 3.594892307692308e-05, "loss": 0.89, "step": 68500 }, { "epoch": 0.8492307692307692, "grad_norm": 112.40758514404297, "learning_rate": 3.5846358974358975e-05, "loss": 0.8703, "step": 69000 }, { "epoch": 0.8553846153846154, "grad_norm": 31.642396926879883, "learning_rate": 3.574379487179487e-05, "loss": 0.8625, "step": 69500 }, { "epoch": 0.8615384615384616, "grad_norm": 9.700116157531738, "learning_rate": 3.564123076923077e-05, "loss": 0.8722, "step": 70000 }, { "epoch": 0.8676923076923077, "grad_norm": 5.7299299240112305, "learning_rate": 3.553866666666667e-05, "loss": 0.8499, "step": 70500 }, { "epoch": 0.8738461538461538, "grad_norm": 8.070367813110352, "learning_rate": 3.543610256410256e-05, "loss": 0.8658, "step": 71000 }, { "epoch": 0.88, "grad_norm": 6.736323833465576, "learning_rate": 3.533353846153846e-05, "loss": 0.8882, "step": 71500 }, { "epoch": 0.8861538461538462, "grad_norm": 13.512341499328613, "learning_rate": 3.523097435897436e-05, "loss": 0.8704, "step": 72000 }, { "epoch": 0.8923076923076924, "grad_norm": 13.418638229370117, "learning_rate": 3.512841025641026e-05, "loss": 0.8669, "step": 72500 }, { "epoch": 0.8984615384615384, "grad_norm": 11.72990894317627, "learning_rate": 3.502584615384615e-05, "loss": 0.8479, "step": 73000 }, { "epoch": 0.9046153846153846, "grad_norm": 6.2977166175842285, "learning_rate": 3.4923282051282054e-05, "loss": 0.8371, "step": 73500 }, { "epoch": 0.9107692307692308, "grad_norm": 12.015417098999023, "learning_rate": 3.482071794871795e-05, "loss": 0.9644, "step": 74000 }, { "epoch": 0.916923076923077, "grad_norm": 7.836276531219482, "learning_rate": 3.471815384615385e-05, "loss": 0.999, "step": 74500 }, { "epoch": 0.9230769230769231, "grad_norm": 12.245140075683594, "learning_rate": 3.4615589743589744e-05, "loss": 0.869, "step": 75000 }, { "epoch": 0.9292307692307692, "grad_norm": 10.9988431930542, "learning_rate": 3.4513025641025646e-05, "loss": 0.855, "step": 75500 }, { "epoch": 0.9353846153846154, "grad_norm": 15.52302360534668, "learning_rate": 3.441046153846154e-05, "loss": 0.8352, "step": 76000 }, { "epoch": 0.9415384615384615, "grad_norm": 8.803082466125488, "learning_rate": 3.4307897435897435e-05, "loss": 0.9458, "step": 76500 }, { "epoch": 0.9476923076923077, "grad_norm": 5.698943138122559, "learning_rate": 3.4205333333333336e-05, "loss": 0.8773, "step": 77000 }, { "epoch": 0.9538461538461539, "grad_norm": 11.936111450195312, "learning_rate": 3.410276923076923e-05, "loss": 0.8875, "step": 77500 }, { "epoch": 0.96, "grad_norm": 9.272172927856445, "learning_rate": 3.400020512820513e-05, "loss": 0.879, "step": 78000 }, { "epoch": 0.9661538461538461, "grad_norm": 11.030377388000488, "learning_rate": 3.389764102564103e-05, "loss": 0.9059, "step": 78500 }, { "epoch": 0.9723076923076923, "grad_norm": 11.022510528564453, "learning_rate": 3.379507692307692e-05, "loss": 0.9664, "step": 79000 }, { "epoch": 0.9784615384615385, "grad_norm": 3.991237163543701, "learning_rate": 3.369251282051282e-05, "loss": 0.9605, "step": 79500 }, { "epoch": 0.9846153846153847, "grad_norm": 9.413519859313965, "learning_rate": 3.3589948717948724e-05, "loss": 0.8966, "step": 80000 }, { "epoch": 0.9907692307692307, "grad_norm": 8.189461708068848, "learning_rate": 3.348738461538461e-05, "loss": 0.9141, "step": 80500 }, { "epoch": 0.9969230769230769, "grad_norm": 11.961719512939453, "learning_rate": 3.338482051282051e-05, "loss": 0.9069, "step": 81000 }, { "epoch": 1.0, "eval_accuracy": 0.63552, "eval_loss": 0.8960253000259399, "eval_runtime": 365.8936, "eval_samples_per_second": 136.652, "eval_steps_per_second": 17.081, "step": 81250 }, { "epoch": 1.003076923076923, "grad_norm": 16.277856826782227, "learning_rate": 3.3282256410256415e-05, "loss": 0.8695, "step": 81500 }, { "epoch": 1.0092307692307692, "grad_norm": 11.194209098815918, "learning_rate": 3.317969230769231e-05, "loss": 0.8373, "step": 82000 }, { "epoch": 1.0153846153846153, "grad_norm": 39.81032943725586, "learning_rate": 3.3077128205128204e-05, "loss": 0.8235, "step": 82500 }, { "epoch": 1.0215384615384615, "grad_norm": 4.980942249298096, "learning_rate": 3.2974564102564105e-05, "loss": 0.8384, "step": 83000 }, { "epoch": 1.0276923076923077, "grad_norm": 6.843782424926758, "learning_rate": 3.2872e-05, "loss": 0.8436, "step": 83500 }, { "epoch": 1.0338461538461539, "grad_norm": 18.53227996826172, "learning_rate": 3.27694358974359e-05, "loss": 0.8283, "step": 84000 }, { "epoch": 1.04, "grad_norm": 9.532831192016602, "learning_rate": 3.2666871794871796e-05, "loss": 0.81, "step": 84500 }, { "epoch": 1.0461538461538462, "grad_norm": 11.284141540527344, "learning_rate": 3.25643076923077e-05, "loss": 0.8255, "step": 85000 }, { "epoch": 1.0523076923076924, "grad_norm": 44.83882522583008, "learning_rate": 3.246174358974359e-05, "loss": 0.8654, "step": 85500 }, { "epoch": 1.0584615384615386, "grad_norm": 9.345219612121582, "learning_rate": 3.2359179487179486e-05, "loss": 0.8342, "step": 86000 }, { "epoch": 1.0646153846153845, "grad_norm": 45.78304672241211, "learning_rate": 3.225661538461539e-05, "loss": 0.8108, "step": 86500 }, { "epoch": 1.0707692307692307, "grad_norm": 7.037914276123047, "learning_rate": 3.215405128205128e-05, "loss": 0.828, "step": 87000 }, { "epoch": 1.0769230769230769, "grad_norm": 9.096734046936035, "learning_rate": 3.2051487179487184e-05, "loss": 0.8203, "step": 87500 }, { "epoch": 1.083076923076923, "grad_norm": 11.846463203430176, "learning_rate": 3.194892307692308e-05, "loss": 0.8453, "step": 88000 }, { "epoch": 1.0892307692307692, "grad_norm": 13.351137161254883, "learning_rate": 3.184635897435897e-05, "loss": 0.8554, "step": 88500 }, { "epoch": 1.0953846153846154, "grad_norm": 11.435630798339844, "learning_rate": 3.1743794871794874e-05, "loss": 1.0251, "step": 89000 }, { "epoch": 1.1015384615384616, "grad_norm": 18.781211853027344, "learning_rate": 3.1641230769230775e-05, "loss": 1.0217, "step": 89500 }, { "epoch": 1.1076923076923078, "grad_norm": 9.165902137756348, "learning_rate": 3.153866666666666e-05, "loss": 0.9597, "step": 90000 }, { "epoch": 1.113846153846154, "grad_norm": 7.910101890563965, "learning_rate": 3.1436102564102565e-05, "loss": 1.0139, "step": 90500 }, { "epoch": 1.12, "grad_norm": 9.269392967224121, "learning_rate": 3.1333538461538466e-05, "loss": 1.0017, "step": 91000 }, { "epoch": 1.126153846153846, "grad_norm": 1578.6934814453125, "learning_rate": 3.123097435897436e-05, "loss": 0.945, "step": 91500 }, { "epoch": 1.1323076923076922, "grad_norm": 9.8006010055542, "learning_rate": 3.1128410256410255e-05, "loss": 0.8265, "step": 92000 }, { "epoch": 1.1384615384615384, "grad_norm": 6.808800220489502, "learning_rate": 3.102584615384616e-05, "loss": 0.8309, "step": 92500 }, { "epoch": 1.1446153846153846, "grad_norm": 9.62307357788086, "learning_rate": 3.092328205128205e-05, "loss": 0.8457, "step": 93000 }, { "epoch": 1.1507692307692308, "grad_norm": 11.362460136413574, "learning_rate": 3.0820717948717946e-05, "loss": 0.854, "step": 93500 }, { "epoch": 1.156923076923077, "grad_norm": 15.59352970123291, "learning_rate": 3.071815384615385e-05, "loss": 0.8236, "step": 94000 }, { "epoch": 1.1630769230769231, "grad_norm": 16.21070671081543, "learning_rate": 3.061558974358975e-05, "loss": 0.8278, "step": 94500 }, { "epoch": 1.1692307692307693, "grad_norm": 9.07491683959961, "learning_rate": 3.0513025641025643e-05, "loss": 0.8625, "step": 95000 }, { "epoch": 1.1753846153846155, "grad_norm": 11.091276168823242, "learning_rate": 3.0410461538461538e-05, "loss": 0.8767, "step": 95500 }, { "epoch": 1.1815384615384614, "grad_norm": 9.517818450927734, "learning_rate": 3.0307897435897436e-05, "loss": 1.0201, "step": 96000 }, { "epoch": 1.1876923076923076, "grad_norm": 15.169801712036133, "learning_rate": 3.0205333333333337e-05, "loss": 1.0669, "step": 96500 }, { "epoch": 1.1938461538461538, "grad_norm": 9.808341979980469, "learning_rate": 3.0102769230769235e-05, "loss": 1.0947, "step": 97000 }, { "epoch": 1.2, "grad_norm": 5.031385898590088, "learning_rate": 3.0000205128205126e-05, "loss": 1.0694, "step": 97500 }, { "epoch": 1.2061538461538461, "grad_norm": 11.511312484741211, "learning_rate": 2.9897641025641028e-05, "loss": 1.1515, "step": 98000 }, { "epoch": 1.2123076923076923, "grad_norm": 13.318807601928711, "learning_rate": 2.9795076923076926e-05, "loss": 1.202, "step": 98500 }, { "epoch": 1.2184615384615385, "grad_norm": 10.762772560119629, "learning_rate": 2.9692512820512824e-05, "loss": 1.4163, "step": 99000 }, { "epoch": 1.2246153846153847, "grad_norm": 287.3322448730469, "learning_rate": 2.9589948717948718e-05, "loss": 1.4719, "step": 99500 }, { "epoch": 1.2307692307692308, "grad_norm": 5.190405368804932, "learning_rate": 2.9487384615384616e-05, "loss": 1.4996, "step": 100000 }, { "epoch": 1.236923076923077, "grad_norm": 5.880887508392334, "learning_rate": 2.9384820512820514e-05, "loss": 1.4431, "step": 100500 }, { "epoch": 1.2430769230769232, "grad_norm": 461.6323547363281, "learning_rate": 2.928225641025641e-05, "loss": 1.3496, "step": 101000 }, { "epoch": 1.2492307692307691, "grad_norm": 9.11361312866211, "learning_rate": 2.9179692307692307e-05, "loss": 1.2812, "step": 101500 }, { "epoch": 1.2553846153846153, "grad_norm": 693.46240234375, "learning_rate": 2.9077128205128208e-05, "loss": 1.4237, "step": 102000 }, { "epoch": 1.2615384615384615, "grad_norm": 11.541461944580078, "learning_rate": 2.8974564102564106e-05, "loss": 1.3735, "step": 102500 }, { "epoch": 1.2676923076923077, "grad_norm": 28.216230392456055, "learning_rate": 2.8872e-05, "loss": 1.201, "step": 103000 }, { "epoch": 1.2738461538461539, "grad_norm": 10.394718170166016, "learning_rate": 2.87694358974359e-05, "loss": 1.0158, "step": 103500 }, { "epoch": 1.28, "grad_norm": 29.5662784576416, "learning_rate": 2.8666871794871797e-05, "loss": 0.995, "step": 104000 }, { "epoch": 1.2861538461538462, "grad_norm": 9.405082702636719, "learning_rate": 2.8564307692307695e-05, "loss": 1.0206, "step": 104500 }, { "epoch": 1.2923076923076924, "grad_norm": 5.457224369049072, "learning_rate": 2.846174358974359e-05, "loss": 1.0332, "step": 105000 }, { "epoch": 1.2984615384615386, "grad_norm": 41.30308532714844, "learning_rate": 2.8359179487179487e-05, "loss": 1.1253, "step": 105500 }, { "epoch": 1.3046153846153845, "grad_norm": 11.211895942687988, "learning_rate": 2.825661538461539e-05, "loss": 1.2573, "step": 106000 }, { "epoch": 1.3107692307692307, "grad_norm": 8.621989250183105, "learning_rate": 2.8154051282051287e-05, "loss": 1.0103, "step": 106500 }, { "epoch": 1.3169230769230769, "grad_norm": 22.292667388916016, "learning_rate": 2.8051487179487178e-05, "loss": 0.8642, "step": 107000 }, { "epoch": 1.323076923076923, "grad_norm": 9.553186416625977, "learning_rate": 2.794892307692308e-05, "loss": 0.8808, "step": 107500 }, { "epoch": 1.3292307692307692, "grad_norm": 38.40745544433594, "learning_rate": 2.7846358974358977e-05, "loss": 0.8601, "step": 108000 }, { "epoch": 1.3353846153846154, "grad_norm": 8.445296287536621, "learning_rate": 2.7743794871794872e-05, "loss": 0.8856, "step": 108500 }, { "epoch": 1.3415384615384616, "grad_norm": 9.499748229980469, "learning_rate": 2.764123076923077e-05, "loss": 0.8597, "step": 109000 }, { "epoch": 1.3476923076923077, "grad_norm": 12.908239364624023, "learning_rate": 2.7538666666666668e-05, "loss": 0.96, "step": 109500 }, { "epoch": 1.353846153846154, "grad_norm": 12.40140438079834, "learning_rate": 2.7436102564102566e-05, "loss": 0.9255, "step": 110000 }, { "epoch": 1.3599999999999999, "grad_norm": 6.697096824645996, "learning_rate": 2.733353846153846e-05, "loss": 0.8948, "step": 110500 }, { "epoch": 1.3661538461538463, "grad_norm": 35.72837829589844, "learning_rate": 2.7230974358974358e-05, "loss": 0.8818, "step": 111000 }, { "epoch": 1.3723076923076922, "grad_norm": 238.4368133544922, "learning_rate": 2.712841025641026e-05, "loss": 0.972, "step": 111500 }, { "epoch": 1.3784615384615384, "grad_norm": 7.303473949432373, "learning_rate": 2.7025846153846158e-05, "loss": 0.9542, "step": 112000 }, { "epoch": 1.3846153846153846, "grad_norm": 7.700074195861816, "learning_rate": 2.692328205128205e-05, "loss": 0.9335, "step": 112500 }, { "epoch": 1.3907692307692308, "grad_norm": 10.46871566772461, "learning_rate": 2.682071794871795e-05, "loss": 0.9423, "step": 113000 }, { "epoch": 1.396923076923077, "grad_norm": 6448.5751953125, "learning_rate": 2.6718153846153848e-05, "loss": 0.8893, "step": 113500 }, { "epoch": 1.403076923076923, "grad_norm": 12.207977294921875, "learning_rate": 2.6615589743589746e-05, "loss": 0.8791, "step": 114000 }, { "epoch": 1.4092307692307693, "grad_norm": 10.765968322753906, "learning_rate": 2.651302564102564e-05, "loss": 0.883, "step": 114500 }, { "epoch": 1.4153846153846155, "grad_norm": 46.836402893066406, "learning_rate": 2.641046153846154e-05, "loss": 0.8503, "step": 115000 }, { "epoch": 1.4215384615384616, "grad_norm": 15.547259330749512, "learning_rate": 2.6307897435897437e-05, "loss": 0.8611, "step": 115500 }, { "epoch": 1.4276923076923076, "grad_norm": 11.880488395690918, "learning_rate": 2.6205333333333338e-05, "loss": 0.828, "step": 116000 }, { "epoch": 1.4338461538461538, "grad_norm": 20.702163696289062, "learning_rate": 2.610276923076923e-05, "loss": 0.8337, "step": 116500 }, { "epoch": 1.44, "grad_norm": 16.449716567993164, "learning_rate": 2.600020512820513e-05, "loss": 0.8362, "step": 117000 }, { "epoch": 1.4461538461538461, "grad_norm": 7.47279167175293, "learning_rate": 2.589764102564103e-05, "loss": 0.8629, "step": 117500 }, { "epoch": 1.4523076923076923, "grad_norm": 9.32483196258545, "learning_rate": 2.5795076923076923e-05, "loss": 0.9098, "step": 118000 }, { "epoch": 1.4584615384615385, "grad_norm": 18.407270431518555, "learning_rate": 2.569251282051282e-05, "loss": 0.8453, "step": 118500 }, { "epoch": 1.4646153846153847, "grad_norm": 15.567869186401367, "learning_rate": 2.558994871794872e-05, "loss": 1.0106, "step": 119000 }, { "epoch": 1.4707692307692308, "grad_norm": 10.05256462097168, "learning_rate": 2.5487384615384617e-05, "loss": 0.9477, "step": 119500 }, { "epoch": 1.476923076923077, "grad_norm": 26.323793411254883, "learning_rate": 2.5384820512820512e-05, "loss": 0.929, "step": 120000 }, { "epoch": 1.483076923076923, "grad_norm": 13.73338794708252, "learning_rate": 2.528225641025641e-05, "loss": 0.9654, "step": 120500 }, { "epoch": 1.4892307692307694, "grad_norm": 10.710335731506348, "learning_rate": 2.517969230769231e-05, "loss": 1.0033, "step": 121000 }, { "epoch": 1.4953846153846153, "grad_norm": 6.321354389190674, "learning_rate": 2.507712820512821e-05, "loss": 1.0259, "step": 121500 }, { "epoch": 1.5015384615384615, "grad_norm": 23.730915069580078, "learning_rate": 2.4974564102564104e-05, "loss": 0.935, "step": 122000 }, { "epoch": 1.5076923076923077, "grad_norm": 11.908498764038086, "learning_rate": 2.4872000000000002e-05, "loss": 0.9452, "step": 122500 }, { "epoch": 1.5138461538461538, "grad_norm": 14.499602317810059, "learning_rate": 2.47694358974359e-05, "loss": 0.8601, "step": 123000 }, { "epoch": 1.52, "grad_norm": 11.781792640686035, "learning_rate": 2.4666871794871794e-05, "loss": 0.8146, "step": 123500 }, { "epoch": 1.5261538461538462, "grad_norm": 13.83382797241211, "learning_rate": 2.4564307692307696e-05, "loss": 0.8392, "step": 124000 }, { "epoch": 1.5323076923076924, "grad_norm": 10.906270027160645, "learning_rate": 2.446174358974359e-05, "loss": 0.8258, "step": 124500 }, { "epoch": 1.5384615384615383, "grad_norm": 17.15850830078125, "learning_rate": 2.4359179487179488e-05, "loss": 0.8196, "step": 125000 }, { "epoch": 1.5446153846153847, "grad_norm": 5.727079391479492, "learning_rate": 2.4256615384615386e-05, "loss": 0.8243, "step": 125500 }, { "epoch": 1.5507692307692307, "grad_norm": 10.334206581115723, "learning_rate": 2.415405128205128e-05, "loss": 0.823, "step": 126000 }, { "epoch": 1.556923076923077, "grad_norm": 8.225074768066406, "learning_rate": 2.4051487179487182e-05, "loss": 0.8161, "step": 126500 }, { "epoch": 1.563076923076923, "grad_norm": 14.744596481323242, "learning_rate": 2.3948923076923077e-05, "loss": 0.8077, "step": 127000 }, { "epoch": 1.5692307692307692, "grad_norm": 17.01348876953125, "learning_rate": 2.3846358974358975e-05, "loss": 0.8177, "step": 127500 }, { "epoch": 1.5753846153846154, "grad_norm": 4.435682773590088, "learning_rate": 2.3743794871794873e-05, "loss": 0.8333, "step": 128000 }, { "epoch": 1.5815384615384616, "grad_norm": 11.365696907043457, "learning_rate": 2.364123076923077e-05, "loss": 0.8203, "step": 128500 }, { "epoch": 1.5876923076923077, "grad_norm": 10.943119049072266, "learning_rate": 2.353866666666667e-05, "loss": 0.8127, "step": 129000 }, { "epoch": 1.5938461538461537, "grad_norm": 4.344493389129639, "learning_rate": 2.3436102564102567e-05, "loss": 0.8129, "step": 129500 }, { "epoch": 1.6, "grad_norm": 8.629764556884766, "learning_rate": 2.333353846153846e-05, "loss": 1.1107, "step": 130000 }, { "epoch": 1.606153846153846, "grad_norm": 9.595703125, "learning_rate": 2.3230974358974363e-05, "loss": 1.2713, "step": 130500 }, { "epoch": 1.6123076923076924, "grad_norm": 18.25543975830078, "learning_rate": 2.3128410256410257e-05, "loss": 0.8794, "step": 131000 }, { "epoch": 1.6184615384615384, "grad_norm": 7.203105926513672, "learning_rate": 2.3025846153846155e-05, "loss": 0.8804, "step": 131500 }, { "epoch": 1.6246153846153846, "grad_norm": 5.362618923187256, "learning_rate": 2.2923282051282053e-05, "loss": 0.8374, "step": 132000 }, { "epoch": 1.6307692307692307, "grad_norm": 14.795818328857422, "learning_rate": 2.2820717948717948e-05, "loss": 0.8055, "step": 132500 }, { "epoch": 1.636923076923077, "grad_norm": 12.59130859375, "learning_rate": 2.2718153846153846e-05, "loss": 0.7875, "step": 133000 }, { "epoch": 1.643076923076923, "grad_norm": 28.4409122467041, "learning_rate": 2.2615589743589744e-05, "loss": 0.8014, "step": 133500 }, { "epoch": 1.6492307692307693, "grad_norm": 13.779533386230469, "learning_rate": 2.2513025641025642e-05, "loss": 0.8017, "step": 134000 }, { "epoch": 1.6553846153846155, "grad_norm": 7.0997233390808105, "learning_rate": 2.241046153846154e-05, "loss": 0.8027, "step": 134500 }, { "epoch": 1.6615384615384614, "grad_norm": 18.23006820678711, "learning_rate": 2.2307897435897438e-05, "loss": 0.8211, "step": 135000 }, { "epoch": 1.6676923076923078, "grad_norm": 13.56887149810791, "learning_rate": 2.2205333333333332e-05, "loss": 0.7968, "step": 135500 }, { "epoch": 1.6738461538461538, "grad_norm": 8.843082427978516, "learning_rate": 2.2102769230769234e-05, "loss": 0.8178, "step": 136000 }, { "epoch": 1.6800000000000002, "grad_norm": 16.259294509887695, "learning_rate": 2.200020512820513e-05, "loss": 0.8394, "step": 136500 }, { "epoch": 1.6861538461538461, "grad_norm": 15.540386199951172, "learning_rate": 2.1897641025641026e-05, "loss": 0.8253, "step": 137000 }, { "epoch": 1.6923076923076923, "grad_norm": 8.204901695251465, "learning_rate": 2.1795076923076924e-05, "loss": 0.8088, "step": 137500 }, { "epoch": 1.6984615384615385, "grad_norm": 11.878310203552246, "learning_rate": 2.1692512820512822e-05, "loss": 0.9658, "step": 138000 }, { "epoch": 1.7046153846153846, "grad_norm": 11.583746910095215, "learning_rate": 2.158994871794872e-05, "loss": 1.0792, "step": 138500 }, { "epoch": 1.7107692307692308, "grad_norm": 8.308416366577148, "learning_rate": 2.1487384615384618e-05, "loss": 0.8619, "step": 139000 }, { "epoch": 1.7169230769230768, "grad_norm": 17.01491928100586, "learning_rate": 2.1384820512820513e-05, "loss": 0.8132, "step": 139500 }, { "epoch": 1.7230769230769232, "grad_norm": 8.138616561889648, "learning_rate": 2.1282256410256414e-05, "loss": 0.8732, "step": 140000 }, { "epoch": 1.7292307692307691, "grad_norm": 6.041689872741699, "learning_rate": 2.117969230769231e-05, "loss": 0.8033, "step": 140500 }, { "epoch": 1.7353846153846155, "grad_norm": 17.37460708618164, "learning_rate": 2.1077128205128203e-05, "loss": 0.8168, "step": 141000 }, { "epoch": 1.7415384615384615, "grad_norm": 9.945074081420898, "learning_rate": 2.0974564102564105e-05, "loss": 0.768, "step": 141500 }, { "epoch": 1.7476923076923077, "grad_norm": 7.900252342224121, "learning_rate": 2.0872e-05, "loss": 0.8142, "step": 142000 }, { "epoch": 1.7538461538461538, "grad_norm": 11.957608222961426, "learning_rate": 2.0769435897435897e-05, "loss": 0.8076, "step": 142500 }, { "epoch": 1.76, "grad_norm": 8.552706718444824, "learning_rate": 2.0666871794871795e-05, "loss": 0.7828, "step": 143000 }, { "epoch": 1.7661538461538462, "grad_norm": 10.120139122009277, "learning_rate": 2.0564307692307693e-05, "loss": 0.7922, "step": 143500 }, { "epoch": 1.7723076923076924, "grad_norm": 11.39379596710205, "learning_rate": 2.046174358974359e-05, "loss": 0.81, "step": 144000 }, { "epoch": 1.7784615384615385, "grad_norm": 14.276771545410156, "learning_rate": 2.035917948717949e-05, "loss": 0.7994, "step": 144500 }, { "epoch": 1.7846153846153845, "grad_norm": 21.5369815826416, "learning_rate": 2.0256615384615384e-05, "loss": 0.7733, "step": 145000 }, { "epoch": 1.790769230769231, "grad_norm": 5.541769504547119, "learning_rate": 2.0154051282051285e-05, "loss": 0.804, "step": 145500 }, { "epoch": 1.7969230769230768, "grad_norm": 8.328704833984375, "learning_rate": 2.005148717948718e-05, "loss": 0.801, "step": 146000 }, { "epoch": 1.803076923076923, "grad_norm": 11.290348052978516, "learning_rate": 1.9948923076923078e-05, "loss": 0.8003, "step": 146500 }, { "epoch": 1.8092307692307692, "grad_norm": 10.958141326904297, "learning_rate": 1.9846358974358976e-05, "loss": 0.8086, "step": 147000 }, { "epoch": 1.8153846153846154, "grad_norm": 16.242244720458984, "learning_rate": 1.9743794871794874e-05, "loss": 0.7838, "step": 147500 }, { "epoch": 1.8215384615384616, "grad_norm": 5.659574031829834, "learning_rate": 1.964123076923077e-05, "loss": 0.7879, "step": 148000 }, { "epoch": 1.8276923076923077, "grad_norm": 7.112459659576416, "learning_rate": 1.9538666666666666e-05, "loss": 0.7991, "step": 148500 }, { "epoch": 1.833846153846154, "grad_norm": 24.664087295532227, "learning_rate": 1.9436102564102564e-05, "loss": 0.7605, "step": 149000 }, { "epoch": 1.8399999999999999, "grad_norm": 7.420779228210449, "learning_rate": 1.9333538461538462e-05, "loss": 0.7808, "step": 149500 }, { "epoch": 1.8461538461538463, "grad_norm": 12.564159393310547, "learning_rate": 1.923097435897436e-05, "loss": 0.7922, "step": 150000 }, { "epoch": 1.8523076923076922, "grad_norm": 6.439233779907227, "learning_rate": 1.9128410256410255e-05, "loss": 0.7836, "step": 150500 }, { "epoch": 1.8584615384615386, "grad_norm": 10.705535888671875, "learning_rate": 1.9025846153846156e-05, "loss": 0.7983, "step": 151000 }, { "epoch": 1.8646153846153846, "grad_norm": 15.499585151672363, "learning_rate": 1.892328205128205e-05, "loss": 0.7711, "step": 151500 }, { "epoch": 1.8707692307692307, "grad_norm": 23.128767013549805, "learning_rate": 1.882071794871795e-05, "loss": 0.7793, "step": 152000 }, { "epoch": 1.876923076923077, "grad_norm": 5.763806343078613, "learning_rate": 1.8718153846153847e-05, "loss": 0.798, "step": 152500 }, { "epoch": 1.883076923076923, "grad_norm": 14.247387886047363, "learning_rate": 1.8615589743589745e-05, "loss": 0.7735, "step": 153000 }, { "epoch": 1.8892307692307693, "grad_norm": 10.602540969848633, "learning_rate": 1.8513025641025643e-05, "loss": 0.7922, "step": 153500 }, { "epoch": 1.8953846153846152, "grad_norm": 8.66236400604248, "learning_rate": 1.841046153846154e-05, "loss": 0.7976, "step": 154000 }, { "epoch": 1.9015384615384616, "grad_norm": 10.02337646484375, "learning_rate": 1.8307897435897435e-05, "loss": 0.8002, "step": 154500 }, { "epoch": 1.9076923076923076, "grad_norm": 5.49017333984375, "learning_rate": 1.8205333333333337e-05, "loss": 0.7812, "step": 155000 }, { "epoch": 1.913846153846154, "grad_norm": 10.681382179260254, "learning_rate": 1.810276923076923e-05, "loss": 0.7858, "step": 155500 }, { "epoch": 1.92, "grad_norm": 6.697972297668457, "learning_rate": 1.800020512820513e-05, "loss": 0.7823, "step": 156000 }, { "epoch": 1.926153846153846, "grad_norm": 12.531645774841309, "learning_rate": 1.7897641025641027e-05, "loss": 0.766, "step": 156500 }, { "epoch": 1.9323076923076923, "grad_norm": 15.05484676361084, "learning_rate": 1.7795076923076922e-05, "loss": 0.8025, "step": 157000 }, { "epoch": 1.9384615384615385, "grad_norm": 7.559576511383057, "learning_rate": 1.769251282051282e-05, "loss": 0.7902, "step": 157500 }, { "epoch": 1.9446153846153846, "grad_norm": 10.720475196838379, "learning_rate": 1.7589948717948718e-05, "loss": 0.7806, "step": 158000 }, { "epoch": 1.9507692307692308, "grad_norm": 4.4791131019592285, "learning_rate": 1.7487384615384616e-05, "loss": 0.766, "step": 158500 }, { "epoch": 1.956923076923077, "grad_norm": 10.95101547241211, "learning_rate": 1.7384820512820514e-05, "loss": 0.7756, "step": 159000 }, { "epoch": 1.963076923076923, "grad_norm": 11.451761245727539, "learning_rate": 1.7282256410256412e-05, "loss": 0.8042, "step": 159500 }, { "epoch": 1.9692307692307693, "grad_norm": 4.005158424377441, "learning_rate": 1.7179692307692306e-05, "loss": 0.776, "step": 160000 }, { "epoch": 1.9753846153846153, "grad_norm": 10.658405303955078, "learning_rate": 1.7077128205128208e-05, "loss": 0.7729, "step": 160500 }, { "epoch": 1.9815384615384617, "grad_norm": 20.476329803466797, "learning_rate": 1.6974564102564102e-05, "loss": 0.7744, "step": 161000 }, { "epoch": 1.9876923076923076, "grad_norm": 6.919234275817871, "learning_rate": 1.6872e-05, "loss": 0.7602, "step": 161500 }, { "epoch": 1.9938461538461538, "grad_norm": 11.781153678894043, "learning_rate": 1.67694358974359e-05, "loss": 0.7743, "step": 162000 }, { "epoch": 2.0, "grad_norm": 20.044790267944336, "learning_rate": 1.6666871794871796e-05, "loss": 0.7397, "step": 162500 }, { "epoch": 2.0, "eval_accuracy": 0.65574, "eval_loss": 0.8237444758415222, "eval_runtime": 367.5973, "eval_samples_per_second": 136.018, "eval_steps_per_second": 17.002, "step": 162500 }, { "epoch": 2.006153846153846, "grad_norm": 9.063031196594238, "learning_rate": 1.6564307692307694e-05, "loss": 0.7207, "step": 163000 }, { "epoch": 2.0123076923076924, "grad_norm": 11.945392608642578, "learning_rate": 1.6461743589743592e-05, "loss": 0.7343, "step": 163500 }, { "epoch": 2.0184615384615383, "grad_norm": 10.161272048950195, "learning_rate": 1.6359179487179487e-05, "loss": 0.7373, "step": 164000 }, { "epoch": 2.0246153846153847, "grad_norm": 12.465530395507812, "learning_rate": 1.6256615384615385e-05, "loss": 0.7249, "step": 164500 }, { "epoch": 2.0307692307692307, "grad_norm": 9.874961853027344, "learning_rate": 1.6154051282051283e-05, "loss": 0.7294, "step": 165000 }, { "epoch": 2.036923076923077, "grad_norm": 11.658003807067871, "learning_rate": 1.6051487179487178e-05, "loss": 0.7581, "step": 165500 }, { "epoch": 2.043076923076923, "grad_norm": 13.40922737121582, "learning_rate": 1.594892307692308e-05, "loss": 0.7284, "step": 166000 }, { "epoch": 2.0492307692307694, "grad_norm": 7.658026218414307, "learning_rate": 1.5846358974358973e-05, "loss": 0.7391, "step": 166500 }, { "epoch": 2.0553846153846154, "grad_norm": 9.820539474487305, "learning_rate": 1.574379487179487e-05, "loss": 0.737, "step": 167000 }, { "epoch": 2.0615384615384613, "grad_norm": 11.957843780517578, "learning_rate": 1.564123076923077e-05, "loss": 0.7257, "step": 167500 }, { "epoch": 2.0676923076923077, "grad_norm": 4.503595352172852, "learning_rate": 1.5538666666666667e-05, "loss": 0.7271, "step": 168000 }, { "epoch": 2.0738461538461537, "grad_norm": 13.069839477539062, "learning_rate": 1.5436102564102565e-05, "loss": 0.7292, "step": 168500 }, { "epoch": 2.08, "grad_norm": 27.35700225830078, "learning_rate": 1.5333538461538463e-05, "loss": 0.7381, "step": 169000 }, { "epoch": 2.086153846153846, "grad_norm": 16.696022033691406, "learning_rate": 1.523097435897436e-05, "loss": 0.7449, "step": 169500 }, { "epoch": 2.0923076923076924, "grad_norm": 15.537075996398926, "learning_rate": 1.5128410256410258e-05, "loss": 0.7271, "step": 170000 }, { "epoch": 2.0984615384615384, "grad_norm": 10.185432434082031, "learning_rate": 1.5025846153846154e-05, "loss": 0.7321, "step": 170500 }, { "epoch": 2.1046153846153848, "grad_norm": 19.049474716186523, "learning_rate": 1.4923282051282054e-05, "loss": 0.7125, "step": 171000 }, { "epoch": 2.1107692307692307, "grad_norm": 9.854257583618164, "learning_rate": 1.482071794871795e-05, "loss": 0.7453, "step": 171500 }, { "epoch": 2.116923076923077, "grad_norm": 19.2335262298584, "learning_rate": 1.4718153846153848e-05, "loss": 0.7235, "step": 172000 }, { "epoch": 2.123076923076923, "grad_norm": 13.282673835754395, "learning_rate": 1.4615589743589744e-05, "loss": 0.7597, "step": 172500 }, { "epoch": 2.129230769230769, "grad_norm": 15.75724983215332, "learning_rate": 1.451302564102564e-05, "loss": 0.7227, "step": 173000 }, { "epoch": 2.1353846153846154, "grad_norm": 7.548396587371826, "learning_rate": 1.441046153846154e-05, "loss": 0.7052, "step": 173500 }, { "epoch": 2.1415384615384614, "grad_norm": 16.264101028442383, "learning_rate": 1.4307897435897435e-05, "loss": 0.7449, "step": 174000 }, { "epoch": 2.147692307692308, "grad_norm": 16.141807556152344, "learning_rate": 1.4205333333333334e-05, "loss": 0.7478, "step": 174500 }, { "epoch": 2.1538461538461537, "grad_norm": 12.128105163574219, "learning_rate": 1.410276923076923e-05, "loss": 0.7373, "step": 175000 }, { "epoch": 2.16, "grad_norm": 12.569074630737305, "learning_rate": 1.4000205128205129e-05, "loss": 0.7279, "step": 175500 }, { "epoch": 2.166153846153846, "grad_norm": 14.476816177368164, "learning_rate": 1.3897641025641025e-05, "loss": 0.7369, "step": 176000 }, { "epoch": 2.1723076923076925, "grad_norm": 18.914968490600586, "learning_rate": 1.3795076923076925e-05, "loss": 0.7368, "step": 176500 }, { "epoch": 2.1784615384615384, "grad_norm": 7.522904396057129, "learning_rate": 1.3692512820512821e-05, "loss": 0.7548, "step": 177000 }, { "epoch": 2.184615384615385, "grad_norm": 6.408961772918701, "learning_rate": 1.3589948717948719e-05, "loss": 0.7335, "step": 177500 }, { "epoch": 2.190769230769231, "grad_norm": 3.879934310913086, "learning_rate": 1.3487384615384615e-05, "loss": 0.7447, "step": 178000 }, { "epoch": 2.1969230769230768, "grad_norm": 7.733790397644043, "learning_rate": 1.3384820512820515e-05, "loss": 0.7229, "step": 178500 }, { "epoch": 2.203076923076923, "grad_norm": 8.292427062988281, "learning_rate": 1.3282256410256411e-05, "loss": 0.7214, "step": 179000 }, { "epoch": 2.209230769230769, "grad_norm": 13.548661231994629, "learning_rate": 1.317969230769231e-05, "loss": 0.7406, "step": 179500 }, { "epoch": 2.2153846153846155, "grad_norm": 14.540371894836426, "learning_rate": 1.3077128205128205e-05, "loss": 0.732, "step": 180000 }, { "epoch": 2.2215384615384615, "grad_norm": 11.103569030761719, "learning_rate": 1.2974564102564102e-05, "loss": 0.7217, "step": 180500 }, { "epoch": 2.227692307692308, "grad_norm": 10.219368934631348, "learning_rate": 1.2872000000000001e-05, "loss": 0.7375, "step": 181000 }, { "epoch": 2.233846153846154, "grad_norm": 6.829085350036621, "learning_rate": 1.2769435897435896e-05, "loss": 0.7237, "step": 181500 }, { "epoch": 2.24, "grad_norm": 6.320499897003174, "learning_rate": 1.2666871794871796e-05, "loss": 0.7119, "step": 182000 }, { "epoch": 2.246153846153846, "grad_norm": 8.686861038208008, "learning_rate": 1.2564307692307692e-05, "loss": 0.724, "step": 182500 }, { "epoch": 2.252307692307692, "grad_norm": 6.650514125823975, "learning_rate": 1.246174358974359e-05, "loss": 0.7153, "step": 183000 }, { "epoch": 2.2584615384615385, "grad_norm": 5.943713188171387, "learning_rate": 1.2359179487179488e-05, "loss": 0.7315, "step": 183500 }, { "epoch": 2.2646153846153845, "grad_norm": 11.34069538116455, "learning_rate": 1.2256615384615386e-05, "loss": 0.7062, "step": 184000 }, { "epoch": 2.270769230769231, "grad_norm": 180.521240234375, "learning_rate": 1.2154051282051282e-05, "loss": 0.7384, "step": 184500 }, { "epoch": 2.276923076923077, "grad_norm": 8.282730102539062, "learning_rate": 1.205148717948718e-05, "loss": 0.7173, "step": 185000 }, { "epoch": 2.2830769230769232, "grad_norm": 12.209334373474121, "learning_rate": 1.1948923076923077e-05, "loss": 0.7165, "step": 185500 }, { "epoch": 2.289230769230769, "grad_norm": 9.707290649414062, "learning_rate": 1.1846358974358975e-05, "loss": 0.7426, "step": 186000 }, { "epoch": 2.2953846153846156, "grad_norm": 26.727867126464844, "learning_rate": 1.1743794871794872e-05, "loss": 0.7393, "step": 186500 }, { "epoch": 2.3015384615384615, "grad_norm": 11.103117942810059, "learning_rate": 1.1641230769230769e-05, "loss": 0.7199, "step": 187000 }, { "epoch": 2.3076923076923075, "grad_norm": 15.823190689086914, "learning_rate": 1.1538666666666667e-05, "loss": 0.7199, "step": 187500 }, { "epoch": 2.313846153846154, "grad_norm": 5.7931671142578125, "learning_rate": 1.1436102564102565e-05, "loss": 0.7331, "step": 188000 }, { "epoch": 2.32, "grad_norm": 3.72764253616333, "learning_rate": 1.1333538461538463e-05, "loss": 0.722, "step": 188500 }, { "epoch": 2.3261538461538462, "grad_norm": 7.738234996795654, "learning_rate": 1.1230974358974359e-05, "loss": 0.7356, "step": 189000 }, { "epoch": 2.332307692307692, "grad_norm": 12.795975685119629, "learning_rate": 1.1128410256410257e-05, "loss": 0.7152, "step": 189500 }, { "epoch": 2.3384615384615386, "grad_norm": 13.906537055969238, "learning_rate": 1.1025846153846155e-05, "loss": 0.728, "step": 190000 }, { "epoch": 2.3446153846153845, "grad_norm": 17.184505462646484, "learning_rate": 1.0923282051282053e-05, "loss": 0.7054, "step": 190500 }, { "epoch": 2.350769230769231, "grad_norm": 9.99722671508789, "learning_rate": 1.082071794871795e-05, "loss": 0.7325, "step": 191000 }, { "epoch": 2.356923076923077, "grad_norm": 8.681159973144531, "learning_rate": 1.0718153846153847e-05, "loss": 0.7369, "step": 191500 }, { "epoch": 2.363076923076923, "grad_norm": 8.222824096679688, "learning_rate": 1.0615589743589745e-05, "loss": 0.7494, "step": 192000 }, { "epoch": 2.3692307692307693, "grad_norm": 9.75273609161377, "learning_rate": 1.0513025641025642e-05, "loss": 0.7219, "step": 192500 }, { "epoch": 2.375384615384615, "grad_norm": 20.542802810668945, "learning_rate": 1.0410461538461538e-05, "loss": 0.7268, "step": 193000 }, { "epoch": 2.3815384615384616, "grad_norm": 16.02739906311035, "learning_rate": 1.0307897435897436e-05, "loss": 0.7155, "step": 193500 }, { "epoch": 2.3876923076923076, "grad_norm": 11.635347366333008, "learning_rate": 1.0205333333333334e-05, "loss": 0.7333, "step": 194000 }, { "epoch": 2.393846153846154, "grad_norm": 16.570072174072266, "learning_rate": 1.0102769230769232e-05, "loss": 0.7217, "step": 194500 }, { "epoch": 2.4, "grad_norm": 14.55761432647705, "learning_rate": 1.0000205128205128e-05, "loss": 0.7223, "step": 195000 }, { "epoch": 2.4061538461538463, "grad_norm": 10.24792194366455, "learning_rate": 9.897641025641026e-06, "loss": 0.7337, "step": 195500 }, { "epoch": 2.4123076923076923, "grad_norm": 11.387216567993164, "learning_rate": 9.795076923076924e-06, "loss": 0.7156, "step": 196000 }, { "epoch": 2.418461538461538, "grad_norm": 6.960866928100586, "learning_rate": 9.69251282051282e-06, "loss": 0.7199, "step": 196500 }, { "epoch": 2.4246153846153846, "grad_norm": 25.213428497314453, "learning_rate": 9.589948717948718e-06, "loss": 0.7166, "step": 197000 }, { "epoch": 2.430769230769231, "grad_norm": 16.338205337524414, "learning_rate": 9.487384615384616e-06, "loss": 0.7164, "step": 197500 }, { "epoch": 2.436923076923077, "grad_norm": 9.887063980102539, "learning_rate": 9.384820512820514e-06, "loss": 0.7242, "step": 198000 }, { "epoch": 2.443076923076923, "grad_norm": 15.347149848937988, "learning_rate": 9.28225641025641e-06, "loss": 0.7354, "step": 198500 }, { "epoch": 2.4492307692307693, "grad_norm": 11.793184280395508, "learning_rate": 9.179692307692309e-06, "loss": 0.7176, "step": 199000 }, { "epoch": 2.4553846153846153, "grad_norm": 6.337645053863525, "learning_rate": 9.077128205128207e-06, "loss": 0.713, "step": 199500 }, { "epoch": 2.4615384615384617, "grad_norm": 8.685958862304688, "learning_rate": 8.974564102564103e-06, "loss": 0.7392, "step": 200000 }, { "epoch": 2.4676923076923076, "grad_norm": 8.590500831604004, "learning_rate": 8.871999999999999e-06, "loss": 0.7217, "step": 200500 }, { "epoch": 2.473846153846154, "grad_norm": 16.368959426879883, "learning_rate": 8.769435897435897e-06, "loss": 0.7245, "step": 201000 }, { "epoch": 2.48, "grad_norm": 6.3712544441223145, "learning_rate": 8.666871794871795e-06, "loss": 0.7305, "step": 201500 }, { "epoch": 2.4861538461538464, "grad_norm": 13.746201515197754, "learning_rate": 8.564307692307693e-06, "loss": 0.7163, "step": 202000 }, { "epoch": 2.4923076923076923, "grad_norm": 7.711772441864014, "learning_rate": 8.46174358974359e-06, "loss": 0.6961, "step": 202500 }, { "epoch": 2.4984615384615383, "grad_norm": 12.77273941040039, "learning_rate": 8.359179487179487e-06, "loss": 0.7267, "step": 203000 }, { "epoch": 2.5046153846153847, "grad_norm": 10.931082725524902, "learning_rate": 8.256615384615385e-06, "loss": 0.7078, "step": 203500 }, { "epoch": 2.5107692307692306, "grad_norm": 16.658597946166992, "learning_rate": 8.154051282051282e-06, "loss": 0.7072, "step": 204000 }, { "epoch": 2.516923076923077, "grad_norm": 8.216838836669922, "learning_rate": 8.05148717948718e-06, "loss": 0.7244, "step": 204500 }, { "epoch": 2.523076923076923, "grad_norm": 15.0646333694458, "learning_rate": 7.948923076923078e-06, "loss": 0.7186, "step": 205000 }, { "epoch": 2.5292307692307694, "grad_norm": 6.978567600250244, "learning_rate": 7.846358974358976e-06, "loss": 0.7263, "step": 205500 }, { "epoch": 2.5353846153846153, "grad_norm": 12.0398530960083, "learning_rate": 7.743794871794872e-06, "loss": 0.715, "step": 206000 }, { "epoch": 2.5415384615384617, "grad_norm": 54.27073669433594, "learning_rate": 7.64123076923077e-06, "loss": 0.7139, "step": 206500 }, { "epoch": 2.5476923076923077, "grad_norm": 10.282230377197266, "learning_rate": 7.538666666666668e-06, "loss": 0.7304, "step": 207000 }, { "epoch": 2.5538461538461537, "grad_norm": 10.705265045166016, "learning_rate": 7.436102564102565e-06, "loss": 0.737, "step": 207500 }, { "epoch": 2.56, "grad_norm": 19.464488983154297, "learning_rate": 7.333538461538463e-06, "loss": 0.696, "step": 208000 }, { "epoch": 2.566153846153846, "grad_norm": 15.14621639251709, "learning_rate": 7.230974358974358e-06, "loss": 0.7315, "step": 208500 }, { "epoch": 2.5723076923076924, "grad_norm": 15.028178215026855, "learning_rate": 7.128410256410256e-06, "loss": 0.7099, "step": 209000 }, { "epoch": 2.5784615384615384, "grad_norm": 11.935062408447266, "learning_rate": 7.0258461538461535e-06, "loss": 0.7224, "step": 209500 }, { "epoch": 2.5846153846153848, "grad_norm": 9.3062105178833, "learning_rate": 6.9232820512820515e-06, "loss": 0.7366, "step": 210000 }, { "epoch": 2.5907692307692307, "grad_norm": 15.54145622253418, "learning_rate": 6.820717948717949e-06, "loss": 0.7005, "step": 210500 }, { "epoch": 2.596923076923077, "grad_norm": 20.382858276367188, "learning_rate": 6.7181538461538466e-06, "loss": 0.719, "step": 211000 }, { "epoch": 2.603076923076923, "grad_norm": 12.054417610168457, "learning_rate": 6.615589743589744e-06, "loss": 0.6902, "step": 211500 }, { "epoch": 2.609230769230769, "grad_norm": 18.24418067932129, "learning_rate": 6.513025641025642e-06, "loss": 0.714, "step": 212000 }, { "epoch": 2.6153846153846154, "grad_norm": 7.818421363830566, "learning_rate": 6.410461538461539e-06, "loss": 0.7108, "step": 212500 }, { "epoch": 2.6215384615384614, "grad_norm": 11.045573234558105, "learning_rate": 6.307897435897436e-06, "loss": 0.7189, "step": 213000 }, { "epoch": 2.6276923076923078, "grad_norm": 3.8956141471862793, "learning_rate": 6.205333333333334e-06, "loss": 0.7306, "step": 213500 }, { "epoch": 2.6338461538461537, "grad_norm": 6.8342061042785645, "learning_rate": 6.102769230769231e-06, "loss": 0.705, "step": 214000 }, { "epoch": 2.64, "grad_norm": 3.917092800140381, "learning_rate": 6.000205128205128e-06, "loss": 0.7128, "step": 214500 }, { "epoch": 2.646153846153846, "grad_norm": 10.602785110473633, "learning_rate": 5.897641025641025e-06, "loss": 0.7178, "step": 215000 }, { "epoch": 2.6523076923076925, "grad_norm": 44.85240173339844, "learning_rate": 5.795076923076923e-06, "loss": 0.7124, "step": 215500 }, { "epoch": 2.6584615384615384, "grad_norm": 6.232956886291504, "learning_rate": 5.6925128205128205e-06, "loss": 0.7235, "step": 216000 }, { "epoch": 2.6646153846153844, "grad_norm": 15.6179780960083, "learning_rate": 5.5899487179487185e-06, "loss": 0.7367, "step": 216500 }, { "epoch": 2.670769230769231, "grad_norm": 28.499755859375, "learning_rate": 5.487384615384616e-06, "loss": 0.7122, "step": 217000 }, { "epoch": 2.676923076923077, "grad_norm": 7.36679220199585, "learning_rate": 5.384820512820514e-06, "loss": 0.7083, "step": 217500 }, { "epoch": 2.683076923076923, "grad_norm": 7.019028663635254, "learning_rate": 5.282256410256411e-06, "loss": 0.7167, "step": 218000 }, { "epoch": 2.689230769230769, "grad_norm": 13.332535743713379, "learning_rate": 5.179692307692308e-06, "loss": 0.6813, "step": 218500 }, { "epoch": 2.6953846153846155, "grad_norm": 6.092926025390625, "learning_rate": 5.077128205128205e-06, "loss": 0.7095, "step": 219000 }, { "epoch": 2.7015384615384614, "grad_norm": 5.190014362335205, "learning_rate": 4.974564102564103e-06, "loss": 0.6941, "step": 219500 }, { "epoch": 2.707692307692308, "grad_norm": 6.392735004425049, "learning_rate": 4.872e-06, "loss": 0.7114, "step": 220000 }, { "epoch": 2.713846153846154, "grad_norm": 18.07638931274414, "learning_rate": 4.769435897435898e-06, "loss": 0.7115, "step": 220500 }, { "epoch": 2.7199999999999998, "grad_norm": 6.718528747558594, "learning_rate": 4.666871794871795e-06, "loss": 0.7065, "step": 221000 }, { "epoch": 2.726153846153846, "grad_norm": 17.323354721069336, "learning_rate": 4.564307692307692e-06, "loss": 0.701, "step": 221500 }, { "epoch": 2.7323076923076925, "grad_norm": 11.844677925109863, "learning_rate": 4.46174358974359e-06, "loss": 0.7183, "step": 222000 }, { "epoch": 2.7384615384615385, "grad_norm": 17.19860076904297, "learning_rate": 4.3591794871794875e-06, "loss": 0.6908, "step": 222500 }, { "epoch": 2.7446153846153845, "grad_norm": 14.058614730834961, "learning_rate": 4.256615384615385e-06, "loss": 0.721, "step": 223000 }, { "epoch": 2.750769230769231, "grad_norm": 15.834685325622559, "learning_rate": 4.154051282051282e-06, "loss": 0.7267, "step": 223500 }, { "epoch": 2.756923076923077, "grad_norm": 7.217898845672607, "learning_rate": 4.05148717948718e-06, "loss": 0.7047, "step": 224000 }, { "epoch": 2.763076923076923, "grad_norm": 8.565190315246582, "learning_rate": 3.948923076923077e-06, "loss": 0.6915, "step": 224500 }, { "epoch": 2.769230769230769, "grad_norm": 6.846717357635498, "learning_rate": 3.846358974358975e-06, "loss": 0.7139, "step": 225000 }, { "epoch": 2.775384615384615, "grad_norm": 10.877586364746094, "learning_rate": 3.743794871794872e-06, "loss": 0.7226, "step": 225500 }, { "epoch": 2.7815384615384615, "grad_norm": 7.898690700531006, "learning_rate": 3.6412307692307696e-06, "loss": 0.7001, "step": 226000 }, { "epoch": 2.787692307692308, "grad_norm": 11.410025596618652, "learning_rate": 3.5386666666666667e-06, "loss": 0.6996, "step": 226500 }, { "epoch": 2.793846153846154, "grad_norm": 19.853111267089844, "learning_rate": 3.436102564102564e-06, "loss": 0.7054, "step": 227000 }, { "epoch": 2.8, "grad_norm": 11.761764526367188, "learning_rate": 3.3335384615384614e-06, "loss": 0.721, "step": 227500 }, { "epoch": 2.806153846153846, "grad_norm": 18.024303436279297, "learning_rate": 3.230974358974359e-06, "loss": 0.7255, "step": 228000 }, { "epoch": 2.812307692307692, "grad_norm": 15.604559898376465, "learning_rate": 3.1284102564102565e-06, "loss": 0.7013, "step": 228500 }, { "epoch": 2.8184615384615386, "grad_norm": 11.40152359008789, "learning_rate": 3.025846153846154e-06, "loss": 0.7042, "step": 229000 }, { "epoch": 2.8246153846153845, "grad_norm": 21.253713607788086, "learning_rate": 2.923282051282051e-06, "loss": 0.7187, "step": 229500 }, { "epoch": 2.830769230769231, "grad_norm": 10.321839332580566, "learning_rate": 2.8207179487179488e-06, "loss": 0.7111, "step": 230000 }, { "epoch": 2.836923076923077, "grad_norm": 9.331689834594727, "learning_rate": 2.7181538461538463e-06, "loss": 0.6974, "step": 230500 }, { "epoch": 2.8430769230769233, "grad_norm": 14.935019493103027, "learning_rate": 2.615589743589744e-06, "loss": 0.6993, "step": 231000 }, { "epoch": 2.8492307692307692, "grad_norm": 5.596281051635742, "learning_rate": 2.513025641025641e-06, "loss": 0.7184, "step": 231500 }, { "epoch": 2.855384615384615, "grad_norm": 8.525458335876465, "learning_rate": 2.4104615384615386e-06, "loss": 0.7073, "step": 232000 }, { "epoch": 2.8615384615384616, "grad_norm": 8.31210994720459, "learning_rate": 2.307897435897436e-06, "loss": 0.7043, "step": 232500 }, { "epoch": 2.8676923076923075, "grad_norm": 5.1119608879089355, "learning_rate": 2.2053333333333337e-06, "loss": 0.6754, "step": 233000 }, { "epoch": 2.873846153846154, "grad_norm": 11.781253814697266, "learning_rate": 2.102769230769231e-06, "loss": 0.7187, "step": 233500 }, { "epoch": 2.88, "grad_norm": 6.548630237579346, "learning_rate": 2.0002051282051284e-06, "loss": 0.7199, "step": 234000 }, { "epoch": 2.8861538461538463, "grad_norm": 12.736579895019531, "learning_rate": 1.8976410256410257e-06, "loss": 0.7146, "step": 234500 }, { "epoch": 2.8923076923076922, "grad_norm": 7.0301690101623535, "learning_rate": 1.7950769230769233e-06, "loss": 0.7299, "step": 235000 }, { "epoch": 2.8984615384615386, "grad_norm": 14.406909942626953, "learning_rate": 1.6925128205128204e-06, "loss": 0.7021, "step": 235500 }, { "epoch": 2.9046153846153846, "grad_norm": 4.011630535125732, "learning_rate": 1.589948717948718e-06, "loss": 0.7041, "step": 236000 }, { "epoch": 2.9107692307692306, "grad_norm": 3.6570491790771484, "learning_rate": 1.4873846153846156e-06, "loss": 0.7047, "step": 236500 }, { "epoch": 2.916923076923077, "grad_norm": 10.598403930664062, "learning_rate": 1.384820512820513e-06, "loss": 0.7058, "step": 237000 }, { "epoch": 2.9230769230769234, "grad_norm": 17.667306900024414, "learning_rate": 1.2822564102564103e-06, "loss": 0.6842, "step": 237500 }, { "epoch": 2.9292307692307693, "grad_norm": 4.180660724639893, "learning_rate": 1.1796923076923076e-06, "loss": 0.7185, "step": 238000 }, { "epoch": 2.9353846153846153, "grad_norm": 9.529131889343262, "learning_rate": 1.0771282051282052e-06, "loss": 0.7073, "step": 238500 }, { "epoch": 2.9415384615384617, "grad_norm": 8.090576171875, "learning_rate": 9.745641025641025e-07, "loss": 0.6974, "step": 239000 }, { "epoch": 2.9476923076923076, "grad_norm": 14.463215827941895, "learning_rate": 8.720000000000001e-07, "loss": 0.6877, "step": 239500 }, { "epoch": 2.953846153846154, "grad_norm": 9.106619834899902, "learning_rate": 7.694358974358975e-07, "loss": 0.6888, "step": 240000 }, { "epoch": 2.96, "grad_norm": 10.116360664367676, "learning_rate": 6.66871794871795e-07, "loss": 0.6956, "step": 240500 }, { "epoch": 2.966153846153846, "grad_norm": 6.25929594039917, "learning_rate": 5.643076923076923e-07, "loss": 0.734, "step": 241000 }, { "epoch": 2.9723076923076923, "grad_norm": 6.0323309898376465, "learning_rate": 4.617435897435898e-07, "loss": 0.7066, "step": 241500 }, { "epoch": 2.9784615384615387, "grad_norm": 16.856470108032227, "learning_rate": 3.591794871794872e-07, "loss": 0.7091, "step": 242000 }, { "epoch": 2.9846153846153847, "grad_norm": 10.947349548339844, "learning_rate": 2.566153846153846e-07, "loss": 0.6909, "step": 242500 }, { "epoch": 2.9907692307692306, "grad_norm": 5.442905426025391, "learning_rate": 1.5405128205128205e-07, "loss": 0.7271, "step": 243000 }, { "epoch": 2.996923076923077, "grad_norm": 7.611256122589111, "learning_rate": 5.148717948717949e-08, "loss": 0.7146, "step": 243500 } ], "logging_steps": 500, "max_steps": 243750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.130803778048e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }