{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3517, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002843652434581183, "grad_norm": 49.12633514404297, "learning_rate": 0.0, "loss": 13.2624, "step": 1 }, { "epoch": 0.0005687304869162366, "grad_norm": 52.63550567626953, "learning_rate": 1.111111111111111e-06, "loss": 12.8107, "step": 2 }, { "epoch": 0.000853095730374355, "grad_norm": 51.16219711303711, "learning_rate": 2.222222222222222e-06, "loss": 12.7371, "step": 3 }, { "epoch": 0.0011374609738324733, "grad_norm": 48.81122970581055, "learning_rate": 3.3333333333333333e-06, "loss": 11.5423, "step": 4 }, { "epoch": 0.0014218262172905917, "grad_norm": 49.11295700073242, "learning_rate": 4.444444444444444e-06, "loss": 11.0671, "step": 5 }, { "epoch": 0.00170619146074871, "grad_norm": 43.88680648803711, "learning_rate": 5.555555555555557e-06, "loss": 10.293, "step": 6 }, { "epoch": 0.0019905567042068284, "grad_norm": 41.188690185546875, "learning_rate": 6.666666666666667e-06, "loss": 9.5132, "step": 7 }, { "epoch": 0.0022749219476649466, "grad_norm": 34.85954284667969, "learning_rate": 7.77777777777778e-06, "loss": 8.9321, "step": 8 }, { "epoch": 0.002559287191123065, "grad_norm": 40.25858688354492, "learning_rate": 8.888888888888888e-06, "loss": 12.101, "step": 9 }, { "epoch": 0.0028436524345811834, "grad_norm": 37.3657112121582, "learning_rate": 1e-05, "loss": 10.7739, "step": 10 }, { "epoch": 0.0031280176780393017, "grad_norm": 31.502487182617188, "learning_rate": 1.1111111111111113e-05, "loss": 10.0982, "step": 11 }, { "epoch": 0.00341238292149742, "grad_norm": 33.45932388305664, "learning_rate": 1.2222222222222224e-05, "loss": 9.332, "step": 12 }, { "epoch": 0.0036967481649555385, "grad_norm": 27.317989349365234, "learning_rate": 1.3333333333333333e-05, "loss": 8.051, "step": 13 }, { "epoch": 0.003981113408413657, "grad_norm": 19.718740463256836, "learning_rate": 1.4444444444444446e-05, "loss": 7.3297, "step": 14 }, { "epoch": 0.004265478651871775, "grad_norm": 18.006954193115234, "learning_rate": 1.555555555555556e-05, "loss": 6.4547, "step": 15 }, { "epoch": 0.004549843895329893, "grad_norm": 14.854341506958008, "learning_rate": 1.6666666666666667e-05, "loss": 6.0132, "step": 16 }, { "epoch": 0.004834209138788011, "grad_norm": 24.93276596069336, "learning_rate": 1.7777777777777777e-05, "loss": 9.1995, "step": 17 }, { "epoch": 0.00511857438224613, "grad_norm": 22.159881591796875, "learning_rate": 1.888888888888889e-05, "loss": 8.4067, "step": 18 }, { "epoch": 0.005402939625704249, "grad_norm": 17.916210174560547, "learning_rate": 2e-05, "loss": 7.7151, "step": 19 }, { "epoch": 0.005687304869162367, "grad_norm": 15.477251052856445, "learning_rate": 2.1111111111111114e-05, "loss": 7.1676, "step": 20 }, { "epoch": 0.005971670112620485, "grad_norm": 15.920812606811523, "learning_rate": 2.2222222222222227e-05, "loss": 6.4084, "step": 21 }, { "epoch": 0.006256035356078603, "grad_norm": 15.113493919372559, "learning_rate": 2.3333333333333336e-05, "loss": 6.1668, "step": 22 }, { "epoch": 0.0065404005995367215, "grad_norm": 11.510760307312012, "learning_rate": 2.444444444444445e-05, "loss": 5.4598, "step": 23 }, { "epoch": 0.00682476584299484, "grad_norm": 7.77994966506958, "learning_rate": 2.5555555555555554e-05, "loss": 5.2792, "step": 24 }, { "epoch": 0.007109131086452958, "grad_norm": 12.673455238342285, "learning_rate": 2.6666666666666667e-05, "loss": 7.6026, "step": 25 }, { "epoch": 0.007393496329911077, "grad_norm": 13.792389869689941, "learning_rate": 2.777777777777778e-05, "loss": 7.2058, "step": 26 }, { "epoch": 0.007677861573369195, "grad_norm": 10.524365425109863, "learning_rate": 2.888888888888889e-05, "loss": 6.3227, "step": 27 }, { "epoch": 0.007962226816827313, "grad_norm": 105.05674743652344, "learning_rate": 3.0000000000000004e-05, "loss": 6.1134, "step": 28 }, { "epoch": 0.008246592060285432, "grad_norm": 13.615738868713379, "learning_rate": 3.111111111111112e-05, "loss": 5.7431, "step": 29 }, { "epoch": 0.00853095730374355, "grad_norm": 17.219816207885742, "learning_rate": 3.222222222222223e-05, "loss": 5.403, "step": 30 }, { "epoch": 0.008815322547201668, "grad_norm": 9.333234786987305, "learning_rate": 3.3333333333333335e-05, "loss": 4.8142, "step": 31 }, { "epoch": 0.009099687790659786, "grad_norm": 86.27250671386719, "learning_rate": 3.444444444444445e-05, "loss": 4.7489, "step": 32 }, { "epoch": 0.009384053034117905, "grad_norm": 18.35126304626465, "learning_rate": 3.555555555555555e-05, "loss": 6.8241, "step": 33 }, { "epoch": 0.009668418277576023, "grad_norm": 22.33354377746582, "learning_rate": 3.6666666666666666e-05, "loss": 6.552, "step": 34 }, { "epoch": 0.009952783521034141, "grad_norm": 21.162761688232422, "learning_rate": 3.777777777777778e-05, "loss": 6.3146, "step": 35 }, { "epoch": 0.01023714876449226, "grad_norm": 18.522459030151367, "learning_rate": 3.888888888888889e-05, "loss": 5.7251, "step": 36 }, { "epoch": 0.010521514007950379, "grad_norm": 19.82053565979004, "learning_rate": 4e-05, "loss": 5.5137, "step": 37 }, { "epoch": 0.010805879251408497, "grad_norm": 16.579193115234375, "learning_rate": 3.998850904912382e-05, "loss": 5.1559, "step": 38 }, { "epoch": 0.011090244494866616, "grad_norm": 6.53575325012207, "learning_rate": 3.997701809824763e-05, "loss": 4.8303, "step": 39 }, { "epoch": 0.011374609738324734, "grad_norm": 16.461362838745117, "learning_rate": 3.996552714737145e-05, "loss": 4.6216, "step": 40 }, { "epoch": 0.011658974981782852, "grad_norm": 24.829219818115234, "learning_rate": 3.995403619649526e-05, "loss": 6.8729, "step": 41 }, { "epoch": 0.01194334022524097, "grad_norm": 23.324426651000977, "learning_rate": 3.994254524561908e-05, "loss": 6.3201, "step": 42 }, { "epoch": 0.012227705468699088, "grad_norm": 20.28281021118164, "learning_rate": 3.993105429474289e-05, "loss": 5.8256, "step": 43 }, { "epoch": 0.012512070712157207, "grad_norm": 15.08171272277832, "learning_rate": 3.991956334386671e-05, "loss": 5.3353, "step": 44 }, { "epoch": 0.012796435955615325, "grad_norm": 9.521646499633789, "learning_rate": 3.990807239299052e-05, "loss": 5.1606, "step": 45 }, { "epoch": 0.013080801199073443, "grad_norm": 14.245501518249512, "learning_rate": 3.989658144211434e-05, "loss": 4.9048, "step": 46 }, { "epoch": 0.013365166442531561, "grad_norm": 16.89498519897461, "learning_rate": 3.9885090491238155e-05, "loss": 4.4536, "step": 47 }, { "epoch": 0.01364953168598968, "grad_norm": 11.991838455200195, "learning_rate": 3.9873599540361966e-05, "loss": 4.1927, "step": 48 }, { "epoch": 0.013933896929447798, "grad_norm": 14.163195610046387, "learning_rate": 3.9862108589485784e-05, "loss": 6.2989, "step": 49 }, { "epoch": 0.014218262172905916, "grad_norm": 11.865116119384766, "learning_rate": 3.9850617638609595e-05, "loss": 5.6554, "step": 50 }, { "epoch": 0.014502627416364034, "grad_norm": 8.582511901855469, "learning_rate": 3.983912668773341e-05, "loss": 5.5022, "step": 51 }, { "epoch": 0.014786992659822154, "grad_norm": 9.730727195739746, "learning_rate": 3.9827635736857224e-05, "loss": 4.9733, "step": 52 }, { "epoch": 0.015071357903280272, "grad_norm": 11.268597602844238, "learning_rate": 3.981614478598104e-05, "loss": 4.6759, "step": 53 }, { "epoch": 0.01535572314673839, "grad_norm": 13.982292175292969, "learning_rate": 3.980465383510485e-05, "loss": 4.7606, "step": 54 }, { "epoch": 0.015640088390196507, "grad_norm": 8.659544944763184, "learning_rate": 3.979316288422867e-05, "loss": 4.394, "step": 55 }, { "epoch": 0.015924453633654627, "grad_norm": 5.434732437133789, "learning_rate": 3.978167193335249e-05, "loss": 3.9735, "step": 56 }, { "epoch": 0.016208818877112743, "grad_norm": 5.0773773193359375, "learning_rate": 3.977018098247631e-05, "loss": 5.9472, "step": 57 }, { "epoch": 0.016493184120570863, "grad_norm": 6.471149444580078, "learning_rate": 3.975869003160012e-05, "loss": 5.2329, "step": 58 }, { "epoch": 0.016777549364028983, "grad_norm": 8.178523063659668, "learning_rate": 3.9747199080723936e-05, "loss": 5.1182, "step": 59 }, { "epoch": 0.0170619146074871, "grad_norm": 10.304596900939941, "learning_rate": 3.973570812984775e-05, "loss": 4.8543, "step": 60 }, { "epoch": 0.01734627985094522, "grad_norm": 9.378915786743164, "learning_rate": 3.9724217178971565e-05, "loss": 4.4329, "step": 61 }, { "epoch": 0.017630645094403336, "grad_norm": 7.112333297729492, "learning_rate": 3.971272622809538e-05, "loss": 4.1529, "step": 62 }, { "epoch": 0.017915010337861456, "grad_norm": 6.022056579589844, "learning_rate": 3.9701235277219194e-05, "loss": 4.0711, "step": 63 }, { "epoch": 0.018199375581319573, "grad_norm": 6.005387783050537, "learning_rate": 3.968974432634301e-05, "loss": 3.7158, "step": 64 }, { "epoch": 0.018483740824777693, "grad_norm": 11.265110969543457, "learning_rate": 3.967825337546682e-05, "loss": 5.7308, "step": 65 }, { "epoch": 0.01876810606823581, "grad_norm": 15.99368953704834, "learning_rate": 3.966676242459064e-05, "loss": 5.2962, "step": 66 }, { "epoch": 0.01905247131169393, "grad_norm": 18.645151138305664, "learning_rate": 3.965527147371445e-05, "loss": 5.2347, "step": 67 }, { "epoch": 0.019336836555152045, "grad_norm": 16.649398803710938, "learning_rate": 3.964378052283827e-05, "loss": 4.9301, "step": 68 }, { "epoch": 0.019621201798610165, "grad_norm": 12.267605781555176, "learning_rate": 3.963228957196208e-05, "loss": 4.6981, "step": 69 }, { "epoch": 0.019905567042068282, "grad_norm": 8.383939743041992, "learning_rate": 3.96207986210859e-05, "loss": 4.4809, "step": 70 }, { "epoch": 0.020189932285526402, "grad_norm": 8.573983192443848, "learning_rate": 3.9609307670209717e-05, "loss": 3.8532, "step": 71 }, { "epoch": 0.02047429752898452, "grad_norm": 11.06430435180664, "learning_rate": 3.959781671933353e-05, "loss": 4.1749, "step": 72 }, { "epoch": 0.02075866277244264, "grad_norm": 10.86804485321045, "learning_rate": 3.9586325768457346e-05, "loss": 5.4128, "step": 73 }, { "epoch": 0.021043028015900758, "grad_norm": 10.805957794189453, "learning_rate": 3.957483481758116e-05, "loss": 4.873, "step": 74 }, { "epoch": 0.021327393259358875, "grad_norm": 8.829920768737793, "learning_rate": 3.9563343866704975e-05, "loss": 4.7101, "step": 75 }, { "epoch": 0.021611758502816995, "grad_norm": 7.231428146362305, "learning_rate": 3.9551852915828786e-05, "loss": 4.696, "step": 76 }, { "epoch": 0.02189612374627511, "grad_norm": 5.561997413635254, "learning_rate": 3.9540361964952603e-05, "loss": 4.1954, "step": 77 }, { "epoch": 0.02218048898973323, "grad_norm": 7.825724124908447, "learning_rate": 3.9528871014076415e-05, "loss": 3.9524, "step": 78 }, { "epoch": 0.022464854233191348, "grad_norm": 7.717831611633301, "learning_rate": 3.951738006320023e-05, "loss": 3.5277, "step": 79 }, { "epoch": 0.022749219476649468, "grad_norm": 6.9925618171691895, "learning_rate": 3.950588911232405e-05, "loss": 3.3675, "step": 80 }, { "epoch": 0.023033584720107584, "grad_norm": 6.320919513702393, "learning_rate": 3.949439816144786e-05, "loss": 5.2571, "step": 81 }, { "epoch": 0.023317949963565704, "grad_norm": 5.361174583435059, "learning_rate": 3.948290721057168e-05, "loss": 4.7185, "step": 82 }, { "epoch": 0.02360231520702382, "grad_norm": 4.8023362159729, "learning_rate": 3.947141625969549e-05, "loss": 4.4968, "step": 83 }, { "epoch": 0.02388668045048194, "grad_norm": 6.406538963317871, "learning_rate": 3.945992530881931e-05, "loss": 4.3588, "step": 84 }, { "epoch": 0.024171045693940057, "grad_norm": 9.289493560791016, "learning_rate": 3.944843435794312e-05, "loss": 4.2662, "step": 85 }, { "epoch": 0.024455410937398177, "grad_norm": 9.779614448547363, "learning_rate": 3.943694340706694e-05, "loss": 4.169, "step": 86 }, { "epoch": 0.024739776180856293, "grad_norm": 8.45078182220459, "learning_rate": 3.942545245619075e-05, "loss": 3.7212, "step": 87 }, { "epoch": 0.025024141424314413, "grad_norm": 7.9592084884643555, "learning_rate": 3.9413961505314566e-05, "loss": 3.7354, "step": 88 }, { "epoch": 0.025308506667772533, "grad_norm": 6.674322128295898, "learning_rate": 3.9402470554438384e-05, "loss": 4.912, "step": 89 }, { "epoch": 0.02559287191123065, "grad_norm": 5.126534461975098, "learning_rate": 3.9390979603562195e-05, "loss": 4.7649, "step": 90 }, { "epoch": 0.02587723715468877, "grad_norm": 7.692286014556885, "learning_rate": 3.937948865268601e-05, "loss": 4.4216, "step": 91 }, { "epoch": 0.026161602398146886, "grad_norm": 13.10142993927002, "learning_rate": 3.9367997701809824e-05, "loss": 4.4418, "step": 92 }, { "epoch": 0.026445967641605006, "grad_norm": 13.905843734741211, "learning_rate": 3.935650675093364e-05, "loss": 4.2435, "step": 93 }, { "epoch": 0.026730332885063122, "grad_norm": 9.100831985473633, "learning_rate": 3.934501580005746e-05, "loss": 3.7519, "step": 94 }, { "epoch": 0.027014698128521242, "grad_norm": 6.695760726928711, "learning_rate": 3.933352484918128e-05, "loss": 3.3665, "step": 95 }, { "epoch": 0.02729906337197936, "grad_norm": 3.8529748916625977, "learning_rate": 3.932203389830509e-05, "loss": 3.6497, "step": 96 }, { "epoch": 0.02758342861543748, "grad_norm": 7.971911907196045, "learning_rate": 3.931054294742891e-05, "loss": 4.8833, "step": 97 }, { "epoch": 0.027867793858895595, "grad_norm": 9.789183616638184, "learning_rate": 3.929905199655272e-05, "loss": 4.6882, "step": 98 }, { "epoch": 0.028152159102353715, "grad_norm": 9.128085136413574, "learning_rate": 3.9287561045676536e-05, "loss": 4.4283, "step": 99 }, { "epoch": 0.028436524345811832, "grad_norm": 7.517631530761719, "learning_rate": 3.927607009480035e-05, "loss": 4.0737, "step": 100 }, { "epoch": 0.02872088958926995, "grad_norm": 8.111870765686035, "learning_rate": 3.9264579143924165e-05, "loss": 3.9202, "step": 101 }, { "epoch": 0.029005254832728068, "grad_norm": 6.428286075592041, "learning_rate": 3.9253088193047976e-05, "loss": 3.8114, "step": 102 }, { "epoch": 0.029289620076186188, "grad_norm": 3.650003671646118, "learning_rate": 3.9241597242171794e-05, "loss": 3.4783, "step": 103 }, { "epoch": 0.029573985319644308, "grad_norm": 3.3745474815368652, "learning_rate": 3.923010629129561e-05, "loss": 3.2149, "step": 104 }, { "epoch": 0.029858350563102425, "grad_norm": 6.395346164703369, "learning_rate": 3.921861534041942e-05, "loss": 4.6489, "step": 105 }, { "epoch": 0.030142715806560545, "grad_norm": 4.820883750915527, "learning_rate": 3.920712438954324e-05, "loss": 4.141, "step": 106 }, { "epoch": 0.03042708105001866, "grad_norm": 5.000969409942627, "learning_rate": 3.919563343866705e-05, "loss": 4.0283, "step": 107 }, { "epoch": 0.03071144629347678, "grad_norm": 6.004735469818115, "learning_rate": 3.918414248779087e-05, "loss": 4.0384, "step": 108 }, { "epoch": 0.030995811536934897, "grad_norm": 4.477464199066162, "learning_rate": 3.917265153691468e-05, "loss": 3.8376, "step": 109 }, { "epoch": 0.031280176780393014, "grad_norm": 3.1061179637908936, "learning_rate": 3.91611605860385e-05, "loss": 3.7448, "step": 110 }, { "epoch": 0.03156454202385114, "grad_norm": 4.4171600341796875, "learning_rate": 3.914966963516231e-05, "loss": 3.4045, "step": 111 }, { "epoch": 0.031848907267309254, "grad_norm": 5.526491641998291, "learning_rate": 3.913817868428613e-05, "loss": 3.2656, "step": 112 }, { "epoch": 0.03213327251076737, "grad_norm": 8.430256843566895, "learning_rate": 3.9126687733409946e-05, "loss": 4.759, "step": 113 }, { "epoch": 0.03241763775422549, "grad_norm": 7.523831367492676, "learning_rate": 3.911519678253376e-05, "loss": 4.3822, "step": 114 }, { "epoch": 0.03270200299768361, "grad_norm": 6.166808605194092, "learning_rate": 3.9103705831657575e-05, "loss": 3.8537, "step": 115 }, { "epoch": 0.03298636824114173, "grad_norm": 4.141753196716309, "learning_rate": 3.9092214880781386e-05, "loss": 3.9163, "step": 116 }, { "epoch": 0.03327073348459984, "grad_norm": 4.088106632232666, "learning_rate": 3.9080723929905204e-05, "loss": 3.7164, "step": 117 }, { "epoch": 0.03355509872805797, "grad_norm": 7.6691107749938965, "learning_rate": 3.9069232979029015e-05, "loss": 3.4528, "step": 118 }, { "epoch": 0.03383946397151608, "grad_norm": 8.339359283447266, "learning_rate": 3.905774202815283e-05, "loss": 3.0847, "step": 119 }, { "epoch": 0.0341238292149742, "grad_norm": 5.894288063049316, "learning_rate": 3.9046251077276644e-05, "loss": 3.1057, "step": 120 }, { "epoch": 0.034408194458432316, "grad_norm": 4.960434436798096, "learning_rate": 3.903476012640046e-05, "loss": 4.8553, "step": 121 }, { "epoch": 0.03469255970189044, "grad_norm": 4.705537796020508, "learning_rate": 3.902326917552428e-05, "loss": 4.2162, "step": 122 }, { "epoch": 0.034976924945348556, "grad_norm": 3.077615737915039, "learning_rate": 3.901177822464809e-05, "loss": 4.2706, "step": 123 }, { "epoch": 0.03526129018880667, "grad_norm": 3.1837899684906006, "learning_rate": 3.900028727377191e-05, "loss": 3.9014, "step": 124 }, { "epoch": 0.03554565543226479, "grad_norm": 6.796784400939941, "learning_rate": 3.898879632289572e-05, "loss": 3.7483, "step": 125 }, { "epoch": 0.03583002067572291, "grad_norm": 6.224754810333252, "learning_rate": 3.897730537201954e-05, "loss": 3.6634, "step": 126 }, { "epoch": 0.03611438591918103, "grad_norm": 4.740405559539795, "learning_rate": 3.896581442114335e-05, "loss": 3.378, "step": 127 }, { "epoch": 0.036398751162639145, "grad_norm": 4.660193920135498, "learning_rate": 3.8954323470267167e-05, "loss": 3.2241, "step": 128 }, { "epoch": 0.03668311640609726, "grad_norm": 6.38066291809082, "learning_rate": 3.894283251939098e-05, "loss": 4.5271, "step": 129 }, { "epoch": 0.036967481649555385, "grad_norm": 4.577174186706543, "learning_rate": 3.89313415685148e-05, "loss": 4.356, "step": 130 }, { "epoch": 0.0372518468930135, "grad_norm": 4.075042247772217, "learning_rate": 3.8919850617638613e-05, "loss": 3.8877, "step": 131 }, { "epoch": 0.03753621213647162, "grad_norm": 5.604481220245361, "learning_rate": 3.890835966676243e-05, "loss": 3.8834, "step": 132 }, { "epoch": 0.03782057737992974, "grad_norm": 5.59628438949585, "learning_rate": 3.889686871588624e-05, "loss": 3.7806, "step": 133 }, { "epoch": 0.03810494262338786, "grad_norm": 5.5282206535339355, "learning_rate": 3.888537776501006e-05, "loss": 3.4985, "step": 134 }, { "epoch": 0.038389307866845974, "grad_norm": 5.6480865478515625, "learning_rate": 3.887388681413387e-05, "loss": 3.4003, "step": 135 }, { "epoch": 0.03867367311030409, "grad_norm": 3.6527063846588135, "learning_rate": 3.886239586325769e-05, "loss": 2.9528, "step": 136 }, { "epoch": 0.038958038353762214, "grad_norm": 4.994736671447754, "learning_rate": 3.885090491238151e-05, "loss": 4.6074, "step": 137 }, { "epoch": 0.03924240359722033, "grad_norm": 5.265069484710693, "learning_rate": 3.883941396150532e-05, "loss": 3.9333, "step": 138 }, { "epoch": 0.03952676884067845, "grad_norm": 5.489358901977539, "learning_rate": 3.8827923010629136e-05, "loss": 3.9805, "step": 139 }, { "epoch": 0.039811134084136564, "grad_norm": 5.382353782653809, "learning_rate": 3.881643205975295e-05, "loss": 3.9738, "step": 140 }, { "epoch": 0.04009549932759469, "grad_norm": 6.055698394775391, "learning_rate": 3.8804941108876765e-05, "loss": 3.621, "step": 141 }, { "epoch": 0.040379864571052804, "grad_norm": 5.292023181915283, "learning_rate": 3.8793450158000576e-05, "loss": 3.5441, "step": 142 }, { "epoch": 0.04066422981451092, "grad_norm": 3.348857879638672, "learning_rate": 3.8781959207124394e-05, "loss": 2.9929, "step": 143 }, { "epoch": 0.04094859505796904, "grad_norm": 3.1222856044769287, "learning_rate": 3.8770468256248205e-05, "loss": 2.8722, "step": 144 }, { "epoch": 0.04123296030142716, "grad_norm": 8.240327835083008, "learning_rate": 3.875897730537202e-05, "loss": 4.6144, "step": 145 }, { "epoch": 0.04151732554488528, "grad_norm": 7.63242769241333, "learning_rate": 3.874748635449584e-05, "loss": 4.1743, "step": 146 }, { "epoch": 0.04180169078834339, "grad_norm": 6.464611530303955, "learning_rate": 3.873599540361965e-05, "loss": 3.898, "step": 147 }, { "epoch": 0.042086056031801516, "grad_norm": 6.321760177612305, "learning_rate": 3.872450445274347e-05, "loss": 3.8015, "step": 148 }, { "epoch": 0.04237042127525963, "grad_norm": 4.688291549682617, "learning_rate": 3.871301350186728e-05, "loss": 3.5913, "step": 149 }, { "epoch": 0.04265478651871775, "grad_norm": 3.5380592346191406, "learning_rate": 3.87015225509911e-05, "loss": 3.2963, "step": 150 }, { "epoch": 0.042939151762175866, "grad_norm": 3.186795473098755, "learning_rate": 3.869003160011491e-05, "loss": 3.1665, "step": 151 }, { "epoch": 0.04322351700563399, "grad_norm": 3.351107597351074, "learning_rate": 3.867854064923873e-05, "loss": 3.0804, "step": 152 }, { "epoch": 0.043507882249092106, "grad_norm": 6.562431812286377, "learning_rate": 3.866704969836254e-05, "loss": 4.5577, "step": 153 }, { "epoch": 0.04379224749255022, "grad_norm": 5.137665748596191, "learning_rate": 3.865555874748636e-05, "loss": 3.9937, "step": 154 }, { "epoch": 0.04407661273600834, "grad_norm": 4.792855262756348, "learning_rate": 3.8644067796610175e-05, "loss": 3.6608, "step": 155 }, { "epoch": 0.04436097797946646, "grad_norm": 4.835788726806641, "learning_rate": 3.8632576845733986e-05, "loss": 3.7309, "step": 156 }, { "epoch": 0.04464534322292458, "grad_norm": 4.597768306732178, "learning_rate": 3.8621085894857804e-05, "loss": 3.348, "step": 157 }, { "epoch": 0.044929708466382695, "grad_norm": 4.271462440490723, "learning_rate": 3.8609594943981615e-05, "loss": 3.3393, "step": 158 }, { "epoch": 0.04521407370984081, "grad_norm": 3.781461477279663, "learning_rate": 3.859810399310543e-05, "loss": 3.0811, "step": 159 }, { "epoch": 0.045498438953298935, "grad_norm": 3.5072243213653564, "learning_rate": 3.8586613042229244e-05, "loss": 2.8874, "step": 160 }, { "epoch": 0.04578280419675705, "grad_norm": 4.078586101531982, "learning_rate": 3.857512209135306e-05, "loss": 4.3578, "step": 161 }, { "epoch": 0.04606716944021517, "grad_norm": 4.5630784034729, "learning_rate": 3.856363114047687e-05, "loss": 4.0361, "step": 162 }, { "epoch": 0.04635153468367329, "grad_norm": 4.149646282196045, "learning_rate": 3.855214018960069e-05, "loss": 3.7167, "step": 163 }, { "epoch": 0.04663589992713141, "grad_norm": 3.7590112686157227, "learning_rate": 3.854064923872451e-05, "loss": 3.3838, "step": 164 }, { "epoch": 0.046920265170589524, "grad_norm": 4.184499263763428, "learning_rate": 3.852915828784832e-05, "loss": 3.5132, "step": 165 }, { "epoch": 0.04720463041404764, "grad_norm": 4.422394275665283, "learning_rate": 3.851766733697214e-05, "loss": 3.2853, "step": 166 }, { "epoch": 0.047488995657505764, "grad_norm": 4.050603866577148, "learning_rate": 3.8506176386095956e-05, "loss": 2.9437, "step": 167 }, { "epoch": 0.04777336090096388, "grad_norm": 4.047832489013672, "learning_rate": 3.849468543521977e-05, "loss": 2.949, "step": 168 }, { "epoch": 0.048057726144422, "grad_norm": 4.343886375427246, "learning_rate": 3.8483194484343585e-05, "loss": 4.5678, "step": 169 }, { "epoch": 0.048342091387880114, "grad_norm": 3.9210119247436523, "learning_rate": 3.84717035334674e-05, "loss": 3.8641, "step": 170 }, { "epoch": 0.04862645663133824, "grad_norm": 3.6741321086883545, "learning_rate": 3.8460212582591214e-05, "loss": 3.498, "step": 171 }, { "epoch": 0.048910821874796354, "grad_norm": 3.9248976707458496, "learning_rate": 3.844872163171503e-05, "loss": 3.6691, "step": 172 }, { "epoch": 0.04919518711825447, "grad_norm": 4.195116996765137, "learning_rate": 3.843723068083884e-05, "loss": 3.4649, "step": 173 }, { "epoch": 0.049479552361712587, "grad_norm": 4.228196144104004, "learning_rate": 3.842573972996266e-05, "loss": 3.1212, "step": 174 }, { "epoch": 0.04976391760517071, "grad_norm": 4.049994468688965, "learning_rate": 3.841424877908647e-05, "loss": 3.144, "step": 175 }, { "epoch": 0.050048282848628826, "grad_norm": 3.637624502182007, "learning_rate": 3.840275782821029e-05, "loss": 2.8251, "step": 176 }, { "epoch": 0.05033264809208694, "grad_norm": 3.211629867553711, "learning_rate": 3.83912668773341e-05, "loss": 4.115, "step": 177 }, { "epoch": 0.050617013335545066, "grad_norm": 3.2752835750579834, "learning_rate": 3.837977592645792e-05, "loss": 3.8892, "step": 178 }, { "epoch": 0.05090137857900318, "grad_norm": 3.31851863861084, "learning_rate": 3.8368284975581736e-05, "loss": 3.4402, "step": 179 }, { "epoch": 0.0511857438224613, "grad_norm": 3.9213197231292725, "learning_rate": 3.835679402470555e-05, "loss": 3.3855, "step": 180 }, { "epoch": 0.051470109065919416, "grad_norm": 4.550943851470947, "learning_rate": 3.8345303073829365e-05, "loss": 3.3385, "step": 181 }, { "epoch": 0.05175447430937754, "grad_norm": 3.618424654006958, "learning_rate": 3.8333812122953177e-05, "loss": 3.0774, "step": 182 }, { "epoch": 0.052038839552835656, "grad_norm": 2.8778865337371826, "learning_rate": 3.8322321172076994e-05, "loss": 2.9035, "step": 183 }, { "epoch": 0.05232320479629377, "grad_norm": 2.7407846450805664, "learning_rate": 3.8310830221200806e-05, "loss": 2.8378, "step": 184 }, { "epoch": 0.05260757003975189, "grad_norm": 3.778500556945801, "learning_rate": 3.8299339270324623e-05, "loss": 4.2712, "step": 185 }, { "epoch": 0.05289193528321001, "grad_norm": 4.52739143371582, "learning_rate": 3.8287848319448435e-05, "loss": 3.698, "step": 186 }, { "epoch": 0.05317630052666813, "grad_norm": 4.739940166473389, "learning_rate": 3.827635736857225e-05, "loss": 3.7459, "step": 187 }, { "epoch": 0.053460665770126245, "grad_norm": 4.952041149139404, "learning_rate": 3.826486641769607e-05, "loss": 3.355, "step": 188 }, { "epoch": 0.05374503101358436, "grad_norm": 3.8098363876342773, "learning_rate": 3.825337546681988e-05, "loss": 3.2634, "step": 189 }, { "epoch": 0.054029396257042485, "grad_norm": 3.0846543312072754, "learning_rate": 3.82418845159437e-05, "loss": 3.0904, "step": 190 }, { "epoch": 0.0543137615005006, "grad_norm": 2.909727096557617, "learning_rate": 3.823039356506751e-05, "loss": 2.9777, "step": 191 }, { "epoch": 0.05459812674395872, "grad_norm": 3.1065726280212402, "learning_rate": 3.821890261419133e-05, "loss": 2.7359, "step": 192 }, { "epoch": 0.05488249198741684, "grad_norm": 7.011332988739014, "learning_rate": 3.820741166331514e-05, "loss": 4.007, "step": 193 }, { "epoch": 0.05516685723087496, "grad_norm": 5.220023155212402, "learning_rate": 3.819592071243896e-05, "loss": 3.6895, "step": 194 }, { "epoch": 0.055451222474333074, "grad_norm": 4.016128063201904, "learning_rate": 3.818442976156277e-05, "loss": 3.6332, "step": 195 }, { "epoch": 0.05573558771779119, "grad_norm": 3.669994831085205, "learning_rate": 3.8172938810686586e-05, "loss": 3.395, "step": 196 }, { "epoch": 0.056019952961249314, "grad_norm": 3.2430341243743896, "learning_rate": 3.8161447859810404e-05, "loss": 3.4718, "step": 197 }, { "epoch": 0.05630431820470743, "grad_norm": 2.657214403152466, "learning_rate": 3.8149956908934215e-05, "loss": 3.0473, "step": 198 }, { "epoch": 0.05658868344816555, "grad_norm": 2.9551098346710205, "learning_rate": 3.813846595805803e-05, "loss": 2.7037, "step": 199 }, { "epoch": 0.056873048691623664, "grad_norm": 2.8400704860687256, "learning_rate": 3.8126975007181844e-05, "loss": 2.6874, "step": 200 }, { "epoch": 0.05715741393508179, "grad_norm": 3.2741611003875732, "learning_rate": 3.811548405630566e-05, "loss": 4.1424, "step": 201 }, { "epoch": 0.0574417791785399, "grad_norm": 3.464838981628418, "learning_rate": 3.810399310542947e-05, "loss": 3.7012, "step": 202 }, { "epoch": 0.05772614442199802, "grad_norm": 2.831378698348999, "learning_rate": 3.809250215455329e-05, "loss": 3.4572, "step": 203 }, { "epoch": 0.058010509665456136, "grad_norm": 2.67508864402771, "learning_rate": 3.808101120367711e-05, "loss": 3.5624, "step": 204 }, { "epoch": 0.05829487490891426, "grad_norm": 2.527092218399048, "learning_rate": 3.806952025280093e-05, "loss": 3.2684, "step": 205 }, { "epoch": 0.058579240152372376, "grad_norm": 3.604074478149414, "learning_rate": 3.805802930192474e-05, "loss": 3.1068, "step": 206 }, { "epoch": 0.05886360539583049, "grad_norm": 3.566804885864258, "learning_rate": 3.8046538351048556e-05, "loss": 2.9472, "step": 207 }, { "epoch": 0.059147970639288616, "grad_norm": 3.518846273422241, "learning_rate": 3.803504740017237e-05, "loss": 2.7991, "step": 208 }, { "epoch": 0.05943233588274673, "grad_norm": 5.438695430755615, "learning_rate": 3.8023556449296185e-05, "loss": 4.3441, "step": 209 }, { "epoch": 0.05971670112620485, "grad_norm": 4.187833786010742, "learning_rate": 3.8012065498419996e-05, "loss": 3.7446, "step": 210 }, { "epoch": 0.060001066369662966, "grad_norm": 2.9121296405792236, "learning_rate": 3.8000574547543814e-05, "loss": 3.4443, "step": 211 }, { "epoch": 0.06028543161312109, "grad_norm": 3.0015816688537598, "learning_rate": 3.798908359666763e-05, "loss": 3.2636, "step": 212 }, { "epoch": 0.060569796856579206, "grad_norm": 4.640799045562744, "learning_rate": 3.797759264579144e-05, "loss": 3.5289, "step": 213 }, { "epoch": 0.06085416210003732, "grad_norm": 4.131928443908691, "learning_rate": 3.796610169491526e-05, "loss": 3.0966, "step": 214 }, { "epoch": 0.06113852734349544, "grad_norm": 3.303802251815796, "learning_rate": 3.795461074403907e-05, "loss": 2.6751, "step": 215 }, { "epoch": 0.06142289258695356, "grad_norm": 3.197770595550537, "learning_rate": 3.794311979316289e-05, "loss": 2.8337, "step": 216 }, { "epoch": 0.06170725783041168, "grad_norm": 3.2084145545959473, "learning_rate": 3.79316288422867e-05, "loss": 3.9611, "step": 217 }, { "epoch": 0.061991623073869795, "grad_norm": 2.7938461303710938, "learning_rate": 3.792013789141052e-05, "loss": 3.6372, "step": 218 }, { "epoch": 0.06227598831732791, "grad_norm": 3.456489324569702, "learning_rate": 3.790864694053433e-05, "loss": 3.4849, "step": 219 }, { "epoch": 0.06256035356078603, "grad_norm": 4.275204658508301, "learning_rate": 3.789715598965815e-05, "loss": 3.2791, "step": 220 }, { "epoch": 0.06284471880424415, "grad_norm": 3.960286855697632, "learning_rate": 3.7885665038781966e-05, "loss": 3.3154, "step": 221 }, { "epoch": 0.06312908404770227, "grad_norm": 3.113219738006592, "learning_rate": 3.787417408790578e-05, "loss": 3.0051, "step": 222 }, { "epoch": 0.06341344929116038, "grad_norm": 2.509889841079712, "learning_rate": 3.7862683137029595e-05, "loss": 2.7279, "step": 223 }, { "epoch": 0.06369781453461851, "grad_norm": 2.478565216064453, "learning_rate": 3.7851192186153406e-05, "loss": 2.6428, "step": 224 }, { "epoch": 0.06398217977807663, "grad_norm": 4.031311988830566, "learning_rate": 3.7839701235277224e-05, "loss": 3.8377, "step": 225 }, { "epoch": 0.06426654502153474, "grad_norm": 4.466932773590088, "learning_rate": 3.7828210284401035e-05, "loss": 3.516, "step": 226 }, { "epoch": 0.06455091026499286, "grad_norm": 4.715795040130615, "learning_rate": 3.781671933352485e-05, "loss": 3.545, "step": 227 }, { "epoch": 0.06483527550845097, "grad_norm": 3.541882276535034, "learning_rate": 3.7805228382648664e-05, "loss": 3.2869, "step": 228 }, { "epoch": 0.0651196407519091, "grad_norm": 3.242234945297241, "learning_rate": 3.779373743177248e-05, "loss": 3.0996, "step": 229 }, { "epoch": 0.06540400599536722, "grad_norm": 2.6109092235565186, "learning_rate": 3.77822464808963e-05, "loss": 2.9875, "step": 230 }, { "epoch": 0.06568837123882533, "grad_norm": 2.639467239379883, "learning_rate": 3.777075553002011e-05, "loss": 2.8205, "step": 231 }, { "epoch": 0.06597273648228345, "grad_norm": 2.852081775665283, "learning_rate": 3.775926457914393e-05, "loss": 2.8712, "step": 232 }, { "epoch": 0.06625710172574158, "grad_norm": 4.4990339279174805, "learning_rate": 3.774777362826774e-05, "loss": 3.8808, "step": 233 }, { "epoch": 0.06654146696919969, "grad_norm": 5.339641571044922, "learning_rate": 3.773628267739156e-05, "loss": 3.8167, "step": 234 }, { "epoch": 0.06682583221265781, "grad_norm": 3.1748464107513428, "learning_rate": 3.772479172651537e-05, "loss": 3.5924, "step": 235 }, { "epoch": 0.06711019745611593, "grad_norm": 3.4898841381073, "learning_rate": 3.7713300775639187e-05, "loss": 3.2489, "step": 236 }, { "epoch": 0.06739456269957404, "grad_norm": 3.2079555988311768, "learning_rate": 3.7701809824763e-05, "loss": 3.1664, "step": 237 }, { "epoch": 0.06767892794303217, "grad_norm": 2.7962253093719482, "learning_rate": 3.7690318873886816e-05, "loss": 2.8651, "step": 238 }, { "epoch": 0.06796329318649028, "grad_norm": 3.1500935554504395, "learning_rate": 3.767882792301063e-05, "loss": 2.7629, "step": 239 }, { "epoch": 0.0682476584299484, "grad_norm": 2.5772945880889893, "learning_rate": 3.7667336972134444e-05, "loss": 2.6508, "step": 240 }, { "epoch": 0.06853202367340652, "grad_norm": 4.086240291595459, "learning_rate": 3.765584602125826e-05, "loss": 3.8843, "step": 241 }, { "epoch": 0.06881638891686463, "grad_norm": 2.856458902359009, "learning_rate": 3.764435507038208e-05, "loss": 3.4318, "step": 242 }, { "epoch": 0.06910075416032276, "grad_norm": 3.179042100906372, "learning_rate": 3.763286411950589e-05, "loss": 3.0556, "step": 243 }, { "epoch": 0.06938511940378088, "grad_norm": 3.2401700019836426, "learning_rate": 3.762137316862971e-05, "loss": 3.3399, "step": 244 }, { "epoch": 0.06966948464723899, "grad_norm": 2.8616445064544678, "learning_rate": 3.760988221775353e-05, "loss": 3.0333, "step": 245 }, { "epoch": 0.06995384989069711, "grad_norm": 2.7594053745269775, "learning_rate": 3.759839126687734e-05, "loss": 2.9015, "step": 246 }, { "epoch": 0.07023821513415522, "grad_norm": 2.3787291049957275, "learning_rate": 3.7586900316001156e-05, "loss": 2.6074, "step": 247 }, { "epoch": 0.07052258037761334, "grad_norm": 2.604524612426758, "learning_rate": 3.757540936512497e-05, "loss": 2.5631, "step": 248 }, { "epoch": 0.07080694562107147, "grad_norm": 5.74310302734375, "learning_rate": 3.7563918414248785e-05, "loss": 3.9635, "step": 249 }, { "epoch": 0.07109131086452958, "grad_norm": 4.676276206970215, "learning_rate": 3.7552427463372596e-05, "loss": 3.4952, "step": 250 }, { "epoch": 0.0713756761079877, "grad_norm": 3.3065385818481445, "learning_rate": 3.7540936512496414e-05, "loss": 3.2999, "step": 251 }, { "epoch": 0.07166004135144582, "grad_norm": 2.9583752155303955, "learning_rate": 3.7529445561620225e-05, "loss": 3.2314, "step": 252 }, { "epoch": 0.07194440659490393, "grad_norm": 3.083742380142212, "learning_rate": 3.751795461074404e-05, "loss": 3.0786, "step": 253 }, { "epoch": 0.07222877183836206, "grad_norm": 3.241976261138916, "learning_rate": 3.750646365986786e-05, "loss": 2.8807, "step": 254 }, { "epoch": 0.07251313708182018, "grad_norm": 2.7543532848358154, "learning_rate": 3.749497270899167e-05, "loss": 2.8752, "step": 255 }, { "epoch": 0.07279750232527829, "grad_norm": 2.6907711029052734, "learning_rate": 3.748348175811549e-05, "loss": 2.7082, "step": 256 }, { "epoch": 0.07308186756873641, "grad_norm": 3.050748586654663, "learning_rate": 3.74719908072393e-05, "loss": 3.8712, "step": 257 }, { "epoch": 0.07336623281219452, "grad_norm": 3.0048818588256836, "learning_rate": 3.746049985636312e-05, "loss": 3.4344, "step": 258 }, { "epoch": 0.07365059805565265, "grad_norm": 2.973074197769165, "learning_rate": 3.744900890548693e-05, "loss": 3.1367, "step": 259 }, { "epoch": 0.07393496329911077, "grad_norm": 3.0169665813446045, "learning_rate": 3.743751795461075e-05, "loss": 3.4544, "step": 260 }, { "epoch": 0.07421932854256888, "grad_norm": 2.8069064617156982, "learning_rate": 3.742602700373456e-05, "loss": 3.1697, "step": 261 }, { "epoch": 0.074503693786027, "grad_norm": 2.524655818939209, "learning_rate": 3.741453605285838e-05, "loss": 2.9794, "step": 262 }, { "epoch": 0.07478805902948513, "grad_norm": 2.5447611808776855, "learning_rate": 3.7403045101982195e-05, "loss": 2.5633, "step": 263 }, { "epoch": 0.07507242427294324, "grad_norm": 3.1086103916168213, "learning_rate": 3.7391554151106006e-05, "loss": 2.6274, "step": 264 }, { "epoch": 0.07535678951640136, "grad_norm": 3.7446553707122803, "learning_rate": 3.7380063200229824e-05, "loss": 4.0418, "step": 265 }, { "epoch": 0.07564115475985948, "grad_norm": 3.5950560569763184, "learning_rate": 3.7368572249353635e-05, "loss": 3.3891, "step": 266 }, { "epoch": 0.07592552000331759, "grad_norm": 3.037245273590088, "learning_rate": 3.735708129847745e-05, "loss": 3.389, "step": 267 }, { "epoch": 0.07620988524677572, "grad_norm": 2.6339914798736572, "learning_rate": 3.7345590347601264e-05, "loss": 3.0648, "step": 268 }, { "epoch": 0.07649425049023383, "grad_norm": 2.379084587097168, "learning_rate": 3.733409939672508e-05, "loss": 2.9169, "step": 269 }, { "epoch": 0.07677861573369195, "grad_norm": 2.9946842193603516, "learning_rate": 3.732260844584889e-05, "loss": 2.7304, "step": 270 }, { "epoch": 0.07706298097715007, "grad_norm": 3.1141395568847656, "learning_rate": 3.731111749497271e-05, "loss": 2.7589, "step": 271 }, { "epoch": 0.07734734622060818, "grad_norm": 2.931774854660034, "learning_rate": 3.729962654409653e-05, "loss": 2.61, "step": 272 }, { "epoch": 0.0776317114640663, "grad_norm": 4.752538681030273, "learning_rate": 3.728813559322034e-05, "loss": 3.82, "step": 273 }, { "epoch": 0.07791607670752443, "grad_norm": 3.4177722930908203, "learning_rate": 3.727664464234416e-05, "loss": 3.3525, "step": 274 }, { "epoch": 0.07820044195098254, "grad_norm": 2.749371290206909, "learning_rate": 3.726515369146797e-05, "loss": 3.1633, "step": 275 }, { "epoch": 0.07848480719444066, "grad_norm": 3.0202836990356445, "learning_rate": 3.725366274059179e-05, "loss": 3.2241, "step": 276 }, { "epoch": 0.07876917243789877, "grad_norm": 3.951889753341675, "learning_rate": 3.72421717897156e-05, "loss": 3.1181, "step": 277 }, { "epoch": 0.0790535376813569, "grad_norm": 3.7062735557556152, "learning_rate": 3.723068083883942e-05, "loss": 2.924, "step": 278 }, { "epoch": 0.07933790292481502, "grad_norm": 2.9112234115600586, "learning_rate": 3.7219189887963234e-05, "loss": 2.6434, "step": 279 }, { "epoch": 0.07962226816827313, "grad_norm": 2.5431931018829346, "learning_rate": 3.720769893708705e-05, "loss": 2.5729, "step": 280 }, { "epoch": 0.07990663341173125, "grad_norm": 3.293665647506714, "learning_rate": 3.719620798621086e-05, "loss": 3.6881, "step": 281 }, { "epoch": 0.08019099865518937, "grad_norm": 3.0450286865234375, "learning_rate": 3.718471703533468e-05, "loss": 3.6725, "step": 282 }, { "epoch": 0.08047536389864748, "grad_norm": 2.8695693016052246, "learning_rate": 3.717322608445849e-05, "loss": 3.4356, "step": 283 }, { "epoch": 0.08075972914210561, "grad_norm": 2.7940287590026855, "learning_rate": 3.716173513358231e-05, "loss": 3.1586, "step": 284 }, { "epoch": 0.08104409438556373, "grad_norm": 3.127352476119995, "learning_rate": 3.715024418270612e-05, "loss": 3.0592, "step": 285 }, { "epoch": 0.08132845962902184, "grad_norm": 2.77877140045166, "learning_rate": 3.713875323182994e-05, "loss": 2.6598, "step": 286 }, { "epoch": 0.08161282487247996, "grad_norm": 2.7355434894561768, "learning_rate": 3.7127262280953756e-05, "loss": 2.5461, "step": 287 }, { "epoch": 0.08189719011593807, "grad_norm": 2.800222873687744, "learning_rate": 3.711577133007757e-05, "loss": 2.6196, "step": 288 }, { "epoch": 0.0821815553593962, "grad_norm": 3.4335219860076904, "learning_rate": 3.7104280379201385e-05, "loss": 3.6921, "step": 289 }, { "epoch": 0.08246592060285432, "grad_norm": 3.0871481895446777, "learning_rate": 3.7092789428325196e-05, "loss": 3.2295, "step": 290 }, { "epoch": 0.08275028584631243, "grad_norm": 3.372642993927002, "learning_rate": 3.7081298477449014e-05, "loss": 3.5505, "step": 291 }, { "epoch": 0.08303465108977055, "grad_norm": 2.830824613571167, "learning_rate": 3.7069807526572825e-05, "loss": 3.1795, "step": 292 }, { "epoch": 0.08331901633322868, "grad_norm": 2.850874423980713, "learning_rate": 3.705831657569664e-05, "loss": 2.9981, "step": 293 }, { "epoch": 0.08360338157668679, "grad_norm": 2.798980712890625, "learning_rate": 3.7046825624820454e-05, "loss": 2.7752, "step": 294 }, { "epoch": 0.08388774682014491, "grad_norm": 2.3986728191375732, "learning_rate": 3.703533467394427e-05, "loss": 2.4156, "step": 295 }, { "epoch": 0.08417211206360303, "grad_norm": 2.329164505004883, "learning_rate": 3.702384372306809e-05, "loss": 2.5282, "step": 296 }, { "epoch": 0.08445647730706114, "grad_norm": 3.428501844406128, "learning_rate": 3.70123527721919e-05, "loss": 3.8649, "step": 297 }, { "epoch": 0.08474084255051927, "grad_norm": 3.2161340713500977, "learning_rate": 3.700086182131572e-05, "loss": 3.4523, "step": 298 }, { "epoch": 0.08502520779397738, "grad_norm": 2.801135778427124, "learning_rate": 3.698937087043953e-05, "loss": 3.3231, "step": 299 }, { "epoch": 0.0853095730374355, "grad_norm": 2.886110544204712, "learning_rate": 3.697787991956335e-05, "loss": 2.918, "step": 300 }, { "epoch": 0.08559393828089362, "grad_norm": 3.050361156463623, "learning_rate": 3.696638896868716e-05, "loss": 2.9013, "step": 301 }, { "epoch": 0.08587830352435173, "grad_norm": 2.4915997982025146, "learning_rate": 3.695489801781098e-05, "loss": 2.9062, "step": 302 }, { "epoch": 0.08616266876780986, "grad_norm": 2.6376094818115234, "learning_rate": 3.694340706693479e-05, "loss": 2.8681, "step": 303 }, { "epoch": 0.08644703401126798, "grad_norm": 2.602095603942871, "learning_rate": 3.6931916116058606e-05, "loss": 2.6519, "step": 304 }, { "epoch": 0.08673139925472609, "grad_norm": 3.274116277694702, "learning_rate": 3.6920425165182424e-05, "loss": 3.812, "step": 305 }, { "epoch": 0.08701576449818421, "grad_norm": 2.873872756958008, "learning_rate": 3.6908934214306235e-05, "loss": 3.3963, "step": 306 }, { "epoch": 0.08730012974164232, "grad_norm": 2.432584047317505, "learning_rate": 3.689744326343005e-05, "loss": 3.2271, "step": 307 }, { "epoch": 0.08758449498510044, "grad_norm": 2.374149799346924, "learning_rate": 3.6885952312553864e-05, "loss": 3.4667, "step": 308 }, { "epoch": 0.08786886022855857, "grad_norm": 2.5668742656707764, "learning_rate": 3.687446136167768e-05, "loss": 2.9856, "step": 309 }, { "epoch": 0.08815322547201668, "grad_norm": 2.4312264919281006, "learning_rate": 3.686297041080149e-05, "loss": 2.6518, "step": 310 }, { "epoch": 0.0884375907154748, "grad_norm": 2.417278528213501, "learning_rate": 3.685147945992531e-05, "loss": 2.5552, "step": 311 }, { "epoch": 0.08872195595893292, "grad_norm": 2.399350166320801, "learning_rate": 3.683998850904912e-05, "loss": 2.5939, "step": 312 }, { "epoch": 0.08900632120239103, "grad_norm": 4.476291656494141, "learning_rate": 3.682849755817294e-05, "loss": 3.7203, "step": 313 }, { "epoch": 0.08929068644584916, "grad_norm": 2.964276075363159, "learning_rate": 3.681700660729676e-05, "loss": 3.3704, "step": 314 }, { "epoch": 0.08957505168930728, "grad_norm": 2.778978109359741, "learning_rate": 3.6805515656420576e-05, "loss": 3.3189, "step": 315 }, { "epoch": 0.08985941693276539, "grad_norm": 3.131481647491455, "learning_rate": 3.679402470554439e-05, "loss": 3.2355, "step": 316 }, { "epoch": 0.09014378217622351, "grad_norm": 3.234072208404541, "learning_rate": 3.6782533754668205e-05, "loss": 2.8635, "step": 317 }, { "epoch": 0.09042814741968162, "grad_norm": 3.1541082859039307, "learning_rate": 3.6771042803792016e-05, "loss": 2.7849, "step": 318 }, { "epoch": 0.09071251266313975, "grad_norm": 2.8236806392669678, "learning_rate": 3.6759551852915834e-05, "loss": 2.7041, "step": 319 }, { "epoch": 0.09099687790659787, "grad_norm": 2.4180235862731934, "learning_rate": 3.674806090203965e-05, "loss": 2.4594, "step": 320 }, { "epoch": 0.09128124315005598, "grad_norm": 4.0737080574035645, "learning_rate": 3.673656995116346e-05, "loss": 3.5109, "step": 321 }, { "epoch": 0.0915656083935141, "grad_norm": 4.438284873962402, "learning_rate": 3.672507900028728e-05, "loss": 3.2919, "step": 322 }, { "epoch": 0.09184997363697223, "grad_norm": 4.133691310882568, "learning_rate": 3.671358804941109e-05, "loss": 3.0562, "step": 323 }, { "epoch": 0.09213433888043034, "grad_norm": 4.211462020874023, "learning_rate": 3.670209709853491e-05, "loss": 3.2076, "step": 324 }, { "epoch": 0.09241870412388846, "grad_norm": 3.6107709407806396, "learning_rate": 3.669060614765872e-05, "loss": 3.003, "step": 325 }, { "epoch": 0.09270306936734658, "grad_norm": 2.793802261352539, "learning_rate": 3.667911519678254e-05, "loss": 2.8852, "step": 326 }, { "epoch": 0.09298743461080469, "grad_norm": 2.2556843757629395, "learning_rate": 3.666762424590635e-05, "loss": 2.5742, "step": 327 }, { "epoch": 0.09327179985426282, "grad_norm": 2.088768243789673, "learning_rate": 3.665613329503017e-05, "loss": 2.5716, "step": 328 }, { "epoch": 0.09355616509772093, "grad_norm": 4.9838056564331055, "learning_rate": 3.6644642344153986e-05, "loss": 3.7839, "step": 329 }, { "epoch": 0.09384053034117905, "grad_norm": 3.877955436706543, "learning_rate": 3.66331513932778e-05, "loss": 3.2162, "step": 330 }, { "epoch": 0.09412489558463717, "grad_norm": 3.442711353302002, "learning_rate": 3.6621660442401615e-05, "loss": 3.2394, "step": 331 }, { "epoch": 0.09440926082809528, "grad_norm": 3.3739242553710938, "learning_rate": 3.6610169491525426e-05, "loss": 3.1204, "step": 332 }, { "epoch": 0.0946936260715534, "grad_norm": 3.4325954914093018, "learning_rate": 3.6598678540649244e-05, "loss": 2.7708, "step": 333 }, { "epoch": 0.09497799131501153, "grad_norm": 2.4555609226226807, "learning_rate": 3.6587187589773055e-05, "loss": 2.5941, "step": 334 }, { "epoch": 0.09526235655846964, "grad_norm": 2.201622724533081, "learning_rate": 3.657569663889687e-05, "loss": 2.5229, "step": 335 }, { "epoch": 0.09554672180192776, "grad_norm": 2.3770065307617188, "learning_rate": 3.6564205688020684e-05, "loss": 2.4801, "step": 336 }, { "epoch": 0.09583108704538587, "grad_norm": 3.16601300239563, "learning_rate": 3.65527147371445e-05, "loss": 3.9019, "step": 337 }, { "epoch": 0.096115452288844, "grad_norm": 2.69059419631958, "learning_rate": 3.654122378626832e-05, "loss": 3.2074, "step": 338 }, { "epoch": 0.09639981753230212, "grad_norm": 2.622850179672241, "learning_rate": 3.652973283539213e-05, "loss": 3.1565, "step": 339 }, { "epoch": 0.09668418277576023, "grad_norm": 2.384047746658325, "learning_rate": 3.651824188451595e-05, "loss": 2.9815, "step": 340 }, { "epoch": 0.09696854801921835, "grad_norm": 2.255505084991455, "learning_rate": 3.650675093363976e-05, "loss": 3.0731, "step": 341 }, { "epoch": 0.09725291326267647, "grad_norm": 2.502235174179077, "learning_rate": 3.649525998276358e-05, "loss": 2.795, "step": 342 }, { "epoch": 0.09753727850613458, "grad_norm": 2.6988747119903564, "learning_rate": 3.648376903188739e-05, "loss": 2.6843, "step": 343 }, { "epoch": 0.09782164374959271, "grad_norm": 2.856539011001587, "learning_rate": 3.6472278081011206e-05, "loss": 2.4656, "step": 344 }, { "epoch": 0.09810600899305083, "grad_norm": 3.2413103580474854, "learning_rate": 3.646078713013502e-05, "loss": 3.8422, "step": 345 }, { "epoch": 0.09839037423650894, "grad_norm": 2.434211254119873, "learning_rate": 3.6449296179258835e-05, "loss": 3.0491, "step": 346 }, { "epoch": 0.09867473947996706, "grad_norm": 2.2801740169525146, "learning_rate": 3.643780522838265e-05, "loss": 3.0232, "step": 347 }, { "epoch": 0.09895910472342517, "grad_norm": 2.3498308658599854, "learning_rate": 3.6426314277506464e-05, "loss": 2.9397, "step": 348 }, { "epoch": 0.0992434699668833, "grad_norm": 2.8017773628234863, "learning_rate": 3.641482332663028e-05, "loss": 2.8337, "step": 349 }, { "epoch": 0.09952783521034142, "grad_norm": 2.93746018409729, "learning_rate": 3.6403332375754093e-05, "loss": 2.7192, "step": 350 }, { "epoch": 0.09981220045379953, "grad_norm": 3.0276105403900146, "learning_rate": 3.639184142487791e-05, "loss": 2.3286, "step": 351 }, { "epoch": 0.10009656569725765, "grad_norm": 2.725423812866211, "learning_rate": 3.638035047400173e-05, "loss": 2.298, "step": 352 }, { "epoch": 0.10038093094071578, "grad_norm": 2.9816744327545166, "learning_rate": 3.636885952312554e-05, "loss": 3.6773, "step": 353 }, { "epoch": 0.10066529618417389, "grad_norm": 2.4625086784362793, "learning_rate": 3.635736857224936e-05, "loss": 3.2648, "step": 354 }, { "epoch": 0.10094966142763201, "grad_norm": 2.325002670288086, "learning_rate": 3.6345877621373176e-05, "loss": 2.9206, "step": 355 }, { "epoch": 0.10123402667109013, "grad_norm": 2.643895149230957, "learning_rate": 3.633438667049699e-05, "loss": 2.6485, "step": 356 }, { "epoch": 0.10151839191454824, "grad_norm": 2.9441092014312744, "learning_rate": 3.6322895719620805e-05, "loss": 2.8929, "step": 357 }, { "epoch": 0.10180275715800637, "grad_norm": 2.6018552780151367, "learning_rate": 3.6311404768744616e-05, "loss": 2.4757, "step": 358 }, { "epoch": 0.10208712240146448, "grad_norm": 2.31733775138855, "learning_rate": 3.6299913817868434e-05, "loss": 2.6008, "step": 359 }, { "epoch": 0.1023714876449226, "grad_norm": 2.3203630447387695, "learning_rate": 3.6288422866992245e-05, "loss": 2.4726, "step": 360 }, { "epoch": 0.10265585288838072, "grad_norm": 3.1516456604003906, "learning_rate": 3.627693191611606e-05, "loss": 3.8334, "step": 361 }, { "epoch": 0.10294021813183883, "grad_norm": 2.976469039916992, "learning_rate": 3.626544096523988e-05, "loss": 3.2071, "step": 362 }, { "epoch": 0.10322458337529695, "grad_norm": 2.7915210723876953, "learning_rate": 3.625395001436369e-05, "loss": 3.0369, "step": 363 }, { "epoch": 0.10350894861875508, "grad_norm": 3.034956932067871, "learning_rate": 3.624245906348751e-05, "loss": 3.1236, "step": 364 }, { "epoch": 0.10379331386221319, "grad_norm": 2.835688591003418, "learning_rate": 3.623096811261132e-05, "loss": 2.8524, "step": 365 }, { "epoch": 0.10407767910567131, "grad_norm": 2.3832132816314697, "learning_rate": 3.621947716173514e-05, "loss": 2.4496, "step": 366 }, { "epoch": 0.10436204434912942, "grad_norm": 2.339113473892212, "learning_rate": 3.620798621085895e-05, "loss": 2.4954, "step": 367 }, { "epoch": 0.10464640959258754, "grad_norm": 2.407395362854004, "learning_rate": 3.619649525998277e-05, "loss": 2.568, "step": 368 }, { "epoch": 0.10493077483604567, "grad_norm": 4.512346267700195, "learning_rate": 3.618500430910658e-05, "loss": 3.5987, "step": 369 }, { "epoch": 0.10521514007950378, "grad_norm": 3.320215940475464, "learning_rate": 3.61735133582304e-05, "loss": 3.0956, "step": 370 }, { "epoch": 0.1054995053229619, "grad_norm": 3.0713627338409424, "learning_rate": 3.6162022407354215e-05, "loss": 3.1864, "step": 371 }, { "epoch": 0.10578387056642002, "grad_norm": 2.845768690109253, "learning_rate": 3.6150531456478026e-05, "loss": 3.0029, "step": 372 }, { "epoch": 0.10606823580987813, "grad_norm": 2.5338187217712402, "learning_rate": 3.6139040505601844e-05, "loss": 2.6617, "step": 373 }, { "epoch": 0.10635260105333626, "grad_norm": 2.5313353538513184, "learning_rate": 3.6127549554725655e-05, "loss": 2.5754, "step": 374 }, { "epoch": 0.10663696629679438, "grad_norm": 2.485309362411499, "learning_rate": 3.611605860384947e-05, "loss": 2.4186, "step": 375 }, { "epoch": 0.10692133154025249, "grad_norm": 2.544818878173828, "learning_rate": 3.6104567652973284e-05, "loss": 2.5501, "step": 376 }, { "epoch": 0.10720569678371061, "grad_norm": 2.9218149185180664, "learning_rate": 3.60930767020971e-05, "loss": 3.7049, "step": 377 }, { "epoch": 0.10749006202716872, "grad_norm": 2.9214773178100586, "learning_rate": 3.608158575122091e-05, "loss": 3.0877, "step": 378 }, { "epoch": 0.10777442727062685, "grad_norm": 2.4883925914764404, "learning_rate": 3.607009480034473e-05, "loss": 3.1767, "step": 379 }, { "epoch": 0.10805879251408497, "grad_norm": 2.176980972290039, "learning_rate": 3.605860384946855e-05, "loss": 2.9752, "step": 380 }, { "epoch": 0.10834315775754308, "grad_norm": 2.442439317703247, "learning_rate": 3.604711289859236e-05, "loss": 3.0929, "step": 381 }, { "epoch": 0.1086275230010012, "grad_norm": 2.6657745838165283, "learning_rate": 3.603562194771618e-05, "loss": 2.6189, "step": 382 }, { "epoch": 0.10891188824445933, "grad_norm": 2.3688509464263916, "learning_rate": 3.602413099683999e-05, "loss": 2.5404, "step": 383 }, { "epoch": 0.10919625348791744, "grad_norm": 2.235816717147827, "learning_rate": 3.601264004596381e-05, "loss": 2.4137, "step": 384 }, { "epoch": 0.10948061873137556, "grad_norm": 3.8358049392700195, "learning_rate": 3.600114909508762e-05, "loss": 3.5268, "step": 385 }, { "epoch": 0.10976498397483368, "grad_norm": 2.5932207107543945, "learning_rate": 3.5989658144211436e-05, "loss": 3.1315, "step": 386 }, { "epoch": 0.11004934921829179, "grad_norm": 2.4010274410247803, "learning_rate": 3.597816719333525e-05, "loss": 3.072, "step": 387 }, { "epoch": 0.11033371446174992, "grad_norm": 2.247072219848633, "learning_rate": 3.5966676242459065e-05, "loss": 2.5931, "step": 388 }, { "epoch": 0.11061807970520803, "grad_norm": 3.017591953277588, "learning_rate": 3.595518529158288e-05, "loss": 2.8756, "step": 389 }, { "epoch": 0.11090244494866615, "grad_norm": 2.5938799381256104, "learning_rate": 3.59436943407067e-05, "loss": 2.869, "step": 390 }, { "epoch": 0.11118681019212427, "grad_norm": 2.039341926574707, "learning_rate": 3.593220338983051e-05, "loss": 2.4231, "step": 391 }, { "epoch": 0.11147117543558238, "grad_norm": 2.2805583477020264, "learning_rate": 3.592071243895433e-05, "loss": 2.377, "step": 392 }, { "epoch": 0.1117555406790405, "grad_norm": 2.4253814220428467, "learning_rate": 3.590922148807814e-05, "loss": 3.549, "step": 393 }, { "epoch": 0.11203990592249863, "grad_norm": 2.2920830249786377, "learning_rate": 3.589773053720196e-05, "loss": 3.2533, "step": 394 }, { "epoch": 0.11232427116595674, "grad_norm": 2.4961812496185303, "learning_rate": 3.588623958632577e-05, "loss": 2.8119, "step": 395 }, { "epoch": 0.11260863640941486, "grad_norm": 2.3366754055023193, "learning_rate": 3.587474863544959e-05, "loss": 3.0371, "step": 396 }, { "epoch": 0.11289300165287297, "grad_norm": 2.443265914916992, "learning_rate": 3.5863257684573405e-05, "loss": 2.731, "step": 397 }, { "epoch": 0.1131773668963311, "grad_norm": 2.1697330474853516, "learning_rate": 3.5851766733697216e-05, "loss": 2.548, "step": 398 }, { "epoch": 0.11346173213978922, "grad_norm": 2.0168380737304688, "learning_rate": 3.5840275782821034e-05, "loss": 2.4326, "step": 399 }, { "epoch": 0.11374609738324733, "grad_norm": 2.136674642562866, "learning_rate": 3.5828784831944845e-05, "loss": 2.3445, "step": 400 }, { "epoch": 0.11403046262670545, "grad_norm": 2.96239972114563, "learning_rate": 3.581729388106866e-05, "loss": 3.7026, "step": 401 }, { "epoch": 0.11431482787016357, "grad_norm": 2.7048444747924805, "learning_rate": 3.5805802930192474e-05, "loss": 3.0997, "step": 402 }, { "epoch": 0.11459919311362168, "grad_norm": 2.455439805984497, "learning_rate": 3.579431197931629e-05, "loss": 2.9647, "step": 403 }, { "epoch": 0.1148835583570798, "grad_norm": 2.2661333084106445, "learning_rate": 3.57828210284401e-05, "loss": 2.7126, "step": 404 }, { "epoch": 0.11516792360053793, "grad_norm": 2.03705096244812, "learning_rate": 3.577133007756392e-05, "loss": 2.755, "step": 405 }, { "epoch": 0.11545228884399604, "grad_norm": 2.140838384628296, "learning_rate": 3.575983912668774e-05, "loss": 2.5794, "step": 406 }, { "epoch": 0.11573665408745416, "grad_norm": 2.352357864379883, "learning_rate": 3.574834817581155e-05, "loss": 2.64, "step": 407 }, { "epoch": 0.11602101933091227, "grad_norm": 2.3507237434387207, "learning_rate": 3.573685722493537e-05, "loss": 2.2045, "step": 408 }, { "epoch": 0.1163053845743704, "grad_norm": 4.098755359649658, "learning_rate": 3.572536627405918e-05, "loss": 3.437, "step": 409 }, { "epoch": 0.11658974981782852, "grad_norm": 2.8039045333862305, "learning_rate": 3.5713875323183e-05, "loss": 3.2256, "step": 410 }, { "epoch": 0.11687411506128663, "grad_norm": 2.3482019901275635, "learning_rate": 3.570238437230681e-05, "loss": 2.9395, "step": 411 }, { "epoch": 0.11715848030474475, "grad_norm": 2.0725553035736084, "learning_rate": 3.5690893421430626e-05, "loss": 2.8758, "step": 412 }, { "epoch": 0.11744284554820288, "grad_norm": 2.3215219974517822, "learning_rate": 3.567940247055444e-05, "loss": 2.7822, "step": 413 }, { "epoch": 0.11772721079166099, "grad_norm": 2.770766258239746, "learning_rate": 3.5667911519678255e-05, "loss": 2.5678, "step": 414 }, { "epoch": 0.11801157603511911, "grad_norm": 2.5529537200927734, "learning_rate": 3.565642056880207e-05, "loss": 2.429, "step": 415 }, { "epoch": 0.11829594127857723, "grad_norm": 2.2458651065826416, "learning_rate": 3.5644929617925884e-05, "loss": 2.1978, "step": 416 }, { "epoch": 0.11858030652203534, "grad_norm": 2.636418581008911, "learning_rate": 3.56334386670497e-05, "loss": 3.3172, "step": 417 }, { "epoch": 0.11886467176549347, "grad_norm": 2.1322054862976074, "learning_rate": 3.562194771617351e-05, "loss": 3.2257, "step": 418 }, { "epoch": 0.11914903700895157, "grad_norm": 2.089116334915161, "learning_rate": 3.561045676529733e-05, "loss": 3.1797, "step": 419 }, { "epoch": 0.1194334022524097, "grad_norm": 2.1360151767730713, "learning_rate": 3.559896581442114e-05, "loss": 2.8716, "step": 420 }, { "epoch": 0.11971776749586782, "grad_norm": 2.4757323265075684, "learning_rate": 3.558747486354496e-05, "loss": 2.9043, "step": 421 }, { "epoch": 0.12000213273932593, "grad_norm": 2.3627967834472656, "learning_rate": 3.557598391266877e-05, "loss": 2.4626, "step": 422 }, { "epoch": 0.12028649798278405, "grad_norm": 2.1213998794555664, "learning_rate": 3.556449296179259e-05, "loss": 2.5062, "step": 423 }, { "epoch": 0.12057086322624218, "grad_norm": 2.1772518157958984, "learning_rate": 3.555300201091641e-05, "loss": 2.3489, "step": 424 }, { "epoch": 0.12085522846970029, "grad_norm": 3.624419689178467, "learning_rate": 3.554151106004022e-05, "loss": 3.6556, "step": 425 }, { "epoch": 0.12113959371315841, "grad_norm": 2.4470715522766113, "learning_rate": 3.5530020109164036e-05, "loss": 3.1925, "step": 426 }, { "epoch": 0.12142395895661652, "grad_norm": 2.2214372158050537, "learning_rate": 3.5518529158287854e-05, "loss": 2.9998, "step": 427 }, { "epoch": 0.12170832420007464, "grad_norm": 2.347656011581421, "learning_rate": 3.5507038207411665e-05, "loss": 2.8243, "step": 428 }, { "epoch": 0.12199268944353277, "grad_norm": 2.0817830562591553, "learning_rate": 3.549554725653548e-05, "loss": 2.8815, "step": 429 }, { "epoch": 0.12227705468699088, "grad_norm": 1.9819862842559814, "learning_rate": 3.54840563056593e-05, "loss": 2.751, "step": 430 }, { "epoch": 0.122561419930449, "grad_norm": 2.2460458278656006, "learning_rate": 3.547256535478311e-05, "loss": 2.6482, "step": 431 }, { "epoch": 0.12284578517390712, "grad_norm": 2.3552284240722656, "learning_rate": 3.546107440390693e-05, "loss": 2.5644, "step": 432 }, { "epoch": 0.12313015041736523, "grad_norm": 2.6936285495758057, "learning_rate": 3.544958345303074e-05, "loss": 3.5913, "step": 433 }, { "epoch": 0.12341451566082336, "grad_norm": 2.459479570388794, "learning_rate": 3.543809250215456e-05, "loss": 2.9851, "step": 434 }, { "epoch": 0.12369888090428148, "grad_norm": 2.364466667175293, "learning_rate": 3.542660155127837e-05, "loss": 2.9548, "step": 435 }, { "epoch": 0.12398324614773959, "grad_norm": 2.216580629348755, "learning_rate": 3.541511060040219e-05, "loss": 2.9091, "step": 436 }, { "epoch": 0.12426761139119771, "grad_norm": 2.2435667514801025, "learning_rate": 3.5403619649526e-05, "loss": 2.8564, "step": 437 }, { "epoch": 0.12455197663465582, "grad_norm": 2.2413361072540283, "learning_rate": 3.539212869864982e-05, "loss": 2.586, "step": 438 }, { "epoch": 0.12483634187811395, "grad_norm": 2.0291380882263184, "learning_rate": 3.5380637747773635e-05, "loss": 2.4712, "step": 439 }, { "epoch": 0.12512070712157206, "grad_norm": 2.161757707595825, "learning_rate": 3.5369146796897446e-05, "loss": 2.3247, "step": 440 }, { "epoch": 0.12540507236503018, "grad_norm": 2.918247938156128, "learning_rate": 3.5357655846021264e-05, "loss": 3.5365, "step": 441 }, { "epoch": 0.1256894376084883, "grad_norm": 2.381178855895996, "learning_rate": 3.5346164895145075e-05, "loss": 3.15, "step": 442 }, { "epoch": 0.12597380285194643, "grad_norm": 2.349804162979126, "learning_rate": 3.533467394426889e-05, "loss": 3.0955, "step": 443 }, { "epoch": 0.12625816809540455, "grad_norm": 2.0576179027557373, "learning_rate": 3.5323182993392704e-05, "loss": 2.8447, "step": 444 }, { "epoch": 0.12654253333886264, "grad_norm": 2.2097201347351074, "learning_rate": 3.531169204251652e-05, "loss": 2.6086, "step": 445 }, { "epoch": 0.12682689858232077, "grad_norm": 2.347313642501831, "learning_rate": 3.530020109164033e-05, "loss": 2.5803, "step": 446 }, { "epoch": 0.1271112638257789, "grad_norm": 2.059239625930786, "learning_rate": 3.528871014076415e-05, "loss": 2.5247, "step": 447 }, { "epoch": 0.12739562906923702, "grad_norm": 2.106543779373169, "learning_rate": 3.527721918988797e-05, "loss": 2.1429, "step": 448 }, { "epoch": 0.12767999431269514, "grad_norm": 4.565354824066162, "learning_rate": 3.526572823901178e-05, "loss": 3.4247, "step": 449 }, { "epoch": 0.12796435955615326, "grad_norm": 3.1165966987609863, "learning_rate": 3.52542372881356e-05, "loss": 3.3897, "step": 450 }, { "epoch": 0.12824872479961136, "grad_norm": 2.431389808654785, "learning_rate": 3.524274633725941e-05, "loss": 2.8458, "step": 451 }, { "epoch": 0.12853309004306948, "grad_norm": 2.352386713027954, "learning_rate": 3.5231255386383226e-05, "loss": 2.8419, "step": 452 }, { "epoch": 0.1288174552865276, "grad_norm": 2.195875406265259, "learning_rate": 3.521976443550704e-05, "loss": 2.7576, "step": 453 }, { "epoch": 0.12910182052998573, "grad_norm": 2.407545566558838, "learning_rate": 3.5208273484630855e-05, "loss": 2.4436, "step": 454 }, { "epoch": 0.12938618577344385, "grad_norm": 2.3552427291870117, "learning_rate": 3.5196782533754666e-05, "loss": 2.3348, "step": 455 }, { "epoch": 0.12967055101690195, "grad_norm": 2.576530933380127, "learning_rate": 3.5185291582878484e-05, "loss": 2.3373, "step": 456 }, { "epoch": 0.12995491626036007, "grad_norm": 2.8271255493164062, "learning_rate": 3.51738006320023e-05, "loss": 3.4477, "step": 457 }, { "epoch": 0.1302392815038182, "grad_norm": 2.433382987976074, "learning_rate": 3.516230968112611e-05, "loss": 3.0918, "step": 458 }, { "epoch": 0.13052364674727632, "grad_norm": 2.547974109649658, "learning_rate": 3.515081873024993e-05, "loss": 3.0768, "step": 459 }, { "epoch": 0.13080801199073444, "grad_norm": 2.379182815551758, "learning_rate": 3.513932777937374e-05, "loss": 2.886, "step": 460 }, { "epoch": 0.13109237723419256, "grad_norm": 2.6086502075195312, "learning_rate": 3.512783682849756e-05, "loss": 2.7411, "step": 461 }, { "epoch": 0.13137674247765066, "grad_norm": 2.6192541122436523, "learning_rate": 3.511634587762137e-05, "loss": 2.6317, "step": 462 }, { "epoch": 0.13166110772110878, "grad_norm": 2.5311689376831055, "learning_rate": 3.5104854926745196e-05, "loss": 2.4297, "step": 463 }, { "epoch": 0.1319454729645669, "grad_norm": 1.9520777463912964, "learning_rate": 3.509336397586901e-05, "loss": 2.3321, "step": 464 }, { "epoch": 0.13222983820802503, "grad_norm": 3.1615819931030273, "learning_rate": 3.5081873024992825e-05, "loss": 3.519, "step": 465 }, { "epoch": 0.13251420345148315, "grad_norm": 2.8588624000549316, "learning_rate": 3.5070382074116636e-05, "loss": 3.0476, "step": 466 }, { "epoch": 0.13279856869494125, "grad_norm": 2.8191323280334473, "learning_rate": 3.5058891123240454e-05, "loss": 2.92, "step": 467 }, { "epoch": 0.13308293393839937, "grad_norm": 2.7357499599456787, "learning_rate": 3.5047400172364265e-05, "loss": 2.8323, "step": 468 }, { "epoch": 0.1333672991818575, "grad_norm": 2.820420026779175, "learning_rate": 3.503590922148808e-05, "loss": 2.6254, "step": 469 }, { "epoch": 0.13365166442531562, "grad_norm": 2.1745669841766357, "learning_rate": 3.5024418270611894e-05, "loss": 2.5728, "step": 470 }, { "epoch": 0.13393602966877374, "grad_norm": 1.995119571685791, "learning_rate": 3.501292731973571e-05, "loss": 2.3181, "step": 471 }, { "epoch": 0.13422039491223187, "grad_norm": 2.149378776550293, "learning_rate": 3.500143636885953e-05, "loss": 2.1623, "step": 472 }, { "epoch": 0.13450476015568996, "grad_norm": 3.25516414642334, "learning_rate": 3.498994541798334e-05, "loss": 3.3434, "step": 473 }, { "epoch": 0.13478912539914809, "grad_norm": 2.725456714630127, "learning_rate": 3.497845446710716e-05, "loss": 3.1657, "step": 474 }, { "epoch": 0.1350734906426062, "grad_norm": 2.5315914154052734, "learning_rate": 3.496696351623097e-05, "loss": 3.0051, "step": 475 }, { "epoch": 0.13535785588606433, "grad_norm": 2.51139760017395, "learning_rate": 3.495547256535479e-05, "loss": 2.8382, "step": 476 }, { "epoch": 0.13564222112952246, "grad_norm": 2.4551467895507812, "learning_rate": 3.49439816144786e-05, "loss": 2.6955, "step": 477 }, { "epoch": 0.13592658637298055, "grad_norm": 2.2206430435180664, "learning_rate": 3.493249066360242e-05, "loss": 2.5027, "step": 478 }, { "epoch": 0.13621095161643867, "grad_norm": 2.4770166873931885, "learning_rate": 3.492099971272623e-05, "loss": 2.2923, "step": 479 }, { "epoch": 0.1364953168598968, "grad_norm": 2.7834012508392334, "learning_rate": 3.4909508761850046e-05, "loss": 2.3686, "step": 480 }, { "epoch": 0.13677968210335492, "grad_norm": 3.358705520629883, "learning_rate": 3.4898017810973864e-05, "loss": 3.3837, "step": 481 }, { "epoch": 0.13706404734681304, "grad_norm": 2.3323278427124023, "learning_rate": 3.4886526860097675e-05, "loss": 2.8571, "step": 482 }, { "epoch": 0.13734841259027114, "grad_norm": 2.330890417098999, "learning_rate": 3.487503590922149e-05, "loss": 2.7978, "step": 483 }, { "epoch": 0.13763277783372926, "grad_norm": 2.012991428375244, "learning_rate": 3.4863544958345304e-05, "loss": 2.8301, "step": 484 }, { "epoch": 0.1379171430771874, "grad_norm": 2.383934736251831, "learning_rate": 3.485205400746912e-05, "loss": 2.6773, "step": 485 }, { "epoch": 0.1382015083206455, "grad_norm": 2.576904773712158, "learning_rate": 3.484056305659293e-05, "loss": 2.5425, "step": 486 }, { "epoch": 0.13848587356410363, "grad_norm": 2.482586145401001, "learning_rate": 3.482907210571675e-05, "loss": 2.3212, "step": 487 }, { "epoch": 0.13877023880756176, "grad_norm": 2.269639253616333, "learning_rate": 3.481758115484056e-05, "loss": 2.244, "step": 488 }, { "epoch": 0.13905460405101985, "grad_norm": 2.263761281967163, "learning_rate": 3.480609020396438e-05, "loss": 3.6028, "step": 489 }, { "epoch": 0.13933896929447798, "grad_norm": 2.271108388900757, "learning_rate": 3.47945992530882e-05, "loss": 3.1406, "step": 490 }, { "epoch": 0.1396233345379361, "grad_norm": 2.075087547302246, "learning_rate": 3.478310830221201e-05, "loss": 2.6854, "step": 491 }, { "epoch": 0.13990769978139422, "grad_norm": 2.018319845199585, "learning_rate": 3.4771617351335827e-05, "loss": 2.8959, "step": 492 }, { "epoch": 0.14019206502485235, "grad_norm": 2.369109630584717, "learning_rate": 3.476012640045964e-05, "loss": 2.7621, "step": 493 }, { "epoch": 0.14047643026831044, "grad_norm": 2.4382967948913574, "learning_rate": 3.4748635449583456e-05, "loss": 2.4256, "step": 494 }, { "epoch": 0.14076079551176857, "grad_norm": 2.1346540451049805, "learning_rate": 3.473714449870727e-05, "loss": 2.4076, "step": 495 }, { "epoch": 0.1410451607552267, "grad_norm": 2.194206953048706, "learning_rate": 3.4725653547831085e-05, "loss": 2.1643, "step": 496 }, { "epoch": 0.1413295259986848, "grad_norm": 3.0039777755737305, "learning_rate": 3.4714162596954896e-05, "loss": 3.4954, "step": 497 }, { "epoch": 0.14161389124214294, "grad_norm": 2.4674746990203857, "learning_rate": 3.4702671646078714e-05, "loss": 3.1891, "step": 498 }, { "epoch": 0.14189825648560106, "grad_norm": 2.319490432739258, "learning_rate": 3.469118069520253e-05, "loss": 2.7782, "step": 499 }, { "epoch": 0.14218262172905916, "grad_norm": 2.3277428150177, "learning_rate": 3.467968974432635e-05, "loss": 2.6973, "step": 500 }, { "epoch": 0.14246698697251728, "grad_norm": 2.28387188911438, "learning_rate": 3.466819879345016e-05, "loss": 2.8054, "step": 501 }, { "epoch": 0.1427513522159754, "grad_norm": 2.186497688293457, "learning_rate": 3.465670784257398e-05, "loss": 2.7287, "step": 502 }, { "epoch": 0.14303571745943353, "grad_norm": 2.455045700073242, "learning_rate": 3.464521689169779e-05, "loss": 2.4847, "step": 503 }, { "epoch": 0.14332008270289165, "grad_norm": 2.292936086654663, "learning_rate": 3.463372594082161e-05, "loss": 2.3205, "step": 504 }, { "epoch": 0.14360444794634974, "grad_norm": 2.924741744995117, "learning_rate": 3.4622234989945425e-05, "loss": 3.244, "step": 505 }, { "epoch": 0.14388881318980787, "grad_norm": 2.4970264434814453, "learning_rate": 3.4610744039069236e-05, "loss": 3.0944, "step": 506 }, { "epoch": 0.144173178433266, "grad_norm": 2.490755319595337, "learning_rate": 3.4599253088193054e-05, "loss": 3.0244, "step": 507 }, { "epoch": 0.14445754367672412, "grad_norm": 2.4618477821350098, "learning_rate": 3.4587762137316865e-05, "loss": 2.6005, "step": 508 }, { "epoch": 0.14474190892018224, "grad_norm": 2.505988121032715, "learning_rate": 3.457627118644068e-05, "loss": 2.5832, "step": 509 }, { "epoch": 0.14502627416364036, "grad_norm": 2.5353870391845703, "learning_rate": 3.4564780235564494e-05, "loss": 2.3534, "step": 510 }, { "epoch": 0.14531063940709846, "grad_norm": 2.2250514030456543, "learning_rate": 3.455328928468831e-05, "loss": 2.2805, "step": 511 }, { "epoch": 0.14559500465055658, "grad_norm": 2.35164475440979, "learning_rate": 3.454179833381212e-05, "loss": 2.214, "step": 512 }, { "epoch": 0.1458793698940147, "grad_norm": 2.7433321475982666, "learning_rate": 3.453030738293594e-05, "loss": 3.4267, "step": 513 }, { "epoch": 0.14616373513747283, "grad_norm": 2.7333459854125977, "learning_rate": 3.451881643205976e-05, "loss": 3.1427, "step": 514 }, { "epoch": 0.14644810038093095, "grad_norm": 2.3346779346466064, "learning_rate": 3.450732548118357e-05, "loss": 2.7976, "step": 515 }, { "epoch": 0.14673246562438905, "grad_norm": 2.188016176223755, "learning_rate": 3.449583453030739e-05, "loss": 2.7736, "step": 516 }, { "epoch": 0.14701683086784717, "grad_norm": 2.3532261848449707, "learning_rate": 3.44843435794312e-05, "loss": 2.6379, "step": 517 }, { "epoch": 0.1473011961113053, "grad_norm": 2.1835358142852783, "learning_rate": 3.447285262855502e-05, "loss": 2.2907, "step": 518 }, { "epoch": 0.14758556135476342, "grad_norm": 1.9752788543701172, "learning_rate": 3.446136167767883e-05, "loss": 2.1688, "step": 519 }, { "epoch": 0.14786992659822154, "grad_norm": 2.0870256423950195, "learning_rate": 3.4449870726802646e-05, "loss": 2.1826, "step": 520 }, { "epoch": 0.14815429184167966, "grad_norm": 3.138823986053467, "learning_rate": 3.443837977592646e-05, "loss": 3.3426, "step": 521 }, { "epoch": 0.14843865708513776, "grad_norm": 2.220222234725952, "learning_rate": 3.4426888825050275e-05, "loss": 2.9125, "step": 522 }, { "epoch": 0.14872302232859588, "grad_norm": 2.0182180404663086, "learning_rate": 3.441539787417409e-05, "loss": 2.54, "step": 523 }, { "epoch": 0.149007387572054, "grad_norm": 2.3546602725982666, "learning_rate": 3.4403906923297904e-05, "loss": 2.7855, "step": 524 }, { "epoch": 0.14929175281551213, "grad_norm": 2.4429872035980225, "learning_rate": 3.439241597242172e-05, "loss": 2.6199, "step": 525 }, { "epoch": 0.14957611805897025, "grad_norm": 2.0123140811920166, "learning_rate": 3.438092502154553e-05, "loss": 2.4555, "step": 526 }, { "epoch": 0.14986048330242835, "grad_norm": 2.059921979904175, "learning_rate": 3.436943407066935e-05, "loss": 2.3095, "step": 527 }, { "epoch": 0.15014484854588647, "grad_norm": 2.259676456451416, "learning_rate": 3.435794311979316e-05, "loss": 2.32, "step": 528 }, { "epoch": 0.1504292137893446, "grad_norm": 2.4949564933776855, "learning_rate": 3.434645216891698e-05, "loss": 3.1353, "step": 529 }, { "epoch": 0.15071357903280272, "grad_norm": 2.08988881111145, "learning_rate": 3.433496121804079e-05, "loss": 3.0615, "step": 530 }, { "epoch": 0.15099794427626084, "grad_norm": 2.143328905105591, "learning_rate": 3.432347026716461e-05, "loss": 2.7796, "step": 531 }, { "epoch": 0.15128230951971897, "grad_norm": 2.148768663406372, "learning_rate": 3.431197931628843e-05, "loss": 2.6519, "step": 532 }, { "epoch": 0.15156667476317706, "grad_norm": 2.143799066543579, "learning_rate": 3.430048836541224e-05, "loss": 2.7152, "step": 533 }, { "epoch": 0.15185104000663519, "grad_norm": 2.04683780670166, "learning_rate": 3.4288997414536056e-05, "loss": 2.3695, "step": 534 }, { "epoch": 0.1521354052500933, "grad_norm": 2.0776238441467285, "learning_rate": 3.427750646365987e-05, "loss": 2.2031, "step": 535 }, { "epoch": 0.15241977049355143, "grad_norm": 2.1074013710021973, "learning_rate": 3.4266015512783685e-05, "loss": 2.3794, "step": 536 }, { "epoch": 0.15270413573700956, "grad_norm": 3.231952667236328, "learning_rate": 3.42545245619075e-05, "loss": 3.2961, "step": 537 }, { "epoch": 0.15298850098046765, "grad_norm": 2.548482656478882, "learning_rate": 3.424303361103132e-05, "loss": 2.9752, "step": 538 }, { "epoch": 0.15327286622392577, "grad_norm": 2.271571397781372, "learning_rate": 3.423154266015513e-05, "loss": 2.9183, "step": 539 }, { "epoch": 0.1535572314673839, "grad_norm": 2.0318918228149414, "learning_rate": 3.422005170927895e-05, "loss": 2.6897, "step": 540 }, { "epoch": 0.15384159671084202, "grad_norm": 2.172313690185547, "learning_rate": 3.420856075840276e-05, "loss": 2.6676, "step": 541 }, { "epoch": 0.15412596195430014, "grad_norm": 1.9842435121536255, "learning_rate": 3.419706980752658e-05, "loss": 2.3229, "step": 542 }, { "epoch": 0.15441032719775824, "grad_norm": 1.9959310293197632, "learning_rate": 3.418557885665039e-05, "loss": 2.1866, "step": 543 }, { "epoch": 0.15469469244121636, "grad_norm": 2.2947375774383545, "learning_rate": 3.417408790577421e-05, "loss": 2.3691, "step": 544 }, { "epoch": 0.1549790576846745, "grad_norm": 3.7838504314422607, "learning_rate": 3.416259695489802e-05, "loss": 3.3556, "step": 545 }, { "epoch": 0.1552634229281326, "grad_norm": 2.482161045074463, "learning_rate": 3.4151106004021837e-05, "loss": 3.0544, "step": 546 }, { "epoch": 0.15554778817159073, "grad_norm": 2.1925017833709717, "learning_rate": 3.4139615053145654e-05, "loss": 2.8413, "step": 547 }, { "epoch": 0.15583215341504886, "grad_norm": 1.944459080696106, "learning_rate": 3.4128124102269466e-05, "loss": 2.7084, "step": 548 }, { "epoch": 0.15611651865850695, "grad_norm": 1.926611065864563, "learning_rate": 3.4116633151393283e-05, "loss": 2.5604, "step": 549 }, { "epoch": 0.15640088390196508, "grad_norm": 1.8744678497314453, "learning_rate": 3.4105142200517095e-05, "loss": 2.2331, "step": 550 }, { "epoch": 0.1566852491454232, "grad_norm": 2.163674831390381, "learning_rate": 3.409365124964091e-05, "loss": 2.2382, "step": 551 }, { "epoch": 0.15696961438888132, "grad_norm": 2.251648426055908, "learning_rate": 3.4082160298764724e-05, "loss": 2.0852, "step": 552 }, { "epoch": 0.15725397963233945, "grad_norm": 2.419774055480957, "learning_rate": 3.407066934788854e-05, "loss": 3.393, "step": 553 }, { "epoch": 0.15753834487579754, "grad_norm": 1.9137029647827148, "learning_rate": 3.405917839701235e-05, "loss": 3.2236, "step": 554 }, { "epoch": 0.15782271011925567, "grad_norm": 1.9087616205215454, "learning_rate": 3.404768744613617e-05, "loss": 2.869, "step": 555 }, { "epoch": 0.1581070753627138, "grad_norm": 1.9507688283920288, "learning_rate": 3.403619649525999e-05, "loss": 2.833, "step": 556 }, { "epoch": 0.1583914406061719, "grad_norm": 2.030203342437744, "learning_rate": 3.40247055443838e-05, "loss": 2.6474, "step": 557 }, { "epoch": 0.15867580584963004, "grad_norm": 2.2038793563842773, "learning_rate": 3.401321459350762e-05, "loss": 2.3888, "step": 558 }, { "epoch": 0.15896017109308816, "grad_norm": 2.0459814071655273, "learning_rate": 3.400172364263143e-05, "loss": 2.2976, "step": 559 }, { "epoch": 0.15924453633654626, "grad_norm": 2.089040517807007, "learning_rate": 3.3990232691755246e-05, "loss": 2.3395, "step": 560 }, { "epoch": 0.15952890158000438, "grad_norm": 2.3878250122070312, "learning_rate": 3.397874174087906e-05, "loss": 3.3155, "step": 561 }, { "epoch": 0.1598132668234625, "grad_norm": 2.1346888542175293, "learning_rate": 3.3967250790002875e-05, "loss": 3.1624, "step": 562 }, { "epoch": 0.16009763206692063, "grad_norm": 2.1869750022888184, "learning_rate": 3.3955759839126686e-05, "loss": 2.6892, "step": 563 }, { "epoch": 0.16038199731037875, "grad_norm": 2.045581102371216, "learning_rate": 3.3944268888250504e-05, "loss": 2.4891, "step": 564 }, { "epoch": 0.16066636255383684, "grad_norm": 2.3556089401245117, "learning_rate": 3.393277793737432e-05, "loss": 2.7332, "step": 565 }, { "epoch": 0.16095072779729497, "grad_norm": 2.4558780193328857, "learning_rate": 3.392128698649813e-05, "loss": 2.4167, "step": 566 }, { "epoch": 0.1612350930407531, "grad_norm": 2.398458957672119, "learning_rate": 3.390979603562195e-05, "loss": 2.336, "step": 567 }, { "epoch": 0.16151945828421121, "grad_norm": 2.256385564804077, "learning_rate": 3.389830508474576e-05, "loss": 2.3393, "step": 568 }, { "epoch": 0.16180382352766934, "grad_norm": 2.4298534393310547, "learning_rate": 3.388681413386958e-05, "loss": 3.3351, "step": 569 }, { "epoch": 0.16208818877112746, "grad_norm": 2.2216269969940186, "learning_rate": 3.387532318299339e-05, "loss": 3.1062, "step": 570 }, { "epoch": 0.16237255401458556, "grad_norm": 2.279859781265259, "learning_rate": 3.386383223211721e-05, "loss": 2.7871, "step": 571 }, { "epoch": 0.16265691925804368, "grad_norm": 2.4930570125579834, "learning_rate": 3.385234128124102e-05, "loss": 2.6784, "step": 572 }, { "epoch": 0.1629412845015018, "grad_norm": 3.0474438667297363, "learning_rate": 3.384085033036484e-05, "loss": 2.7209, "step": 573 }, { "epoch": 0.16322564974495993, "grad_norm": 2.276129961013794, "learning_rate": 3.3829359379488656e-05, "loss": 2.3874, "step": 574 }, { "epoch": 0.16351001498841805, "grad_norm": 2.0157735347747803, "learning_rate": 3.3817868428612474e-05, "loss": 2.3545, "step": 575 }, { "epoch": 0.16379438023187615, "grad_norm": 2.2889609336853027, "learning_rate": 3.3806377477736285e-05, "loss": 2.1654, "step": 576 }, { "epoch": 0.16407874547533427, "grad_norm": 2.966151714324951, "learning_rate": 3.37948865268601e-05, "loss": 3.3948, "step": 577 }, { "epoch": 0.1643631107187924, "grad_norm": 2.7245430946350098, "learning_rate": 3.3783395575983914e-05, "loss": 3.0326, "step": 578 }, { "epoch": 0.16464747596225052, "grad_norm": 2.4125325679779053, "learning_rate": 3.377190462510773e-05, "loss": 2.7461, "step": 579 }, { "epoch": 0.16493184120570864, "grad_norm": 2.4555015563964844, "learning_rate": 3.376041367423155e-05, "loss": 2.6876, "step": 580 }, { "epoch": 0.16521620644916676, "grad_norm": 2.3780357837677, "learning_rate": 3.374892272335536e-05, "loss": 2.549, "step": 581 }, { "epoch": 0.16550057169262486, "grad_norm": 1.9920421838760376, "learning_rate": 3.373743177247918e-05, "loss": 2.3843, "step": 582 }, { "epoch": 0.16578493693608298, "grad_norm": 1.880908489227295, "learning_rate": 3.372594082160299e-05, "loss": 2.1699, "step": 583 }, { "epoch": 0.1660693021795411, "grad_norm": 2.317730188369751, "learning_rate": 3.371444987072681e-05, "loss": 2.3338, "step": 584 }, { "epoch": 0.16635366742299923, "grad_norm": 3.1581149101257324, "learning_rate": 3.370295891985062e-05, "loss": 3.3314, "step": 585 }, { "epoch": 0.16663803266645735, "grad_norm": 2.4569809436798096, "learning_rate": 3.369146796897444e-05, "loss": 2.8549, "step": 586 }, { "epoch": 0.16692239790991545, "grad_norm": 2.0873701572418213, "learning_rate": 3.367997701809825e-05, "loss": 2.6875, "step": 587 }, { "epoch": 0.16720676315337357, "grad_norm": 2.1216537952423096, "learning_rate": 3.3668486067222066e-05, "loss": 2.7253, "step": 588 }, { "epoch": 0.1674911283968317, "grad_norm": 1.9596827030181885, "learning_rate": 3.3656995116345884e-05, "loss": 2.4127, "step": 589 }, { "epoch": 0.16777549364028982, "grad_norm": 2.244654417037964, "learning_rate": 3.3645504165469695e-05, "loss": 2.4795, "step": 590 }, { "epoch": 0.16805985888374794, "grad_norm": 2.387822389602661, "learning_rate": 3.363401321459351e-05, "loss": 2.413, "step": 591 }, { "epoch": 0.16834422412720607, "grad_norm": 2.2715797424316406, "learning_rate": 3.3622522263717324e-05, "loss": 2.3535, "step": 592 }, { "epoch": 0.16862858937066416, "grad_norm": 3.2900567054748535, "learning_rate": 3.361103131284114e-05, "loss": 3.2949, "step": 593 }, { "epoch": 0.16891295461412228, "grad_norm": 2.340487241744995, "learning_rate": 3.359954036196495e-05, "loss": 3.0242, "step": 594 }, { "epoch": 0.1691973198575804, "grad_norm": 1.9996750354766846, "learning_rate": 3.358804941108877e-05, "loss": 2.7541, "step": 595 }, { "epoch": 0.16948168510103853, "grad_norm": 1.8965973854064941, "learning_rate": 3.357655846021258e-05, "loss": 2.6425, "step": 596 }, { "epoch": 0.16976605034449666, "grad_norm": 2.097822427749634, "learning_rate": 3.35650675093364e-05, "loss": 2.6672, "step": 597 }, { "epoch": 0.17005041558795475, "grad_norm": 2.171520471572876, "learning_rate": 3.355357655846022e-05, "loss": 2.2089, "step": 598 }, { "epoch": 0.17033478083141287, "grad_norm": 2.1892077922821045, "learning_rate": 3.354208560758403e-05, "loss": 2.2999, "step": 599 }, { "epoch": 0.170619146074871, "grad_norm": 1.8929511308670044, "learning_rate": 3.3530594656707847e-05, "loss": 2.0265, "step": 600 }, { "epoch": 0.17090351131832912, "grad_norm": 2.2485220432281494, "learning_rate": 3.351910370583166e-05, "loss": 3.4067, "step": 601 }, { "epoch": 0.17118787656178724, "grad_norm": 2.0794124603271484, "learning_rate": 3.3507612754955476e-05, "loss": 3.0856, "step": 602 }, { "epoch": 0.17147224180524534, "grad_norm": 1.9198224544525146, "learning_rate": 3.349612180407929e-05, "loss": 2.7309, "step": 603 }, { "epoch": 0.17175660704870346, "grad_norm": 1.9311753511428833, "learning_rate": 3.3484630853203105e-05, "loss": 2.5093, "step": 604 }, { "epoch": 0.1720409722921616, "grad_norm": 2.2155637741088867, "learning_rate": 3.3473139902326916e-05, "loss": 2.4131, "step": 605 }, { "epoch": 0.1723253375356197, "grad_norm": 2.4310479164123535, "learning_rate": 3.3461648951450733e-05, "loss": 2.3546, "step": 606 }, { "epoch": 0.17260970277907783, "grad_norm": 2.170459747314453, "learning_rate": 3.345015800057455e-05, "loss": 2.1606, "step": 607 }, { "epoch": 0.17289406802253596, "grad_norm": 2.082836627960205, "learning_rate": 3.343866704969836e-05, "loss": 2.1429, "step": 608 }, { "epoch": 0.17317843326599405, "grad_norm": 2.6757774353027344, "learning_rate": 3.342717609882218e-05, "loss": 3.3129, "step": 609 }, { "epoch": 0.17346279850945218, "grad_norm": 1.9752081632614136, "learning_rate": 3.341568514794599e-05, "loss": 2.8585, "step": 610 }, { "epoch": 0.1737471637529103, "grad_norm": 2.269960880279541, "learning_rate": 3.340419419706981e-05, "loss": 2.5409, "step": 611 }, { "epoch": 0.17403152899636842, "grad_norm": 2.2884328365325928, "learning_rate": 3.339270324619363e-05, "loss": 2.6172, "step": 612 }, { "epoch": 0.17431589423982655, "grad_norm": 2.2851369380950928, "learning_rate": 3.3381212295317445e-05, "loss": 2.6341, "step": 613 }, { "epoch": 0.17460025948328464, "grad_norm": 1.8490828275680542, "learning_rate": 3.3369721344441256e-05, "loss": 2.2735, "step": 614 }, { "epoch": 0.17488462472674277, "grad_norm": 1.9733513593673706, "learning_rate": 3.3358230393565074e-05, "loss": 2.2607, "step": 615 }, { "epoch": 0.1751689899702009, "grad_norm": 1.9823896884918213, "learning_rate": 3.3346739442688885e-05, "loss": 2.0564, "step": 616 }, { "epoch": 0.175453355213659, "grad_norm": 2.479914665222168, "learning_rate": 3.33352484918127e-05, "loss": 3.352, "step": 617 }, { "epoch": 0.17573772045711714, "grad_norm": 2.3320770263671875, "learning_rate": 3.3323757540936514e-05, "loss": 2.8721, "step": 618 }, { "epoch": 0.17602208570057526, "grad_norm": 2.451782703399658, "learning_rate": 3.331226659006033e-05, "loss": 2.8098, "step": 619 }, { "epoch": 0.17630645094403335, "grad_norm": 2.2985215187072754, "learning_rate": 3.330077563918414e-05, "loss": 2.7563, "step": 620 }, { "epoch": 0.17659081618749148, "grad_norm": 2.119804859161377, "learning_rate": 3.328928468830796e-05, "loss": 2.5972, "step": 621 }, { "epoch": 0.1768751814309496, "grad_norm": 2.0667002201080322, "learning_rate": 3.327779373743178e-05, "loss": 2.2254, "step": 622 }, { "epoch": 0.17715954667440773, "grad_norm": 1.9303321838378906, "learning_rate": 3.326630278655559e-05, "loss": 2.4747, "step": 623 }, { "epoch": 0.17744391191786585, "grad_norm": 2.147676706314087, "learning_rate": 3.325481183567941e-05, "loss": 1.9882, "step": 624 }, { "epoch": 0.17772827716132394, "grad_norm": 2.619396448135376, "learning_rate": 3.324332088480322e-05, "loss": 3.2464, "step": 625 }, { "epoch": 0.17801264240478207, "grad_norm": 2.6688385009765625, "learning_rate": 3.323182993392704e-05, "loss": 3.0559, "step": 626 }, { "epoch": 0.1782970076482402, "grad_norm": 1.9910519123077393, "learning_rate": 3.322033898305085e-05, "loss": 2.7364, "step": 627 }, { "epoch": 0.17858137289169831, "grad_norm": 2.037667989730835, "learning_rate": 3.3208848032174666e-05, "loss": 2.3946, "step": 628 }, { "epoch": 0.17886573813515644, "grad_norm": 2.0019378662109375, "learning_rate": 3.319735708129848e-05, "loss": 2.4888, "step": 629 }, { "epoch": 0.17915010337861456, "grad_norm": 1.999667763710022, "learning_rate": 3.3185866130422295e-05, "loss": 2.5945, "step": 630 }, { "epoch": 0.17943446862207266, "grad_norm": 2.3381664752960205, "learning_rate": 3.317437517954611e-05, "loss": 2.0536, "step": 631 }, { "epoch": 0.17971883386553078, "grad_norm": 2.1102583408355713, "learning_rate": 3.3162884228669924e-05, "loss": 2.1843, "step": 632 }, { "epoch": 0.1800031991089889, "grad_norm": 4.009433269500732, "learning_rate": 3.315139327779374e-05, "loss": 3.122, "step": 633 }, { "epoch": 0.18028756435244703, "grad_norm": 2.4775569438934326, "learning_rate": 3.313990232691755e-05, "loss": 3.1036, "step": 634 }, { "epoch": 0.18057192959590515, "grad_norm": 2.1339893341064453, "learning_rate": 3.312841137604137e-05, "loss": 2.8175, "step": 635 }, { "epoch": 0.18085629483936325, "grad_norm": 1.9820502996444702, "learning_rate": 3.311692042516518e-05, "loss": 2.8805, "step": 636 }, { "epoch": 0.18114066008282137, "grad_norm": 1.9298404455184937, "learning_rate": 3.3105429474289e-05, "loss": 2.5823, "step": 637 }, { "epoch": 0.1814250253262795, "grad_norm": 1.915541172027588, "learning_rate": 3.309393852341281e-05, "loss": 2.4167, "step": 638 }, { "epoch": 0.18170939056973762, "grad_norm": 2.0158815383911133, "learning_rate": 3.308244757253663e-05, "loss": 2.2251, "step": 639 }, { "epoch": 0.18199375581319574, "grad_norm": 1.840295672416687, "learning_rate": 3.307095662166045e-05, "loss": 2.2033, "step": 640 }, { "epoch": 0.18227812105665386, "grad_norm": 2.2416555881500244, "learning_rate": 3.305946567078426e-05, "loss": 3.302, "step": 641 }, { "epoch": 0.18256248630011196, "grad_norm": 2.0106818675994873, "learning_rate": 3.3047974719908076e-05, "loss": 2.8823, "step": 642 }, { "epoch": 0.18284685154357008, "grad_norm": 2.0697238445281982, "learning_rate": 3.303648376903189e-05, "loss": 2.8577, "step": 643 }, { "epoch": 0.1831312167870282, "grad_norm": 1.8144038915634155, "learning_rate": 3.3024992818155705e-05, "loss": 2.5007, "step": 644 }, { "epoch": 0.18341558203048633, "grad_norm": 1.9172033071517944, "learning_rate": 3.3013501867279516e-05, "loss": 2.5931, "step": 645 }, { "epoch": 0.18369994727394445, "grad_norm": 2.0230233669281006, "learning_rate": 3.3002010916403334e-05, "loss": 2.3733, "step": 646 }, { "epoch": 0.18398431251740255, "grad_norm": 1.8354929685592651, "learning_rate": 3.2990519965527145e-05, "loss": 2.1317, "step": 647 }, { "epoch": 0.18426867776086067, "grad_norm": 2.032435894012451, "learning_rate": 3.297902901465096e-05, "loss": 1.9827, "step": 648 }, { "epoch": 0.1845530430043188, "grad_norm": 2.6801607608795166, "learning_rate": 3.296753806377478e-05, "loss": 3.3537, "step": 649 }, { "epoch": 0.18483740824777692, "grad_norm": 2.1917881965637207, "learning_rate": 3.29560471128986e-05, "loss": 3.0921, "step": 650 }, { "epoch": 0.18512177349123504, "grad_norm": 1.906441330909729, "learning_rate": 3.294455616202241e-05, "loss": 2.8275, "step": 651 }, { "epoch": 0.18540613873469317, "grad_norm": 1.8117749691009521, "learning_rate": 3.293306521114623e-05, "loss": 2.5992, "step": 652 }, { "epoch": 0.18569050397815126, "grad_norm": 1.962047815322876, "learning_rate": 3.292157426027004e-05, "loss": 2.4936, "step": 653 }, { "epoch": 0.18597486922160938, "grad_norm": 1.9997988939285278, "learning_rate": 3.2910083309393856e-05, "loss": 2.3512, "step": 654 }, { "epoch": 0.1862592344650675, "grad_norm": 2.2575020790100098, "learning_rate": 3.2898592358517674e-05, "loss": 2.3052, "step": 655 }, { "epoch": 0.18654359970852563, "grad_norm": 2.3353512287139893, "learning_rate": 3.2887101407641485e-05, "loss": 2.1795, "step": 656 }, { "epoch": 0.18682796495198375, "grad_norm": 2.2830419540405273, "learning_rate": 3.28756104567653e-05, "loss": 3.2798, "step": 657 }, { "epoch": 0.18711233019544185, "grad_norm": 2.1097829341888428, "learning_rate": 3.2864119505889114e-05, "loss": 3.1461, "step": 658 }, { "epoch": 0.18739669543889997, "grad_norm": 2.0392520427703857, "learning_rate": 3.285262855501293e-05, "loss": 2.865, "step": 659 }, { "epoch": 0.1876810606823581, "grad_norm": 1.9128637313842773, "learning_rate": 3.2841137604136743e-05, "loss": 2.4607, "step": 660 }, { "epoch": 0.18796542592581622, "grad_norm": 2.3386354446411133, "learning_rate": 3.282964665326056e-05, "loss": 2.5321, "step": 661 }, { "epoch": 0.18824979116927434, "grad_norm": 2.1804091930389404, "learning_rate": 3.281815570238437e-05, "loss": 2.372, "step": 662 }, { "epoch": 0.18853415641273247, "grad_norm": 2.139617919921875, "learning_rate": 3.280666475150819e-05, "loss": 2.2559, "step": 663 }, { "epoch": 0.18881852165619056, "grad_norm": 2.0755178928375244, "learning_rate": 3.279517380063201e-05, "loss": 2.3333, "step": 664 }, { "epoch": 0.1891028868996487, "grad_norm": 2.1971609592437744, "learning_rate": 3.278368284975582e-05, "loss": 3.1885, "step": 665 }, { "epoch": 0.1893872521431068, "grad_norm": 2.080216884613037, "learning_rate": 3.277219189887964e-05, "loss": 2.8508, "step": 666 }, { "epoch": 0.18967161738656493, "grad_norm": 2.1171205043792725, "learning_rate": 3.276070094800345e-05, "loss": 2.5981, "step": 667 }, { "epoch": 0.18995598263002306, "grad_norm": 2.2514328956604004, "learning_rate": 3.2749209997127266e-05, "loss": 2.8185, "step": 668 }, { "epoch": 0.19024034787348115, "grad_norm": 2.328221082687378, "learning_rate": 3.273771904625108e-05, "loss": 2.6257, "step": 669 }, { "epoch": 0.19052471311693928, "grad_norm": 2.0298240184783936, "learning_rate": 3.2726228095374895e-05, "loss": 2.2706, "step": 670 }, { "epoch": 0.1908090783603974, "grad_norm": 1.9545693397521973, "learning_rate": 3.2714737144498706e-05, "loss": 2.1073, "step": 671 }, { "epoch": 0.19109344360385552, "grad_norm": 2.0932371616363525, "learning_rate": 3.2703246193622524e-05, "loss": 2.1358, "step": 672 }, { "epoch": 0.19137780884731365, "grad_norm": 2.68083119392395, "learning_rate": 3.269175524274634e-05, "loss": 3.3797, "step": 673 }, { "epoch": 0.19166217409077174, "grad_norm": 2.340952157974243, "learning_rate": 3.268026429187015e-05, "loss": 2.7555, "step": 674 }, { "epoch": 0.19194653933422987, "grad_norm": 2.199805736541748, "learning_rate": 3.266877334099397e-05, "loss": 2.6694, "step": 675 }, { "epoch": 0.192230904577688, "grad_norm": 2.1734702587127686, "learning_rate": 3.265728239011778e-05, "loss": 2.8257, "step": 676 }, { "epoch": 0.1925152698211461, "grad_norm": 2.420964479446411, "learning_rate": 3.26457914392416e-05, "loss": 2.4486, "step": 677 }, { "epoch": 0.19279963506460424, "grad_norm": 1.9791522026062012, "learning_rate": 3.263430048836541e-05, "loss": 2.3201, "step": 678 }, { "epoch": 0.19308400030806236, "grad_norm": 2.2870163917541504, "learning_rate": 3.262280953748923e-05, "loss": 2.1877, "step": 679 }, { "epoch": 0.19336836555152045, "grad_norm": 2.085327625274658, "learning_rate": 3.261131858661304e-05, "loss": 2.0054, "step": 680 }, { "epoch": 0.19365273079497858, "grad_norm": 3.2444210052490234, "learning_rate": 3.259982763573686e-05, "loss": 3.2208, "step": 681 }, { "epoch": 0.1939370960384367, "grad_norm": 2.47835111618042, "learning_rate": 3.2588336684860676e-05, "loss": 3.1121, "step": 682 }, { "epoch": 0.19422146128189482, "grad_norm": 2.056704521179199, "learning_rate": 3.257684573398449e-05, "loss": 2.8531, "step": 683 }, { "epoch": 0.19450582652535295, "grad_norm": 1.9477993249893188, "learning_rate": 3.2565354783108305e-05, "loss": 2.4773, "step": 684 }, { "epoch": 0.19479019176881104, "grad_norm": 2.019652843475342, "learning_rate": 3.2553863832232116e-05, "loss": 2.5318, "step": 685 }, { "epoch": 0.19507455701226917, "grad_norm": 2.164877414703369, "learning_rate": 3.2542372881355934e-05, "loss": 2.2676, "step": 686 }, { "epoch": 0.1953589222557273, "grad_norm": 1.9417436122894287, "learning_rate": 3.253088193047975e-05, "loss": 2.1097, "step": 687 }, { "epoch": 0.19564328749918541, "grad_norm": 2.0217719078063965, "learning_rate": 3.251939097960357e-05, "loss": 1.9464, "step": 688 }, { "epoch": 0.19592765274264354, "grad_norm": 2.3088855743408203, "learning_rate": 3.250790002872738e-05, "loss": 3.1466, "step": 689 }, { "epoch": 0.19621201798610166, "grad_norm": 2.2107951641082764, "learning_rate": 3.24964090778512e-05, "loss": 2.898, "step": 690 }, { "epoch": 0.19649638322955976, "grad_norm": 1.8729802370071411, "learning_rate": 3.248491812697501e-05, "loss": 2.5574, "step": 691 }, { "epoch": 0.19678074847301788, "grad_norm": 1.948519229888916, "learning_rate": 3.247342717609883e-05, "loss": 2.644, "step": 692 }, { "epoch": 0.197065113716476, "grad_norm": 2.479282855987549, "learning_rate": 3.246193622522264e-05, "loss": 2.6382, "step": 693 }, { "epoch": 0.19734947895993413, "grad_norm": 2.2494328022003174, "learning_rate": 3.245044527434646e-05, "loss": 2.0895, "step": 694 }, { "epoch": 0.19763384420339225, "grad_norm": 1.9501408338546753, "learning_rate": 3.243895432347027e-05, "loss": 2.2874, "step": 695 }, { "epoch": 0.19791820944685035, "grad_norm": 1.865615963935852, "learning_rate": 3.2427463372594086e-05, "loss": 2.0848, "step": 696 }, { "epoch": 0.19820257469030847, "grad_norm": 2.376199722290039, "learning_rate": 3.2415972421717904e-05, "loss": 3.3114, "step": 697 }, { "epoch": 0.1984869399337666, "grad_norm": 1.978826880455017, "learning_rate": 3.2404481470841715e-05, "loss": 2.6896, "step": 698 }, { "epoch": 0.19877130517722472, "grad_norm": 1.8862276077270508, "learning_rate": 3.239299051996553e-05, "loss": 2.4757, "step": 699 }, { "epoch": 0.19905567042068284, "grad_norm": 2.253225564956665, "learning_rate": 3.2381499569089344e-05, "loss": 2.7394, "step": 700 }, { "epoch": 0.19934003566414096, "grad_norm": 2.258186101913452, "learning_rate": 3.237000861821316e-05, "loss": 2.6104, "step": 701 }, { "epoch": 0.19962440090759906, "grad_norm": 1.78932785987854, "learning_rate": 3.235851766733697e-05, "loss": 2.2367, "step": 702 }, { "epoch": 0.19990876615105718, "grad_norm": 1.7839140892028809, "learning_rate": 3.234702671646079e-05, "loss": 2.0307, "step": 703 }, { "epoch": 0.2001931313945153, "grad_norm": 1.9078693389892578, "learning_rate": 3.23355357655846e-05, "loss": 1.8974, "step": 704 }, { "epoch": 0.20047749663797343, "grad_norm": 2.489147901535034, "learning_rate": 3.232404481470842e-05, "loss": 3.0721, "step": 705 }, { "epoch": 0.20076186188143155, "grad_norm": 2.5214760303497314, "learning_rate": 3.231255386383224e-05, "loss": 2.9676, "step": 706 }, { "epoch": 0.20104622712488965, "grad_norm": 2.182615280151367, "learning_rate": 3.230106291295605e-05, "loss": 2.9474, "step": 707 }, { "epoch": 0.20133059236834777, "grad_norm": 1.98862624168396, "learning_rate": 3.2289571962079866e-05, "loss": 2.7995, "step": 708 }, { "epoch": 0.2016149576118059, "grad_norm": 1.9838632345199585, "learning_rate": 3.227808101120368e-05, "loss": 2.4476, "step": 709 }, { "epoch": 0.20189932285526402, "grad_norm": 1.9730520248413086, "learning_rate": 3.2266590060327495e-05, "loss": 2.3272, "step": 710 }, { "epoch": 0.20218368809872214, "grad_norm": 1.880563497543335, "learning_rate": 3.2255099109451307e-05, "loss": 2.0332, "step": 711 }, { "epoch": 0.20246805334218027, "grad_norm": 1.99998140335083, "learning_rate": 3.2243608158575124e-05, "loss": 2.2419, "step": 712 }, { "epoch": 0.20275241858563836, "grad_norm": 2.910835027694702, "learning_rate": 3.2232117207698936e-05, "loss": 3.2948, "step": 713 }, { "epoch": 0.20303678382909648, "grad_norm": 2.386836528778076, "learning_rate": 3.2220626256822753e-05, "loss": 2.7419, "step": 714 }, { "epoch": 0.2033211490725546, "grad_norm": 1.9169079065322876, "learning_rate": 3.220913530594657e-05, "loss": 2.6171, "step": 715 }, { "epoch": 0.20360551431601273, "grad_norm": 1.7654900550842285, "learning_rate": 3.219764435507038e-05, "loss": 2.7043, "step": 716 }, { "epoch": 0.20388987955947085, "grad_norm": 1.8636305332183838, "learning_rate": 3.21861534041942e-05, "loss": 2.4612, "step": 717 }, { "epoch": 0.20417424480292895, "grad_norm": 2.11191463470459, "learning_rate": 3.217466245331801e-05, "loss": 2.3006, "step": 718 }, { "epoch": 0.20445861004638707, "grad_norm": 2.05869460105896, "learning_rate": 3.216317150244183e-05, "loss": 2.2266, "step": 719 }, { "epoch": 0.2047429752898452, "grad_norm": 1.921918511390686, "learning_rate": 3.215168055156564e-05, "loss": 1.9656, "step": 720 }, { "epoch": 0.20502734053330332, "grad_norm": 3.3702752590179443, "learning_rate": 3.214018960068946e-05, "loss": 3.208, "step": 721 }, { "epoch": 0.20531170577676144, "grad_norm": 2.2111897468566895, "learning_rate": 3.212869864981327e-05, "loss": 2.8252, "step": 722 }, { "epoch": 0.20559607102021957, "grad_norm": 1.9135637283325195, "learning_rate": 3.2117207698937094e-05, "loss": 2.3325, "step": 723 }, { "epoch": 0.20588043626367766, "grad_norm": 1.9720232486724854, "learning_rate": 3.2105716748060905e-05, "loss": 2.5609, "step": 724 }, { "epoch": 0.2061648015071358, "grad_norm": 2.031601667404175, "learning_rate": 3.209422579718472e-05, "loss": 2.6579, "step": 725 }, { "epoch": 0.2064491667505939, "grad_norm": 1.992948055267334, "learning_rate": 3.2082734846308534e-05, "loss": 2.4377, "step": 726 }, { "epoch": 0.20673353199405203, "grad_norm": 2.026014566421509, "learning_rate": 3.207124389543235e-05, "loss": 1.9672, "step": 727 }, { "epoch": 0.20701789723751016, "grad_norm": 1.9836902618408203, "learning_rate": 3.205975294455616e-05, "loss": 2.2928, "step": 728 }, { "epoch": 0.20730226248096825, "grad_norm": 2.541131019592285, "learning_rate": 3.204826199367998e-05, "loss": 3.3255, "step": 729 }, { "epoch": 0.20758662772442638, "grad_norm": 2.0298244953155518, "learning_rate": 3.20367710428038e-05, "loss": 2.6907, "step": 730 }, { "epoch": 0.2078709929678845, "grad_norm": 1.8890761137008667, "learning_rate": 3.202528009192761e-05, "loss": 2.5107, "step": 731 }, { "epoch": 0.20815535821134262, "grad_norm": 1.8663610219955444, "learning_rate": 3.201378914105143e-05, "loss": 2.4509, "step": 732 }, { "epoch": 0.20843972345480075, "grad_norm": 1.906890630722046, "learning_rate": 3.200229819017524e-05, "loss": 2.5638, "step": 733 }, { "epoch": 0.20872408869825884, "grad_norm": 1.9237983226776123, "learning_rate": 3.199080723929906e-05, "loss": 2.4034, "step": 734 }, { "epoch": 0.20900845394171697, "grad_norm": 2.0110905170440674, "learning_rate": 3.197931628842287e-05, "loss": 2.242, "step": 735 }, { "epoch": 0.2092928191851751, "grad_norm": 2.0842487812042236, "learning_rate": 3.1967825337546686e-05, "loss": 2.1287, "step": 736 }, { "epoch": 0.2095771844286332, "grad_norm": 2.772614002227783, "learning_rate": 3.19563343866705e-05, "loss": 3.0786, "step": 737 }, { "epoch": 0.20986154967209134, "grad_norm": 2.3587253093719482, "learning_rate": 3.1944843435794315e-05, "loss": 2.7391, "step": 738 }, { "epoch": 0.21014591491554946, "grad_norm": 1.897799015045166, "learning_rate": 3.193335248491813e-05, "loss": 2.5878, "step": 739 }, { "epoch": 0.21043028015900755, "grad_norm": 1.919022560119629, "learning_rate": 3.1921861534041944e-05, "loss": 2.6763, "step": 740 }, { "epoch": 0.21071464540246568, "grad_norm": 2.1270596981048584, "learning_rate": 3.191037058316576e-05, "loss": 2.3583, "step": 741 }, { "epoch": 0.2109990106459238, "grad_norm": 1.8899205923080444, "learning_rate": 3.189887963228957e-05, "loss": 2.4652, "step": 742 }, { "epoch": 0.21128337588938192, "grad_norm": 1.9961659908294678, "learning_rate": 3.188738868141339e-05, "loss": 2.275, "step": 743 }, { "epoch": 0.21156774113284005, "grad_norm": 2.287783622741699, "learning_rate": 3.18758977305372e-05, "loss": 2.0559, "step": 744 }, { "epoch": 0.21185210637629814, "grad_norm": 2.3738584518432617, "learning_rate": 3.186440677966102e-05, "loss": 3.1947, "step": 745 }, { "epoch": 0.21213647161975627, "grad_norm": 2.055588960647583, "learning_rate": 3.185291582878483e-05, "loss": 2.881, "step": 746 }, { "epoch": 0.2124208368632144, "grad_norm": 2.0361130237579346, "learning_rate": 3.184142487790865e-05, "loss": 2.6499, "step": 747 }, { "epoch": 0.21270520210667251, "grad_norm": 1.918145775794983, "learning_rate": 3.182993392703247e-05, "loss": 2.3748, "step": 748 }, { "epoch": 0.21298956735013064, "grad_norm": 2.2371819019317627, "learning_rate": 3.181844297615628e-05, "loss": 2.5316, "step": 749 }, { "epoch": 0.21327393259358876, "grad_norm": 2.0490219593048096, "learning_rate": 3.1806952025280096e-05, "loss": 2.3496, "step": 750 }, { "epoch": 0.21355829783704686, "grad_norm": 1.884697437286377, "learning_rate": 3.179546107440391e-05, "loss": 2.168, "step": 751 }, { "epoch": 0.21384266308050498, "grad_norm": 1.9147650003433228, "learning_rate": 3.1783970123527725e-05, "loss": 2.1791, "step": 752 }, { "epoch": 0.2141270283239631, "grad_norm": 2.1511213779449463, "learning_rate": 3.1772479172651536e-05, "loss": 3.073, "step": 753 }, { "epoch": 0.21441139356742123, "grad_norm": 1.8281089067459106, "learning_rate": 3.1760988221775354e-05, "loss": 2.7645, "step": 754 }, { "epoch": 0.21469575881087935, "grad_norm": 1.7798327207565308, "learning_rate": 3.1749497270899165e-05, "loss": 2.6769, "step": 755 }, { "epoch": 0.21498012405433745, "grad_norm": 1.9437909126281738, "learning_rate": 3.173800632002298e-05, "loss": 2.7385, "step": 756 }, { "epoch": 0.21526448929779557, "grad_norm": 2.2421510219573975, "learning_rate": 3.17265153691468e-05, "loss": 2.3904, "step": 757 }, { "epoch": 0.2155488545412537, "grad_norm": 2.002697467803955, "learning_rate": 3.171502441827061e-05, "loss": 2.1965, "step": 758 }, { "epoch": 0.21583321978471182, "grad_norm": 2.018341541290283, "learning_rate": 3.170353346739443e-05, "loss": 2.1783, "step": 759 }, { "epoch": 0.21611758502816994, "grad_norm": 1.9171587228775024, "learning_rate": 3.169204251651825e-05, "loss": 1.9632, "step": 760 }, { "epoch": 0.21640195027162806, "grad_norm": 2.128800868988037, "learning_rate": 3.168055156564206e-05, "loss": 3.1176, "step": 761 }, { "epoch": 0.21668631551508616, "grad_norm": 2.2135539054870605, "learning_rate": 3.1669060614765876e-05, "loss": 2.9083, "step": 762 }, { "epoch": 0.21697068075854428, "grad_norm": 2.01832914352417, "learning_rate": 3.1657569663889694e-05, "loss": 2.5822, "step": 763 }, { "epoch": 0.2172550460020024, "grad_norm": 1.827118992805481, "learning_rate": 3.1646078713013505e-05, "loss": 2.5527, "step": 764 }, { "epoch": 0.21753941124546053, "grad_norm": 1.9703270196914673, "learning_rate": 3.163458776213732e-05, "loss": 2.61, "step": 765 }, { "epoch": 0.21782377648891865, "grad_norm": 2.0175693035125732, "learning_rate": 3.1623096811261134e-05, "loss": 2.1853, "step": 766 }, { "epoch": 0.21810814173237675, "grad_norm": 2.11944580078125, "learning_rate": 3.161160586038495e-05, "loss": 1.7576, "step": 767 }, { "epoch": 0.21839250697583487, "grad_norm": 2.100249767303467, "learning_rate": 3.1600114909508763e-05, "loss": 1.9592, "step": 768 }, { "epoch": 0.218676872219293, "grad_norm": 2.8305320739746094, "learning_rate": 3.158862395863258e-05, "loss": 3.3594, "step": 769 }, { "epoch": 0.21896123746275112, "grad_norm": 2.1420650482177734, "learning_rate": 3.157713300775639e-05, "loss": 2.847, "step": 770 }, { "epoch": 0.21924560270620924, "grad_norm": 1.9816900491714478, "learning_rate": 3.156564205688021e-05, "loss": 2.5702, "step": 771 }, { "epoch": 0.21952996794966737, "grad_norm": 1.9326121807098389, "learning_rate": 3.155415110600403e-05, "loss": 2.6882, "step": 772 }, { "epoch": 0.21981433319312546, "grad_norm": 2.1733944416046143, "learning_rate": 3.154266015512784e-05, "loss": 2.7475, "step": 773 }, { "epoch": 0.22009869843658358, "grad_norm": 1.7192537784576416, "learning_rate": 3.153116920425166e-05, "loss": 2.3554, "step": 774 }, { "epoch": 0.2203830636800417, "grad_norm": 1.9640390872955322, "learning_rate": 3.151967825337547e-05, "loss": 2.1575, "step": 775 }, { "epoch": 0.22066742892349983, "grad_norm": 2.0537362098693848, "learning_rate": 3.1508187302499286e-05, "loss": 2.2478, "step": 776 }, { "epoch": 0.22095179416695795, "grad_norm": 2.381430149078369, "learning_rate": 3.14966963516231e-05, "loss": 2.936, "step": 777 }, { "epoch": 0.22123615941041605, "grad_norm": 2.202801465988159, "learning_rate": 3.1485205400746915e-05, "loss": 2.9745, "step": 778 }, { "epoch": 0.22152052465387417, "grad_norm": 1.809591293334961, "learning_rate": 3.1473714449870726e-05, "loss": 2.6893, "step": 779 }, { "epoch": 0.2218048898973323, "grad_norm": 1.792960524559021, "learning_rate": 3.1462223498994544e-05, "loss": 2.4085, "step": 780 }, { "epoch": 0.22208925514079042, "grad_norm": 1.961672067642212, "learning_rate": 3.145073254811836e-05, "loss": 2.5024, "step": 781 }, { "epoch": 0.22237362038424854, "grad_norm": 1.8171442747116089, "learning_rate": 3.143924159724217e-05, "loss": 2.3142, "step": 782 }, { "epoch": 0.22265798562770667, "grad_norm": 1.9773929119110107, "learning_rate": 3.142775064636599e-05, "loss": 2.1619, "step": 783 }, { "epoch": 0.22294235087116476, "grad_norm": 1.815152645111084, "learning_rate": 3.14162596954898e-05, "loss": 2.0866, "step": 784 }, { "epoch": 0.2232267161146229, "grad_norm": 2.7723894119262695, "learning_rate": 3.140476874461362e-05, "loss": 3.3017, "step": 785 }, { "epoch": 0.223511081358081, "grad_norm": 1.9993256330490112, "learning_rate": 3.139327779373743e-05, "loss": 2.6683, "step": 786 }, { "epoch": 0.22379544660153913, "grad_norm": 1.7609732151031494, "learning_rate": 3.138178684286125e-05, "loss": 2.5893, "step": 787 }, { "epoch": 0.22407981184499726, "grad_norm": 1.7786259651184082, "learning_rate": 3.137029589198506e-05, "loss": 2.4328, "step": 788 }, { "epoch": 0.22436417708845535, "grad_norm": 1.887364149093628, "learning_rate": 3.135880494110888e-05, "loss": 2.5221, "step": 789 }, { "epoch": 0.22464854233191348, "grad_norm": 1.8501337766647339, "learning_rate": 3.1347313990232696e-05, "loss": 2.2003, "step": 790 }, { "epoch": 0.2249329075753716, "grad_norm": 1.7945717573165894, "learning_rate": 3.133582303935651e-05, "loss": 2.1568, "step": 791 }, { "epoch": 0.22521727281882972, "grad_norm": 1.8720512390136719, "learning_rate": 3.1324332088480325e-05, "loss": 2.104, "step": 792 }, { "epoch": 0.22550163806228785, "grad_norm": 2.1769962310791016, "learning_rate": 3.1312841137604136e-05, "loss": 3.2416, "step": 793 }, { "epoch": 0.22578600330574594, "grad_norm": 2.1001853942871094, "learning_rate": 3.1301350186727954e-05, "loss": 2.6237, "step": 794 }, { "epoch": 0.22607036854920406, "grad_norm": 1.819381594657898, "learning_rate": 3.1289859235851765e-05, "loss": 2.5273, "step": 795 }, { "epoch": 0.2263547337926622, "grad_norm": 1.706929326057434, "learning_rate": 3.127836828497558e-05, "loss": 2.5795, "step": 796 }, { "epoch": 0.2266390990361203, "grad_norm": 2.0382096767425537, "learning_rate": 3.12668773340994e-05, "loss": 2.5978, "step": 797 }, { "epoch": 0.22692346427957844, "grad_norm": 1.7794004678726196, "learning_rate": 3.125538638322322e-05, "loss": 2.2474, "step": 798 }, { "epoch": 0.22720782952303656, "grad_norm": 1.8751369714736938, "learning_rate": 3.124389543234703e-05, "loss": 2.0745, "step": 799 }, { "epoch": 0.22749219476649465, "grad_norm": 1.9768788814544678, "learning_rate": 3.123240448147085e-05, "loss": 2.2523, "step": 800 }, { "epoch": 0.22777656000995278, "grad_norm": 2.430659532546997, "learning_rate": 3.122091353059466e-05, "loss": 3.2516, "step": 801 }, { "epoch": 0.2280609252534109, "grad_norm": 1.9983506202697754, "learning_rate": 3.120942257971848e-05, "loss": 2.7283, "step": 802 }, { "epoch": 0.22834529049686902, "grad_norm": 1.8245633840560913, "learning_rate": 3.119793162884229e-05, "loss": 2.4896, "step": 803 }, { "epoch": 0.22862965574032715, "grad_norm": 1.724178433418274, "learning_rate": 3.1186440677966106e-05, "loss": 2.5445, "step": 804 }, { "epoch": 0.22891402098378524, "grad_norm": 1.851968765258789, "learning_rate": 3.1174949727089924e-05, "loss": 2.3454, "step": 805 }, { "epoch": 0.22919838622724337, "grad_norm": 1.7947938442230225, "learning_rate": 3.1163458776213735e-05, "loss": 2.2357, "step": 806 }, { "epoch": 0.2294827514707015, "grad_norm": 2.108400821685791, "learning_rate": 3.115196782533755e-05, "loss": 2.2438, "step": 807 }, { "epoch": 0.2297671167141596, "grad_norm": 2.0238804817199707, "learning_rate": 3.1140476874461364e-05, "loss": 1.9718, "step": 808 }, { "epoch": 0.23005148195761774, "grad_norm": 2.4201109409332275, "learning_rate": 3.112898592358518e-05, "loss": 3.2521, "step": 809 }, { "epoch": 0.23033584720107586, "grad_norm": 1.9365209341049194, "learning_rate": 3.111749497270899e-05, "loss": 2.9715, "step": 810 }, { "epoch": 0.23062021244453396, "grad_norm": 1.670932650566101, "learning_rate": 3.110600402183281e-05, "loss": 2.6555, "step": 811 }, { "epoch": 0.23090457768799208, "grad_norm": 1.8441979885101318, "learning_rate": 3.109451307095662e-05, "loss": 2.6097, "step": 812 }, { "epoch": 0.2311889429314502, "grad_norm": 2.0533034801483154, "learning_rate": 3.108302212008044e-05, "loss": 2.4527, "step": 813 }, { "epoch": 0.23147330817490833, "grad_norm": 1.9839503765106201, "learning_rate": 3.107153116920426e-05, "loss": 2.3322, "step": 814 }, { "epoch": 0.23175767341836645, "grad_norm": 1.8730031251907349, "learning_rate": 3.106004021832807e-05, "loss": 2.0321, "step": 815 }, { "epoch": 0.23204203866182455, "grad_norm": 2.0080573558807373, "learning_rate": 3.1048549267451886e-05, "loss": 2.295, "step": 816 }, { "epoch": 0.23232640390528267, "grad_norm": 2.2020275592803955, "learning_rate": 3.10370583165757e-05, "loss": 3.0295, "step": 817 }, { "epoch": 0.2326107691487408, "grad_norm": 1.7750929594039917, "learning_rate": 3.1025567365699515e-05, "loss": 2.8698, "step": 818 }, { "epoch": 0.23289513439219892, "grad_norm": 1.6771564483642578, "learning_rate": 3.1014076414823326e-05, "loss": 2.417, "step": 819 }, { "epoch": 0.23317949963565704, "grad_norm": 1.8319196701049805, "learning_rate": 3.1002585463947144e-05, "loss": 2.3378, "step": 820 }, { "epoch": 0.23346386487911516, "grad_norm": 2.1067543029785156, "learning_rate": 3.0991094513070955e-05, "loss": 2.434, "step": 821 }, { "epoch": 0.23374823012257326, "grad_norm": 2.0174291133880615, "learning_rate": 3.097960356219477e-05, "loss": 2.2196, "step": 822 }, { "epoch": 0.23403259536603138, "grad_norm": 1.8617424964904785, "learning_rate": 3.096811261131859e-05, "loss": 2.0332, "step": 823 }, { "epoch": 0.2343169606094895, "grad_norm": 1.8981695175170898, "learning_rate": 3.09566216604424e-05, "loss": 2.0747, "step": 824 }, { "epoch": 0.23460132585294763, "grad_norm": 2.3870060443878174, "learning_rate": 3.094513070956622e-05, "loss": 3.1643, "step": 825 }, { "epoch": 0.23488569109640575, "grad_norm": 2.0108344554901123, "learning_rate": 3.093363975869003e-05, "loss": 2.8178, "step": 826 }, { "epoch": 0.23517005633986385, "grad_norm": 2.198622941970825, "learning_rate": 3.092214880781385e-05, "loss": 2.7069, "step": 827 }, { "epoch": 0.23545442158332197, "grad_norm": 2.032430648803711, "learning_rate": 3.091065785693766e-05, "loss": 2.4352, "step": 828 }, { "epoch": 0.2357387868267801, "grad_norm": 1.968604564666748, "learning_rate": 3.089916690606148e-05, "loss": 2.2391, "step": 829 }, { "epoch": 0.23602315207023822, "grad_norm": 1.6530208587646484, "learning_rate": 3.088767595518529e-05, "loss": 2.223, "step": 830 }, { "epoch": 0.23630751731369634, "grad_norm": 2.004181385040283, "learning_rate": 3.087618500430911e-05, "loss": 2.0482, "step": 831 }, { "epoch": 0.23659188255715446, "grad_norm": 2.0865046977996826, "learning_rate": 3.0864694053432925e-05, "loss": 2.1199, "step": 832 }, { "epoch": 0.23687624780061256, "grad_norm": 2.4281649589538574, "learning_rate": 3.0853203102556736e-05, "loss": 3.1147, "step": 833 }, { "epoch": 0.23716061304407068, "grad_norm": 2.327183246612549, "learning_rate": 3.0841712151680554e-05, "loss": 2.7999, "step": 834 }, { "epoch": 0.2374449782875288, "grad_norm": 1.8897318840026855, "learning_rate": 3.083022120080437e-05, "loss": 2.6127, "step": 835 }, { "epoch": 0.23772934353098693, "grad_norm": 1.755764126777649, "learning_rate": 3.081873024992818e-05, "loss": 2.5684, "step": 836 }, { "epoch": 0.23801370877444505, "grad_norm": 1.9734392166137695, "learning_rate": 3.0807239299052e-05, "loss": 2.3049, "step": 837 }, { "epoch": 0.23829807401790315, "grad_norm": 1.9231913089752197, "learning_rate": 3.079574834817582e-05, "loss": 2.3946, "step": 838 }, { "epoch": 0.23858243926136127, "grad_norm": 2.0235962867736816, "learning_rate": 3.078425739729963e-05, "loss": 2.2068, "step": 839 }, { "epoch": 0.2388668045048194, "grad_norm": 2.155137538909912, "learning_rate": 3.077276644642345e-05, "loss": 2.0929, "step": 840 }, { "epoch": 0.23915116974827752, "grad_norm": 2.36690092086792, "learning_rate": 3.076127549554726e-05, "loss": 3.1167, "step": 841 }, { "epoch": 0.23943553499173564, "grad_norm": 2.195228099822998, "learning_rate": 3.074978454467108e-05, "loss": 3.0185, "step": 842 }, { "epoch": 0.23971990023519377, "grad_norm": 1.9150617122650146, "learning_rate": 3.073829359379489e-05, "loss": 2.6971, "step": 843 }, { "epoch": 0.24000426547865186, "grad_norm": 1.8287773132324219, "learning_rate": 3.0726802642918706e-05, "loss": 2.6606, "step": 844 }, { "epoch": 0.24028863072210999, "grad_norm": 1.9123808145523071, "learning_rate": 3.071531169204252e-05, "loss": 2.3256, "step": 845 }, { "epoch": 0.2405729959655681, "grad_norm": 2.077585458755493, "learning_rate": 3.0703820741166335e-05, "loss": 2.1924, "step": 846 }, { "epoch": 0.24085736120902623, "grad_norm": 2.0052852630615234, "learning_rate": 3.069232979029015e-05, "loss": 1.979, "step": 847 }, { "epoch": 0.24114172645248436, "grad_norm": 2.1259374618530273, "learning_rate": 3.0680838839413964e-05, "loss": 2.0226, "step": 848 }, { "epoch": 0.24142609169594245, "grad_norm": 2.7498273849487305, "learning_rate": 3.066934788853778e-05, "loss": 3.2752, "step": 849 }, { "epoch": 0.24171045693940058, "grad_norm": 1.9522055387496948, "learning_rate": 3.065785693766159e-05, "loss": 2.7302, "step": 850 }, { "epoch": 0.2419948221828587, "grad_norm": 1.9767372608184814, "learning_rate": 3.064636598678541e-05, "loss": 2.7091, "step": 851 }, { "epoch": 0.24227918742631682, "grad_norm": 1.8098385334014893, "learning_rate": 3.063487503590922e-05, "loss": 2.5372, "step": 852 }, { "epoch": 0.24256355266977495, "grad_norm": 1.995309829711914, "learning_rate": 3.062338408503304e-05, "loss": 2.3491, "step": 853 }, { "epoch": 0.24284791791323304, "grad_norm": 1.838243842124939, "learning_rate": 3.061189313415685e-05, "loss": 2.3286, "step": 854 }, { "epoch": 0.24313228315669116, "grad_norm": 1.8721081018447876, "learning_rate": 3.060040218328067e-05, "loss": 2.1154, "step": 855 }, { "epoch": 0.2434166484001493, "grad_norm": 2.050607919692993, "learning_rate": 3.0588911232404487e-05, "loss": 1.938, "step": 856 }, { "epoch": 0.2437010136436074, "grad_norm": 2.1715247631073, "learning_rate": 3.05774202815283e-05, "loss": 2.9436, "step": 857 }, { "epoch": 0.24398537888706553, "grad_norm": 2.1600615978240967, "learning_rate": 3.0565929330652116e-05, "loss": 2.6224, "step": 858 }, { "epoch": 0.24426974413052366, "grad_norm": 1.890982747077942, "learning_rate": 3.055443837977593e-05, "loss": 2.6999, "step": 859 }, { "epoch": 0.24455410937398175, "grad_norm": 1.9753085374832153, "learning_rate": 3.0542947428899745e-05, "loss": 2.4645, "step": 860 }, { "epoch": 0.24483847461743988, "grad_norm": 2.096250295639038, "learning_rate": 3.0531456478023556e-05, "loss": 2.534, "step": 861 }, { "epoch": 0.245122839860898, "grad_norm": 1.9475681781768799, "learning_rate": 3.0519965527147374e-05, "loss": 2.1234, "step": 862 }, { "epoch": 0.24540720510435612, "grad_norm": 1.8136141300201416, "learning_rate": 3.0508474576271188e-05, "loss": 1.917, "step": 863 }, { "epoch": 0.24569157034781425, "grad_norm": 2.0293166637420654, "learning_rate": 3.0496983625395003e-05, "loss": 2.2178, "step": 864 }, { "epoch": 0.24597593559127234, "grad_norm": 2.072183847427368, "learning_rate": 3.0485492674518817e-05, "loss": 2.9106, "step": 865 }, { "epoch": 0.24626030083473047, "grad_norm": 1.7826175689697266, "learning_rate": 3.047400172364263e-05, "loss": 2.9296, "step": 866 }, { "epoch": 0.2465446660781886, "grad_norm": 1.876105546951294, "learning_rate": 3.0462510772766446e-05, "loss": 2.6027, "step": 867 }, { "epoch": 0.2468290313216467, "grad_norm": 1.8935751914978027, "learning_rate": 3.045101982189026e-05, "loss": 2.4412, "step": 868 }, { "epoch": 0.24711339656510484, "grad_norm": 2.0665159225463867, "learning_rate": 3.043952887101408e-05, "loss": 2.271, "step": 869 }, { "epoch": 0.24739776180856296, "grad_norm": 1.7189526557922363, "learning_rate": 3.0428037920137893e-05, "loss": 2.2023, "step": 870 }, { "epoch": 0.24768212705202106, "grad_norm": 1.9814810752868652, "learning_rate": 3.041654696926171e-05, "loss": 2.1293, "step": 871 }, { "epoch": 0.24796649229547918, "grad_norm": 1.9453988075256348, "learning_rate": 3.0405056018385525e-05, "loss": 2.0083, "step": 872 }, { "epoch": 0.2482508575389373, "grad_norm": 2.7860076427459717, "learning_rate": 3.039356506750934e-05, "loss": 3.0749, "step": 873 }, { "epoch": 0.24853522278239543, "grad_norm": 1.9733988046646118, "learning_rate": 3.0382074116633154e-05, "loss": 2.6634, "step": 874 }, { "epoch": 0.24881958802585355, "grad_norm": 1.8551710844039917, "learning_rate": 3.0370583165756972e-05, "loss": 2.4293, "step": 875 }, { "epoch": 0.24910395326931165, "grad_norm": 1.7632685899734497, "learning_rate": 3.0359092214880787e-05, "loss": 2.2907, "step": 876 }, { "epoch": 0.24938831851276977, "grad_norm": 1.9409010410308838, "learning_rate": 3.03476012640046e-05, "loss": 2.4988, "step": 877 }, { "epoch": 0.2496726837562279, "grad_norm": 1.7946302890777588, "learning_rate": 3.0336110313128416e-05, "loss": 2.0384, "step": 878 }, { "epoch": 0.24995704899968602, "grad_norm": 2.018123149871826, "learning_rate": 3.032461936225223e-05, "loss": 1.9721, "step": 879 }, { "epoch": 0.2502414142431441, "grad_norm": 1.960157036781311, "learning_rate": 3.0313128411376045e-05, "loss": 2.194, "step": 880 }, { "epoch": 0.25052577948660226, "grad_norm": 2.0886595249176025, "learning_rate": 3.030163746049986e-05, "loss": 2.9749, "step": 881 }, { "epoch": 0.25081014473006036, "grad_norm": 1.888434886932373, "learning_rate": 3.0290146509623674e-05, "loss": 2.8315, "step": 882 }, { "epoch": 0.2510945099735185, "grad_norm": 1.9592381715774536, "learning_rate": 3.0278655558747488e-05, "loss": 2.4951, "step": 883 }, { "epoch": 0.2513788752169766, "grad_norm": 1.7165327072143555, "learning_rate": 3.0267164607871306e-05, "loss": 2.5657, "step": 884 }, { "epoch": 0.2516632404604347, "grad_norm": 2.0365653038024902, "learning_rate": 3.025567365699512e-05, "loss": 2.5552, "step": 885 }, { "epoch": 0.25194760570389285, "grad_norm": 1.9661747217178345, "learning_rate": 3.0244182706118935e-05, "loss": 2.2448, "step": 886 }, { "epoch": 0.25223197094735095, "grad_norm": 1.8233864307403564, "learning_rate": 3.023269175524275e-05, "loss": 2.0185, "step": 887 }, { "epoch": 0.2525163361908091, "grad_norm": 2.1719167232513428, "learning_rate": 3.0221200804366564e-05, "loss": 2.224, "step": 888 }, { "epoch": 0.2528007014342672, "grad_norm": 2.498225450515747, "learning_rate": 3.020970985349038e-05, "loss": 3.2098, "step": 889 }, { "epoch": 0.2530850666777253, "grad_norm": 1.9225410223007202, "learning_rate": 3.0198218902614193e-05, "loss": 2.5611, "step": 890 }, { "epoch": 0.25336943192118344, "grad_norm": 1.8655143976211548, "learning_rate": 3.0186727951738008e-05, "loss": 2.6871, "step": 891 }, { "epoch": 0.25365379716464154, "grad_norm": 1.7953119277954102, "learning_rate": 3.0175237000861822e-05, "loss": 2.6092, "step": 892 }, { "epoch": 0.2539381624080997, "grad_norm": 2.0147297382354736, "learning_rate": 3.016374604998564e-05, "loss": 2.45, "step": 893 }, { "epoch": 0.2542225276515578, "grad_norm": 1.827158808708191, "learning_rate": 3.0152255099109454e-05, "loss": 2.0963, "step": 894 }, { "epoch": 0.2545068928950159, "grad_norm": 1.9638267755508423, "learning_rate": 3.014076414823327e-05, "loss": 1.9395, "step": 895 }, { "epoch": 0.25479125813847403, "grad_norm": 1.9481165409088135, "learning_rate": 3.0129273197357083e-05, "loss": 2.171, "step": 896 }, { "epoch": 0.2550756233819321, "grad_norm": 2.1484596729278564, "learning_rate": 3.0117782246480898e-05, "loss": 3.081, "step": 897 }, { "epoch": 0.2553599886253903, "grad_norm": 1.8102747201919556, "learning_rate": 3.0106291295604712e-05, "loss": 2.6534, "step": 898 }, { "epoch": 0.2556443538688484, "grad_norm": 1.7093149423599243, "learning_rate": 3.0094800344728527e-05, "loss": 2.5904, "step": 899 }, { "epoch": 0.2559287191123065, "grad_norm": 1.7110258340835571, "learning_rate": 3.008330939385234e-05, "loss": 2.6272, "step": 900 }, { "epoch": 0.2562130843557646, "grad_norm": 1.8184210062026978, "learning_rate": 3.0071818442976156e-05, "loss": 2.3048, "step": 901 }, { "epoch": 0.2564974495992227, "grad_norm": 1.7951569557189941, "learning_rate": 3.0060327492099974e-05, "loss": 2.1765, "step": 902 }, { "epoch": 0.25678181484268087, "grad_norm": 1.8752012252807617, "learning_rate": 3.0048836541223788e-05, "loss": 2.065, "step": 903 }, { "epoch": 0.25706618008613896, "grad_norm": 1.838429570198059, "learning_rate": 3.0037345590347603e-05, "loss": 1.9967, "step": 904 }, { "epoch": 0.2573505453295971, "grad_norm": 2.226714849472046, "learning_rate": 3.0025854639471417e-05, "loss": 3.2629, "step": 905 }, { "epoch": 0.2576349105730552, "grad_norm": 1.8551560640335083, "learning_rate": 3.0014363688595232e-05, "loss": 2.7787, "step": 906 }, { "epoch": 0.2579192758165133, "grad_norm": 1.9678069353103638, "learning_rate": 3.0002872737719046e-05, "loss": 2.7359, "step": 907 }, { "epoch": 0.25820364105997146, "grad_norm": 1.7606141567230225, "learning_rate": 2.9991381786842864e-05, "loss": 2.4614, "step": 908 }, { "epoch": 0.25848800630342955, "grad_norm": 1.9030214548110962, "learning_rate": 2.9979890835966682e-05, "loss": 2.6742, "step": 909 }, { "epoch": 0.2587723715468877, "grad_norm": 1.7596023082733154, "learning_rate": 2.9968399885090497e-05, "loss": 2.4495, "step": 910 }, { "epoch": 0.2590567367903458, "grad_norm": 1.7681137323379517, "learning_rate": 2.995690893421431e-05, "loss": 1.8863, "step": 911 }, { "epoch": 0.2593411020338039, "grad_norm": 1.8547093868255615, "learning_rate": 2.9945417983338126e-05, "loss": 1.8526, "step": 912 }, { "epoch": 0.25962546727726205, "grad_norm": 2.3479535579681396, "learning_rate": 2.993392703246194e-05, "loss": 3.0925, "step": 913 }, { "epoch": 0.25990983252072014, "grad_norm": 2.0483782291412354, "learning_rate": 2.9922436081585755e-05, "loss": 2.8808, "step": 914 }, { "epoch": 0.2601941977641783, "grad_norm": 1.8194479942321777, "learning_rate": 2.991094513070957e-05, "loss": 2.7473, "step": 915 }, { "epoch": 0.2604785630076364, "grad_norm": 1.7477372884750366, "learning_rate": 2.9899454179833384e-05, "loss": 2.5109, "step": 916 }, { "epoch": 0.2607629282510945, "grad_norm": 1.8002467155456543, "learning_rate": 2.9887963228957198e-05, "loss": 2.3975, "step": 917 }, { "epoch": 0.26104729349455263, "grad_norm": 1.7082115411758423, "learning_rate": 2.9876472278081016e-05, "loss": 2.0252, "step": 918 }, { "epoch": 0.26133165873801073, "grad_norm": 1.8353215456008911, "learning_rate": 2.986498132720483e-05, "loss": 1.9635, "step": 919 }, { "epoch": 0.2616160239814689, "grad_norm": 1.9129843711853027, "learning_rate": 2.9853490376328645e-05, "loss": 1.8942, "step": 920 }, { "epoch": 0.261900389224927, "grad_norm": 2.272021770477295, "learning_rate": 2.984199942545246e-05, "loss": 3.1266, "step": 921 }, { "epoch": 0.26218475446838513, "grad_norm": 1.8906854391098022, "learning_rate": 2.9830508474576274e-05, "loss": 2.7551, "step": 922 }, { "epoch": 0.2624691197118432, "grad_norm": 1.786133050918579, "learning_rate": 2.981901752370009e-05, "loss": 2.529, "step": 923 }, { "epoch": 0.2627534849553013, "grad_norm": 1.7091472148895264, "learning_rate": 2.9807526572823903e-05, "loss": 2.4381, "step": 924 }, { "epoch": 0.26303785019875947, "grad_norm": 2.077052354812622, "learning_rate": 2.9796035621947717e-05, "loss": 2.4781, "step": 925 }, { "epoch": 0.26332221544221757, "grad_norm": 1.774610996246338, "learning_rate": 2.9784544671071532e-05, "loss": 2.156, "step": 926 }, { "epoch": 0.2636065806856757, "grad_norm": 1.7777446508407593, "learning_rate": 2.977305372019535e-05, "loss": 2.1573, "step": 927 }, { "epoch": 0.2638909459291338, "grad_norm": 1.9106909036636353, "learning_rate": 2.9761562769319164e-05, "loss": 1.9771, "step": 928 }, { "epoch": 0.2641753111725919, "grad_norm": 2.073526620864868, "learning_rate": 2.975007181844298e-05, "loss": 3.0332, "step": 929 }, { "epoch": 0.26445967641605006, "grad_norm": 1.7309014797210693, "learning_rate": 2.9738580867566793e-05, "loss": 2.7578, "step": 930 }, { "epoch": 0.26474404165950816, "grad_norm": 1.7269665002822876, "learning_rate": 2.9727089916690608e-05, "loss": 2.5512, "step": 931 }, { "epoch": 0.2650284069029663, "grad_norm": 1.8180932998657227, "learning_rate": 2.9715598965814422e-05, "loss": 2.6815, "step": 932 }, { "epoch": 0.2653127721464244, "grad_norm": 1.8024221658706665, "learning_rate": 2.9704108014938237e-05, "loss": 2.306, "step": 933 }, { "epoch": 0.2655971373898825, "grad_norm": 1.8379640579223633, "learning_rate": 2.969261706406205e-05, "loss": 2.1043, "step": 934 }, { "epoch": 0.26588150263334065, "grad_norm": 1.738338828086853, "learning_rate": 2.9681126113185866e-05, "loss": 1.9482, "step": 935 }, { "epoch": 0.26616586787679875, "grad_norm": 1.7984474897384644, "learning_rate": 2.9669635162309684e-05, "loss": 2.0764, "step": 936 }, { "epoch": 0.2664502331202569, "grad_norm": 2.280787229537964, "learning_rate": 2.9658144211433498e-05, "loss": 2.9034, "step": 937 }, { "epoch": 0.266734598363715, "grad_norm": 1.7945760488510132, "learning_rate": 2.9646653260557313e-05, "loss": 2.5243, "step": 938 }, { "epoch": 0.2670189636071731, "grad_norm": 1.812246322631836, "learning_rate": 2.9635162309681127e-05, "loss": 2.4984, "step": 939 }, { "epoch": 0.26730332885063124, "grad_norm": 1.9260566234588623, "learning_rate": 2.962367135880494e-05, "loss": 2.5072, "step": 940 }, { "epoch": 0.26758769409408933, "grad_norm": 2.1809868812561035, "learning_rate": 2.9612180407928756e-05, "loss": 2.2971, "step": 941 }, { "epoch": 0.2678720593375475, "grad_norm": 2.036618232727051, "learning_rate": 2.960068945705257e-05, "loss": 2.3523, "step": 942 }, { "epoch": 0.2681564245810056, "grad_norm": 2.0626561641693115, "learning_rate": 2.9589198506176385e-05, "loss": 2.1757, "step": 943 }, { "epoch": 0.26844078982446373, "grad_norm": 1.8505395650863647, "learning_rate": 2.95777075553002e-05, "loss": 2.0444, "step": 944 }, { "epoch": 0.26872515506792183, "grad_norm": 2.1572868824005127, "learning_rate": 2.956621660442402e-05, "loss": 3.1699, "step": 945 }, { "epoch": 0.2690095203113799, "grad_norm": 2.3714983463287354, "learning_rate": 2.9554725653547835e-05, "loss": 2.6934, "step": 946 }, { "epoch": 0.2692938855548381, "grad_norm": 2.13543701171875, "learning_rate": 2.954323470267165e-05, "loss": 2.4942, "step": 947 }, { "epoch": 0.26957825079829617, "grad_norm": 2.133861541748047, "learning_rate": 2.9531743751795464e-05, "loss": 2.5744, "step": 948 }, { "epoch": 0.2698626160417543, "grad_norm": 2.4103362560272217, "learning_rate": 2.952025280091928e-05, "loss": 2.4989, "step": 949 }, { "epoch": 0.2701469812852124, "grad_norm": 1.9884144067764282, "learning_rate": 2.9508761850043093e-05, "loss": 2.3324, "step": 950 }, { "epoch": 0.2704313465286705, "grad_norm": 1.9855095148086548, "learning_rate": 2.949727089916691e-05, "loss": 2.2511, "step": 951 }, { "epoch": 0.27071571177212866, "grad_norm": 1.8973934650421143, "learning_rate": 2.9485779948290726e-05, "loss": 2.0523, "step": 952 }, { "epoch": 0.27100007701558676, "grad_norm": 2.413694143295288, "learning_rate": 2.947428899741454e-05, "loss": 3.0049, "step": 953 }, { "epoch": 0.2712844422590449, "grad_norm": 2.2881429195404053, "learning_rate": 2.9462798046538355e-05, "loss": 2.5529, "step": 954 }, { "epoch": 0.271568807502503, "grad_norm": 1.9285370111465454, "learning_rate": 2.945130709566217e-05, "loss": 2.3945, "step": 955 }, { "epoch": 0.2718531727459611, "grad_norm": 1.9740569591522217, "learning_rate": 2.9439816144785984e-05, "loss": 2.4983, "step": 956 }, { "epoch": 0.27213753798941925, "grad_norm": 1.8763717412948608, "learning_rate": 2.9428325193909798e-05, "loss": 2.4825, "step": 957 }, { "epoch": 0.27242190323287735, "grad_norm": 1.8496977090835571, "learning_rate": 2.9416834243033613e-05, "loss": 2.1357, "step": 958 }, { "epoch": 0.2727062684763355, "grad_norm": 1.9461162090301514, "learning_rate": 2.9405343292157427e-05, "loss": 2.1733, "step": 959 }, { "epoch": 0.2729906337197936, "grad_norm": 2.133432388305664, "learning_rate": 2.9393852341281245e-05, "loss": 2.145, "step": 960 }, { "epoch": 0.2732749989632517, "grad_norm": 2.7687673568725586, "learning_rate": 2.938236139040506e-05, "loss": 3.0316, "step": 961 }, { "epoch": 0.27355936420670984, "grad_norm": 1.9343767166137695, "learning_rate": 2.9370870439528874e-05, "loss": 2.6818, "step": 962 }, { "epoch": 0.27384372945016794, "grad_norm": 1.7821241617202759, "learning_rate": 2.935937948865269e-05, "loss": 2.5806, "step": 963 }, { "epoch": 0.2741280946936261, "grad_norm": 1.7667675018310547, "learning_rate": 2.9347888537776503e-05, "loss": 2.6895, "step": 964 }, { "epoch": 0.2744124599370842, "grad_norm": 1.863686203956604, "learning_rate": 2.9336397586900318e-05, "loss": 2.3492, "step": 965 }, { "epoch": 0.2746968251805423, "grad_norm": 1.9857769012451172, "learning_rate": 2.9324906636024132e-05, "loss": 2.3327, "step": 966 }, { "epoch": 0.27498119042400043, "grad_norm": 1.7995944023132324, "learning_rate": 2.9313415685147947e-05, "loss": 2.0556, "step": 967 }, { "epoch": 0.27526555566745853, "grad_norm": 1.8642942905426025, "learning_rate": 2.930192473427176e-05, "loss": 1.875, "step": 968 }, { "epoch": 0.2755499209109167, "grad_norm": 1.9679466485977173, "learning_rate": 2.929043378339558e-05, "loss": 3.0327, "step": 969 }, { "epoch": 0.2758342861543748, "grad_norm": 1.868360161781311, "learning_rate": 2.9278942832519394e-05, "loss": 2.4991, "step": 970 }, { "epoch": 0.2761186513978329, "grad_norm": 1.9371778964996338, "learning_rate": 2.9267451881643208e-05, "loss": 2.651, "step": 971 }, { "epoch": 0.276403016641291, "grad_norm": 1.7306244373321533, "learning_rate": 2.9255960930767022e-05, "loss": 2.4744, "step": 972 }, { "epoch": 0.2766873818847491, "grad_norm": 1.9607441425323486, "learning_rate": 2.9244469979890837e-05, "loss": 2.3596, "step": 973 }, { "epoch": 0.27697174712820727, "grad_norm": 1.9397883415222168, "learning_rate": 2.923297902901465e-05, "loss": 2.1509, "step": 974 }, { "epoch": 0.27725611237166536, "grad_norm": 1.9228720664978027, "learning_rate": 2.9221488078138466e-05, "loss": 1.9686, "step": 975 }, { "epoch": 0.2775404776151235, "grad_norm": 1.9440864324569702, "learning_rate": 2.920999712726228e-05, "loss": 2.0284, "step": 976 }, { "epoch": 0.2778248428585816, "grad_norm": 2.285125970840454, "learning_rate": 2.9198506176386095e-05, "loss": 2.8307, "step": 977 }, { "epoch": 0.2781092081020397, "grad_norm": 1.86907160282135, "learning_rate": 2.9187015225509913e-05, "loss": 2.6328, "step": 978 }, { "epoch": 0.27839357334549786, "grad_norm": 1.7151695489883423, "learning_rate": 2.9175524274633727e-05, "loss": 2.7284, "step": 979 }, { "epoch": 0.27867793858895595, "grad_norm": 1.6392958164215088, "learning_rate": 2.9164033323757542e-05, "loss": 2.6941, "step": 980 }, { "epoch": 0.2789623038324141, "grad_norm": 1.9084131717681885, "learning_rate": 2.9152542372881356e-05, "loss": 2.2339, "step": 981 }, { "epoch": 0.2792466690758722, "grad_norm": 1.7988026142120361, "learning_rate": 2.9141051422005174e-05, "loss": 2.1509, "step": 982 }, { "epoch": 0.2795310343193303, "grad_norm": 1.8712517023086548, "learning_rate": 2.912956047112899e-05, "loss": 1.9558, "step": 983 }, { "epoch": 0.27981539956278845, "grad_norm": 2.0422396659851074, "learning_rate": 2.9118069520252807e-05, "loss": 2.1704, "step": 984 }, { "epoch": 0.28009976480624654, "grad_norm": 2.0253920555114746, "learning_rate": 2.910657856937662e-05, "loss": 3.0636, "step": 985 }, { "epoch": 0.2803841300497047, "grad_norm": 2.0420773029327393, "learning_rate": 2.9095087618500436e-05, "loss": 2.556, "step": 986 }, { "epoch": 0.2806684952931628, "grad_norm": 1.864678978919983, "learning_rate": 2.908359666762425e-05, "loss": 2.4779, "step": 987 }, { "epoch": 0.2809528605366209, "grad_norm": 1.7852369546890259, "learning_rate": 2.9072105716748065e-05, "loss": 2.409, "step": 988 }, { "epoch": 0.28123722578007904, "grad_norm": 1.8626235723495483, "learning_rate": 2.906061476587188e-05, "loss": 2.2681, "step": 989 }, { "epoch": 0.28152159102353713, "grad_norm": 2.052464485168457, "learning_rate": 2.9049123814995694e-05, "loss": 2.3266, "step": 990 }, { "epoch": 0.2818059562669953, "grad_norm": 1.991090178489685, "learning_rate": 2.9037632864119508e-05, "loss": 2.1501, "step": 991 }, { "epoch": 0.2820903215104534, "grad_norm": 1.8865472078323364, "learning_rate": 2.9026141913243323e-05, "loss": 2.0682, "step": 992 }, { "epoch": 0.28237468675391153, "grad_norm": 2.1116745471954346, "learning_rate": 2.901465096236714e-05, "loss": 2.9731, "step": 993 }, { "epoch": 0.2826590519973696, "grad_norm": 1.8774727582931519, "learning_rate": 2.9003160011490955e-05, "loss": 2.7532, "step": 994 }, { "epoch": 0.2829434172408277, "grad_norm": 1.7747597694396973, "learning_rate": 2.899166906061477e-05, "loss": 2.6523, "step": 995 }, { "epoch": 0.2832277824842859, "grad_norm": 1.8037197589874268, "learning_rate": 2.8980178109738584e-05, "loss": 2.5828, "step": 996 }, { "epoch": 0.28351214772774397, "grad_norm": 2.067134141921997, "learning_rate": 2.89686871588624e-05, "loss": 2.5429, "step": 997 }, { "epoch": 0.2837965129712021, "grad_norm": 2.0115020275115967, "learning_rate": 2.8957196207986213e-05, "loss": 2.1842, "step": 998 }, { "epoch": 0.2840808782146602, "grad_norm": 1.8412772417068481, "learning_rate": 2.8945705257110027e-05, "loss": 2.1457, "step": 999 }, { "epoch": 0.2843652434581183, "grad_norm": 1.7536171674728394, "learning_rate": 2.8934214306233842e-05, "loss": 1.9325, "step": 1000 }, { "epoch": 0.28464960870157646, "grad_norm": 2.1407527923583984, "learning_rate": 2.8922723355357656e-05, "loss": 3.1672, "step": 1001 }, { "epoch": 0.28493397394503456, "grad_norm": 1.750623345375061, "learning_rate": 2.8911232404481474e-05, "loss": 2.5675, "step": 1002 }, { "epoch": 0.2852183391884927, "grad_norm": 1.8202438354492188, "learning_rate": 2.889974145360529e-05, "loss": 2.5414, "step": 1003 }, { "epoch": 0.2855027044319508, "grad_norm": 1.8455865383148193, "learning_rate": 2.8888250502729103e-05, "loss": 2.4263, "step": 1004 }, { "epoch": 0.2857870696754089, "grad_norm": 1.9973098039627075, "learning_rate": 2.8876759551852918e-05, "loss": 2.2655, "step": 1005 }, { "epoch": 0.28607143491886705, "grad_norm": 1.8605165481567383, "learning_rate": 2.8865268600976732e-05, "loss": 2.0906, "step": 1006 }, { "epoch": 0.28635580016232515, "grad_norm": 1.9424145221710205, "learning_rate": 2.8853777650100547e-05, "loss": 2.1295, "step": 1007 }, { "epoch": 0.2866401654057833, "grad_norm": 1.8717995882034302, "learning_rate": 2.884228669922436e-05, "loss": 2.0872, "step": 1008 }, { "epoch": 0.2869245306492414, "grad_norm": 1.9393534660339355, "learning_rate": 2.8830795748348176e-05, "loss": 3.0324, "step": 1009 }, { "epoch": 0.2872088958926995, "grad_norm": 2.0312883853912354, "learning_rate": 2.881930479747199e-05, "loss": 2.7265, "step": 1010 }, { "epoch": 0.28749326113615764, "grad_norm": 2.081376791000366, "learning_rate": 2.8807813846595808e-05, "loss": 2.5069, "step": 1011 }, { "epoch": 0.28777762637961574, "grad_norm": 2.1162948608398438, "learning_rate": 2.8796322895719623e-05, "loss": 2.4216, "step": 1012 }, { "epoch": 0.2880619916230739, "grad_norm": 1.9769059419631958, "learning_rate": 2.8784831944843437e-05, "loss": 2.3386, "step": 1013 }, { "epoch": 0.288346356866532, "grad_norm": 1.8281911611557007, "learning_rate": 2.8773340993967252e-05, "loss": 2.152, "step": 1014 }, { "epoch": 0.2886307221099901, "grad_norm": 1.8548706769943237, "learning_rate": 2.8761850043091066e-05, "loss": 1.9768, "step": 1015 }, { "epoch": 0.28891508735344823, "grad_norm": 1.9629120826721191, "learning_rate": 2.875035909221488e-05, "loss": 2.0301, "step": 1016 }, { "epoch": 0.2891994525969063, "grad_norm": 2.21577525138855, "learning_rate": 2.8738868141338695e-05, "loss": 2.7658, "step": 1017 }, { "epoch": 0.2894838178403645, "grad_norm": 1.955876111984253, "learning_rate": 2.872737719046251e-05, "loss": 2.5863, "step": 1018 }, { "epoch": 0.2897681830838226, "grad_norm": 1.8049885034561157, "learning_rate": 2.871588623958633e-05, "loss": 2.7557, "step": 1019 }, { "epoch": 0.2900525483272807, "grad_norm": 1.851650357246399, "learning_rate": 2.8704395288710145e-05, "loss": 2.3583, "step": 1020 }, { "epoch": 0.2903369135707388, "grad_norm": 1.8646175861358643, "learning_rate": 2.869290433783396e-05, "loss": 2.3485, "step": 1021 }, { "epoch": 0.2906212788141969, "grad_norm": 1.6461886167526245, "learning_rate": 2.8681413386957774e-05, "loss": 2.2649, "step": 1022 }, { "epoch": 0.29090564405765507, "grad_norm": 1.8911197185516357, "learning_rate": 2.866992243608159e-05, "loss": 2.0709, "step": 1023 }, { "epoch": 0.29119000930111316, "grad_norm": 1.8676636219024658, "learning_rate": 2.8658431485205403e-05, "loss": 2.1007, "step": 1024 }, { "epoch": 0.2914743745445713, "grad_norm": 2.4858603477478027, "learning_rate": 2.8646940534329218e-05, "loss": 3.0081, "step": 1025 }, { "epoch": 0.2917587397880294, "grad_norm": 1.9424095153808594, "learning_rate": 2.8635449583453036e-05, "loss": 2.6527, "step": 1026 }, { "epoch": 0.2920431050314875, "grad_norm": 1.7152304649353027, "learning_rate": 2.862395863257685e-05, "loss": 2.6888, "step": 1027 }, { "epoch": 0.29232747027494566, "grad_norm": 1.6697314977645874, "learning_rate": 2.8612467681700665e-05, "loss": 2.5852, "step": 1028 }, { "epoch": 0.29261183551840375, "grad_norm": 1.8688958883285522, "learning_rate": 2.860097673082448e-05, "loss": 2.4024, "step": 1029 }, { "epoch": 0.2928962007618619, "grad_norm": 1.8187808990478516, "learning_rate": 2.8589485779948294e-05, "loss": 2.1133, "step": 1030 }, { "epoch": 0.29318056600532, "grad_norm": 1.6755000352859497, "learning_rate": 2.857799482907211e-05, "loss": 1.9874, "step": 1031 }, { "epoch": 0.2934649312487781, "grad_norm": 1.9520643949508667, "learning_rate": 2.8566503878195923e-05, "loss": 2.0306, "step": 1032 }, { "epoch": 0.29374929649223624, "grad_norm": 2.2317845821380615, "learning_rate": 2.8555012927319737e-05, "loss": 2.956, "step": 1033 }, { "epoch": 0.29403366173569434, "grad_norm": 1.9723296165466309, "learning_rate": 2.8543521976443552e-05, "loss": 2.6379, "step": 1034 }, { "epoch": 0.2943180269791525, "grad_norm": 1.8846988677978516, "learning_rate": 2.853203102556737e-05, "loss": 2.7342, "step": 1035 }, { "epoch": 0.2946023922226106, "grad_norm": 1.7443634271621704, "learning_rate": 2.8520540074691184e-05, "loss": 2.437, "step": 1036 }, { "epoch": 0.2948867574660687, "grad_norm": 1.8220562934875488, "learning_rate": 2.8509049123815e-05, "loss": 2.1899, "step": 1037 }, { "epoch": 0.29517112270952683, "grad_norm": 1.8991121053695679, "learning_rate": 2.8497558172938813e-05, "loss": 2.087, "step": 1038 }, { "epoch": 0.29545548795298493, "grad_norm": 1.7543880939483643, "learning_rate": 2.8486067222062628e-05, "loss": 1.7512, "step": 1039 }, { "epoch": 0.2957398531964431, "grad_norm": 1.95134699344635, "learning_rate": 2.8474576271186442e-05, "loss": 2.0684, "step": 1040 }, { "epoch": 0.2960242184399012, "grad_norm": 2.655040979385376, "learning_rate": 2.8463085320310257e-05, "loss": 2.9242, "step": 1041 }, { "epoch": 0.29630858368335933, "grad_norm": 1.837572693824768, "learning_rate": 2.845159436943407e-05, "loss": 2.4314, "step": 1042 }, { "epoch": 0.2965929489268174, "grad_norm": 1.903134822845459, "learning_rate": 2.8440103418557886e-05, "loss": 2.6958, "step": 1043 }, { "epoch": 0.2968773141702755, "grad_norm": 1.6535474061965942, "learning_rate": 2.8428612467681704e-05, "loss": 2.301, "step": 1044 }, { "epoch": 0.29716167941373367, "grad_norm": 2.0542876720428467, "learning_rate": 2.8417121516805518e-05, "loss": 2.4347, "step": 1045 }, { "epoch": 0.29744604465719177, "grad_norm": 1.8558207750320435, "learning_rate": 2.8405630565929333e-05, "loss": 2.2441, "step": 1046 }, { "epoch": 0.2977304099006499, "grad_norm": 1.7878432273864746, "learning_rate": 2.8394139615053147e-05, "loss": 2.0426, "step": 1047 }, { "epoch": 0.298014775144108, "grad_norm": 1.9130539894104004, "learning_rate": 2.838264866417696e-05, "loss": 2.1265, "step": 1048 }, { "epoch": 0.2982991403875661, "grad_norm": 2.4959192276000977, "learning_rate": 2.8371157713300776e-05, "loss": 2.8223, "step": 1049 }, { "epoch": 0.29858350563102426, "grad_norm": 1.7817506790161133, "learning_rate": 2.835966676242459e-05, "loss": 2.6125, "step": 1050 }, { "epoch": 0.29886787087448236, "grad_norm": 1.8295172452926636, "learning_rate": 2.8348175811548405e-05, "loss": 2.5712, "step": 1051 }, { "epoch": 0.2991522361179405, "grad_norm": 1.658846139907837, "learning_rate": 2.833668486067222e-05, "loss": 2.4154, "step": 1052 }, { "epoch": 0.2994366013613986, "grad_norm": 1.8340063095092773, "learning_rate": 2.8325193909796037e-05, "loss": 2.3476, "step": 1053 }, { "epoch": 0.2997209666048567, "grad_norm": 1.722588062286377, "learning_rate": 2.8313702958919852e-05, "loss": 2.0986, "step": 1054 }, { "epoch": 0.30000533184831485, "grad_norm": 1.6516780853271484, "learning_rate": 2.8302212008043666e-05, "loss": 2.1311, "step": 1055 }, { "epoch": 0.30028969709177294, "grad_norm": 2.02481746673584, "learning_rate": 2.8290721057167484e-05, "loss": 1.7839, "step": 1056 }, { "epoch": 0.3005740623352311, "grad_norm": 2.038386821746826, "learning_rate": 2.82792301062913e-05, "loss": 2.9095, "step": 1057 }, { "epoch": 0.3008584275786892, "grad_norm": 1.7752598524093628, "learning_rate": 2.8267739155415113e-05, "loss": 2.9031, "step": 1058 }, { "epoch": 0.3011427928221473, "grad_norm": 1.6934603452682495, "learning_rate": 2.825624820453893e-05, "loss": 2.4481, "step": 1059 }, { "epoch": 0.30142715806560544, "grad_norm": 1.6980534791946411, "learning_rate": 2.8244757253662746e-05, "loss": 2.4492, "step": 1060 }, { "epoch": 0.30171152330906353, "grad_norm": 1.7166688442230225, "learning_rate": 2.823326630278656e-05, "loss": 2.2513, "step": 1061 }, { "epoch": 0.3019958885525217, "grad_norm": 1.8027641773223877, "learning_rate": 2.8221775351910375e-05, "loss": 2.2973, "step": 1062 }, { "epoch": 0.3022802537959798, "grad_norm": 2.2114596366882324, "learning_rate": 2.821028440103419e-05, "loss": 1.9347, "step": 1063 }, { "epoch": 0.30256461903943793, "grad_norm": 1.8748873472213745, "learning_rate": 2.8198793450158004e-05, "loss": 1.8972, "step": 1064 }, { "epoch": 0.302848984282896, "grad_norm": 1.924533486366272, "learning_rate": 2.8187302499281818e-05, "loss": 3.2114, "step": 1065 }, { "epoch": 0.3031333495263541, "grad_norm": 1.7401739358901978, "learning_rate": 2.8175811548405633e-05, "loss": 2.7883, "step": 1066 }, { "epoch": 0.3034177147698123, "grad_norm": 1.6388260126113892, "learning_rate": 2.8164320597529447e-05, "loss": 2.5342, "step": 1067 }, { "epoch": 0.30370208001327037, "grad_norm": 1.66873037815094, "learning_rate": 2.8152829646653265e-05, "loss": 2.348, "step": 1068 }, { "epoch": 0.3039864452567285, "grad_norm": 1.798418402671814, "learning_rate": 2.814133869577708e-05, "loss": 2.3285, "step": 1069 }, { "epoch": 0.3042708105001866, "grad_norm": 1.6459908485412598, "learning_rate": 2.8129847744900894e-05, "loss": 1.9097, "step": 1070 }, { "epoch": 0.3045551757436447, "grad_norm": 1.6589969396591187, "learning_rate": 2.811835679402471e-05, "loss": 1.9083, "step": 1071 }, { "epoch": 0.30483954098710286, "grad_norm": 1.7330818176269531, "learning_rate": 2.8106865843148523e-05, "loss": 2.0022, "step": 1072 }, { "epoch": 0.30512390623056096, "grad_norm": 2.162515878677368, "learning_rate": 2.8095374892272338e-05, "loss": 3.2133, "step": 1073 }, { "epoch": 0.3054082714740191, "grad_norm": 1.890426516532898, "learning_rate": 2.8083883941396152e-05, "loss": 2.5098, "step": 1074 }, { "epoch": 0.3056926367174772, "grad_norm": 1.9428486824035645, "learning_rate": 2.8072392990519967e-05, "loss": 2.8393, "step": 1075 }, { "epoch": 0.3059770019609353, "grad_norm": 1.7772703170776367, "learning_rate": 2.806090203964378e-05, "loss": 2.5238, "step": 1076 }, { "epoch": 0.30626136720439345, "grad_norm": 2.0038602352142334, "learning_rate": 2.80494110887676e-05, "loss": 2.5622, "step": 1077 }, { "epoch": 0.30654573244785155, "grad_norm": 1.7946076393127441, "learning_rate": 2.8037920137891413e-05, "loss": 2.1265, "step": 1078 }, { "epoch": 0.3068300976913097, "grad_norm": 1.694143533706665, "learning_rate": 2.8026429187015228e-05, "loss": 2.1898, "step": 1079 }, { "epoch": 0.3071144629347678, "grad_norm": 1.8604297637939453, "learning_rate": 2.8014938236139042e-05, "loss": 1.8618, "step": 1080 }, { "epoch": 0.3073988281782259, "grad_norm": 2.2564175128936768, "learning_rate": 2.8003447285262857e-05, "loss": 2.8106, "step": 1081 }, { "epoch": 0.30768319342168404, "grad_norm": 2.0401079654693604, "learning_rate": 2.799195633438667e-05, "loss": 2.7885, "step": 1082 }, { "epoch": 0.30796755866514214, "grad_norm": 1.7820488214492798, "learning_rate": 2.7980465383510486e-05, "loss": 2.4126, "step": 1083 }, { "epoch": 0.3082519239086003, "grad_norm": 1.7269959449768066, "learning_rate": 2.79689744326343e-05, "loss": 2.3303, "step": 1084 }, { "epoch": 0.3085362891520584, "grad_norm": 1.8052058219909668, "learning_rate": 2.7957483481758115e-05, "loss": 2.3413, "step": 1085 }, { "epoch": 0.3088206543955165, "grad_norm": 1.7034716606140137, "learning_rate": 2.7945992530881933e-05, "loss": 2.2142, "step": 1086 }, { "epoch": 0.30910501963897463, "grad_norm": 1.924057960510254, "learning_rate": 2.7934501580005747e-05, "loss": 2.0875, "step": 1087 }, { "epoch": 0.3093893848824327, "grad_norm": 1.9003182649612427, "learning_rate": 2.7923010629129562e-05, "loss": 1.9661, "step": 1088 }, { "epoch": 0.3096737501258909, "grad_norm": 2.4958977699279785, "learning_rate": 2.7911519678253376e-05, "loss": 3.0476, "step": 1089 }, { "epoch": 0.309958115369349, "grad_norm": 2.0897626876831055, "learning_rate": 2.790002872737719e-05, "loss": 2.5666, "step": 1090 }, { "epoch": 0.3102424806128071, "grad_norm": 1.7425907850265503, "learning_rate": 2.7888537776501005e-05, "loss": 2.3865, "step": 1091 }, { "epoch": 0.3105268458562652, "grad_norm": 1.892893671989441, "learning_rate": 2.787704682562482e-05, "loss": 2.4982, "step": 1092 }, { "epoch": 0.3108112110997233, "grad_norm": 2.0055623054504395, "learning_rate": 2.786555587474864e-05, "loss": 2.3453, "step": 1093 }, { "epoch": 0.31109557634318147, "grad_norm": 1.7691258192062378, "learning_rate": 2.7854064923872456e-05, "loss": 2.1304, "step": 1094 }, { "epoch": 0.31137994158663956, "grad_norm": 1.7920550107955933, "learning_rate": 2.784257397299627e-05, "loss": 2.0597, "step": 1095 }, { "epoch": 0.3116643068300977, "grad_norm": 1.657732367515564, "learning_rate": 2.7831083022120085e-05, "loss": 1.8743, "step": 1096 }, { "epoch": 0.3119486720735558, "grad_norm": 2.2476625442504883, "learning_rate": 2.78195920712439e-05, "loss": 3.1486, "step": 1097 }, { "epoch": 0.3122330373170139, "grad_norm": 1.9806352853775024, "learning_rate": 2.7808101120367714e-05, "loss": 2.6852, "step": 1098 }, { "epoch": 0.31251740256047206, "grad_norm": 1.7334058284759521, "learning_rate": 2.7796610169491528e-05, "loss": 2.5363, "step": 1099 }, { "epoch": 0.31280176780393015, "grad_norm": 1.769322395324707, "learning_rate": 2.7785119218615343e-05, "loss": 2.3318, "step": 1100 }, { "epoch": 0.3130861330473883, "grad_norm": 1.7031893730163574, "learning_rate": 2.777362826773916e-05, "loss": 2.4197, "step": 1101 }, { "epoch": 0.3133704982908464, "grad_norm": 1.877233862876892, "learning_rate": 2.7762137316862975e-05, "loss": 2.049, "step": 1102 }, { "epoch": 0.3136548635343045, "grad_norm": 1.7407324314117432, "learning_rate": 2.775064636598679e-05, "loss": 1.8772, "step": 1103 }, { "epoch": 0.31393922877776265, "grad_norm": 1.7369098663330078, "learning_rate": 2.7739155415110604e-05, "loss": 1.9947, "step": 1104 }, { "epoch": 0.31422359402122074, "grad_norm": 2.291818141937256, "learning_rate": 2.772766446423442e-05, "loss": 2.9378, "step": 1105 }, { "epoch": 0.3145079592646789, "grad_norm": 1.7521897554397583, "learning_rate": 2.7716173513358233e-05, "loss": 2.6269, "step": 1106 }, { "epoch": 0.314792324508137, "grad_norm": 1.6815153360366821, "learning_rate": 2.7704682562482047e-05, "loss": 2.3316, "step": 1107 }, { "epoch": 0.3150766897515951, "grad_norm": 1.6132457256317139, "learning_rate": 2.7693191611605862e-05, "loss": 2.1689, "step": 1108 }, { "epoch": 0.31536105499505324, "grad_norm": 1.8604917526245117, "learning_rate": 2.7681700660729676e-05, "loss": 2.3652, "step": 1109 }, { "epoch": 0.31564542023851133, "grad_norm": 1.6801247596740723, "learning_rate": 2.7670209709853494e-05, "loss": 2.0579, "step": 1110 }, { "epoch": 0.3159297854819695, "grad_norm": 1.8979250192642212, "learning_rate": 2.765871875897731e-05, "loss": 2.0514, "step": 1111 }, { "epoch": 0.3162141507254276, "grad_norm": 1.8273488283157349, "learning_rate": 2.7647227808101123e-05, "loss": 2.0111, "step": 1112 }, { "epoch": 0.31649851596888573, "grad_norm": 2.283964157104492, "learning_rate": 2.7635736857224938e-05, "loss": 2.9822, "step": 1113 }, { "epoch": 0.3167828812123438, "grad_norm": 1.8019022941589355, "learning_rate": 2.7624245906348752e-05, "loss": 2.7109, "step": 1114 }, { "epoch": 0.3170672464558019, "grad_norm": 1.7521454095840454, "learning_rate": 2.7612754955472567e-05, "loss": 2.5455, "step": 1115 }, { "epoch": 0.31735161169926007, "grad_norm": 1.7717288732528687, "learning_rate": 2.760126400459638e-05, "loss": 2.2537, "step": 1116 }, { "epoch": 0.31763597694271817, "grad_norm": 1.9506962299346924, "learning_rate": 2.7589773053720196e-05, "loss": 2.2884, "step": 1117 }, { "epoch": 0.3179203421861763, "grad_norm": 1.6748727560043335, "learning_rate": 2.757828210284401e-05, "loss": 2.1113, "step": 1118 }, { "epoch": 0.3182047074296344, "grad_norm": 1.8400766849517822, "learning_rate": 2.7566791151967828e-05, "loss": 2.0633, "step": 1119 }, { "epoch": 0.3184890726730925, "grad_norm": 1.9170554876327515, "learning_rate": 2.7555300201091643e-05, "loss": 1.9511, "step": 1120 }, { "epoch": 0.31877343791655066, "grad_norm": 2.18106746673584, "learning_rate": 2.7543809250215457e-05, "loss": 3.0597, "step": 1121 }, { "epoch": 0.31905780316000876, "grad_norm": 1.7323399782180786, "learning_rate": 2.753231829933927e-05, "loss": 2.3877, "step": 1122 }, { "epoch": 0.3193421684034669, "grad_norm": 1.7470673322677612, "learning_rate": 2.7520827348463086e-05, "loss": 2.5492, "step": 1123 }, { "epoch": 0.319626533646925, "grad_norm": 1.627826452255249, "learning_rate": 2.75093363975869e-05, "loss": 2.3832, "step": 1124 }, { "epoch": 0.3199108988903831, "grad_norm": 1.8971792459487915, "learning_rate": 2.7497845446710715e-05, "loss": 2.2562, "step": 1125 }, { "epoch": 0.32019526413384125, "grad_norm": 1.8873558044433594, "learning_rate": 2.748635449583453e-05, "loss": 2.1615, "step": 1126 }, { "epoch": 0.32047962937729935, "grad_norm": 1.8183141946792603, "learning_rate": 2.7474863544958344e-05, "loss": 2.0914, "step": 1127 }, { "epoch": 0.3207639946207575, "grad_norm": 1.8885418176651, "learning_rate": 2.7463372594082162e-05, "loss": 1.9792, "step": 1128 }, { "epoch": 0.3210483598642156, "grad_norm": 2.204162359237671, "learning_rate": 2.7451881643205977e-05, "loss": 2.9695, "step": 1129 }, { "epoch": 0.3213327251076737, "grad_norm": 1.727662444114685, "learning_rate": 2.7440390692329794e-05, "loss": 2.5204, "step": 1130 }, { "epoch": 0.32161709035113184, "grad_norm": 1.721366286277771, "learning_rate": 2.742889974145361e-05, "loss": 2.5624, "step": 1131 }, { "epoch": 0.32190145559458994, "grad_norm": 1.598515510559082, "learning_rate": 2.7417408790577423e-05, "loss": 2.3068, "step": 1132 }, { "epoch": 0.3221858208380481, "grad_norm": 1.7761754989624023, "learning_rate": 2.7405917839701238e-05, "loss": 2.3844, "step": 1133 }, { "epoch": 0.3224701860815062, "grad_norm": 1.6555432081222534, "learning_rate": 2.7394426888825056e-05, "loss": 2.0163, "step": 1134 }, { "epoch": 0.32275455132496433, "grad_norm": 1.6981431245803833, "learning_rate": 2.738293593794887e-05, "loss": 1.9993, "step": 1135 }, { "epoch": 0.32303891656842243, "grad_norm": 1.702057957649231, "learning_rate": 2.7371444987072685e-05, "loss": 1.8252, "step": 1136 }, { "epoch": 0.3233232818118805, "grad_norm": 1.9638831615447998, "learning_rate": 2.73599540361965e-05, "loss": 3.1115, "step": 1137 }, { "epoch": 0.3236076470553387, "grad_norm": 1.7202662229537964, "learning_rate": 2.7348463085320314e-05, "loss": 2.5926, "step": 1138 }, { "epoch": 0.32389201229879677, "grad_norm": 1.6885807514190674, "learning_rate": 2.7336972134444128e-05, "loss": 2.5377, "step": 1139 }, { "epoch": 0.3241763775422549, "grad_norm": 1.6235450506210327, "learning_rate": 2.7325481183567943e-05, "loss": 2.5253, "step": 1140 }, { "epoch": 0.324460742785713, "grad_norm": 1.7628118991851807, "learning_rate": 2.7313990232691757e-05, "loss": 2.4275, "step": 1141 }, { "epoch": 0.3247451080291711, "grad_norm": 1.8225659132003784, "learning_rate": 2.7302499281815572e-05, "loss": 2.0278, "step": 1142 }, { "epoch": 0.32502947327262927, "grad_norm": 1.8690272569656372, "learning_rate": 2.729100833093939e-05, "loss": 2.0703, "step": 1143 }, { "epoch": 0.32531383851608736, "grad_norm": 1.951603889465332, "learning_rate": 2.7279517380063204e-05, "loss": 1.7402, "step": 1144 }, { "epoch": 0.3255982037595455, "grad_norm": 2.100116491317749, "learning_rate": 2.726802642918702e-05, "loss": 3.0271, "step": 1145 }, { "epoch": 0.3258825690030036, "grad_norm": 1.7133632898330688, "learning_rate": 2.7256535478310833e-05, "loss": 2.5711, "step": 1146 }, { "epoch": 0.3261669342464617, "grad_norm": 1.6197803020477295, "learning_rate": 2.7245044527434648e-05, "loss": 2.4896, "step": 1147 }, { "epoch": 0.32645129948991986, "grad_norm": 1.7283875942230225, "learning_rate": 2.7233553576558462e-05, "loss": 2.3986, "step": 1148 }, { "epoch": 0.32673566473337795, "grad_norm": 1.7507110834121704, "learning_rate": 2.7222062625682277e-05, "loss": 2.1887, "step": 1149 }, { "epoch": 0.3270200299768361, "grad_norm": 1.7281140089035034, "learning_rate": 2.721057167480609e-05, "loss": 2.2903, "step": 1150 }, { "epoch": 0.3273043952202942, "grad_norm": 1.7815526723861694, "learning_rate": 2.7199080723929906e-05, "loss": 2.0782, "step": 1151 }, { "epoch": 0.3275887604637523, "grad_norm": 1.8577396869659424, "learning_rate": 2.7187589773053724e-05, "loss": 1.7293, "step": 1152 }, { "epoch": 0.32787312570721044, "grad_norm": 2.047935962677002, "learning_rate": 2.7176098822177538e-05, "loss": 2.8512, "step": 1153 }, { "epoch": 0.32815749095066854, "grad_norm": 1.804479718208313, "learning_rate": 2.7164607871301353e-05, "loss": 2.7431, "step": 1154 }, { "epoch": 0.3284418561941267, "grad_norm": 1.7864207029342651, "learning_rate": 2.7153116920425167e-05, "loss": 2.4874, "step": 1155 }, { "epoch": 0.3287262214375848, "grad_norm": 1.6858755350112915, "learning_rate": 2.714162596954898e-05, "loss": 2.3767, "step": 1156 }, { "epoch": 0.3290105866810429, "grad_norm": 1.7890626192092896, "learning_rate": 2.7130135018672796e-05, "loss": 2.2339, "step": 1157 }, { "epoch": 0.32929495192450103, "grad_norm": 1.7149113416671753, "learning_rate": 2.711864406779661e-05, "loss": 2.2292, "step": 1158 }, { "epoch": 0.32957931716795913, "grad_norm": 1.7810120582580566, "learning_rate": 2.7107153116920425e-05, "loss": 2.0133, "step": 1159 }, { "epoch": 0.3298636824114173, "grad_norm": 1.7565312385559082, "learning_rate": 2.709566216604424e-05, "loss": 2.0087, "step": 1160 }, { "epoch": 0.3301480476548754, "grad_norm": 2.1705124378204346, "learning_rate": 2.7084171215168057e-05, "loss": 3.0282, "step": 1161 }, { "epoch": 0.3304324128983335, "grad_norm": 1.7916361093521118, "learning_rate": 2.7072680264291872e-05, "loss": 2.7158, "step": 1162 }, { "epoch": 0.3307167781417916, "grad_norm": 1.6537439823150635, "learning_rate": 2.7061189313415686e-05, "loss": 2.4969, "step": 1163 }, { "epoch": 0.3310011433852497, "grad_norm": 1.7181519269943237, "learning_rate": 2.70496983625395e-05, "loss": 2.3481, "step": 1164 }, { "epoch": 0.33128550862870787, "grad_norm": 1.7014647722244263, "learning_rate": 2.7038207411663315e-05, "loss": 2.2512, "step": 1165 }, { "epoch": 0.33156987387216597, "grad_norm": 1.7426505088806152, "learning_rate": 2.702671646078713e-05, "loss": 2.1619, "step": 1166 }, { "epoch": 0.3318542391156241, "grad_norm": 1.7927415370941162, "learning_rate": 2.701522550991095e-05, "loss": 1.9217, "step": 1167 }, { "epoch": 0.3321386043590822, "grad_norm": 1.9813625812530518, "learning_rate": 2.7003734559034766e-05, "loss": 1.89, "step": 1168 }, { "epoch": 0.3324229696025403, "grad_norm": 2.2803986072540283, "learning_rate": 2.699224360815858e-05, "loss": 3.1151, "step": 1169 }, { "epoch": 0.33270733484599846, "grad_norm": 1.8023364543914795, "learning_rate": 2.6980752657282395e-05, "loss": 2.5497, "step": 1170 }, { "epoch": 0.33299170008945655, "grad_norm": 1.5802347660064697, "learning_rate": 2.696926170640621e-05, "loss": 2.3567, "step": 1171 }, { "epoch": 0.3332760653329147, "grad_norm": 1.7448047399520874, "learning_rate": 2.6957770755530024e-05, "loss": 2.4281, "step": 1172 }, { "epoch": 0.3335604305763728, "grad_norm": 1.9900225400924683, "learning_rate": 2.6946279804653838e-05, "loss": 2.4395, "step": 1173 }, { "epoch": 0.3338447958198309, "grad_norm": 1.7050385475158691, "learning_rate": 2.6934788853777653e-05, "loss": 2.0546, "step": 1174 }, { "epoch": 0.33412916106328905, "grad_norm": 1.9430030584335327, "learning_rate": 2.6923297902901467e-05, "loss": 2.0789, "step": 1175 }, { "epoch": 0.33441352630674714, "grad_norm": 1.910507321357727, "learning_rate": 2.6911806952025285e-05, "loss": 1.9512, "step": 1176 }, { "epoch": 0.3346978915502053, "grad_norm": 2.156999349594116, "learning_rate": 2.69003160011491e-05, "loss": 2.7457, "step": 1177 }, { "epoch": 0.3349822567936634, "grad_norm": 1.7482166290283203, "learning_rate": 2.6888825050272914e-05, "loss": 2.7449, "step": 1178 }, { "epoch": 0.3352666220371215, "grad_norm": 1.692886233329773, "learning_rate": 2.687733409939673e-05, "loss": 2.4689, "step": 1179 }, { "epoch": 0.33555098728057964, "grad_norm": 1.6204313039779663, "learning_rate": 2.6865843148520543e-05, "loss": 2.3802, "step": 1180 }, { "epoch": 0.33583535252403773, "grad_norm": 1.7569348812103271, "learning_rate": 2.6854352197644357e-05, "loss": 2.2957, "step": 1181 }, { "epoch": 0.3361197177674959, "grad_norm": 1.6739596128463745, "learning_rate": 2.6842861246768172e-05, "loss": 1.9977, "step": 1182 }, { "epoch": 0.336404083010954, "grad_norm": 1.858077883720398, "learning_rate": 2.6831370295891986e-05, "loss": 2.0055, "step": 1183 }, { "epoch": 0.33668844825441213, "grad_norm": 1.8774821758270264, "learning_rate": 2.68198793450158e-05, "loss": 1.8794, "step": 1184 }, { "epoch": 0.3369728134978702, "grad_norm": 2.0827038288116455, "learning_rate": 2.680838839413962e-05, "loss": 2.8639, "step": 1185 }, { "epoch": 0.3372571787413283, "grad_norm": 1.8128411769866943, "learning_rate": 2.6796897443263433e-05, "loss": 2.4063, "step": 1186 }, { "epoch": 0.3375415439847865, "grad_norm": 1.9702905416488647, "learning_rate": 2.6785406492387248e-05, "loss": 2.4151, "step": 1187 }, { "epoch": 0.33782590922824457, "grad_norm": 1.90627121925354, "learning_rate": 2.6773915541511062e-05, "loss": 2.4171, "step": 1188 }, { "epoch": 0.3381102744717027, "grad_norm": 2.10098934173584, "learning_rate": 2.6762424590634877e-05, "loss": 2.3142, "step": 1189 }, { "epoch": 0.3383946397151608, "grad_norm": 1.7690738439559937, "learning_rate": 2.675093363975869e-05, "loss": 1.9259, "step": 1190 }, { "epoch": 0.3386790049586189, "grad_norm": 1.794833779335022, "learning_rate": 2.6739442688882506e-05, "loss": 1.9992, "step": 1191 }, { "epoch": 0.33896337020207706, "grad_norm": 1.848533034324646, "learning_rate": 2.672795173800632e-05, "loss": 1.9893, "step": 1192 }, { "epoch": 0.33924773544553516, "grad_norm": 2.1067445278167725, "learning_rate": 2.6716460787130135e-05, "loss": 3.1749, "step": 1193 }, { "epoch": 0.3395321006889933, "grad_norm": 1.9302208423614502, "learning_rate": 2.6704969836253953e-05, "loss": 2.6356, "step": 1194 }, { "epoch": 0.3398164659324514, "grad_norm": 1.7722642421722412, "learning_rate": 2.6693478885377767e-05, "loss": 2.4753, "step": 1195 }, { "epoch": 0.3401008311759095, "grad_norm": 1.7334580421447754, "learning_rate": 2.6681987934501582e-05, "loss": 2.2498, "step": 1196 }, { "epoch": 0.34038519641936765, "grad_norm": 1.7633260488510132, "learning_rate": 2.6670496983625396e-05, "loss": 2.2229, "step": 1197 }, { "epoch": 0.34066956166282575, "grad_norm": 1.721509337425232, "learning_rate": 2.665900603274921e-05, "loss": 2.2436, "step": 1198 }, { "epoch": 0.3409539269062839, "grad_norm": 1.93727445602417, "learning_rate": 2.6647515081873025e-05, "loss": 1.9182, "step": 1199 }, { "epoch": 0.341238292149742, "grad_norm": 1.9112025499343872, "learning_rate": 2.663602413099684e-05, "loss": 1.8985, "step": 1200 }, { "epoch": 0.3415226573932001, "grad_norm": 2.4720232486724854, "learning_rate": 2.6624533180120654e-05, "loss": 2.9957, "step": 1201 }, { "epoch": 0.34180702263665824, "grad_norm": 2.0485682487487793, "learning_rate": 2.661304222924447e-05, "loss": 2.5158, "step": 1202 }, { "epoch": 0.34209138788011634, "grad_norm": 1.8902740478515625, "learning_rate": 2.6601551278368287e-05, "loss": 2.3761, "step": 1203 }, { "epoch": 0.3423757531235745, "grad_norm": 1.7648299932479858, "learning_rate": 2.6590060327492104e-05, "loss": 2.0841, "step": 1204 }, { "epoch": 0.3426601183670326, "grad_norm": 1.8326659202575684, "learning_rate": 2.657856937661592e-05, "loss": 2.3462, "step": 1205 }, { "epoch": 0.3429444836104907, "grad_norm": 1.8549422025680542, "learning_rate": 2.6567078425739733e-05, "loss": 2.1884, "step": 1206 }, { "epoch": 0.34322884885394883, "grad_norm": 1.930391550064087, "learning_rate": 2.6555587474863548e-05, "loss": 1.9793, "step": 1207 }, { "epoch": 0.3435132140974069, "grad_norm": 1.8481113910675049, "learning_rate": 2.6544096523987362e-05, "loss": 2.0828, "step": 1208 }, { "epoch": 0.3437975793408651, "grad_norm": 2.0146400928497314, "learning_rate": 2.653260557311118e-05, "loss": 2.8335, "step": 1209 }, { "epoch": 0.3440819445843232, "grad_norm": 1.820291519165039, "learning_rate": 2.6521114622234995e-05, "loss": 2.7538, "step": 1210 }, { "epoch": 0.3443663098277813, "grad_norm": 1.721548080444336, "learning_rate": 2.650962367135881e-05, "loss": 2.4637, "step": 1211 }, { "epoch": 0.3446506750712394, "grad_norm": 1.7482106685638428, "learning_rate": 2.6498132720482624e-05, "loss": 2.2598, "step": 1212 }, { "epoch": 0.3449350403146975, "grad_norm": 1.8595374822616577, "learning_rate": 2.648664176960644e-05, "loss": 2.4933, "step": 1213 }, { "epoch": 0.34521940555815567, "grad_norm": 1.690114974975586, "learning_rate": 2.6475150818730253e-05, "loss": 2.1184, "step": 1214 }, { "epoch": 0.34550377080161376, "grad_norm": 1.6892638206481934, "learning_rate": 2.6463659867854067e-05, "loss": 1.9942, "step": 1215 }, { "epoch": 0.3457881360450719, "grad_norm": 1.8123127222061157, "learning_rate": 2.6452168916977882e-05, "loss": 2.1055, "step": 1216 }, { "epoch": 0.34607250128853, "grad_norm": 2.2426178455352783, "learning_rate": 2.6440677966101696e-05, "loss": 3.0519, "step": 1217 }, { "epoch": 0.3463568665319881, "grad_norm": 1.7458500862121582, "learning_rate": 2.6429187015225514e-05, "loss": 2.6327, "step": 1218 }, { "epoch": 0.34664123177544626, "grad_norm": 1.7254029512405396, "learning_rate": 2.641769606434933e-05, "loss": 2.4534, "step": 1219 }, { "epoch": 0.34692559701890435, "grad_norm": 1.6477024555206299, "learning_rate": 2.6406205113473143e-05, "loss": 2.3594, "step": 1220 }, { "epoch": 0.3472099622623625, "grad_norm": 1.6863490343093872, "learning_rate": 2.6394714162596958e-05, "loss": 2.2598, "step": 1221 }, { "epoch": 0.3474943275058206, "grad_norm": 1.8328365087509155, "learning_rate": 2.6383223211720772e-05, "loss": 2.1452, "step": 1222 }, { "epoch": 0.3477786927492787, "grad_norm": 1.7846546173095703, "learning_rate": 2.6371732260844587e-05, "loss": 2.0761, "step": 1223 }, { "epoch": 0.34806305799273685, "grad_norm": 2.078010082244873, "learning_rate": 2.63602413099684e-05, "loss": 1.9126, "step": 1224 }, { "epoch": 0.34834742323619494, "grad_norm": 2.1503591537475586, "learning_rate": 2.6348750359092216e-05, "loss": 2.875, "step": 1225 }, { "epoch": 0.3486317884796531, "grad_norm": 1.839574933052063, "learning_rate": 2.633725940821603e-05, "loss": 2.4719, "step": 1226 }, { "epoch": 0.3489161537231112, "grad_norm": 1.8120521306991577, "learning_rate": 2.6325768457339848e-05, "loss": 2.2545, "step": 1227 }, { "epoch": 0.3492005189665693, "grad_norm": 1.7394510507583618, "learning_rate": 2.6314277506463663e-05, "loss": 2.2625, "step": 1228 }, { "epoch": 0.34948488421002744, "grad_norm": 1.7700285911560059, "learning_rate": 2.6302786555587477e-05, "loss": 2.2649, "step": 1229 }, { "epoch": 0.34976924945348553, "grad_norm": 1.6092114448547363, "learning_rate": 2.629129560471129e-05, "loss": 1.9294, "step": 1230 }, { "epoch": 0.3500536146969437, "grad_norm": 1.8207448720932007, "learning_rate": 2.6279804653835106e-05, "loss": 1.9099, "step": 1231 }, { "epoch": 0.3503379799404018, "grad_norm": 1.8957747220993042, "learning_rate": 2.626831370295892e-05, "loss": 1.8944, "step": 1232 }, { "epoch": 0.35062234518385993, "grad_norm": 2.1414735317230225, "learning_rate": 2.6256822752082735e-05, "loss": 2.878, "step": 1233 }, { "epoch": 0.350906710427318, "grad_norm": 1.7470128536224365, "learning_rate": 2.624533180120655e-05, "loss": 2.6301, "step": 1234 }, { "epoch": 0.3511910756707761, "grad_norm": 1.71774423122406, "learning_rate": 2.6233840850330364e-05, "loss": 2.5042, "step": 1235 }, { "epoch": 0.35147544091423427, "grad_norm": 1.7391661405563354, "learning_rate": 2.6222349899454182e-05, "loss": 2.5183, "step": 1236 }, { "epoch": 0.35175980615769237, "grad_norm": 1.9395679235458374, "learning_rate": 2.6210858948577996e-05, "loss": 2.2172, "step": 1237 }, { "epoch": 0.3520441714011505, "grad_norm": 1.9249560832977295, "learning_rate": 2.619936799770181e-05, "loss": 2.0831, "step": 1238 }, { "epoch": 0.3523285366446086, "grad_norm": 1.7382254600524902, "learning_rate": 2.6187877046825625e-05, "loss": 2.1057, "step": 1239 }, { "epoch": 0.3526129018880667, "grad_norm": 1.7688852548599243, "learning_rate": 2.617638609594944e-05, "loss": 1.9215, "step": 1240 }, { "epoch": 0.35289726713152486, "grad_norm": 2.368236541748047, "learning_rate": 2.6164895145073258e-05, "loss": 2.7489, "step": 1241 }, { "epoch": 0.35318163237498296, "grad_norm": 1.797239065170288, "learning_rate": 2.6153404194197072e-05, "loss": 2.568, "step": 1242 }, { "epoch": 0.3534659976184411, "grad_norm": 1.586428165435791, "learning_rate": 2.614191324332089e-05, "loss": 2.2321, "step": 1243 }, { "epoch": 0.3537503628618992, "grad_norm": 1.7970319986343384, "learning_rate": 2.6130422292444705e-05, "loss": 2.405, "step": 1244 }, { "epoch": 0.3540347281053573, "grad_norm": 2.1091277599334717, "learning_rate": 2.611893134156852e-05, "loss": 2.5763, "step": 1245 }, { "epoch": 0.35431909334881545, "grad_norm": 1.7261741161346436, "learning_rate": 2.6107440390692334e-05, "loss": 2.3919, "step": 1246 }, { "epoch": 0.35460345859227355, "grad_norm": 1.808210015296936, "learning_rate": 2.6095949439816148e-05, "loss": 2.0507, "step": 1247 }, { "epoch": 0.3548878238357317, "grad_norm": 2.0044949054718018, "learning_rate": 2.6084458488939963e-05, "loss": 1.9141, "step": 1248 }, { "epoch": 0.3551721890791898, "grad_norm": 2.0011098384857178, "learning_rate": 2.6072967538063777e-05, "loss": 2.7051, "step": 1249 }, { "epoch": 0.3554565543226479, "grad_norm": 1.7383923530578613, "learning_rate": 2.606147658718759e-05, "loss": 2.5117, "step": 1250 }, { "epoch": 0.35574091956610604, "grad_norm": 1.8515959978103638, "learning_rate": 2.6049985636311406e-05, "loss": 2.5653, "step": 1251 }, { "epoch": 0.35602528480956414, "grad_norm": 1.7738157510757446, "learning_rate": 2.6038494685435224e-05, "loss": 2.4249, "step": 1252 }, { "epoch": 0.3563096500530223, "grad_norm": 1.834094524383545, "learning_rate": 2.602700373455904e-05, "loss": 2.3869, "step": 1253 }, { "epoch": 0.3565940152964804, "grad_norm": 1.6823465824127197, "learning_rate": 2.6015512783682853e-05, "loss": 1.9128, "step": 1254 }, { "epoch": 0.35687838053993853, "grad_norm": 1.706059217453003, "learning_rate": 2.6004021832806668e-05, "loss": 1.8869, "step": 1255 }, { "epoch": 0.35716274578339663, "grad_norm": 1.8522998094558716, "learning_rate": 2.5992530881930482e-05, "loss": 1.9591, "step": 1256 }, { "epoch": 0.3574471110268547, "grad_norm": 2.1196982860565186, "learning_rate": 2.5981039931054297e-05, "loss": 2.9377, "step": 1257 }, { "epoch": 0.3577314762703129, "grad_norm": 1.8649749755859375, "learning_rate": 2.596954898017811e-05, "loss": 2.5061, "step": 1258 }, { "epoch": 0.35801584151377097, "grad_norm": 1.6296436786651611, "learning_rate": 2.5958058029301926e-05, "loss": 2.5032, "step": 1259 }, { "epoch": 0.3583002067572291, "grad_norm": 1.5869815349578857, "learning_rate": 2.594656707842574e-05, "loss": 2.4272, "step": 1260 }, { "epoch": 0.3585845720006872, "grad_norm": 1.7618229389190674, "learning_rate": 2.5935076127549558e-05, "loss": 2.3934, "step": 1261 }, { "epoch": 0.3588689372441453, "grad_norm": 1.716230034828186, "learning_rate": 2.5923585176673372e-05, "loss": 2.0693, "step": 1262 }, { "epoch": 0.35915330248760347, "grad_norm": 1.8889720439910889, "learning_rate": 2.5912094225797187e-05, "loss": 1.9207, "step": 1263 }, { "epoch": 0.35943766773106156, "grad_norm": 1.825950264930725, "learning_rate": 2.5900603274921e-05, "loss": 1.867, "step": 1264 }, { "epoch": 0.3597220329745197, "grad_norm": 2.1356546878814697, "learning_rate": 2.5889112324044816e-05, "loss": 3.0785, "step": 1265 }, { "epoch": 0.3600063982179778, "grad_norm": 1.845746636390686, "learning_rate": 2.587762137316863e-05, "loss": 2.608, "step": 1266 }, { "epoch": 0.3602907634614359, "grad_norm": 1.7854721546173096, "learning_rate": 2.5866130422292445e-05, "loss": 2.3848, "step": 1267 }, { "epoch": 0.36057512870489405, "grad_norm": 1.6726619005203247, "learning_rate": 2.585463947141626e-05, "loss": 2.5518, "step": 1268 }, { "epoch": 0.36085949394835215, "grad_norm": 1.716640830039978, "learning_rate": 2.5843148520540077e-05, "loss": 2.3346, "step": 1269 }, { "epoch": 0.3611438591918103, "grad_norm": 1.5924497842788696, "learning_rate": 2.5831657569663892e-05, "loss": 1.9842, "step": 1270 }, { "epoch": 0.3614282244352684, "grad_norm": 1.7810031175613403, "learning_rate": 2.5820166618787706e-05, "loss": 1.8548, "step": 1271 }, { "epoch": 0.3617125896787265, "grad_norm": 1.8115458488464355, "learning_rate": 2.580867566791152e-05, "loss": 2.0252, "step": 1272 }, { "epoch": 0.36199695492218464, "grad_norm": 2.0684995651245117, "learning_rate": 2.5797184717035335e-05, "loss": 2.9993, "step": 1273 }, { "epoch": 0.36228132016564274, "grad_norm": 1.792303204536438, "learning_rate": 2.578569376615915e-05, "loss": 2.5244, "step": 1274 }, { "epoch": 0.3625656854091009, "grad_norm": 1.5862210988998413, "learning_rate": 2.5774202815282964e-05, "loss": 2.4175, "step": 1275 }, { "epoch": 0.362850050652559, "grad_norm": 1.706787109375, "learning_rate": 2.576271186440678e-05, "loss": 2.4619, "step": 1276 }, { "epoch": 0.3631344158960171, "grad_norm": 1.7666358947753906, "learning_rate": 2.5751220913530593e-05, "loss": 2.4914, "step": 1277 }, { "epoch": 0.36341878113947523, "grad_norm": 1.6655855178833008, "learning_rate": 2.5739729962654415e-05, "loss": 2.0598, "step": 1278 }, { "epoch": 0.36370314638293333, "grad_norm": 1.6447235345840454, "learning_rate": 2.572823901177823e-05, "loss": 1.734, "step": 1279 }, { "epoch": 0.3639875116263915, "grad_norm": 1.7836151123046875, "learning_rate": 2.5716748060902044e-05, "loss": 1.9249, "step": 1280 }, { "epoch": 0.3642718768698496, "grad_norm": 1.9049650430679321, "learning_rate": 2.5705257110025858e-05, "loss": 2.6863, "step": 1281 }, { "epoch": 0.3645562421133077, "grad_norm": 1.6191275119781494, "learning_rate": 2.5693766159149673e-05, "loss": 2.4752, "step": 1282 }, { "epoch": 0.3648406073567658, "grad_norm": 1.6854596138000488, "learning_rate": 2.5682275208273487e-05, "loss": 2.4672, "step": 1283 }, { "epoch": 0.3651249726002239, "grad_norm": 1.725872278213501, "learning_rate": 2.56707842573973e-05, "loss": 2.4114, "step": 1284 }, { "epoch": 0.36540933784368207, "grad_norm": 1.7640068531036377, "learning_rate": 2.565929330652112e-05, "loss": 2.1844, "step": 1285 }, { "epoch": 0.36569370308714017, "grad_norm": 1.6957398653030396, "learning_rate": 2.5647802355644934e-05, "loss": 2.1487, "step": 1286 }, { "epoch": 0.3659780683305983, "grad_norm": 1.8824515342712402, "learning_rate": 2.563631140476875e-05, "loss": 2.0372, "step": 1287 }, { "epoch": 0.3662624335740564, "grad_norm": 1.8746623992919922, "learning_rate": 2.5624820453892563e-05, "loss": 1.9418, "step": 1288 }, { "epoch": 0.3665467988175145, "grad_norm": 2.1677398681640625, "learning_rate": 2.5613329503016377e-05, "loss": 3.0094, "step": 1289 }, { "epoch": 0.36683116406097266, "grad_norm": 1.7507233619689941, "learning_rate": 2.5601838552140192e-05, "loss": 2.5547, "step": 1290 }, { "epoch": 0.36711552930443075, "grad_norm": 1.6727383136749268, "learning_rate": 2.5590347601264006e-05, "loss": 2.49, "step": 1291 }, { "epoch": 0.3673998945478889, "grad_norm": 1.6071473360061646, "learning_rate": 2.557885665038782e-05, "loss": 2.5057, "step": 1292 }, { "epoch": 0.367684259791347, "grad_norm": 1.853559970855713, "learning_rate": 2.5567365699511635e-05, "loss": 2.135, "step": 1293 }, { "epoch": 0.3679686250348051, "grad_norm": 1.656827449798584, "learning_rate": 2.5555874748635453e-05, "loss": 1.9428, "step": 1294 }, { "epoch": 0.36825299027826325, "grad_norm": 1.7858695983886719, "learning_rate": 2.5544383797759268e-05, "loss": 2.0289, "step": 1295 }, { "epoch": 0.36853735552172134, "grad_norm": 1.7706724405288696, "learning_rate": 2.5532892846883082e-05, "loss": 1.7794, "step": 1296 }, { "epoch": 0.3688217207651795, "grad_norm": 1.8838144540786743, "learning_rate": 2.5521401896006897e-05, "loss": 2.7027, "step": 1297 }, { "epoch": 0.3691060860086376, "grad_norm": 1.6255005598068237, "learning_rate": 2.550991094513071e-05, "loss": 2.7461, "step": 1298 }, { "epoch": 0.3693904512520957, "grad_norm": 1.5868300199508667, "learning_rate": 2.5498419994254526e-05, "loss": 2.4882, "step": 1299 }, { "epoch": 0.36967481649555384, "grad_norm": 1.6790016889572144, "learning_rate": 2.548692904337834e-05, "loss": 2.1273, "step": 1300 }, { "epoch": 0.36995918173901193, "grad_norm": 1.8113175630569458, "learning_rate": 2.5475438092502155e-05, "loss": 2.2047, "step": 1301 }, { "epoch": 0.3702435469824701, "grad_norm": 1.5671573877334595, "learning_rate": 2.546394714162597e-05, "loss": 1.9255, "step": 1302 }, { "epoch": 0.3705279122259282, "grad_norm": 1.6399245262145996, "learning_rate": 2.5452456190749787e-05, "loss": 1.8351, "step": 1303 }, { "epoch": 0.37081227746938633, "grad_norm": 1.7190747261047363, "learning_rate": 2.54409652398736e-05, "loss": 2.0176, "step": 1304 }, { "epoch": 0.3710966427128444, "grad_norm": 2.0205039978027344, "learning_rate": 2.5429474288997416e-05, "loss": 2.9932, "step": 1305 }, { "epoch": 0.3713810079563025, "grad_norm": 1.7755295038223267, "learning_rate": 2.541798333812123e-05, "loss": 2.5928, "step": 1306 }, { "epoch": 0.3716653731997607, "grad_norm": 1.6791408061981201, "learning_rate": 2.5406492387245045e-05, "loss": 2.4596, "step": 1307 }, { "epoch": 0.37194973844321877, "grad_norm": 1.682665467262268, "learning_rate": 2.539500143636886e-05, "loss": 2.284, "step": 1308 }, { "epoch": 0.3722341036866769, "grad_norm": 1.661361813545227, "learning_rate": 2.5383510485492674e-05, "loss": 2.4567, "step": 1309 }, { "epoch": 0.372518468930135, "grad_norm": 1.7297680377960205, "learning_rate": 2.537201953461649e-05, "loss": 1.993, "step": 1310 }, { "epoch": 0.3728028341735931, "grad_norm": 1.7310127019882202, "learning_rate": 2.5360528583740303e-05, "loss": 2.0113, "step": 1311 }, { "epoch": 0.37308719941705126, "grad_norm": 1.7791327238082886, "learning_rate": 2.534903763286412e-05, "loss": 1.8896, "step": 1312 }, { "epoch": 0.37337156466050936, "grad_norm": 2.11208438873291, "learning_rate": 2.5337546681987936e-05, "loss": 3.0813, "step": 1313 }, { "epoch": 0.3736559299039675, "grad_norm": 1.8183382749557495, "learning_rate": 2.532605573111175e-05, "loss": 2.7019, "step": 1314 }, { "epoch": 0.3739402951474256, "grad_norm": 1.6248207092285156, "learning_rate": 2.5314564780235568e-05, "loss": 2.422, "step": 1315 }, { "epoch": 0.3742246603908837, "grad_norm": 1.640504002571106, "learning_rate": 2.5303073829359382e-05, "loss": 2.3868, "step": 1316 }, { "epoch": 0.37450902563434185, "grad_norm": 1.7609598636627197, "learning_rate": 2.5291582878483197e-05, "loss": 2.263, "step": 1317 }, { "epoch": 0.37479339087779995, "grad_norm": 1.7517386674880981, "learning_rate": 2.5280091927607015e-05, "loss": 2.024, "step": 1318 }, { "epoch": 0.3750777561212581, "grad_norm": 1.6678249835968018, "learning_rate": 2.526860097673083e-05, "loss": 1.8084, "step": 1319 }, { "epoch": 0.3753621213647162, "grad_norm": 1.7585467100143433, "learning_rate": 2.5257110025854644e-05, "loss": 1.7552, "step": 1320 }, { "epoch": 0.3756464866081743, "grad_norm": 2.2945332527160645, "learning_rate": 2.5245619074978458e-05, "loss": 3.0485, "step": 1321 }, { "epoch": 0.37593085185163244, "grad_norm": 1.6256428956985474, "learning_rate": 2.5234128124102273e-05, "loss": 2.3769, "step": 1322 }, { "epoch": 0.37621521709509054, "grad_norm": 1.6193193197250366, "learning_rate": 2.5222637173226087e-05, "loss": 2.3007, "step": 1323 }, { "epoch": 0.3764995823385487, "grad_norm": 1.6534637212753296, "learning_rate": 2.5211146222349902e-05, "loss": 2.5241, "step": 1324 }, { "epoch": 0.3767839475820068, "grad_norm": 1.7041131258010864, "learning_rate": 2.5199655271473716e-05, "loss": 2.3242, "step": 1325 }, { "epoch": 0.37706831282546494, "grad_norm": 1.697442889213562, "learning_rate": 2.518816432059753e-05, "loss": 1.9187, "step": 1326 }, { "epoch": 0.37735267806892303, "grad_norm": 1.7257381677627563, "learning_rate": 2.517667336972135e-05, "loss": 1.7997, "step": 1327 }, { "epoch": 0.3776370433123811, "grad_norm": 1.8041011095046997, "learning_rate": 2.5165182418845163e-05, "loss": 1.9322, "step": 1328 }, { "epoch": 0.3779214085558393, "grad_norm": 2.060734748840332, "learning_rate": 2.5153691467968978e-05, "loss": 2.7627, "step": 1329 }, { "epoch": 0.3782057737992974, "grad_norm": 1.7511411905288696, "learning_rate": 2.5142200517092792e-05, "loss": 2.2934, "step": 1330 }, { "epoch": 0.3784901390427555, "grad_norm": 1.6538666486740112, "learning_rate": 2.5130709566216607e-05, "loss": 2.2186, "step": 1331 }, { "epoch": 0.3787745042862136, "grad_norm": 1.672150731086731, "learning_rate": 2.511921861534042e-05, "loss": 2.3287, "step": 1332 }, { "epoch": 0.3790588695296717, "grad_norm": 1.8601032495498657, "learning_rate": 2.5107727664464236e-05, "loss": 2.1725, "step": 1333 }, { "epoch": 0.37934323477312987, "grad_norm": 1.6767491102218628, "learning_rate": 2.509623671358805e-05, "loss": 2.1525, "step": 1334 }, { "epoch": 0.37962760001658796, "grad_norm": 1.72711980342865, "learning_rate": 2.5084745762711865e-05, "loss": 2.1467, "step": 1335 }, { "epoch": 0.3799119652600461, "grad_norm": 1.7311689853668213, "learning_rate": 2.5073254811835683e-05, "loss": 1.8984, "step": 1336 }, { "epoch": 0.3801963305035042, "grad_norm": 2.0441644191741943, "learning_rate": 2.5061763860959497e-05, "loss": 3.0045, "step": 1337 }, { "epoch": 0.3804806957469623, "grad_norm": 1.9976049661636353, "learning_rate": 2.505027291008331e-05, "loss": 2.6461, "step": 1338 }, { "epoch": 0.38076506099042046, "grad_norm": 1.6794681549072266, "learning_rate": 2.5038781959207126e-05, "loss": 2.4184, "step": 1339 }, { "epoch": 0.38104942623387855, "grad_norm": 1.573089599609375, "learning_rate": 2.502729100833094e-05, "loss": 2.3171, "step": 1340 }, { "epoch": 0.3813337914773367, "grad_norm": 1.8047806024551392, "learning_rate": 2.5015800057454755e-05, "loss": 2.4302, "step": 1341 }, { "epoch": 0.3816181567207948, "grad_norm": 1.679032564163208, "learning_rate": 2.500430910657857e-05, "loss": 2.0206, "step": 1342 }, { "epoch": 0.3819025219642529, "grad_norm": 1.6827946901321411, "learning_rate": 2.4992818155702384e-05, "loss": 1.8739, "step": 1343 }, { "epoch": 0.38218688720771105, "grad_norm": 2.0795507431030273, "learning_rate": 2.49813272048262e-05, "loss": 1.9664, "step": 1344 }, { "epoch": 0.38247125245116914, "grad_norm": 2.012922763824463, "learning_rate": 2.4969836253950016e-05, "loss": 2.757, "step": 1345 }, { "epoch": 0.3827556176946273, "grad_norm": 1.62966787815094, "learning_rate": 2.495834530307383e-05, "loss": 2.5295, "step": 1346 }, { "epoch": 0.3830399829380854, "grad_norm": 1.6679270267486572, "learning_rate": 2.4946854352197645e-05, "loss": 2.459, "step": 1347 }, { "epoch": 0.3833243481815435, "grad_norm": 1.615654468536377, "learning_rate": 2.493536340132146e-05, "loss": 2.1912, "step": 1348 }, { "epoch": 0.38360871342500164, "grad_norm": 1.8527601957321167, "learning_rate": 2.4923872450445274e-05, "loss": 2.2239, "step": 1349 }, { "epoch": 0.38389307866845973, "grad_norm": 1.7967239618301392, "learning_rate": 2.491238149956909e-05, "loss": 1.9359, "step": 1350 }, { "epoch": 0.3841774439119179, "grad_norm": 1.7077158689498901, "learning_rate": 2.4900890548692903e-05, "loss": 1.737, "step": 1351 }, { "epoch": 0.384461809155376, "grad_norm": 1.7377227544784546, "learning_rate": 2.4889399597816725e-05, "loss": 1.934, "step": 1352 }, { "epoch": 0.38474617439883413, "grad_norm": 1.9111387729644775, "learning_rate": 2.487790864694054e-05, "loss": 2.776, "step": 1353 }, { "epoch": 0.3850305396422922, "grad_norm": 1.6393592357635498, "learning_rate": 2.4866417696064354e-05, "loss": 2.4337, "step": 1354 }, { "epoch": 0.3853149048857503, "grad_norm": 1.7476547956466675, "learning_rate": 2.4854926745188168e-05, "loss": 2.377, "step": 1355 }, { "epoch": 0.38559927012920847, "grad_norm": 1.8083293437957764, "learning_rate": 2.4843435794311983e-05, "loss": 2.3762, "step": 1356 }, { "epoch": 0.38588363537266657, "grad_norm": 1.9728158712387085, "learning_rate": 2.4831944843435797e-05, "loss": 2.242, "step": 1357 }, { "epoch": 0.3861680006161247, "grad_norm": 1.5531713962554932, "learning_rate": 2.482045389255961e-05, "loss": 2.0673, "step": 1358 }, { "epoch": 0.3864523658595828, "grad_norm": 1.5641489028930664, "learning_rate": 2.4808962941683426e-05, "loss": 1.7698, "step": 1359 }, { "epoch": 0.3867367311030409, "grad_norm": 1.7081369161605835, "learning_rate": 2.4797471990807244e-05, "loss": 1.7401, "step": 1360 }, { "epoch": 0.38702109634649906, "grad_norm": 1.9915111064910889, "learning_rate": 2.478598103993106e-05, "loss": 2.9381, "step": 1361 }, { "epoch": 0.38730546158995716, "grad_norm": 1.6150346994400024, "learning_rate": 2.4774490089054873e-05, "loss": 2.5681, "step": 1362 }, { "epoch": 0.3875898268334153, "grad_norm": 1.6368330717086792, "learning_rate": 2.4762999138178687e-05, "loss": 2.5464, "step": 1363 }, { "epoch": 0.3878741920768734, "grad_norm": 1.6457427740097046, "learning_rate": 2.4751508187302502e-05, "loss": 2.1136, "step": 1364 }, { "epoch": 0.3881585573203315, "grad_norm": 1.6669577360153198, "learning_rate": 2.4740017236426316e-05, "loss": 2.3689, "step": 1365 }, { "epoch": 0.38844292256378965, "grad_norm": 1.6207609176635742, "learning_rate": 2.472852628555013e-05, "loss": 2.0466, "step": 1366 }, { "epoch": 0.38872728780724775, "grad_norm": 1.6581090688705444, "learning_rate": 2.4717035334673945e-05, "loss": 2.1255, "step": 1367 }, { "epoch": 0.3890116530507059, "grad_norm": 1.7444984912872314, "learning_rate": 2.470554438379776e-05, "loss": 1.9976, "step": 1368 }, { "epoch": 0.389296018294164, "grad_norm": 2.0723159313201904, "learning_rate": 2.4694053432921578e-05, "loss": 3.1322, "step": 1369 }, { "epoch": 0.3895803835376221, "grad_norm": 1.8740094900131226, "learning_rate": 2.4682562482045392e-05, "loss": 2.346, "step": 1370 }, { "epoch": 0.38986474878108024, "grad_norm": 1.7882744073867798, "learning_rate": 2.4671071531169207e-05, "loss": 2.3347, "step": 1371 }, { "epoch": 0.39014911402453833, "grad_norm": 1.7288506031036377, "learning_rate": 2.465958058029302e-05, "loss": 2.2182, "step": 1372 }, { "epoch": 0.3904334792679965, "grad_norm": 1.6869946718215942, "learning_rate": 2.4648089629416836e-05, "loss": 2.203, "step": 1373 }, { "epoch": 0.3907178445114546, "grad_norm": 1.6515893936157227, "learning_rate": 2.463659867854065e-05, "loss": 2.1165, "step": 1374 }, { "epoch": 0.39100220975491273, "grad_norm": 1.7650010585784912, "learning_rate": 2.4625107727664465e-05, "loss": 1.8461, "step": 1375 }, { "epoch": 0.39128657499837083, "grad_norm": 1.602225661277771, "learning_rate": 2.461361677678828e-05, "loss": 1.8419, "step": 1376 }, { "epoch": 0.3915709402418289, "grad_norm": 1.930778980255127, "learning_rate": 2.4602125825912094e-05, "loss": 2.7933, "step": 1377 }, { "epoch": 0.3918553054852871, "grad_norm": 1.921008586883545, "learning_rate": 2.4590634875035912e-05, "loss": 2.6368, "step": 1378 }, { "epoch": 0.39213967072874517, "grad_norm": 1.7024277448654175, "learning_rate": 2.4579143924159726e-05, "loss": 2.3727, "step": 1379 }, { "epoch": 0.3924240359722033, "grad_norm": 1.522916316986084, "learning_rate": 2.456765297328354e-05, "loss": 2.2389, "step": 1380 }, { "epoch": 0.3927084012156614, "grad_norm": 1.8298519849777222, "learning_rate": 2.4556162022407355e-05, "loss": 2.3436, "step": 1381 }, { "epoch": 0.3929927664591195, "grad_norm": 1.6771541833877563, "learning_rate": 2.454467107153117e-05, "loss": 1.8835, "step": 1382 }, { "epoch": 0.39327713170257766, "grad_norm": 1.8293532133102417, "learning_rate": 2.4533180120654984e-05, "loss": 2.0321, "step": 1383 }, { "epoch": 0.39356149694603576, "grad_norm": 1.8741817474365234, "learning_rate": 2.45216891697788e-05, "loss": 1.9599, "step": 1384 }, { "epoch": 0.3938458621894939, "grad_norm": 2.148267984390259, "learning_rate": 2.4510198218902613e-05, "loss": 2.7614, "step": 1385 }, { "epoch": 0.394130227432952, "grad_norm": 1.771562933921814, "learning_rate": 2.4498707268026428e-05, "loss": 2.6609, "step": 1386 }, { "epoch": 0.3944145926764101, "grad_norm": 1.6484909057617188, "learning_rate": 2.4487216317150246e-05, "loss": 2.3871, "step": 1387 }, { "epoch": 0.39469895791986825, "grad_norm": 1.6296532154083252, "learning_rate": 2.447572536627406e-05, "loss": 2.3645, "step": 1388 }, { "epoch": 0.39498332316332635, "grad_norm": 1.7516010999679565, "learning_rate": 2.4464234415397878e-05, "loss": 2.1963, "step": 1389 }, { "epoch": 0.3952676884067845, "grad_norm": 1.6003243923187256, "learning_rate": 2.4452743464521692e-05, "loss": 1.9164, "step": 1390 }, { "epoch": 0.3955520536502426, "grad_norm": 1.8290302753448486, "learning_rate": 2.4441252513645507e-05, "loss": 1.9689, "step": 1391 }, { "epoch": 0.3958364188937007, "grad_norm": 1.694347858428955, "learning_rate": 2.442976156276932e-05, "loss": 1.7561, "step": 1392 }, { "epoch": 0.39612078413715884, "grad_norm": 1.91927969455719, "learning_rate": 2.441827061189314e-05, "loss": 2.8435, "step": 1393 }, { "epoch": 0.39640514938061694, "grad_norm": 1.766818642616272, "learning_rate": 2.4406779661016954e-05, "loss": 2.6195, "step": 1394 }, { "epoch": 0.3966895146240751, "grad_norm": 1.7529215812683105, "learning_rate": 2.439528871014077e-05, "loss": 2.455, "step": 1395 }, { "epoch": 0.3969738798675332, "grad_norm": 1.6795532703399658, "learning_rate": 2.4383797759264583e-05, "loss": 2.4504, "step": 1396 }, { "epoch": 0.3972582451109913, "grad_norm": 1.8063428401947021, "learning_rate": 2.4372306808388397e-05, "loss": 2.3136, "step": 1397 }, { "epoch": 0.39754261035444943, "grad_norm": 1.8748013973236084, "learning_rate": 2.4360815857512212e-05, "loss": 1.9549, "step": 1398 }, { "epoch": 0.39782697559790753, "grad_norm": 1.702768087387085, "learning_rate": 2.4349324906636026e-05, "loss": 1.9597, "step": 1399 }, { "epoch": 0.3981113408413657, "grad_norm": 1.8588896989822388, "learning_rate": 2.433783395575984e-05, "loss": 1.9016, "step": 1400 }, { "epoch": 0.3983957060848238, "grad_norm": 1.9412899017333984, "learning_rate": 2.4326343004883655e-05, "loss": 2.6964, "step": 1401 }, { "epoch": 0.3986800713282819, "grad_norm": 1.590203046798706, "learning_rate": 2.4314852054007473e-05, "loss": 2.5482, "step": 1402 }, { "epoch": 0.39896443657174, "grad_norm": 1.613073468208313, "learning_rate": 2.4303361103131288e-05, "loss": 2.3643, "step": 1403 }, { "epoch": 0.3992488018151981, "grad_norm": 1.7016730308532715, "learning_rate": 2.4291870152255102e-05, "loss": 2.179, "step": 1404 }, { "epoch": 0.39953316705865627, "grad_norm": 1.7055073976516724, "learning_rate": 2.4280379201378917e-05, "loss": 2.3988, "step": 1405 }, { "epoch": 0.39981753230211436, "grad_norm": 1.6752086877822876, "learning_rate": 2.426888825050273e-05, "loss": 2.3758, "step": 1406 }, { "epoch": 0.4001018975455725, "grad_norm": 1.6053355932235718, "learning_rate": 2.4257397299626546e-05, "loss": 1.9306, "step": 1407 }, { "epoch": 0.4003862627890306, "grad_norm": 1.7850687503814697, "learning_rate": 2.424590634875036e-05, "loss": 1.8416, "step": 1408 }, { "epoch": 0.4006706280324887, "grad_norm": 2.1916213035583496, "learning_rate": 2.4234415397874175e-05, "loss": 2.8873, "step": 1409 }, { "epoch": 0.40095499327594686, "grad_norm": 1.7736766338348389, "learning_rate": 2.422292444699799e-05, "loss": 2.5443, "step": 1410 }, { "epoch": 0.40123935851940495, "grad_norm": 1.630659818649292, "learning_rate": 2.4211433496121807e-05, "loss": 2.4075, "step": 1411 }, { "epoch": 0.4015237237628631, "grad_norm": 1.5184587240219116, "learning_rate": 2.419994254524562e-05, "loss": 2.2805, "step": 1412 }, { "epoch": 0.4018080890063212, "grad_norm": 1.721583366394043, "learning_rate": 2.4188451594369436e-05, "loss": 2.3186, "step": 1413 }, { "epoch": 0.4020924542497793, "grad_norm": 1.7377300262451172, "learning_rate": 2.417696064349325e-05, "loss": 1.9795, "step": 1414 }, { "epoch": 0.40237681949323745, "grad_norm": 1.7117558717727661, "learning_rate": 2.4165469692617065e-05, "loss": 2.2045, "step": 1415 }, { "epoch": 0.40266118473669554, "grad_norm": 1.7114864587783813, "learning_rate": 2.415397874174088e-05, "loss": 1.799, "step": 1416 }, { "epoch": 0.4029455499801537, "grad_norm": 1.7981421947479248, "learning_rate": 2.4142487790864694e-05, "loss": 3.078, "step": 1417 }, { "epoch": 0.4032299152236118, "grad_norm": 1.600596308708191, "learning_rate": 2.413099683998851e-05, "loss": 2.3393, "step": 1418 }, { "epoch": 0.4035142804670699, "grad_norm": 1.4729454517364502, "learning_rate": 2.4119505889112323e-05, "loss": 2.3691, "step": 1419 }, { "epoch": 0.40379864571052804, "grad_norm": 1.6638671159744263, "learning_rate": 2.410801493823614e-05, "loss": 2.2835, "step": 1420 }, { "epoch": 0.40408301095398613, "grad_norm": 1.79701566696167, "learning_rate": 2.4096523987359955e-05, "loss": 2.2159, "step": 1421 }, { "epoch": 0.4043673761974443, "grad_norm": 1.6714825630187988, "learning_rate": 2.408503303648377e-05, "loss": 1.8111, "step": 1422 }, { "epoch": 0.4046517414409024, "grad_norm": 1.736799716949463, "learning_rate": 2.4073542085607584e-05, "loss": 1.9677, "step": 1423 }, { "epoch": 0.40493610668436053, "grad_norm": 1.7546263933181763, "learning_rate": 2.40620511347314e-05, "loss": 1.8836, "step": 1424 }, { "epoch": 0.4052204719278186, "grad_norm": 1.9755287170410156, "learning_rate": 2.4050560183855213e-05, "loss": 2.9325, "step": 1425 }, { "epoch": 0.4055048371712767, "grad_norm": 1.6950325965881348, "learning_rate": 2.4039069232979035e-05, "loss": 2.7484, "step": 1426 }, { "epoch": 0.4057892024147349, "grad_norm": 1.787856936454773, "learning_rate": 2.402757828210285e-05, "loss": 2.5154, "step": 1427 }, { "epoch": 0.40607356765819297, "grad_norm": 1.7529258728027344, "learning_rate": 2.4016087331226664e-05, "loss": 2.3995, "step": 1428 }, { "epoch": 0.4063579329016511, "grad_norm": 1.8270007371902466, "learning_rate": 2.4004596380350478e-05, "loss": 2.2466, "step": 1429 }, { "epoch": 0.4066422981451092, "grad_norm": 1.6671589612960815, "learning_rate": 2.3993105429474293e-05, "loss": 2.0727, "step": 1430 }, { "epoch": 0.4069266633885673, "grad_norm": 1.8417056798934937, "learning_rate": 2.3981614478598107e-05, "loss": 1.9906, "step": 1431 }, { "epoch": 0.40721102863202546, "grad_norm": 1.794356346130371, "learning_rate": 2.397012352772192e-05, "loss": 1.9631, "step": 1432 }, { "epoch": 0.40749539387548356, "grad_norm": 1.9737939834594727, "learning_rate": 2.3958632576845736e-05, "loss": 2.8793, "step": 1433 }, { "epoch": 0.4077797591189417, "grad_norm": 1.7797672748565674, "learning_rate": 2.394714162596955e-05, "loss": 2.6237, "step": 1434 }, { "epoch": 0.4080641243623998, "grad_norm": 1.782353401184082, "learning_rate": 2.393565067509337e-05, "loss": 2.3761, "step": 1435 }, { "epoch": 0.4083484896058579, "grad_norm": 1.662298560142517, "learning_rate": 2.3924159724217183e-05, "loss": 2.3679, "step": 1436 }, { "epoch": 0.40863285484931605, "grad_norm": 1.7160699367523193, "learning_rate": 2.3912668773340998e-05, "loss": 2.0721, "step": 1437 }, { "epoch": 0.40891722009277415, "grad_norm": 1.5304116010665894, "learning_rate": 2.3901177822464812e-05, "loss": 1.9679, "step": 1438 }, { "epoch": 0.4092015853362323, "grad_norm": 1.6749613285064697, "learning_rate": 2.3889686871588627e-05, "loss": 1.9677, "step": 1439 }, { "epoch": 0.4094859505796904, "grad_norm": 1.8116540908813477, "learning_rate": 2.387819592071244e-05, "loss": 1.7566, "step": 1440 }, { "epoch": 0.4097703158231485, "grad_norm": 2.061837911605835, "learning_rate": 2.3866704969836256e-05, "loss": 2.768, "step": 1441 }, { "epoch": 0.41005468106660664, "grad_norm": 1.6977015733718872, "learning_rate": 2.385521401896007e-05, "loss": 2.4958, "step": 1442 }, { "epoch": 0.41033904631006474, "grad_norm": 1.6543117761611938, "learning_rate": 2.3843723068083885e-05, "loss": 2.4396, "step": 1443 }, { "epoch": 0.4106234115535229, "grad_norm": 1.5568379163742065, "learning_rate": 2.3832232117207702e-05, "loss": 2.2158, "step": 1444 }, { "epoch": 0.410907776796981, "grad_norm": 1.6220964193344116, "learning_rate": 2.3820741166331517e-05, "loss": 2.3038, "step": 1445 }, { "epoch": 0.41119214204043913, "grad_norm": 1.6945890188217163, "learning_rate": 2.380925021545533e-05, "loss": 1.9376, "step": 1446 }, { "epoch": 0.41147650728389723, "grad_norm": 1.5313851833343506, "learning_rate": 2.3797759264579146e-05, "loss": 1.888, "step": 1447 }, { "epoch": 0.4117608725273553, "grad_norm": 1.7938929796218872, "learning_rate": 2.378626831370296e-05, "loss": 1.9618, "step": 1448 }, { "epoch": 0.4120452377708135, "grad_norm": 2.1637260913848877, "learning_rate": 2.3774777362826775e-05, "loss": 2.864, "step": 1449 }, { "epoch": 0.4123296030142716, "grad_norm": 1.8949286937713623, "learning_rate": 2.376328641195059e-05, "loss": 2.675, "step": 1450 }, { "epoch": 0.4126139682577297, "grad_norm": 1.711017370223999, "learning_rate": 2.3751795461074404e-05, "loss": 2.4241, "step": 1451 }, { "epoch": 0.4128983335011878, "grad_norm": 1.6025915145874023, "learning_rate": 2.374030451019822e-05, "loss": 2.3166, "step": 1452 }, { "epoch": 0.4131826987446459, "grad_norm": 1.7555315494537354, "learning_rate": 2.3728813559322036e-05, "loss": 2.0809, "step": 1453 }, { "epoch": 0.41346706398810407, "grad_norm": 1.5702033042907715, "learning_rate": 2.371732260844585e-05, "loss": 1.981, "step": 1454 }, { "epoch": 0.41375142923156216, "grad_norm": 1.6825945377349854, "learning_rate": 2.3705831657569665e-05, "loss": 1.766, "step": 1455 }, { "epoch": 0.4140357944750203, "grad_norm": 1.711393117904663, "learning_rate": 2.369434070669348e-05, "loss": 2.0005, "step": 1456 }, { "epoch": 0.4143201597184784, "grad_norm": 1.917224407196045, "learning_rate": 2.3682849755817294e-05, "loss": 2.8846, "step": 1457 }, { "epoch": 0.4146045249619365, "grad_norm": 1.728300929069519, "learning_rate": 2.367135880494111e-05, "loss": 2.63, "step": 1458 }, { "epoch": 0.41488889020539466, "grad_norm": 1.6932635307312012, "learning_rate": 2.3659867854064923e-05, "loss": 2.3097, "step": 1459 }, { "epoch": 0.41517325544885275, "grad_norm": 1.6644999980926514, "learning_rate": 2.3648376903188738e-05, "loss": 2.3912, "step": 1460 }, { "epoch": 0.4154576206923109, "grad_norm": 1.7875523567199707, "learning_rate": 2.3636885952312552e-05, "loss": 2.3286, "step": 1461 }, { "epoch": 0.415741985935769, "grad_norm": 1.7267717123031616, "learning_rate": 2.362539500143637e-05, "loss": 2.021, "step": 1462 }, { "epoch": 0.4160263511792271, "grad_norm": 1.854394555091858, "learning_rate": 2.3613904050560188e-05, "loss": 2.0849, "step": 1463 }, { "epoch": 0.41631071642268525, "grad_norm": 1.7931166887283325, "learning_rate": 2.3602413099684003e-05, "loss": 1.9079, "step": 1464 }, { "epoch": 0.41659508166614334, "grad_norm": 1.9061031341552734, "learning_rate": 2.3590922148807817e-05, "loss": 2.7233, "step": 1465 }, { "epoch": 0.4168794469096015, "grad_norm": 1.6702359914779663, "learning_rate": 2.357943119793163e-05, "loss": 2.6487, "step": 1466 }, { "epoch": 0.4171638121530596, "grad_norm": 1.5894525051116943, "learning_rate": 2.3567940247055446e-05, "loss": 2.3472, "step": 1467 }, { "epoch": 0.4174481773965177, "grad_norm": 1.6212420463562012, "learning_rate": 2.3556449296179264e-05, "loss": 2.0305, "step": 1468 }, { "epoch": 0.41773254263997583, "grad_norm": 1.8889355659484863, "learning_rate": 2.354495834530308e-05, "loss": 2.5149, "step": 1469 }, { "epoch": 0.41801690788343393, "grad_norm": 1.4953631162643433, "learning_rate": 2.3533467394426893e-05, "loss": 1.877, "step": 1470 }, { "epoch": 0.4183012731268921, "grad_norm": 1.6321781873703003, "learning_rate": 2.3521976443550707e-05, "loss": 1.9186, "step": 1471 }, { "epoch": 0.4185856383703502, "grad_norm": 1.6362193822860718, "learning_rate": 2.3510485492674522e-05, "loss": 1.7191, "step": 1472 }, { "epoch": 0.41887000361380833, "grad_norm": 2.1711504459381104, "learning_rate": 2.3498994541798336e-05, "loss": 2.9604, "step": 1473 }, { "epoch": 0.4191543688572664, "grad_norm": 1.5688064098358154, "learning_rate": 2.348750359092215e-05, "loss": 2.4455, "step": 1474 }, { "epoch": 0.4194387341007245, "grad_norm": 1.5791515111923218, "learning_rate": 2.3476012640045965e-05, "loss": 2.3827, "step": 1475 }, { "epoch": 0.41972309934418267, "grad_norm": 1.5574970245361328, "learning_rate": 2.346452168916978e-05, "loss": 2.2648, "step": 1476 }, { "epoch": 0.42000746458764077, "grad_norm": 1.607318639755249, "learning_rate": 2.3453030738293598e-05, "loss": 2.1887, "step": 1477 }, { "epoch": 0.4202918298310989, "grad_norm": 1.6023167371749878, "learning_rate": 2.3441539787417412e-05, "loss": 1.8484, "step": 1478 }, { "epoch": 0.420576195074557, "grad_norm": 1.614344835281372, "learning_rate": 2.3430048836541227e-05, "loss": 1.7518, "step": 1479 }, { "epoch": 0.4208605603180151, "grad_norm": 1.7779884338378906, "learning_rate": 2.341855788566504e-05, "loss": 1.9386, "step": 1480 }, { "epoch": 0.42114492556147326, "grad_norm": 1.913327932357788, "learning_rate": 2.3407066934788856e-05, "loss": 2.8805, "step": 1481 }, { "epoch": 0.42142929080493136, "grad_norm": 1.888013482093811, "learning_rate": 2.339557598391267e-05, "loss": 2.4296, "step": 1482 }, { "epoch": 0.4217136560483895, "grad_norm": 1.841597080230713, "learning_rate": 2.3384085033036485e-05, "loss": 2.4056, "step": 1483 }, { "epoch": 0.4219980212918476, "grad_norm": 1.6443198919296265, "learning_rate": 2.33725940821603e-05, "loss": 2.2189, "step": 1484 }, { "epoch": 0.4222823865353057, "grad_norm": 1.7513799667358398, "learning_rate": 2.3361103131284114e-05, "loss": 2.3345, "step": 1485 }, { "epoch": 0.42256675177876385, "grad_norm": 1.6593313217163086, "learning_rate": 2.334961218040793e-05, "loss": 2.0602, "step": 1486 }, { "epoch": 0.42285111702222195, "grad_norm": 1.7148724794387817, "learning_rate": 2.3338121229531746e-05, "loss": 1.6755, "step": 1487 }, { "epoch": 0.4231354822656801, "grad_norm": 1.6917473077774048, "learning_rate": 2.332663027865556e-05, "loss": 1.8464, "step": 1488 }, { "epoch": 0.4234198475091382, "grad_norm": 2.1567392349243164, "learning_rate": 2.3315139327779375e-05, "loss": 2.9124, "step": 1489 }, { "epoch": 0.4237042127525963, "grad_norm": 1.7503013610839844, "learning_rate": 2.330364837690319e-05, "loss": 2.5927, "step": 1490 }, { "epoch": 0.42398857799605444, "grad_norm": 1.6987637281417847, "learning_rate": 2.3292157426027004e-05, "loss": 2.4439, "step": 1491 }, { "epoch": 0.42427294323951253, "grad_norm": 1.6804696321487427, "learning_rate": 2.328066647515082e-05, "loss": 2.3747, "step": 1492 }, { "epoch": 0.4245573084829707, "grad_norm": 1.7630577087402344, "learning_rate": 2.3269175524274633e-05, "loss": 2.4236, "step": 1493 }, { "epoch": 0.4248416737264288, "grad_norm": 1.5913448333740234, "learning_rate": 2.3257684573398448e-05, "loss": 2.0831, "step": 1494 }, { "epoch": 0.42512603896988693, "grad_norm": 1.6789977550506592, "learning_rate": 2.3246193622522266e-05, "loss": 1.9894, "step": 1495 }, { "epoch": 0.42541040421334503, "grad_norm": 1.732582688331604, "learning_rate": 2.323470267164608e-05, "loss": 1.7747, "step": 1496 }, { "epoch": 0.4256947694568031, "grad_norm": 1.926916241645813, "learning_rate": 2.3223211720769895e-05, "loss": 2.9434, "step": 1497 }, { "epoch": 0.4259791347002613, "grad_norm": 1.6622836589813232, "learning_rate": 2.321172076989371e-05, "loss": 2.4302, "step": 1498 }, { "epoch": 0.42626349994371937, "grad_norm": 1.64722740650177, "learning_rate": 2.3200229819017524e-05, "loss": 2.1876, "step": 1499 }, { "epoch": 0.4265478651871775, "grad_norm": 1.7458999156951904, "learning_rate": 2.318873886814134e-05, "loss": 2.4106, "step": 1500 }, { "epoch": 0.4268322304306356, "grad_norm": 1.6324485540390015, "learning_rate": 2.317724791726516e-05, "loss": 2.3124, "step": 1501 }, { "epoch": 0.4271165956740937, "grad_norm": 1.6437077522277832, "learning_rate": 2.3165756966388974e-05, "loss": 1.9083, "step": 1502 }, { "epoch": 0.42740096091755186, "grad_norm": 1.6657003164291382, "learning_rate": 2.3154266015512788e-05, "loss": 1.7993, "step": 1503 }, { "epoch": 0.42768532616100996, "grad_norm": 1.6173607110977173, "learning_rate": 2.3142775064636603e-05, "loss": 1.7787, "step": 1504 }, { "epoch": 0.4279696914044681, "grad_norm": 2.0600807666778564, "learning_rate": 2.3131284113760417e-05, "loss": 2.8575, "step": 1505 }, { "epoch": 0.4282540566479262, "grad_norm": 1.759519338607788, "learning_rate": 2.3119793162884232e-05, "loss": 2.4221, "step": 1506 }, { "epoch": 0.4285384218913843, "grad_norm": 1.6098270416259766, "learning_rate": 2.3108302212008046e-05, "loss": 2.4158, "step": 1507 }, { "epoch": 0.42882278713484245, "grad_norm": 1.5840624570846558, "learning_rate": 2.309681126113186e-05, "loss": 2.1282, "step": 1508 }, { "epoch": 0.42910715237830055, "grad_norm": 1.7449572086334229, "learning_rate": 2.3085320310255675e-05, "loss": 2.289, "step": 1509 }, { "epoch": 0.4293915176217587, "grad_norm": 1.5223720073699951, "learning_rate": 2.3073829359379493e-05, "loss": 1.7867, "step": 1510 }, { "epoch": 0.4296758828652168, "grad_norm": 1.6522331237792969, "learning_rate": 2.3062338408503308e-05, "loss": 2.1099, "step": 1511 }, { "epoch": 0.4299602481086749, "grad_norm": 1.652082085609436, "learning_rate": 2.3050847457627122e-05, "loss": 1.9209, "step": 1512 }, { "epoch": 0.43024461335213304, "grad_norm": 1.941834568977356, "learning_rate": 2.3039356506750937e-05, "loss": 2.6694, "step": 1513 }, { "epoch": 0.43052897859559114, "grad_norm": 1.7142715454101562, "learning_rate": 2.302786555587475e-05, "loss": 2.4722, "step": 1514 }, { "epoch": 0.4308133438390493, "grad_norm": 1.7237780094146729, "learning_rate": 2.3016374604998566e-05, "loss": 2.2415, "step": 1515 }, { "epoch": 0.4310977090825074, "grad_norm": 1.696518063545227, "learning_rate": 2.300488365412238e-05, "loss": 2.1774, "step": 1516 }, { "epoch": 0.43138207432596554, "grad_norm": 1.6398987770080566, "learning_rate": 2.2993392703246195e-05, "loss": 2.2672, "step": 1517 }, { "epoch": 0.43166643956942363, "grad_norm": 1.7152602672576904, "learning_rate": 2.298190175237001e-05, "loss": 2.1277, "step": 1518 }, { "epoch": 0.43195080481288173, "grad_norm": 1.612593173980713, "learning_rate": 2.2970410801493827e-05, "loss": 2.1015, "step": 1519 }, { "epoch": 0.4322351700563399, "grad_norm": 1.6528387069702148, "learning_rate": 2.295891985061764e-05, "loss": 1.7369, "step": 1520 }, { "epoch": 0.432519535299798, "grad_norm": 2.0139312744140625, "learning_rate": 2.2947428899741456e-05, "loss": 2.8552, "step": 1521 }, { "epoch": 0.4328039005432561, "grad_norm": 1.6151010990142822, "learning_rate": 2.293593794886527e-05, "loss": 2.1954, "step": 1522 }, { "epoch": 0.4330882657867142, "grad_norm": 1.5266865491867065, "learning_rate": 2.2924446997989085e-05, "loss": 2.2809, "step": 1523 }, { "epoch": 0.4333726310301723, "grad_norm": 1.5820162296295166, "learning_rate": 2.29129560471129e-05, "loss": 2.2162, "step": 1524 }, { "epoch": 0.43365699627363047, "grad_norm": 1.7275466918945312, "learning_rate": 2.2901465096236714e-05, "loss": 2.2792, "step": 1525 }, { "epoch": 0.43394136151708856, "grad_norm": 1.6316412687301636, "learning_rate": 2.288997414536053e-05, "loss": 1.9186, "step": 1526 }, { "epoch": 0.4342257267605467, "grad_norm": 1.7064950466156006, "learning_rate": 2.2878483194484343e-05, "loss": 1.7606, "step": 1527 }, { "epoch": 0.4345100920040048, "grad_norm": 1.7663518190383911, "learning_rate": 2.286699224360816e-05, "loss": 1.7999, "step": 1528 }, { "epoch": 0.4347944572474629, "grad_norm": 1.942549705505371, "learning_rate": 2.2855501292731975e-05, "loss": 3.1272, "step": 1529 }, { "epoch": 0.43507882249092106, "grad_norm": 1.546832799911499, "learning_rate": 2.284401034185579e-05, "loss": 2.4118, "step": 1530 }, { "epoch": 0.43536318773437915, "grad_norm": 1.5846697092056274, "learning_rate": 2.2832519390979604e-05, "loss": 2.3054, "step": 1531 }, { "epoch": 0.4356475529778373, "grad_norm": 1.6705207824707031, "learning_rate": 2.282102844010342e-05, "loss": 2.465, "step": 1532 }, { "epoch": 0.4359319182212954, "grad_norm": 1.7256457805633545, "learning_rate": 2.2809537489227233e-05, "loss": 2.2058, "step": 1533 }, { "epoch": 0.4362162834647535, "grad_norm": 1.6743327379226685, "learning_rate": 2.2798046538351048e-05, "loss": 1.999, "step": 1534 }, { "epoch": 0.43650064870821165, "grad_norm": 1.8469429016113281, "learning_rate": 2.2786555587474862e-05, "loss": 2.0113, "step": 1535 }, { "epoch": 0.43678501395166974, "grad_norm": 1.6663111448287964, "learning_rate": 2.2775064636598677e-05, "loss": 1.9054, "step": 1536 }, { "epoch": 0.4370693791951279, "grad_norm": 1.980746865272522, "learning_rate": 2.2763573685722498e-05, "loss": 2.5987, "step": 1537 }, { "epoch": 0.437353744438586, "grad_norm": 1.5249035358428955, "learning_rate": 2.2752082734846313e-05, "loss": 2.5415, "step": 1538 }, { "epoch": 0.4376381096820441, "grad_norm": 1.5359803438186646, "learning_rate": 2.2740591783970127e-05, "loss": 2.2511, "step": 1539 }, { "epoch": 0.43792247492550224, "grad_norm": 1.671136498451233, "learning_rate": 2.272910083309394e-05, "loss": 2.264, "step": 1540 }, { "epoch": 0.43820684016896033, "grad_norm": 1.6809831857681274, "learning_rate": 2.2717609882217756e-05, "loss": 2.2934, "step": 1541 }, { "epoch": 0.4384912054124185, "grad_norm": 1.785244107246399, "learning_rate": 2.270611893134157e-05, "loss": 1.8482, "step": 1542 }, { "epoch": 0.4387755706558766, "grad_norm": 1.749173641204834, "learning_rate": 2.269462798046539e-05, "loss": 2.0663, "step": 1543 }, { "epoch": 0.43905993589933473, "grad_norm": 1.7575947046279907, "learning_rate": 2.2683137029589203e-05, "loss": 1.9117, "step": 1544 }, { "epoch": 0.4393443011427928, "grad_norm": 1.795770287513733, "learning_rate": 2.2671646078713017e-05, "loss": 2.9625, "step": 1545 }, { "epoch": 0.4396286663862509, "grad_norm": 1.6358546018600464, "learning_rate": 2.2660155127836832e-05, "loss": 2.4055, "step": 1546 }, { "epoch": 0.4399130316297091, "grad_norm": 1.5694764852523804, "learning_rate": 2.2648664176960646e-05, "loss": 2.5108, "step": 1547 }, { "epoch": 0.44019739687316717, "grad_norm": 1.6293818950653076, "learning_rate": 2.263717322608446e-05, "loss": 2.3389, "step": 1548 }, { "epoch": 0.4404817621166253, "grad_norm": 1.683681845664978, "learning_rate": 2.2625682275208275e-05, "loss": 2.1871, "step": 1549 }, { "epoch": 0.4407661273600834, "grad_norm": 1.668007254600525, "learning_rate": 2.261419132433209e-05, "loss": 1.8618, "step": 1550 }, { "epoch": 0.4410504926035415, "grad_norm": 1.5370334386825562, "learning_rate": 2.2602700373455904e-05, "loss": 1.6422, "step": 1551 }, { "epoch": 0.44133485784699966, "grad_norm": 1.6079347133636475, "learning_rate": 2.2591209422579722e-05, "loss": 1.7268, "step": 1552 }, { "epoch": 0.44161922309045776, "grad_norm": 1.7942856550216675, "learning_rate": 2.2579718471703537e-05, "loss": 2.8017, "step": 1553 }, { "epoch": 0.4419035883339159, "grad_norm": 1.694395899772644, "learning_rate": 2.256822752082735e-05, "loss": 2.4659, "step": 1554 }, { "epoch": 0.442187953577374, "grad_norm": 1.6033554077148438, "learning_rate": 2.2556736569951166e-05, "loss": 2.321, "step": 1555 }, { "epoch": 0.4424723188208321, "grad_norm": 1.4896838665008545, "learning_rate": 2.254524561907498e-05, "loss": 2.3603, "step": 1556 }, { "epoch": 0.44275668406429025, "grad_norm": 1.5675745010375977, "learning_rate": 2.2533754668198795e-05, "loss": 2.199, "step": 1557 }, { "epoch": 0.44304104930774835, "grad_norm": 1.5907286405563354, "learning_rate": 2.252226371732261e-05, "loss": 2.0474, "step": 1558 }, { "epoch": 0.4433254145512065, "grad_norm": 1.67164146900177, "learning_rate": 2.2510772766446424e-05, "loss": 1.9384, "step": 1559 }, { "epoch": 0.4436097797946646, "grad_norm": 1.6073744297027588, "learning_rate": 2.249928181557024e-05, "loss": 1.5771, "step": 1560 }, { "epoch": 0.4438941450381227, "grad_norm": 1.9481167793273926, "learning_rate": 2.2487790864694056e-05, "loss": 2.8547, "step": 1561 }, { "epoch": 0.44417851028158084, "grad_norm": 1.627387523651123, "learning_rate": 2.247629991381787e-05, "loss": 2.661, "step": 1562 }, { "epoch": 0.44446287552503894, "grad_norm": 1.665747046470642, "learning_rate": 2.2464808962941685e-05, "loss": 2.2633, "step": 1563 }, { "epoch": 0.4447472407684971, "grad_norm": 1.6608874797821045, "learning_rate": 2.24533180120655e-05, "loss": 2.3801, "step": 1564 }, { "epoch": 0.4450316060119552, "grad_norm": 1.778861165046692, "learning_rate": 2.2441827061189314e-05, "loss": 2.1286, "step": 1565 }, { "epoch": 0.44531597125541333, "grad_norm": 1.678720474243164, "learning_rate": 2.243033611031313e-05, "loss": 2.0441, "step": 1566 }, { "epoch": 0.44560033649887143, "grad_norm": 1.6838959455490112, "learning_rate": 2.2418845159436943e-05, "loss": 1.8895, "step": 1567 }, { "epoch": 0.4458847017423295, "grad_norm": 1.7138351202011108, "learning_rate": 2.2407354208560758e-05, "loss": 1.8344, "step": 1568 }, { "epoch": 0.4461690669857877, "grad_norm": 1.9342299699783325, "learning_rate": 2.2395863257684572e-05, "loss": 2.7738, "step": 1569 }, { "epoch": 0.4464534322292458, "grad_norm": 1.6085205078125, "learning_rate": 2.238437230680839e-05, "loss": 2.4156, "step": 1570 }, { "epoch": 0.4467377974727039, "grad_norm": 1.598381757736206, "learning_rate": 2.2372881355932205e-05, "loss": 2.323, "step": 1571 }, { "epoch": 0.447022162716162, "grad_norm": 1.5285613536834717, "learning_rate": 2.236139040505602e-05, "loss": 2.3501, "step": 1572 }, { "epoch": 0.4473065279596201, "grad_norm": 1.6888141632080078, "learning_rate": 2.2349899454179834e-05, "loss": 2.3968, "step": 1573 }, { "epoch": 0.44759089320307827, "grad_norm": 1.6421995162963867, "learning_rate": 2.233840850330365e-05, "loss": 1.7312, "step": 1574 }, { "epoch": 0.44787525844653636, "grad_norm": 1.7016944885253906, "learning_rate": 2.2326917552427466e-05, "loss": 1.8975, "step": 1575 }, { "epoch": 0.4481596236899945, "grad_norm": 1.8153733015060425, "learning_rate": 2.231542660155128e-05, "loss": 2.0029, "step": 1576 }, { "epoch": 0.4484439889334526, "grad_norm": 1.941930890083313, "learning_rate": 2.23039356506751e-05, "loss": 3.027, "step": 1577 }, { "epoch": 0.4487283541769107, "grad_norm": 1.643111228942871, "learning_rate": 2.2292444699798913e-05, "loss": 2.5447, "step": 1578 }, { "epoch": 0.44901271942036886, "grad_norm": 1.5642459392547607, "learning_rate": 2.2280953748922727e-05, "loss": 2.3714, "step": 1579 }, { "epoch": 0.44929708466382695, "grad_norm": 1.6498128175735474, "learning_rate": 2.2269462798046542e-05, "loss": 2.3655, "step": 1580 }, { "epoch": 0.4495814499072851, "grad_norm": 1.7358462810516357, "learning_rate": 2.2257971847170356e-05, "loss": 2.2202, "step": 1581 }, { "epoch": 0.4498658151507432, "grad_norm": 1.718059778213501, "learning_rate": 2.224648089629417e-05, "loss": 1.8404, "step": 1582 }, { "epoch": 0.4501501803942013, "grad_norm": 1.6849524974822998, "learning_rate": 2.2234989945417985e-05, "loss": 1.857, "step": 1583 }, { "epoch": 0.45043454563765944, "grad_norm": 1.6961820125579834, "learning_rate": 2.22234989945418e-05, "loss": 1.8101, "step": 1584 }, { "epoch": 0.45071891088111754, "grad_norm": 1.9128514528274536, "learning_rate": 2.2212008043665618e-05, "loss": 2.9717, "step": 1585 }, { "epoch": 0.4510032761245757, "grad_norm": 1.4966126680374146, "learning_rate": 2.2200517092789432e-05, "loss": 2.6604, "step": 1586 }, { "epoch": 0.4512876413680338, "grad_norm": 1.5449092388153076, "learning_rate": 2.2189026141913247e-05, "loss": 2.1829, "step": 1587 }, { "epoch": 0.4515720066114919, "grad_norm": 1.6908026933670044, "learning_rate": 2.217753519103706e-05, "loss": 2.2731, "step": 1588 }, { "epoch": 0.45185637185495003, "grad_norm": 1.561723232269287, "learning_rate": 2.2166044240160876e-05, "loss": 2.1934, "step": 1589 }, { "epoch": 0.45214073709840813, "grad_norm": 1.6748071908950806, "learning_rate": 2.215455328928469e-05, "loss": 2.112, "step": 1590 }, { "epoch": 0.4524251023418663, "grad_norm": 1.5931808948516846, "learning_rate": 2.2143062338408505e-05, "loss": 1.853, "step": 1591 }, { "epoch": 0.4527094675853244, "grad_norm": 1.7409586906433105, "learning_rate": 2.213157138753232e-05, "loss": 1.7871, "step": 1592 }, { "epoch": 0.45299383282878253, "grad_norm": 1.953231930732727, "learning_rate": 2.2120080436656134e-05, "loss": 2.8715, "step": 1593 }, { "epoch": 0.4532781980722406, "grad_norm": 1.6238800287246704, "learning_rate": 2.210858948577995e-05, "loss": 2.5115, "step": 1594 }, { "epoch": 0.4535625633156987, "grad_norm": 1.643713116645813, "learning_rate": 2.2097098534903766e-05, "loss": 2.2356, "step": 1595 }, { "epoch": 0.45384692855915687, "grad_norm": 1.5235264301300049, "learning_rate": 2.208560758402758e-05, "loss": 2.348, "step": 1596 }, { "epoch": 0.45413129380261497, "grad_norm": 1.6755728721618652, "learning_rate": 2.2074116633151395e-05, "loss": 2.2908, "step": 1597 }, { "epoch": 0.4544156590460731, "grad_norm": 1.6341913938522339, "learning_rate": 2.206262568227521e-05, "loss": 1.9315, "step": 1598 }, { "epoch": 0.4547000242895312, "grad_norm": 1.8000038862228394, "learning_rate": 2.2051134731399024e-05, "loss": 1.8148, "step": 1599 }, { "epoch": 0.4549843895329893, "grad_norm": 1.7715755701065063, "learning_rate": 2.203964378052284e-05, "loss": 1.9843, "step": 1600 }, { "epoch": 0.45526875477644746, "grad_norm": 1.9607915878295898, "learning_rate": 2.2028152829646653e-05, "loss": 3.0164, "step": 1601 }, { "epoch": 0.45555312001990556, "grad_norm": 1.5718859434127808, "learning_rate": 2.2016661878770468e-05, "loss": 2.5762, "step": 1602 }, { "epoch": 0.4558374852633637, "grad_norm": 1.6624130010604858, "learning_rate": 2.2005170927894285e-05, "loss": 2.3932, "step": 1603 }, { "epoch": 0.4561218505068218, "grad_norm": 1.6082323789596558, "learning_rate": 2.19936799770181e-05, "loss": 2.4261, "step": 1604 }, { "epoch": 0.4564062157502799, "grad_norm": 1.7497981786727905, "learning_rate": 2.1982189026141914e-05, "loss": 2.3493, "step": 1605 }, { "epoch": 0.45669058099373805, "grad_norm": 1.559987187385559, "learning_rate": 2.197069807526573e-05, "loss": 1.8154, "step": 1606 }, { "epoch": 0.45697494623719614, "grad_norm": 1.724812388420105, "learning_rate": 2.1959207124389543e-05, "loss": 1.6993, "step": 1607 }, { "epoch": 0.4572593114806543, "grad_norm": 1.6445400714874268, "learning_rate": 2.1947716173513358e-05, "loss": 1.7773, "step": 1608 }, { "epoch": 0.4575436767241124, "grad_norm": 1.8340013027191162, "learning_rate": 2.1936225222637172e-05, "loss": 2.8774, "step": 1609 }, { "epoch": 0.4578280419675705, "grad_norm": 1.6000295877456665, "learning_rate": 2.1924734271760987e-05, "loss": 2.6376, "step": 1610 }, { "epoch": 0.45811240721102864, "grad_norm": 1.6122288703918457, "learning_rate": 2.1913243320884808e-05, "loss": 2.4566, "step": 1611 }, { "epoch": 0.45839677245448673, "grad_norm": 1.62675940990448, "learning_rate": 2.1901752370008623e-05, "loss": 2.4088, "step": 1612 }, { "epoch": 0.4586811376979449, "grad_norm": 1.7197331190109253, "learning_rate": 2.1890261419132437e-05, "loss": 2.2432, "step": 1613 }, { "epoch": 0.458965502941403, "grad_norm": 1.5948972702026367, "learning_rate": 2.187877046825625e-05, "loss": 1.9617, "step": 1614 }, { "epoch": 0.45924986818486113, "grad_norm": 1.689054012298584, "learning_rate": 2.1867279517380066e-05, "loss": 1.9137, "step": 1615 }, { "epoch": 0.4595342334283192, "grad_norm": 1.829690933227539, "learning_rate": 2.185578856650388e-05, "loss": 1.8653, "step": 1616 }, { "epoch": 0.4598185986717773, "grad_norm": 1.973512887954712, "learning_rate": 2.1844297615627695e-05, "loss": 2.8129, "step": 1617 }, { "epoch": 0.4601029639152355, "grad_norm": 1.612326979637146, "learning_rate": 2.183280666475151e-05, "loss": 2.6233, "step": 1618 }, { "epoch": 0.46038732915869357, "grad_norm": 1.473063588142395, "learning_rate": 2.1821315713875328e-05, "loss": 2.3022, "step": 1619 }, { "epoch": 0.4606716944021517, "grad_norm": 1.552682876586914, "learning_rate": 2.1809824762999142e-05, "loss": 2.1985, "step": 1620 }, { "epoch": 0.4609560596456098, "grad_norm": 1.7268232107162476, "learning_rate": 2.1798333812122957e-05, "loss": 2.2878, "step": 1621 }, { "epoch": 0.4612404248890679, "grad_norm": 1.5425605773925781, "learning_rate": 2.178684286124677e-05, "loss": 1.8743, "step": 1622 }, { "epoch": 0.46152479013252606, "grad_norm": 1.7569302320480347, "learning_rate": 2.1775351910370586e-05, "loss": 1.7704, "step": 1623 }, { "epoch": 0.46180915537598416, "grad_norm": 1.6393110752105713, "learning_rate": 2.17638609594944e-05, "loss": 1.8674, "step": 1624 }, { "epoch": 0.4620935206194423, "grad_norm": 1.9522907733917236, "learning_rate": 2.1752370008618215e-05, "loss": 2.9953, "step": 1625 }, { "epoch": 0.4623778858629004, "grad_norm": 1.7476742267608643, "learning_rate": 2.174087905774203e-05, "loss": 2.4966, "step": 1626 }, { "epoch": 0.4626622511063585, "grad_norm": 1.642991065979004, "learning_rate": 2.1729388106865844e-05, "loss": 2.4291, "step": 1627 }, { "epoch": 0.46294661634981665, "grad_norm": 1.6043994426727295, "learning_rate": 2.171789715598966e-05, "loss": 2.1552, "step": 1628 }, { "epoch": 0.46323098159327475, "grad_norm": 1.703909158706665, "learning_rate": 2.1706406205113476e-05, "loss": 2.2067, "step": 1629 }, { "epoch": 0.4635153468367329, "grad_norm": 1.5352064371109009, "learning_rate": 2.169491525423729e-05, "loss": 2.0239, "step": 1630 }, { "epoch": 0.463799712080191, "grad_norm": 1.6908349990844727, "learning_rate": 2.1683424303361105e-05, "loss": 1.9427, "step": 1631 }, { "epoch": 0.4640840773236491, "grad_norm": 1.7095677852630615, "learning_rate": 2.167193335248492e-05, "loss": 1.7988, "step": 1632 }, { "epoch": 0.46436844256710724, "grad_norm": 2.046193838119507, "learning_rate": 2.1660442401608734e-05, "loss": 3.045, "step": 1633 }, { "epoch": 0.46465280781056534, "grad_norm": 1.6458848714828491, "learning_rate": 2.164895145073255e-05, "loss": 2.6204, "step": 1634 }, { "epoch": 0.4649371730540235, "grad_norm": 1.6441450119018555, "learning_rate": 2.1637460499856363e-05, "loss": 2.482, "step": 1635 }, { "epoch": 0.4652215382974816, "grad_norm": 1.6394556760787964, "learning_rate": 2.1625969548980177e-05, "loss": 2.2559, "step": 1636 }, { "epoch": 0.46550590354093974, "grad_norm": 1.8199498653411865, "learning_rate": 2.1614478598103995e-05, "loss": 2.2969, "step": 1637 }, { "epoch": 0.46579026878439783, "grad_norm": 1.5875297784805298, "learning_rate": 2.160298764722781e-05, "loss": 1.9558, "step": 1638 }, { "epoch": 0.4660746340278559, "grad_norm": 1.6678173542022705, "learning_rate": 2.1591496696351624e-05, "loss": 1.9319, "step": 1639 }, { "epoch": 0.4663589992713141, "grad_norm": 1.7809865474700928, "learning_rate": 2.158000574547544e-05, "loss": 1.8821, "step": 1640 }, { "epoch": 0.4666433645147722, "grad_norm": 2.026603937149048, "learning_rate": 2.1568514794599253e-05, "loss": 2.7958, "step": 1641 }, { "epoch": 0.4669277297582303, "grad_norm": 1.7228740453720093, "learning_rate": 2.1557023843723068e-05, "loss": 2.591, "step": 1642 }, { "epoch": 0.4672120950016884, "grad_norm": 1.6805092096328735, "learning_rate": 2.1545532892846882e-05, "loss": 2.4039, "step": 1643 }, { "epoch": 0.4674964602451465, "grad_norm": 1.605502724647522, "learning_rate": 2.1534041941970697e-05, "loss": 2.2221, "step": 1644 }, { "epoch": 0.46778082548860467, "grad_norm": 1.750071406364441, "learning_rate": 2.152255099109451e-05, "loss": 2.2253, "step": 1645 }, { "epoch": 0.46806519073206276, "grad_norm": 1.907105565071106, "learning_rate": 2.151106004021833e-05, "loss": 1.9662, "step": 1646 }, { "epoch": 0.4683495559755209, "grad_norm": 1.6502389907836914, "learning_rate": 2.1499569089342144e-05, "loss": 1.8562, "step": 1647 }, { "epoch": 0.468633921218979, "grad_norm": 1.72910737991333, "learning_rate": 2.148807813846596e-05, "loss": 1.8048, "step": 1648 }, { "epoch": 0.4689182864624371, "grad_norm": 1.7769256830215454, "learning_rate": 2.1476587187589776e-05, "loss": 2.9131, "step": 1649 }, { "epoch": 0.46920265170589526, "grad_norm": 1.667056918144226, "learning_rate": 2.146509623671359e-05, "loss": 2.3479, "step": 1650 }, { "epoch": 0.46948701694935335, "grad_norm": 1.6939767599105835, "learning_rate": 2.1453605285837405e-05, "loss": 2.4278, "step": 1651 }, { "epoch": 0.4697713821928115, "grad_norm": 1.723417043685913, "learning_rate": 2.1442114334961223e-05, "loss": 2.2906, "step": 1652 }, { "epoch": 0.4700557474362696, "grad_norm": 1.7457711696624756, "learning_rate": 2.1430623384085037e-05, "loss": 2.1299, "step": 1653 }, { "epoch": 0.4703401126797277, "grad_norm": 1.643261432647705, "learning_rate": 2.1419132433208852e-05, "loss": 1.99, "step": 1654 }, { "epoch": 0.47062447792318585, "grad_norm": 1.7526366710662842, "learning_rate": 2.1407641482332666e-05, "loss": 1.8781, "step": 1655 }, { "epoch": 0.47090884316664394, "grad_norm": 1.6480202674865723, "learning_rate": 2.139615053145648e-05, "loss": 1.7935, "step": 1656 }, { "epoch": 0.4711932084101021, "grad_norm": 1.997900128364563, "learning_rate": 2.1384659580580295e-05, "loss": 3.0299, "step": 1657 }, { "epoch": 0.4714775736535602, "grad_norm": 1.7013146877288818, "learning_rate": 2.137316862970411e-05, "loss": 2.548, "step": 1658 }, { "epoch": 0.4717619388970183, "grad_norm": 1.5773371458053589, "learning_rate": 2.1361677678827924e-05, "loss": 2.2042, "step": 1659 }, { "epoch": 0.47204630414047644, "grad_norm": 1.5602200031280518, "learning_rate": 2.135018672795174e-05, "loss": 2.1936, "step": 1660 }, { "epoch": 0.47233066938393453, "grad_norm": 1.7694247961044312, "learning_rate": 2.1338695777075557e-05, "loss": 2.2457, "step": 1661 }, { "epoch": 0.4726150346273927, "grad_norm": 1.5969568490982056, "learning_rate": 2.132720482619937e-05, "loss": 2.0625, "step": 1662 }, { "epoch": 0.4728993998708508, "grad_norm": 1.6549208164215088, "learning_rate": 2.1315713875323186e-05, "loss": 1.9321, "step": 1663 }, { "epoch": 0.47318376511430893, "grad_norm": 1.806251049041748, "learning_rate": 2.1304222924447e-05, "loss": 2.003, "step": 1664 }, { "epoch": 0.473468130357767, "grad_norm": 2.0554563999176025, "learning_rate": 2.1292731973570815e-05, "loss": 2.967, "step": 1665 }, { "epoch": 0.4737524956012251, "grad_norm": 1.6516426801681519, "learning_rate": 2.128124102269463e-05, "loss": 2.5175, "step": 1666 }, { "epoch": 0.47403686084468327, "grad_norm": 1.5493863821029663, "learning_rate": 2.1269750071818444e-05, "loss": 2.385, "step": 1667 }, { "epoch": 0.47432122608814137, "grad_norm": 1.759292483329773, "learning_rate": 2.1258259120942258e-05, "loss": 2.3617, "step": 1668 }, { "epoch": 0.4746055913315995, "grad_norm": 1.6405413150787354, "learning_rate": 2.1246768170066073e-05, "loss": 2.2015, "step": 1669 }, { "epoch": 0.4748899565750576, "grad_norm": 1.686164140701294, "learning_rate": 2.123527721918989e-05, "loss": 2.0404, "step": 1670 }, { "epoch": 0.4751743218185157, "grad_norm": 1.6657214164733887, "learning_rate": 2.1223786268313705e-05, "loss": 1.705, "step": 1671 }, { "epoch": 0.47545868706197386, "grad_norm": 1.5510362386703491, "learning_rate": 2.121229531743752e-05, "loss": 1.8379, "step": 1672 }, { "epoch": 0.47574305230543196, "grad_norm": 1.9575543403625488, "learning_rate": 2.1200804366561334e-05, "loss": 2.9591, "step": 1673 }, { "epoch": 0.4760274175488901, "grad_norm": 1.660929799079895, "learning_rate": 2.118931341568515e-05, "loss": 2.4613, "step": 1674 }, { "epoch": 0.4763117827923482, "grad_norm": 1.6097886562347412, "learning_rate": 2.1177822464808963e-05, "loss": 2.3457, "step": 1675 }, { "epoch": 0.4765961480358063, "grad_norm": 1.6404685974121094, "learning_rate": 2.1166331513932778e-05, "loss": 2.2377, "step": 1676 }, { "epoch": 0.47688051327926445, "grad_norm": 1.6690386533737183, "learning_rate": 2.1154840563056592e-05, "loss": 2.088, "step": 1677 }, { "epoch": 0.47716487852272255, "grad_norm": 1.7172131538391113, "learning_rate": 2.1143349612180407e-05, "loss": 2.1428, "step": 1678 }, { "epoch": 0.4774492437661807, "grad_norm": 1.645805835723877, "learning_rate": 2.1131858661304225e-05, "loss": 1.7262, "step": 1679 }, { "epoch": 0.4777336090096388, "grad_norm": 1.6956250667572021, "learning_rate": 2.112036771042804e-05, "loss": 1.7153, "step": 1680 }, { "epoch": 0.4780179742530969, "grad_norm": 1.8658703565597534, "learning_rate": 2.1108876759551854e-05, "loss": 2.9007, "step": 1681 }, { "epoch": 0.47830233949655504, "grad_norm": 1.5524179935455322, "learning_rate": 2.1097385808675668e-05, "loss": 2.5446, "step": 1682 }, { "epoch": 0.47858670474001314, "grad_norm": 1.4524405002593994, "learning_rate": 2.1085894857799483e-05, "loss": 2.2187, "step": 1683 }, { "epoch": 0.4788710699834713, "grad_norm": 1.5633457899093628, "learning_rate": 2.1074403906923297e-05, "loss": 2.3211, "step": 1684 }, { "epoch": 0.4791554352269294, "grad_norm": 1.6988346576690674, "learning_rate": 2.1062912956047118e-05, "loss": 2.3174, "step": 1685 }, { "epoch": 0.47943980047038753, "grad_norm": 1.747249722480774, "learning_rate": 2.1051422005170933e-05, "loss": 1.9955, "step": 1686 }, { "epoch": 0.47972416571384563, "grad_norm": 1.5953621864318848, "learning_rate": 2.1039931054294747e-05, "loss": 1.939, "step": 1687 }, { "epoch": 0.4800085309573037, "grad_norm": 1.6366169452667236, "learning_rate": 2.1028440103418562e-05, "loss": 1.7721, "step": 1688 }, { "epoch": 0.4802928962007619, "grad_norm": 1.9023154973983765, "learning_rate": 2.1016949152542376e-05, "loss": 2.8568, "step": 1689 }, { "epoch": 0.48057726144421997, "grad_norm": 1.4789276123046875, "learning_rate": 2.100545820166619e-05, "loss": 2.4112, "step": 1690 }, { "epoch": 0.4808616266876781, "grad_norm": 1.5606416463851929, "learning_rate": 2.0993967250790005e-05, "loss": 2.2638, "step": 1691 }, { "epoch": 0.4811459919311362, "grad_norm": 1.5675312280654907, "learning_rate": 2.098247629991382e-05, "loss": 2.3888, "step": 1692 }, { "epoch": 0.4814303571745943, "grad_norm": 1.6891237497329712, "learning_rate": 2.0970985349037634e-05, "loss": 2.1829, "step": 1693 }, { "epoch": 0.48171472241805247, "grad_norm": 1.6167731285095215, "learning_rate": 2.0959494398161452e-05, "loss": 1.9001, "step": 1694 }, { "epoch": 0.48199908766151056, "grad_norm": 1.6935869455337524, "learning_rate": 2.0948003447285267e-05, "loss": 1.8639, "step": 1695 }, { "epoch": 0.4822834529049687, "grad_norm": 1.6257230043411255, "learning_rate": 2.093651249640908e-05, "loss": 1.78, "step": 1696 }, { "epoch": 0.4825678181484268, "grad_norm": 2.0507266521453857, "learning_rate": 2.0925021545532896e-05, "loss": 2.8274, "step": 1697 }, { "epoch": 0.4828521833918849, "grad_norm": 1.5421440601348877, "learning_rate": 2.091353059465671e-05, "loss": 2.5799, "step": 1698 }, { "epoch": 0.48313654863534305, "grad_norm": 1.5370677709579468, "learning_rate": 2.0902039643780525e-05, "loss": 2.2449, "step": 1699 }, { "epoch": 0.48342091387880115, "grad_norm": 1.6264358758926392, "learning_rate": 2.089054869290434e-05, "loss": 2.3207, "step": 1700 }, { "epoch": 0.4837052791222593, "grad_norm": 1.8033897876739502, "learning_rate": 2.0879057742028154e-05, "loss": 2.2928, "step": 1701 }, { "epoch": 0.4839896443657174, "grad_norm": 1.610359787940979, "learning_rate": 2.0867566791151968e-05, "loss": 1.896, "step": 1702 }, { "epoch": 0.4842740096091755, "grad_norm": 1.8859881162643433, "learning_rate": 2.0856075840275786e-05, "loss": 1.648, "step": 1703 }, { "epoch": 0.48455837485263364, "grad_norm": 1.7698290348052979, "learning_rate": 2.08445848893996e-05, "loss": 1.6407, "step": 1704 }, { "epoch": 0.48484274009609174, "grad_norm": 1.8778108358383179, "learning_rate": 2.0833093938523415e-05, "loss": 2.8316, "step": 1705 }, { "epoch": 0.4851271053395499, "grad_norm": 1.6026455163955688, "learning_rate": 2.082160298764723e-05, "loss": 2.4976, "step": 1706 }, { "epoch": 0.485411470583008, "grad_norm": 1.5125036239624023, "learning_rate": 2.0810112036771044e-05, "loss": 2.2848, "step": 1707 }, { "epoch": 0.4856958358264661, "grad_norm": 1.6800127029418945, "learning_rate": 2.079862108589486e-05, "loss": 2.1576, "step": 1708 }, { "epoch": 0.48598020106992423, "grad_norm": 1.6857738494873047, "learning_rate": 2.0787130135018673e-05, "loss": 2.1469, "step": 1709 }, { "epoch": 0.48626456631338233, "grad_norm": 1.5208419561386108, "learning_rate": 2.0775639184142487e-05, "loss": 1.9286, "step": 1710 }, { "epoch": 0.4865489315568405, "grad_norm": 1.5637043714523315, "learning_rate": 2.0764148233266302e-05, "loss": 1.7151, "step": 1711 }, { "epoch": 0.4868332968002986, "grad_norm": 1.6756120920181274, "learning_rate": 2.075265728239012e-05, "loss": 1.8036, "step": 1712 }, { "epoch": 0.4871176620437567, "grad_norm": 1.8630014657974243, "learning_rate": 2.0741166331513934e-05, "loss": 2.9317, "step": 1713 }, { "epoch": 0.4874020272872148, "grad_norm": 1.732492208480835, "learning_rate": 2.072967538063775e-05, "loss": 2.4387, "step": 1714 }, { "epoch": 0.4876863925306729, "grad_norm": 1.6385549306869507, "learning_rate": 2.0718184429761563e-05, "loss": 2.1674, "step": 1715 }, { "epoch": 0.48797075777413107, "grad_norm": 1.5367069244384766, "learning_rate": 2.0706693478885378e-05, "loss": 2.2549, "step": 1716 }, { "epoch": 0.48825512301758917, "grad_norm": 1.765271782875061, "learning_rate": 2.0695202528009192e-05, "loss": 2.2325, "step": 1717 }, { "epoch": 0.4885394882610473, "grad_norm": 1.6616159677505493, "learning_rate": 2.0683711577133007e-05, "loss": 1.806, "step": 1718 }, { "epoch": 0.4888238535045054, "grad_norm": 1.7250800132751465, "learning_rate": 2.067222062625682e-05, "loss": 1.8495, "step": 1719 }, { "epoch": 0.4891082187479635, "grad_norm": 1.7794257402420044, "learning_rate": 2.0660729675380636e-05, "loss": 1.7158, "step": 1720 }, { "epoch": 0.48939258399142166, "grad_norm": 1.9646224975585938, "learning_rate": 2.0649238724504454e-05, "loss": 2.9298, "step": 1721 }, { "epoch": 0.48967694923487975, "grad_norm": 1.6093361377716064, "learning_rate": 2.063774777362827e-05, "loss": 2.4871, "step": 1722 }, { "epoch": 0.4899613144783379, "grad_norm": 1.5131632089614868, "learning_rate": 2.0626256822752086e-05, "loss": 2.2751, "step": 1723 }, { "epoch": 0.490245679721796, "grad_norm": 1.5674437284469604, "learning_rate": 2.06147658718759e-05, "loss": 2.2159, "step": 1724 }, { "epoch": 0.4905300449652541, "grad_norm": 1.5765401124954224, "learning_rate": 2.0603274920999715e-05, "loss": 2.2252, "step": 1725 }, { "epoch": 0.49081441020871225, "grad_norm": 1.6332764625549316, "learning_rate": 2.059178397012353e-05, "loss": 1.9848, "step": 1726 }, { "epoch": 0.49109877545217034, "grad_norm": 1.555993914604187, "learning_rate": 2.0580293019247347e-05, "loss": 1.8058, "step": 1727 }, { "epoch": 0.4913831406956285, "grad_norm": 1.792938470840454, "learning_rate": 2.0568802068371162e-05, "loss": 1.7801, "step": 1728 }, { "epoch": 0.4916675059390866, "grad_norm": 1.881766676902771, "learning_rate": 2.0557311117494976e-05, "loss": 2.734, "step": 1729 }, { "epoch": 0.4919518711825447, "grad_norm": 1.5868914127349854, "learning_rate": 2.054582016661879e-05, "loss": 2.5361, "step": 1730 }, { "epoch": 0.49223623642600284, "grad_norm": 1.59299898147583, "learning_rate": 2.0534329215742605e-05, "loss": 2.3963, "step": 1731 }, { "epoch": 0.49252060166946093, "grad_norm": 1.7048550844192505, "learning_rate": 2.052283826486642e-05, "loss": 2.1933, "step": 1732 }, { "epoch": 0.4928049669129191, "grad_norm": 1.5875484943389893, "learning_rate": 2.0511347313990234e-05, "loss": 2.2065, "step": 1733 }, { "epoch": 0.4930893321563772, "grad_norm": 1.7140761613845825, "learning_rate": 2.049985636311405e-05, "loss": 1.912, "step": 1734 }, { "epoch": 0.49337369739983533, "grad_norm": 1.6499665975570679, "learning_rate": 2.0488365412237863e-05, "loss": 1.9668, "step": 1735 }, { "epoch": 0.4936580626432934, "grad_norm": 1.7199233770370483, "learning_rate": 2.047687446136168e-05, "loss": 1.7098, "step": 1736 }, { "epoch": 0.4939424278867515, "grad_norm": 1.7340093851089478, "learning_rate": 2.0465383510485496e-05, "loss": 2.9317, "step": 1737 }, { "epoch": 0.4942267931302097, "grad_norm": 1.4972059726715088, "learning_rate": 2.045389255960931e-05, "loss": 2.4092, "step": 1738 }, { "epoch": 0.49451115837366777, "grad_norm": 1.5227432250976562, "learning_rate": 2.0442401608733125e-05, "loss": 2.3194, "step": 1739 }, { "epoch": 0.4947955236171259, "grad_norm": 1.5438145399093628, "learning_rate": 2.043091065785694e-05, "loss": 2.4278, "step": 1740 }, { "epoch": 0.495079888860584, "grad_norm": 1.767443060874939, "learning_rate": 2.0419419706980754e-05, "loss": 2.2614, "step": 1741 }, { "epoch": 0.4953642541040421, "grad_norm": 1.471556544303894, "learning_rate": 2.040792875610457e-05, "loss": 1.8055, "step": 1742 }, { "epoch": 0.49564861934750026, "grad_norm": 1.977705478668213, "learning_rate": 2.0396437805228383e-05, "loss": 1.8515, "step": 1743 }, { "epoch": 0.49593298459095836, "grad_norm": 1.7835551500320435, "learning_rate": 2.0384946854352197e-05, "loss": 1.8965, "step": 1744 }, { "epoch": 0.4962173498344165, "grad_norm": 1.7893133163452148, "learning_rate": 2.0373455903476015e-05, "loss": 2.6412, "step": 1745 }, { "epoch": 0.4965017150778746, "grad_norm": 1.5060399770736694, "learning_rate": 2.036196495259983e-05, "loss": 2.176, "step": 1746 }, { "epoch": 0.4967860803213327, "grad_norm": 1.6172282695770264, "learning_rate": 2.0350474001723644e-05, "loss": 2.2377, "step": 1747 }, { "epoch": 0.49707044556479085, "grad_norm": 1.5684025287628174, "learning_rate": 2.033898305084746e-05, "loss": 2.0984, "step": 1748 }, { "epoch": 0.49735481080824895, "grad_norm": 1.6685534715652466, "learning_rate": 2.0327492099971273e-05, "loss": 2.2496, "step": 1749 }, { "epoch": 0.4976391760517071, "grad_norm": 1.6324350833892822, "learning_rate": 2.0316001149095088e-05, "loss": 1.854, "step": 1750 }, { "epoch": 0.4979235412951652, "grad_norm": 1.642563819885254, "learning_rate": 2.0304510198218902e-05, "loss": 1.8248, "step": 1751 }, { "epoch": 0.4982079065386233, "grad_norm": 1.6866960525512695, "learning_rate": 2.0293019247342717e-05, "loss": 2.0292, "step": 1752 }, { "epoch": 0.49849227178208144, "grad_norm": 1.9261857271194458, "learning_rate": 2.028152829646653e-05, "loss": 2.8267, "step": 1753 }, { "epoch": 0.49877663702553954, "grad_norm": 1.5823674201965332, "learning_rate": 2.027003734559035e-05, "loss": 2.3225, "step": 1754 }, { "epoch": 0.4990610022689977, "grad_norm": 1.5094738006591797, "learning_rate": 2.0258546394714164e-05, "loss": 2.2377, "step": 1755 }, { "epoch": 0.4993453675124558, "grad_norm": 1.5388329029083252, "learning_rate": 2.0247055443837978e-05, "loss": 2.4092, "step": 1756 }, { "epoch": 0.49962973275591394, "grad_norm": 1.5663154125213623, "learning_rate": 2.0235564492961793e-05, "loss": 2.1407, "step": 1757 }, { "epoch": 0.49991409799937203, "grad_norm": 1.633455753326416, "learning_rate": 2.0224073542085607e-05, "loss": 2.0007, "step": 1758 }, { "epoch": 0.5001984632428301, "grad_norm": 1.7223169803619385, "learning_rate": 2.0212582591209425e-05, "loss": 2.0154, "step": 1759 }, { "epoch": 0.5004828284862882, "grad_norm": 1.8982787132263184, "learning_rate": 2.0201091640333243e-05, "loss": 1.9042, "step": 1760 }, { "epoch": 0.5007671937297464, "grad_norm": 1.855883240699768, "learning_rate": 2.0189600689457057e-05, "loss": 2.6409, "step": 1761 }, { "epoch": 0.5010515589732045, "grad_norm": 1.5711190700531006, "learning_rate": 2.0178109738580872e-05, "loss": 2.5989, "step": 1762 }, { "epoch": 0.5013359242166626, "grad_norm": 1.565077781677246, "learning_rate": 2.0166618787704686e-05, "loss": 2.5103, "step": 1763 }, { "epoch": 0.5016202894601207, "grad_norm": 1.5790541172027588, "learning_rate": 2.01551278368285e-05, "loss": 2.2145, "step": 1764 }, { "epoch": 0.5019046547035788, "grad_norm": 1.712709665298462, "learning_rate": 2.0143636885952315e-05, "loss": 2.2995, "step": 1765 }, { "epoch": 0.502189019947037, "grad_norm": 1.6883087158203125, "learning_rate": 2.013214593507613e-05, "loss": 1.8596, "step": 1766 }, { "epoch": 0.5024733851904951, "grad_norm": 1.6610372066497803, "learning_rate": 2.0120654984199944e-05, "loss": 1.9617, "step": 1767 }, { "epoch": 0.5027577504339532, "grad_norm": 1.5222752094268799, "learning_rate": 2.010916403332376e-05, "loss": 1.8691, "step": 1768 }, { "epoch": 0.5030421156774113, "grad_norm": 1.8320504426956177, "learning_rate": 2.0097673082447577e-05, "loss": 2.722, "step": 1769 }, { "epoch": 0.5033264809208694, "grad_norm": 1.5912829637527466, "learning_rate": 2.008618213157139e-05, "loss": 2.4662, "step": 1770 }, { "epoch": 0.5036108461643276, "grad_norm": 1.522308588027954, "learning_rate": 2.0074691180695206e-05, "loss": 2.3028, "step": 1771 }, { "epoch": 0.5038952114077857, "grad_norm": 1.5681263208389282, "learning_rate": 2.006320022981902e-05, "loss": 2.2298, "step": 1772 }, { "epoch": 0.5041795766512438, "grad_norm": 1.6516104936599731, "learning_rate": 2.0051709278942835e-05, "loss": 2.2985, "step": 1773 }, { "epoch": 0.5044639418947019, "grad_norm": 1.5645509958267212, "learning_rate": 2.004021832806665e-05, "loss": 1.942, "step": 1774 }, { "epoch": 0.50474830713816, "grad_norm": 1.656414270401001, "learning_rate": 2.0028727377190464e-05, "loss": 1.7982, "step": 1775 }, { "epoch": 0.5050326723816182, "grad_norm": 1.6322808265686035, "learning_rate": 2.0017236426314278e-05, "loss": 1.9504, "step": 1776 }, { "epoch": 0.5053170376250763, "grad_norm": 1.7643145322799683, "learning_rate": 2.0005745475438093e-05, "loss": 2.765, "step": 1777 }, { "epoch": 0.5056014028685344, "grad_norm": 1.471491813659668, "learning_rate": 1.999425452456191e-05, "loss": 2.2441, "step": 1778 }, { "epoch": 0.5058857681119925, "grad_norm": 1.5488718748092651, "learning_rate": 1.9982763573685725e-05, "loss": 2.3885, "step": 1779 }, { "epoch": 0.5061701333554506, "grad_norm": 1.5598701238632202, "learning_rate": 1.997127262280954e-05, "loss": 2.3161, "step": 1780 }, { "epoch": 0.5064544985989088, "grad_norm": 1.5675969123840332, "learning_rate": 1.9959781671933354e-05, "loss": 2.3845, "step": 1781 }, { "epoch": 0.5067388638423669, "grad_norm": 1.645579218864441, "learning_rate": 1.994829072105717e-05, "loss": 1.949, "step": 1782 }, { "epoch": 0.507023229085825, "grad_norm": 1.514928936958313, "learning_rate": 1.9936799770180983e-05, "loss": 1.8749, "step": 1783 }, { "epoch": 0.5073075943292831, "grad_norm": 1.7080934047698975, "learning_rate": 1.9925308819304798e-05, "loss": 1.6213, "step": 1784 }, { "epoch": 0.5075919595727412, "grad_norm": 2.008892297744751, "learning_rate": 1.9913817868428612e-05, "loss": 2.8028, "step": 1785 }, { "epoch": 0.5078763248161994, "grad_norm": 1.6706644296646118, "learning_rate": 1.9902326917552427e-05, "loss": 2.1027, "step": 1786 }, { "epoch": 0.5081606900596575, "grad_norm": 1.5543371438980103, "learning_rate": 1.9890835966676244e-05, "loss": 2.4243, "step": 1787 }, { "epoch": 0.5084450553031156, "grad_norm": 1.555815577507019, "learning_rate": 1.987934501580006e-05, "loss": 2.3077, "step": 1788 }, { "epoch": 0.5087294205465737, "grad_norm": 1.6928585767745972, "learning_rate": 1.9867854064923873e-05, "loss": 2.0664, "step": 1789 }, { "epoch": 0.5090137857900318, "grad_norm": 1.5728840827941895, "learning_rate": 1.985636311404769e-05, "loss": 2.0858, "step": 1790 }, { "epoch": 0.50929815103349, "grad_norm": 1.7079205513000488, "learning_rate": 1.9844872163171506e-05, "loss": 1.8084, "step": 1791 }, { "epoch": 0.5095825162769481, "grad_norm": 1.6424134969711304, "learning_rate": 1.983338121229532e-05, "loss": 1.7926, "step": 1792 }, { "epoch": 0.5098668815204062, "grad_norm": 1.9183549880981445, "learning_rate": 1.9821890261419135e-05, "loss": 2.7518, "step": 1793 }, { "epoch": 0.5101512467638643, "grad_norm": 1.5652717351913452, "learning_rate": 1.981039931054295e-05, "loss": 2.2392, "step": 1794 }, { "epoch": 0.5104356120073225, "grad_norm": 1.5693566799163818, "learning_rate": 1.9798908359666764e-05, "loss": 2.5368, "step": 1795 }, { "epoch": 0.5107199772507806, "grad_norm": 1.57747220993042, "learning_rate": 1.978741740879058e-05, "loss": 2.2448, "step": 1796 }, { "epoch": 0.5110043424942387, "grad_norm": 1.6220982074737549, "learning_rate": 1.9775926457914393e-05, "loss": 1.9995, "step": 1797 }, { "epoch": 0.5112887077376967, "grad_norm": 1.5088688135147095, "learning_rate": 1.9764435507038207e-05, "loss": 1.9295, "step": 1798 }, { "epoch": 0.5115730729811548, "grad_norm": 1.5756409168243408, "learning_rate": 1.9752944556162025e-05, "loss": 1.8758, "step": 1799 }, { "epoch": 0.511857438224613, "grad_norm": 1.6049522161483765, "learning_rate": 1.974145360528584e-05, "loss": 1.6545, "step": 1800 }, { "epoch": 0.5121418034680711, "grad_norm": 1.9730896949768066, "learning_rate": 1.9729962654409654e-05, "loss": 2.7769, "step": 1801 }, { "epoch": 0.5124261687115292, "grad_norm": 1.4819504022598267, "learning_rate": 1.971847170353347e-05, "loss": 2.2833, "step": 1802 }, { "epoch": 0.5127105339549873, "grad_norm": 1.6456480026245117, "learning_rate": 1.9706980752657283e-05, "loss": 2.3185, "step": 1803 }, { "epoch": 0.5129948991984454, "grad_norm": 1.5290210247039795, "learning_rate": 1.9695489801781098e-05, "loss": 2.4907, "step": 1804 }, { "epoch": 0.5132792644419036, "grad_norm": 1.5902469158172607, "learning_rate": 1.9683998850904912e-05, "loss": 2.2612, "step": 1805 }, { "epoch": 0.5135636296853617, "grad_norm": 1.622529149055481, "learning_rate": 1.967250790002873e-05, "loss": 1.9592, "step": 1806 }, { "epoch": 0.5138479949288198, "grad_norm": 1.5335959196090698, "learning_rate": 1.9661016949152545e-05, "loss": 1.7958, "step": 1807 }, { "epoch": 0.5141323601722779, "grad_norm": 1.5710722208023071, "learning_rate": 1.964952599827636e-05, "loss": 1.7861, "step": 1808 }, { "epoch": 0.514416725415736, "grad_norm": 1.9415019750595093, "learning_rate": 1.9638035047400174e-05, "loss": 2.8067, "step": 1809 }, { "epoch": 0.5147010906591942, "grad_norm": 1.686766266822815, "learning_rate": 1.9626544096523988e-05, "loss": 2.4224, "step": 1810 }, { "epoch": 0.5149854559026523, "grad_norm": 1.6373934745788574, "learning_rate": 1.9615053145647806e-05, "loss": 2.3926, "step": 1811 }, { "epoch": 0.5152698211461104, "grad_norm": 1.5860379934310913, "learning_rate": 1.960356219477162e-05, "loss": 2.213, "step": 1812 }, { "epoch": 0.5155541863895685, "grad_norm": 1.5629751682281494, "learning_rate": 1.9592071243895435e-05, "loss": 2.2401, "step": 1813 }, { "epoch": 0.5158385516330266, "grad_norm": 1.6226354837417603, "learning_rate": 1.958058029301925e-05, "loss": 1.8855, "step": 1814 }, { "epoch": 0.5161229168764848, "grad_norm": 1.6399784088134766, "learning_rate": 1.9569089342143064e-05, "loss": 1.9354, "step": 1815 }, { "epoch": 0.5164072821199429, "grad_norm": 1.8511910438537598, "learning_rate": 1.955759839126688e-05, "loss": 1.7383, "step": 1816 }, { "epoch": 0.516691647363401, "grad_norm": 1.8263779878616333, "learning_rate": 1.9546107440390693e-05, "loss": 2.7295, "step": 1817 }, { "epoch": 0.5169760126068591, "grad_norm": 1.721635341644287, "learning_rate": 1.9534616489514507e-05, "loss": 2.4588, "step": 1818 }, { "epoch": 0.5172603778503172, "grad_norm": 1.6186027526855469, "learning_rate": 1.9523125538638322e-05, "loss": 2.4666, "step": 1819 }, { "epoch": 0.5175447430937754, "grad_norm": 1.6999125480651855, "learning_rate": 1.951163458776214e-05, "loss": 2.21, "step": 1820 }, { "epoch": 0.5178291083372335, "grad_norm": 1.6916403770446777, "learning_rate": 1.9500143636885954e-05, "loss": 2.037, "step": 1821 }, { "epoch": 0.5181134735806916, "grad_norm": 1.541050672531128, "learning_rate": 1.948865268600977e-05, "loss": 1.8888, "step": 1822 }, { "epoch": 0.5183978388241497, "grad_norm": 1.5618538856506348, "learning_rate": 1.9477161735133583e-05, "loss": 1.8478, "step": 1823 }, { "epoch": 0.5186822040676078, "grad_norm": 1.6209863424301147, "learning_rate": 1.94656707842574e-05, "loss": 1.6114, "step": 1824 }, { "epoch": 0.518966569311066, "grad_norm": 1.7759110927581787, "learning_rate": 1.9454179833381216e-05, "loss": 2.6206, "step": 1825 }, { "epoch": 0.5192509345545241, "grad_norm": 1.5494519472122192, "learning_rate": 1.944268888250503e-05, "loss": 2.3457, "step": 1826 }, { "epoch": 0.5195352997979822, "grad_norm": 1.6379374265670776, "learning_rate": 1.9431197931628845e-05, "loss": 2.581, "step": 1827 }, { "epoch": 0.5198196650414403, "grad_norm": 1.5918887853622437, "learning_rate": 1.941970698075266e-05, "loss": 2.1037, "step": 1828 }, { "epoch": 0.5201040302848984, "grad_norm": 1.5805288553237915, "learning_rate": 1.9408216029876474e-05, "loss": 2.2555, "step": 1829 }, { "epoch": 0.5203883955283566, "grad_norm": 1.653382420539856, "learning_rate": 1.9396725079000288e-05, "loss": 1.7606, "step": 1830 }, { "epoch": 0.5206727607718147, "grad_norm": 1.5923447608947754, "learning_rate": 1.9385234128124103e-05, "loss": 1.6076, "step": 1831 }, { "epoch": 0.5209571260152728, "grad_norm": 1.7855467796325684, "learning_rate": 1.937374317724792e-05, "loss": 1.8589, "step": 1832 }, { "epoch": 0.5212414912587309, "grad_norm": 1.8567280769348145, "learning_rate": 1.9362252226371735e-05, "loss": 2.7604, "step": 1833 }, { "epoch": 0.521525856502189, "grad_norm": 1.6221263408660889, "learning_rate": 1.935076127549555e-05, "loss": 2.4281, "step": 1834 }, { "epoch": 0.5218102217456472, "grad_norm": 1.6806731224060059, "learning_rate": 1.9339270324619364e-05, "loss": 2.4162, "step": 1835 }, { "epoch": 0.5220945869891053, "grad_norm": 1.55227792263031, "learning_rate": 1.932777937374318e-05, "loss": 2.0863, "step": 1836 }, { "epoch": 0.5223789522325634, "grad_norm": 1.5924606323242188, "learning_rate": 1.9316288422866993e-05, "loss": 2.004, "step": 1837 }, { "epoch": 0.5226633174760215, "grad_norm": 1.6249966621398926, "learning_rate": 1.9304797471990808e-05, "loss": 2.0434, "step": 1838 }, { "epoch": 0.5229476827194796, "grad_norm": 1.702721357345581, "learning_rate": 1.9293306521114622e-05, "loss": 1.8394, "step": 1839 }, { "epoch": 0.5232320479629378, "grad_norm": 1.7455133199691772, "learning_rate": 1.9281815570238437e-05, "loss": 1.5562, "step": 1840 }, { "epoch": 0.5235164132063959, "grad_norm": 1.8886913061141968, "learning_rate": 1.9270324619362254e-05, "loss": 2.6912, "step": 1841 }, { "epoch": 0.523800778449854, "grad_norm": 1.5527037382125854, "learning_rate": 1.925883366848607e-05, "loss": 2.4175, "step": 1842 }, { "epoch": 0.524085143693312, "grad_norm": 1.5094817876815796, "learning_rate": 1.9247342717609883e-05, "loss": 2.299, "step": 1843 }, { "epoch": 0.5243695089367703, "grad_norm": 1.5052844285964966, "learning_rate": 1.92358517667337e-05, "loss": 2.3148, "step": 1844 }, { "epoch": 0.5246538741802284, "grad_norm": 1.5606803894042969, "learning_rate": 1.9224360815857516e-05, "loss": 2.2779, "step": 1845 }, { "epoch": 0.5249382394236864, "grad_norm": 1.6374552249908447, "learning_rate": 1.921286986498133e-05, "loss": 1.9273, "step": 1846 }, { "epoch": 0.5252226046671445, "grad_norm": 1.6898576021194458, "learning_rate": 1.9201378914105145e-05, "loss": 1.8248, "step": 1847 }, { "epoch": 0.5255069699106026, "grad_norm": 1.6066820621490479, "learning_rate": 1.918988796322896e-05, "loss": 1.8573, "step": 1848 }, { "epoch": 0.5257913351540608, "grad_norm": 1.8908296823501587, "learning_rate": 1.9178397012352774e-05, "loss": 2.9586, "step": 1849 }, { "epoch": 0.5260757003975189, "grad_norm": 1.7448999881744385, "learning_rate": 1.9166906061476588e-05, "loss": 2.5243, "step": 1850 }, { "epoch": 0.526360065640977, "grad_norm": 1.5253217220306396, "learning_rate": 1.9155415110600403e-05, "loss": 2.3295, "step": 1851 }, { "epoch": 0.5266444308844351, "grad_norm": 1.5445467233657837, "learning_rate": 1.9143924159724217e-05, "loss": 2.3908, "step": 1852 }, { "epoch": 0.5269287961278932, "grad_norm": 1.622870922088623, "learning_rate": 1.9132433208848035e-05, "loss": 2.0477, "step": 1853 }, { "epoch": 0.5272131613713514, "grad_norm": 1.7335789203643799, "learning_rate": 1.912094225797185e-05, "loss": 1.821, "step": 1854 }, { "epoch": 0.5274975266148095, "grad_norm": 1.7417482137680054, "learning_rate": 1.9109451307095664e-05, "loss": 1.7534, "step": 1855 }, { "epoch": 0.5277818918582676, "grad_norm": 1.7831518650054932, "learning_rate": 1.909796035621948e-05, "loss": 2.0581, "step": 1856 }, { "epoch": 0.5280662571017257, "grad_norm": 1.9567564725875854, "learning_rate": 1.9086469405343293e-05, "loss": 2.7799, "step": 1857 }, { "epoch": 0.5283506223451838, "grad_norm": 1.6343289613723755, "learning_rate": 1.9074978454467108e-05, "loss": 2.5412, "step": 1858 }, { "epoch": 0.528634987588642, "grad_norm": 1.4926238059997559, "learning_rate": 1.9063487503590922e-05, "loss": 2.3056, "step": 1859 }, { "epoch": 0.5289193528321001, "grad_norm": 1.5979171991348267, "learning_rate": 1.9051996552714737e-05, "loss": 2.4987, "step": 1860 }, { "epoch": 0.5292037180755582, "grad_norm": 1.6570512056350708, "learning_rate": 1.9040505601838555e-05, "loss": 2.0305, "step": 1861 }, { "epoch": 0.5294880833190163, "grad_norm": 1.517991065979004, "learning_rate": 1.902901465096237e-05, "loss": 1.672, "step": 1862 }, { "epoch": 0.5297724485624744, "grad_norm": 1.5992649793624878, "learning_rate": 1.9017523700086184e-05, "loss": 1.8708, "step": 1863 }, { "epoch": 0.5300568138059326, "grad_norm": 1.6488360166549683, "learning_rate": 1.9006032749209998e-05, "loss": 1.7175, "step": 1864 }, { "epoch": 0.5303411790493907, "grad_norm": 1.810335397720337, "learning_rate": 1.8994541798333816e-05, "loss": 2.8107, "step": 1865 }, { "epoch": 0.5306255442928488, "grad_norm": 1.5411492586135864, "learning_rate": 1.898305084745763e-05, "loss": 2.4042, "step": 1866 }, { "epoch": 0.5309099095363069, "grad_norm": 1.6033616065979004, "learning_rate": 1.8971559896581445e-05, "loss": 2.5207, "step": 1867 }, { "epoch": 0.531194274779765, "grad_norm": 1.451252818107605, "learning_rate": 1.896006894570526e-05, "loss": 2.2705, "step": 1868 }, { "epoch": 0.5314786400232232, "grad_norm": 1.614965558052063, "learning_rate": 1.8948577994829074e-05, "loss": 2.3188, "step": 1869 }, { "epoch": 0.5317630052666813, "grad_norm": 1.570432186126709, "learning_rate": 1.893708704395289e-05, "loss": 1.901, "step": 1870 }, { "epoch": 0.5320473705101394, "grad_norm": 1.603621244430542, "learning_rate": 1.8925596093076703e-05, "loss": 1.811, "step": 1871 }, { "epoch": 0.5323317357535975, "grad_norm": 1.7074507474899292, "learning_rate": 1.8914105142200517e-05, "loss": 1.7383, "step": 1872 }, { "epoch": 0.5326161009970556, "grad_norm": 1.7993121147155762, "learning_rate": 1.8902614191324332e-05, "loss": 2.8412, "step": 1873 }, { "epoch": 0.5329004662405138, "grad_norm": 1.4372435808181763, "learning_rate": 1.889112324044815e-05, "loss": 2.4849, "step": 1874 }, { "epoch": 0.5331848314839719, "grad_norm": 1.4756051301956177, "learning_rate": 1.8879632289571964e-05, "loss": 2.3527, "step": 1875 }, { "epoch": 0.53346919672743, "grad_norm": 1.54763925075531, "learning_rate": 1.886814133869578e-05, "loss": 2.1188, "step": 1876 }, { "epoch": 0.5337535619708881, "grad_norm": 1.6387181282043457, "learning_rate": 1.8856650387819593e-05, "loss": 2.2232, "step": 1877 }, { "epoch": 0.5340379272143462, "grad_norm": 1.5445959568023682, "learning_rate": 1.8845159436943408e-05, "loss": 1.96, "step": 1878 }, { "epoch": 0.5343222924578044, "grad_norm": 1.6059736013412476, "learning_rate": 1.8833668486067222e-05, "loss": 1.9384, "step": 1879 }, { "epoch": 0.5346066577012625, "grad_norm": 1.5326387882232666, "learning_rate": 1.882217753519104e-05, "loss": 1.8572, "step": 1880 }, { "epoch": 0.5348910229447206, "grad_norm": 1.9614207744598389, "learning_rate": 1.8810686584314855e-05, "loss": 2.8123, "step": 1881 }, { "epoch": 0.5351753881881787, "grad_norm": 1.532706618309021, "learning_rate": 1.879919563343867e-05, "loss": 2.3232, "step": 1882 }, { "epoch": 0.5354597534316368, "grad_norm": 1.612025499343872, "learning_rate": 1.8787704682562484e-05, "loss": 2.4058, "step": 1883 }, { "epoch": 0.535744118675095, "grad_norm": 1.535540223121643, "learning_rate": 1.8776213731686298e-05, "loss": 2.2764, "step": 1884 }, { "epoch": 0.5360284839185531, "grad_norm": 1.7752254009246826, "learning_rate": 1.8764722780810113e-05, "loss": 2.1374, "step": 1885 }, { "epoch": 0.5363128491620112, "grad_norm": 1.6558598279953003, "learning_rate": 1.875323182993393e-05, "loss": 1.9204, "step": 1886 }, { "epoch": 0.5365972144054693, "grad_norm": 1.5669152736663818, "learning_rate": 1.8741740879057745e-05, "loss": 1.9425, "step": 1887 }, { "epoch": 0.5368815796489275, "grad_norm": 1.711891531944275, "learning_rate": 1.873024992818156e-05, "loss": 1.8079, "step": 1888 }, { "epoch": 0.5371659448923856, "grad_norm": 1.939569115638733, "learning_rate": 1.8718758977305374e-05, "loss": 2.7149, "step": 1889 }, { "epoch": 0.5374503101358437, "grad_norm": 1.6168116331100464, "learning_rate": 1.870726802642919e-05, "loss": 2.6034, "step": 1890 }, { "epoch": 0.5377346753793018, "grad_norm": 1.5140300989151, "learning_rate": 1.8695777075553003e-05, "loss": 2.2716, "step": 1891 }, { "epoch": 0.5380190406227598, "grad_norm": 1.440793752670288, "learning_rate": 1.8684286124676817e-05, "loss": 2.2802, "step": 1892 }, { "epoch": 0.538303405866218, "grad_norm": 1.5787031650543213, "learning_rate": 1.8672795173800632e-05, "loss": 2.3291, "step": 1893 }, { "epoch": 0.5385877711096761, "grad_norm": 1.5322247743606567, "learning_rate": 1.8661304222924446e-05, "loss": 2.0667, "step": 1894 }, { "epoch": 0.5388721363531342, "grad_norm": 1.6390894651412964, "learning_rate": 1.8649813272048264e-05, "loss": 1.8755, "step": 1895 }, { "epoch": 0.5391565015965923, "grad_norm": 1.7358583211898804, "learning_rate": 1.863832232117208e-05, "loss": 1.8302, "step": 1896 }, { "epoch": 0.5394408668400504, "grad_norm": 1.8744136095046997, "learning_rate": 1.8626831370295893e-05, "loss": 2.8263, "step": 1897 }, { "epoch": 0.5397252320835086, "grad_norm": 1.5263346433639526, "learning_rate": 1.861534041941971e-05, "loss": 2.1965, "step": 1898 }, { "epoch": 0.5400095973269667, "grad_norm": 1.5258394479751587, "learning_rate": 1.8603849468543526e-05, "loss": 2.2005, "step": 1899 }, { "epoch": 0.5402939625704248, "grad_norm": 1.5305837392807007, "learning_rate": 1.859235851766734e-05, "loss": 2.2732, "step": 1900 }, { "epoch": 0.5405783278138829, "grad_norm": 1.7153434753417969, "learning_rate": 1.8580867566791155e-05, "loss": 2.2237, "step": 1901 }, { "epoch": 0.540862693057341, "grad_norm": 1.5634775161743164, "learning_rate": 1.856937661591497e-05, "loss": 1.9519, "step": 1902 }, { "epoch": 0.5411470583007992, "grad_norm": 1.6455775499343872, "learning_rate": 1.8557885665038784e-05, "loss": 1.8184, "step": 1903 }, { "epoch": 0.5414314235442573, "grad_norm": 1.5984224081039429, "learning_rate": 1.8546394714162598e-05, "loss": 1.7093, "step": 1904 }, { "epoch": 0.5417157887877154, "grad_norm": 2.2204861640930176, "learning_rate": 1.8534903763286413e-05, "loss": 2.7596, "step": 1905 }, { "epoch": 0.5420001540311735, "grad_norm": 1.8560435771942139, "learning_rate": 1.8523412812410227e-05, "loss": 2.5364, "step": 1906 }, { "epoch": 0.5422845192746316, "grad_norm": 1.5567165613174438, "learning_rate": 1.8511921861534045e-05, "loss": 2.2509, "step": 1907 }, { "epoch": 0.5425688845180898, "grad_norm": 1.4999059438705444, "learning_rate": 1.850043091065786e-05, "loss": 2.153, "step": 1908 }, { "epoch": 0.5428532497615479, "grad_norm": 1.6572489738464355, "learning_rate": 1.8488939959781674e-05, "loss": 1.9657, "step": 1909 }, { "epoch": 0.543137615005006, "grad_norm": 1.5145512819290161, "learning_rate": 1.847744900890549e-05, "loss": 1.7628, "step": 1910 }, { "epoch": 0.5434219802484641, "grad_norm": 1.5486762523651123, "learning_rate": 1.8465958058029303e-05, "loss": 1.8377, "step": 1911 }, { "epoch": 0.5437063454919222, "grad_norm": 1.6072850227355957, "learning_rate": 1.8454467107153118e-05, "loss": 1.9599, "step": 1912 }, { "epoch": 0.5439907107353804, "grad_norm": 1.9221049547195435, "learning_rate": 1.8442976156276932e-05, "loss": 2.8359, "step": 1913 }, { "epoch": 0.5442750759788385, "grad_norm": 1.6474133729934692, "learning_rate": 1.8431485205400747e-05, "loss": 2.495, "step": 1914 }, { "epoch": 0.5445594412222966, "grad_norm": 1.5209523439407349, "learning_rate": 1.841999425452456e-05, "loss": 2.2543, "step": 1915 }, { "epoch": 0.5448438064657547, "grad_norm": 1.592429280281067, "learning_rate": 1.840850330364838e-05, "loss": 2.2111, "step": 1916 }, { "epoch": 0.5451281717092128, "grad_norm": 1.5777971744537354, "learning_rate": 1.8397012352772193e-05, "loss": 2.1635, "step": 1917 }, { "epoch": 0.545412536952671, "grad_norm": 1.5663307905197144, "learning_rate": 1.8385521401896008e-05, "loss": 1.8126, "step": 1918 }, { "epoch": 0.5456969021961291, "grad_norm": 1.643549919128418, "learning_rate": 1.8374030451019826e-05, "loss": 1.9639, "step": 1919 }, { "epoch": 0.5459812674395872, "grad_norm": 1.7581907510757446, "learning_rate": 1.836253950014364e-05, "loss": 1.5631, "step": 1920 }, { "epoch": 0.5462656326830453, "grad_norm": 1.8245182037353516, "learning_rate": 1.8351048549267455e-05, "loss": 2.8506, "step": 1921 }, { "epoch": 0.5465499979265034, "grad_norm": 1.53452730178833, "learning_rate": 1.833955759839127e-05, "loss": 2.3347, "step": 1922 }, { "epoch": 0.5468343631699616, "grad_norm": 1.4857192039489746, "learning_rate": 1.8328066647515084e-05, "loss": 2.3888, "step": 1923 }, { "epoch": 0.5471187284134197, "grad_norm": 1.5653821229934692, "learning_rate": 1.83165756966389e-05, "loss": 2.2097, "step": 1924 }, { "epoch": 0.5474030936568778, "grad_norm": 1.7162553071975708, "learning_rate": 1.8305084745762713e-05, "loss": 1.9651, "step": 1925 }, { "epoch": 0.5476874589003359, "grad_norm": 1.5265828371047974, "learning_rate": 1.8293593794886527e-05, "loss": 1.8044, "step": 1926 }, { "epoch": 0.547971824143794, "grad_norm": 1.5719175338745117, "learning_rate": 1.8282102844010342e-05, "loss": 1.8203, "step": 1927 }, { "epoch": 0.5482561893872522, "grad_norm": 1.6208674907684326, "learning_rate": 1.827061189313416e-05, "loss": 1.777, "step": 1928 }, { "epoch": 0.5485405546307103, "grad_norm": 1.8437464237213135, "learning_rate": 1.8259120942257974e-05, "loss": 2.8108, "step": 1929 }, { "epoch": 0.5488249198741684, "grad_norm": 1.4935005903244019, "learning_rate": 1.824762999138179e-05, "loss": 2.4553, "step": 1930 }, { "epoch": 0.5491092851176265, "grad_norm": 1.4502531290054321, "learning_rate": 1.8236139040505603e-05, "loss": 2.1391, "step": 1931 }, { "epoch": 0.5493936503610846, "grad_norm": 1.467806100845337, "learning_rate": 1.8224648089629418e-05, "loss": 2.0988, "step": 1932 }, { "epoch": 0.5496780156045428, "grad_norm": 1.5642067193984985, "learning_rate": 1.8213157138753232e-05, "loss": 2.1998, "step": 1933 }, { "epoch": 0.5499623808480009, "grad_norm": 1.6454665660858154, "learning_rate": 1.8201666187877047e-05, "loss": 1.8338, "step": 1934 }, { "epoch": 0.550246746091459, "grad_norm": 1.665605068206787, "learning_rate": 1.8190175237000865e-05, "loss": 1.8698, "step": 1935 }, { "epoch": 0.5505311113349171, "grad_norm": 1.661318063735962, "learning_rate": 1.817868428612468e-05, "loss": 1.7389, "step": 1936 }, { "epoch": 0.5508154765783753, "grad_norm": 1.778904676437378, "learning_rate": 1.8167193335248494e-05, "loss": 2.6794, "step": 1937 }, { "epoch": 0.5510998418218334, "grad_norm": 1.7070825099945068, "learning_rate": 1.8155702384372308e-05, "loss": 2.4678, "step": 1938 }, { "epoch": 0.5513842070652915, "grad_norm": 1.5696340799331665, "learning_rate": 1.8144211433496123e-05, "loss": 2.2423, "step": 1939 }, { "epoch": 0.5516685723087495, "grad_norm": 1.664480209350586, "learning_rate": 1.813272048261994e-05, "loss": 2.2692, "step": 1940 }, { "epoch": 0.5519529375522076, "grad_norm": 1.7250727415084839, "learning_rate": 1.8121229531743755e-05, "loss": 1.8935, "step": 1941 }, { "epoch": 0.5522373027956659, "grad_norm": 1.5921610593795776, "learning_rate": 1.810973858086757e-05, "loss": 1.9107, "step": 1942 }, { "epoch": 0.552521668039124, "grad_norm": 1.5735629796981812, "learning_rate": 1.8098247629991384e-05, "loss": 1.8625, "step": 1943 }, { "epoch": 0.552806033282582, "grad_norm": 1.7604540586471558, "learning_rate": 1.80867566791152e-05, "loss": 1.9191, "step": 1944 }, { "epoch": 0.5530903985260401, "grad_norm": 1.894787311553955, "learning_rate": 1.8075265728239013e-05, "loss": 2.8069, "step": 1945 }, { "epoch": 0.5533747637694982, "grad_norm": 1.5562604665756226, "learning_rate": 1.8063774777362827e-05, "loss": 2.5382, "step": 1946 }, { "epoch": 0.5536591290129564, "grad_norm": 1.5096544027328491, "learning_rate": 1.8052283826486642e-05, "loss": 2.174, "step": 1947 }, { "epoch": 0.5539434942564145, "grad_norm": 1.5471742153167725, "learning_rate": 1.8040792875610456e-05, "loss": 2.3403, "step": 1948 }, { "epoch": 0.5542278594998726, "grad_norm": 1.6458371877670288, "learning_rate": 1.8029301924734274e-05, "loss": 2.0733, "step": 1949 }, { "epoch": 0.5545122247433307, "grad_norm": 1.7109332084655762, "learning_rate": 1.801781097385809e-05, "loss": 1.8824, "step": 1950 }, { "epoch": 0.5547965899867888, "grad_norm": 1.7361643314361572, "learning_rate": 1.8006320022981903e-05, "loss": 1.8698, "step": 1951 }, { "epoch": 0.555080955230247, "grad_norm": 1.6732168197631836, "learning_rate": 1.7994829072105718e-05, "loss": 1.774, "step": 1952 }, { "epoch": 0.5553653204737051, "grad_norm": 1.7529971599578857, "learning_rate": 1.7983338121229532e-05, "loss": 2.8649, "step": 1953 }, { "epoch": 0.5556496857171632, "grad_norm": 1.5565195083618164, "learning_rate": 1.797184717035335e-05, "loss": 2.4457, "step": 1954 }, { "epoch": 0.5559340509606213, "grad_norm": 1.5663349628448486, "learning_rate": 1.7960356219477165e-05, "loss": 2.2654, "step": 1955 }, { "epoch": 0.5562184162040794, "grad_norm": 1.509903907775879, "learning_rate": 1.794886526860098e-05, "loss": 2.1682, "step": 1956 }, { "epoch": 0.5565027814475376, "grad_norm": 1.7437077760696411, "learning_rate": 1.7937374317724794e-05, "loss": 2.1682, "step": 1957 }, { "epoch": 0.5567871466909957, "grad_norm": 1.532533049583435, "learning_rate": 1.7925883366848608e-05, "loss": 1.7354, "step": 1958 }, { "epoch": 0.5570715119344538, "grad_norm": 1.6096084117889404, "learning_rate": 1.7914392415972423e-05, "loss": 1.8484, "step": 1959 }, { "epoch": 0.5573558771779119, "grad_norm": 1.8258168697357178, "learning_rate": 1.7902901465096237e-05, "loss": 1.807, "step": 1960 }, { "epoch": 0.55764024242137, "grad_norm": 2.004221200942993, "learning_rate": 1.789141051422005e-05, "loss": 2.7515, "step": 1961 }, { "epoch": 0.5579246076648282, "grad_norm": 1.593041181564331, "learning_rate": 1.787991956334387e-05, "loss": 2.3786, "step": 1962 }, { "epoch": 0.5582089729082863, "grad_norm": 1.5780787467956543, "learning_rate": 1.7868428612467684e-05, "loss": 2.3528, "step": 1963 }, { "epoch": 0.5584933381517444, "grad_norm": 1.5928815603256226, "learning_rate": 1.78569376615915e-05, "loss": 2.2974, "step": 1964 }, { "epoch": 0.5587777033952025, "grad_norm": 1.7977780103683472, "learning_rate": 1.7845446710715313e-05, "loss": 2.2807, "step": 1965 }, { "epoch": 0.5590620686386606, "grad_norm": 1.5917166471481323, "learning_rate": 1.7833955759839128e-05, "loss": 2.0174, "step": 1966 }, { "epoch": 0.5593464338821188, "grad_norm": 1.55897057056427, "learning_rate": 1.7822464808962942e-05, "loss": 1.8759, "step": 1967 }, { "epoch": 0.5596307991255769, "grad_norm": 1.6804914474487305, "learning_rate": 1.7810973858086757e-05, "loss": 1.7898, "step": 1968 }, { "epoch": 0.559915164369035, "grad_norm": 1.7221648693084717, "learning_rate": 1.779948290721057e-05, "loss": 2.6948, "step": 1969 }, { "epoch": 0.5601995296124931, "grad_norm": 1.5615862607955933, "learning_rate": 1.7787991956334386e-05, "loss": 2.2914, "step": 1970 }, { "epoch": 0.5604838948559512, "grad_norm": 1.5630532503128052, "learning_rate": 1.7776501005458203e-05, "loss": 2.4075, "step": 1971 }, { "epoch": 0.5607682600994094, "grad_norm": 1.6465175151824951, "learning_rate": 1.7765010054582018e-05, "loss": 2.2431, "step": 1972 }, { "epoch": 0.5610526253428675, "grad_norm": 1.6053401231765747, "learning_rate": 1.7753519103705832e-05, "loss": 1.9444, "step": 1973 }, { "epoch": 0.5613369905863256, "grad_norm": 1.5108147859573364, "learning_rate": 1.774202815282965e-05, "loss": 1.8403, "step": 1974 }, { "epoch": 0.5616213558297837, "grad_norm": 1.814316987991333, "learning_rate": 1.7730537201953465e-05, "loss": 1.7395, "step": 1975 }, { "epoch": 0.5619057210732418, "grad_norm": 1.6270887851715088, "learning_rate": 1.771904625107728e-05, "loss": 1.6755, "step": 1976 }, { "epoch": 0.5621900863167, "grad_norm": 1.9045475721359253, "learning_rate": 1.7707555300201094e-05, "loss": 2.8776, "step": 1977 }, { "epoch": 0.5624744515601581, "grad_norm": 1.6713111400604248, "learning_rate": 1.769606434932491e-05, "loss": 2.3999, "step": 1978 }, { "epoch": 0.5627588168036162, "grad_norm": 1.4755371809005737, "learning_rate": 1.7684573398448723e-05, "loss": 2.344, "step": 1979 }, { "epoch": 0.5630431820470743, "grad_norm": 1.6609379053115845, "learning_rate": 1.7673082447572537e-05, "loss": 2.2334, "step": 1980 }, { "epoch": 0.5633275472905324, "grad_norm": 1.750351071357727, "learning_rate": 1.7661591496696352e-05, "loss": 2.2961, "step": 1981 }, { "epoch": 0.5636119125339906, "grad_norm": 1.47465181350708, "learning_rate": 1.7650100545820166e-05, "loss": 1.7603, "step": 1982 }, { "epoch": 0.5638962777774487, "grad_norm": 1.627179503440857, "learning_rate": 1.7638609594943984e-05, "loss": 1.8822, "step": 1983 }, { "epoch": 0.5641806430209068, "grad_norm": 1.570872187614441, "learning_rate": 1.76271186440678e-05, "loss": 1.8734, "step": 1984 }, { "epoch": 0.5644650082643649, "grad_norm": 1.7837358713150024, "learning_rate": 1.7615627693191613e-05, "loss": 2.7609, "step": 1985 }, { "epoch": 0.5647493735078231, "grad_norm": 1.5438477993011475, "learning_rate": 1.7604136742315428e-05, "loss": 2.2791, "step": 1986 }, { "epoch": 0.5650337387512812, "grad_norm": 1.561362385749817, "learning_rate": 1.7592645791439242e-05, "loss": 2.1921, "step": 1987 }, { "epoch": 0.5653181039947393, "grad_norm": 1.4173671007156372, "learning_rate": 1.7581154840563057e-05, "loss": 2.0708, "step": 1988 }, { "epoch": 0.5656024692381973, "grad_norm": 1.5849429368972778, "learning_rate": 1.756966388968687e-05, "loss": 2.1582, "step": 1989 }, { "epoch": 0.5658868344816554, "grad_norm": 1.5969016551971436, "learning_rate": 1.7558172938810686e-05, "loss": 1.9153, "step": 1990 }, { "epoch": 0.5661711997251136, "grad_norm": 1.72384774684906, "learning_rate": 1.7546681987934504e-05, "loss": 2.1121, "step": 1991 }, { "epoch": 0.5664555649685717, "grad_norm": 1.6643520593643188, "learning_rate": 1.7535191037058318e-05, "loss": 1.7982, "step": 1992 }, { "epoch": 0.5667399302120298, "grad_norm": 1.7195711135864258, "learning_rate": 1.7523700086182133e-05, "loss": 2.8548, "step": 1993 }, { "epoch": 0.5670242954554879, "grad_norm": 1.5349563360214233, "learning_rate": 1.7512209135305947e-05, "loss": 2.5764, "step": 1994 }, { "epoch": 0.567308660698946, "grad_norm": 1.562687873840332, "learning_rate": 1.7500718184429765e-05, "loss": 2.1619, "step": 1995 }, { "epoch": 0.5675930259424042, "grad_norm": 1.4912785291671753, "learning_rate": 1.748922723355358e-05, "loss": 2.2925, "step": 1996 }, { "epoch": 0.5678773911858623, "grad_norm": 1.5238652229309082, "learning_rate": 1.7477736282677394e-05, "loss": 2.0571, "step": 1997 }, { "epoch": 0.5681617564293204, "grad_norm": 1.570028305053711, "learning_rate": 1.746624533180121e-05, "loss": 2.0634, "step": 1998 }, { "epoch": 0.5684461216727785, "grad_norm": 1.6997617483139038, "learning_rate": 1.7454754380925023e-05, "loss": 1.7017, "step": 1999 }, { "epoch": 0.5687304869162366, "grad_norm": 1.747353434562683, "learning_rate": 1.7443263430048837e-05, "loss": 1.8186, "step": 2000 }, { "epoch": 0.5690148521596948, "grad_norm": 1.8693517446517944, "learning_rate": 1.7431772479172652e-05, "loss": 2.8259, "step": 2001 }, { "epoch": 0.5692992174031529, "grad_norm": 1.6010723114013672, "learning_rate": 1.7420281528296466e-05, "loss": 2.4006, "step": 2002 }, { "epoch": 0.569583582646611, "grad_norm": 1.4531267881393433, "learning_rate": 1.740879057742028e-05, "loss": 2.1439, "step": 2003 }, { "epoch": 0.5698679478900691, "grad_norm": 1.6961573362350464, "learning_rate": 1.73972996265441e-05, "loss": 2.2823, "step": 2004 }, { "epoch": 0.5701523131335272, "grad_norm": 1.5460631847381592, "learning_rate": 1.7385808675667913e-05, "loss": 2.1329, "step": 2005 }, { "epoch": 0.5704366783769854, "grad_norm": 1.644081950187683, "learning_rate": 1.7374317724791728e-05, "loss": 2.0236, "step": 2006 }, { "epoch": 0.5707210436204435, "grad_norm": 1.6294313669204712, "learning_rate": 1.7362826773915542e-05, "loss": 2.004, "step": 2007 }, { "epoch": 0.5710054088639016, "grad_norm": 1.7416433095932007, "learning_rate": 1.7351335823039357e-05, "loss": 1.9497, "step": 2008 }, { "epoch": 0.5712897741073597, "grad_norm": 1.8446623086929321, "learning_rate": 1.7339844872163175e-05, "loss": 2.8675, "step": 2009 }, { "epoch": 0.5715741393508178, "grad_norm": 1.5892642736434937, "learning_rate": 1.732835392128699e-05, "loss": 2.3443, "step": 2010 }, { "epoch": 0.571858504594276, "grad_norm": 1.5367027521133423, "learning_rate": 1.7316862970410804e-05, "loss": 2.2238, "step": 2011 }, { "epoch": 0.5721428698377341, "grad_norm": 1.5910507440567017, "learning_rate": 1.7305372019534618e-05, "loss": 2.2285, "step": 2012 }, { "epoch": 0.5724272350811922, "grad_norm": 1.6503397226333618, "learning_rate": 1.7293881068658433e-05, "loss": 2.1693, "step": 2013 }, { "epoch": 0.5727116003246503, "grad_norm": 1.5087186098098755, "learning_rate": 1.7282390117782247e-05, "loss": 1.896, "step": 2014 }, { "epoch": 0.5729959655681084, "grad_norm": 1.6095649003982544, "learning_rate": 1.727089916690606e-05, "loss": 1.773, "step": 2015 }, { "epoch": 0.5732803308115666, "grad_norm": 1.5037708282470703, "learning_rate": 1.725940821602988e-05, "loss": 1.7249, "step": 2016 }, { "epoch": 0.5735646960550247, "grad_norm": 1.788989543914795, "learning_rate": 1.7247917265153694e-05, "loss": 2.8071, "step": 2017 }, { "epoch": 0.5738490612984828, "grad_norm": 1.5129423141479492, "learning_rate": 1.723642631427751e-05, "loss": 2.3458, "step": 2018 }, { "epoch": 0.5741334265419409, "grad_norm": 1.4829000234603882, "learning_rate": 1.7224935363401323e-05, "loss": 2.2329, "step": 2019 }, { "epoch": 0.574417791785399, "grad_norm": 1.5166312456130981, "learning_rate": 1.7213444412525138e-05, "loss": 2.1336, "step": 2020 }, { "epoch": 0.5747021570288572, "grad_norm": 1.6192970275878906, "learning_rate": 1.7201953461648952e-05, "loss": 2.2265, "step": 2021 }, { "epoch": 0.5749865222723153, "grad_norm": 1.7159581184387207, "learning_rate": 1.7190462510772767e-05, "loss": 1.9326, "step": 2022 }, { "epoch": 0.5752708875157734, "grad_norm": 1.5626370906829834, "learning_rate": 1.717897155989658e-05, "loss": 1.7684, "step": 2023 }, { "epoch": 0.5755552527592315, "grad_norm": 1.6683768033981323, "learning_rate": 1.7167480609020396e-05, "loss": 1.9421, "step": 2024 }, { "epoch": 0.5758396180026896, "grad_norm": 1.9297045469284058, "learning_rate": 1.7155989658144213e-05, "loss": 2.9594, "step": 2025 }, { "epoch": 0.5761239832461478, "grad_norm": 1.5986518859863281, "learning_rate": 1.7144498707268028e-05, "loss": 2.334, "step": 2026 }, { "epoch": 0.5764083484896059, "grad_norm": 1.5038700103759766, "learning_rate": 1.7133007756391842e-05, "loss": 2.326, "step": 2027 }, { "epoch": 0.576692713733064, "grad_norm": 1.4739896059036255, "learning_rate": 1.712151680551566e-05, "loss": 2.2516, "step": 2028 }, { "epoch": 0.5769770789765221, "grad_norm": 1.5917458534240723, "learning_rate": 1.7110025854639475e-05, "loss": 2.0803, "step": 2029 }, { "epoch": 0.5772614442199802, "grad_norm": 1.491292953491211, "learning_rate": 1.709853490376329e-05, "loss": 1.8907, "step": 2030 }, { "epoch": 0.5775458094634384, "grad_norm": 1.5379160642623901, "learning_rate": 1.7087043952887104e-05, "loss": 1.8301, "step": 2031 }, { "epoch": 0.5778301747068965, "grad_norm": 1.6341428756713867, "learning_rate": 1.7075553002010918e-05, "loss": 1.7232, "step": 2032 }, { "epoch": 0.5781145399503546, "grad_norm": 1.6281121969223022, "learning_rate": 1.7064062051134733e-05, "loss": 2.7037, "step": 2033 }, { "epoch": 0.5783989051938127, "grad_norm": 1.5146305561065674, "learning_rate": 1.7052571100258547e-05, "loss": 2.4446, "step": 2034 }, { "epoch": 0.5786832704372709, "grad_norm": 1.5931079387664795, "learning_rate": 1.7041080149382362e-05, "loss": 2.2561, "step": 2035 }, { "epoch": 0.578967635680729, "grad_norm": 1.4791065454483032, "learning_rate": 1.7029589198506176e-05, "loss": 2.3593, "step": 2036 }, { "epoch": 0.579252000924187, "grad_norm": 1.6104813814163208, "learning_rate": 1.7018098247629994e-05, "loss": 2.1088, "step": 2037 }, { "epoch": 0.5795363661676451, "grad_norm": 1.5709761381149292, "learning_rate": 1.700660729675381e-05, "loss": 1.7519, "step": 2038 }, { "epoch": 0.5798207314111032, "grad_norm": 1.6428778171539307, "learning_rate": 1.6995116345877623e-05, "loss": 1.9199, "step": 2039 }, { "epoch": 0.5801050966545614, "grad_norm": 1.7062665224075317, "learning_rate": 1.6983625395001438e-05, "loss": 1.9117, "step": 2040 }, { "epoch": 0.5803894618980195, "grad_norm": 1.9232088327407837, "learning_rate": 1.6972134444125252e-05, "loss": 2.822, "step": 2041 }, { "epoch": 0.5806738271414776, "grad_norm": 1.44762122631073, "learning_rate": 1.6960643493249067e-05, "loss": 2.3914, "step": 2042 }, { "epoch": 0.5809581923849357, "grad_norm": 1.5696653127670288, "learning_rate": 1.694915254237288e-05, "loss": 2.2531, "step": 2043 }, { "epoch": 0.5812425576283938, "grad_norm": 1.4691784381866455, "learning_rate": 1.6937661591496696e-05, "loss": 2.225, "step": 2044 }, { "epoch": 0.581526922871852, "grad_norm": 1.6073601245880127, "learning_rate": 1.692617064062051e-05, "loss": 1.9158, "step": 2045 }, { "epoch": 0.5818112881153101, "grad_norm": 1.5055420398712158, "learning_rate": 1.6914679689744328e-05, "loss": 2.0171, "step": 2046 }, { "epoch": 0.5820956533587682, "grad_norm": 1.6332578659057617, "learning_rate": 1.6903188738868143e-05, "loss": 1.7642, "step": 2047 }, { "epoch": 0.5823800186022263, "grad_norm": 1.577101230621338, "learning_rate": 1.6891697787991957e-05, "loss": 1.8056, "step": 2048 }, { "epoch": 0.5826643838456844, "grad_norm": 1.963454008102417, "learning_rate": 1.6880206837115775e-05, "loss": 2.8995, "step": 2049 }, { "epoch": 0.5829487490891426, "grad_norm": 1.5220768451690674, "learning_rate": 1.686871588623959e-05, "loss": 2.2895, "step": 2050 }, { "epoch": 0.5832331143326007, "grad_norm": 1.5367721319198608, "learning_rate": 1.6857224935363404e-05, "loss": 2.2004, "step": 2051 }, { "epoch": 0.5835174795760588, "grad_norm": 1.5274112224578857, "learning_rate": 1.684573398448722e-05, "loss": 2.2675, "step": 2052 }, { "epoch": 0.5838018448195169, "grad_norm": 1.7082741260528564, "learning_rate": 1.6834243033611033e-05, "loss": 2.1209, "step": 2053 }, { "epoch": 0.584086210062975, "grad_norm": 1.5683553218841553, "learning_rate": 1.6822752082734847e-05, "loss": 1.9012, "step": 2054 }, { "epoch": 0.5843705753064332, "grad_norm": 1.6328856945037842, "learning_rate": 1.6811261131858662e-05, "loss": 1.9311, "step": 2055 }, { "epoch": 0.5846549405498913, "grad_norm": 1.782463550567627, "learning_rate": 1.6799770180982476e-05, "loss": 1.8888, "step": 2056 }, { "epoch": 0.5849393057933494, "grad_norm": 1.9230624437332153, "learning_rate": 1.678827923010629e-05, "loss": 3.0453, "step": 2057 }, { "epoch": 0.5852236710368075, "grad_norm": 1.576603651046753, "learning_rate": 1.677678827923011e-05, "loss": 2.2623, "step": 2058 }, { "epoch": 0.5855080362802656, "grad_norm": 1.4937617778778076, "learning_rate": 1.6765297328353923e-05, "loss": 2.1262, "step": 2059 }, { "epoch": 0.5857924015237238, "grad_norm": 1.6465567350387573, "learning_rate": 1.6753806377477738e-05, "loss": 2.2345, "step": 2060 }, { "epoch": 0.5860767667671819, "grad_norm": 1.6568986177444458, "learning_rate": 1.6742315426601552e-05, "loss": 2.1957, "step": 2061 }, { "epoch": 0.58636113201064, "grad_norm": 1.6795825958251953, "learning_rate": 1.6730824475725367e-05, "loss": 1.8565, "step": 2062 }, { "epoch": 0.5866454972540981, "grad_norm": 1.547038197517395, "learning_rate": 1.671933352484918e-05, "loss": 1.8675, "step": 2063 }, { "epoch": 0.5869298624975562, "grad_norm": 1.557686686515808, "learning_rate": 1.6707842573972996e-05, "loss": 1.7547, "step": 2064 }, { "epoch": 0.5872142277410144, "grad_norm": 1.8700066804885864, "learning_rate": 1.6696351623096814e-05, "loss": 2.7392, "step": 2065 }, { "epoch": 0.5874985929844725, "grad_norm": 1.6624739170074463, "learning_rate": 1.6684860672220628e-05, "loss": 2.343, "step": 2066 }, { "epoch": 0.5877829582279306, "grad_norm": 1.5265758037567139, "learning_rate": 1.6673369721344443e-05, "loss": 2.1868, "step": 2067 }, { "epoch": 0.5880673234713887, "grad_norm": 1.5248119831085205, "learning_rate": 1.6661878770468257e-05, "loss": 2.1564, "step": 2068 }, { "epoch": 0.5883516887148468, "grad_norm": 1.6401458978652954, "learning_rate": 1.665038781959207e-05, "loss": 2.2195, "step": 2069 }, { "epoch": 0.588636053958305, "grad_norm": 1.5524535179138184, "learning_rate": 1.663889686871589e-05, "loss": 1.9903, "step": 2070 }, { "epoch": 0.5889204192017631, "grad_norm": 1.6761585474014282, "learning_rate": 1.6627405917839704e-05, "loss": 1.9134, "step": 2071 }, { "epoch": 0.5892047844452212, "grad_norm": 1.6284050941467285, "learning_rate": 1.661591496696352e-05, "loss": 1.8622, "step": 2072 }, { "epoch": 0.5894891496886793, "grad_norm": 1.7775750160217285, "learning_rate": 1.6604424016087333e-05, "loss": 2.7387, "step": 2073 }, { "epoch": 0.5897735149321374, "grad_norm": 1.5672471523284912, "learning_rate": 1.6592933065211147e-05, "loss": 2.13, "step": 2074 }, { "epoch": 0.5900578801755956, "grad_norm": 2.415464401245117, "learning_rate": 1.6581442114334962e-05, "loss": 2.58, "step": 2075 }, { "epoch": 0.5903422454190537, "grad_norm": 1.550248622894287, "learning_rate": 1.6569951163458776e-05, "loss": 2.2516, "step": 2076 }, { "epoch": 0.5906266106625118, "grad_norm": 1.564470887184143, "learning_rate": 1.655846021258259e-05, "loss": 1.9016, "step": 2077 }, { "epoch": 0.5909109759059699, "grad_norm": 1.5428235530853271, "learning_rate": 1.6546969261706405e-05, "loss": 1.9438, "step": 2078 }, { "epoch": 0.5911953411494281, "grad_norm": 1.5634739398956299, "learning_rate": 1.6535478310830223e-05, "loss": 2.0286, "step": 2079 }, { "epoch": 0.5914797063928862, "grad_norm": 1.6420611143112183, "learning_rate": 1.6523987359954038e-05, "loss": 1.7369, "step": 2080 }, { "epoch": 0.5917640716363443, "grad_norm": 1.8491171598434448, "learning_rate": 1.6512496409077852e-05, "loss": 2.7506, "step": 2081 }, { "epoch": 0.5920484368798024, "grad_norm": 1.6593526601791382, "learning_rate": 1.6501005458201667e-05, "loss": 2.4749, "step": 2082 }, { "epoch": 0.5923328021232604, "grad_norm": 1.6108300685882568, "learning_rate": 1.648951450732548e-05, "loss": 2.4418, "step": 2083 }, { "epoch": 0.5926171673667187, "grad_norm": 1.489844799041748, "learning_rate": 1.64780235564493e-05, "loss": 2.2861, "step": 2084 }, { "epoch": 0.5929015326101768, "grad_norm": 1.5152863264083862, "learning_rate": 1.6466532605573114e-05, "loss": 2.1108, "step": 2085 }, { "epoch": 0.5931858978536348, "grad_norm": 1.5476469993591309, "learning_rate": 1.6455041654696928e-05, "loss": 1.9956, "step": 2086 }, { "epoch": 0.5934702630970929, "grad_norm": 1.568286418914795, "learning_rate": 1.6443550703820743e-05, "loss": 1.7746, "step": 2087 }, { "epoch": 0.593754628340551, "grad_norm": 1.673620581626892, "learning_rate": 1.6432059752944557e-05, "loss": 1.8101, "step": 2088 }, { "epoch": 0.5940389935840092, "grad_norm": 1.8000257015228271, "learning_rate": 1.6420568802068372e-05, "loss": 2.8105, "step": 2089 }, { "epoch": 0.5943233588274673, "grad_norm": 1.4337655305862427, "learning_rate": 1.6409077851192186e-05, "loss": 2.507, "step": 2090 }, { "epoch": 0.5946077240709254, "grad_norm": 1.5374799966812134, "learning_rate": 1.6397586900316004e-05, "loss": 2.4551, "step": 2091 }, { "epoch": 0.5948920893143835, "grad_norm": 1.586155891418457, "learning_rate": 1.638609594943982e-05, "loss": 2.0942, "step": 2092 }, { "epoch": 0.5951764545578416, "grad_norm": 1.6072989702224731, "learning_rate": 1.6374604998563633e-05, "loss": 1.9915, "step": 2093 }, { "epoch": 0.5954608198012998, "grad_norm": 1.4546177387237549, "learning_rate": 1.6363114047687448e-05, "loss": 1.8326, "step": 2094 }, { "epoch": 0.5957451850447579, "grad_norm": 1.5891377925872803, "learning_rate": 1.6351623096811262e-05, "loss": 1.7843, "step": 2095 }, { "epoch": 0.596029550288216, "grad_norm": 1.6567867994308472, "learning_rate": 1.6340132145935077e-05, "loss": 1.622, "step": 2096 }, { "epoch": 0.5963139155316741, "grad_norm": 1.8327213525772095, "learning_rate": 1.632864119505889e-05, "loss": 2.7612, "step": 2097 }, { "epoch": 0.5965982807751322, "grad_norm": 1.5466746091842651, "learning_rate": 1.6317150244182706e-05, "loss": 2.2606, "step": 2098 }, { "epoch": 0.5968826460185904, "grad_norm": 1.4704080820083618, "learning_rate": 1.630565929330652e-05, "loss": 2.2202, "step": 2099 }, { "epoch": 0.5971670112620485, "grad_norm": 1.5352089405059814, "learning_rate": 1.6294168342430338e-05, "loss": 2.1893, "step": 2100 }, { "epoch": 0.5974513765055066, "grad_norm": 1.6276265382766724, "learning_rate": 1.6282677391554152e-05, "loss": 2.0548, "step": 2101 }, { "epoch": 0.5977357417489647, "grad_norm": 1.4630635976791382, "learning_rate": 1.6271186440677967e-05, "loss": 1.799, "step": 2102 }, { "epoch": 0.5980201069924228, "grad_norm": 1.538522720336914, "learning_rate": 1.6259695489801785e-05, "loss": 1.6713, "step": 2103 }, { "epoch": 0.598304472235881, "grad_norm": 1.6037315130233765, "learning_rate": 1.62482045389256e-05, "loss": 1.6671, "step": 2104 }, { "epoch": 0.5985888374793391, "grad_norm": 1.940868854522705, "learning_rate": 1.6236713588049414e-05, "loss": 2.8063, "step": 2105 }, { "epoch": 0.5988732027227972, "grad_norm": 1.5579652786254883, "learning_rate": 1.622522263717323e-05, "loss": 2.3295, "step": 2106 }, { "epoch": 0.5991575679662553, "grad_norm": 1.6156672239303589, "learning_rate": 1.6213731686297043e-05, "loss": 2.2363, "step": 2107 }, { "epoch": 0.5994419332097134, "grad_norm": 1.485302209854126, "learning_rate": 1.6202240735420857e-05, "loss": 2.0772, "step": 2108 }, { "epoch": 0.5997262984531716, "grad_norm": 1.7746816873550415, "learning_rate": 1.6190749784544672e-05, "loss": 2.1031, "step": 2109 }, { "epoch": 0.6000106636966297, "grad_norm": 1.6292321681976318, "learning_rate": 1.6179258833668486e-05, "loss": 1.9516, "step": 2110 }, { "epoch": 0.6002950289400878, "grad_norm": 1.6927971839904785, "learning_rate": 1.61677678827923e-05, "loss": 1.9776, "step": 2111 }, { "epoch": 0.6005793941835459, "grad_norm": 1.7361969947814941, "learning_rate": 1.615627693191612e-05, "loss": 1.6757, "step": 2112 }, { "epoch": 0.600863759427004, "grad_norm": 1.7664377689361572, "learning_rate": 1.6144785981039933e-05, "loss": 2.7698, "step": 2113 }, { "epoch": 0.6011481246704622, "grad_norm": 1.4889873266220093, "learning_rate": 1.6133295030163748e-05, "loss": 2.3977, "step": 2114 }, { "epoch": 0.6014324899139203, "grad_norm": 1.5173231363296509, "learning_rate": 1.6121804079287562e-05, "loss": 2.2699, "step": 2115 }, { "epoch": 0.6017168551573784, "grad_norm": 1.5664979219436646, "learning_rate": 1.6110313128411377e-05, "loss": 2.2895, "step": 2116 }, { "epoch": 0.6020012204008365, "grad_norm": 1.7025697231292725, "learning_rate": 1.609882217753519e-05, "loss": 1.9624, "step": 2117 }, { "epoch": 0.6022855856442946, "grad_norm": 1.7116023302078247, "learning_rate": 1.6087331226659006e-05, "loss": 2.1481, "step": 2118 }, { "epoch": 0.6025699508877528, "grad_norm": 1.53733491897583, "learning_rate": 1.607584027578282e-05, "loss": 1.6645, "step": 2119 }, { "epoch": 0.6028543161312109, "grad_norm": 1.602069616317749, "learning_rate": 1.6064349324906635e-05, "loss": 1.7552, "step": 2120 }, { "epoch": 0.603138681374669, "grad_norm": 1.7021903991699219, "learning_rate": 1.6052858374030453e-05, "loss": 2.7208, "step": 2121 }, { "epoch": 0.6034230466181271, "grad_norm": 1.5884177684783936, "learning_rate": 1.6041367423154267e-05, "loss": 2.4558, "step": 2122 }, { "epoch": 0.6037074118615852, "grad_norm": 1.600091814994812, "learning_rate": 1.602987647227808e-05, "loss": 2.3824, "step": 2123 }, { "epoch": 0.6039917771050434, "grad_norm": 1.636725902557373, "learning_rate": 1.60183855214019e-05, "loss": 2.2555, "step": 2124 }, { "epoch": 0.6042761423485015, "grad_norm": 1.5665066242218018, "learning_rate": 1.6006894570525714e-05, "loss": 2.2357, "step": 2125 }, { "epoch": 0.6045605075919596, "grad_norm": 1.6503568887710571, "learning_rate": 1.599540361964953e-05, "loss": 1.9319, "step": 2126 }, { "epoch": 0.6048448728354177, "grad_norm": 1.774845838546753, "learning_rate": 1.5983912668773343e-05, "loss": 1.8753, "step": 2127 }, { "epoch": 0.6051292380788759, "grad_norm": 1.626198649406433, "learning_rate": 1.5972421717897157e-05, "loss": 1.9712, "step": 2128 }, { "epoch": 0.605413603322334, "grad_norm": 1.8184738159179688, "learning_rate": 1.5960930767020972e-05, "loss": 2.7941, "step": 2129 }, { "epoch": 0.605697968565792, "grad_norm": 1.5862571001052856, "learning_rate": 1.5949439816144786e-05, "loss": 2.421, "step": 2130 }, { "epoch": 0.6059823338092502, "grad_norm": 1.5402880907058716, "learning_rate": 1.59379488652686e-05, "loss": 2.462, "step": 2131 }, { "epoch": 0.6062666990527082, "grad_norm": 1.606608510017395, "learning_rate": 1.5926457914392415e-05, "loss": 2.1879, "step": 2132 }, { "epoch": 0.6065510642961665, "grad_norm": 1.640724539756775, "learning_rate": 1.5914966963516233e-05, "loss": 1.9824, "step": 2133 }, { "epoch": 0.6068354295396245, "grad_norm": 1.5415105819702148, "learning_rate": 1.5903476012640048e-05, "loss": 1.8615, "step": 2134 }, { "epoch": 0.6071197947830826, "grad_norm": 1.5723168849945068, "learning_rate": 1.5891985061763862e-05, "loss": 1.8926, "step": 2135 }, { "epoch": 0.6074041600265407, "grad_norm": 1.752859115600586, "learning_rate": 1.5880494110887677e-05, "loss": 1.9364, "step": 2136 }, { "epoch": 0.6076885252699988, "grad_norm": 1.7561215162277222, "learning_rate": 1.586900316001149e-05, "loss": 2.7265, "step": 2137 }, { "epoch": 0.607972890513457, "grad_norm": 1.5687557458877563, "learning_rate": 1.5857512209135306e-05, "loss": 2.1813, "step": 2138 }, { "epoch": 0.6082572557569151, "grad_norm": 1.5815242528915405, "learning_rate": 1.5846021258259124e-05, "loss": 2.4614, "step": 2139 }, { "epoch": 0.6085416210003732, "grad_norm": 1.4205738306045532, "learning_rate": 1.5834530307382938e-05, "loss": 2.0676, "step": 2140 }, { "epoch": 0.6088259862438313, "grad_norm": 1.630874752998352, "learning_rate": 1.5823039356506753e-05, "loss": 2.286, "step": 2141 }, { "epoch": 0.6091103514872894, "grad_norm": 1.5306339263916016, "learning_rate": 1.5811548405630567e-05, "loss": 2.0203, "step": 2142 }, { "epoch": 0.6093947167307476, "grad_norm": 1.5277224779129028, "learning_rate": 1.5800057454754382e-05, "loss": 1.8091, "step": 2143 }, { "epoch": 0.6096790819742057, "grad_norm": 1.5925323963165283, "learning_rate": 1.5788566503878196e-05, "loss": 1.7929, "step": 2144 }, { "epoch": 0.6099634472176638, "grad_norm": 1.7087624073028564, "learning_rate": 1.5777075553002014e-05, "loss": 2.8013, "step": 2145 }, { "epoch": 0.6102478124611219, "grad_norm": 1.4798260927200317, "learning_rate": 1.576558460212583e-05, "loss": 2.6274, "step": 2146 }, { "epoch": 0.61053217770458, "grad_norm": 1.5136479139328003, "learning_rate": 1.5754093651249643e-05, "loss": 2.2147, "step": 2147 }, { "epoch": 0.6108165429480382, "grad_norm": 1.4526771306991577, "learning_rate": 1.5742602700373458e-05, "loss": 2.3316, "step": 2148 }, { "epoch": 0.6111009081914963, "grad_norm": 1.671984076499939, "learning_rate": 1.5731111749497272e-05, "loss": 2.0726, "step": 2149 }, { "epoch": 0.6113852734349544, "grad_norm": 1.5075078010559082, "learning_rate": 1.5719620798621087e-05, "loss": 2.0302, "step": 2150 }, { "epoch": 0.6116696386784125, "grad_norm": 1.594140648841858, "learning_rate": 1.57081298477449e-05, "loss": 1.9607, "step": 2151 }, { "epoch": 0.6119540039218706, "grad_norm": 1.7576320171356201, "learning_rate": 1.5696638896868716e-05, "loss": 1.9199, "step": 2152 }, { "epoch": 0.6122383691653288, "grad_norm": 1.8919973373413086, "learning_rate": 1.568514794599253e-05, "loss": 2.8498, "step": 2153 }, { "epoch": 0.6125227344087869, "grad_norm": 1.564929723739624, "learning_rate": 1.5673656995116348e-05, "loss": 2.2753, "step": 2154 }, { "epoch": 0.612807099652245, "grad_norm": 1.453860878944397, "learning_rate": 1.5662166044240162e-05, "loss": 2.2765, "step": 2155 }, { "epoch": 0.6130914648957031, "grad_norm": 1.5218002796173096, "learning_rate": 1.5650675093363977e-05, "loss": 2.1331, "step": 2156 }, { "epoch": 0.6133758301391612, "grad_norm": 1.625298261642456, "learning_rate": 1.563918414248779e-05, "loss": 2.1687, "step": 2157 }, { "epoch": 0.6136601953826194, "grad_norm": 1.6260716915130615, "learning_rate": 1.562769319161161e-05, "loss": 1.9181, "step": 2158 }, { "epoch": 0.6139445606260775, "grad_norm": 1.7083520889282227, "learning_rate": 1.5616202240735424e-05, "loss": 1.9319, "step": 2159 }, { "epoch": 0.6142289258695356, "grad_norm": 1.6045297384262085, "learning_rate": 1.560471128985924e-05, "loss": 1.6412, "step": 2160 }, { "epoch": 0.6145132911129937, "grad_norm": 1.6960153579711914, "learning_rate": 1.5593220338983053e-05, "loss": 2.6496, "step": 2161 }, { "epoch": 0.6147976563564518, "grad_norm": 1.5479867458343506, "learning_rate": 1.5581729388106867e-05, "loss": 2.3291, "step": 2162 }, { "epoch": 0.61508202159991, "grad_norm": 1.4145187139511108, "learning_rate": 1.5570238437230682e-05, "loss": 2.2698, "step": 2163 }, { "epoch": 0.6153663868433681, "grad_norm": 1.6084132194519043, "learning_rate": 1.5558747486354496e-05, "loss": 2.3009, "step": 2164 }, { "epoch": 0.6156507520868262, "grad_norm": 1.6727561950683594, "learning_rate": 1.554725653547831e-05, "loss": 2.2078, "step": 2165 }, { "epoch": 0.6159351173302843, "grad_norm": 1.6095800399780273, "learning_rate": 1.553576558460213e-05, "loss": 1.9408, "step": 2166 }, { "epoch": 0.6162194825737424, "grad_norm": 1.6844741106033325, "learning_rate": 1.5524274633725943e-05, "loss": 1.9492, "step": 2167 }, { "epoch": 0.6165038478172006, "grad_norm": 1.600579857826233, "learning_rate": 1.5512783682849758e-05, "loss": 1.6956, "step": 2168 }, { "epoch": 0.6167882130606587, "grad_norm": 1.7465009689331055, "learning_rate": 1.5501292731973572e-05, "loss": 2.7124, "step": 2169 }, { "epoch": 0.6170725783041168, "grad_norm": 1.466794729232788, "learning_rate": 1.5489801781097387e-05, "loss": 2.2909, "step": 2170 }, { "epoch": 0.6173569435475749, "grad_norm": 1.493714451789856, "learning_rate": 1.54783108302212e-05, "loss": 2.1008, "step": 2171 }, { "epoch": 0.617641308791033, "grad_norm": 1.5686396360397339, "learning_rate": 1.5466819879345016e-05, "loss": 2.1567, "step": 2172 }, { "epoch": 0.6179256740344912, "grad_norm": 1.6050747632980347, "learning_rate": 1.545532892846883e-05, "loss": 2.2713, "step": 2173 }, { "epoch": 0.6182100392779493, "grad_norm": 1.5931973457336426, "learning_rate": 1.5443837977592645e-05, "loss": 1.797, "step": 2174 }, { "epoch": 0.6184944045214074, "grad_norm": 1.662131667137146, "learning_rate": 1.5432347026716463e-05, "loss": 1.7863, "step": 2175 }, { "epoch": 0.6187787697648655, "grad_norm": 1.6877639293670654, "learning_rate": 1.5420856075840277e-05, "loss": 1.9392, "step": 2176 }, { "epoch": 0.6190631350083237, "grad_norm": 1.8865476846694946, "learning_rate": 1.540936512496409e-05, "loss": 2.7452, "step": 2177 }, { "epoch": 0.6193475002517818, "grad_norm": 1.5432252883911133, "learning_rate": 1.539787417408791e-05, "loss": 2.5342, "step": 2178 }, { "epoch": 0.6196318654952399, "grad_norm": 1.513899564743042, "learning_rate": 1.5386383223211724e-05, "loss": 2.1821, "step": 2179 }, { "epoch": 0.619916230738698, "grad_norm": 1.5674052238464355, "learning_rate": 1.537489227233554e-05, "loss": 2.2672, "step": 2180 }, { "epoch": 0.620200595982156, "grad_norm": 1.5859272480010986, "learning_rate": 1.5363401321459353e-05, "loss": 2.1825, "step": 2181 }, { "epoch": 0.6204849612256143, "grad_norm": 1.5028396844863892, "learning_rate": 1.5351910370583167e-05, "loss": 1.7984, "step": 2182 }, { "epoch": 0.6207693264690723, "grad_norm": 1.5256812572479248, "learning_rate": 1.5340419419706982e-05, "loss": 1.8812, "step": 2183 }, { "epoch": 0.6210536917125304, "grad_norm": 1.6204417943954468, "learning_rate": 1.5328928468830796e-05, "loss": 1.8866, "step": 2184 }, { "epoch": 0.6213380569559885, "grad_norm": 1.6233776807785034, "learning_rate": 1.531743751795461e-05, "loss": 2.7469, "step": 2185 }, { "epoch": 0.6216224221994466, "grad_norm": 1.5075547695159912, "learning_rate": 1.5305946567078425e-05, "loss": 2.4263, "step": 2186 }, { "epoch": 0.6219067874429048, "grad_norm": 1.5008249282836914, "learning_rate": 1.5294455616202243e-05, "loss": 2.1231, "step": 2187 }, { "epoch": 0.6221911526863629, "grad_norm": 1.5601019859313965, "learning_rate": 1.5282964665326058e-05, "loss": 2.3066, "step": 2188 }, { "epoch": 0.622475517929821, "grad_norm": 1.6898422241210938, "learning_rate": 1.5271473714449872e-05, "loss": 2.2774, "step": 2189 }, { "epoch": 0.6227598831732791, "grad_norm": 1.5225181579589844, "learning_rate": 1.5259982763573687e-05, "loss": 2.0043, "step": 2190 }, { "epoch": 0.6230442484167372, "grad_norm": 1.565322756767273, "learning_rate": 1.5248491812697501e-05, "loss": 1.7469, "step": 2191 }, { "epoch": 0.6233286136601954, "grad_norm": 1.6591078042984009, "learning_rate": 1.5237000861821316e-05, "loss": 1.4503, "step": 2192 }, { "epoch": 0.6236129789036535, "grad_norm": 1.8940781354904175, "learning_rate": 1.522550991094513e-05, "loss": 2.7695, "step": 2193 }, { "epoch": 0.6238973441471116, "grad_norm": 1.5857594013214111, "learning_rate": 1.5214018960068946e-05, "loss": 2.3266, "step": 2194 }, { "epoch": 0.6241817093905697, "grad_norm": 1.531599998474121, "learning_rate": 1.5202528009192763e-05, "loss": 2.3562, "step": 2195 }, { "epoch": 0.6244660746340278, "grad_norm": 1.4814765453338623, "learning_rate": 1.5191037058316577e-05, "loss": 2.1034, "step": 2196 }, { "epoch": 0.624750439877486, "grad_norm": 1.5699845552444458, "learning_rate": 1.5179546107440393e-05, "loss": 2.18, "step": 2197 }, { "epoch": 0.6250348051209441, "grad_norm": 1.572761058807373, "learning_rate": 1.5168055156564208e-05, "loss": 1.9689, "step": 2198 }, { "epoch": 0.6253191703644022, "grad_norm": 1.5882624387741089, "learning_rate": 1.5156564205688022e-05, "loss": 1.9178, "step": 2199 }, { "epoch": 0.6256035356078603, "grad_norm": 1.979588508605957, "learning_rate": 1.5145073254811837e-05, "loss": 1.8587, "step": 2200 }, { "epoch": 0.6258879008513184, "grad_norm": 1.7905445098876953, "learning_rate": 1.5133582303935653e-05, "loss": 2.6646, "step": 2201 }, { "epoch": 0.6261722660947766, "grad_norm": 1.5020902156829834, "learning_rate": 1.5122091353059468e-05, "loss": 2.4876, "step": 2202 }, { "epoch": 0.6264566313382347, "grad_norm": 1.551763653755188, "learning_rate": 1.5110600402183282e-05, "loss": 2.347, "step": 2203 }, { "epoch": 0.6267409965816928, "grad_norm": 1.5747159719467163, "learning_rate": 1.5099109451307097e-05, "loss": 2.3735, "step": 2204 }, { "epoch": 0.6270253618251509, "grad_norm": 1.567586064338684, "learning_rate": 1.5087618500430911e-05, "loss": 2.1383, "step": 2205 }, { "epoch": 0.627309727068609, "grad_norm": 1.5105654001235962, "learning_rate": 1.5076127549554727e-05, "loss": 1.7974, "step": 2206 }, { "epoch": 0.6275940923120672, "grad_norm": 1.600792646408081, "learning_rate": 1.5064636598678542e-05, "loss": 1.8991, "step": 2207 }, { "epoch": 0.6278784575555253, "grad_norm": 1.6282237768173218, "learning_rate": 1.5053145647802356e-05, "loss": 1.7965, "step": 2208 }, { "epoch": 0.6281628227989834, "grad_norm": 1.7487695217132568, "learning_rate": 1.504165469692617e-05, "loss": 2.6253, "step": 2209 }, { "epoch": 0.6284471880424415, "grad_norm": 1.627800703048706, "learning_rate": 1.5030163746049987e-05, "loss": 2.3377, "step": 2210 }, { "epoch": 0.6287315532858996, "grad_norm": 1.490409255027771, "learning_rate": 1.5018672795173801e-05, "loss": 2.1716, "step": 2211 }, { "epoch": 0.6290159185293578, "grad_norm": 1.5944234132766724, "learning_rate": 1.5007181844297616e-05, "loss": 2.2221, "step": 2212 }, { "epoch": 0.6293002837728159, "grad_norm": 1.5915591716766357, "learning_rate": 1.4995690893421432e-05, "loss": 2.2536, "step": 2213 }, { "epoch": 0.629584649016274, "grad_norm": 1.5468132495880127, "learning_rate": 1.4984199942545248e-05, "loss": 2.0741, "step": 2214 }, { "epoch": 0.6298690142597321, "grad_norm": 1.5128260850906372, "learning_rate": 1.4972708991669063e-05, "loss": 1.7709, "step": 2215 }, { "epoch": 0.6301533795031902, "grad_norm": 1.6346701383590698, "learning_rate": 1.4961218040792877e-05, "loss": 1.7228, "step": 2216 }, { "epoch": 0.6304377447466484, "grad_norm": 1.812436580657959, "learning_rate": 1.4949727089916692e-05, "loss": 2.7526, "step": 2217 }, { "epoch": 0.6307221099901065, "grad_norm": 1.562059998512268, "learning_rate": 1.4938236139040508e-05, "loss": 2.4138, "step": 2218 }, { "epoch": 0.6310064752335646, "grad_norm": 1.6043922901153564, "learning_rate": 1.4926745188164322e-05, "loss": 2.2918, "step": 2219 }, { "epoch": 0.6312908404770227, "grad_norm": 1.5691092014312744, "learning_rate": 1.4915254237288137e-05, "loss": 2.2282, "step": 2220 }, { "epoch": 0.6315752057204808, "grad_norm": 1.5854427814483643, "learning_rate": 1.4903763286411951e-05, "loss": 2.2005, "step": 2221 }, { "epoch": 0.631859570963939, "grad_norm": 1.552298903465271, "learning_rate": 1.4892272335535766e-05, "loss": 1.7499, "step": 2222 }, { "epoch": 0.6321439362073971, "grad_norm": 1.7358125448226929, "learning_rate": 1.4880781384659582e-05, "loss": 1.886, "step": 2223 }, { "epoch": 0.6324283014508552, "grad_norm": 1.6180943250656128, "learning_rate": 1.4869290433783397e-05, "loss": 1.7801, "step": 2224 }, { "epoch": 0.6327126666943133, "grad_norm": 1.9027990102767944, "learning_rate": 1.4857799482907211e-05, "loss": 2.6241, "step": 2225 }, { "epoch": 0.6329970319377715, "grad_norm": 1.5672250986099243, "learning_rate": 1.4846308532031026e-05, "loss": 2.4785, "step": 2226 }, { "epoch": 0.6332813971812296, "grad_norm": 1.5182520151138306, "learning_rate": 1.4834817581154842e-05, "loss": 2.0982, "step": 2227 }, { "epoch": 0.6335657624246877, "grad_norm": 1.4269053936004639, "learning_rate": 1.4823326630278656e-05, "loss": 2.3107, "step": 2228 }, { "epoch": 0.6338501276681457, "grad_norm": 1.6807674169540405, "learning_rate": 1.481183567940247e-05, "loss": 2.0994, "step": 2229 }, { "epoch": 0.6341344929116038, "grad_norm": 1.5306531190872192, "learning_rate": 1.4800344728526285e-05, "loss": 1.8668, "step": 2230 }, { "epoch": 0.634418858155062, "grad_norm": 1.683131456375122, "learning_rate": 1.47888537776501e-05, "loss": 1.7841, "step": 2231 }, { "epoch": 0.6347032233985201, "grad_norm": 1.6257297992706299, "learning_rate": 1.4777362826773918e-05, "loss": 1.7767, "step": 2232 }, { "epoch": 0.6349875886419782, "grad_norm": 1.8746527433395386, "learning_rate": 1.4765871875897732e-05, "loss": 2.8278, "step": 2233 }, { "epoch": 0.6352719538854363, "grad_norm": 1.5843089818954468, "learning_rate": 1.4754380925021547e-05, "loss": 2.3801, "step": 2234 }, { "epoch": 0.6355563191288944, "grad_norm": 1.465401291847229, "learning_rate": 1.4742889974145363e-05, "loss": 2.0188, "step": 2235 }, { "epoch": 0.6358406843723526, "grad_norm": 1.4717556238174438, "learning_rate": 1.4731399023269177e-05, "loss": 2.1638, "step": 2236 }, { "epoch": 0.6361250496158107, "grad_norm": 1.733504056930542, "learning_rate": 1.4719908072392992e-05, "loss": 2.2512, "step": 2237 }, { "epoch": 0.6364094148592688, "grad_norm": 1.602497935295105, "learning_rate": 1.4708417121516806e-05, "loss": 1.9198, "step": 2238 }, { "epoch": 0.6366937801027269, "grad_norm": 1.6287175416946411, "learning_rate": 1.4696926170640623e-05, "loss": 1.8728, "step": 2239 }, { "epoch": 0.636978145346185, "grad_norm": 1.640389084815979, "learning_rate": 1.4685435219764437e-05, "loss": 1.6453, "step": 2240 }, { "epoch": 0.6372625105896432, "grad_norm": 1.8420612812042236, "learning_rate": 1.4673944268888252e-05, "loss": 2.6964, "step": 2241 }, { "epoch": 0.6375468758331013, "grad_norm": 1.4961106777191162, "learning_rate": 1.4662453318012066e-05, "loss": 2.2855, "step": 2242 }, { "epoch": 0.6378312410765594, "grad_norm": 1.5498943328857422, "learning_rate": 1.465096236713588e-05, "loss": 2.3792, "step": 2243 }, { "epoch": 0.6381156063200175, "grad_norm": 1.5839142799377441, "learning_rate": 1.4639471416259697e-05, "loss": 2.2614, "step": 2244 }, { "epoch": 0.6383999715634756, "grad_norm": 1.598946452140808, "learning_rate": 1.4627980465383511e-05, "loss": 2.2075, "step": 2245 }, { "epoch": 0.6386843368069338, "grad_norm": 1.521291732788086, "learning_rate": 1.4616489514507326e-05, "loss": 2.1571, "step": 2246 }, { "epoch": 0.6389687020503919, "grad_norm": 1.5280981063842773, "learning_rate": 1.460499856363114e-05, "loss": 1.6476, "step": 2247 }, { "epoch": 0.63925306729385, "grad_norm": 1.6235344409942627, "learning_rate": 1.4593507612754956e-05, "loss": 1.8018, "step": 2248 }, { "epoch": 0.6395374325373081, "grad_norm": 1.906637191772461, "learning_rate": 1.4582016661878771e-05, "loss": 2.8116, "step": 2249 }, { "epoch": 0.6398217977807662, "grad_norm": 1.4746451377868652, "learning_rate": 1.4570525711002587e-05, "loss": 2.4155, "step": 2250 }, { "epoch": 0.6401061630242244, "grad_norm": 1.536787748336792, "learning_rate": 1.4559034760126403e-05, "loss": 2.2476, "step": 2251 }, { "epoch": 0.6403905282676825, "grad_norm": 1.4829517602920532, "learning_rate": 1.4547543809250218e-05, "loss": 2.1656, "step": 2252 }, { "epoch": 0.6406748935111406, "grad_norm": 1.6278111934661865, "learning_rate": 1.4536052858374032e-05, "loss": 2.025, "step": 2253 }, { "epoch": 0.6409592587545987, "grad_norm": 1.5534253120422363, "learning_rate": 1.4524561907497847e-05, "loss": 1.9709, "step": 2254 }, { "epoch": 0.6412436239980568, "grad_norm": 1.5600004196166992, "learning_rate": 1.4513070956621661e-05, "loss": 1.8011, "step": 2255 }, { "epoch": 0.641527989241515, "grad_norm": 1.7009464502334595, "learning_rate": 1.4501580005745477e-05, "loss": 1.7419, "step": 2256 }, { "epoch": 0.6418123544849731, "grad_norm": 1.6545665264129639, "learning_rate": 1.4490089054869292e-05, "loss": 2.6367, "step": 2257 }, { "epoch": 0.6420967197284312, "grad_norm": 1.542686939239502, "learning_rate": 1.4478598103993106e-05, "loss": 2.5222, "step": 2258 }, { "epoch": 0.6423810849718893, "grad_norm": 1.5785605907440186, "learning_rate": 1.4467107153116921e-05, "loss": 2.2131, "step": 2259 }, { "epoch": 0.6426654502153474, "grad_norm": 1.6065642833709717, "learning_rate": 1.4455616202240737e-05, "loss": 2.1774, "step": 2260 }, { "epoch": 0.6429498154588056, "grad_norm": 1.5818967819213867, "learning_rate": 1.4444125251364552e-05, "loss": 2.0904, "step": 2261 }, { "epoch": 0.6432341807022637, "grad_norm": 1.6355520486831665, "learning_rate": 1.4432634300488366e-05, "loss": 1.8927, "step": 2262 }, { "epoch": 0.6435185459457218, "grad_norm": 1.528612732887268, "learning_rate": 1.442114334961218e-05, "loss": 1.8309, "step": 2263 }, { "epoch": 0.6438029111891799, "grad_norm": 1.5770155191421509, "learning_rate": 1.4409652398735995e-05, "loss": 1.6619, "step": 2264 }, { "epoch": 0.644087276432638, "grad_norm": 1.7672600746154785, "learning_rate": 1.4398161447859811e-05, "loss": 2.5805, "step": 2265 }, { "epoch": 0.6443716416760962, "grad_norm": 1.5915486812591553, "learning_rate": 1.4386670496983626e-05, "loss": 2.2712, "step": 2266 }, { "epoch": 0.6446560069195543, "grad_norm": 1.3617949485778809, "learning_rate": 1.437517954610744e-05, "loss": 2.1554, "step": 2267 }, { "epoch": 0.6449403721630124, "grad_norm": 1.487271785736084, "learning_rate": 1.4363688595231255e-05, "loss": 2.3376, "step": 2268 }, { "epoch": 0.6452247374064705, "grad_norm": 1.9369791746139526, "learning_rate": 1.4352197644355073e-05, "loss": 2.1762, "step": 2269 }, { "epoch": 0.6455091026499287, "grad_norm": 1.513728380203247, "learning_rate": 1.4340706693478887e-05, "loss": 1.8318, "step": 2270 }, { "epoch": 0.6457934678933868, "grad_norm": 1.602360725402832, "learning_rate": 1.4329215742602702e-05, "loss": 1.8081, "step": 2271 }, { "epoch": 0.6460778331368449, "grad_norm": 1.706312656402588, "learning_rate": 1.4317724791726518e-05, "loss": 1.8961, "step": 2272 }, { "epoch": 0.646362198380303, "grad_norm": 1.8679949045181274, "learning_rate": 1.4306233840850332e-05, "loss": 2.7414, "step": 2273 }, { "epoch": 0.646646563623761, "grad_norm": 1.5807439088821411, "learning_rate": 1.4294742889974147e-05, "loss": 2.4749, "step": 2274 }, { "epoch": 0.6469309288672193, "grad_norm": 1.6467163562774658, "learning_rate": 1.4283251939097961e-05, "loss": 2.4233, "step": 2275 }, { "epoch": 0.6472152941106774, "grad_norm": 1.611040711402893, "learning_rate": 1.4271760988221776e-05, "loss": 2.148, "step": 2276 }, { "epoch": 0.6474996593541354, "grad_norm": 1.6380242109298706, "learning_rate": 1.4260270037345592e-05, "loss": 2.1739, "step": 2277 }, { "epoch": 0.6477840245975935, "grad_norm": 1.6218241453170776, "learning_rate": 1.4248779086469407e-05, "loss": 2.0744, "step": 2278 }, { "epoch": 0.6480683898410516, "grad_norm": 1.4787720441818237, "learning_rate": 1.4237288135593221e-05, "loss": 1.6869, "step": 2279 }, { "epoch": 0.6483527550845098, "grad_norm": 1.682263731956482, "learning_rate": 1.4225797184717036e-05, "loss": 1.7572, "step": 2280 }, { "epoch": 0.6486371203279679, "grad_norm": 1.9067274332046509, "learning_rate": 1.4214306233840852e-05, "loss": 2.7824, "step": 2281 }, { "epoch": 0.648921485571426, "grad_norm": 1.4960787296295166, "learning_rate": 1.4202815282964666e-05, "loss": 2.3718, "step": 2282 }, { "epoch": 0.6492058508148841, "grad_norm": 1.5059878826141357, "learning_rate": 1.419132433208848e-05, "loss": 2.1535, "step": 2283 }, { "epoch": 0.6494902160583422, "grad_norm": 1.685180902481079, "learning_rate": 1.4179833381212295e-05, "loss": 2.2137, "step": 2284 }, { "epoch": 0.6497745813018004, "grad_norm": 1.5441550016403198, "learning_rate": 1.416834243033611e-05, "loss": 2.3031, "step": 2285 }, { "epoch": 0.6500589465452585, "grad_norm": 1.5095455646514893, "learning_rate": 1.4156851479459926e-05, "loss": 1.8482, "step": 2286 }, { "epoch": 0.6503433117887166, "grad_norm": 1.5256438255310059, "learning_rate": 1.4145360528583742e-05, "loss": 1.7711, "step": 2287 }, { "epoch": 0.6506276770321747, "grad_norm": 1.6489698886871338, "learning_rate": 1.4133869577707557e-05, "loss": 1.64, "step": 2288 }, { "epoch": 0.6509120422756328, "grad_norm": 1.7157350778579712, "learning_rate": 1.4122378626831373e-05, "loss": 2.8341, "step": 2289 }, { "epoch": 0.651196407519091, "grad_norm": 1.5686113834381104, "learning_rate": 1.4110887675955187e-05, "loss": 2.6388, "step": 2290 }, { "epoch": 0.6514807727625491, "grad_norm": 1.576197624206543, "learning_rate": 1.4099396725079002e-05, "loss": 2.2588, "step": 2291 }, { "epoch": 0.6517651380060072, "grad_norm": 1.5347895622253418, "learning_rate": 1.4087905774202816e-05, "loss": 2.1164, "step": 2292 }, { "epoch": 0.6520495032494653, "grad_norm": 1.6401748657226562, "learning_rate": 1.4076414823326633e-05, "loss": 2.1258, "step": 2293 }, { "epoch": 0.6523338684929234, "grad_norm": 1.5072190761566162, "learning_rate": 1.4064923872450447e-05, "loss": 1.9321, "step": 2294 }, { "epoch": 0.6526182337363816, "grad_norm": 1.5900171995162964, "learning_rate": 1.4053432921574262e-05, "loss": 1.8916, "step": 2295 }, { "epoch": 0.6529025989798397, "grad_norm": 1.6210306882858276, "learning_rate": 1.4041941970698076e-05, "loss": 1.6101, "step": 2296 }, { "epoch": 0.6531869642232978, "grad_norm": 1.8477280139923096, "learning_rate": 1.403045101982189e-05, "loss": 2.6045, "step": 2297 }, { "epoch": 0.6534713294667559, "grad_norm": 1.5514870882034302, "learning_rate": 1.4018960068945707e-05, "loss": 2.4303, "step": 2298 }, { "epoch": 0.653755694710214, "grad_norm": 1.5255701541900635, "learning_rate": 1.4007469118069521e-05, "loss": 2.3312, "step": 2299 }, { "epoch": 0.6540400599536722, "grad_norm": 1.5493617057800293, "learning_rate": 1.3995978167193336e-05, "loss": 2.3112, "step": 2300 }, { "epoch": 0.6543244251971303, "grad_norm": 1.477663278579712, "learning_rate": 1.398448721631715e-05, "loss": 2.0823, "step": 2301 }, { "epoch": 0.6546087904405884, "grad_norm": 1.534895420074463, "learning_rate": 1.3972996265440966e-05, "loss": 1.9711, "step": 2302 }, { "epoch": 0.6548931556840465, "grad_norm": 1.559821605682373, "learning_rate": 1.3961505314564781e-05, "loss": 1.9114, "step": 2303 }, { "epoch": 0.6551775209275046, "grad_norm": 1.5802251100540161, "learning_rate": 1.3950014363688595e-05, "loss": 1.7692, "step": 2304 }, { "epoch": 0.6554618861709628, "grad_norm": 1.8919142484664917, "learning_rate": 1.393852341281241e-05, "loss": 2.8514, "step": 2305 }, { "epoch": 0.6557462514144209, "grad_norm": 1.5178377628326416, "learning_rate": 1.3927032461936228e-05, "loss": 2.2836, "step": 2306 }, { "epoch": 0.656030616657879, "grad_norm": 1.4845335483551025, "learning_rate": 1.3915541511060042e-05, "loss": 2.4409, "step": 2307 }, { "epoch": 0.6563149819013371, "grad_norm": 1.55623459815979, "learning_rate": 1.3904050560183857e-05, "loss": 2.245, "step": 2308 }, { "epoch": 0.6565993471447952, "grad_norm": 1.5961564779281616, "learning_rate": 1.3892559609307671e-05, "loss": 2.1436, "step": 2309 }, { "epoch": 0.6568837123882534, "grad_norm": 1.593917727470398, "learning_rate": 1.3881068658431487e-05, "loss": 2.1096, "step": 2310 }, { "epoch": 0.6571680776317115, "grad_norm": 1.4851388931274414, "learning_rate": 1.3869577707555302e-05, "loss": 1.7558, "step": 2311 }, { "epoch": 0.6574524428751696, "grad_norm": 1.7430486679077148, "learning_rate": 1.3858086756679116e-05, "loss": 1.7607, "step": 2312 }, { "epoch": 0.6577368081186277, "grad_norm": 1.8176724910736084, "learning_rate": 1.3846595805802931e-05, "loss": 2.7509, "step": 2313 }, { "epoch": 0.6580211733620858, "grad_norm": 1.513588309288025, "learning_rate": 1.3835104854926747e-05, "loss": 2.405, "step": 2314 }, { "epoch": 0.658305538605544, "grad_norm": 1.621985912322998, "learning_rate": 1.3823613904050562e-05, "loss": 2.3489, "step": 2315 }, { "epoch": 0.6585899038490021, "grad_norm": 1.4367046356201172, "learning_rate": 1.3812122953174376e-05, "loss": 2.2808, "step": 2316 }, { "epoch": 0.6588742690924602, "grad_norm": 1.577368974685669, "learning_rate": 1.380063200229819e-05, "loss": 2.1214, "step": 2317 }, { "epoch": 0.6591586343359183, "grad_norm": 1.4581912755966187, "learning_rate": 1.3789141051422005e-05, "loss": 2.0159, "step": 2318 }, { "epoch": 0.6594429995793765, "grad_norm": 1.500917911529541, "learning_rate": 1.3777650100545821e-05, "loss": 1.7401, "step": 2319 }, { "epoch": 0.6597273648228346, "grad_norm": 1.6946115493774414, "learning_rate": 1.3766159149669636e-05, "loss": 1.9459, "step": 2320 }, { "epoch": 0.6600117300662927, "grad_norm": 1.7423439025878906, "learning_rate": 1.375466819879345e-05, "loss": 2.8295, "step": 2321 }, { "epoch": 0.6602960953097508, "grad_norm": 1.5395139455795288, "learning_rate": 1.3743177247917265e-05, "loss": 2.4043, "step": 2322 }, { "epoch": 0.6605804605532088, "grad_norm": 1.6088899374008179, "learning_rate": 1.3731686297041081e-05, "loss": 2.396, "step": 2323 }, { "epoch": 0.660864825796667, "grad_norm": 1.542049765586853, "learning_rate": 1.3720195346164897e-05, "loss": 2.2012, "step": 2324 }, { "epoch": 0.6611491910401252, "grad_norm": 1.5389901399612427, "learning_rate": 1.3708704395288712e-05, "loss": 2.02, "step": 2325 }, { "epoch": 0.6614335562835832, "grad_norm": 1.6238247156143188, "learning_rate": 1.3697213444412528e-05, "loss": 1.9875, "step": 2326 }, { "epoch": 0.6617179215270413, "grad_norm": 1.5157362222671509, "learning_rate": 1.3685722493536342e-05, "loss": 1.7811, "step": 2327 }, { "epoch": 0.6620022867704994, "grad_norm": 1.6152019500732422, "learning_rate": 1.3674231542660157e-05, "loss": 1.7777, "step": 2328 }, { "epoch": 0.6622866520139576, "grad_norm": 1.740449070930481, "learning_rate": 1.3662740591783971e-05, "loss": 2.7021, "step": 2329 }, { "epoch": 0.6625710172574157, "grad_norm": 1.5307239294052124, "learning_rate": 1.3651249640907786e-05, "loss": 2.3465, "step": 2330 }, { "epoch": 0.6628553825008738, "grad_norm": 1.5852240324020386, "learning_rate": 1.3639758690031602e-05, "loss": 2.477, "step": 2331 }, { "epoch": 0.6631397477443319, "grad_norm": 1.6011126041412354, "learning_rate": 1.3628267739155417e-05, "loss": 2.2583, "step": 2332 }, { "epoch": 0.66342411298779, "grad_norm": 1.549425482749939, "learning_rate": 1.3616776788279231e-05, "loss": 2.2773, "step": 2333 }, { "epoch": 0.6637084782312482, "grad_norm": 1.449255347251892, "learning_rate": 1.3605285837403046e-05, "loss": 1.7077, "step": 2334 }, { "epoch": 0.6639928434747063, "grad_norm": 1.624582290649414, "learning_rate": 1.3593794886526862e-05, "loss": 1.7985, "step": 2335 }, { "epoch": 0.6642772087181644, "grad_norm": 1.6954373121261597, "learning_rate": 1.3582303935650676e-05, "loss": 1.7301, "step": 2336 }, { "epoch": 0.6645615739616225, "grad_norm": 1.8019691705703735, "learning_rate": 1.357081298477449e-05, "loss": 2.6443, "step": 2337 }, { "epoch": 0.6648459392050806, "grad_norm": 1.6157490015029907, "learning_rate": 1.3559322033898305e-05, "loss": 2.248, "step": 2338 }, { "epoch": 0.6651303044485388, "grad_norm": 1.6532044410705566, "learning_rate": 1.354783108302212e-05, "loss": 2.4353, "step": 2339 }, { "epoch": 0.6654146696919969, "grad_norm": 1.4396045207977295, "learning_rate": 1.3536340132145936e-05, "loss": 2.1747, "step": 2340 }, { "epoch": 0.665699034935455, "grad_norm": 1.4797991514205933, "learning_rate": 1.352484918126975e-05, "loss": 2.0431, "step": 2341 }, { "epoch": 0.6659834001789131, "grad_norm": 1.3988265991210938, "learning_rate": 1.3513358230393565e-05, "loss": 1.858, "step": 2342 }, { "epoch": 0.6662677654223712, "grad_norm": 1.6098911762237549, "learning_rate": 1.3501867279517383e-05, "loss": 1.735, "step": 2343 }, { "epoch": 0.6665521306658294, "grad_norm": 1.6491178274154663, "learning_rate": 1.3490376328641197e-05, "loss": 1.6631, "step": 2344 }, { "epoch": 0.6668364959092875, "grad_norm": 1.7857693433761597, "learning_rate": 1.3478885377765012e-05, "loss": 2.6628, "step": 2345 }, { "epoch": 0.6671208611527456, "grad_norm": 1.5431671142578125, "learning_rate": 1.3467394426888826e-05, "loss": 2.4461, "step": 2346 }, { "epoch": 0.6674052263962037, "grad_norm": 1.5646437406539917, "learning_rate": 1.3455903476012642e-05, "loss": 2.3102, "step": 2347 }, { "epoch": 0.6676895916396618, "grad_norm": 1.4588571786880493, "learning_rate": 1.3444412525136457e-05, "loss": 2.3354, "step": 2348 }, { "epoch": 0.66797395688312, "grad_norm": 1.5933592319488525, "learning_rate": 1.3432921574260271e-05, "loss": 2.2924, "step": 2349 }, { "epoch": 0.6682583221265781, "grad_norm": 1.7044545412063599, "learning_rate": 1.3421430623384086e-05, "loss": 1.9148, "step": 2350 }, { "epoch": 0.6685426873700362, "grad_norm": 1.6366909742355347, "learning_rate": 1.34099396725079e-05, "loss": 1.6891, "step": 2351 }, { "epoch": 0.6688270526134943, "grad_norm": 1.6043225526809692, "learning_rate": 1.3398448721631717e-05, "loss": 1.895, "step": 2352 }, { "epoch": 0.6691114178569524, "grad_norm": 1.8012562990188599, "learning_rate": 1.3386957770755531e-05, "loss": 2.7992, "step": 2353 }, { "epoch": 0.6693957831004106, "grad_norm": 1.4551502466201782, "learning_rate": 1.3375466819879346e-05, "loss": 2.3551, "step": 2354 }, { "epoch": 0.6696801483438687, "grad_norm": 1.5812779664993286, "learning_rate": 1.336397586900316e-05, "loss": 2.2085, "step": 2355 }, { "epoch": 0.6699645135873268, "grad_norm": 1.529133677482605, "learning_rate": 1.3352484918126976e-05, "loss": 2.4194, "step": 2356 }, { "epoch": 0.6702488788307849, "grad_norm": 1.595837116241455, "learning_rate": 1.3340993967250791e-05, "loss": 2.1076, "step": 2357 }, { "epoch": 0.670533244074243, "grad_norm": 1.6244161128997803, "learning_rate": 1.3329503016374605e-05, "loss": 1.8285, "step": 2358 }, { "epoch": 0.6708176093177012, "grad_norm": 1.5791698694229126, "learning_rate": 1.331801206549842e-05, "loss": 1.8041, "step": 2359 }, { "epoch": 0.6711019745611593, "grad_norm": 1.7883336544036865, "learning_rate": 1.3306521114622234e-05, "loss": 1.7612, "step": 2360 }, { "epoch": 0.6713863398046174, "grad_norm": 1.7180142402648926, "learning_rate": 1.3295030163746052e-05, "loss": 2.9109, "step": 2361 }, { "epoch": 0.6716707050480755, "grad_norm": 1.4666606187820435, "learning_rate": 1.3283539212869867e-05, "loss": 2.3537, "step": 2362 }, { "epoch": 0.6719550702915336, "grad_norm": 1.450372338294983, "learning_rate": 1.3272048261993681e-05, "loss": 2.4139, "step": 2363 }, { "epoch": 0.6722394355349918, "grad_norm": 1.5204261541366577, "learning_rate": 1.3260557311117497e-05, "loss": 2.2118, "step": 2364 }, { "epoch": 0.6725238007784499, "grad_norm": 1.5298410654067993, "learning_rate": 1.3249066360241312e-05, "loss": 2.2147, "step": 2365 }, { "epoch": 0.672808166021908, "grad_norm": 1.5501554012298584, "learning_rate": 1.3237575409365126e-05, "loss": 1.9778, "step": 2366 }, { "epoch": 0.6730925312653661, "grad_norm": 1.5184673070907593, "learning_rate": 1.3226084458488941e-05, "loss": 1.7726, "step": 2367 }, { "epoch": 0.6733768965088243, "grad_norm": 1.6755380630493164, "learning_rate": 1.3214593507612757e-05, "loss": 1.7409, "step": 2368 }, { "epoch": 0.6736612617522824, "grad_norm": 1.799485683441162, "learning_rate": 1.3203102556736572e-05, "loss": 2.698, "step": 2369 }, { "epoch": 0.6739456269957405, "grad_norm": 1.6598178148269653, "learning_rate": 1.3191611605860386e-05, "loss": 2.3039, "step": 2370 }, { "epoch": 0.6742299922391986, "grad_norm": 1.4246594905853271, "learning_rate": 1.31801206549842e-05, "loss": 2.1442, "step": 2371 }, { "epoch": 0.6745143574826566, "grad_norm": 1.5940912961959839, "learning_rate": 1.3168629704108015e-05, "loss": 2.3078, "step": 2372 }, { "epoch": 0.6747987227261149, "grad_norm": 1.6379321813583374, "learning_rate": 1.3157138753231831e-05, "loss": 2.1717, "step": 2373 }, { "epoch": 0.675083087969573, "grad_norm": 1.4846510887145996, "learning_rate": 1.3145647802355646e-05, "loss": 1.9535, "step": 2374 }, { "epoch": 0.675367453213031, "grad_norm": 1.5312621593475342, "learning_rate": 1.313415685147946e-05, "loss": 1.7979, "step": 2375 }, { "epoch": 0.6756518184564891, "grad_norm": 1.6713464260101318, "learning_rate": 1.3122665900603275e-05, "loss": 1.9716, "step": 2376 }, { "epoch": 0.6759361836999472, "grad_norm": 1.6956110000610352, "learning_rate": 1.3111174949727091e-05, "loss": 2.5694, "step": 2377 }, { "epoch": 0.6762205489434054, "grad_norm": 1.5432319641113281, "learning_rate": 1.3099683998850905e-05, "loss": 2.3722, "step": 2378 }, { "epoch": 0.6765049141868635, "grad_norm": 1.5072269439697266, "learning_rate": 1.308819304797472e-05, "loss": 2.347, "step": 2379 }, { "epoch": 0.6767892794303216, "grad_norm": 1.498325228691101, "learning_rate": 1.3076702097098536e-05, "loss": 2.2973, "step": 2380 }, { "epoch": 0.6770736446737797, "grad_norm": 1.5500154495239258, "learning_rate": 1.3065211146222352e-05, "loss": 2.1524, "step": 2381 }, { "epoch": 0.6773580099172378, "grad_norm": 1.6011747121810913, "learning_rate": 1.3053720195346167e-05, "loss": 1.7051, "step": 2382 }, { "epoch": 0.677642375160696, "grad_norm": 1.5794252157211304, "learning_rate": 1.3042229244469981e-05, "loss": 1.6992, "step": 2383 }, { "epoch": 0.6779267404041541, "grad_norm": 1.5260065793991089, "learning_rate": 1.3030738293593796e-05, "loss": 1.7597, "step": 2384 }, { "epoch": 0.6782111056476122, "grad_norm": 1.744638204574585, "learning_rate": 1.3019247342717612e-05, "loss": 2.9247, "step": 2385 }, { "epoch": 0.6784954708910703, "grad_norm": 1.6044799089431763, "learning_rate": 1.3007756391841427e-05, "loss": 2.3649, "step": 2386 }, { "epoch": 0.6787798361345284, "grad_norm": 1.4609475135803223, "learning_rate": 1.2996265440965241e-05, "loss": 2.2682, "step": 2387 }, { "epoch": 0.6790642013779866, "grad_norm": 1.4124441146850586, "learning_rate": 1.2984774490089056e-05, "loss": 2.2928, "step": 2388 }, { "epoch": 0.6793485666214447, "grad_norm": 1.5999109745025635, "learning_rate": 1.297328353921287e-05, "loss": 2.0884, "step": 2389 }, { "epoch": 0.6796329318649028, "grad_norm": 1.541373372077942, "learning_rate": 1.2961792588336686e-05, "loss": 1.8306, "step": 2390 }, { "epoch": 0.6799172971083609, "grad_norm": 1.6004866361618042, "learning_rate": 1.29503016374605e-05, "loss": 1.7566, "step": 2391 }, { "epoch": 0.680201662351819, "grad_norm": 1.6891027688980103, "learning_rate": 1.2938810686584315e-05, "loss": 1.5669, "step": 2392 }, { "epoch": 0.6804860275952772, "grad_norm": 1.9808518886566162, "learning_rate": 1.292731973570813e-05, "loss": 2.8488, "step": 2393 }, { "epoch": 0.6807703928387353, "grad_norm": 1.5064984560012817, "learning_rate": 1.2915828784831946e-05, "loss": 2.449, "step": 2394 }, { "epoch": 0.6810547580821934, "grad_norm": 1.4997645616531372, "learning_rate": 1.290433783395576e-05, "loss": 2.2506, "step": 2395 }, { "epoch": 0.6813391233256515, "grad_norm": 1.476811170578003, "learning_rate": 1.2892846883079575e-05, "loss": 2.0701, "step": 2396 }, { "epoch": 0.6816234885691096, "grad_norm": 1.593111515045166, "learning_rate": 1.288135593220339e-05, "loss": 2.0792, "step": 2397 }, { "epoch": 0.6819078538125678, "grad_norm": 1.5982271432876587, "learning_rate": 1.2869864981327207e-05, "loss": 1.815, "step": 2398 }, { "epoch": 0.6821922190560259, "grad_norm": 1.615859866142273, "learning_rate": 1.2858374030451022e-05, "loss": 1.8881, "step": 2399 }, { "epoch": 0.682476584299484, "grad_norm": 1.6525177955627441, "learning_rate": 1.2846883079574836e-05, "loss": 1.9245, "step": 2400 }, { "epoch": 0.6827609495429421, "grad_norm": 1.8938461542129517, "learning_rate": 1.283539212869865e-05, "loss": 2.9311, "step": 2401 }, { "epoch": 0.6830453147864002, "grad_norm": 1.5215036869049072, "learning_rate": 1.2823901177822467e-05, "loss": 2.1695, "step": 2402 }, { "epoch": 0.6833296800298584, "grad_norm": 1.4118622541427612, "learning_rate": 1.2812410226946281e-05, "loss": 2.3208, "step": 2403 }, { "epoch": 0.6836140452733165, "grad_norm": 1.486366868019104, "learning_rate": 1.2800919276070096e-05, "loss": 2.3337, "step": 2404 }, { "epoch": 0.6838984105167746, "grad_norm": 1.5969897508621216, "learning_rate": 1.278942832519391e-05, "loss": 2.1099, "step": 2405 }, { "epoch": 0.6841827757602327, "grad_norm": 1.4879251718521118, "learning_rate": 1.2777937374317727e-05, "loss": 1.9579, "step": 2406 }, { "epoch": 0.6844671410036908, "grad_norm": 1.6099344491958618, "learning_rate": 1.2766446423441541e-05, "loss": 2.0261, "step": 2407 }, { "epoch": 0.684751506247149, "grad_norm": 1.8013091087341309, "learning_rate": 1.2754955472565356e-05, "loss": 1.7967, "step": 2408 }, { "epoch": 0.6850358714906071, "grad_norm": 1.6463696956634521, "learning_rate": 1.274346452168917e-05, "loss": 2.7894, "step": 2409 }, { "epoch": 0.6853202367340652, "grad_norm": 1.5260536670684814, "learning_rate": 1.2731973570812985e-05, "loss": 2.5281, "step": 2410 }, { "epoch": 0.6856046019775233, "grad_norm": 1.5363825559616089, "learning_rate": 1.27204826199368e-05, "loss": 1.9302, "step": 2411 }, { "epoch": 0.6858889672209814, "grad_norm": 1.5252505540847778, "learning_rate": 1.2708991669060615e-05, "loss": 2.3634, "step": 2412 }, { "epoch": 0.6861733324644396, "grad_norm": 1.609386682510376, "learning_rate": 1.269750071818443e-05, "loss": 1.9163, "step": 2413 }, { "epoch": 0.6864576977078977, "grad_norm": 1.467077612876892, "learning_rate": 1.2686009767308244e-05, "loss": 1.8251, "step": 2414 }, { "epoch": 0.6867420629513558, "grad_norm": 1.5783828496932983, "learning_rate": 1.267451881643206e-05, "loss": 1.7815, "step": 2415 }, { "epoch": 0.6870264281948139, "grad_norm": 1.699668049812317, "learning_rate": 1.2663027865555875e-05, "loss": 1.9008, "step": 2416 }, { "epoch": 0.6873107934382721, "grad_norm": 1.7475955486297607, "learning_rate": 1.2651536914679691e-05, "loss": 2.4304, "step": 2417 }, { "epoch": 0.6875951586817302, "grad_norm": 1.4119583368301392, "learning_rate": 1.2640045963803507e-05, "loss": 2.3749, "step": 2418 }, { "epoch": 0.6878795239251883, "grad_norm": 1.54697847366333, "learning_rate": 1.2628555012927322e-05, "loss": 2.1809, "step": 2419 }, { "epoch": 0.6881638891686463, "grad_norm": 1.5237066745758057, "learning_rate": 1.2617064062051136e-05, "loss": 2.2084, "step": 2420 }, { "epoch": 0.6884482544121044, "grad_norm": 1.586302399635315, "learning_rate": 1.2605573111174951e-05, "loss": 1.9825, "step": 2421 }, { "epoch": 0.6887326196555627, "grad_norm": 1.619920253753662, "learning_rate": 1.2594082160298765e-05, "loss": 1.9729, "step": 2422 }, { "epoch": 0.6890169848990207, "grad_norm": 1.5657825469970703, "learning_rate": 1.2582591209422582e-05, "loss": 1.8286, "step": 2423 }, { "epoch": 0.6893013501424788, "grad_norm": 1.5684531927108765, "learning_rate": 1.2571100258546396e-05, "loss": 1.7035, "step": 2424 }, { "epoch": 0.6895857153859369, "grad_norm": 1.7014023065567017, "learning_rate": 1.255960930767021e-05, "loss": 2.7972, "step": 2425 }, { "epoch": 0.689870080629395, "grad_norm": 1.52834951877594, "learning_rate": 1.2548118356794025e-05, "loss": 2.3543, "step": 2426 }, { "epoch": 0.6901544458728532, "grad_norm": 1.3747142553329468, "learning_rate": 1.2536627405917841e-05, "loss": 2.1109, "step": 2427 }, { "epoch": 0.6904388111163113, "grad_norm": 1.534885287284851, "learning_rate": 1.2525136455041656e-05, "loss": 2.2004, "step": 2428 }, { "epoch": 0.6907231763597694, "grad_norm": 1.6728806495666504, "learning_rate": 1.251364550416547e-05, "loss": 2.1012, "step": 2429 }, { "epoch": 0.6910075416032275, "grad_norm": 1.5786833763122559, "learning_rate": 1.2502154553289285e-05, "loss": 2.0211, "step": 2430 }, { "epoch": 0.6912919068466856, "grad_norm": 1.4964499473571777, "learning_rate": 1.24906636024131e-05, "loss": 1.7223, "step": 2431 }, { "epoch": 0.6915762720901438, "grad_norm": 1.5604283809661865, "learning_rate": 1.2479172651536915e-05, "loss": 1.73, "step": 2432 }, { "epoch": 0.6918606373336019, "grad_norm": 2.041099786758423, "learning_rate": 1.246768170066073e-05, "loss": 2.7647, "step": 2433 }, { "epoch": 0.69214500257706, "grad_norm": 1.5619685649871826, "learning_rate": 1.2456190749784544e-05, "loss": 2.2241, "step": 2434 }, { "epoch": 0.6924293678205181, "grad_norm": 1.4527276754379272, "learning_rate": 1.2444699798908362e-05, "loss": 2.2705, "step": 2435 }, { "epoch": 0.6927137330639762, "grad_norm": 1.5282810926437378, "learning_rate": 1.2433208848032177e-05, "loss": 2.0866, "step": 2436 }, { "epoch": 0.6929980983074344, "grad_norm": 1.5058673620224, "learning_rate": 1.2421717897155991e-05, "loss": 1.8953, "step": 2437 }, { "epoch": 0.6932824635508925, "grad_norm": 1.6031697988510132, "learning_rate": 1.2410226946279806e-05, "loss": 1.9201, "step": 2438 }, { "epoch": 0.6935668287943506, "grad_norm": 1.544476866722107, "learning_rate": 1.2398735995403622e-05, "loss": 1.7643, "step": 2439 }, { "epoch": 0.6938511940378087, "grad_norm": 1.6882764101028442, "learning_rate": 1.2387245044527436e-05, "loss": 1.7574, "step": 2440 }, { "epoch": 0.6941355592812668, "grad_norm": 1.8056814670562744, "learning_rate": 1.2375754093651251e-05, "loss": 2.5871, "step": 2441 }, { "epoch": 0.694419924524725, "grad_norm": 1.553879976272583, "learning_rate": 1.2364263142775065e-05, "loss": 2.5545, "step": 2442 }, { "epoch": 0.6947042897681831, "grad_norm": 1.5317288637161255, "learning_rate": 1.235277219189888e-05, "loss": 2.2901, "step": 2443 }, { "epoch": 0.6949886550116412, "grad_norm": 1.6215896606445312, "learning_rate": 1.2341281241022696e-05, "loss": 2.2079, "step": 2444 }, { "epoch": 0.6952730202550993, "grad_norm": 1.5378239154815674, "learning_rate": 1.232979029014651e-05, "loss": 2.0399, "step": 2445 }, { "epoch": 0.6955573854985574, "grad_norm": 1.4927102327346802, "learning_rate": 1.2318299339270325e-05, "loss": 1.8721, "step": 2446 }, { "epoch": 0.6958417507420156, "grad_norm": 1.48208749294281, "learning_rate": 1.230680838839414e-05, "loss": 1.8783, "step": 2447 }, { "epoch": 0.6961261159854737, "grad_norm": 1.5241432189941406, "learning_rate": 1.2295317437517956e-05, "loss": 1.7324, "step": 2448 }, { "epoch": 0.6964104812289318, "grad_norm": 1.8973292112350464, "learning_rate": 1.228382648664177e-05, "loss": 2.7123, "step": 2449 }, { "epoch": 0.6966948464723899, "grad_norm": 1.539310097694397, "learning_rate": 1.2272335535765585e-05, "loss": 2.3683, "step": 2450 }, { "epoch": 0.696979211715848, "grad_norm": 1.5558347702026367, "learning_rate": 1.22608445848894e-05, "loss": 2.0833, "step": 2451 }, { "epoch": 0.6972635769593062, "grad_norm": 1.690713882446289, "learning_rate": 1.2249353634013214e-05, "loss": 2.1543, "step": 2452 }, { "epoch": 0.6975479422027643, "grad_norm": 1.7084331512451172, "learning_rate": 1.223786268313703e-05, "loss": 2.2718, "step": 2453 }, { "epoch": 0.6978323074462224, "grad_norm": 1.5845692157745361, "learning_rate": 1.2226371732260846e-05, "loss": 1.7258, "step": 2454 }, { "epoch": 0.6981166726896805, "grad_norm": 1.7029811143875122, "learning_rate": 1.221488078138466e-05, "loss": 1.7476, "step": 2455 }, { "epoch": 0.6984010379331386, "grad_norm": 1.6282069683074951, "learning_rate": 1.2203389830508477e-05, "loss": 1.7149, "step": 2456 }, { "epoch": 0.6986854031765968, "grad_norm": 1.8195720911026, "learning_rate": 1.2191898879632291e-05, "loss": 2.6883, "step": 2457 }, { "epoch": 0.6989697684200549, "grad_norm": 1.4874529838562012, "learning_rate": 1.2180407928756106e-05, "loss": 2.3012, "step": 2458 }, { "epoch": 0.699254133663513, "grad_norm": 1.4426288604736328, "learning_rate": 1.216891697787992e-05, "loss": 2.2598, "step": 2459 }, { "epoch": 0.6995384989069711, "grad_norm": 1.4771324396133423, "learning_rate": 1.2157426027003737e-05, "loss": 2.1432, "step": 2460 }, { "epoch": 0.6998228641504293, "grad_norm": 1.6694068908691406, "learning_rate": 1.2145935076127551e-05, "loss": 2.1884, "step": 2461 }, { "epoch": 0.7001072293938874, "grad_norm": 1.5652145147323608, "learning_rate": 1.2134444125251366e-05, "loss": 1.8736, "step": 2462 }, { "epoch": 0.7003915946373455, "grad_norm": 1.6377602815628052, "learning_rate": 1.212295317437518e-05, "loss": 1.7648, "step": 2463 }, { "epoch": 0.7006759598808036, "grad_norm": 1.671975016593933, "learning_rate": 1.2111462223498995e-05, "loss": 1.8393, "step": 2464 }, { "epoch": 0.7009603251242617, "grad_norm": 1.6467212438583374, "learning_rate": 1.209997127262281e-05, "loss": 2.8314, "step": 2465 }, { "epoch": 0.7012446903677199, "grad_norm": 1.6222443580627441, "learning_rate": 1.2088480321746625e-05, "loss": 2.4483, "step": 2466 }, { "epoch": 0.701529055611178, "grad_norm": 1.5044670104980469, "learning_rate": 1.207698937087044e-05, "loss": 2.4326, "step": 2467 }, { "epoch": 0.701813420854636, "grad_norm": 1.5599663257598877, "learning_rate": 1.2065498419994254e-05, "loss": 2.0106, "step": 2468 }, { "epoch": 0.7020977860980941, "grad_norm": 1.4764515161514282, "learning_rate": 1.205400746911807e-05, "loss": 2.0588, "step": 2469 }, { "epoch": 0.7023821513415522, "grad_norm": 1.4903563261032104, "learning_rate": 1.2042516518241885e-05, "loss": 1.8838, "step": 2470 }, { "epoch": 0.7026665165850104, "grad_norm": 1.5704025030136108, "learning_rate": 1.20310255673657e-05, "loss": 1.8783, "step": 2471 }, { "epoch": 0.7029508818284685, "grad_norm": 1.671426773071289, "learning_rate": 1.2019534616489517e-05, "loss": 1.6243, "step": 2472 }, { "epoch": 0.7032352470719266, "grad_norm": 1.8858857154846191, "learning_rate": 1.2008043665613332e-05, "loss": 2.6409, "step": 2473 }, { "epoch": 0.7035196123153847, "grad_norm": 1.4502723217010498, "learning_rate": 1.1996552714737146e-05, "loss": 2.4368, "step": 2474 }, { "epoch": 0.7038039775588428, "grad_norm": 1.4333308935165405, "learning_rate": 1.198506176386096e-05, "loss": 2.2744, "step": 2475 }, { "epoch": 0.704088342802301, "grad_norm": 1.4810882806777954, "learning_rate": 1.1973570812984775e-05, "loss": 2.105, "step": 2476 }, { "epoch": 0.7043727080457591, "grad_norm": 1.6196544170379639, "learning_rate": 1.1962079862108592e-05, "loss": 2.1092, "step": 2477 }, { "epoch": 0.7046570732892172, "grad_norm": 1.455788254737854, "learning_rate": 1.1950588911232406e-05, "loss": 1.75, "step": 2478 }, { "epoch": 0.7049414385326753, "grad_norm": 1.5926382541656494, "learning_rate": 1.193909796035622e-05, "loss": 1.8092, "step": 2479 }, { "epoch": 0.7052258037761334, "grad_norm": 1.7150598764419556, "learning_rate": 1.1927607009480035e-05, "loss": 1.7442, "step": 2480 }, { "epoch": 0.7055101690195916, "grad_norm": 1.7297258377075195, "learning_rate": 1.1916116058603851e-05, "loss": 2.6048, "step": 2481 }, { "epoch": 0.7057945342630497, "grad_norm": 1.6914091110229492, "learning_rate": 1.1904625107727666e-05, "loss": 2.3913, "step": 2482 }, { "epoch": 0.7060788995065078, "grad_norm": 1.6037940979003906, "learning_rate": 1.189313415685148e-05, "loss": 2.2043, "step": 2483 }, { "epoch": 0.7063632647499659, "grad_norm": 1.5348221063613892, "learning_rate": 1.1881643205975295e-05, "loss": 2.2177, "step": 2484 }, { "epoch": 0.706647629993424, "grad_norm": 1.5680232048034668, "learning_rate": 1.187015225509911e-05, "loss": 2.2766, "step": 2485 }, { "epoch": 0.7069319952368822, "grad_norm": 1.5366462469100952, "learning_rate": 1.1858661304222925e-05, "loss": 1.98, "step": 2486 }, { "epoch": 0.7072163604803403, "grad_norm": 1.4476945400238037, "learning_rate": 1.184717035334674e-05, "loss": 1.8068, "step": 2487 }, { "epoch": 0.7075007257237984, "grad_norm": 1.6980016231536865, "learning_rate": 1.1835679402470554e-05, "loss": 1.6514, "step": 2488 }, { "epoch": 0.7077850909672565, "grad_norm": 1.753934383392334, "learning_rate": 1.1824188451594369e-05, "loss": 2.7271, "step": 2489 }, { "epoch": 0.7080694562107146, "grad_norm": 1.530096411705017, "learning_rate": 1.1812697500718185e-05, "loss": 2.0594, "step": 2490 }, { "epoch": 0.7083538214541728, "grad_norm": 1.539100170135498, "learning_rate": 1.1801206549842001e-05, "loss": 2.0438, "step": 2491 }, { "epoch": 0.7086381866976309, "grad_norm": 1.5810004472732544, "learning_rate": 1.1789715598965816e-05, "loss": 2.2391, "step": 2492 }, { "epoch": 0.708922551941089, "grad_norm": 1.5543794631958008, "learning_rate": 1.1778224648089632e-05, "loss": 2.0453, "step": 2493 }, { "epoch": 0.7092069171845471, "grad_norm": 1.462825894355774, "learning_rate": 1.1766733697213446e-05, "loss": 1.8446, "step": 2494 }, { "epoch": 0.7094912824280052, "grad_norm": 1.5893888473510742, "learning_rate": 1.1755242746337261e-05, "loss": 1.6117, "step": 2495 }, { "epoch": 0.7097756476714634, "grad_norm": 1.5945665836334229, "learning_rate": 1.1743751795461075e-05, "loss": 1.6783, "step": 2496 }, { "epoch": 0.7100600129149215, "grad_norm": 1.7469823360443115, "learning_rate": 1.173226084458489e-05, "loss": 2.6954, "step": 2497 }, { "epoch": 0.7103443781583796, "grad_norm": 1.4920471906661987, "learning_rate": 1.1720769893708706e-05, "loss": 2.2442, "step": 2498 }, { "epoch": 0.7106287434018377, "grad_norm": 1.5059243440628052, "learning_rate": 1.170927894283252e-05, "loss": 2.3696, "step": 2499 }, { "epoch": 0.7109131086452958, "grad_norm": 1.5132187604904175, "learning_rate": 1.1697787991956335e-05, "loss": 2.0736, "step": 2500 }, { "epoch": 0.711197473888754, "grad_norm": 1.6220301389694214, "learning_rate": 1.168629704108015e-05, "loss": 2.0143, "step": 2501 }, { "epoch": 0.7114818391322121, "grad_norm": 1.5194025039672852, "learning_rate": 1.1674806090203966e-05, "loss": 1.9679, "step": 2502 }, { "epoch": 0.7117662043756702, "grad_norm": 1.4100217819213867, "learning_rate": 1.166331513932778e-05, "loss": 1.7964, "step": 2503 }, { "epoch": 0.7120505696191283, "grad_norm": 1.6122227907180786, "learning_rate": 1.1651824188451595e-05, "loss": 1.7849, "step": 2504 }, { "epoch": 0.7123349348625864, "grad_norm": 1.7167197465896606, "learning_rate": 1.164033323757541e-05, "loss": 2.6167, "step": 2505 }, { "epoch": 0.7126193001060446, "grad_norm": 1.4899656772613525, "learning_rate": 1.1628842286699224e-05, "loss": 2.3321, "step": 2506 }, { "epoch": 0.7129036653495027, "grad_norm": 1.5323848724365234, "learning_rate": 1.161735133582304e-05, "loss": 2.3636, "step": 2507 }, { "epoch": 0.7131880305929608, "grad_norm": 1.7162936925888062, "learning_rate": 1.1605860384946855e-05, "loss": 2.2844, "step": 2508 }, { "epoch": 0.7134723958364189, "grad_norm": 1.5394136905670166, "learning_rate": 1.159436943407067e-05, "loss": 2.2002, "step": 2509 }, { "epoch": 0.7137567610798771, "grad_norm": 1.561732292175293, "learning_rate": 1.1582878483194487e-05, "loss": 1.8321, "step": 2510 }, { "epoch": 0.7140411263233352, "grad_norm": 1.4912663698196411, "learning_rate": 1.1571387532318301e-05, "loss": 1.6438, "step": 2511 }, { "epoch": 0.7143254915667933, "grad_norm": 1.7454664707183838, "learning_rate": 1.1559896581442116e-05, "loss": 2.0865, "step": 2512 }, { "epoch": 0.7146098568102514, "grad_norm": 1.7953159809112549, "learning_rate": 1.154840563056593e-05, "loss": 2.5611, "step": 2513 }, { "epoch": 0.7148942220537094, "grad_norm": 1.570483922958374, "learning_rate": 1.1536914679689747e-05, "loss": 2.1698, "step": 2514 }, { "epoch": 0.7151785872971677, "grad_norm": 1.4322417974472046, "learning_rate": 1.1525423728813561e-05, "loss": 2.1472, "step": 2515 }, { "epoch": 0.7154629525406258, "grad_norm": 1.5140742063522339, "learning_rate": 1.1513932777937376e-05, "loss": 2.2549, "step": 2516 }, { "epoch": 0.7157473177840838, "grad_norm": 1.485280990600586, "learning_rate": 1.150244182706119e-05, "loss": 2.0911, "step": 2517 }, { "epoch": 0.7160316830275419, "grad_norm": 1.444021224975586, "learning_rate": 1.1490950876185005e-05, "loss": 1.8068, "step": 2518 }, { "epoch": 0.716316048271, "grad_norm": 1.5001165866851807, "learning_rate": 1.147945992530882e-05, "loss": 1.7745, "step": 2519 }, { "epoch": 0.7166004135144582, "grad_norm": 1.7177432775497437, "learning_rate": 1.1467968974432635e-05, "loss": 1.7376, "step": 2520 }, { "epoch": 0.7168847787579163, "grad_norm": 1.9216365814208984, "learning_rate": 1.145647802355645e-05, "loss": 2.783, "step": 2521 }, { "epoch": 0.7171691440013744, "grad_norm": 1.6360090970993042, "learning_rate": 1.1444987072680264e-05, "loss": 2.4359, "step": 2522 }, { "epoch": 0.7174535092448325, "grad_norm": 1.5208964347839355, "learning_rate": 1.143349612180408e-05, "loss": 2.3083, "step": 2523 }, { "epoch": 0.7177378744882906, "grad_norm": 1.394752860069275, "learning_rate": 1.1422005170927895e-05, "loss": 2.1679, "step": 2524 }, { "epoch": 0.7180222397317488, "grad_norm": 1.508109211921692, "learning_rate": 1.141051422005171e-05, "loss": 2.1873, "step": 2525 }, { "epoch": 0.7183066049752069, "grad_norm": 1.7108025550842285, "learning_rate": 1.1399023269175524e-05, "loss": 1.8141, "step": 2526 }, { "epoch": 0.718590970218665, "grad_norm": 1.727073311805725, "learning_rate": 1.1387532318299338e-05, "loss": 1.7839, "step": 2527 }, { "epoch": 0.7188753354621231, "grad_norm": 1.7177026271820068, "learning_rate": 1.1376041367423156e-05, "loss": 1.8421, "step": 2528 }, { "epoch": 0.7191597007055812, "grad_norm": 1.9634953737258911, "learning_rate": 1.136455041654697e-05, "loss": 2.5981, "step": 2529 }, { "epoch": 0.7194440659490394, "grad_norm": 1.5275763273239136, "learning_rate": 1.1353059465670785e-05, "loss": 2.3648, "step": 2530 }, { "epoch": 0.7197284311924975, "grad_norm": 1.5566397905349731, "learning_rate": 1.1341568514794601e-05, "loss": 2.3055, "step": 2531 }, { "epoch": 0.7200127964359556, "grad_norm": 1.6546510457992554, "learning_rate": 1.1330077563918416e-05, "loss": 2.1899, "step": 2532 }, { "epoch": 0.7202971616794137, "grad_norm": 1.5946438312530518, "learning_rate": 1.131858661304223e-05, "loss": 2.1639, "step": 2533 }, { "epoch": 0.7205815269228718, "grad_norm": 1.4848626852035522, "learning_rate": 1.1307095662166045e-05, "loss": 2.0955, "step": 2534 }, { "epoch": 0.72086589216633, "grad_norm": 1.4589171409606934, "learning_rate": 1.1295604711289861e-05, "loss": 1.7393, "step": 2535 }, { "epoch": 0.7211502574097881, "grad_norm": 1.6969494819641113, "learning_rate": 1.1284113760413676e-05, "loss": 1.3365, "step": 2536 }, { "epoch": 0.7214346226532462, "grad_norm": 1.7486660480499268, "learning_rate": 1.127262280953749e-05, "loss": 2.695, "step": 2537 }, { "epoch": 0.7217189878967043, "grad_norm": 1.4772002696990967, "learning_rate": 1.1261131858661305e-05, "loss": 2.4941, "step": 2538 }, { "epoch": 0.7220033531401624, "grad_norm": 1.4565849304199219, "learning_rate": 1.124964090778512e-05, "loss": 2.2789, "step": 2539 }, { "epoch": 0.7222877183836206, "grad_norm": 1.3858977556228638, "learning_rate": 1.1238149956908935e-05, "loss": 2.2516, "step": 2540 }, { "epoch": 0.7225720836270787, "grad_norm": 1.4847400188446045, "learning_rate": 1.122665900603275e-05, "loss": 2.1125, "step": 2541 }, { "epoch": 0.7228564488705368, "grad_norm": 1.653690218925476, "learning_rate": 1.1215168055156564e-05, "loss": 1.8551, "step": 2542 }, { "epoch": 0.7231408141139949, "grad_norm": 1.6189554929733276, "learning_rate": 1.1203677104280379e-05, "loss": 1.6745, "step": 2543 }, { "epoch": 0.723425179357453, "grad_norm": 1.6225018501281738, "learning_rate": 1.1192186153404195e-05, "loss": 1.881, "step": 2544 }, { "epoch": 0.7237095446009112, "grad_norm": 1.7607388496398926, "learning_rate": 1.118069520252801e-05, "loss": 2.5443, "step": 2545 }, { "epoch": 0.7239939098443693, "grad_norm": 1.4778530597686768, "learning_rate": 1.1169204251651826e-05, "loss": 2.347, "step": 2546 }, { "epoch": 0.7242782750878274, "grad_norm": 1.4404771327972412, "learning_rate": 1.115771330077564e-05, "loss": 2.4381, "step": 2547 }, { "epoch": 0.7245626403312855, "grad_norm": 1.7989895343780518, "learning_rate": 1.1146222349899456e-05, "loss": 2.2025, "step": 2548 }, { "epoch": 0.7248470055747436, "grad_norm": 1.5871349573135376, "learning_rate": 1.1134731399023271e-05, "loss": 2.1601, "step": 2549 }, { "epoch": 0.7251313708182018, "grad_norm": 1.521226167678833, "learning_rate": 1.1123240448147085e-05, "loss": 1.7278, "step": 2550 }, { "epoch": 0.7254157360616599, "grad_norm": 1.6469875574111938, "learning_rate": 1.11117494972709e-05, "loss": 1.6268, "step": 2551 }, { "epoch": 0.725700101305118, "grad_norm": 1.732232689857483, "learning_rate": 1.1100258546394716e-05, "loss": 1.616, "step": 2552 }, { "epoch": 0.7259844665485761, "grad_norm": 1.7754976749420166, "learning_rate": 1.108876759551853e-05, "loss": 2.5168, "step": 2553 }, { "epoch": 0.7262688317920342, "grad_norm": 1.471461296081543, "learning_rate": 1.1077276644642345e-05, "loss": 2.3877, "step": 2554 }, { "epoch": 0.7265531970354924, "grad_norm": 1.427992820739746, "learning_rate": 1.106578569376616e-05, "loss": 2.1713, "step": 2555 }, { "epoch": 0.7268375622789505, "grad_norm": 1.6102306842803955, "learning_rate": 1.1054294742889976e-05, "loss": 2.1996, "step": 2556 }, { "epoch": 0.7271219275224086, "grad_norm": 1.6785222291946411, "learning_rate": 1.104280379201379e-05, "loss": 2.0858, "step": 2557 }, { "epoch": 0.7274062927658667, "grad_norm": 1.556443214416504, "learning_rate": 1.1031312841137605e-05, "loss": 1.9964, "step": 2558 }, { "epoch": 0.7276906580093249, "grad_norm": 1.5980945825576782, "learning_rate": 1.101982189026142e-05, "loss": 1.9399, "step": 2559 }, { "epoch": 0.727975023252783, "grad_norm": 1.6788307428359985, "learning_rate": 1.1008330939385234e-05, "loss": 1.7525, "step": 2560 }, { "epoch": 0.7282593884962411, "grad_norm": 1.6237465143203735, "learning_rate": 1.099683998850905e-05, "loss": 2.7299, "step": 2561 }, { "epoch": 0.7285437537396992, "grad_norm": 1.56391179561615, "learning_rate": 1.0985349037632864e-05, "loss": 2.5252, "step": 2562 }, { "epoch": 0.7288281189831572, "grad_norm": 1.4667418003082275, "learning_rate": 1.0973858086756679e-05, "loss": 2.1989, "step": 2563 }, { "epoch": 0.7291124842266155, "grad_norm": 1.421413540840149, "learning_rate": 1.0962367135880493e-05, "loss": 2.199, "step": 2564 }, { "epoch": 0.7293968494700735, "grad_norm": 1.588476538658142, "learning_rate": 1.0950876185004311e-05, "loss": 2.1981, "step": 2565 }, { "epoch": 0.7296812147135316, "grad_norm": 1.5892815589904785, "learning_rate": 1.0939385234128126e-05, "loss": 1.884, "step": 2566 }, { "epoch": 0.7299655799569897, "grad_norm": 1.5493581295013428, "learning_rate": 1.092789428325194e-05, "loss": 1.8578, "step": 2567 }, { "epoch": 0.7302499452004478, "grad_norm": 1.7328555583953857, "learning_rate": 1.0916403332375755e-05, "loss": 1.6838, "step": 2568 }, { "epoch": 0.730534310443906, "grad_norm": 1.8126568794250488, "learning_rate": 1.0904912381499571e-05, "loss": 2.7335, "step": 2569 }, { "epoch": 0.7308186756873641, "grad_norm": 1.505807638168335, "learning_rate": 1.0893421430623386e-05, "loss": 2.3911, "step": 2570 }, { "epoch": 0.7311030409308222, "grad_norm": 1.6133103370666504, "learning_rate": 1.08819304797472e-05, "loss": 2.2435, "step": 2571 }, { "epoch": 0.7313874061742803, "grad_norm": 1.4566494226455688, "learning_rate": 1.0870439528871015e-05, "loss": 2.0838, "step": 2572 }, { "epoch": 0.7316717714177384, "grad_norm": 1.5375003814697266, "learning_rate": 1.085894857799483e-05, "loss": 1.9548, "step": 2573 }, { "epoch": 0.7319561366611966, "grad_norm": 1.5447187423706055, "learning_rate": 1.0847457627118645e-05, "loss": 2.0166, "step": 2574 }, { "epoch": 0.7322405019046547, "grad_norm": 1.5107059478759766, "learning_rate": 1.083596667624246e-05, "loss": 1.6546, "step": 2575 }, { "epoch": 0.7325248671481128, "grad_norm": 1.5727344751358032, "learning_rate": 1.0824475725366274e-05, "loss": 1.6626, "step": 2576 }, { "epoch": 0.7328092323915709, "grad_norm": 1.8032009601593018, "learning_rate": 1.0812984774490089e-05, "loss": 2.8995, "step": 2577 }, { "epoch": 0.733093597635029, "grad_norm": 1.4891507625579834, "learning_rate": 1.0801493823613905e-05, "loss": 2.3765, "step": 2578 }, { "epoch": 0.7333779628784872, "grad_norm": 1.535091519355774, "learning_rate": 1.079000287273772e-05, "loss": 2.2866, "step": 2579 }, { "epoch": 0.7336623281219453, "grad_norm": 1.510955572128296, "learning_rate": 1.0778511921861534e-05, "loss": 2.2064, "step": 2580 }, { "epoch": 0.7339466933654034, "grad_norm": 1.803688406944275, "learning_rate": 1.0767020970985348e-05, "loss": 2.2372, "step": 2581 }, { "epoch": 0.7342310586088615, "grad_norm": 1.6097159385681152, "learning_rate": 1.0755530020109165e-05, "loss": 1.8109, "step": 2582 }, { "epoch": 0.7345154238523196, "grad_norm": 1.478907585144043, "learning_rate": 1.074403906923298e-05, "loss": 1.6932, "step": 2583 }, { "epoch": 0.7347997890957778, "grad_norm": 1.6126059293746948, "learning_rate": 1.0732548118356795e-05, "loss": 1.6228, "step": 2584 }, { "epoch": 0.7350841543392359, "grad_norm": 1.7208243608474731, "learning_rate": 1.0721057167480611e-05, "loss": 2.7996, "step": 2585 }, { "epoch": 0.735368519582694, "grad_norm": 1.4276622533798218, "learning_rate": 1.0709566216604426e-05, "loss": 2.3561, "step": 2586 }, { "epoch": 0.7356528848261521, "grad_norm": 1.5668575763702393, "learning_rate": 1.069807526572824e-05, "loss": 2.2804, "step": 2587 }, { "epoch": 0.7359372500696102, "grad_norm": 1.4144526720046997, "learning_rate": 1.0686584314852055e-05, "loss": 2.1127, "step": 2588 }, { "epoch": 0.7362216153130684, "grad_norm": 1.6393625736236572, "learning_rate": 1.067509336397587e-05, "loss": 2.2331, "step": 2589 }, { "epoch": 0.7365059805565265, "grad_norm": 1.552177906036377, "learning_rate": 1.0663602413099686e-05, "loss": 1.876, "step": 2590 }, { "epoch": 0.7367903457999846, "grad_norm": 1.5180331468582153, "learning_rate": 1.06521114622235e-05, "loss": 1.9381, "step": 2591 }, { "epoch": 0.7370747110434427, "grad_norm": 1.8120715618133545, "learning_rate": 1.0640620511347315e-05, "loss": 1.7459, "step": 2592 }, { "epoch": 0.7373590762869008, "grad_norm": 1.9727323055267334, "learning_rate": 1.0629129560471129e-05, "loss": 2.7591, "step": 2593 }, { "epoch": 0.737643441530359, "grad_norm": 1.6499255895614624, "learning_rate": 1.0617638609594945e-05, "loss": 2.582, "step": 2594 }, { "epoch": 0.7379278067738171, "grad_norm": 1.4931328296661377, "learning_rate": 1.060614765871876e-05, "loss": 2.2723, "step": 2595 }, { "epoch": 0.7382121720172752, "grad_norm": 1.4535672664642334, "learning_rate": 1.0594656707842574e-05, "loss": 2.1424, "step": 2596 }, { "epoch": 0.7384965372607333, "grad_norm": 1.6324405670166016, "learning_rate": 1.0583165756966389e-05, "loss": 2.2187, "step": 2597 }, { "epoch": 0.7387809025041914, "grad_norm": 1.4527398347854614, "learning_rate": 1.0571674806090203e-05, "loss": 1.8947, "step": 2598 }, { "epoch": 0.7390652677476496, "grad_norm": 1.51231050491333, "learning_rate": 1.056018385521402e-05, "loss": 1.6234, "step": 2599 }, { "epoch": 0.7393496329911077, "grad_norm": 1.6000384092330933, "learning_rate": 1.0548692904337834e-05, "loss": 1.8085, "step": 2600 }, { "epoch": 0.7396339982345658, "grad_norm": 1.772000789642334, "learning_rate": 1.0537201953461649e-05, "loss": 2.6206, "step": 2601 }, { "epoch": 0.7399183634780239, "grad_norm": 1.5927568674087524, "learning_rate": 1.0525711002585466e-05, "loss": 2.4873, "step": 2602 }, { "epoch": 0.740202728721482, "grad_norm": 1.451069951057434, "learning_rate": 1.0514220051709281e-05, "loss": 2.0906, "step": 2603 }, { "epoch": 0.7404870939649402, "grad_norm": 1.6845619678497314, "learning_rate": 1.0502729100833095e-05, "loss": 2.1397, "step": 2604 }, { "epoch": 0.7407714592083983, "grad_norm": 1.546721339225769, "learning_rate": 1.049123814995691e-05, "loss": 1.8256, "step": 2605 }, { "epoch": 0.7410558244518564, "grad_norm": 1.4821118116378784, "learning_rate": 1.0479747199080726e-05, "loss": 1.842, "step": 2606 }, { "epoch": 0.7413401896953145, "grad_norm": 1.5237010717391968, "learning_rate": 1.046825624820454e-05, "loss": 1.6978, "step": 2607 }, { "epoch": 0.7416245549387727, "grad_norm": 1.6505897045135498, "learning_rate": 1.0456765297328355e-05, "loss": 1.6463, "step": 2608 }, { "epoch": 0.7419089201822308, "grad_norm": 1.657488226890564, "learning_rate": 1.044527434645217e-05, "loss": 2.9463, "step": 2609 }, { "epoch": 0.7421932854256889, "grad_norm": 1.5707652568817139, "learning_rate": 1.0433783395575984e-05, "loss": 2.5416, "step": 2610 }, { "epoch": 0.742477650669147, "grad_norm": 1.4029492139816284, "learning_rate": 1.04222924446998e-05, "loss": 2.2495, "step": 2611 }, { "epoch": 0.742762015912605, "grad_norm": 1.707994818687439, "learning_rate": 1.0410801493823615e-05, "loss": 2.0821, "step": 2612 }, { "epoch": 0.7430463811560633, "grad_norm": 1.5762234926223755, "learning_rate": 1.039931054294743e-05, "loss": 2.0927, "step": 2613 }, { "epoch": 0.7433307463995213, "grad_norm": 1.567622423171997, "learning_rate": 1.0387819592071244e-05, "loss": 1.8988, "step": 2614 }, { "epoch": 0.7436151116429794, "grad_norm": 1.5860332250595093, "learning_rate": 1.037632864119506e-05, "loss": 1.9538, "step": 2615 }, { "epoch": 0.7438994768864375, "grad_norm": 1.6096969842910767, "learning_rate": 1.0364837690318874e-05, "loss": 1.8601, "step": 2616 }, { "epoch": 0.7441838421298956, "grad_norm": 1.703778862953186, "learning_rate": 1.0353346739442689e-05, "loss": 2.7728, "step": 2617 }, { "epoch": 0.7444682073733538, "grad_norm": 1.4619109630584717, "learning_rate": 1.0341855788566503e-05, "loss": 2.373, "step": 2618 }, { "epoch": 0.7447525726168119, "grad_norm": 1.4133814573287964, "learning_rate": 1.0330364837690318e-05, "loss": 2.2617, "step": 2619 }, { "epoch": 0.74503693786027, "grad_norm": 1.5261253118515015, "learning_rate": 1.0318873886814136e-05, "loss": 2.1614, "step": 2620 }, { "epoch": 0.7453213031037281, "grad_norm": 1.649577021598816, "learning_rate": 1.030738293593795e-05, "loss": 2.1312, "step": 2621 }, { "epoch": 0.7456056683471862, "grad_norm": 1.4830260276794434, "learning_rate": 1.0295891985061765e-05, "loss": 1.6617, "step": 2622 }, { "epoch": 0.7458900335906444, "grad_norm": 1.6154065132141113, "learning_rate": 1.0284401034185581e-05, "loss": 1.8633, "step": 2623 }, { "epoch": 0.7461743988341025, "grad_norm": 1.634304404258728, "learning_rate": 1.0272910083309395e-05, "loss": 1.88, "step": 2624 }, { "epoch": 0.7464587640775606, "grad_norm": 1.7923297882080078, "learning_rate": 1.026141913243321e-05, "loss": 2.8546, "step": 2625 }, { "epoch": 0.7467431293210187, "grad_norm": 1.4359058141708374, "learning_rate": 1.0249928181557024e-05, "loss": 2.2801, "step": 2626 }, { "epoch": 0.7470274945644768, "grad_norm": 1.4793764352798462, "learning_rate": 1.023843723068084e-05, "loss": 2.3711, "step": 2627 }, { "epoch": 0.747311859807935, "grad_norm": 1.487067699432373, "learning_rate": 1.0226946279804655e-05, "loss": 2.1701, "step": 2628 }, { "epoch": 0.7475962250513931, "grad_norm": 1.7713996171951294, "learning_rate": 1.021545532892847e-05, "loss": 2.283, "step": 2629 }, { "epoch": 0.7478805902948512, "grad_norm": 1.474770188331604, "learning_rate": 1.0203964378052284e-05, "loss": 1.8782, "step": 2630 }, { "epoch": 0.7481649555383093, "grad_norm": 1.6031092405319214, "learning_rate": 1.0192473427176099e-05, "loss": 1.89, "step": 2631 }, { "epoch": 0.7484493207817674, "grad_norm": 1.7248785495758057, "learning_rate": 1.0180982476299915e-05, "loss": 1.8929, "step": 2632 }, { "epoch": 0.7487336860252256, "grad_norm": 1.6073254346847534, "learning_rate": 1.016949152542373e-05, "loss": 2.6679, "step": 2633 }, { "epoch": 0.7490180512686837, "grad_norm": 1.4250975847244263, "learning_rate": 1.0158000574547544e-05, "loss": 2.393, "step": 2634 }, { "epoch": 0.7493024165121418, "grad_norm": 1.3941985368728638, "learning_rate": 1.0146509623671358e-05, "loss": 2.203, "step": 2635 }, { "epoch": 0.7495867817555999, "grad_norm": 1.540351390838623, "learning_rate": 1.0135018672795175e-05, "loss": 2.0514, "step": 2636 }, { "epoch": 0.749871146999058, "grad_norm": 1.4758607149124146, "learning_rate": 1.0123527721918989e-05, "loss": 2.1415, "step": 2637 }, { "epoch": 0.7501555122425162, "grad_norm": 1.5170818567276, "learning_rate": 1.0112036771042804e-05, "loss": 1.7861, "step": 2638 }, { "epoch": 0.7504398774859743, "grad_norm": 1.5630824565887451, "learning_rate": 1.0100545820166621e-05, "loss": 1.7773, "step": 2639 }, { "epoch": 0.7507242427294324, "grad_norm": 1.5273665189743042, "learning_rate": 1.0089054869290436e-05, "loss": 1.7984, "step": 2640 }, { "epoch": 0.7510086079728905, "grad_norm": 1.8208073377609253, "learning_rate": 1.007756391841425e-05, "loss": 2.9004, "step": 2641 }, { "epoch": 0.7512929732163486, "grad_norm": 1.5931475162506104, "learning_rate": 1.0066072967538065e-05, "loss": 2.4002, "step": 2642 }, { "epoch": 0.7515773384598068, "grad_norm": 1.410474181175232, "learning_rate": 1.005458201666188e-05, "loss": 2.0302, "step": 2643 }, { "epoch": 0.7518617037032649, "grad_norm": 1.5305248498916626, "learning_rate": 1.0043091065785696e-05, "loss": 2.0728, "step": 2644 }, { "epoch": 0.752146068946723, "grad_norm": 1.7503458261489868, "learning_rate": 1.003160011490951e-05, "loss": 2.1175, "step": 2645 }, { "epoch": 0.7524304341901811, "grad_norm": 1.5125000476837158, "learning_rate": 1.0020109164033325e-05, "loss": 1.8522, "step": 2646 }, { "epoch": 0.7527147994336392, "grad_norm": 1.5034544467926025, "learning_rate": 1.0008618213157139e-05, "loss": 1.7503, "step": 2647 }, { "epoch": 0.7529991646770974, "grad_norm": 1.5701968669891357, "learning_rate": 9.997127262280955e-06, "loss": 1.7253, "step": 2648 }, { "epoch": 0.7532835299205555, "grad_norm": 1.7175260782241821, "learning_rate": 9.98563631140477e-06, "loss": 2.6342, "step": 2649 }, { "epoch": 0.7535678951640136, "grad_norm": 1.4930824041366577, "learning_rate": 9.974145360528584e-06, "loss": 2.4192, "step": 2650 }, { "epoch": 0.7538522604074717, "grad_norm": 1.5410807132720947, "learning_rate": 9.962654409652399e-06, "loss": 2.3138, "step": 2651 }, { "epoch": 0.7541366256509299, "grad_norm": 1.4877644777297974, "learning_rate": 9.951163458776213e-06, "loss": 2.0459, "step": 2652 }, { "epoch": 0.754420990894388, "grad_norm": 1.711830973625183, "learning_rate": 9.93967250790003e-06, "loss": 2.1824, "step": 2653 }, { "epoch": 0.7547053561378461, "grad_norm": 1.4872764348983765, "learning_rate": 9.928181557023846e-06, "loss": 1.7254, "step": 2654 }, { "epoch": 0.7549897213813042, "grad_norm": 1.5591570138931274, "learning_rate": 9.91669060614766e-06, "loss": 1.9, "step": 2655 }, { "epoch": 0.7552740866247623, "grad_norm": 1.5367933511734009, "learning_rate": 9.905199655271475e-06, "loss": 1.6445, "step": 2656 }, { "epoch": 0.7555584518682205, "grad_norm": 1.752350091934204, "learning_rate": 9.89370870439529e-06, "loss": 2.8917, "step": 2657 }, { "epoch": 0.7558428171116786, "grad_norm": 1.5216292142868042, "learning_rate": 9.882217753519104e-06, "loss": 2.4243, "step": 2658 }, { "epoch": 0.7561271823551367, "grad_norm": 1.5508999824523926, "learning_rate": 9.87072680264292e-06, "loss": 2.1663, "step": 2659 }, { "epoch": 0.7564115475985947, "grad_norm": 1.4459530115127563, "learning_rate": 9.859235851766734e-06, "loss": 2.1307, "step": 2660 }, { "epoch": 0.7566959128420528, "grad_norm": 1.627680778503418, "learning_rate": 9.847744900890549e-06, "loss": 2.0607, "step": 2661 }, { "epoch": 0.756980278085511, "grad_norm": 1.4881747961044312, "learning_rate": 9.836253950014365e-06, "loss": 1.9483, "step": 2662 }, { "epoch": 0.7572646433289691, "grad_norm": 1.6278051137924194, "learning_rate": 9.82476299913818e-06, "loss": 1.794, "step": 2663 }, { "epoch": 0.7575490085724272, "grad_norm": 1.6977025270462036, "learning_rate": 9.813272048261994e-06, "loss": 1.6891, "step": 2664 }, { "epoch": 0.7578333738158853, "grad_norm": 1.795324444770813, "learning_rate": 9.80178109738581e-06, "loss": 2.4871, "step": 2665 }, { "epoch": 0.7581177390593434, "grad_norm": 1.48861563205719, "learning_rate": 9.790290146509625e-06, "loss": 2.4192, "step": 2666 }, { "epoch": 0.7584021043028016, "grad_norm": 1.4271656274795532, "learning_rate": 9.77879919563344e-06, "loss": 2.1096, "step": 2667 }, { "epoch": 0.7586864695462597, "grad_norm": 1.4701792001724243, "learning_rate": 9.767308244757254e-06, "loss": 2.0263, "step": 2668 }, { "epoch": 0.7589708347897178, "grad_norm": 1.5513591766357422, "learning_rate": 9.75581729388107e-06, "loss": 2.2207, "step": 2669 }, { "epoch": 0.7592552000331759, "grad_norm": 1.4835832118988037, "learning_rate": 9.744326343004884e-06, "loss": 1.9584, "step": 2670 }, { "epoch": 0.759539565276634, "grad_norm": 1.5424950122833252, "learning_rate": 9.7328353921287e-06, "loss": 2.0076, "step": 2671 }, { "epoch": 0.7598239305200922, "grad_norm": 1.5847523212432861, "learning_rate": 9.721344441252515e-06, "loss": 1.712, "step": 2672 }, { "epoch": 0.7601082957635503, "grad_norm": 1.8330718278884888, "learning_rate": 9.70985349037633e-06, "loss": 2.7035, "step": 2673 }, { "epoch": 0.7603926610070084, "grad_norm": 1.533942699432373, "learning_rate": 9.698362539500144e-06, "loss": 2.1846, "step": 2674 }, { "epoch": 0.7606770262504665, "grad_norm": 1.5470412969589233, "learning_rate": 9.68687158862396e-06, "loss": 2.2428, "step": 2675 }, { "epoch": 0.7609613914939246, "grad_norm": 1.4431195259094238, "learning_rate": 9.675380637747775e-06, "loss": 2.2741, "step": 2676 }, { "epoch": 0.7612457567373828, "grad_norm": 1.7669216394424438, "learning_rate": 9.66388968687159e-06, "loss": 2.05, "step": 2677 }, { "epoch": 0.7615301219808409, "grad_norm": 1.4941767454147339, "learning_rate": 9.652398735995404e-06, "loss": 2.0978, "step": 2678 }, { "epoch": 0.761814487224299, "grad_norm": 1.4513996839523315, "learning_rate": 9.640907785119218e-06, "loss": 1.7291, "step": 2679 }, { "epoch": 0.7620988524677571, "grad_norm": 1.507140874862671, "learning_rate": 9.629416834243034e-06, "loss": 1.7039, "step": 2680 }, { "epoch": 0.7623832177112152, "grad_norm": 1.6555514335632324, "learning_rate": 9.61792588336685e-06, "loss": 2.5856, "step": 2681 }, { "epoch": 0.7626675829546734, "grad_norm": 1.4625468254089355, "learning_rate": 9.606434932490665e-06, "loss": 2.4481, "step": 2682 }, { "epoch": 0.7629519481981315, "grad_norm": 1.55473792552948, "learning_rate": 9.59494398161448e-06, "loss": 2.0131, "step": 2683 }, { "epoch": 0.7632363134415896, "grad_norm": 1.5071228742599487, "learning_rate": 9.583453030738294e-06, "loss": 2.2608, "step": 2684 }, { "epoch": 0.7635206786850477, "grad_norm": 1.4757564067840576, "learning_rate": 9.571962079862109e-06, "loss": 1.9793, "step": 2685 }, { "epoch": 0.7638050439285058, "grad_norm": 1.4441161155700684, "learning_rate": 9.560471128985925e-06, "loss": 1.7435, "step": 2686 }, { "epoch": 0.764089409171964, "grad_norm": 1.4009804725646973, "learning_rate": 9.54898017810974e-06, "loss": 1.7192, "step": 2687 }, { "epoch": 0.7643737744154221, "grad_norm": 1.5814915895462036, "learning_rate": 9.537489227233554e-06, "loss": 1.6818, "step": 2688 }, { "epoch": 0.7646581396588802, "grad_norm": 1.7841424942016602, "learning_rate": 9.525998276357368e-06, "loss": 2.8207, "step": 2689 }, { "epoch": 0.7649425049023383, "grad_norm": 1.4650540351867676, "learning_rate": 9.514507325481185e-06, "loss": 2.3041, "step": 2690 }, { "epoch": 0.7652268701457964, "grad_norm": 1.519219994544983, "learning_rate": 9.503016374604999e-06, "loss": 2.0641, "step": 2691 }, { "epoch": 0.7655112353892546, "grad_norm": 1.5591269731521606, "learning_rate": 9.491525423728815e-06, "loss": 2.3594, "step": 2692 }, { "epoch": 0.7657956006327127, "grad_norm": 1.680830717086792, "learning_rate": 9.48003447285263e-06, "loss": 2.3668, "step": 2693 }, { "epoch": 0.7660799658761708, "grad_norm": 1.5440500974655151, "learning_rate": 9.468543521976444e-06, "loss": 1.9979, "step": 2694 }, { "epoch": 0.7663643311196289, "grad_norm": 1.6357736587524414, "learning_rate": 9.457052571100259e-06, "loss": 1.729, "step": 2695 }, { "epoch": 0.766648696363087, "grad_norm": 1.5347177982330322, "learning_rate": 9.445561620224075e-06, "loss": 1.8021, "step": 2696 }, { "epoch": 0.7669330616065452, "grad_norm": 1.7074389457702637, "learning_rate": 9.43407066934789e-06, "loss": 2.7968, "step": 2697 }, { "epoch": 0.7672174268500033, "grad_norm": 1.4788157939910889, "learning_rate": 9.422579718471704e-06, "loss": 2.5411, "step": 2698 }, { "epoch": 0.7675017920934614, "grad_norm": 1.428257703781128, "learning_rate": 9.41108876759552e-06, "loss": 2.3632, "step": 2699 }, { "epoch": 0.7677861573369195, "grad_norm": 1.4511548280715942, "learning_rate": 9.399597816719335e-06, "loss": 2.102, "step": 2700 }, { "epoch": 0.7680705225803777, "grad_norm": 1.6660867929458618, "learning_rate": 9.388106865843149e-06, "loss": 1.9556, "step": 2701 }, { "epoch": 0.7683548878238358, "grad_norm": 1.6087857484817505, "learning_rate": 9.376615914966965e-06, "loss": 1.9751, "step": 2702 }, { "epoch": 0.7686392530672939, "grad_norm": 1.5220601558685303, "learning_rate": 9.36512496409078e-06, "loss": 1.8027, "step": 2703 }, { "epoch": 0.768923618310752, "grad_norm": 1.727441668510437, "learning_rate": 9.353634013214594e-06, "loss": 1.7694, "step": 2704 }, { "epoch": 0.76920798355421, "grad_norm": 1.7294251918792725, "learning_rate": 9.342143062338409e-06, "loss": 2.6689, "step": 2705 }, { "epoch": 0.7694923487976683, "grad_norm": 1.4868618249893188, "learning_rate": 9.330652111462223e-06, "loss": 2.4436, "step": 2706 }, { "epoch": 0.7697767140411264, "grad_norm": 1.4418073892593384, "learning_rate": 9.31916116058604e-06, "loss": 2.1285, "step": 2707 }, { "epoch": 0.7700610792845844, "grad_norm": 1.4674290418624878, "learning_rate": 9.307670209709856e-06, "loss": 1.9759, "step": 2708 }, { "epoch": 0.7703454445280425, "grad_norm": 1.6296724081039429, "learning_rate": 9.29617925883367e-06, "loss": 2.1011, "step": 2709 }, { "epoch": 0.7706298097715006, "grad_norm": 1.5555256605148315, "learning_rate": 9.284688307957485e-06, "loss": 1.7846, "step": 2710 }, { "epoch": 0.7709141750149588, "grad_norm": 1.4738327264785767, "learning_rate": 9.273197357081299e-06, "loss": 1.5781, "step": 2711 }, { "epoch": 0.7711985402584169, "grad_norm": 1.5029698610305786, "learning_rate": 9.261706406205114e-06, "loss": 1.5587, "step": 2712 }, { "epoch": 0.771482905501875, "grad_norm": 1.7328534126281738, "learning_rate": 9.25021545532893e-06, "loss": 2.6136, "step": 2713 }, { "epoch": 0.7717672707453331, "grad_norm": 1.5154101848602295, "learning_rate": 9.238724504452744e-06, "loss": 2.4293, "step": 2714 }, { "epoch": 0.7720516359887912, "grad_norm": 1.420172929763794, "learning_rate": 9.227233553576559e-06, "loss": 2.1027, "step": 2715 }, { "epoch": 0.7723360012322494, "grad_norm": 1.5309847593307495, "learning_rate": 9.215742602700373e-06, "loss": 2.3935, "step": 2716 }, { "epoch": 0.7726203664757075, "grad_norm": 1.5749589204788208, "learning_rate": 9.20425165182419e-06, "loss": 1.9977, "step": 2717 }, { "epoch": 0.7729047317191656, "grad_norm": 1.5268434286117554, "learning_rate": 9.192760700948004e-06, "loss": 1.7747, "step": 2718 }, { "epoch": 0.7731890969626237, "grad_norm": 1.5345548391342163, "learning_rate": 9.18126975007182e-06, "loss": 1.7534, "step": 2719 }, { "epoch": 0.7734734622060818, "grad_norm": 1.646863341331482, "learning_rate": 9.169778799195635e-06, "loss": 1.7113, "step": 2720 }, { "epoch": 0.77375782744954, "grad_norm": 1.6630712747573853, "learning_rate": 9.15828784831945e-06, "loss": 2.6318, "step": 2721 }, { "epoch": 0.7740421926929981, "grad_norm": 1.4465360641479492, "learning_rate": 9.146796897443264e-06, "loss": 2.5043, "step": 2722 }, { "epoch": 0.7743265579364562, "grad_norm": 1.4710736274719238, "learning_rate": 9.13530594656708e-06, "loss": 2.1993, "step": 2723 }, { "epoch": 0.7746109231799143, "grad_norm": 1.5619592666625977, "learning_rate": 9.123814995690894e-06, "loss": 2.1542, "step": 2724 }, { "epoch": 0.7748952884233724, "grad_norm": 1.507546305656433, "learning_rate": 9.112324044814709e-06, "loss": 2.048, "step": 2725 }, { "epoch": 0.7751796536668306, "grad_norm": 1.5877704620361328, "learning_rate": 9.100833093938523e-06, "loss": 1.8837, "step": 2726 }, { "epoch": 0.7754640189102887, "grad_norm": 1.5766810178756714, "learning_rate": 9.08934214306234e-06, "loss": 1.7013, "step": 2727 }, { "epoch": 0.7757483841537468, "grad_norm": 1.535407304763794, "learning_rate": 9.077851192186154e-06, "loss": 1.726, "step": 2728 }, { "epoch": 0.7760327493972049, "grad_norm": 1.723103404045105, "learning_rate": 9.06636024130997e-06, "loss": 2.5775, "step": 2729 }, { "epoch": 0.776317114640663, "grad_norm": 1.5206818580627441, "learning_rate": 9.054869290433785e-06, "loss": 2.4745, "step": 2730 }, { "epoch": 0.7766014798841212, "grad_norm": 1.486801266670227, "learning_rate": 9.0433783395576e-06, "loss": 2.3211, "step": 2731 }, { "epoch": 0.7768858451275793, "grad_norm": 1.546639323234558, "learning_rate": 9.031887388681414e-06, "loss": 2.2065, "step": 2732 }, { "epoch": 0.7771702103710374, "grad_norm": 1.6100459098815918, "learning_rate": 9.020396437805228e-06, "loss": 2.1838, "step": 2733 }, { "epoch": 0.7774545756144955, "grad_norm": 1.537714958190918, "learning_rate": 9.008905486929044e-06, "loss": 1.845, "step": 2734 }, { "epoch": 0.7777389408579536, "grad_norm": 1.5195116996765137, "learning_rate": 8.997414536052859e-06, "loss": 1.8668, "step": 2735 }, { "epoch": 0.7780233061014118, "grad_norm": 1.630244255065918, "learning_rate": 8.985923585176675e-06, "loss": 1.8279, "step": 2736 }, { "epoch": 0.7783076713448699, "grad_norm": 1.72090744972229, "learning_rate": 8.97443263430049e-06, "loss": 2.7764, "step": 2737 }, { "epoch": 0.778592036588328, "grad_norm": 1.5594855546951294, "learning_rate": 8.962941683424304e-06, "loss": 2.2368, "step": 2738 }, { "epoch": 0.7788764018317861, "grad_norm": 1.4783302545547485, "learning_rate": 8.951450732548119e-06, "loss": 2.2075, "step": 2739 }, { "epoch": 0.7791607670752442, "grad_norm": 1.5036635398864746, "learning_rate": 8.939959781671935e-06, "loss": 2.1962, "step": 2740 }, { "epoch": 0.7794451323187024, "grad_norm": 1.511878252029419, "learning_rate": 8.92846883079575e-06, "loss": 1.9705, "step": 2741 }, { "epoch": 0.7797294975621605, "grad_norm": 1.6692416667938232, "learning_rate": 8.916977879919564e-06, "loss": 2.0001, "step": 2742 }, { "epoch": 0.7800138628056186, "grad_norm": 1.5080974102020264, "learning_rate": 8.905486929043378e-06, "loss": 1.5839, "step": 2743 }, { "epoch": 0.7802982280490767, "grad_norm": 1.500978946685791, "learning_rate": 8.893995978167193e-06, "loss": 1.7545, "step": 2744 }, { "epoch": 0.7805825932925348, "grad_norm": 1.763393521308899, "learning_rate": 8.882505027291009e-06, "loss": 2.7148, "step": 2745 }, { "epoch": 0.780866958535993, "grad_norm": 1.525657057762146, "learning_rate": 8.871014076414825e-06, "loss": 2.4186, "step": 2746 }, { "epoch": 0.7811513237794511, "grad_norm": 1.4492183923721313, "learning_rate": 8.85952312553864e-06, "loss": 2.1651, "step": 2747 }, { "epoch": 0.7814356890229092, "grad_norm": 1.5383533239364624, "learning_rate": 8.848032174662454e-06, "loss": 2.1932, "step": 2748 }, { "epoch": 0.7817200542663673, "grad_norm": 1.5642298460006714, "learning_rate": 8.836541223786269e-06, "loss": 2.1424, "step": 2749 }, { "epoch": 0.7820044195098255, "grad_norm": 1.4735430479049683, "learning_rate": 8.825050272910083e-06, "loss": 1.7743, "step": 2750 }, { "epoch": 0.7822887847532836, "grad_norm": 1.368130087852478, "learning_rate": 8.8135593220339e-06, "loss": 1.8134, "step": 2751 }, { "epoch": 0.7825731499967417, "grad_norm": 1.6192513704299927, "learning_rate": 8.802068371157714e-06, "loss": 1.7028, "step": 2752 }, { "epoch": 0.7828575152401998, "grad_norm": 1.9017709493637085, "learning_rate": 8.790577420281528e-06, "loss": 2.8195, "step": 2753 }, { "epoch": 0.7831418804836578, "grad_norm": 1.5791484117507935, "learning_rate": 8.779086469405343e-06, "loss": 2.4216, "step": 2754 }, { "epoch": 0.783426245727116, "grad_norm": 1.4580869674682617, "learning_rate": 8.767595518529159e-06, "loss": 2.0641, "step": 2755 }, { "epoch": 0.7837106109705742, "grad_norm": 1.4634650945663452, "learning_rate": 8.756104567652974e-06, "loss": 2.1984, "step": 2756 }, { "epoch": 0.7839949762140322, "grad_norm": 1.5937823057174683, "learning_rate": 8.74461361677679e-06, "loss": 1.9328, "step": 2757 }, { "epoch": 0.7842793414574903, "grad_norm": 1.5349475145339966, "learning_rate": 8.733122665900604e-06, "loss": 1.7728, "step": 2758 }, { "epoch": 0.7845637067009484, "grad_norm": 1.4689794778823853, "learning_rate": 8.721631715024419e-06, "loss": 1.7538, "step": 2759 }, { "epoch": 0.7848480719444066, "grad_norm": 1.59420645236969, "learning_rate": 8.710140764148233e-06, "loss": 1.587, "step": 2760 }, { "epoch": 0.7851324371878647, "grad_norm": 1.7142568826675415, "learning_rate": 8.69864981327205e-06, "loss": 2.6669, "step": 2761 }, { "epoch": 0.7854168024313228, "grad_norm": 1.4829474687576294, "learning_rate": 8.687158862395864e-06, "loss": 2.5419, "step": 2762 }, { "epoch": 0.7857011676747809, "grad_norm": 1.5407049655914307, "learning_rate": 8.675667911519678e-06, "loss": 2.2923, "step": 2763 }, { "epoch": 0.785985532918239, "grad_norm": 1.489302158355713, "learning_rate": 8.664176960643495e-06, "loss": 2.3303, "step": 2764 }, { "epoch": 0.7862698981616972, "grad_norm": 1.5373653173446655, "learning_rate": 8.652686009767309e-06, "loss": 2.0711, "step": 2765 }, { "epoch": 0.7865542634051553, "grad_norm": 1.6663075685501099, "learning_rate": 8.641195058891124e-06, "loss": 1.8473, "step": 2766 }, { "epoch": 0.7868386286486134, "grad_norm": 1.595422387123108, "learning_rate": 8.62970410801494e-06, "loss": 1.7362, "step": 2767 }, { "epoch": 0.7871229938920715, "grad_norm": 1.601948857307434, "learning_rate": 8.618213157138754e-06, "loss": 1.6511, "step": 2768 }, { "epoch": 0.7874073591355296, "grad_norm": 1.8554433584213257, "learning_rate": 8.606722206262569e-06, "loss": 2.7364, "step": 2769 }, { "epoch": 0.7876917243789878, "grad_norm": 1.4515372514724731, "learning_rate": 8.595231255386383e-06, "loss": 2.6112, "step": 2770 }, { "epoch": 0.7879760896224459, "grad_norm": 1.4031703472137451, "learning_rate": 8.583740304510198e-06, "loss": 2.2281, "step": 2771 }, { "epoch": 0.788260454865904, "grad_norm": 1.527431845664978, "learning_rate": 8.572249353634014e-06, "loss": 2.2233, "step": 2772 }, { "epoch": 0.7885448201093621, "grad_norm": 1.635292649269104, "learning_rate": 8.56075840275783e-06, "loss": 2.1587, "step": 2773 }, { "epoch": 0.7888291853528202, "grad_norm": 1.6808167695999146, "learning_rate": 8.549267451881645e-06, "loss": 1.8091, "step": 2774 }, { "epoch": 0.7891135505962784, "grad_norm": 1.5428953170776367, "learning_rate": 8.537776501005459e-06, "loss": 1.9059, "step": 2775 }, { "epoch": 0.7893979158397365, "grad_norm": 1.5486587285995483, "learning_rate": 8.526285550129274e-06, "loss": 1.6714, "step": 2776 }, { "epoch": 0.7896822810831946, "grad_norm": 1.6755555868148804, "learning_rate": 8.514794599253088e-06, "loss": 2.6297, "step": 2777 }, { "epoch": 0.7899666463266527, "grad_norm": 1.4705214500427246, "learning_rate": 8.503303648376904e-06, "loss": 2.2428, "step": 2778 }, { "epoch": 0.7902510115701108, "grad_norm": 1.4628565311431885, "learning_rate": 8.491812697500719e-06, "loss": 2.0421, "step": 2779 }, { "epoch": 0.790535376813569, "grad_norm": 1.5100350379943848, "learning_rate": 8.480321746624533e-06, "loss": 2.2767, "step": 2780 }, { "epoch": 0.7908197420570271, "grad_norm": 1.6801701784133911, "learning_rate": 8.468830795748348e-06, "loss": 1.9363, "step": 2781 }, { "epoch": 0.7911041073004852, "grad_norm": 1.6616177558898926, "learning_rate": 8.457339844872164e-06, "loss": 2.0598, "step": 2782 }, { "epoch": 0.7913884725439433, "grad_norm": 1.5380514860153198, "learning_rate": 8.445848893995979e-06, "loss": 1.7708, "step": 2783 }, { "epoch": 0.7916728377874014, "grad_norm": 1.5327441692352295, "learning_rate": 8.434357943119795e-06, "loss": 1.7078, "step": 2784 }, { "epoch": 0.7919572030308596, "grad_norm": 1.597601294517517, "learning_rate": 8.42286699224361e-06, "loss": 2.5618, "step": 2785 }, { "epoch": 0.7922415682743177, "grad_norm": 1.663575291633606, "learning_rate": 8.411376041367424e-06, "loss": 2.422, "step": 2786 }, { "epoch": 0.7925259335177758, "grad_norm": 1.4837387800216675, "learning_rate": 8.399885090491238e-06, "loss": 2.3795, "step": 2787 }, { "epoch": 0.7928102987612339, "grad_norm": 1.405079960823059, "learning_rate": 8.388394139615054e-06, "loss": 2.2017, "step": 2788 }, { "epoch": 0.793094664004692, "grad_norm": 1.6203116178512573, "learning_rate": 8.376903188738869e-06, "loss": 2.1513, "step": 2789 }, { "epoch": 0.7933790292481502, "grad_norm": 1.4136089086532593, "learning_rate": 8.365412237862683e-06, "loss": 1.6959, "step": 2790 }, { "epoch": 0.7936633944916083, "grad_norm": 1.5652225017547607, "learning_rate": 8.353921286986498e-06, "loss": 1.7965, "step": 2791 }, { "epoch": 0.7939477597350664, "grad_norm": 1.5910707712173462, "learning_rate": 8.342430336110314e-06, "loss": 1.6998, "step": 2792 }, { "epoch": 0.7942321249785245, "grad_norm": 1.7475180625915527, "learning_rate": 8.330939385234129e-06, "loss": 2.6038, "step": 2793 }, { "epoch": 0.7945164902219826, "grad_norm": 1.538881540298462, "learning_rate": 8.319448434357945e-06, "loss": 2.217, "step": 2794 }, { "epoch": 0.7948008554654408, "grad_norm": 1.5075610876083374, "learning_rate": 8.30795748348176e-06, "loss": 2.4162, "step": 2795 }, { "epoch": 0.7950852207088989, "grad_norm": 1.4764316082000732, "learning_rate": 8.296466532605574e-06, "loss": 2.1782, "step": 2796 }, { "epoch": 0.795369585952357, "grad_norm": 1.5661760568618774, "learning_rate": 8.284975581729388e-06, "loss": 2.1276, "step": 2797 }, { "epoch": 0.7956539511958151, "grad_norm": 1.4711788892745972, "learning_rate": 8.273484630853203e-06, "loss": 1.7211, "step": 2798 }, { "epoch": 0.7959383164392733, "grad_norm": 1.5930848121643066, "learning_rate": 8.261993679977019e-06, "loss": 1.6124, "step": 2799 }, { "epoch": 0.7962226816827314, "grad_norm": 1.6392720937728882, "learning_rate": 8.250502729100833e-06, "loss": 1.8286, "step": 2800 }, { "epoch": 0.7965070469261895, "grad_norm": 1.7386542558670044, "learning_rate": 8.23901177822465e-06, "loss": 2.7341, "step": 2801 }, { "epoch": 0.7967914121696476, "grad_norm": 1.4948952198028564, "learning_rate": 8.227520827348464e-06, "loss": 2.4624, "step": 2802 }, { "epoch": 0.7970757774131056, "grad_norm": 1.3838168382644653, "learning_rate": 8.216029876472279e-06, "loss": 2.3202, "step": 2803 }, { "epoch": 0.7973601426565639, "grad_norm": 1.4264721870422363, "learning_rate": 8.204538925596093e-06, "loss": 2.2274, "step": 2804 }, { "epoch": 0.797644507900022, "grad_norm": 1.5542453527450562, "learning_rate": 8.19304797471991e-06, "loss": 2.0876, "step": 2805 }, { "epoch": 0.79792887314348, "grad_norm": 1.5131659507751465, "learning_rate": 8.181557023843724e-06, "loss": 1.8612, "step": 2806 }, { "epoch": 0.7982132383869381, "grad_norm": 1.5086268186569214, "learning_rate": 8.170066072967538e-06, "loss": 1.7961, "step": 2807 }, { "epoch": 0.7984976036303962, "grad_norm": 1.5793956518173218, "learning_rate": 8.158575122091353e-06, "loss": 1.6068, "step": 2808 }, { "epoch": 0.7987819688738544, "grad_norm": 1.7621535062789917, "learning_rate": 8.147084171215169e-06, "loss": 2.4022, "step": 2809 }, { "epoch": 0.7990663341173125, "grad_norm": 1.5189746618270874, "learning_rate": 8.135593220338983e-06, "loss": 2.3452, "step": 2810 }, { "epoch": 0.7993506993607706, "grad_norm": 1.418518304824829, "learning_rate": 8.1241022694628e-06, "loss": 1.9866, "step": 2811 }, { "epoch": 0.7996350646042287, "grad_norm": 1.5955320596694946, "learning_rate": 8.112611318586614e-06, "loss": 2.1267, "step": 2812 }, { "epoch": 0.7999194298476868, "grad_norm": 1.7210025787353516, "learning_rate": 8.101120367710429e-06, "loss": 1.9749, "step": 2813 }, { "epoch": 0.800203795091145, "grad_norm": 1.5319161415100098, "learning_rate": 8.089629416834243e-06, "loss": 1.9589, "step": 2814 }, { "epoch": 0.8004881603346031, "grad_norm": 1.6114243268966675, "learning_rate": 8.07813846595806e-06, "loss": 1.6376, "step": 2815 }, { "epoch": 0.8007725255780612, "grad_norm": 1.6235712766647339, "learning_rate": 8.066647515081874e-06, "loss": 1.7676, "step": 2816 }, { "epoch": 0.8010568908215193, "grad_norm": 1.5523582696914673, "learning_rate": 8.055156564205688e-06, "loss": 2.7133, "step": 2817 }, { "epoch": 0.8013412560649774, "grad_norm": 1.4934026002883911, "learning_rate": 8.043665613329503e-06, "loss": 2.5263, "step": 2818 }, { "epoch": 0.8016256213084356, "grad_norm": 1.4201688766479492, "learning_rate": 8.032174662453317e-06, "loss": 2.214, "step": 2819 }, { "epoch": 0.8019099865518937, "grad_norm": 1.5365192890167236, "learning_rate": 8.020683711577134e-06, "loss": 2.2509, "step": 2820 }, { "epoch": 0.8021943517953518, "grad_norm": 1.5041007995605469, "learning_rate": 8.00919276070095e-06, "loss": 2.1733, "step": 2821 }, { "epoch": 0.8024787170388099, "grad_norm": 1.4607468843460083, "learning_rate": 7.997701809824764e-06, "loss": 1.8385, "step": 2822 }, { "epoch": 0.802763082282268, "grad_norm": 1.6158239841461182, "learning_rate": 7.986210858948579e-06, "loss": 1.8724, "step": 2823 }, { "epoch": 0.8030474475257262, "grad_norm": 1.6089496612548828, "learning_rate": 7.974719908072393e-06, "loss": 1.746, "step": 2824 }, { "epoch": 0.8033318127691843, "grad_norm": 1.6502569913864136, "learning_rate": 7.963228957196208e-06, "loss": 2.6797, "step": 2825 }, { "epoch": 0.8036161780126424, "grad_norm": 1.4397003650665283, "learning_rate": 7.951738006320024e-06, "loss": 2.2595, "step": 2826 }, { "epoch": 0.8039005432561005, "grad_norm": 1.5017975568771362, "learning_rate": 7.940247055443838e-06, "loss": 2.3404, "step": 2827 }, { "epoch": 0.8041849084995586, "grad_norm": 1.4788340330123901, "learning_rate": 7.928756104567653e-06, "loss": 2.2218, "step": 2828 }, { "epoch": 0.8044692737430168, "grad_norm": 1.5507515668869019, "learning_rate": 7.917265153691469e-06, "loss": 2.0377, "step": 2829 }, { "epoch": 0.8047536389864749, "grad_norm": 1.5333271026611328, "learning_rate": 7.905774202815284e-06, "loss": 1.948, "step": 2830 }, { "epoch": 0.805038004229933, "grad_norm": 1.4439997673034668, "learning_rate": 7.894283251939098e-06, "loss": 1.7283, "step": 2831 }, { "epoch": 0.8053223694733911, "grad_norm": 1.5981254577636719, "learning_rate": 7.882792301062914e-06, "loss": 1.899, "step": 2832 }, { "epoch": 0.8056067347168492, "grad_norm": 1.6823999881744385, "learning_rate": 7.871301350186729e-06, "loss": 2.8227, "step": 2833 }, { "epoch": 0.8058910999603074, "grad_norm": 1.593544840812683, "learning_rate": 7.859810399310543e-06, "loss": 2.4295, "step": 2834 }, { "epoch": 0.8061754652037655, "grad_norm": 1.3846356868743896, "learning_rate": 7.848319448434358e-06, "loss": 2.0042, "step": 2835 }, { "epoch": 0.8064598304472236, "grad_norm": 1.4523168802261353, "learning_rate": 7.836828497558174e-06, "loss": 2.0895, "step": 2836 }, { "epoch": 0.8067441956906817, "grad_norm": 1.5731124877929688, "learning_rate": 7.825337546681988e-06, "loss": 2.2216, "step": 2837 }, { "epoch": 0.8070285609341398, "grad_norm": 1.5912824869155884, "learning_rate": 7.813846595805805e-06, "loss": 2.0229, "step": 2838 }, { "epoch": 0.807312926177598, "grad_norm": 1.5180526971817017, "learning_rate": 7.80235564492962e-06, "loss": 1.8176, "step": 2839 }, { "epoch": 0.8075972914210561, "grad_norm": 1.5623652935028076, "learning_rate": 7.790864694053434e-06, "loss": 1.8123, "step": 2840 }, { "epoch": 0.8078816566645142, "grad_norm": 1.7535358667373657, "learning_rate": 7.779373743177248e-06, "loss": 2.7814, "step": 2841 }, { "epoch": 0.8081660219079723, "grad_norm": 1.5314868688583374, "learning_rate": 7.767882792301064e-06, "loss": 2.2653, "step": 2842 }, { "epoch": 0.8084503871514305, "grad_norm": 1.4723985195159912, "learning_rate": 7.756391841424879e-06, "loss": 2.2257, "step": 2843 }, { "epoch": 0.8087347523948886, "grad_norm": 1.5310198068618774, "learning_rate": 7.744900890548693e-06, "loss": 2.0548, "step": 2844 }, { "epoch": 0.8090191176383467, "grad_norm": 1.6289122104644775, "learning_rate": 7.733409939672508e-06, "loss": 2.1181, "step": 2845 }, { "epoch": 0.8093034828818048, "grad_norm": 1.669812798500061, "learning_rate": 7.721918988796322e-06, "loss": 1.6431, "step": 2846 }, { "epoch": 0.8095878481252629, "grad_norm": 1.6252498626708984, "learning_rate": 7.710428037920139e-06, "loss": 1.7344, "step": 2847 }, { "epoch": 0.8098722133687211, "grad_norm": 1.6311345100402832, "learning_rate": 7.698937087043955e-06, "loss": 1.8725, "step": 2848 }, { "epoch": 0.8101565786121792, "grad_norm": 1.6725844144821167, "learning_rate": 7.68744613616777e-06, "loss": 2.5774, "step": 2849 }, { "epoch": 0.8104409438556373, "grad_norm": 1.5067769289016724, "learning_rate": 7.675955185291584e-06, "loss": 2.2774, "step": 2850 }, { "epoch": 0.8107253090990953, "grad_norm": 1.5363556146621704, "learning_rate": 7.664464234415398e-06, "loss": 2.1134, "step": 2851 }, { "epoch": 0.8110096743425534, "grad_norm": 1.4846501350402832, "learning_rate": 7.652973283539213e-06, "loss": 2.0919, "step": 2852 }, { "epoch": 0.8112940395860117, "grad_norm": 1.6034198999404907, "learning_rate": 7.641482332663029e-06, "loss": 2.0541, "step": 2853 }, { "epoch": 0.8115784048294697, "grad_norm": 1.49429452419281, "learning_rate": 7.629991381786843e-06, "loss": 1.6646, "step": 2854 }, { "epoch": 0.8118627700729278, "grad_norm": 1.4957629442214966, "learning_rate": 7.618500430910658e-06, "loss": 1.8006, "step": 2855 }, { "epoch": 0.8121471353163859, "grad_norm": 1.5566238164901733, "learning_rate": 7.607009480034473e-06, "loss": 1.7759, "step": 2856 }, { "epoch": 0.812431500559844, "grad_norm": 1.7334346771240234, "learning_rate": 7.595518529158289e-06, "loss": 2.607, "step": 2857 }, { "epoch": 0.8127158658033022, "grad_norm": 1.5797719955444336, "learning_rate": 7.584027578282104e-06, "loss": 2.4472, "step": 2858 }, { "epoch": 0.8130002310467603, "grad_norm": 1.4985685348510742, "learning_rate": 7.572536627405918e-06, "loss": 2.4235, "step": 2859 }, { "epoch": 0.8132845962902184, "grad_norm": 1.4722380638122559, "learning_rate": 7.561045676529734e-06, "loss": 2.1274, "step": 2860 }, { "epoch": 0.8135689615336765, "grad_norm": 1.4994633197784424, "learning_rate": 7.549554725653548e-06, "loss": 2.074, "step": 2861 }, { "epoch": 0.8138533267771346, "grad_norm": 1.689273476600647, "learning_rate": 7.538063774777364e-06, "loss": 2.0419, "step": 2862 }, { "epoch": 0.8141376920205928, "grad_norm": 1.5861849784851074, "learning_rate": 7.526572823901178e-06, "loss": 1.9478, "step": 2863 }, { "epoch": 0.8144220572640509, "grad_norm": 1.5588481426239014, "learning_rate": 7.5150818730249935e-06, "loss": 1.7538, "step": 2864 }, { "epoch": 0.814706422507509, "grad_norm": 1.675533413887024, "learning_rate": 7.503590922148808e-06, "loss": 2.7183, "step": 2865 }, { "epoch": 0.8149907877509671, "grad_norm": 1.4782207012176514, "learning_rate": 7.492099971272624e-06, "loss": 2.4106, "step": 2866 }, { "epoch": 0.8152751529944252, "grad_norm": 1.4612501859664917, "learning_rate": 7.480609020396439e-06, "loss": 2.4035, "step": 2867 }, { "epoch": 0.8155595182378834, "grad_norm": 1.5237157344818115, "learning_rate": 7.469118069520254e-06, "loss": 2.203, "step": 2868 }, { "epoch": 0.8158438834813415, "grad_norm": 1.601366639137268, "learning_rate": 7.4576271186440685e-06, "loss": 2.0569, "step": 2869 }, { "epoch": 0.8161282487247996, "grad_norm": 1.5333240032196045, "learning_rate": 7.446136167767883e-06, "loss": 1.932, "step": 2870 }, { "epoch": 0.8164126139682577, "grad_norm": 1.6135165691375732, "learning_rate": 7.434645216891698e-06, "loss": 1.879, "step": 2871 }, { "epoch": 0.8166969792117158, "grad_norm": 1.542605996131897, "learning_rate": 7.423154266015513e-06, "loss": 1.7374, "step": 2872 }, { "epoch": 0.816981344455174, "grad_norm": 1.7298074960708618, "learning_rate": 7.411663315139328e-06, "loss": 2.7893, "step": 2873 }, { "epoch": 0.8172657096986321, "grad_norm": 1.439632773399353, "learning_rate": 7.400172364263143e-06, "loss": 2.1772, "step": 2874 }, { "epoch": 0.8175500749420902, "grad_norm": 1.3654167652130127, "learning_rate": 7.388681413386959e-06, "loss": 2.2241, "step": 2875 }, { "epoch": 0.8178344401855483, "grad_norm": 1.4207921028137207, "learning_rate": 7.377190462510773e-06, "loss": 2.1142, "step": 2876 }, { "epoch": 0.8181188054290064, "grad_norm": 1.5437437295913696, "learning_rate": 7.365699511634589e-06, "loss": 2.0925, "step": 2877 }, { "epoch": 0.8184031706724646, "grad_norm": 1.6208105087280273, "learning_rate": 7.354208560758403e-06, "loss": 1.9039, "step": 2878 }, { "epoch": 0.8186875359159227, "grad_norm": 1.5807172060012817, "learning_rate": 7.3427176098822185e-06, "loss": 1.8186, "step": 2879 }, { "epoch": 0.8189719011593808, "grad_norm": 1.6478073596954346, "learning_rate": 7.331226659006033e-06, "loss": 1.7717, "step": 2880 }, { "epoch": 0.8192562664028389, "grad_norm": 1.6527962684631348, "learning_rate": 7.319735708129848e-06, "loss": 2.7834, "step": 2881 }, { "epoch": 0.819540631646297, "grad_norm": 1.4940738677978516, "learning_rate": 7.308244757253663e-06, "loss": 2.2835, "step": 2882 }, { "epoch": 0.8198249968897552, "grad_norm": 1.490037202835083, "learning_rate": 7.296753806377478e-06, "loss": 2.1039, "step": 2883 }, { "epoch": 0.8201093621332133, "grad_norm": 1.4153006076812744, "learning_rate": 7.2852628555012936e-06, "loss": 2.1536, "step": 2884 }, { "epoch": 0.8203937273766714, "grad_norm": 1.5724902153015137, "learning_rate": 7.273771904625109e-06, "loss": 1.9262, "step": 2885 }, { "epoch": 0.8206780926201295, "grad_norm": 1.4987521171569824, "learning_rate": 7.262280953748923e-06, "loss": 1.9364, "step": 2886 }, { "epoch": 0.8209624578635876, "grad_norm": 1.650689721107483, "learning_rate": 7.250790002872739e-06, "loss": 1.7343, "step": 2887 }, { "epoch": 0.8212468231070458, "grad_norm": 1.5347603559494019, "learning_rate": 7.239299051996553e-06, "loss": 1.9424, "step": 2888 }, { "epoch": 0.8215311883505039, "grad_norm": 1.7783067226409912, "learning_rate": 7.227808101120369e-06, "loss": 2.81, "step": 2889 }, { "epoch": 0.821815553593962, "grad_norm": 1.4985435009002686, "learning_rate": 7.216317150244183e-06, "loss": 2.3066, "step": 2890 }, { "epoch": 0.8220999188374201, "grad_norm": 1.5006333589553833, "learning_rate": 7.204826199367998e-06, "loss": 2.191, "step": 2891 }, { "epoch": 0.8223842840808783, "grad_norm": 1.4915610551834106, "learning_rate": 7.193335248491813e-06, "loss": 2.0556, "step": 2892 }, { "epoch": 0.8226686493243364, "grad_norm": 1.5882519483566284, "learning_rate": 7.1818442976156274e-06, "loss": 2.0908, "step": 2893 }, { "epoch": 0.8229530145677945, "grad_norm": 1.4619516134262085, "learning_rate": 7.170353346739444e-06, "loss": 1.9773, "step": 2894 }, { "epoch": 0.8232373798112526, "grad_norm": 1.5004616975784302, "learning_rate": 7.158862395863259e-06, "loss": 1.9638, "step": 2895 }, { "epoch": 0.8235217450547107, "grad_norm": 1.5177069902420044, "learning_rate": 7.1473714449870735e-06, "loss": 1.6566, "step": 2896 }, { "epoch": 0.8238061102981689, "grad_norm": 1.724021315574646, "learning_rate": 7.135880494110888e-06, "loss": 2.7129, "step": 2897 }, { "epoch": 0.824090475541627, "grad_norm": 1.537143349647522, "learning_rate": 7.124389543234703e-06, "loss": 2.3473, "step": 2898 }, { "epoch": 0.824374840785085, "grad_norm": 1.3727134466171265, "learning_rate": 7.112898592358518e-06, "loss": 2.4072, "step": 2899 }, { "epoch": 0.8246592060285431, "grad_norm": 1.4793521165847778, "learning_rate": 7.101407641482333e-06, "loss": 2.1391, "step": 2900 }, { "epoch": 0.8249435712720012, "grad_norm": 1.4474788904190063, "learning_rate": 7.089916690606148e-06, "loss": 2.084, "step": 2901 }, { "epoch": 0.8252279365154594, "grad_norm": 1.6021233797073364, "learning_rate": 7.078425739729963e-06, "loss": 1.8724, "step": 2902 }, { "epoch": 0.8255123017589175, "grad_norm": 1.5263599157333374, "learning_rate": 7.066934788853778e-06, "loss": 1.7535, "step": 2903 }, { "epoch": 0.8257966670023756, "grad_norm": 1.5314630270004272, "learning_rate": 7.055443837977594e-06, "loss": 1.7294, "step": 2904 }, { "epoch": 0.8260810322458337, "grad_norm": 1.6846193075180054, "learning_rate": 7.043952887101408e-06, "loss": 2.6378, "step": 2905 }, { "epoch": 0.8263653974892918, "grad_norm": 1.6511030197143555, "learning_rate": 7.0324619362252235e-06, "loss": 2.4237, "step": 2906 }, { "epoch": 0.82664976273275, "grad_norm": 1.453942894935608, "learning_rate": 7.020970985349038e-06, "loss": 2.3405, "step": 2907 }, { "epoch": 0.8269341279762081, "grad_norm": 1.4035779237747192, "learning_rate": 7.009480034472853e-06, "loss": 2.0887, "step": 2908 }, { "epoch": 0.8272184932196662, "grad_norm": 1.5787320137023926, "learning_rate": 6.997989083596668e-06, "loss": 2.1067, "step": 2909 }, { "epoch": 0.8275028584631243, "grad_norm": 1.6145730018615723, "learning_rate": 6.986498132720483e-06, "loss": 1.9487, "step": 2910 }, { "epoch": 0.8277872237065824, "grad_norm": 1.4792077541351318, "learning_rate": 6.975007181844298e-06, "loss": 1.8149, "step": 2911 }, { "epoch": 0.8280715889500406, "grad_norm": 1.5759203433990479, "learning_rate": 6.963516230968114e-06, "loss": 1.8977, "step": 2912 }, { "epoch": 0.8283559541934987, "grad_norm": 1.6006420850753784, "learning_rate": 6.952025280091928e-06, "loss": 2.8743, "step": 2913 }, { "epoch": 0.8286403194369568, "grad_norm": 1.4774819612503052, "learning_rate": 6.940534329215744e-06, "loss": 2.3842, "step": 2914 }, { "epoch": 0.8289246846804149, "grad_norm": 1.4915317296981812, "learning_rate": 6.929043378339558e-06, "loss": 2.2381, "step": 2915 }, { "epoch": 0.829209049923873, "grad_norm": 1.4956070184707642, "learning_rate": 6.9175524274633736e-06, "loss": 2.2751, "step": 2916 }, { "epoch": 0.8294934151673312, "grad_norm": 1.5358397960662842, "learning_rate": 6.906061476587188e-06, "loss": 2.0869, "step": 2917 }, { "epoch": 0.8297777804107893, "grad_norm": 1.4759082794189453, "learning_rate": 6.8945705257110026e-06, "loss": 1.6791, "step": 2918 }, { "epoch": 0.8300621456542474, "grad_norm": 1.5984234809875488, "learning_rate": 6.883079574834818e-06, "loss": 1.7364, "step": 2919 }, { "epoch": 0.8303465108977055, "grad_norm": 1.6775093078613281, "learning_rate": 6.871588623958632e-06, "loss": 1.734, "step": 2920 }, { "epoch": 0.8306308761411636, "grad_norm": 1.687727928161621, "learning_rate": 6.860097673082449e-06, "loss": 2.5652, "step": 2921 }, { "epoch": 0.8309152413846218, "grad_norm": 1.4916083812713623, "learning_rate": 6.848606722206264e-06, "loss": 2.2731, "step": 2922 }, { "epoch": 0.8311996066280799, "grad_norm": 1.407136082649231, "learning_rate": 6.8371157713300784e-06, "loss": 2.1255, "step": 2923 }, { "epoch": 0.831483971871538, "grad_norm": 1.390185832977295, "learning_rate": 6.825624820453893e-06, "loss": 2.043, "step": 2924 }, { "epoch": 0.8317683371149961, "grad_norm": 1.599908471107483, "learning_rate": 6.814133869577708e-06, "loss": 2.3225, "step": 2925 }, { "epoch": 0.8320527023584542, "grad_norm": 1.5953835248947144, "learning_rate": 6.802642918701523e-06, "loss": 1.7706, "step": 2926 }, { "epoch": 0.8323370676019124, "grad_norm": 1.5901626348495483, "learning_rate": 6.791151967825338e-06, "loss": 1.7223, "step": 2927 }, { "epoch": 0.8326214328453705, "grad_norm": 1.617637038230896, "learning_rate": 6.779661016949153e-06, "loss": 1.9098, "step": 2928 }, { "epoch": 0.8329057980888286, "grad_norm": 1.7057212591171265, "learning_rate": 6.768170066072968e-06, "loss": 2.6873, "step": 2929 }, { "epoch": 0.8331901633322867, "grad_norm": 1.514479637145996, "learning_rate": 6.7566791151967825e-06, "loss": 2.2949, "step": 2930 }, { "epoch": 0.8334745285757448, "grad_norm": 1.5152493715286255, "learning_rate": 6.745188164320599e-06, "loss": 2.2854, "step": 2931 }, { "epoch": 0.833758893819203, "grad_norm": 1.4838507175445557, "learning_rate": 6.733697213444413e-06, "loss": 1.9852, "step": 2932 }, { "epoch": 0.8340432590626611, "grad_norm": 1.5288348197937012, "learning_rate": 6.7222062625682285e-06, "loss": 1.994, "step": 2933 }, { "epoch": 0.8343276243061192, "grad_norm": 1.4767318964004517, "learning_rate": 6.710715311692043e-06, "loss": 1.6974, "step": 2934 }, { "epoch": 0.8346119895495773, "grad_norm": 1.5459532737731934, "learning_rate": 6.699224360815858e-06, "loss": 1.9026, "step": 2935 }, { "epoch": 0.8348963547930354, "grad_norm": 1.5643906593322754, "learning_rate": 6.687733409939673e-06, "loss": 1.7265, "step": 2936 }, { "epoch": 0.8351807200364936, "grad_norm": 1.6530026197433472, "learning_rate": 6.676242459063488e-06, "loss": 2.6701, "step": 2937 }, { "epoch": 0.8354650852799517, "grad_norm": 1.464538812637329, "learning_rate": 6.664751508187303e-06, "loss": 2.3489, "step": 2938 }, { "epoch": 0.8357494505234098, "grad_norm": 1.469451904296875, "learning_rate": 6.653260557311117e-06, "loss": 2.242, "step": 2939 }, { "epoch": 0.8360338157668679, "grad_norm": 1.4956061840057373, "learning_rate": 6.641769606434933e-06, "loss": 2.0947, "step": 2940 }, { "epoch": 0.8363181810103261, "grad_norm": 1.5736510753631592, "learning_rate": 6.630278655558749e-06, "loss": 2.1535, "step": 2941 }, { "epoch": 0.8366025462537842, "grad_norm": 1.4406914710998535, "learning_rate": 6.618787704682563e-06, "loss": 1.8535, "step": 2942 }, { "epoch": 0.8368869114972423, "grad_norm": 1.5601533651351929, "learning_rate": 6.6072967538063786e-06, "loss": 1.9428, "step": 2943 }, { "epoch": 0.8371712767407004, "grad_norm": 1.4611488580703735, "learning_rate": 6.595805802930193e-06, "loss": 1.6424, "step": 2944 }, { "epoch": 0.8374556419841584, "grad_norm": 1.6867340803146362, "learning_rate": 6.5843148520540075e-06, "loss": 2.6731, "step": 2945 }, { "epoch": 0.8377400072276167, "grad_norm": 1.4799604415893555, "learning_rate": 6.572823901177823e-06, "loss": 2.3161, "step": 2946 }, { "epoch": 0.8380243724710748, "grad_norm": 1.4663941860198975, "learning_rate": 6.561332950301637e-06, "loss": 2.2613, "step": 2947 }, { "epoch": 0.8383087377145328, "grad_norm": 1.496065616607666, "learning_rate": 6.549841999425453e-06, "loss": 1.9605, "step": 2948 }, { "epoch": 0.8385931029579909, "grad_norm": 1.416529893875122, "learning_rate": 6.538351048549268e-06, "loss": 2.041, "step": 2949 }, { "epoch": 0.838877468201449, "grad_norm": 1.5188525915145874, "learning_rate": 6.5268600976730834e-06, "loss": 2.0049, "step": 2950 }, { "epoch": 0.8391618334449072, "grad_norm": 1.6561343669891357, "learning_rate": 6.515369146796898e-06, "loss": 1.7761, "step": 2951 }, { "epoch": 0.8394461986883653, "grad_norm": 1.8842443227767944, "learning_rate": 6.503878195920713e-06, "loss": 1.8444, "step": 2952 }, { "epoch": 0.8397305639318234, "grad_norm": 1.5928751230239868, "learning_rate": 6.492387245044528e-06, "loss": 2.7766, "step": 2953 }, { "epoch": 0.8400149291752815, "grad_norm": 1.551367998123169, "learning_rate": 6.480896294168343e-06, "loss": 2.3994, "step": 2954 }, { "epoch": 0.8402992944187396, "grad_norm": 1.4871084690093994, "learning_rate": 6.469405343292158e-06, "loss": 2.2015, "step": 2955 }, { "epoch": 0.8405836596621978, "grad_norm": 1.6148115396499634, "learning_rate": 6.457914392415973e-06, "loss": 2.1799, "step": 2956 }, { "epoch": 0.8408680249056559, "grad_norm": 1.5156315565109253, "learning_rate": 6.4464234415397874e-06, "loss": 2.031, "step": 2957 }, { "epoch": 0.841152390149114, "grad_norm": 1.4778313636779785, "learning_rate": 6.434932490663604e-06, "loss": 1.9324, "step": 2958 }, { "epoch": 0.8414367553925721, "grad_norm": 1.5155595541000366, "learning_rate": 6.423441539787418e-06, "loss": 1.7774, "step": 2959 }, { "epoch": 0.8417211206360302, "grad_norm": 1.6561177968978882, "learning_rate": 6.4119505889112335e-06, "loss": 1.724, "step": 2960 }, { "epoch": 0.8420054858794884, "grad_norm": 2.0607149600982666, "learning_rate": 6.400459638035048e-06, "loss": 2.829, "step": 2961 }, { "epoch": 0.8422898511229465, "grad_norm": 1.4684460163116455, "learning_rate": 6.388968687158863e-06, "loss": 2.5184, "step": 2962 }, { "epoch": 0.8425742163664046, "grad_norm": 1.4178595542907715, "learning_rate": 6.377477736282678e-06, "loss": 2.165, "step": 2963 }, { "epoch": 0.8428585816098627, "grad_norm": 1.4885823726654053, "learning_rate": 6.365986785406492e-06, "loss": 2.027, "step": 2964 }, { "epoch": 0.8431429468533208, "grad_norm": 1.5183528661727905, "learning_rate": 6.354495834530308e-06, "loss": 1.9819, "step": 2965 }, { "epoch": 0.843427312096779, "grad_norm": 1.3794087171554565, "learning_rate": 6.343004883654122e-06, "loss": 1.5737, "step": 2966 }, { "epoch": 0.8437116773402371, "grad_norm": 1.5426974296569824, "learning_rate": 6.3315139327779375e-06, "loss": 1.6998, "step": 2967 }, { "epoch": 0.8439960425836952, "grad_norm": 1.6509177684783936, "learning_rate": 6.320022981901754e-06, "loss": 1.8888, "step": 2968 }, { "epoch": 0.8442804078271533, "grad_norm": 1.6451570987701416, "learning_rate": 6.308532031025568e-06, "loss": 2.8388, "step": 2969 }, { "epoch": 0.8445647730706114, "grad_norm": 1.47929048538208, "learning_rate": 6.297041080149383e-06, "loss": 2.4065, "step": 2970 }, { "epoch": 0.8448491383140696, "grad_norm": 1.3882609605789185, "learning_rate": 6.285550129273198e-06, "loss": 2.1397, "step": 2971 }, { "epoch": 0.8451335035575277, "grad_norm": 1.4089328050613403, "learning_rate": 6.2740591783970125e-06, "loss": 2.13, "step": 2972 }, { "epoch": 0.8454178688009858, "grad_norm": 1.5933607816696167, "learning_rate": 6.262568227520828e-06, "loss": 2.1391, "step": 2973 }, { "epoch": 0.8457022340444439, "grad_norm": 1.4820233583450317, "learning_rate": 6.251077276644642e-06, "loss": 1.9708, "step": 2974 }, { "epoch": 0.845986599287902, "grad_norm": 1.4918299913406372, "learning_rate": 6.239586325768458e-06, "loss": 1.8649, "step": 2975 }, { "epoch": 0.8462709645313602, "grad_norm": 1.7036802768707275, "learning_rate": 6.228095374892272e-06, "loss": 1.8209, "step": 2976 }, { "epoch": 0.8465553297748183, "grad_norm": 1.7224177122116089, "learning_rate": 6.216604424016088e-06, "loss": 2.6908, "step": 2977 }, { "epoch": 0.8468396950182764, "grad_norm": 1.5237011909484863, "learning_rate": 6.205113473139903e-06, "loss": 2.4184, "step": 2978 }, { "epoch": 0.8471240602617345, "grad_norm": 1.4036983251571655, "learning_rate": 6.193622522263718e-06, "loss": 2.0392, "step": 2979 }, { "epoch": 0.8474084255051926, "grad_norm": 1.3745940923690796, "learning_rate": 6.182131571387533e-06, "loss": 2.136, "step": 2980 }, { "epoch": 0.8476927907486508, "grad_norm": 1.3899210691452026, "learning_rate": 6.170640620511348e-06, "loss": 1.8753, "step": 2981 }, { "epoch": 0.8479771559921089, "grad_norm": 1.5072089433670044, "learning_rate": 6.159149669635163e-06, "loss": 1.8412, "step": 2982 }, { "epoch": 0.848261521235567, "grad_norm": 1.4439475536346436, "learning_rate": 6.147658718758978e-06, "loss": 1.5971, "step": 2983 }, { "epoch": 0.8485458864790251, "grad_norm": 1.7163138389587402, "learning_rate": 6.1361677678827924e-06, "loss": 1.6792, "step": 2984 }, { "epoch": 0.8488302517224832, "grad_norm": 1.6258455514907837, "learning_rate": 6.124676817006607e-06, "loss": 2.6053, "step": 2985 }, { "epoch": 0.8491146169659414, "grad_norm": 1.390105962753296, "learning_rate": 6.113185866130423e-06, "loss": 2.3118, "step": 2986 }, { "epoch": 0.8493989822093995, "grad_norm": 1.5217338800430298, "learning_rate": 6.1016949152542385e-06, "loss": 2.1152, "step": 2987 }, { "epoch": 0.8496833474528576, "grad_norm": 1.5048413276672363, "learning_rate": 6.090203964378053e-06, "loss": 2.2262, "step": 2988 }, { "epoch": 0.8499677126963157, "grad_norm": 1.4718562364578247, "learning_rate": 6.078713013501868e-06, "loss": 2.1621, "step": 2989 }, { "epoch": 0.8502520779397739, "grad_norm": 1.3779963254928589, "learning_rate": 6.067222062625683e-06, "loss": 1.6434, "step": 2990 }, { "epoch": 0.850536443183232, "grad_norm": 1.5905683040618896, "learning_rate": 6.055731111749497e-06, "loss": 1.7679, "step": 2991 }, { "epoch": 0.8508208084266901, "grad_norm": 1.6249622106552124, "learning_rate": 6.044240160873313e-06, "loss": 1.661, "step": 2992 }, { "epoch": 0.8511051736701482, "grad_norm": 1.6707121133804321, "learning_rate": 6.032749209997127e-06, "loss": 2.5263, "step": 2993 }, { "epoch": 0.8513895389136062, "grad_norm": 1.5046759843826294, "learning_rate": 6.0212582591209425e-06, "loss": 2.2967, "step": 2994 }, { "epoch": 0.8516739041570645, "grad_norm": 1.5674413442611694, "learning_rate": 6.009767308244759e-06, "loss": 2.0386, "step": 2995 }, { "epoch": 0.8519582694005225, "grad_norm": 1.4231250286102295, "learning_rate": 5.998276357368573e-06, "loss": 2.318, "step": 2996 }, { "epoch": 0.8522426346439806, "grad_norm": 1.5579756498336792, "learning_rate": 5.986785406492388e-06, "loss": 2.0154, "step": 2997 }, { "epoch": 0.8525269998874387, "grad_norm": 1.404220700263977, "learning_rate": 5.975294455616203e-06, "loss": 1.7704, "step": 2998 }, { "epoch": 0.8528113651308968, "grad_norm": 1.4892497062683105, "learning_rate": 5.9638035047400175e-06, "loss": 1.7247, "step": 2999 }, { "epoch": 0.853095730374355, "grad_norm": 1.6335406303405762, "learning_rate": 5.952312553863833e-06, "loss": 1.5733, "step": 3000 }, { "epoch": 0.8533800956178131, "grad_norm": 1.781394124031067, "learning_rate": 5.940821602987647e-06, "loss": 2.7558, "step": 3001 }, { "epoch": 0.8536644608612712, "grad_norm": 1.4314653873443604, "learning_rate": 5.929330652111463e-06, "loss": 2.3129, "step": 3002 }, { "epoch": 0.8539488261047293, "grad_norm": 1.5230185985565186, "learning_rate": 5.917839701235277e-06, "loss": 2.3062, "step": 3003 }, { "epoch": 0.8542331913481874, "grad_norm": 1.4675681591033936, "learning_rate": 5.9063487503590925e-06, "loss": 2.0741, "step": 3004 }, { "epoch": 0.8545175565916456, "grad_norm": 1.5174206495285034, "learning_rate": 5.894857799482908e-06, "loss": 2.0766, "step": 3005 }, { "epoch": 0.8548019218351037, "grad_norm": 1.480103850364685, "learning_rate": 5.883366848606723e-06, "loss": 1.6913, "step": 3006 }, { "epoch": 0.8550862870785618, "grad_norm": 1.5968501567840576, "learning_rate": 5.871875897730538e-06, "loss": 1.7072, "step": 3007 }, { "epoch": 0.8553706523220199, "grad_norm": 1.528976321220398, "learning_rate": 5.860384946854353e-06, "loss": 1.6712, "step": 3008 }, { "epoch": 0.855655017565478, "grad_norm": 1.6280978918075562, "learning_rate": 5.8488939959781676e-06, "loss": 2.822, "step": 3009 }, { "epoch": 0.8559393828089362, "grad_norm": 1.4882813692092896, "learning_rate": 5.837403045101983e-06, "loss": 2.0284, "step": 3010 }, { "epoch": 0.8562237480523943, "grad_norm": 1.4951153993606567, "learning_rate": 5.825912094225797e-06, "loss": 2.2519, "step": 3011 }, { "epoch": 0.8565081132958524, "grad_norm": 1.4803876876831055, "learning_rate": 5.814421143349612e-06, "loss": 2.2105, "step": 3012 }, { "epoch": 0.8567924785393105, "grad_norm": 1.4944387674331665, "learning_rate": 5.802930192473427e-06, "loss": 2.0987, "step": 3013 }, { "epoch": 0.8570768437827686, "grad_norm": 1.4236083030700684, "learning_rate": 5.7914392415972434e-06, "loss": 1.8932, "step": 3014 }, { "epoch": 0.8573612090262268, "grad_norm": 1.4983583688735962, "learning_rate": 5.779948290721058e-06, "loss": 1.6304, "step": 3015 }, { "epoch": 0.8576455742696849, "grad_norm": 1.554520606994629, "learning_rate": 5.768457339844873e-06, "loss": 1.5071, "step": 3016 }, { "epoch": 0.857929939513143, "grad_norm": 1.8354604244232178, "learning_rate": 5.756966388968688e-06, "loss": 2.5812, "step": 3017 }, { "epoch": 0.8582143047566011, "grad_norm": 1.4702214002609253, "learning_rate": 5.745475438092502e-06, "loss": 2.4053, "step": 3018 }, { "epoch": 0.8584986700000592, "grad_norm": 1.5303442478179932, "learning_rate": 5.733984487216318e-06, "loss": 2.3453, "step": 3019 }, { "epoch": 0.8587830352435174, "grad_norm": 1.5670809745788574, "learning_rate": 5.722493536340132e-06, "loss": 2.2886, "step": 3020 }, { "epoch": 0.8590674004869755, "grad_norm": 1.6706324815750122, "learning_rate": 5.7110025854639475e-06, "loss": 1.9736, "step": 3021 }, { "epoch": 0.8593517657304336, "grad_norm": 1.497307300567627, "learning_rate": 5.699511634587762e-06, "loss": 1.6649, "step": 3022 }, { "epoch": 0.8596361309738917, "grad_norm": 1.4424810409545898, "learning_rate": 5.688020683711578e-06, "loss": 1.7273, "step": 3023 }, { "epoch": 0.8599204962173498, "grad_norm": 1.5607413053512573, "learning_rate": 5.676529732835393e-06, "loss": 1.7178, "step": 3024 }, { "epoch": 0.860204861460808, "grad_norm": 1.6604523658752441, "learning_rate": 5.665038781959208e-06, "loss": 2.5183, "step": 3025 }, { "epoch": 0.8604892267042661, "grad_norm": 1.4753596782684326, "learning_rate": 5.6535478310830225e-06, "loss": 2.4315, "step": 3026 }, { "epoch": 0.8607735919477242, "grad_norm": 1.5106394290924072, "learning_rate": 5.642056880206838e-06, "loss": 2.2461, "step": 3027 }, { "epoch": 0.8610579571911823, "grad_norm": 1.4064358472824097, "learning_rate": 5.630565929330652e-06, "loss": 1.9892, "step": 3028 }, { "epoch": 0.8613423224346404, "grad_norm": 1.5657269954681396, "learning_rate": 5.619074978454468e-06, "loss": 2.0241, "step": 3029 }, { "epoch": 0.8616266876780986, "grad_norm": 1.3951325416564941, "learning_rate": 5.607584027578282e-06, "loss": 1.962, "step": 3030 }, { "epoch": 0.8619110529215567, "grad_norm": 1.3956987857818604, "learning_rate": 5.5960930767020975e-06, "loss": 1.7142, "step": 3031 }, { "epoch": 0.8621954181650148, "grad_norm": 1.8063596487045288, "learning_rate": 5.584602125825913e-06, "loss": 1.7746, "step": 3032 }, { "epoch": 0.8624797834084729, "grad_norm": 1.7616422176361084, "learning_rate": 5.573111174949728e-06, "loss": 2.6338, "step": 3033 }, { "epoch": 0.8627641486519311, "grad_norm": 1.576029896736145, "learning_rate": 5.561620224073543e-06, "loss": 2.487, "step": 3034 }, { "epoch": 0.8630485138953892, "grad_norm": 1.469588279724121, "learning_rate": 5.550129273197358e-06, "loss": 2.3708, "step": 3035 }, { "epoch": 0.8633328791388473, "grad_norm": 1.5429444313049316, "learning_rate": 5.5386383223211726e-06, "loss": 2.1626, "step": 3036 }, { "epoch": 0.8636172443823054, "grad_norm": 1.5128809213638306, "learning_rate": 5.527147371444988e-06, "loss": 1.9956, "step": 3037 }, { "epoch": 0.8639016096257635, "grad_norm": 1.458830714225769, "learning_rate": 5.515656420568802e-06, "loss": 1.9701, "step": 3038 }, { "epoch": 0.8641859748692217, "grad_norm": 1.585025429725647, "learning_rate": 5.504165469692617e-06, "loss": 1.8178, "step": 3039 }, { "epoch": 0.8644703401126798, "grad_norm": 1.6396241188049316, "learning_rate": 5.492674518816432e-06, "loss": 1.7052, "step": 3040 }, { "epoch": 0.8647547053561379, "grad_norm": 1.7168043851852417, "learning_rate": 5.481183567940247e-06, "loss": 2.8315, "step": 3041 }, { "epoch": 0.865039070599596, "grad_norm": 1.431821346282959, "learning_rate": 5.469692617064063e-06, "loss": 2.2904, "step": 3042 }, { "epoch": 0.865323435843054, "grad_norm": 1.5637871026992798, "learning_rate": 5.458201666187877e-06, "loss": 2.2041, "step": 3043 }, { "epoch": 0.8656078010865123, "grad_norm": 1.4506059885025024, "learning_rate": 5.446710715311693e-06, "loss": 2.2017, "step": 3044 }, { "epoch": 0.8658921663299703, "grad_norm": 1.5410045385360718, "learning_rate": 5.435219764435507e-06, "loss": 1.9422, "step": 3045 }, { "epoch": 0.8661765315734284, "grad_norm": 1.4460687637329102, "learning_rate": 5.423728813559323e-06, "loss": 1.7594, "step": 3046 }, { "epoch": 0.8664608968168865, "grad_norm": 1.5550706386566162, "learning_rate": 5.412237862683137e-06, "loss": 1.7515, "step": 3047 }, { "epoch": 0.8667452620603446, "grad_norm": 1.5752551555633545, "learning_rate": 5.4007469118069524e-06, "loss": 1.644, "step": 3048 }, { "epoch": 0.8670296273038028, "grad_norm": 1.8624625205993652, "learning_rate": 5.389255960930767e-06, "loss": 2.817, "step": 3049 }, { "epoch": 0.8673139925472609, "grad_norm": 1.416124939918518, "learning_rate": 5.377765010054582e-06, "loss": 2.1468, "step": 3050 }, { "epoch": 0.867598357790719, "grad_norm": 1.4211061000823975, "learning_rate": 5.366274059178398e-06, "loss": 2.0979, "step": 3051 }, { "epoch": 0.8678827230341771, "grad_norm": 1.4626717567443848, "learning_rate": 5.354783108302213e-06, "loss": 2.0164, "step": 3052 }, { "epoch": 0.8681670882776352, "grad_norm": 1.486775279045105, "learning_rate": 5.3432921574260275e-06, "loss": 2.0579, "step": 3053 }, { "epoch": 0.8684514535210934, "grad_norm": 1.5586199760437012, "learning_rate": 5.331801206549843e-06, "loss": 2.1097, "step": 3054 }, { "epoch": 0.8687358187645515, "grad_norm": 1.5133928060531616, "learning_rate": 5.320310255673657e-06, "loss": 1.8172, "step": 3055 }, { "epoch": 0.8690201840080096, "grad_norm": 1.5888675451278687, "learning_rate": 5.308819304797473e-06, "loss": 1.7316, "step": 3056 }, { "epoch": 0.8693045492514677, "grad_norm": 1.7980897426605225, "learning_rate": 5.297328353921287e-06, "loss": 2.8514, "step": 3057 }, { "epoch": 0.8695889144949258, "grad_norm": 1.4976806640625, "learning_rate": 5.285837403045102e-06, "loss": 2.5206, "step": 3058 }, { "epoch": 0.869873279738384, "grad_norm": 1.5453336238861084, "learning_rate": 5.274346452168917e-06, "loss": 2.4075, "step": 3059 }, { "epoch": 0.8701576449818421, "grad_norm": 1.3958678245544434, "learning_rate": 5.262855501292733e-06, "loss": 1.9729, "step": 3060 }, { "epoch": 0.8704420102253002, "grad_norm": 1.5089367628097534, "learning_rate": 5.251364550416548e-06, "loss": 2.061, "step": 3061 }, { "epoch": 0.8707263754687583, "grad_norm": 1.5372157096862793, "learning_rate": 5.239873599540363e-06, "loss": 1.8647, "step": 3062 }, { "epoch": 0.8710107407122164, "grad_norm": 1.4671186208724976, "learning_rate": 5.2283826486641775e-06, "loss": 1.6414, "step": 3063 }, { "epoch": 0.8712951059556746, "grad_norm": 1.5816073417663574, "learning_rate": 5.216891697787992e-06, "loss": 1.8357, "step": 3064 }, { "epoch": 0.8715794711991327, "grad_norm": 1.703484296798706, "learning_rate": 5.205400746911807e-06, "loss": 2.8886, "step": 3065 }, { "epoch": 0.8718638364425908, "grad_norm": 1.511811375617981, "learning_rate": 5.193909796035622e-06, "loss": 2.3373, "step": 3066 }, { "epoch": 0.8721482016860489, "grad_norm": 1.5414661169052124, "learning_rate": 5.182418845159437e-06, "loss": 2.1505, "step": 3067 }, { "epoch": 0.872432566929507, "grad_norm": 1.5230942964553833, "learning_rate": 5.170927894283252e-06, "loss": 2.1039, "step": 3068 }, { "epoch": 0.8727169321729652, "grad_norm": 1.4703636169433594, "learning_rate": 5.159436943407068e-06, "loss": 2.0667, "step": 3069 }, { "epoch": 0.8730012974164233, "grad_norm": 1.432626485824585, "learning_rate": 5.147945992530882e-06, "loss": 1.9027, "step": 3070 }, { "epoch": 0.8732856626598814, "grad_norm": 1.5644500255584717, "learning_rate": 5.136455041654698e-06, "loss": 1.756, "step": 3071 }, { "epoch": 0.8735700279033395, "grad_norm": 1.6351237297058105, "learning_rate": 5.124964090778512e-06, "loss": 1.7897, "step": 3072 }, { "epoch": 0.8738543931467976, "grad_norm": 1.7098308801651, "learning_rate": 5.113473139902328e-06, "loss": 2.7434, "step": 3073 }, { "epoch": 0.8741387583902558, "grad_norm": 1.4680440425872803, "learning_rate": 5.101982189026142e-06, "loss": 2.1812, "step": 3074 }, { "epoch": 0.8744231236337139, "grad_norm": 1.4263336658477783, "learning_rate": 5.0904912381499574e-06, "loss": 2.2004, "step": 3075 }, { "epoch": 0.874707488877172, "grad_norm": 1.4721750020980835, "learning_rate": 5.079000287273772e-06, "loss": 2.1114, "step": 3076 }, { "epoch": 0.8749918541206301, "grad_norm": 1.6234413385391235, "learning_rate": 5.067509336397587e-06, "loss": 2.1707, "step": 3077 }, { "epoch": 0.8752762193640882, "grad_norm": 1.5787984132766724, "learning_rate": 5.056018385521402e-06, "loss": 1.9247, "step": 3078 }, { "epoch": 0.8755605846075464, "grad_norm": 1.4935133457183838, "learning_rate": 5.044527434645218e-06, "loss": 1.78, "step": 3079 }, { "epoch": 0.8758449498510045, "grad_norm": 1.624784231185913, "learning_rate": 5.0330364837690325e-06, "loss": 1.7151, "step": 3080 }, { "epoch": 0.8761293150944626, "grad_norm": 1.7499675750732422, "learning_rate": 5.021545532892848e-06, "loss": 2.8374, "step": 3081 }, { "epoch": 0.8764136803379207, "grad_norm": 1.4725526571273804, "learning_rate": 5.010054582016662e-06, "loss": 2.3646, "step": 3082 }, { "epoch": 0.8766980455813789, "grad_norm": 1.5129636526107788, "learning_rate": 4.998563631140478e-06, "loss": 2.2392, "step": 3083 }, { "epoch": 0.876982410824837, "grad_norm": 1.4263098239898682, "learning_rate": 4.987072680264292e-06, "loss": 2.2247, "step": 3084 }, { "epoch": 0.8772667760682951, "grad_norm": 1.5456427335739136, "learning_rate": 4.975581729388107e-06, "loss": 2.068, "step": 3085 }, { "epoch": 0.8775511413117532, "grad_norm": 1.4494200944900513, "learning_rate": 4.964090778511923e-06, "loss": 1.893, "step": 3086 }, { "epoch": 0.8778355065552113, "grad_norm": 1.5122965574264526, "learning_rate": 4.952599827635737e-06, "loss": 1.6924, "step": 3087 }, { "epoch": 0.8781198717986695, "grad_norm": 1.593127965927124, "learning_rate": 4.941108876759552e-06, "loss": 1.7498, "step": 3088 }, { "epoch": 0.8784042370421276, "grad_norm": 1.7108339071273804, "learning_rate": 4.929617925883367e-06, "loss": 2.7341, "step": 3089 }, { "epoch": 0.8786886022855857, "grad_norm": 1.4968798160552979, "learning_rate": 4.9181269750071825e-06, "loss": 2.4878, "step": 3090 }, { "epoch": 0.8789729675290437, "grad_norm": 1.514430284500122, "learning_rate": 4.906636024130997e-06, "loss": 2.2634, "step": 3091 }, { "epoch": 0.8792573327725018, "grad_norm": 1.5105502605438232, "learning_rate": 4.895145073254812e-06, "loss": 2.2145, "step": 3092 }, { "epoch": 0.87954169801596, "grad_norm": 1.5424399375915527, "learning_rate": 4.883654122378627e-06, "loss": 2.1025, "step": 3093 }, { "epoch": 0.8798260632594181, "grad_norm": 1.4774196147918701, "learning_rate": 4.872163171502442e-06, "loss": 1.7497, "step": 3094 }, { "epoch": 0.8801104285028762, "grad_norm": 1.5772838592529297, "learning_rate": 4.8606722206262575e-06, "loss": 1.6591, "step": 3095 }, { "epoch": 0.8803947937463343, "grad_norm": 1.5816643238067627, "learning_rate": 4.849181269750072e-06, "loss": 1.6053, "step": 3096 }, { "epoch": 0.8806791589897924, "grad_norm": 1.8602535724639893, "learning_rate": 4.837690318873887e-06, "loss": 2.8624, "step": 3097 }, { "epoch": 0.8809635242332506, "grad_norm": 1.4142849445343018, "learning_rate": 4.826199367997702e-06, "loss": 2.1765, "step": 3098 }, { "epoch": 0.8812478894767087, "grad_norm": 1.418143391609192, "learning_rate": 4.814708417121517e-06, "loss": 2.142, "step": 3099 }, { "epoch": 0.8815322547201668, "grad_norm": 1.5086448192596436, "learning_rate": 4.8032174662453326e-06, "loss": 2.2362, "step": 3100 }, { "epoch": 0.8818166199636249, "grad_norm": 1.5796101093292236, "learning_rate": 4.791726515369147e-06, "loss": 2.2314, "step": 3101 }, { "epoch": 0.882100985207083, "grad_norm": 1.4776320457458496, "learning_rate": 4.780235564492962e-06, "loss": 2.0073, "step": 3102 }, { "epoch": 0.8823853504505412, "grad_norm": 1.5670294761657715, "learning_rate": 4.768744613616777e-06, "loss": 1.7092, "step": 3103 }, { "epoch": 0.8826697156939993, "grad_norm": 1.5772371292114258, "learning_rate": 4.757253662740592e-06, "loss": 1.6023, "step": 3104 }, { "epoch": 0.8829540809374574, "grad_norm": 1.6653498411178589, "learning_rate": 4.745762711864408e-06, "loss": 2.7154, "step": 3105 }, { "epoch": 0.8832384461809155, "grad_norm": 1.4277734756469727, "learning_rate": 4.734271760988222e-06, "loss": 2.4472, "step": 3106 }, { "epoch": 0.8835228114243736, "grad_norm": 1.4376863241195679, "learning_rate": 4.7227808101120374e-06, "loss": 2.1116, "step": 3107 }, { "epoch": 0.8838071766678318, "grad_norm": 1.4949778318405151, "learning_rate": 4.711289859235852e-06, "loss": 2.2258, "step": 3108 }, { "epoch": 0.8840915419112899, "grad_norm": 1.6463311910629272, "learning_rate": 4.699798908359667e-06, "loss": 2.1172, "step": 3109 }, { "epoch": 0.884375907154748, "grad_norm": 1.5631059408187866, "learning_rate": 4.688307957483483e-06, "loss": 1.8706, "step": 3110 }, { "epoch": 0.8846602723982061, "grad_norm": 1.5554920434951782, "learning_rate": 4.676817006607297e-06, "loss": 1.8467, "step": 3111 }, { "epoch": 0.8849446376416642, "grad_norm": 1.6312092542648315, "learning_rate": 4.665326055731112e-06, "loss": 1.6004, "step": 3112 }, { "epoch": 0.8852290028851224, "grad_norm": 1.807760238647461, "learning_rate": 4.653835104854928e-06, "loss": 2.7044, "step": 3113 }, { "epoch": 0.8855133681285805, "grad_norm": 1.6211071014404297, "learning_rate": 4.642344153978742e-06, "loss": 2.3121, "step": 3114 }, { "epoch": 0.8857977333720386, "grad_norm": 1.6699950695037842, "learning_rate": 4.630853203102557e-06, "loss": 2.2832, "step": 3115 }, { "epoch": 0.8860820986154967, "grad_norm": 1.495404839515686, "learning_rate": 4.619362252226372e-06, "loss": 2.0071, "step": 3116 }, { "epoch": 0.8863664638589548, "grad_norm": 1.4680708646774292, "learning_rate": 4.607871301350187e-06, "loss": 2.1704, "step": 3117 }, { "epoch": 0.886650829102413, "grad_norm": 1.5451794862747192, "learning_rate": 4.596380350474002e-06, "loss": 1.8087, "step": 3118 }, { "epoch": 0.8869351943458711, "grad_norm": 1.470167636871338, "learning_rate": 4.584889399597817e-06, "loss": 1.7698, "step": 3119 }, { "epoch": 0.8872195595893292, "grad_norm": 1.5515146255493164, "learning_rate": 4.573398448721632e-06, "loss": 1.6454, "step": 3120 }, { "epoch": 0.8875039248327873, "grad_norm": 1.7271732091903687, "learning_rate": 4.561907497845447e-06, "loss": 2.6316, "step": 3121 }, { "epoch": 0.8877882900762454, "grad_norm": 1.494756817817688, "learning_rate": 4.550416546969262e-06, "loss": 2.4271, "step": 3122 }, { "epoch": 0.8880726553197036, "grad_norm": 1.4232012033462524, "learning_rate": 4.538925596093077e-06, "loss": 2.2557, "step": 3123 }, { "epoch": 0.8883570205631617, "grad_norm": 1.4829121828079224, "learning_rate": 4.527434645216892e-06, "loss": 2.0524, "step": 3124 }, { "epoch": 0.8886413858066198, "grad_norm": 1.5503113269805908, "learning_rate": 4.515943694340707e-06, "loss": 1.9053, "step": 3125 }, { "epoch": 0.8889257510500779, "grad_norm": 1.4779075384140015, "learning_rate": 4.504452743464522e-06, "loss": 1.8903, "step": 3126 }, { "epoch": 0.889210116293536, "grad_norm": 1.5646677017211914, "learning_rate": 4.4929617925883376e-06, "loss": 1.7303, "step": 3127 }, { "epoch": 0.8894944815369942, "grad_norm": 1.709120273590088, "learning_rate": 4.481470841712152e-06, "loss": 1.7812, "step": 3128 }, { "epoch": 0.8897788467804523, "grad_norm": 1.6658837795257568, "learning_rate": 4.469979890835967e-06, "loss": 2.8185, "step": 3129 }, { "epoch": 0.8900632120239104, "grad_norm": 1.4445295333862305, "learning_rate": 4.458488939959782e-06, "loss": 2.2779, "step": 3130 }, { "epoch": 0.8903475772673685, "grad_norm": 1.5250149965286255, "learning_rate": 4.446997989083596e-06, "loss": 2.2863, "step": 3131 }, { "epoch": 0.8906319425108267, "grad_norm": 1.364835262298584, "learning_rate": 4.435507038207413e-06, "loss": 2.0326, "step": 3132 }, { "epoch": 0.8909163077542848, "grad_norm": 1.4369089603424072, "learning_rate": 4.424016087331227e-06, "loss": 2.0469, "step": 3133 }, { "epoch": 0.8912006729977429, "grad_norm": 1.4388717412948608, "learning_rate": 4.4125251364550416e-06, "loss": 1.7271, "step": 3134 }, { "epoch": 0.891485038241201, "grad_norm": 1.51015305519104, "learning_rate": 4.401034185578857e-06, "loss": 1.7673, "step": 3135 }, { "epoch": 0.891769403484659, "grad_norm": 1.6226184368133545, "learning_rate": 4.389543234702671e-06, "loss": 1.6646, "step": 3136 }, { "epoch": 0.8920537687281173, "grad_norm": 1.579350233078003, "learning_rate": 4.378052283826487e-06, "loss": 2.6567, "step": 3137 }, { "epoch": 0.8923381339715754, "grad_norm": 1.427051067352295, "learning_rate": 4.366561332950302e-06, "loss": 2.104, "step": 3138 }, { "epoch": 0.8926224992150334, "grad_norm": 1.3372541666030884, "learning_rate": 4.355070382074117e-06, "loss": 2.2061, "step": 3139 }, { "epoch": 0.8929068644584915, "grad_norm": 1.4767935276031494, "learning_rate": 4.343579431197932e-06, "loss": 2.2415, "step": 3140 }, { "epoch": 0.8931912297019496, "grad_norm": 1.5126450061798096, "learning_rate": 4.332088480321747e-06, "loss": 2.0232, "step": 3141 }, { "epoch": 0.8934755949454078, "grad_norm": 1.5773214101791382, "learning_rate": 4.320597529445562e-06, "loss": 2.0683, "step": 3142 }, { "epoch": 0.8937599601888659, "grad_norm": 1.64472496509552, "learning_rate": 4.309106578569377e-06, "loss": 1.9053, "step": 3143 }, { "epoch": 0.894044325432324, "grad_norm": 1.5076712369918823, "learning_rate": 4.297615627693192e-06, "loss": 1.7959, "step": 3144 }, { "epoch": 0.8943286906757821, "grad_norm": 1.8122453689575195, "learning_rate": 4.286124676817007e-06, "loss": 2.8296, "step": 3145 }, { "epoch": 0.8946130559192402, "grad_norm": 1.409722924232483, "learning_rate": 4.274633725940822e-06, "loss": 2.3145, "step": 3146 }, { "epoch": 0.8948974211626984, "grad_norm": 1.43026602268219, "learning_rate": 4.263142775064637e-06, "loss": 2.2631, "step": 3147 }, { "epoch": 0.8951817864061565, "grad_norm": 1.4173415899276733, "learning_rate": 4.251651824188452e-06, "loss": 2.1985, "step": 3148 }, { "epoch": 0.8954661516496146, "grad_norm": 1.4719431400299072, "learning_rate": 4.240160873312267e-06, "loss": 1.9246, "step": 3149 }, { "epoch": 0.8957505168930727, "grad_norm": 1.5066853761672974, "learning_rate": 4.228669922436082e-06, "loss": 1.8011, "step": 3150 }, { "epoch": 0.8960348821365308, "grad_norm": 1.6725085973739624, "learning_rate": 4.217178971559897e-06, "loss": 1.608, "step": 3151 }, { "epoch": 0.896319247379989, "grad_norm": 1.6424719095230103, "learning_rate": 4.205688020683712e-06, "loss": 1.7574, "step": 3152 }, { "epoch": 0.8966036126234471, "grad_norm": 1.7039910554885864, "learning_rate": 4.194197069807527e-06, "loss": 2.7038, "step": 3153 }, { "epoch": 0.8968879778669052, "grad_norm": 1.482321858406067, "learning_rate": 4.182706118931342e-06, "loss": 2.2022, "step": 3154 }, { "epoch": 0.8971723431103633, "grad_norm": 1.4395989179611206, "learning_rate": 4.171215168055157e-06, "loss": 2.3795, "step": 3155 }, { "epoch": 0.8974567083538214, "grad_norm": 1.381151795387268, "learning_rate": 4.159724217178972e-06, "loss": 2.2011, "step": 3156 }, { "epoch": 0.8977410735972796, "grad_norm": 1.4382047653198242, "learning_rate": 4.148233266302787e-06, "loss": 1.9888, "step": 3157 }, { "epoch": 0.8980254388407377, "grad_norm": 1.566701889038086, "learning_rate": 4.136742315426601e-06, "loss": 1.9158, "step": 3158 }, { "epoch": 0.8983098040841958, "grad_norm": 1.452852725982666, "learning_rate": 4.125251364550417e-06, "loss": 1.7248, "step": 3159 }, { "epoch": 0.8985941693276539, "grad_norm": 1.540461778640747, "learning_rate": 4.113760413674232e-06, "loss": 1.8487, "step": 3160 }, { "epoch": 0.898878534571112, "grad_norm": 2.046682119369507, "learning_rate": 4.1022694627980466e-06, "loss": 2.6915, "step": 3161 }, { "epoch": 0.8991628998145702, "grad_norm": 1.4611966609954834, "learning_rate": 4.090778511921862e-06, "loss": 2.324, "step": 3162 }, { "epoch": 0.8994472650580283, "grad_norm": 1.5413190126419067, "learning_rate": 4.079287561045676e-06, "loss": 2.1069, "step": 3163 }, { "epoch": 0.8997316303014864, "grad_norm": 1.4921023845672607, "learning_rate": 4.067796610169492e-06, "loss": 2.1568, "step": 3164 }, { "epoch": 0.9000159955449445, "grad_norm": 1.4238756895065308, "learning_rate": 4.056305659293307e-06, "loss": 2.0992, "step": 3165 }, { "epoch": 0.9003003607884026, "grad_norm": 1.2893463373184204, "learning_rate": 4.044814708417122e-06, "loss": 1.7327, "step": 3166 }, { "epoch": 0.9005847260318608, "grad_norm": 1.4908276796340942, "learning_rate": 4.033323757540937e-06, "loss": 1.6974, "step": 3167 }, { "epoch": 0.9008690912753189, "grad_norm": 1.6396807432174683, "learning_rate": 4.0218328066647514e-06, "loss": 1.8493, "step": 3168 }, { "epoch": 0.901153456518777, "grad_norm": 1.881540298461914, "learning_rate": 4.010341855788567e-06, "loss": 2.7723, "step": 3169 }, { "epoch": 0.9014378217622351, "grad_norm": 1.3765074014663696, "learning_rate": 3.998850904912382e-06, "loss": 2.4163, "step": 3170 }, { "epoch": 0.9017221870056932, "grad_norm": 1.4607453346252441, "learning_rate": 3.987359954036197e-06, "loss": 2.1944, "step": 3171 }, { "epoch": 0.9020065522491514, "grad_norm": 1.5134079456329346, "learning_rate": 3.975869003160012e-06, "loss": 2.1223, "step": 3172 }, { "epoch": 0.9022909174926095, "grad_norm": 1.612130880355835, "learning_rate": 3.9643780522838265e-06, "loss": 2.0902, "step": 3173 }, { "epoch": 0.9025752827360676, "grad_norm": 1.4546161890029907, "learning_rate": 3.952887101407642e-06, "loss": 1.6594, "step": 3174 }, { "epoch": 0.9028596479795257, "grad_norm": 1.5486887693405151, "learning_rate": 3.941396150531457e-06, "loss": 1.7874, "step": 3175 }, { "epoch": 0.9031440132229838, "grad_norm": 1.671090006828308, "learning_rate": 3.929905199655272e-06, "loss": 1.7745, "step": 3176 }, { "epoch": 0.903428378466442, "grad_norm": 1.8405499458312988, "learning_rate": 3.918414248779087e-06, "loss": 2.6482, "step": 3177 }, { "epoch": 0.9037127437099001, "grad_norm": 1.4907866716384888, "learning_rate": 3.906923297902902e-06, "loss": 2.3249, "step": 3178 }, { "epoch": 0.9039971089533582, "grad_norm": 1.5165135860443115, "learning_rate": 3.895432347026717e-06, "loss": 2.4738, "step": 3179 }, { "epoch": 0.9042814741968163, "grad_norm": 1.5014480352401733, "learning_rate": 3.883941396150532e-06, "loss": 2.1646, "step": 3180 }, { "epoch": 0.9045658394402745, "grad_norm": 1.5247701406478882, "learning_rate": 3.872450445274347e-06, "loss": 1.9272, "step": 3181 }, { "epoch": 0.9048502046837326, "grad_norm": 1.585174798965454, "learning_rate": 3.860959494398161e-06, "loss": 1.9112, "step": 3182 }, { "epoch": 0.9051345699271907, "grad_norm": 1.5576118230819702, "learning_rate": 3.849468543521977e-06, "loss": 1.716, "step": 3183 }, { "epoch": 0.9054189351706488, "grad_norm": 1.6155881881713867, "learning_rate": 3.837977592645792e-06, "loss": 1.7258, "step": 3184 }, { "epoch": 0.9057033004141068, "grad_norm": 1.6325162649154663, "learning_rate": 3.826486641769606e-06, "loss": 2.4844, "step": 3185 }, { "epoch": 0.9059876656575651, "grad_norm": 1.38741934299469, "learning_rate": 3.814995690893422e-06, "loss": 2.1399, "step": 3186 }, { "epoch": 0.9062720309010232, "grad_norm": 1.4506083726882935, "learning_rate": 3.8035047400172366e-06, "loss": 2.076, "step": 3187 }, { "epoch": 0.9065563961444812, "grad_norm": 1.423922061920166, "learning_rate": 3.792013789141052e-06, "loss": 2.0409, "step": 3188 }, { "epoch": 0.9068407613879393, "grad_norm": 1.591650366783142, "learning_rate": 3.780522838264867e-06, "loss": 2.1596, "step": 3189 }, { "epoch": 0.9071251266313974, "grad_norm": 1.5626674890518188, "learning_rate": 3.769031887388682e-06, "loss": 1.9831, "step": 3190 }, { "epoch": 0.9074094918748556, "grad_norm": 1.5110690593719482, "learning_rate": 3.7575409365124967e-06, "loss": 1.9154, "step": 3191 }, { "epoch": 0.9076938571183137, "grad_norm": 1.5417031049728394, "learning_rate": 3.746049985636312e-06, "loss": 1.704, "step": 3192 }, { "epoch": 0.9079782223617718, "grad_norm": 1.6698137521743774, "learning_rate": 3.734559034760127e-06, "loss": 2.7173, "step": 3193 }, { "epoch": 0.9082625876052299, "grad_norm": 1.58273446559906, "learning_rate": 3.7230680838839415e-06, "loss": 2.3419, "step": 3194 }, { "epoch": 0.908546952848688, "grad_norm": 1.4195032119750977, "learning_rate": 3.7115771330077564e-06, "loss": 2.184, "step": 3195 }, { "epoch": 0.9088313180921462, "grad_norm": 1.9811179637908936, "learning_rate": 3.7000861821315713e-06, "loss": 2.1592, "step": 3196 }, { "epoch": 0.9091156833356043, "grad_norm": 1.8835865259170532, "learning_rate": 3.6885952312553867e-06, "loss": 2.1639, "step": 3197 }, { "epoch": 0.9094000485790624, "grad_norm": 1.4895811080932617, "learning_rate": 3.6771042803792016e-06, "loss": 1.8653, "step": 3198 }, { "epoch": 0.9096844138225205, "grad_norm": 1.5817967653274536, "learning_rate": 3.6656133295030165e-06, "loss": 1.8198, "step": 3199 }, { "epoch": 0.9099687790659786, "grad_norm": 1.4373753070831299, "learning_rate": 3.6541223786268314e-06, "loss": 1.5907, "step": 3200 }, { "epoch": 0.9102531443094368, "grad_norm": 1.6298754215240479, "learning_rate": 3.6426314277506468e-06, "loss": 2.7107, "step": 3201 }, { "epoch": 0.9105375095528949, "grad_norm": 1.4616210460662842, "learning_rate": 3.6311404768744617e-06, "loss": 2.2934, "step": 3202 }, { "epoch": 0.910821874796353, "grad_norm": 1.4932990074157715, "learning_rate": 3.6196495259982766e-06, "loss": 1.951, "step": 3203 }, { "epoch": 0.9111062400398111, "grad_norm": 1.4640005826950073, "learning_rate": 3.6081585751220915e-06, "loss": 1.9661, "step": 3204 }, { "epoch": 0.9113906052832692, "grad_norm": 1.583443284034729, "learning_rate": 3.5966676242459065e-06, "loss": 2.0104, "step": 3205 }, { "epoch": 0.9116749705267274, "grad_norm": 1.4007854461669922, "learning_rate": 3.585176673369722e-06, "loss": 1.9239, "step": 3206 }, { "epoch": 0.9119593357701855, "grad_norm": 1.590077519416809, "learning_rate": 3.5736857224935367e-06, "loss": 1.8776, "step": 3207 }, { "epoch": 0.9122437010136436, "grad_norm": 1.5077543258666992, "learning_rate": 3.5621947716173517e-06, "loss": 1.6225, "step": 3208 }, { "epoch": 0.9125280662571017, "grad_norm": 1.8528907299041748, "learning_rate": 3.5507038207411666e-06, "loss": 2.6378, "step": 3209 }, { "epoch": 0.9128124315005598, "grad_norm": 1.5426160097122192, "learning_rate": 3.5392128698649815e-06, "loss": 2.4834, "step": 3210 }, { "epoch": 0.913096796744018, "grad_norm": 1.3771498203277588, "learning_rate": 3.527721918988797e-06, "loss": 2.1898, "step": 3211 }, { "epoch": 0.9133811619874761, "grad_norm": 1.4479544162750244, "learning_rate": 3.5162309681126118e-06, "loss": 2.2442, "step": 3212 }, { "epoch": 0.9136655272309342, "grad_norm": 1.5544335842132568, "learning_rate": 3.5047400172364267e-06, "loss": 1.9227, "step": 3213 }, { "epoch": 0.9139498924743923, "grad_norm": 1.4518970251083374, "learning_rate": 3.4932490663602416e-06, "loss": 1.8666, "step": 3214 }, { "epoch": 0.9142342577178504, "grad_norm": 1.4834986925125122, "learning_rate": 3.481758115484057e-06, "loss": 1.5869, "step": 3215 }, { "epoch": 0.9145186229613086, "grad_norm": 1.5631589889526367, "learning_rate": 3.470267164607872e-06, "loss": 1.6409, "step": 3216 }, { "epoch": 0.9148029882047667, "grad_norm": 1.7660410404205322, "learning_rate": 3.4587762137316868e-06, "loss": 2.7967, "step": 3217 }, { "epoch": 0.9150873534482248, "grad_norm": 1.4638726711273193, "learning_rate": 3.4472852628555013e-06, "loss": 2.276, "step": 3218 }, { "epoch": 0.9153717186916829, "grad_norm": 1.5217621326446533, "learning_rate": 3.435794311979316e-06, "loss": 2.2588, "step": 3219 }, { "epoch": 0.915656083935141, "grad_norm": 1.4932063817977905, "learning_rate": 3.424303361103132e-06, "loss": 2.138, "step": 3220 }, { "epoch": 0.9159404491785992, "grad_norm": 1.5329244136810303, "learning_rate": 3.4128124102269465e-06, "loss": 1.9642, "step": 3221 }, { "epoch": 0.9162248144220573, "grad_norm": 1.458665132522583, "learning_rate": 3.4013214593507614e-06, "loss": 1.961, "step": 3222 }, { "epoch": 0.9165091796655154, "grad_norm": 1.5886058807373047, "learning_rate": 3.3898305084745763e-06, "loss": 1.9216, "step": 3223 }, { "epoch": 0.9167935449089735, "grad_norm": 1.5155012607574463, "learning_rate": 3.3783395575983912e-06, "loss": 1.754, "step": 3224 }, { "epoch": 0.9170779101524317, "grad_norm": 1.733697772026062, "learning_rate": 3.3668486067222066e-06, "loss": 2.5523, "step": 3225 }, { "epoch": 0.9173622753958898, "grad_norm": 1.5041455030441284, "learning_rate": 3.3553576558460215e-06, "loss": 2.2183, "step": 3226 }, { "epoch": 0.9176466406393479, "grad_norm": 1.443900465965271, "learning_rate": 3.3438667049698364e-06, "loss": 2.2949, "step": 3227 }, { "epoch": 0.917931005882806, "grad_norm": 1.4563335180282593, "learning_rate": 3.3323757540936513e-06, "loss": 2.2165, "step": 3228 }, { "epoch": 0.9182153711262641, "grad_norm": 1.5183465480804443, "learning_rate": 3.3208848032174667e-06, "loss": 2.1452, "step": 3229 }, { "epoch": 0.9184997363697223, "grad_norm": 1.4686776399612427, "learning_rate": 3.3093938523412816e-06, "loss": 1.9584, "step": 3230 }, { "epoch": 0.9187841016131804, "grad_norm": 1.4295648336410522, "learning_rate": 3.2979029014650965e-06, "loss": 1.7519, "step": 3231 }, { "epoch": 0.9190684668566385, "grad_norm": 1.6376121044158936, "learning_rate": 3.2864119505889114e-06, "loss": 1.6658, "step": 3232 }, { "epoch": 0.9193528321000966, "grad_norm": 1.892962098121643, "learning_rate": 3.2749209997127264e-06, "loss": 2.629, "step": 3233 }, { "epoch": 0.9196371973435546, "grad_norm": 1.4390010833740234, "learning_rate": 3.2634300488365417e-06, "loss": 2.4514, "step": 3234 }, { "epoch": 0.9199215625870129, "grad_norm": 1.4880425930023193, "learning_rate": 3.2519390979603566e-06, "loss": 2.1385, "step": 3235 }, { "epoch": 0.920205927830471, "grad_norm": 1.5567518472671509, "learning_rate": 3.2404481470841716e-06, "loss": 2.1202, "step": 3236 }, { "epoch": 0.920490293073929, "grad_norm": 1.4381585121154785, "learning_rate": 3.2289571962079865e-06, "loss": 1.9348, "step": 3237 }, { "epoch": 0.9207746583173871, "grad_norm": 1.4756149053573608, "learning_rate": 3.217466245331802e-06, "loss": 1.8763, "step": 3238 }, { "epoch": 0.9210590235608452, "grad_norm": 1.4677278995513916, "learning_rate": 3.2059752944556167e-06, "loss": 1.5907, "step": 3239 }, { "epoch": 0.9213433888043034, "grad_norm": 1.5422618389129639, "learning_rate": 3.1944843435794317e-06, "loss": 1.6591, "step": 3240 }, { "epoch": 0.9216277540477615, "grad_norm": 1.696669101715088, "learning_rate": 3.182993392703246e-06, "loss": 2.6402, "step": 3241 }, { "epoch": 0.9219121192912196, "grad_norm": 1.6080213785171509, "learning_rate": 3.171502441827061e-06, "loss": 2.5319, "step": 3242 }, { "epoch": 0.9221964845346777, "grad_norm": 1.4872413873672485, "learning_rate": 3.160011490950877e-06, "loss": 2.1528, "step": 3243 }, { "epoch": 0.9224808497781358, "grad_norm": 1.3897939920425415, "learning_rate": 3.1485205400746913e-06, "loss": 2.2001, "step": 3244 }, { "epoch": 0.922765215021594, "grad_norm": 1.5631881952285767, "learning_rate": 3.1370295891985063e-06, "loss": 2.1586, "step": 3245 }, { "epoch": 0.9230495802650521, "grad_norm": 1.603376865386963, "learning_rate": 3.125538638322321e-06, "loss": 1.8584, "step": 3246 }, { "epoch": 0.9233339455085102, "grad_norm": 1.4845857620239258, "learning_rate": 3.114047687446136e-06, "loss": 1.6781, "step": 3247 }, { "epoch": 0.9236183107519683, "grad_norm": 1.5035042762756348, "learning_rate": 3.1025567365699515e-06, "loss": 1.5507, "step": 3248 }, { "epoch": 0.9239026759954264, "grad_norm": 1.5952107906341553, "learning_rate": 3.0910657856937664e-06, "loss": 2.6295, "step": 3249 }, { "epoch": 0.9241870412388846, "grad_norm": 1.4013127088546753, "learning_rate": 3.0795748348175813e-06, "loss": 2.59, "step": 3250 }, { "epoch": 0.9244714064823427, "grad_norm": 1.4684206247329712, "learning_rate": 3.0680838839413962e-06, "loss": 2.1826, "step": 3251 }, { "epoch": 0.9247557717258008, "grad_norm": 1.4950830936431885, "learning_rate": 3.0565929330652116e-06, "loss": 2.2631, "step": 3252 }, { "epoch": 0.9250401369692589, "grad_norm": 1.56867253780365, "learning_rate": 3.0451019821890265e-06, "loss": 2.0641, "step": 3253 }, { "epoch": 0.925324502212717, "grad_norm": 1.5613471269607544, "learning_rate": 3.0336110313128414e-06, "loss": 1.8448, "step": 3254 }, { "epoch": 0.9256088674561752, "grad_norm": 1.413219690322876, "learning_rate": 3.0221200804366563e-06, "loss": 2.0414, "step": 3255 }, { "epoch": 0.9258932326996333, "grad_norm": 1.5721766948699951, "learning_rate": 3.0106291295604712e-06, "loss": 1.4875, "step": 3256 }, { "epoch": 0.9261775979430914, "grad_norm": 1.5911824703216553, "learning_rate": 2.9991381786842866e-06, "loss": 2.5628, "step": 3257 }, { "epoch": 0.9264619631865495, "grad_norm": 1.427064299583435, "learning_rate": 2.9876472278081015e-06, "loss": 2.5189, "step": 3258 }, { "epoch": 0.9267463284300076, "grad_norm": 1.47379732131958, "learning_rate": 2.9761562769319164e-06, "loss": 2.258, "step": 3259 }, { "epoch": 0.9270306936734658, "grad_norm": 1.4227964878082275, "learning_rate": 2.9646653260557313e-06, "loss": 1.9227, "step": 3260 }, { "epoch": 0.9273150589169239, "grad_norm": 1.488387107849121, "learning_rate": 2.9531743751795463e-06, "loss": 1.8924, "step": 3261 }, { "epoch": 0.927599424160382, "grad_norm": 1.4256445169448853, "learning_rate": 2.9416834243033616e-06, "loss": 1.6688, "step": 3262 }, { "epoch": 0.9278837894038401, "grad_norm": 1.5635275840759277, "learning_rate": 2.9301924734271765e-06, "loss": 1.7883, "step": 3263 }, { "epoch": 0.9281681546472982, "grad_norm": 1.5392005443572998, "learning_rate": 2.9187015225509915e-06, "loss": 1.6098, "step": 3264 }, { "epoch": 0.9284525198907564, "grad_norm": 1.5983314514160156, "learning_rate": 2.907210571674806e-06, "loss": 2.6295, "step": 3265 }, { "epoch": 0.9287368851342145, "grad_norm": 1.55398690700531, "learning_rate": 2.8957196207986217e-06, "loss": 2.2437, "step": 3266 }, { "epoch": 0.9290212503776726, "grad_norm": 1.477097511291504, "learning_rate": 2.8842286699224366e-06, "loss": 2.2931, "step": 3267 }, { "epoch": 0.9293056156211307, "grad_norm": 1.5535800457000732, "learning_rate": 2.872737719046251e-06, "loss": 2.1373, "step": 3268 }, { "epoch": 0.9295899808645888, "grad_norm": 1.570197582244873, "learning_rate": 2.861246768170066e-06, "loss": 2.0627, "step": 3269 }, { "epoch": 0.929874346108047, "grad_norm": 1.4615015983581543, "learning_rate": 2.849755817293881e-06, "loss": 1.8171, "step": 3270 }, { "epoch": 0.9301587113515051, "grad_norm": 1.552833080291748, "learning_rate": 2.8382648664176963e-06, "loss": 1.7411, "step": 3271 }, { "epoch": 0.9304430765949632, "grad_norm": 1.5648410320281982, "learning_rate": 2.8267739155415112e-06, "loss": 1.6361, "step": 3272 }, { "epoch": 0.9307274418384213, "grad_norm": 1.7800242900848389, "learning_rate": 2.815282964665326e-06, "loss": 2.5435, "step": 3273 }, { "epoch": 0.9310118070818795, "grad_norm": 1.5224663019180298, "learning_rate": 2.803792013789141e-06, "loss": 2.364, "step": 3274 }, { "epoch": 0.9312961723253376, "grad_norm": 1.4838275909423828, "learning_rate": 2.7923010629129564e-06, "loss": 2.1583, "step": 3275 }, { "epoch": 0.9315805375687957, "grad_norm": 1.468105435371399, "learning_rate": 2.7808101120367714e-06, "loss": 2.3038, "step": 3276 }, { "epoch": 0.9318649028122538, "grad_norm": 1.6450117826461792, "learning_rate": 2.7693191611605863e-06, "loss": 2.2681, "step": 3277 }, { "epoch": 0.9321492680557119, "grad_norm": 1.4350481033325195, "learning_rate": 2.757828210284401e-06, "loss": 1.7551, "step": 3278 }, { "epoch": 0.9324336332991701, "grad_norm": 1.4676655530929565, "learning_rate": 2.746337259408216e-06, "loss": 1.7063, "step": 3279 }, { "epoch": 0.9327179985426282, "grad_norm": 1.562709927558899, "learning_rate": 2.7348463085320315e-06, "loss": 1.7359, "step": 3280 }, { "epoch": 0.9330023637860863, "grad_norm": 1.739317536354065, "learning_rate": 2.7233553576558464e-06, "loss": 2.5635, "step": 3281 }, { "epoch": 0.9332867290295443, "grad_norm": 1.480629801750183, "learning_rate": 2.7118644067796613e-06, "loss": 2.3922, "step": 3282 }, { "epoch": 0.9335710942730024, "grad_norm": 1.4481021165847778, "learning_rate": 2.7003734559034762e-06, "loss": 2.2305, "step": 3283 }, { "epoch": 0.9338554595164607, "grad_norm": 1.4051294326782227, "learning_rate": 2.688882505027291e-06, "loss": 1.8938, "step": 3284 }, { "epoch": 0.9341398247599187, "grad_norm": 1.5220015048980713, "learning_rate": 2.6773915541511065e-06, "loss": 2.045, "step": 3285 }, { "epoch": 0.9344241900033768, "grad_norm": 1.5136582851409912, "learning_rate": 2.6659006032749214e-06, "loss": 1.8875, "step": 3286 }, { "epoch": 0.9347085552468349, "grad_norm": 1.5294973850250244, "learning_rate": 2.6544096523987363e-06, "loss": 1.8228, "step": 3287 }, { "epoch": 0.934992920490293, "grad_norm": 1.4998856782913208, "learning_rate": 2.642918701522551e-06, "loss": 1.5929, "step": 3288 }, { "epoch": 0.9352772857337512, "grad_norm": 1.751221776008606, "learning_rate": 2.6314277506463666e-06, "loss": 2.6353, "step": 3289 }, { "epoch": 0.9355616509772093, "grad_norm": 1.5697683095932007, "learning_rate": 2.6199367997701815e-06, "loss": 2.68, "step": 3290 }, { "epoch": 0.9358460162206674, "grad_norm": 1.5043424367904663, "learning_rate": 2.608445848893996e-06, "loss": 2.1733, "step": 3291 }, { "epoch": 0.9361303814641255, "grad_norm": 1.4452601671218872, "learning_rate": 2.596954898017811e-06, "loss": 2.0397, "step": 3292 }, { "epoch": 0.9364147467075836, "grad_norm": 1.478661298751831, "learning_rate": 2.585463947141626e-06, "loss": 1.8285, "step": 3293 }, { "epoch": 0.9366991119510418, "grad_norm": 1.4804670810699463, "learning_rate": 2.573972996265441e-06, "loss": 1.9527, "step": 3294 }, { "epoch": 0.9369834771944999, "grad_norm": 1.5798107385635376, "learning_rate": 2.562482045389256e-06, "loss": 1.8947, "step": 3295 }, { "epoch": 0.937267842437958, "grad_norm": 1.7154254913330078, "learning_rate": 2.550991094513071e-06, "loss": 1.7322, "step": 3296 }, { "epoch": 0.9375522076814161, "grad_norm": 1.6413829326629639, "learning_rate": 2.539500143636886e-06, "loss": 2.9358, "step": 3297 }, { "epoch": 0.9378365729248742, "grad_norm": 1.5562938451766968, "learning_rate": 2.528009192760701e-06, "loss": 2.3359, "step": 3298 }, { "epoch": 0.9381209381683324, "grad_norm": 1.3978956937789917, "learning_rate": 2.5165182418845162e-06, "loss": 2.2585, "step": 3299 }, { "epoch": 0.9384053034117905, "grad_norm": 1.5323485136032104, "learning_rate": 2.505027291008331e-06, "loss": 2.1701, "step": 3300 }, { "epoch": 0.9386896686552486, "grad_norm": 1.5247138738632202, "learning_rate": 2.493536340132146e-06, "loss": 2.2914, "step": 3301 }, { "epoch": 0.9389740338987067, "grad_norm": 1.4950916767120361, "learning_rate": 2.4820453892559614e-06, "loss": 1.9327, "step": 3302 }, { "epoch": 0.9392583991421648, "grad_norm": 1.5644844770431519, "learning_rate": 2.470554438379776e-06, "loss": 1.7878, "step": 3303 }, { "epoch": 0.939542764385623, "grad_norm": 1.7486357688903809, "learning_rate": 2.4590634875035913e-06, "loss": 1.5255, "step": 3304 }, { "epoch": 0.9398271296290811, "grad_norm": 1.6616047620773315, "learning_rate": 2.447572536627406e-06, "loss": 2.4573, "step": 3305 }, { "epoch": 0.9401114948725392, "grad_norm": 1.5083410739898682, "learning_rate": 2.436081585751221e-06, "loss": 2.2363, "step": 3306 }, { "epoch": 0.9403958601159973, "grad_norm": 1.4322084188461304, "learning_rate": 2.424590634875036e-06, "loss": 2.1424, "step": 3307 }, { "epoch": 0.9406802253594554, "grad_norm": 1.466874122619629, "learning_rate": 2.413099683998851e-06, "loss": 2.1491, "step": 3308 }, { "epoch": 0.9409645906029136, "grad_norm": 1.532874584197998, "learning_rate": 2.4016087331226663e-06, "loss": 1.9099, "step": 3309 }, { "epoch": 0.9412489558463717, "grad_norm": 1.3527659177780151, "learning_rate": 2.390117782246481e-06, "loss": 1.8746, "step": 3310 }, { "epoch": 0.9415333210898298, "grad_norm": 1.550384521484375, "learning_rate": 2.378626831370296e-06, "loss": 1.8256, "step": 3311 }, { "epoch": 0.9418176863332879, "grad_norm": 1.514704704284668, "learning_rate": 2.367135880494111e-06, "loss": 1.5771, "step": 3312 }, { "epoch": 0.942102051576746, "grad_norm": 1.789668083190918, "learning_rate": 2.355644929617926e-06, "loss": 2.8358, "step": 3313 }, { "epoch": 0.9423864168202042, "grad_norm": 1.4652148485183716, "learning_rate": 2.3441539787417413e-06, "loss": 2.312, "step": 3314 }, { "epoch": 0.9426707820636623, "grad_norm": 1.5131518840789795, "learning_rate": 2.332663027865556e-06, "loss": 2.0083, "step": 3315 }, { "epoch": 0.9429551473071204, "grad_norm": 1.439961314201355, "learning_rate": 2.321172076989371e-06, "loss": 2.076, "step": 3316 }, { "epoch": 0.9432395125505785, "grad_norm": 1.473046898841858, "learning_rate": 2.309681126113186e-06, "loss": 1.969, "step": 3317 }, { "epoch": 0.9435238777940366, "grad_norm": 1.4626710414886475, "learning_rate": 2.298190175237001e-06, "loss": 1.7359, "step": 3318 }, { "epoch": 0.9438082430374948, "grad_norm": 1.4370124340057373, "learning_rate": 2.286699224360816e-06, "loss": 1.7492, "step": 3319 }, { "epoch": 0.9440926082809529, "grad_norm": 1.5686242580413818, "learning_rate": 2.275208273484631e-06, "loss": 1.683, "step": 3320 }, { "epoch": 0.944376973524411, "grad_norm": 1.6691089868545532, "learning_rate": 2.263717322608446e-06, "loss": 2.6069, "step": 3321 }, { "epoch": 0.9446613387678691, "grad_norm": 1.55553138256073, "learning_rate": 2.252226371732261e-06, "loss": 2.3096, "step": 3322 }, { "epoch": 0.9449457040113273, "grad_norm": 1.4959852695465088, "learning_rate": 2.240735420856076e-06, "loss": 2.4402, "step": 3323 }, { "epoch": 0.9452300692547854, "grad_norm": 1.4532426595687866, "learning_rate": 2.229244469979891e-06, "loss": 2.0286, "step": 3324 }, { "epoch": 0.9455144344982435, "grad_norm": 1.565828561782837, "learning_rate": 2.2177535191037063e-06, "loss": 2.217, "step": 3325 }, { "epoch": 0.9457987997417016, "grad_norm": 1.489469289779663, "learning_rate": 2.2062625682275208e-06, "loss": 1.8138, "step": 3326 }, { "epoch": 0.9460831649851597, "grad_norm": 1.5402168035507202, "learning_rate": 2.1947716173513357e-06, "loss": 1.8234, "step": 3327 }, { "epoch": 0.9463675302286179, "grad_norm": 1.4881329536437988, "learning_rate": 2.183280666475151e-06, "loss": 1.4388, "step": 3328 }, { "epoch": 0.946651895472076, "grad_norm": 1.8732259273529053, "learning_rate": 2.171789715598966e-06, "loss": 2.7893, "step": 3329 }, { "epoch": 0.946936260715534, "grad_norm": 1.452468752861023, "learning_rate": 2.160298764722781e-06, "loss": 2.2695, "step": 3330 }, { "epoch": 0.9472206259589921, "grad_norm": 1.4889235496520996, "learning_rate": 2.148807813846596e-06, "loss": 2.2459, "step": 3331 }, { "epoch": 0.9475049912024502, "grad_norm": 1.4310044050216675, "learning_rate": 2.137316862970411e-06, "loss": 2.1622, "step": 3332 }, { "epoch": 0.9477893564459084, "grad_norm": 1.490521788597107, "learning_rate": 2.125825912094226e-06, "loss": 1.9426, "step": 3333 }, { "epoch": 0.9480737216893665, "grad_norm": 1.4765541553497314, "learning_rate": 2.114334961218041e-06, "loss": 1.8035, "step": 3334 }, { "epoch": 0.9483580869328246, "grad_norm": 1.5888854265213013, "learning_rate": 2.102844010341856e-06, "loss": 1.8977, "step": 3335 }, { "epoch": 0.9486424521762827, "grad_norm": 1.5300711393356323, "learning_rate": 2.091353059465671e-06, "loss": 1.8379, "step": 3336 }, { "epoch": 0.9489268174197408, "grad_norm": 1.5477615594863892, "learning_rate": 2.079862108589486e-06, "loss": 2.6793, "step": 3337 }, { "epoch": 0.949211182663199, "grad_norm": 1.5292421579360962, "learning_rate": 2.0683711577133007e-06, "loss": 2.3593, "step": 3338 }, { "epoch": 0.9494955479066571, "grad_norm": 1.4530328512191772, "learning_rate": 2.056880206837116e-06, "loss": 2.1165, "step": 3339 }, { "epoch": 0.9497799131501152, "grad_norm": 1.475620150566101, "learning_rate": 2.045389255960931e-06, "loss": 2.1348, "step": 3340 }, { "epoch": 0.9500642783935733, "grad_norm": 1.4726742506027222, "learning_rate": 2.033898305084746e-06, "loss": 2.1875, "step": 3341 }, { "epoch": 0.9503486436370314, "grad_norm": 1.4152458906173706, "learning_rate": 2.022407354208561e-06, "loss": 1.9168, "step": 3342 }, { "epoch": 0.9506330088804896, "grad_norm": 1.4417401552200317, "learning_rate": 2.0109164033323757e-06, "loss": 1.778, "step": 3343 }, { "epoch": 0.9509173741239477, "grad_norm": 1.5183541774749756, "learning_rate": 1.999425452456191e-06, "loss": 1.6503, "step": 3344 }, { "epoch": 0.9512017393674058, "grad_norm": 1.5476274490356445, "learning_rate": 1.987934501580006e-06, "loss": 2.781, "step": 3345 }, { "epoch": 0.9514861046108639, "grad_norm": 1.5080674886703491, "learning_rate": 1.976443550703821e-06, "loss": 2.0026, "step": 3346 }, { "epoch": 0.951770469854322, "grad_norm": 1.437429666519165, "learning_rate": 1.964952599827636e-06, "loss": 2.3271, "step": 3347 }, { "epoch": 0.9520548350977802, "grad_norm": 1.526536226272583, "learning_rate": 1.953461648951451e-06, "loss": 2.0797, "step": 3348 }, { "epoch": 0.9523392003412383, "grad_norm": 1.5311537981033325, "learning_rate": 1.941970698075266e-06, "loss": 1.9877, "step": 3349 }, { "epoch": 0.9526235655846964, "grad_norm": 1.4898496866226196, "learning_rate": 1.9304797471990806e-06, "loss": 1.8399, "step": 3350 }, { "epoch": 0.9529079308281545, "grad_norm": 1.5804930925369263, "learning_rate": 1.918988796322896e-06, "loss": 1.8401, "step": 3351 }, { "epoch": 0.9531922960716126, "grad_norm": 1.619555950164795, "learning_rate": 1.907497845446711e-06, "loss": 1.7878, "step": 3352 }, { "epoch": 0.9534766613150708, "grad_norm": 1.7056188583374023, "learning_rate": 1.896006894570526e-06, "loss": 2.5584, "step": 3353 }, { "epoch": 0.9537610265585289, "grad_norm": 1.5180672407150269, "learning_rate": 1.884515943694341e-06, "loss": 2.3481, "step": 3354 }, { "epoch": 0.954045391801987, "grad_norm": 1.4261457920074463, "learning_rate": 1.873024992818156e-06, "loss": 2.0368, "step": 3355 }, { "epoch": 0.9543297570454451, "grad_norm": 1.4864054918289185, "learning_rate": 1.8615340419419707e-06, "loss": 2.1562, "step": 3356 }, { "epoch": 0.9546141222889032, "grad_norm": 1.6229420900344849, "learning_rate": 1.8500430910657857e-06, "loss": 1.8096, "step": 3357 }, { "epoch": 0.9548984875323614, "grad_norm": 1.468389630317688, "learning_rate": 1.8385521401896008e-06, "loss": 1.7469, "step": 3358 }, { "epoch": 0.9551828527758195, "grad_norm": 2.151934862136841, "learning_rate": 1.8270611893134157e-06, "loss": 1.9153, "step": 3359 }, { "epoch": 0.9554672180192776, "grad_norm": 1.6833430528640747, "learning_rate": 1.8155702384372309e-06, "loss": 1.7895, "step": 3360 }, { "epoch": 0.9557515832627357, "grad_norm": 1.6929632425308228, "learning_rate": 1.8040792875610458e-06, "loss": 2.6418, "step": 3361 }, { "epoch": 0.9560359485061938, "grad_norm": 1.4506680965423584, "learning_rate": 1.792588336684861e-06, "loss": 2.3191, "step": 3362 }, { "epoch": 0.956320313749652, "grad_norm": 1.452488899230957, "learning_rate": 1.7810973858086758e-06, "loss": 2.2384, "step": 3363 }, { "epoch": 0.9566046789931101, "grad_norm": 1.4387120008468628, "learning_rate": 1.7696064349324907e-06, "loss": 1.9809, "step": 3364 }, { "epoch": 0.9568890442365682, "grad_norm": 1.4808820486068726, "learning_rate": 1.7581154840563059e-06, "loss": 1.9542, "step": 3365 }, { "epoch": 0.9571734094800263, "grad_norm": 1.5310330390930176, "learning_rate": 1.7466245331801208e-06, "loss": 1.9475, "step": 3366 }, { "epoch": 0.9574577747234844, "grad_norm": 1.628447413444519, "learning_rate": 1.735133582303936e-06, "loss": 1.8739, "step": 3367 }, { "epoch": 0.9577421399669426, "grad_norm": 1.5938904285430908, "learning_rate": 1.7236426314277506e-06, "loss": 1.7342, "step": 3368 }, { "epoch": 0.9580265052104007, "grad_norm": 1.7595031261444092, "learning_rate": 1.712151680551566e-06, "loss": 2.6314, "step": 3369 }, { "epoch": 0.9583108704538588, "grad_norm": 1.4485570192337036, "learning_rate": 1.7006607296753807e-06, "loss": 2.3092, "step": 3370 }, { "epoch": 0.9585952356973169, "grad_norm": 1.341435432434082, "learning_rate": 1.6891697787991956e-06, "loss": 2.1334, "step": 3371 }, { "epoch": 0.9588796009407751, "grad_norm": 1.397361159324646, "learning_rate": 1.6776788279230107e-06, "loss": 2.1447, "step": 3372 }, { "epoch": 0.9591639661842332, "grad_norm": 1.4863638877868652, "learning_rate": 1.6661878770468257e-06, "loss": 2.0255, "step": 3373 }, { "epoch": 0.9594483314276913, "grad_norm": 1.468654751777649, "learning_rate": 1.6546969261706408e-06, "loss": 1.7851, "step": 3374 }, { "epoch": 0.9597326966711494, "grad_norm": 1.470919132232666, "learning_rate": 1.6432059752944557e-06, "loss": 1.7603, "step": 3375 }, { "epoch": 0.9600170619146075, "grad_norm": 1.625359296798706, "learning_rate": 1.6317150244182709e-06, "loss": 1.7318, "step": 3376 }, { "epoch": 0.9603014271580657, "grad_norm": 1.6317858695983887, "learning_rate": 1.6202240735420858e-06, "loss": 2.6952, "step": 3377 }, { "epoch": 0.9605857924015238, "grad_norm": 1.432524561882019, "learning_rate": 1.608733122665901e-06, "loss": 2.3517, "step": 3378 }, { "epoch": 0.9608701576449818, "grad_norm": 1.5162678956985474, "learning_rate": 1.5972421717897158e-06, "loss": 2.374, "step": 3379 }, { "epoch": 0.9611545228884399, "grad_norm": 1.4993191957473755, "learning_rate": 1.5857512209135305e-06, "loss": 2.2357, "step": 3380 }, { "epoch": 0.961438888131898, "grad_norm": 1.4974647760391235, "learning_rate": 1.5742602700373457e-06, "loss": 1.9876, "step": 3381 }, { "epoch": 0.9617232533753562, "grad_norm": 1.4593394994735718, "learning_rate": 1.5627693191611606e-06, "loss": 1.7961, "step": 3382 }, { "epoch": 0.9620076186188143, "grad_norm": 1.453155517578125, "learning_rate": 1.5512783682849757e-06, "loss": 1.6868, "step": 3383 }, { "epoch": 0.9622919838622724, "grad_norm": 1.6229658126831055, "learning_rate": 1.5397874174087906e-06, "loss": 1.8026, "step": 3384 }, { "epoch": 0.9625763491057305, "grad_norm": 1.7281709909439087, "learning_rate": 1.5282964665326058e-06, "loss": 2.781, "step": 3385 }, { "epoch": 0.9628607143491886, "grad_norm": 1.487370491027832, "learning_rate": 1.5168055156564207e-06, "loss": 2.3773, "step": 3386 }, { "epoch": 0.9631450795926468, "grad_norm": 1.5010656118392944, "learning_rate": 1.5053145647802356e-06, "loss": 2.1523, "step": 3387 }, { "epoch": 0.9634294448361049, "grad_norm": 1.5002316236495972, "learning_rate": 1.4938236139040508e-06, "loss": 2.0299, "step": 3388 }, { "epoch": 0.963713810079563, "grad_norm": 1.5855457782745361, "learning_rate": 1.4823326630278657e-06, "loss": 2.1472, "step": 3389 }, { "epoch": 0.9639981753230211, "grad_norm": 1.4458882808685303, "learning_rate": 1.4708417121516808e-06, "loss": 1.7629, "step": 3390 }, { "epoch": 0.9642825405664792, "grad_norm": 1.5790014266967773, "learning_rate": 1.4593507612754957e-06, "loss": 1.6007, "step": 3391 }, { "epoch": 0.9645669058099374, "grad_norm": 1.5922648906707764, "learning_rate": 1.4478598103993109e-06, "loss": 1.9121, "step": 3392 }, { "epoch": 0.9648512710533955, "grad_norm": 1.733428955078125, "learning_rate": 1.4363688595231256e-06, "loss": 2.5225, "step": 3393 }, { "epoch": 0.9651356362968536, "grad_norm": 1.4528639316558838, "learning_rate": 1.4248779086469405e-06, "loss": 2.1728, "step": 3394 }, { "epoch": 0.9654200015403117, "grad_norm": 1.411819338798523, "learning_rate": 1.4133869577707556e-06, "loss": 2.1298, "step": 3395 }, { "epoch": 0.9657043667837698, "grad_norm": 1.5605361461639404, "learning_rate": 1.4018960068945705e-06, "loss": 2.3136, "step": 3396 }, { "epoch": 0.965988732027228, "grad_norm": 1.5014349222183228, "learning_rate": 1.3904050560183857e-06, "loss": 2.0917, "step": 3397 }, { "epoch": 0.9662730972706861, "grad_norm": 1.4370369911193848, "learning_rate": 1.3789141051422006e-06, "loss": 1.8056, "step": 3398 }, { "epoch": 0.9665574625141442, "grad_norm": 1.5899243354797363, "learning_rate": 1.3674231542660157e-06, "loss": 1.5606, "step": 3399 }, { "epoch": 0.9668418277576023, "grad_norm": 1.4517974853515625, "learning_rate": 1.3559322033898307e-06, "loss": 1.5532, "step": 3400 }, { "epoch": 0.9671261930010604, "grad_norm": 1.7034136056900024, "learning_rate": 1.3444412525136456e-06, "loss": 2.5908, "step": 3401 }, { "epoch": 0.9674105582445186, "grad_norm": 1.504889726638794, "learning_rate": 1.3329503016374607e-06, "loss": 2.4146, "step": 3402 }, { "epoch": 0.9676949234879767, "grad_norm": 1.5229851007461548, "learning_rate": 1.3214593507612754e-06, "loss": 2.2832, "step": 3403 }, { "epoch": 0.9679792887314348, "grad_norm": 1.3611642122268677, "learning_rate": 1.3099683998850908e-06, "loss": 2.2432, "step": 3404 }, { "epoch": 0.9682636539748929, "grad_norm": 1.4854189157485962, "learning_rate": 1.2984774490089055e-06, "loss": 2.1156, "step": 3405 }, { "epoch": 0.968548019218351, "grad_norm": 1.5415984392166138, "learning_rate": 1.2869864981327206e-06, "loss": 1.859, "step": 3406 }, { "epoch": 0.9688323844618092, "grad_norm": 1.4857087135314941, "learning_rate": 1.2754955472565355e-06, "loss": 1.6399, "step": 3407 }, { "epoch": 0.9691167497052673, "grad_norm": 1.531732201576233, "learning_rate": 1.2640045963803504e-06, "loss": 1.6132, "step": 3408 }, { "epoch": 0.9694011149487254, "grad_norm": 1.7313965559005737, "learning_rate": 1.2525136455041656e-06, "loss": 2.5503, "step": 3409 }, { "epoch": 0.9696854801921835, "grad_norm": 1.430211067199707, "learning_rate": 1.2410226946279807e-06, "loss": 2.2446, "step": 3410 }, { "epoch": 0.9699698454356416, "grad_norm": 1.432405948638916, "learning_rate": 1.2295317437517956e-06, "loss": 2.2009, "step": 3411 }, { "epoch": 0.9702542106790998, "grad_norm": 1.5122909545898438, "learning_rate": 1.2180407928756105e-06, "loss": 2.2101, "step": 3412 }, { "epoch": 0.9705385759225579, "grad_norm": 1.592177391052246, "learning_rate": 1.2065498419994255e-06, "loss": 2.1943, "step": 3413 }, { "epoch": 0.970822941166016, "grad_norm": 1.4828754663467407, "learning_rate": 1.1950588911232406e-06, "loss": 2.0317, "step": 3414 }, { "epoch": 0.9711073064094741, "grad_norm": 1.4396264553070068, "learning_rate": 1.1835679402470555e-06, "loss": 1.7245, "step": 3415 }, { "epoch": 0.9713916716529322, "grad_norm": 1.5686695575714111, "learning_rate": 1.1720769893708707e-06, "loss": 1.8843, "step": 3416 }, { "epoch": 0.9716760368963904, "grad_norm": 1.7028121948242188, "learning_rate": 1.1605860384946856e-06, "loss": 2.6639, "step": 3417 }, { "epoch": 0.9719604021398485, "grad_norm": 1.5416810512542725, "learning_rate": 1.1490950876185005e-06, "loss": 2.38, "step": 3418 }, { "epoch": 0.9722447673833066, "grad_norm": 1.492241621017456, "learning_rate": 1.1376041367423154e-06, "loss": 2.2302, "step": 3419 }, { "epoch": 0.9725291326267647, "grad_norm": 1.4459635019302368, "learning_rate": 1.1261131858661306e-06, "loss": 2.0947, "step": 3420 }, { "epoch": 0.9728134978702229, "grad_norm": 1.649179458618164, "learning_rate": 1.1146222349899455e-06, "loss": 2.1094, "step": 3421 }, { "epoch": 0.973097863113681, "grad_norm": 1.3947135210037231, "learning_rate": 1.1031312841137604e-06, "loss": 1.8, "step": 3422 }, { "epoch": 0.9733822283571391, "grad_norm": 1.423638105392456, "learning_rate": 1.0916403332375755e-06, "loss": 1.7245, "step": 3423 }, { "epoch": 0.9736665936005972, "grad_norm": 1.5806896686553955, "learning_rate": 1.0801493823613904e-06, "loss": 1.8189, "step": 3424 }, { "epoch": 0.9739509588440552, "grad_norm": 1.8673456907272339, "learning_rate": 1.0686584314852056e-06, "loss": 2.8048, "step": 3425 }, { "epoch": 0.9742353240875135, "grad_norm": 1.4717233180999756, "learning_rate": 1.0571674806090205e-06, "loss": 2.0696, "step": 3426 }, { "epoch": 0.9745196893309716, "grad_norm": 1.3976044654846191, "learning_rate": 1.0456765297328354e-06, "loss": 2.1881, "step": 3427 }, { "epoch": 0.9748040545744296, "grad_norm": 1.4841516017913818, "learning_rate": 1.0341855788566503e-06, "loss": 2.1457, "step": 3428 }, { "epoch": 0.9750884198178877, "grad_norm": 1.4041268825531006, "learning_rate": 1.0226946279804655e-06, "loss": 1.9074, "step": 3429 }, { "epoch": 0.9753727850613458, "grad_norm": 1.5698847770690918, "learning_rate": 1.0112036771042804e-06, "loss": 1.8711, "step": 3430 }, { "epoch": 0.975657150304804, "grad_norm": 1.568758249282837, "learning_rate": 9.997127262280955e-07, "loss": 1.7552, "step": 3431 }, { "epoch": 0.9759415155482621, "grad_norm": 1.540923833847046, "learning_rate": 9.882217753519104e-07, "loss": 1.724, "step": 3432 }, { "epoch": 0.9762258807917202, "grad_norm": 1.6241703033447266, "learning_rate": 9.767308244757256e-07, "loss": 2.583, "step": 3433 }, { "epoch": 0.9765102460351783, "grad_norm": 1.4737218618392944, "learning_rate": 9.652398735995403e-07, "loss": 2.2108, "step": 3434 }, { "epoch": 0.9767946112786364, "grad_norm": 1.477249026298523, "learning_rate": 9.537489227233554e-07, "loss": 2.4118, "step": 3435 }, { "epoch": 0.9770789765220946, "grad_norm": 1.4675695896148682, "learning_rate": 9.422579718471705e-07, "loss": 2.2902, "step": 3436 }, { "epoch": 0.9773633417655527, "grad_norm": 1.5511778593063354, "learning_rate": 9.307670209709854e-07, "loss": 2.0277, "step": 3437 }, { "epoch": 0.9776477070090108, "grad_norm": 1.3725906610488892, "learning_rate": 9.192760700948004e-07, "loss": 1.8596, "step": 3438 }, { "epoch": 0.9779320722524689, "grad_norm": 1.5125075578689575, "learning_rate": 9.077851192186154e-07, "loss": 1.551, "step": 3439 }, { "epoch": 0.978216437495927, "grad_norm": 1.536952257156372, "learning_rate": 8.962941683424305e-07, "loss": 1.6953, "step": 3440 }, { "epoch": 0.9785008027393852, "grad_norm": 1.8010717630386353, "learning_rate": 8.848032174662454e-07, "loss": 2.8914, "step": 3441 }, { "epoch": 0.9787851679828433, "grad_norm": 1.4743620157241821, "learning_rate": 8.733122665900604e-07, "loss": 2.3008, "step": 3442 }, { "epoch": 0.9790695332263014, "grad_norm": 1.5010536909103394, "learning_rate": 8.618213157138753e-07, "loss": 1.9155, "step": 3443 }, { "epoch": 0.9793538984697595, "grad_norm": 1.4484606981277466, "learning_rate": 8.503303648376903e-07, "loss": 2.0419, "step": 3444 }, { "epoch": 0.9796382637132176, "grad_norm": 1.5516077280044556, "learning_rate": 8.388394139615054e-07, "loss": 2.034, "step": 3445 }, { "epoch": 0.9799226289566758, "grad_norm": 1.445167899131775, "learning_rate": 8.273484630853204e-07, "loss": 1.7347, "step": 3446 }, { "epoch": 0.9802069942001339, "grad_norm": 1.5177215337753296, "learning_rate": 8.158575122091354e-07, "loss": 1.7245, "step": 3447 }, { "epoch": 0.980491359443592, "grad_norm": 1.4989851713180542, "learning_rate": 8.043665613329505e-07, "loss": 1.7891, "step": 3448 }, { "epoch": 0.9807757246870501, "grad_norm": 1.808464527130127, "learning_rate": 7.928756104567653e-07, "loss": 2.9027, "step": 3449 }, { "epoch": 0.9810600899305082, "grad_norm": 1.4827134609222412, "learning_rate": 7.813846595805803e-07, "loss": 2.3215, "step": 3450 }, { "epoch": 0.9813444551739664, "grad_norm": 1.4429081678390503, "learning_rate": 7.698937087043953e-07, "loss": 2.2759, "step": 3451 }, { "epoch": 0.9816288204174245, "grad_norm": 1.4872095584869385, "learning_rate": 7.584027578282104e-07, "loss": 2.0688, "step": 3452 }, { "epoch": 0.9819131856608826, "grad_norm": 1.5652166604995728, "learning_rate": 7.469118069520254e-07, "loss": 2.0771, "step": 3453 }, { "epoch": 0.9821975509043407, "grad_norm": 1.380708932876587, "learning_rate": 7.354208560758404e-07, "loss": 1.616, "step": 3454 }, { "epoch": 0.9824819161477988, "grad_norm": 1.489150047302246, "learning_rate": 7.239299051996554e-07, "loss": 1.7372, "step": 3455 }, { "epoch": 0.982766281391257, "grad_norm": 1.7191890478134155, "learning_rate": 7.124389543234702e-07, "loss": 1.7137, "step": 3456 }, { "epoch": 0.9830506466347151, "grad_norm": 1.6472156047821045, "learning_rate": 7.009480034472853e-07, "loss": 2.8431, "step": 3457 }, { "epoch": 0.9833350118781732, "grad_norm": 1.4679713249206543, "learning_rate": 6.894570525711003e-07, "loss": 2.2365, "step": 3458 }, { "epoch": 0.9836193771216313, "grad_norm": 1.43304443359375, "learning_rate": 6.779661016949153e-07, "loss": 2.2726, "step": 3459 }, { "epoch": 0.9839037423650894, "grad_norm": 1.4966747760772705, "learning_rate": 6.664751508187304e-07, "loss": 1.9848, "step": 3460 }, { "epoch": 0.9841881076085476, "grad_norm": 1.5169103145599365, "learning_rate": 6.549841999425454e-07, "loss": 2.0286, "step": 3461 }, { "epoch": 0.9844724728520057, "grad_norm": 1.6073898077011108, "learning_rate": 6.434932490663603e-07, "loss": 1.8712, "step": 3462 }, { "epoch": 0.9847568380954638, "grad_norm": 1.4505630731582642, "learning_rate": 6.320022981901752e-07, "loss": 1.6837, "step": 3463 }, { "epoch": 0.9850412033389219, "grad_norm": 1.5838265419006348, "learning_rate": 6.205113473139904e-07, "loss": 1.7908, "step": 3464 }, { "epoch": 0.9853255685823801, "grad_norm": 1.5671101808547974, "learning_rate": 6.090203964378053e-07, "loss": 2.6081, "step": 3465 }, { "epoch": 0.9856099338258382, "grad_norm": 1.4666972160339355, "learning_rate": 5.975294455616203e-07, "loss": 2.4855, "step": 3466 }, { "epoch": 0.9858942990692963, "grad_norm": 1.4629732370376587, "learning_rate": 5.860384946854353e-07, "loss": 2.1884, "step": 3467 }, { "epoch": 0.9861786643127544, "grad_norm": 1.4864407777786255, "learning_rate": 5.745475438092502e-07, "loss": 2.1033, "step": 3468 }, { "epoch": 0.9864630295562125, "grad_norm": 1.498170256614685, "learning_rate": 5.630565929330653e-07, "loss": 2.0259, "step": 3469 }, { "epoch": 0.9867473947996707, "grad_norm": 1.402068018913269, "learning_rate": 5.515656420568802e-07, "loss": 1.8348, "step": 3470 }, { "epoch": 0.9870317600431288, "grad_norm": 1.5257517099380493, "learning_rate": 5.400746911806952e-07, "loss": 1.8354, "step": 3471 }, { "epoch": 0.9873161252865869, "grad_norm": 1.5966296195983887, "learning_rate": 5.285837403045103e-07, "loss": 1.7858, "step": 3472 }, { "epoch": 0.987600490530045, "grad_norm": 1.67741060256958, "learning_rate": 5.170927894283252e-07, "loss": 2.6827, "step": 3473 }, { "epoch": 0.987884855773503, "grad_norm": 1.4048272371292114, "learning_rate": 5.056018385521402e-07, "loss": 2.5054, "step": 3474 }, { "epoch": 0.9881692210169613, "grad_norm": 1.3851925134658813, "learning_rate": 4.941108876759552e-07, "loss": 2.1778, "step": 3475 }, { "epoch": 0.9884535862604193, "grad_norm": 1.344907283782959, "learning_rate": 4.826199367997701e-07, "loss": 2.2011, "step": 3476 }, { "epoch": 0.9887379515038774, "grad_norm": 1.4560372829437256, "learning_rate": 4.711289859235852e-07, "loss": 2.0771, "step": 3477 }, { "epoch": 0.9890223167473355, "grad_norm": 1.4100260734558105, "learning_rate": 4.596380350474002e-07, "loss": 1.7795, "step": 3478 }, { "epoch": 0.9893066819907936, "grad_norm": 1.4562536478042603, "learning_rate": 4.481470841712152e-07, "loss": 1.7464, "step": 3479 }, { "epoch": 0.9895910472342518, "grad_norm": 1.5763298273086548, "learning_rate": 4.366561332950302e-07, "loss": 1.6895, "step": 3480 }, { "epoch": 0.9898754124777099, "grad_norm": 1.698847770690918, "learning_rate": 4.251651824188452e-07, "loss": 2.9543, "step": 3481 }, { "epoch": 0.990159777721168, "grad_norm": 1.475989818572998, "learning_rate": 4.136742315426602e-07, "loss": 2.3288, "step": 3482 }, { "epoch": 0.9904441429646261, "grad_norm": 1.4876697063446045, "learning_rate": 4.0218328066647523e-07, "loss": 2.1684, "step": 3483 }, { "epoch": 0.9907285082080842, "grad_norm": 1.5078527927398682, "learning_rate": 3.9069232979029015e-07, "loss": 2.2732, "step": 3484 }, { "epoch": 0.9910128734515424, "grad_norm": 1.5910695791244507, "learning_rate": 3.792013789141052e-07, "loss": 2.215, "step": 3485 }, { "epoch": 0.9912972386950005, "grad_norm": 1.6093436479568481, "learning_rate": 3.677104280379202e-07, "loss": 1.9322, "step": 3486 }, { "epoch": 0.9915816039384586, "grad_norm": 1.5375397205352783, "learning_rate": 3.562194771617351e-07, "loss": 1.7414, "step": 3487 }, { "epoch": 0.9918659691819167, "grad_norm": 1.5303829908370972, "learning_rate": 3.4472852628555015e-07, "loss": 1.4854, "step": 3488 }, { "epoch": 0.9921503344253748, "grad_norm": 1.6482408046722412, "learning_rate": 3.332375754093652e-07, "loss": 2.4752, "step": 3489 }, { "epoch": 0.992434699668833, "grad_norm": 1.387835144996643, "learning_rate": 3.2174662453318015e-07, "loss": 2.0685, "step": 3490 }, { "epoch": 0.9927190649122911, "grad_norm": 1.428237795829773, "learning_rate": 3.102556736569952e-07, "loss": 2.1544, "step": 3491 }, { "epoch": 0.9930034301557492, "grad_norm": 1.4636019468307495, "learning_rate": 2.9876472278081015e-07, "loss": 2.2585, "step": 3492 }, { "epoch": 0.9932877953992073, "grad_norm": 1.5067780017852783, "learning_rate": 2.872737719046251e-07, "loss": 1.8351, "step": 3493 }, { "epoch": 0.9935721606426654, "grad_norm": 1.483904480934143, "learning_rate": 2.757828210284401e-07, "loss": 1.6935, "step": 3494 }, { "epoch": 0.9938565258861236, "grad_norm": 1.4138548374176025, "learning_rate": 2.642918701522551e-07, "loss": 1.8169, "step": 3495 }, { "epoch": 0.9941408911295817, "grad_norm": 1.464961290359497, "learning_rate": 2.528009192760701e-07, "loss": 1.7828, "step": 3496 }, { "epoch": 0.9944252563730398, "grad_norm": 1.8348134756088257, "learning_rate": 2.4130996839988507e-07, "loss": 2.8307, "step": 3497 }, { "epoch": 0.9947096216164979, "grad_norm": 1.4368764162063599, "learning_rate": 2.298190175237001e-07, "loss": 2.3679, "step": 3498 }, { "epoch": 0.994993986859956, "grad_norm": 1.479984998703003, "learning_rate": 2.183280666475151e-07, "loss": 2.2969, "step": 3499 }, { "epoch": 0.9952783521034142, "grad_norm": 1.4134024381637573, "learning_rate": 2.068371157713301e-07, "loss": 2.2069, "step": 3500 }, { "epoch": 0.9955627173468723, "grad_norm": 1.640987515449524, "learning_rate": 1.9534616489514507e-07, "loss": 1.9712, "step": 3501 }, { "epoch": 0.9958470825903304, "grad_norm": 1.4915003776550293, "learning_rate": 1.838552140189601e-07, "loss": 1.6915, "step": 3502 }, { "epoch": 0.9961314478337885, "grad_norm": 1.5747153759002686, "learning_rate": 1.7236426314277507e-07, "loss": 1.5998, "step": 3503 }, { "epoch": 0.9964158130772466, "grad_norm": 1.5918612480163574, "learning_rate": 1.6087331226659008e-07, "loss": 1.7691, "step": 3504 }, { "epoch": 0.9967001783207048, "grad_norm": 1.6233141422271729, "learning_rate": 1.4938236139040508e-07, "loss": 2.538, "step": 3505 }, { "epoch": 0.9969845435641629, "grad_norm": 1.5444676876068115, "learning_rate": 1.3789141051422005e-07, "loss": 2.3451, "step": 3506 }, { "epoch": 0.997268908807621, "grad_norm": 1.4213060140609741, "learning_rate": 1.2640045963803505e-07, "loss": 2.1811, "step": 3507 }, { "epoch": 0.9975532740510791, "grad_norm": 1.5667362213134766, "learning_rate": 1.1490950876185005e-07, "loss": 2.0772, "step": 3508 }, { "epoch": 0.9978376392945372, "grad_norm": 1.6838546991348267, "learning_rate": 1.0341855788566505e-07, "loss": 1.8718, "step": 3509 }, { "epoch": 0.9981220045379954, "grad_norm": 1.4781723022460938, "learning_rate": 9.192760700948005e-08, "loss": 1.916, "step": 3510 }, { "epoch": 0.9984063697814535, "grad_norm": 1.4474114179611206, "learning_rate": 8.043665613329504e-08, "loss": 1.7148, "step": 3511 }, { "epoch": 0.9986907350249116, "grad_norm": 1.7419354915618896, "learning_rate": 6.894570525711002e-08, "loss": 1.6992, "step": 3512 }, { "epoch": 0.9989751002683697, "grad_norm": 1.6991440057754517, "learning_rate": 5.7454754380925025e-08, "loss": 2.6591, "step": 3513 }, { "epoch": 0.9992594655118279, "grad_norm": 1.4011856317520142, "learning_rate": 4.5963803504740025e-08, "loss": 2.2933, "step": 3514 }, { "epoch": 0.999543830755286, "grad_norm": 1.5167193412780762, "learning_rate": 3.447285262855501e-08, "loss": 2.3063, "step": 3515 }, { "epoch": 0.9998281959987441, "grad_norm": 1.423835039138794, "learning_rate": 2.2981901752370013e-08, "loss": 2.2698, "step": 3516 }, { "epoch": 1.0, "grad_norm": 1.7970972061157227, "learning_rate": 1.1490950876185006e-08, "loss": 2.0559, "step": 3517 } ], "logging_steps": 1, "max_steps": 3517, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.433231803074478e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }