diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24653 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002843652434581183, + "grad_norm": 49.12633514404297, + "learning_rate": 0.0, + "loss": 13.2624, + "step": 1 + }, + { + "epoch": 0.0005687304869162366, + "grad_norm": 52.63550567626953, + "learning_rate": 1.111111111111111e-06, + "loss": 12.8107, + "step": 2 + }, + { + "epoch": 0.000853095730374355, + "grad_norm": 51.16219711303711, + "learning_rate": 2.222222222222222e-06, + "loss": 12.7371, + "step": 3 + }, + { + "epoch": 0.0011374609738324733, + "grad_norm": 48.81122970581055, + "learning_rate": 3.3333333333333333e-06, + "loss": 11.5423, + "step": 4 + }, + { + "epoch": 0.0014218262172905917, + "grad_norm": 49.11295700073242, + "learning_rate": 4.444444444444444e-06, + "loss": 11.0671, + "step": 5 + }, + { + "epoch": 0.00170619146074871, + "grad_norm": 43.88680648803711, + "learning_rate": 5.555555555555557e-06, + "loss": 10.293, + "step": 6 + }, + { + "epoch": 0.0019905567042068284, + "grad_norm": 41.188690185546875, + "learning_rate": 6.666666666666667e-06, + "loss": 9.5132, + "step": 7 + }, + { + "epoch": 0.0022749219476649466, + "grad_norm": 34.85954284667969, + "learning_rate": 7.77777777777778e-06, + "loss": 8.9321, + "step": 8 + }, + { + "epoch": 0.002559287191123065, + "grad_norm": 40.25858688354492, + "learning_rate": 8.888888888888888e-06, + "loss": 12.101, + "step": 9 + }, + { + "epoch": 0.0028436524345811834, + "grad_norm": 37.3657112121582, + "learning_rate": 1e-05, + "loss": 10.7739, + "step": 10 + }, + { + "epoch": 0.0031280176780393017, + "grad_norm": 31.502487182617188, + "learning_rate": 1.1111111111111113e-05, + "loss": 10.0982, + "step": 11 + }, + { + "epoch": 0.00341238292149742, + "grad_norm": 33.45932388305664, + "learning_rate": 1.2222222222222224e-05, + "loss": 9.332, + "step": 12 + }, + { + "epoch": 0.0036967481649555385, + "grad_norm": 27.317989349365234, + "learning_rate": 1.3333333333333333e-05, + "loss": 8.051, + "step": 13 + }, + { + "epoch": 0.003981113408413657, + "grad_norm": 19.718740463256836, + "learning_rate": 1.4444444444444446e-05, + "loss": 7.3297, + "step": 14 + }, + { + "epoch": 0.004265478651871775, + "grad_norm": 18.006954193115234, + "learning_rate": 1.555555555555556e-05, + "loss": 6.4547, + "step": 15 + }, + { + "epoch": 0.004549843895329893, + "grad_norm": 14.854341506958008, + "learning_rate": 1.6666666666666667e-05, + "loss": 6.0132, + "step": 16 + }, + { + "epoch": 0.004834209138788011, + "grad_norm": 24.93276596069336, + "learning_rate": 1.7777777777777777e-05, + "loss": 9.1995, + "step": 17 + }, + { + "epoch": 0.00511857438224613, + "grad_norm": 22.159881591796875, + "learning_rate": 1.888888888888889e-05, + "loss": 8.4067, + "step": 18 + }, + { + "epoch": 0.005402939625704249, + "grad_norm": 17.916210174560547, + "learning_rate": 2e-05, + "loss": 7.7151, + "step": 19 + }, + { + "epoch": 0.005687304869162367, + "grad_norm": 15.477251052856445, + "learning_rate": 2.1111111111111114e-05, + "loss": 7.1676, + "step": 20 + }, + { + "epoch": 0.005971670112620485, + "grad_norm": 15.920812606811523, + "learning_rate": 2.2222222222222227e-05, + "loss": 6.4084, + "step": 21 + }, + { + "epoch": 0.006256035356078603, + "grad_norm": 15.113493919372559, + "learning_rate": 2.3333333333333336e-05, + "loss": 6.1668, + "step": 22 + }, + { + "epoch": 0.0065404005995367215, + "grad_norm": 11.510760307312012, + "learning_rate": 2.444444444444445e-05, + "loss": 5.4598, + "step": 23 + }, + { + "epoch": 0.00682476584299484, + "grad_norm": 7.77994966506958, + "learning_rate": 2.5555555555555554e-05, + "loss": 5.2792, + "step": 24 + }, + { + "epoch": 0.007109131086452958, + "grad_norm": 12.673455238342285, + "learning_rate": 2.6666666666666667e-05, + "loss": 7.6026, + "step": 25 + }, + { + "epoch": 0.007393496329911077, + "grad_norm": 13.792389869689941, + "learning_rate": 2.777777777777778e-05, + "loss": 7.2058, + "step": 26 + }, + { + "epoch": 0.007677861573369195, + "grad_norm": 10.524365425109863, + "learning_rate": 2.888888888888889e-05, + "loss": 6.3227, + "step": 27 + }, + { + "epoch": 0.007962226816827313, + "grad_norm": 105.05674743652344, + "learning_rate": 3.0000000000000004e-05, + "loss": 6.1134, + "step": 28 + }, + { + "epoch": 0.008246592060285432, + "grad_norm": 13.615738868713379, + "learning_rate": 3.111111111111112e-05, + "loss": 5.7431, + "step": 29 + }, + { + "epoch": 0.00853095730374355, + "grad_norm": 17.219816207885742, + "learning_rate": 3.222222222222223e-05, + "loss": 5.403, + "step": 30 + }, + { + "epoch": 0.008815322547201668, + "grad_norm": 9.333234786987305, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.8142, + "step": 31 + }, + { + "epoch": 0.009099687790659786, + "grad_norm": 86.27250671386719, + "learning_rate": 3.444444444444445e-05, + "loss": 4.7489, + "step": 32 + }, + { + "epoch": 0.009384053034117905, + "grad_norm": 18.35126304626465, + "learning_rate": 3.555555555555555e-05, + "loss": 6.8241, + "step": 33 + }, + { + "epoch": 0.009668418277576023, + "grad_norm": 22.33354377746582, + "learning_rate": 3.6666666666666666e-05, + "loss": 6.552, + "step": 34 + }, + { + "epoch": 0.009952783521034141, + "grad_norm": 21.162761688232422, + "learning_rate": 3.777777777777778e-05, + "loss": 6.3146, + "step": 35 + }, + { + "epoch": 0.01023714876449226, + "grad_norm": 18.522459030151367, + "learning_rate": 3.888888888888889e-05, + "loss": 5.7251, + "step": 36 + }, + { + "epoch": 0.010521514007950379, + "grad_norm": 19.82053565979004, + "learning_rate": 4e-05, + "loss": 5.5137, + "step": 37 + }, + { + "epoch": 0.010805879251408497, + "grad_norm": 16.579193115234375, + "learning_rate": 3.998850904912382e-05, + "loss": 5.1559, + "step": 38 + }, + { + "epoch": 0.011090244494866616, + "grad_norm": 6.53575325012207, + "learning_rate": 3.997701809824763e-05, + "loss": 4.8303, + "step": 39 + }, + { + "epoch": 0.011374609738324734, + "grad_norm": 16.461362838745117, + "learning_rate": 3.996552714737145e-05, + "loss": 4.6216, + "step": 40 + }, + { + "epoch": 0.011658974981782852, + "grad_norm": 24.829219818115234, + "learning_rate": 3.995403619649526e-05, + "loss": 6.8729, + "step": 41 + }, + { + "epoch": 0.01194334022524097, + "grad_norm": 23.324426651000977, + "learning_rate": 3.994254524561908e-05, + "loss": 6.3201, + "step": 42 + }, + { + "epoch": 0.012227705468699088, + "grad_norm": 20.28281021118164, + "learning_rate": 3.993105429474289e-05, + "loss": 5.8256, + "step": 43 + }, + { + "epoch": 0.012512070712157207, + "grad_norm": 15.08171272277832, + "learning_rate": 3.991956334386671e-05, + "loss": 5.3353, + "step": 44 + }, + { + "epoch": 0.012796435955615325, + "grad_norm": 9.521646499633789, + "learning_rate": 3.990807239299052e-05, + "loss": 5.1606, + "step": 45 + }, + { + "epoch": 0.013080801199073443, + "grad_norm": 14.245501518249512, + "learning_rate": 3.989658144211434e-05, + "loss": 4.9048, + "step": 46 + }, + { + "epoch": 0.013365166442531561, + "grad_norm": 16.89498519897461, + "learning_rate": 3.9885090491238155e-05, + "loss": 4.4536, + "step": 47 + }, + { + "epoch": 0.01364953168598968, + "grad_norm": 11.991838455200195, + "learning_rate": 3.9873599540361966e-05, + "loss": 4.1927, + "step": 48 + }, + { + "epoch": 0.013933896929447798, + "grad_norm": 14.163195610046387, + "learning_rate": 3.9862108589485784e-05, + "loss": 6.2989, + "step": 49 + }, + { + "epoch": 0.014218262172905916, + "grad_norm": 11.865116119384766, + "learning_rate": 3.9850617638609595e-05, + "loss": 5.6554, + "step": 50 + }, + { + "epoch": 0.014502627416364034, + "grad_norm": 8.582511901855469, + "learning_rate": 3.983912668773341e-05, + "loss": 5.5022, + "step": 51 + }, + { + "epoch": 0.014786992659822154, + "grad_norm": 9.730727195739746, + "learning_rate": 3.9827635736857224e-05, + "loss": 4.9733, + "step": 52 + }, + { + "epoch": 0.015071357903280272, + "grad_norm": 11.268597602844238, + "learning_rate": 3.981614478598104e-05, + "loss": 4.6759, + "step": 53 + }, + { + "epoch": 0.01535572314673839, + "grad_norm": 13.982292175292969, + "learning_rate": 3.980465383510485e-05, + "loss": 4.7606, + "step": 54 + }, + { + "epoch": 0.015640088390196507, + "grad_norm": 8.659544944763184, + "learning_rate": 3.979316288422867e-05, + "loss": 4.394, + "step": 55 + }, + { + "epoch": 0.015924453633654627, + "grad_norm": 5.434732437133789, + "learning_rate": 3.978167193335249e-05, + "loss": 3.9735, + "step": 56 + }, + { + "epoch": 0.016208818877112743, + "grad_norm": 5.0773773193359375, + "learning_rate": 3.977018098247631e-05, + "loss": 5.9472, + "step": 57 + }, + { + "epoch": 0.016493184120570863, + "grad_norm": 6.471149444580078, + "learning_rate": 3.975869003160012e-05, + "loss": 5.2329, + "step": 58 + }, + { + "epoch": 0.016777549364028983, + "grad_norm": 8.178523063659668, + "learning_rate": 3.9747199080723936e-05, + "loss": 5.1182, + "step": 59 + }, + { + "epoch": 0.0170619146074871, + "grad_norm": 10.304596900939941, + "learning_rate": 3.973570812984775e-05, + "loss": 4.8543, + "step": 60 + }, + { + "epoch": 0.01734627985094522, + "grad_norm": 9.378915786743164, + "learning_rate": 3.9724217178971565e-05, + "loss": 4.4329, + "step": 61 + }, + { + "epoch": 0.017630645094403336, + "grad_norm": 7.112333297729492, + "learning_rate": 3.971272622809538e-05, + "loss": 4.1529, + "step": 62 + }, + { + "epoch": 0.017915010337861456, + "grad_norm": 6.022056579589844, + "learning_rate": 3.9701235277219194e-05, + "loss": 4.0711, + "step": 63 + }, + { + "epoch": 0.018199375581319573, + "grad_norm": 6.005387783050537, + "learning_rate": 3.968974432634301e-05, + "loss": 3.7158, + "step": 64 + }, + { + "epoch": 0.018483740824777693, + "grad_norm": 11.265110969543457, + "learning_rate": 3.967825337546682e-05, + "loss": 5.7308, + "step": 65 + }, + { + "epoch": 0.01876810606823581, + "grad_norm": 15.99368953704834, + "learning_rate": 3.966676242459064e-05, + "loss": 5.2962, + "step": 66 + }, + { + "epoch": 0.01905247131169393, + "grad_norm": 18.645151138305664, + "learning_rate": 3.965527147371445e-05, + "loss": 5.2347, + "step": 67 + }, + { + "epoch": 0.019336836555152045, + "grad_norm": 16.649398803710938, + "learning_rate": 3.964378052283827e-05, + "loss": 4.9301, + "step": 68 + }, + { + "epoch": 0.019621201798610165, + "grad_norm": 12.267605781555176, + "learning_rate": 3.963228957196208e-05, + "loss": 4.6981, + "step": 69 + }, + { + "epoch": 0.019905567042068282, + "grad_norm": 8.383939743041992, + "learning_rate": 3.96207986210859e-05, + "loss": 4.4809, + "step": 70 + }, + { + "epoch": 0.020189932285526402, + "grad_norm": 8.573983192443848, + "learning_rate": 3.9609307670209717e-05, + "loss": 3.8532, + "step": 71 + }, + { + "epoch": 0.02047429752898452, + "grad_norm": 11.06430435180664, + "learning_rate": 3.959781671933353e-05, + "loss": 4.1749, + "step": 72 + }, + { + "epoch": 0.02075866277244264, + "grad_norm": 10.86804485321045, + "learning_rate": 3.9586325768457346e-05, + "loss": 5.4128, + "step": 73 + }, + { + "epoch": 0.021043028015900758, + "grad_norm": 10.805957794189453, + "learning_rate": 3.957483481758116e-05, + "loss": 4.873, + "step": 74 + }, + { + "epoch": 0.021327393259358875, + "grad_norm": 8.829920768737793, + "learning_rate": 3.9563343866704975e-05, + "loss": 4.7101, + "step": 75 + }, + { + "epoch": 0.021611758502816995, + "grad_norm": 7.231428146362305, + "learning_rate": 3.9551852915828786e-05, + "loss": 4.696, + "step": 76 + }, + { + "epoch": 0.02189612374627511, + "grad_norm": 5.561997413635254, + "learning_rate": 3.9540361964952603e-05, + "loss": 4.1954, + "step": 77 + }, + { + "epoch": 0.02218048898973323, + "grad_norm": 7.825724124908447, + "learning_rate": 3.9528871014076415e-05, + "loss": 3.9524, + "step": 78 + }, + { + "epoch": 0.022464854233191348, + "grad_norm": 7.717831611633301, + "learning_rate": 3.951738006320023e-05, + "loss": 3.5277, + "step": 79 + }, + { + "epoch": 0.022749219476649468, + "grad_norm": 6.9925618171691895, + "learning_rate": 3.950588911232405e-05, + "loss": 3.3675, + "step": 80 + }, + { + "epoch": 0.023033584720107584, + "grad_norm": 6.320919513702393, + "learning_rate": 3.949439816144786e-05, + "loss": 5.2571, + "step": 81 + }, + { + "epoch": 0.023317949963565704, + "grad_norm": 5.361174583435059, + "learning_rate": 3.948290721057168e-05, + "loss": 4.7185, + "step": 82 + }, + { + "epoch": 0.02360231520702382, + "grad_norm": 4.8023362159729, + "learning_rate": 3.947141625969549e-05, + "loss": 4.4968, + "step": 83 + }, + { + "epoch": 0.02388668045048194, + "grad_norm": 6.406538963317871, + "learning_rate": 3.945992530881931e-05, + "loss": 4.3588, + "step": 84 + }, + { + "epoch": 0.024171045693940057, + "grad_norm": 9.289493560791016, + "learning_rate": 3.944843435794312e-05, + "loss": 4.2662, + "step": 85 + }, + { + "epoch": 0.024455410937398177, + "grad_norm": 9.779614448547363, + "learning_rate": 3.943694340706694e-05, + "loss": 4.169, + "step": 86 + }, + { + "epoch": 0.024739776180856293, + "grad_norm": 8.45078182220459, + "learning_rate": 3.942545245619075e-05, + "loss": 3.7212, + "step": 87 + }, + { + "epoch": 0.025024141424314413, + "grad_norm": 7.9592084884643555, + "learning_rate": 3.9413961505314566e-05, + "loss": 3.7354, + "step": 88 + }, + { + "epoch": 0.025308506667772533, + "grad_norm": 6.674322128295898, + "learning_rate": 3.9402470554438384e-05, + "loss": 4.912, + "step": 89 + }, + { + "epoch": 0.02559287191123065, + "grad_norm": 5.126534461975098, + "learning_rate": 3.9390979603562195e-05, + "loss": 4.7649, + "step": 90 + }, + { + "epoch": 0.02587723715468877, + "grad_norm": 7.692286014556885, + "learning_rate": 3.937948865268601e-05, + "loss": 4.4216, + "step": 91 + }, + { + "epoch": 0.026161602398146886, + "grad_norm": 13.10142993927002, + "learning_rate": 3.9367997701809824e-05, + "loss": 4.4418, + "step": 92 + }, + { + "epoch": 0.026445967641605006, + "grad_norm": 13.905843734741211, + "learning_rate": 3.935650675093364e-05, + "loss": 4.2435, + "step": 93 + }, + { + "epoch": 0.026730332885063122, + "grad_norm": 9.100831985473633, + "learning_rate": 3.934501580005746e-05, + "loss": 3.7519, + "step": 94 + }, + { + "epoch": 0.027014698128521242, + "grad_norm": 6.695760726928711, + "learning_rate": 3.933352484918128e-05, + "loss": 3.3665, + "step": 95 + }, + { + "epoch": 0.02729906337197936, + "grad_norm": 3.8529748916625977, + "learning_rate": 3.932203389830509e-05, + "loss": 3.6497, + "step": 96 + }, + { + "epoch": 0.02758342861543748, + "grad_norm": 7.971911907196045, + "learning_rate": 3.931054294742891e-05, + "loss": 4.8833, + "step": 97 + }, + { + "epoch": 0.027867793858895595, + "grad_norm": 9.789183616638184, + "learning_rate": 3.929905199655272e-05, + "loss": 4.6882, + "step": 98 + }, + { + "epoch": 0.028152159102353715, + "grad_norm": 9.128085136413574, + "learning_rate": 3.9287561045676536e-05, + "loss": 4.4283, + "step": 99 + }, + { + "epoch": 0.028436524345811832, + "grad_norm": 7.517631530761719, + "learning_rate": 3.927607009480035e-05, + "loss": 4.0737, + "step": 100 + }, + { + "epoch": 0.02872088958926995, + "grad_norm": 8.111870765686035, + "learning_rate": 3.9264579143924165e-05, + "loss": 3.9202, + "step": 101 + }, + { + "epoch": 0.029005254832728068, + "grad_norm": 6.428286075592041, + "learning_rate": 3.9253088193047976e-05, + "loss": 3.8114, + "step": 102 + }, + { + "epoch": 0.029289620076186188, + "grad_norm": 3.650003671646118, + "learning_rate": 3.9241597242171794e-05, + "loss": 3.4783, + "step": 103 + }, + { + "epoch": 0.029573985319644308, + "grad_norm": 3.3745474815368652, + "learning_rate": 3.923010629129561e-05, + "loss": 3.2149, + "step": 104 + }, + { + "epoch": 0.029858350563102425, + "grad_norm": 6.395346164703369, + "learning_rate": 3.921861534041942e-05, + "loss": 4.6489, + "step": 105 + }, + { + "epoch": 0.030142715806560545, + "grad_norm": 4.820883750915527, + "learning_rate": 3.920712438954324e-05, + "loss": 4.141, + "step": 106 + }, + { + "epoch": 0.03042708105001866, + "grad_norm": 5.000969409942627, + "learning_rate": 3.919563343866705e-05, + "loss": 4.0283, + "step": 107 + }, + { + "epoch": 0.03071144629347678, + "grad_norm": 6.004735469818115, + "learning_rate": 3.918414248779087e-05, + "loss": 4.0384, + "step": 108 + }, + { + "epoch": 0.030995811536934897, + "grad_norm": 4.477464199066162, + "learning_rate": 3.917265153691468e-05, + "loss": 3.8376, + "step": 109 + }, + { + "epoch": 0.031280176780393014, + "grad_norm": 3.1061179637908936, + "learning_rate": 3.91611605860385e-05, + "loss": 3.7448, + "step": 110 + }, + { + "epoch": 0.03156454202385114, + "grad_norm": 4.4171600341796875, + "learning_rate": 3.914966963516231e-05, + "loss": 3.4045, + "step": 111 + }, + { + "epoch": 0.031848907267309254, + "grad_norm": 5.526491641998291, + "learning_rate": 3.913817868428613e-05, + "loss": 3.2656, + "step": 112 + }, + { + "epoch": 0.03213327251076737, + "grad_norm": 8.430256843566895, + "learning_rate": 3.9126687733409946e-05, + "loss": 4.759, + "step": 113 + }, + { + "epoch": 0.03241763775422549, + "grad_norm": 7.523831367492676, + "learning_rate": 3.911519678253376e-05, + "loss": 4.3822, + "step": 114 + }, + { + "epoch": 0.03270200299768361, + "grad_norm": 6.166808605194092, + "learning_rate": 3.9103705831657575e-05, + "loss": 3.8537, + "step": 115 + }, + { + "epoch": 0.03298636824114173, + "grad_norm": 4.141753196716309, + "learning_rate": 3.9092214880781386e-05, + "loss": 3.9163, + "step": 116 + }, + { + "epoch": 0.03327073348459984, + "grad_norm": 4.088106632232666, + "learning_rate": 3.9080723929905204e-05, + "loss": 3.7164, + "step": 117 + }, + { + "epoch": 0.03355509872805797, + "grad_norm": 7.6691107749938965, + "learning_rate": 3.9069232979029015e-05, + "loss": 3.4528, + "step": 118 + }, + { + "epoch": 0.03383946397151608, + "grad_norm": 8.339359283447266, + "learning_rate": 3.905774202815283e-05, + "loss": 3.0847, + "step": 119 + }, + { + "epoch": 0.0341238292149742, + "grad_norm": 5.894288063049316, + "learning_rate": 3.9046251077276644e-05, + "loss": 3.1057, + "step": 120 + }, + { + "epoch": 0.034408194458432316, + "grad_norm": 4.960434436798096, + "learning_rate": 3.903476012640046e-05, + "loss": 4.8553, + "step": 121 + }, + { + "epoch": 0.03469255970189044, + "grad_norm": 4.705537796020508, + "learning_rate": 3.902326917552428e-05, + "loss": 4.2162, + "step": 122 + }, + { + "epoch": 0.034976924945348556, + "grad_norm": 3.077615737915039, + "learning_rate": 3.901177822464809e-05, + "loss": 4.2706, + "step": 123 + }, + { + "epoch": 0.03526129018880667, + "grad_norm": 3.1837899684906006, + "learning_rate": 3.900028727377191e-05, + "loss": 3.9014, + "step": 124 + }, + { + "epoch": 0.03554565543226479, + "grad_norm": 6.796784400939941, + "learning_rate": 3.898879632289572e-05, + "loss": 3.7483, + "step": 125 + }, + { + "epoch": 0.03583002067572291, + "grad_norm": 6.224754810333252, + "learning_rate": 3.897730537201954e-05, + "loss": 3.6634, + "step": 126 + }, + { + "epoch": 0.03611438591918103, + "grad_norm": 4.740405559539795, + "learning_rate": 3.896581442114335e-05, + "loss": 3.378, + "step": 127 + }, + { + "epoch": 0.036398751162639145, + "grad_norm": 4.660193920135498, + "learning_rate": 3.8954323470267167e-05, + "loss": 3.2241, + "step": 128 + }, + { + "epoch": 0.03668311640609726, + "grad_norm": 6.38066291809082, + "learning_rate": 3.894283251939098e-05, + "loss": 4.5271, + "step": 129 + }, + { + "epoch": 0.036967481649555385, + "grad_norm": 4.577174186706543, + "learning_rate": 3.89313415685148e-05, + "loss": 4.356, + "step": 130 + }, + { + "epoch": 0.0372518468930135, + "grad_norm": 4.075042247772217, + "learning_rate": 3.8919850617638613e-05, + "loss": 3.8877, + "step": 131 + }, + { + "epoch": 0.03753621213647162, + "grad_norm": 5.604481220245361, + "learning_rate": 3.890835966676243e-05, + "loss": 3.8834, + "step": 132 + }, + { + "epoch": 0.03782057737992974, + "grad_norm": 5.59628438949585, + "learning_rate": 3.889686871588624e-05, + "loss": 3.7806, + "step": 133 + }, + { + "epoch": 0.03810494262338786, + "grad_norm": 5.5282206535339355, + "learning_rate": 3.888537776501006e-05, + "loss": 3.4985, + "step": 134 + }, + { + "epoch": 0.038389307866845974, + "grad_norm": 5.6480865478515625, + "learning_rate": 3.887388681413387e-05, + "loss": 3.4003, + "step": 135 + }, + { + "epoch": 0.03867367311030409, + "grad_norm": 3.6527063846588135, + "learning_rate": 3.886239586325769e-05, + "loss": 2.9528, + "step": 136 + }, + { + "epoch": 0.038958038353762214, + "grad_norm": 4.994736671447754, + "learning_rate": 3.885090491238151e-05, + "loss": 4.6074, + "step": 137 + }, + { + "epoch": 0.03924240359722033, + "grad_norm": 5.265069484710693, + "learning_rate": 3.883941396150532e-05, + "loss": 3.9333, + "step": 138 + }, + { + "epoch": 0.03952676884067845, + "grad_norm": 5.489358901977539, + "learning_rate": 3.8827923010629136e-05, + "loss": 3.9805, + "step": 139 + }, + { + "epoch": 0.039811134084136564, + "grad_norm": 5.382353782653809, + "learning_rate": 3.881643205975295e-05, + "loss": 3.9738, + "step": 140 + }, + { + "epoch": 0.04009549932759469, + "grad_norm": 6.055698394775391, + "learning_rate": 3.8804941108876765e-05, + "loss": 3.621, + "step": 141 + }, + { + "epoch": 0.040379864571052804, + "grad_norm": 5.292023181915283, + "learning_rate": 3.8793450158000576e-05, + "loss": 3.5441, + "step": 142 + }, + { + "epoch": 0.04066422981451092, + "grad_norm": 3.348857879638672, + "learning_rate": 3.8781959207124394e-05, + "loss": 2.9929, + "step": 143 + }, + { + "epoch": 0.04094859505796904, + "grad_norm": 3.1222856044769287, + "learning_rate": 3.8770468256248205e-05, + "loss": 2.8722, + "step": 144 + }, + { + "epoch": 0.04123296030142716, + "grad_norm": 8.240327835083008, + "learning_rate": 3.875897730537202e-05, + "loss": 4.6144, + "step": 145 + }, + { + "epoch": 0.04151732554488528, + "grad_norm": 7.63242769241333, + "learning_rate": 3.874748635449584e-05, + "loss": 4.1743, + "step": 146 + }, + { + "epoch": 0.04180169078834339, + "grad_norm": 6.464611530303955, + "learning_rate": 3.873599540361965e-05, + "loss": 3.898, + "step": 147 + }, + { + "epoch": 0.042086056031801516, + "grad_norm": 6.321760177612305, + "learning_rate": 3.872450445274347e-05, + "loss": 3.8015, + "step": 148 + }, + { + "epoch": 0.04237042127525963, + "grad_norm": 4.688291549682617, + "learning_rate": 3.871301350186728e-05, + "loss": 3.5913, + "step": 149 + }, + { + "epoch": 0.04265478651871775, + "grad_norm": 3.5380592346191406, + "learning_rate": 3.87015225509911e-05, + "loss": 3.2963, + "step": 150 + }, + { + "epoch": 0.042939151762175866, + "grad_norm": 3.186795473098755, + "learning_rate": 3.869003160011491e-05, + "loss": 3.1665, + "step": 151 + }, + { + "epoch": 0.04322351700563399, + "grad_norm": 3.351107597351074, + "learning_rate": 3.867854064923873e-05, + "loss": 3.0804, + "step": 152 + }, + { + "epoch": 0.043507882249092106, + "grad_norm": 6.562431812286377, + "learning_rate": 3.866704969836254e-05, + "loss": 4.5577, + "step": 153 + }, + { + "epoch": 0.04379224749255022, + "grad_norm": 5.137665748596191, + "learning_rate": 3.865555874748636e-05, + "loss": 3.9937, + "step": 154 + }, + { + "epoch": 0.04407661273600834, + "grad_norm": 4.792855262756348, + "learning_rate": 3.8644067796610175e-05, + "loss": 3.6608, + "step": 155 + }, + { + "epoch": 0.04436097797946646, + "grad_norm": 4.835788726806641, + "learning_rate": 3.8632576845733986e-05, + "loss": 3.7309, + "step": 156 + }, + { + "epoch": 0.04464534322292458, + "grad_norm": 4.597768306732178, + "learning_rate": 3.8621085894857804e-05, + "loss": 3.348, + "step": 157 + }, + { + "epoch": 0.044929708466382695, + "grad_norm": 4.271462440490723, + "learning_rate": 3.8609594943981615e-05, + "loss": 3.3393, + "step": 158 + }, + { + "epoch": 0.04521407370984081, + "grad_norm": 3.781461477279663, + "learning_rate": 3.859810399310543e-05, + "loss": 3.0811, + "step": 159 + }, + { + "epoch": 0.045498438953298935, + "grad_norm": 3.5072243213653564, + "learning_rate": 3.8586613042229244e-05, + "loss": 2.8874, + "step": 160 + }, + { + "epoch": 0.04578280419675705, + "grad_norm": 4.078586101531982, + "learning_rate": 3.857512209135306e-05, + "loss": 4.3578, + "step": 161 + }, + { + "epoch": 0.04606716944021517, + "grad_norm": 4.5630784034729, + "learning_rate": 3.856363114047687e-05, + "loss": 4.0361, + "step": 162 + }, + { + "epoch": 0.04635153468367329, + "grad_norm": 4.149646282196045, + "learning_rate": 3.855214018960069e-05, + "loss": 3.7167, + "step": 163 + }, + { + "epoch": 0.04663589992713141, + "grad_norm": 3.7590112686157227, + "learning_rate": 3.854064923872451e-05, + "loss": 3.3838, + "step": 164 + }, + { + "epoch": 0.046920265170589524, + "grad_norm": 4.184499263763428, + "learning_rate": 3.852915828784832e-05, + "loss": 3.5132, + "step": 165 + }, + { + "epoch": 0.04720463041404764, + "grad_norm": 4.422394275665283, + "learning_rate": 3.851766733697214e-05, + "loss": 3.2853, + "step": 166 + }, + { + "epoch": 0.047488995657505764, + "grad_norm": 4.050603866577148, + "learning_rate": 3.8506176386095956e-05, + "loss": 2.9437, + "step": 167 + }, + { + "epoch": 0.04777336090096388, + "grad_norm": 4.047832489013672, + "learning_rate": 3.849468543521977e-05, + "loss": 2.949, + "step": 168 + }, + { + "epoch": 0.048057726144422, + "grad_norm": 4.343886375427246, + "learning_rate": 3.8483194484343585e-05, + "loss": 4.5678, + "step": 169 + }, + { + "epoch": 0.048342091387880114, + "grad_norm": 3.9210119247436523, + "learning_rate": 3.84717035334674e-05, + "loss": 3.8641, + "step": 170 + }, + { + "epoch": 0.04862645663133824, + "grad_norm": 3.6741321086883545, + "learning_rate": 3.8460212582591214e-05, + "loss": 3.498, + "step": 171 + }, + { + "epoch": 0.048910821874796354, + "grad_norm": 3.9248976707458496, + "learning_rate": 3.844872163171503e-05, + "loss": 3.6691, + "step": 172 + }, + { + "epoch": 0.04919518711825447, + "grad_norm": 4.195116996765137, + "learning_rate": 3.843723068083884e-05, + "loss": 3.4649, + "step": 173 + }, + { + "epoch": 0.049479552361712587, + "grad_norm": 4.228196144104004, + "learning_rate": 3.842573972996266e-05, + "loss": 3.1212, + "step": 174 + }, + { + "epoch": 0.04976391760517071, + "grad_norm": 4.049994468688965, + "learning_rate": 3.841424877908647e-05, + "loss": 3.144, + "step": 175 + }, + { + "epoch": 0.050048282848628826, + "grad_norm": 3.637624502182007, + "learning_rate": 3.840275782821029e-05, + "loss": 2.8251, + "step": 176 + }, + { + "epoch": 0.05033264809208694, + "grad_norm": 3.211629867553711, + "learning_rate": 3.83912668773341e-05, + "loss": 4.115, + "step": 177 + }, + { + "epoch": 0.050617013335545066, + "grad_norm": 3.2752835750579834, + "learning_rate": 3.837977592645792e-05, + "loss": 3.8892, + "step": 178 + }, + { + "epoch": 0.05090137857900318, + "grad_norm": 3.31851863861084, + "learning_rate": 3.8368284975581736e-05, + "loss": 3.4402, + "step": 179 + }, + { + "epoch": 0.0511857438224613, + "grad_norm": 3.9213197231292725, + "learning_rate": 3.835679402470555e-05, + "loss": 3.3855, + "step": 180 + }, + { + "epoch": 0.051470109065919416, + "grad_norm": 4.550943851470947, + "learning_rate": 3.8345303073829365e-05, + "loss": 3.3385, + "step": 181 + }, + { + "epoch": 0.05175447430937754, + "grad_norm": 3.618424654006958, + "learning_rate": 3.8333812122953177e-05, + "loss": 3.0774, + "step": 182 + }, + { + "epoch": 0.052038839552835656, + "grad_norm": 2.8778865337371826, + "learning_rate": 3.8322321172076994e-05, + "loss": 2.9035, + "step": 183 + }, + { + "epoch": 0.05232320479629377, + "grad_norm": 2.7407846450805664, + "learning_rate": 3.8310830221200806e-05, + "loss": 2.8378, + "step": 184 + }, + { + "epoch": 0.05260757003975189, + "grad_norm": 3.778500556945801, + "learning_rate": 3.8299339270324623e-05, + "loss": 4.2712, + "step": 185 + }, + { + "epoch": 0.05289193528321001, + "grad_norm": 4.52739143371582, + "learning_rate": 3.8287848319448435e-05, + "loss": 3.698, + "step": 186 + }, + { + "epoch": 0.05317630052666813, + "grad_norm": 4.739940166473389, + "learning_rate": 3.827635736857225e-05, + "loss": 3.7459, + "step": 187 + }, + { + "epoch": 0.053460665770126245, + "grad_norm": 4.952041149139404, + "learning_rate": 3.826486641769607e-05, + "loss": 3.355, + "step": 188 + }, + { + "epoch": 0.05374503101358436, + "grad_norm": 3.8098363876342773, + "learning_rate": 3.825337546681988e-05, + "loss": 3.2634, + "step": 189 + }, + { + "epoch": 0.054029396257042485, + "grad_norm": 3.0846543312072754, + "learning_rate": 3.82418845159437e-05, + "loss": 3.0904, + "step": 190 + }, + { + "epoch": 0.0543137615005006, + "grad_norm": 2.909727096557617, + "learning_rate": 3.823039356506751e-05, + "loss": 2.9777, + "step": 191 + }, + { + "epoch": 0.05459812674395872, + "grad_norm": 3.1065726280212402, + "learning_rate": 3.821890261419133e-05, + "loss": 2.7359, + "step": 192 + }, + { + "epoch": 0.05488249198741684, + "grad_norm": 7.011332988739014, + "learning_rate": 3.820741166331514e-05, + "loss": 4.007, + "step": 193 + }, + { + "epoch": 0.05516685723087496, + "grad_norm": 5.220023155212402, + "learning_rate": 3.819592071243896e-05, + "loss": 3.6895, + "step": 194 + }, + { + "epoch": 0.055451222474333074, + "grad_norm": 4.016128063201904, + "learning_rate": 3.818442976156277e-05, + "loss": 3.6332, + "step": 195 + }, + { + "epoch": 0.05573558771779119, + "grad_norm": 3.669994831085205, + "learning_rate": 3.8172938810686586e-05, + "loss": 3.395, + "step": 196 + }, + { + "epoch": 0.056019952961249314, + "grad_norm": 3.2430341243743896, + "learning_rate": 3.8161447859810404e-05, + "loss": 3.4718, + "step": 197 + }, + { + "epoch": 0.05630431820470743, + "grad_norm": 2.657214403152466, + "learning_rate": 3.8149956908934215e-05, + "loss": 3.0473, + "step": 198 + }, + { + "epoch": 0.05658868344816555, + "grad_norm": 2.9551098346710205, + "learning_rate": 3.813846595805803e-05, + "loss": 2.7037, + "step": 199 + }, + { + "epoch": 0.056873048691623664, + "grad_norm": 2.8400704860687256, + "learning_rate": 3.8126975007181844e-05, + "loss": 2.6874, + "step": 200 + }, + { + "epoch": 0.05715741393508179, + "grad_norm": 3.2741611003875732, + "learning_rate": 3.811548405630566e-05, + "loss": 4.1424, + "step": 201 + }, + { + "epoch": 0.0574417791785399, + "grad_norm": 3.464838981628418, + "learning_rate": 3.810399310542947e-05, + "loss": 3.7012, + "step": 202 + }, + { + "epoch": 0.05772614442199802, + "grad_norm": 2.831378698348999, + "learning_rate": 3.809250215455329e-05, + "loss": 3.4572, + "step": 203 + }, + { + "epoch": 0.058010509665456136, + "grad_norm": 2.67508864402771, + "learning_rate": 3.808101120367711e-05, + "loss": 3.5624, + "step": 204 + }, + { + "epoch": 0.05829487490891426, + "grad_norm": 2.527092218399048, + "learning_rate": 3.806952025280093e-05, + "loss": 3.2684, + "step": 205 + }, + { + "epoch": 0.058579240152372376, + "grad_norm": 3.604074478149414, + "learning_rate": 3.805802930192474e-05, + "loss": 3.1068, + "step": 206 + }, + { + "epoch": 0.05886360539583049, + "grad_norm": 3.566804885864258, + "learning_rate": 3.8046538351048556e-05, + "loss": 2.9472, + "step": 207 + }, + { + "epoch": 0.059147970639288616, + "grad_norm": 3.518846273422241, + "learning_rate": 3.803504740017237e-05, + "loss": 2.7991, + "step": 208 + }, + { + "epoch": 0.05943233588274673, + "grad_norm": 5.438695430755615, + "learning_rate": 3.8023556449296185e-05, + "loss": 4.3441, + "step": 209 + }, + { + "epoch": 0.05971670112620485, + "grad_norm": 4.187833786010742, + "learning_rate": 3.8012065498419996e-05, + "loss": 3.7446, + "step": 210 + }, + { + "epoch": 0.060001066369662966, + "grad_norm": 2.9121296405792236, + "learning_rate": 3.8000574547543814e-05, + "loss": 3.4443, + "step": 211 + }, + { + "epoch": 0.06028543161312109, + "grad_norm": 3.0015816688537598, + "learning_rate": 3.798908359666763e-05, + "loss": 3.2636, + "step": 212 + }, + { + "epoch": 0.060569796856579206, + "grad_norm": 4.640799045562744, + "learning_rate": 3.797759264579144e-05, + "loss": 3.5289, + "step": 213 + }, + { + "epoch": 0.06085416210003732, + "grad_norm": 4.131928443908691, + "learning_rate": 3.796610169491526e-05, + "loss": 3.0966, + "step": 214 + }, + { + "epoch": 0.06113852734349544, + "grad_norm": 3.303802251815796, + "learning_rate": 3.795461074403907e-05, + "loss": 2.6751, + "step": 215 + }, + { + "epoch": 0.06142289258695356, + "grad_norm": 3.197770595550537, + "learning_rate": 3.794311979316289e-05, + "loss": 2.8337, + "step": 216 + }, + { + "epoch": 0.06170725783041168, + "grad_norm": 3.2084145545959473, + "learning_rate": 3.79316288422867e-05, + "loss": 3.9611, + "step": 217 + }, + { + "epoch": 0.061991623073869795, + "grad_norm": 2.7938461303710938, + "learning_rate": 3.792013789141052e-05, + "loss": 3.6372, + "step": 218 + }, + { + "epoch": 0.06227598831732791, + "grad_norm": 3.456489324569702, + "learning_rate": 3.790864694053433e-05, + "loss": 3.4849, + "step": 219 + }, + { + "epoch": 0.06256035356078603, + "grad_norm": 4.275204658508301, + "learning_rate": 3.789715598965815e-05, + "loss": 3.2791, + "step": 220 + }, + { + "epoch": 0.06284471880424415, + "grad_norm": 3.960286855697632, + "learning_rate": 3.7885665038781966e-05, + "loss": 3.3154, + "step": 221 + }, + { + "epoch": 0.06312908404770227, + "grad_norm": 3.113219738006592, + "learning_rate": 3.787417408790578e-05, + "loss": 3.0051, + "step": 222 + }, + { + "epoch": 0.06341344929116038, + "grad_norm": 2.509889841079712, + "learning_rate": 3.7862683137029595e-05, + "loss": 2.7279, + "step": 223 + }, + { + "epoch": 0.06369781453461851, + "grad_norm": 2.478565216064453, + "learning_rate": 3.7851192186153406e-05, + "loss": 2.6428, + "step": 224 + }, + { + "epoch": 0.06398217977807663, + "grad_norm": 4.031311988830566, + "learning_rate": 3.7839701235277224e-05, + "loss": 3.8377, + "step": 225 + }, + { + "epoch": 0.06426654502153474, + "grad_norm": 4.466932773590088, + "learning_rate": 3.7828210284401035e-05, + "loss": 3.516, + "step": 226 + }, + { + "epoch": 0.06455091026499286, + "grad_norm": 4.715795040130615, + "learning_rate": 3.781671933352485e-05, + "loss": 3.545, + "step": 227 + }, + { + "epoch": 0.06483527550845097, + "grad_norm": 3.541882276535034, + "learning_rate": 3.7805228382648664e-05, + "loss": 3.2869, + "step": 228 + }, + { + "epoch": 0.0651196407519091, + "grad_norm": 3.242234945297241, + "learning_rate": 3.779373743177248e-05, + "loss": 3.0996, + "step": 229 + }, + { + "epoch": 0.06540400599536722, + "grad_norm": 2.6109092235565186, + "learning_rate": 3.77822464808963e-05, + "loss": 2.9875, + "step": 230 + }, + { + "epoch": 0.06568837123882533, + "grad_norm": 2.639467239379883, + "learning_rate": 3.777075553002011e-05, + "loss": 2.8205, + "step": 231 + }, + { + "epoch": 0.06597273648228345, + "grad_norm": 2.852081775665283, + "learning_rate": 3.775926457914393e-05, + "loss": 2.8712, + "step": 232 + }, + { + "epoch": 0.06625710172574158, + "grad_norm": 4.4990339279174805, + "learning_rate": 3.774777362826774e-05, + "loss": 3.8808, + "step": 233 + }, + { + "epoch": 0.06654146696919969, + "grad_norm": 5.339641571044922, + "learning_rate": 3.773628267739156e-05, + "loss": 3.8167, + "step": 234 + }, + { + "epoch": 0.06682583221265781, + "grad_norm": 3.1748464107513428, + "learning_rate": 3.772479172651537e-05, + "loss": 3.5924, + "step": 235 + }, + { + "epoch": 0.06711019745611593, + "grad_norm": 3.4898841381073, + "learning_rate": 3.7713300775639187e-05, + "loss": 3.2489, + "step": 236 + }, + { + "epoch": 0.06739456269957404, + "grad_norm": 3.2079555988311768, + "learning_rate": 3.7701809824763e-05, + "loss": 3.1664, + "step": 237 + }, + { + "epoch": 0.06767892794303217, + "grad_norm": 2.7962253093719482, + "learning_rate": 3.7690318873886816e-05, + "loss": 2.8651, + "step": 238 + }, + { + "epoch": 0.06796329318649028, + "grad_norm": 3.1500935554504395, + "learning_rate": 3.767882792301063e-05, + "loss": 2.7629, + "step": 239 + }, + { + "epoch": 0.0682476584299484, + "grad_norm": 2.5772945880889893, + "learning_rate": 3.7667336972134444e-05, + "loss": 2.6508, + "step": 240 + }, + { + "epoch": 0.06853202367340652, + "grad_norm": 4.086240291595459, + "learning_rate": 3.765584602125826e-05, + "loss": 3.8843, + "step": 241 + }, + { + "epoch": 0.06881638891686463, + "grad_norm": 2.856458902359009, + "learning_rate": 3.764435507038208e-05, + "loss": 3.4318, + "step": 242 + }, + { + "epoch": 0.06910075416032276, + "grad_norm": 3.179042100906372, + "learning_rate": 3.763286411950589e-05, + "loss": 3.0556, + "step": 243 + }, + { + "epoch": 0.06938511940378088, + "grad_norm": 3.2401700019836426, + "learning_rate": 3.762137316862971e-05, + "loss": 3.3399, + "step": 244 + }, + { + "epoch": 0.06966948464723899, + "grad_norm": 2.8616445064544678, + "learning_rate": 3.760988221775353e-05, + "loss": 3.0333, + "step": 245 + }, + { + "epoch": 0.06995384989069711, + "grad_norm": 2.7594053745269775, + "learning_rate": 3.759839126687734e-05, + "loss": 2.9015, + "step": 246 + }, + { + "epoch": 0.07023821513415522, + "grad_norm": 2.3787291049957275, + "learning_rate": 3.7586900316001156e-05, + "loss": 2.6074, + "step": 247 + }, + { + "epoch": 0.07052258037761334, + "grad_norm": 2.604524612426758, + "learning_rate": 3.757540936512497e-05, + "loss": 2.5631, + "step": 248 + }, + { + "epoch": 0.07080694562107147, + "grad_norm": 5.74310302734375, + "learning_rate": 3.7563918414248785e-05, + "loss": 3.9635, + "step": 249 + }, + { + "epoch": 0.07109131086452958, + "grad_norm": 4.676276206970215, + "learning_rate": 3.7552427463372596e-05, + "loss": 3.4952, + "step": 250 + }, + { + "epoch": 0.0713756761079877, + "grad_norm": 3.3065385818481445, + "learning_rate": 3.7540936512496414e-05, + "loss": 3.2999, + "step": 251 + }, + { + "epoch": 0.07166004135144582, + "grad_norm": 2.9583752155303955, + "learning_rate": 3.7529445561620225e-05, + "loss": 3.2314, + "step": 252 + }, + { + "epoch": 0.07194440659490393, + "grad_norm": 3.083742380142212, + "learning_rate": 3.751795461074404e-05, + "loss": 3.0786, + "step": 253 + }, + { + "epoch": 0.07222877183836206, + "grad_norm": 3.241976261138916, + "learning_rate": 3.750646365986786e-05, + "loss": 2.8807, + "step": 254 + }, + { + "epoch": 0.07251313708182018, + "grad_norm": 2.7543532848358154, + "learning_rate": 3.749497270899167e-05, + "loss": 2.8752, + "step": 255 + }, + { + "epoch": 0.07279750232527829, + "grad_norm": 2.6907711029052734, + "learning_rate": 3.748348175811549e-05, + "loss": 2.7082, + "step": 256 + }, + { + "epoch": 0.07308186756873641, + "grad_norm": 3.050748586654663, + "learning_rate": 3.74719908072393e-05, + "loss": 3.8712, + "step": 257 + }, + { + "epoch": 0.07336623281219452, + "grad_norm": 3.0048818588256836, + "learning_rate": 3.746049985636312e-05, + "loss": 3.4344, + "step": 258 + }, + { + "epoch": 0.07365059805565265, + "grad_norm": 2.973074197769165, + "learning_rate": 3.744900890548693e-05, + "loss": 3.1367, + "step": 259 + }, + { + "epoch": 0.07393496329911077, + "grad_norm": 3.0169665813446045, + "learning_rate": 3.743751795461075e-05, + "loss": 3.4544, + "step": 260 + }, + { + "epoch": 0.07421932854256888, + "grad_norm": 2.8069064617156982, + "learning_rate": 3.742602700373456e-05, + "loss": 3.1697, + "step": 261 + }, + { + "epoch": 0.074503693786027, + "grad_norm": 2.524655818939209, + "learning_rate": 3.741453605285838e-05, + "loss": 2.9794, + "step": 262 + }, + { + "epoch": 0.07478805902948513, + "grad_norm": 2.5447611808776855, + "learning_rate": 3.7403045101982195e-05, + "loss": 2.5633, + "step": 263 + }, + { + "epoch": 0.07507242427294324, + "grad_norm": 3.1086103916168213, + "learning_rate": 3.7391554151106006e-05, + "loss": 2.6274, + "step": 264 + }, + { + "epoch": 0.07535678951640136, + "grad_norm": 3.7446553707122803, + "learning_rate": 3.7380063200229824e-05, + "loss": 4.0418, + "step": 265 + }, + { + "epoch": 0.07564115475985948, + "grad_norm": 3.5950560569763184, + "learning_rate": 3.7368572249353635e-05, + "loss": 3.3891, + "step": 266 + }, + { + "epoch": 0.07592552000331759, + "grad_norm": 3.037245273590088, + "learning_rate": 3.735708129847745e-05, + "loss": 3.389, + "step": 267 + }, + { + "epoch": 0.07620988524677572, + "grad_norm": 2.6339914798736572, + "learning_rate": 3.7345590347601264e-05, + "loss": 3.0648, + "step": 268 + }, + { + "epoch": 0.07649425049023383, + "grad_norm": 2.379084587097168, + "learning_rate": 3.733409939672508e-05, + "loss": 2.9169, + "step": 269 + }, + { + "epoch": 0.07677861573369195, + "grad_norm": 2.9946842193603516, + "learning_rate": 3.732260844584889e-05, + "loss": 2.7304, + "step": 270 + }, + { + "epoch": 0.07706298097715007, + "grad_norm": 3.1141395568847656, + "learning_rate": 3.731111749497271e-05, + "loss": 2.7589, + "step": 271 + }, + { + "epoch": 0.07734734622060818, + "grad_norm": 2.931774854660034, + "learning_rate": 3.729962654409653e-05, + "loss": 2.61, + "step": 272 + }, + { + "epoch": 0.0776317114640663, + "grad_norm": 4.752538681030273, + "learning_rate": 3.728813559322034e-05, + "loss": 3.82, + "step": 273 + }, + { + "epoch": 0.07791607670752443, + "grad_norm": 3.4177722930908203, + "learning_rate": 3.727664464234416e-05, + "loss": 3.3525, + "step": 274 + }, + { + "epoch": 0.07820044195098254, + "grad_norm": 2.749371290206909, + "learning_rate": 3.726515369146797e-05, + "loss": 3.1633, + "step": 275 + }, + { + "epoch": 0.07848480719444066, + "grad_norm": 3.0202836990356445, + "learning_rate": 3.725366274059179e-05, + "loss": 3.2241, + "step": 276 + }, + { + "epoch": 0.07876917243789877, + "grad_norm": 3.951889753341675, + "learning_rate": 3.72421717897156e-05, + "loss": 3.1181, + "step": 277 + }, + { + "epoch": 0.0790535376813569, + "grad_norm": 3.7062735557556152, + "learning_rate": 3.723068083883942e-05, + "loss": 2.924, + "step": 278 + }, + { + "epoch": 0.07933790292481502, + "grad_norm": 2.9112234115600586, + "learning_rate": 3.7219189887963234e-05, + "loss": 2.6434, + "step": 279 + }, + { + "epoch": 0.07962226816827313, + "grad_norm": 2.5431931018829346, + "learning_rate": 3.720769893708705e-05, + "loss": 2.5729, + "step": 280 + }, + { + "epoch": 0.07990663341173125, + "grad_norm": 3.293665647506714, + "learning_rate": 3.719620798621086e-05, + "loss": 3.6881, + "step": 281 + }, + { + "epoch": 0.08019099865518937, + "grad_norm": 3.0450286865234375, + "learning_rate": 3.718471703533468e-05, + "loss": 3.6725, + "step": 282 + }, + { + "epoch": 0.08047536389864748, + "grad_norm": 2.8695693016052246, + "learning_rate": 3.717322608445849e-05, + "loss": 3.4356, + "step": 283 + }, + { + "epoch": 0.08075972914210561, + "grad_norm": 2.7940287590026855, + "learning_rate": 3.716173513358231e-05, + "loss": 3.1586, + "step": 284 + }, + { + "epoch": 0.08104409438556373, + "grad_norm": 3.127352476119995, + "learning_rate": 3.715024418270612e-05, + "loss": 3.0592, + "step": 285 + }, + { + "epoch": 0.08132845962902184, + "grad_norm": 2.77877140045166, + "learning_rate": 3.713875323182994e-05, + "loss": 2.6598, + "step": 286 + }, + { + "epoch": 0.08161282487247996, + "grad_norm": 2.7355434894561768, + "learning_rate": 3.7127262280953756e-05, + "loss": 2.5461, + "step": 287 + }, + { + "epoch": 0.08189719011593807, + "grad_norm": 2.800222873687744, + "learning_rate": 3.711577133007757e-05, + "loss": 2.6196, + "step": 288 + }, + { + "epoch": 0.0821815553593962, + "grad_norm": 3.4335219860076904, + "learning_rate": 3.7104280379201385e-05, + "loss": 3.6921, + "step": 289 + }, + { + "epoch": 0.08246592060285432, + "grad_norm": 3.0871481895446777, + "learning_rate": 3.7092789428325196e-05, + "loss": 3.2295, + "step": 290 + }, + { + "epoch": 0.08275028584631243, + "grad_norm": 3.372642993927002, + "learning_rate": 3.7081298477449014e-05, + "loss": 3.5505, + "step": 291 + }, + { + "epoch": 0.08303465108977055, + "grad_norm": 2.830824613571167, + "learning_rate": 3.7069807526572825e-05, + "loss": 3.1795, + "step": 292 + }, + { + "epoch": 0.08331901633322868, + "grad_norm": 2.850874423980713, + "learning_rate": 3.705831657569664e-05, + "loss": 2.9981, + "step": 293 + }, + { + "epoch": 0.08360338157668679, + "grad_norm": 2.798980712890625, + "learning_rate": 3.7046825624820454e-05, + "loss": 2.7752, + "step": 294 + }, + { + "epoch": 0.08388774682014491, + "grad_norm": 2.3986728191375732, + "learning_rate": 3.703533467394427e-05, + "loss": 2.4156, + "step": 295 + }, + { + "epoch": 0.08417211206360303, + "grad_norm": 2.329164505004883, + "learning_rate": 3.702384372306809e-05, + "loss": 2.5282, + "step": 296 + }, + { + "epoch": 0.08445647730706114, + "grad_norm": 3.428501844406128, + "learning_rate": 3.70123527721919e-05, + "loss": 3.8649, + "step": 297 + }, + { + "epoch": 0.08474084255051927, + "grad_norm": 3.2161340713500977, + "learning_rate": 3.700086182131572e-05, + "loss": 3.4523, + "step": 298 + }, + { + "epoch": 0.08502520779397738, + "grad_norm": 2.801135778427124, + "learning_rate": 3.698937087043953e-05, + "loss": 3.3231, + "step": 299 + }, + { + "epoch": 0.0853095730374355, + "grad_norm": 2.886110544204712, + "learning_rate": 3.697787991956335e-05, + "loss": 2.918, + "step": 300 + }, + { + "epoch": 0.08559393828089362, + "grad_norm": 3.050361156463623, + "learning_rate": 3.696638896868716e-05, + "loss": 2.9013, + "step": 301 + }, + { + "epoch": 0.08587830352435173, + "grad_norm": 2.4915997982025146, + "learning_rate": 3.695489801781098e-05, + "loss": 2.9062, + "step": 302 + }, + { + "epoch": 0.08616266876780986, + "grad_norm": 2.6376094818115234, + "learning_rate": 3.694340706693479e-05, + "loss": 2.8681, + "step": 303 + }, + { + "epoch": 0.08644703401126798, + "grad_norm": 2.602095603942871, + "learning_rate": 3.6931916116058606e-05, + "loss": 2.6519, + "step": 304 + }, + { + "epoch": 0.08673139925472609, + "grad_norm": 3.274116277694702, + "learning_rate": 3.6920425165182424e-05, + "loss": 3.812, + "step": 305 + }, + { + "epoch": 0.08701576449818421, + "grad_norm": 2.873872756958008, + "learning_rate": 3.6908934214306235e-05, + "loss": 3.3963, + "step": 306 + }, + { + "epoch": 0.08730012974164232, + "grad_norm": 2.432584047317505, + "learning_rate": 3.689744326343005e-05, + "loss": 3.2271, + "step": 307 + }, + { + "epoch": 0.08758449498510044, + "grad_norm": 2.374149799346924, + "learning_rate": 3.6885952312553864e-05, + "loss": 3.4667, + "step": 308 + }, + { + "epoch": 0.08786886022855857, + "grad_norm": 2.5668742656707764, + "learning_rate": 3.687446136167768e-05, + "loss": 2.9856, + "step": 309 + }, + { + "epoch": 0.08815322547201668, + "grad_norm": 2.4312264919281006, + "learning_rate": 3.686297041080149e-05, + "loss": 2.6518, + "step": 310 + }, + { + "epoch": 0.0884375907154748, + "grad_norm": 2.417278528213501, + "learning_rate": 3.685147945992531e-05, + "loss": 2.5552, + "step": 311 + }, + { + "epoch": 0.08872195595893292, + "grad_norm": 2.399350166320801, + "learning_rate": 3.683998850904912e-05, + "loss": 2.5939, + "step": 312 + }, + { + "epoch": 0.08900632120239103, + "grad_norm": 4.476291656494141, + "learning_rate": 3.682849755817294e-05, + "loss": 3.7203, + "step": 313 + }, + { + "epoch": 0.08929068644584916, + "grad_norm": 2.964276075363159, + "learning_rate": 3.681700660729676e-05, + "loss": 3.3704, + "step": 314 + }, + { + "epoch": 0.08957505168930728, + "grad_norm": 2.778978109359741, + "learning_rate": 3.6805515656420576e-05, + "loss": 3.3189, + "step": 315 + }, + { + "epoch": 0.08985941693276539, + "grad_norm": 3.131481647491455, + "learning_rate": 3.679402470554439e-05, + "loss": 3.2355, + "step": 316 + }, + { + "epoch": 0.09014378217622351, + "grad_norm": 3.234072208404541, + "learning_rate": 3.6782533754668205e-05, + "loss": 2.8635, + "step": 317 + }, + { + "epoch": 0.09042814741968162, + "grad_norm": 3.1541082859039307, + "learning_rate": 3.6771042803792016e-05, + "loss": 2.7849, + "step": 318 + }, + { + "epoch": 0.09071251266313975, + "grad_norm": 2.8236806392669678, + "learning_rate": 3.6759551852915834e-05, + "loss": 2.7041, + "step": 319 + }, + { + "epoch": 0.09099687790659787, + "grad_norm": 2.4180235862731934, + "learning_rate": 3.674806090203965e-05, + "loss": 2.4594, + "step": 320 + }, + { + "epoch": 0.09128124315005598, + "grad_norm": 4.0737080574035645, + "learning_rate": 3.673656995116346e-05, + "loss": 3.5109, + "step": 321 + }, + { + "epoch": 0.0915656083935141, + "grad_norm": 4.438284873962402, + "learning_rate": 3.672507900028728e-05, + "loss": 3.2919, + "step": 322 + }, + { + "epoch": 0.09184997363697223, + "grad_norm": 4.133691310882568, + "learning_rate": 3.671358804941109e-05, + "loss": 3.0562, + "step": 323 + }, + { + "epoch": 0.09213433888043034, + "grad_norm": 4.211462020874023, + "learning_rate": 3.670209709853491e-05, + "loss": 3.2076, + "step": 324 + }, + { + "epoch": 0.09241870412388846, + "grad_norm": 3.6107709407806396, + "learning_rate": 3.669060614765872e-05, + "loss": 3.003, + "step": 325 + }, + { + "epoch": 0.09270306936734658, + "grad_norm": 2.793802261352539, + "learning_rate": 3.667911519678254e-05, + "loss": 2.8852, + "step": 326 + }, + { + "epoch": 0.09298743461080469, + "grad_norm": 2.2556843757629395, + "learning_rate": 3.666762424590635e-05, + "loss": 2.5742, + "step": 327 + }, + { + "epoch": 0.09327179985426282, + "grad_norm": 2.088768243789673, + "learning_rate": 3.665613329503017e-05, + "loss": 2.5716, + "step": 328 + }, + { + "epoch": 0.09355616509772093, + "grad_norm": 4.9838056564331055, + "learning_rate": 3.6644642344153986e-05, + "loss": 3.7839, + "step": 329 + }, + { + "epoch": 0.09384053034117905, + "grad_norm": 3.877955436706543, + "learning_rate": 3.66331513932778e-05, + "loss": 3.2162, + "step": 330 + }, + { + "epoch": 0.09412489558463717, + "grad_norm": 3.442711353302002, + "learning_rate": 3.6621660442401615e-05, + "loss": 3.2394, + "step": 331 + }, + { + "epoch": 0.09440926082809528, + "grad_norm": 3.3739242553710938, + "learning_rate": 3.6610169491525426e-05, + "loss": 3.1204, + "step": 332 + }, + { + "epoch": 0.0946936260715534, + "grad_norm": 3.4325954914093018, + "learning_rate": 3.6598678540649244e-05, + "loss": 2.7708, + "step": 333 + }, + { + "epoch": 0.09497799131501153, + "grad_norm": 2.4555609226226807, + "learning_rate": 3.6587187589773055e-05, + "loss": 2.5941, + "step": 334 + }, + { + "epoch": 0.09526235655846964, + "grad_norm": 2.201622724533081, + "learning_rate": 3.657569663889687e-05, + "loss": 2.5229, + "step": 335 + }, + { + "epoch": 0.09554672180192776, + "grad_norm": 2.3770065307617188, + "learning_rate": 3.6564205688020684e-05, + "loss": 2.4801, + "step": 336 + }, + { + "epoch": 0.09583108704538587, + "grad_norm": 3.16601300239563, + "learning_rate": 3.65527147371445e-05, + "loss": 3.9019, + "step": 337 + }, + { + "epoch": 0.096115452288844, + "grad_norm": 2.69059419631958, + "learning_rate": 3.654122378626832e-05, + "loss": 3.2074, + "step": 338 + }, + { + "epoch": 0.09639981753230212, + "grad_norm": 2.622850179672241, + "learning_rate": 3.652973283539213e-05, + "loss": 3.1565, + "step": 339 + }, + { + "epoch": 0.09668418277576023, + "grad_norm": 2.384047746658325, + "learning_rate": 3.651824188451595e-05, + "loss": 2.9815, + "step": 340 + }, + { + "epoch": 0.09696854801921835, + "grad_norm": 2.255505084991455, + "learning_rate": 3.650675093363976e-05, + "loss": 3.0731, + "step": 341 + }, + { + "epoch": 0.09725291326267647, + "grad_norm": 2.502235174179077, + "learning_rate": 3.649525998276358e-05, + "loss": 2.795, + "step": 342 + }, + { + "epoch": 0.09753727850613458, + "grad_norm": 2.6988747119903564, + "learning_rate": 3.648376903188739e-05, + "loss": 2.6843, + "step": 343 + }, + { + "epoch": 0.09782164374959271, + "grad_norm": 2.856539011001587, + "learning_rate": 3.6472278081011206e-05, + "loss": 2.4656, + "step": 344 + }, + { + "epoch": 0.09810600899305083, + "grad_norm": 3.2413103580474854, + "learning_rate": 3.646078713013502e-05, + "loss": 3.8422, + "step": 345 + }, + { + "epoch": 0.09839037423650894, + "grad_norm": 2.434211254119873, + "learning_rate": 3.6449296179258835e-05, + "loss": 3.0491, + "step": 346 + }, + { + "epoch": 0.09867473947996706, + "grad_norm": 2.2801740169525146, + "learning_rate": 3.643780522838265e-05, + "loss": 3.0232, + "step": 347 + }, + { + "epoch": 0.09895910472342517, + "grad_norm": 2.3498308658599854, + "learning_rate": 3.6426314277506464e-05, + "loss": 2.9397, + "step": 348 + }, + { + "epoch": 0.0992434699668833, + "grad_norm": 2.8017773628234863, + "learning_rate": 3.641482332663028e-05, + "loss": 2.8337, + "step": 349 + }, + { + "epoch": 0.09952783521034142, + "grad_norm": 2.93746018409729, + "learning_rate": 3.6403332375754093e-05, + "loss": 2.7192, + "step": 350 + }, + { + "epoch": 0.09981220045379953, + "grad_norm": 3.0276105403900146, + "learning_rate": 3.639184142487791e-05, + "loss": 2.3286, + "step": 351 + }, + { + "epoch": 0.10009656569725765, + "grad_norm": 2.725423812866211, + "learning_rate": 3.638035047400173e-05, + "loss": 2.298, + "step": 352 + }, + { + "epoch": 0.10038093094071578, + "grad_norm": 2.9816744327545166, + "learning_rate": 3.636885952312554e-05, + "loss": 3.6773, + "step": 353 + }, + { + "epoch": 0.10066529618417389, + "grad_norm": 2.4625086784362793, + "learning_rate": 3.635736857224936e-05, + "loss": 3.2648, + "step": 354 + }, + { + "epoch": 0.10094966142763201, + "grad_norm": 2.325002670288086, + "learning_rate": 3.6345877621373176e-05, + "loss": 2.9206, + "step": 355 + }, + { + "epoch": 0.10123402667109013, + "grad_norm": 2.643895149230957, + "learning_rate": 3.633438667049699e-05, + "loss": 2.6485, + "step": 356 + }, + { + "epoch": 0.10151839191454824, + "grad_norm": 2.9441092014312744, + "learning_rate": 3.6322895719620805e-05, + "loss": 2.8929, + "step": 357 + }, + { + "epoch": 0.10180275715800637, + "grad_norm": 2.6018552780151367, + "learning_rate": 3.6311404768744616e-05, + "loss": 2.4757, + "step": 358 + }, + { + "epoch": 0.10208712240146448, + "grad_norm": 2.31733775138855, + "learning_rate": 3.6299913817868434e-05, + "loss": 2.6008, + "step": 359 + }, + { + "epoch": 0.1023714876449226, + "grad_norm": 2.3203630447387695, + "learning_rate": 3.6288422866992245e-05, + "loss": 2.4726, + "step": 360 + }, + { + "epoch": 0.10265585288838072, + "grad_norm": 3.1516456604003906, + "learning_rate": 3.627693191611606e-05, + "loss": 3.8334, + "step": 361 + }, + { + "epoch": 0.10294021813183883, + "grad_norm": 2.976469039916992, + "learning_rate": 3.626544096523988e-05, + "loss": 3.2071, + "step": 362 + }, + { + "epoch": 0.10322458337529695, + "grad_norm": 2.7915210723876953, + "learning_rate": 3.625395001436369e-05, + "loss": 3.0369, + "step": 363 + }, + { + "epoch": 0.10350894861875508, + "grad_norm": 3.034956932067871, + "learning_rate": 3.624245906348751e-05, + "loss": 3.1236, + "step": 364 + }, + { + "epoch": 0.10379331386221319, + "grad_norm": 2.835688591003418, + "learning_rate": 3.623096811261132e-05, + "loss": 2.8524, + "step": 365 + }, + { + "epoch": 0.10407767910567131, + "grad_norm": 2.3832132816314697, + "learning_rate": 3.621947716173514e-05, + "loss": 2.4496, + "step": 366 + }, + { + "epoch": 0.10436204434912942, + "grad_norm": 2.339113473892212, + "learning_rate": 3.620798621085895e-05, + "loss": 2.4954, + "step": 367 + }, + { + "epoch": 0.10464640959258754, + "grad_norm": 2.407395362854004, + "learning_rate": 3.619649525998277e-05, + "loss": 2.568, + "step": 368 + }, + { + "epoch": 0.10493077483604567, + "grad_norm": 4.512346267700195, + "learning_rate": 3.618500430910658e-05, + "loss": 3.5987, + "step": 369 + }, + { + "epoch": 0.10521514007950378, + "grad_norm": 3.320215940475464, + "learning_rate": 3.61735133582304e-05, + "loss": 3.0956, + "step": 370 + }, + { + "epoch": 0.1054995053229619, + "grad_norm": 3.0713627338409424, + "learning_rate": 3.6162022407354215e-05, + "loss": 3.1864, + "step": 371 + }, + { + "epoch": 0.10578387056642002, + "grad_norm": 2.845768690109253, + "learning_rate": 3.6150531456478026e-05, + "loss": 3.0029, + "step": 372 + }, + { + "epoch": 0.10606823580987813, + "grad_norm": 2.5338187217712402, + "learning_rate": 3.6139040505601844e-05, + "loss": 2.6617, + "step": 373 + }, + { + "epoch": 0.10635260105333626, + "grad_norm": 2.5313353538513184, + "learning_rate": 3.6127549554725655e-05, + "loss": 2.5754, + "step": 374 + }, + { + "epoch": 0.10663696629679438, + "grad_norm": 2.485309362411499, + "learning_rate": 3.611605860384947e-05, + "loss": 2.4186, + "step": 375 + }, + { + "epoch": 0.10692133154025249, + "grad_norm": 2.544818878173828, + "learning_rate": 3.6104567652973284e-05, + "loss": 2.5501, + "step": 376 + }, + { + "epoch": 0.10720569678371061, + "grad_norm": 2.9218149185180664, + "learning_rate": 3.60930767020971e-05, + "loss": 3.7049, + "step": 377 + }, + { + "epoch": 0.10749006202716872, + "grad_norm": 2.9214773178100586, + "learning_rate": 3.608158575122091e-05, + "loss": 3.0877, + "step": 378 + }, + { + "epoch": 0.10777442727062685, + "grad_norm": 2.4883925914764404, + "learning_rate": 3.607009480034473e-05, + "loss": 3.1767, + "step": 379 + }, + { + "epoch": 0.10805879251408497, + "grad_norm": 2.176980972290039, + "learning_rate": 3.605860384946855e-05, + "loss": 2.9752, + "step": 380 + }, + { + "epoch": 0.10834315775754308, + "grad_norm": 2.442439317703247, + "learning_rate": 3.604711289859236e-05, + "loss": 3.0929, + "step": 381 + }, + { + "epoch": 0.1086275230010012, + "grad_norm": 2.6657745838165283, + "learning_rate": 3.603562194771618e-05, + "loss": 2.6189, + "step": 382 + }, + { + "epoch": 0.10891188824445933, + "grad_norm": 2.3688509464263916, + "learning_rate": 3.602413099683999e-05, + "loss": 2.5404, + "step": 383 + }, + { + "epoch": 0.10919625348791744, + "grad_norm": 2.235816717147827, + "learning_rate": 3.601264004596381e-05, + "loss": 2.4137, + "step": 384 + }, + { + "epoch": 0.10948061873137556, + "grad_norm": 3.8358049392700195, + "learning_rate": 3.600114909508762e-05, + "loss": 3.5268, + "step": 385 + }, + { + "epoch": 0.10976498397483368, + "grad_norm": 2.5932207107543945, + "learning_rate": 3.5989658144211436e-05, + "loss": 3.1315, + "step": 386 + }, + { + "epoch": 0.11004934921829179, + "grad_norm": 2.4010274410247803, + "learning_rate": 3.597816719333525e-05, + "loss": 3.072, + "step": 387 + }, + { + "epoch": 0.11033371446174992, + "grad_norm": 2.247072219848633, + "learning_rate": 3.5966676242459065e-05, + "loss": 2.5931, + "step": 388 + }, + { + "epoch": 0.11061807970520803, + "grad_norm": 3.017591953277588, + "learning_rate": 3.595518529158288e-05, + "loss": 2.8756, + "step": 389 + }, + { + "epoch": 0.11090244494866615, + "grad_norm": 2.5938799381256104, + "learning_rate": 3.59436943407067e-05, + "loss": 2.869, + "step": 390 + }, + { + "epoch": 0.11118681019212427, + "grad_norm": 2.039341926574707, + "learning_rate": 3.593220338983051e-05, + "loss": 2.4231, + "step": 391 + }, + { + "epoch": 0.11147117543558238, + "grad_norm": 2.2805583477020264, + "learning_rate": 3.592071243895433e-05, + "loss": 2.377, + "step": 392 + }, + { + "epoch": 0.1117555406790405, + "grad_norm": 2.4253814220428467, + "learning_rate": 3.590922148807814e-05, + "loss": 3.549, + "step": 393 + }, + { + "epoch": 0.11203990592249863, + "grad_norm": 2.2920830249786377, + "learning_rate": 3.589773053720196e-05, + "loss": 3.2533, + "step": 394 + }, + { + "epoch": 0.11232427116595674, + "grad_norm": 2.4961812496185303, + "learning_rate": 3.588623958632577e-05, + "loss": 2.8119, + "step": 395 + }, + { + "epoch": 0.11260863640941486, + "grad_norm": 2.3366754055023193, + "learning_rate": 3.587474863544959e-05, + "loss": 3.0371, + "step": 396 + }, + { + "epoch": 0.11289300165287297, + "grad_norm": 2.443265914916992, + "learning_rate": 3.5863257684573405e-05, + "loss": 2.731, + "step": 397 + }, + { + "epoch": 0.1131773668963311, + "grad_norm": 2.1697330474853516, + "learning_rate": 3.5851766733697216e-05, + "loss": 2.548, + "step": 398 + }, + { + "epoch": 0.11346173213978922, + "grad_norm": 2.0168380737304688, + "learning_rate": 3.5840275782821034e-05, + "loss": 2.4326, + "step": 399 + }, + { + "epoch": 0.11374609738324733, + "grad_norm": 2.136674642562866, + "learning_rate": 3.5828784831944845e-05, + "loss": 2.3445, + "step": 400 + }, + { + "epoch": 0.11403046262670545, + "grad_norm": 2.96239972114563, + "learning_rate": 3.581729388106866e-05, + "loss": 3.7026, + "step": 401 + }, + { + "epoch": 0.11431482787016357, + "grad_norm": 2.7048444747924805, + "learning_rate": 3.5805802930192474e-05, + "loss": 3.0997, + "step": 402 + }, + { + "epoch": 0.11459919311362168, + "grad_norm": 2.455439805984497, + "learning_rate": 3.579431197931629e-05, + "loss": 2.9647, + "step": 403 + }, + { + "epoch": 0.1148835583570798, + "grad_norm": 2.2661333084106445, + "learning_rate": 3.57828210284401e-05, + "loss": 2.7126, + "step": 404 + }, + { + "epoch": 0.11516792360053793, + "grad_norm": 2.03705096244812, + "learning_rate": 3.577133007756392e-05, + "loss": 2.755, + "step": 405 + }, + { + "epoch": 0.11545228884399604, + "grad_norm": 2.140838384628296, + "learning_rate": 3.575983912668774e-05, + "loss": 2.5794, + "step": 406 + }, + { + "epoch": 0.11573665408745416, + "grad_norm": 2.352357864379883, + "learning_rate": 3.574834817581155e-05, + "loss": 2.64, + "step": 407 + }, + { + "epoch": 0.11602101933091227, + "grad_norm": 2.3507237434387207, + "learning_rate": 3.573685722493537e-05, + "loss": 2.2045, + "step": 408 + }, + { + "epoch": 0.1163053845743704, + "grad_norm": 4.098755359649658, + "learning_rate": 3.572536627405918e-05, + "loss": 3.437, + "step": 409 + }, + { + "epoch": 0.11658974981782852, + "grad_norm": 2.8039045333862305, + "learning_rate": 3.5713875323183e-05, + "loss": 3.2256, + "step": 410 + }, + { + "epoch": 0.11687411506128663, + "grad_norm": 2.3482019901275635, + "learning_rate": 3.570238437230681e-05, + "loss": 2.9395, + "step": 411 + }, + { + "epoch": 0.11715848030474475, + "grad_norm": 2.0725553035736084, + "learning_rate": 3.5690893421430626e-05, + "loss": 2.8758, + "step": 412 + }, + { + "epoch": 0.11744284554820288, + "grad_norm": 2.3215219974517822, + "learning_rate": 3.567940247055444e-05, + "loss": 2.7822, + "step": 413 + }, + { + "epoch": 0.11772721079166099, + "grad_norm": 2.770766258239746, + "learning_rate": 3.5667911519678255e-05, + "loss": 2.5678, + "step": 414 + }, + { + "epoch": 0.11801157603511911, + "grad_norm": 2.5529537200927734, + "learning_rate": 3.565642056880207e-05, + "loss": 2.429, + "step": 415 + }, + { + "epoch": 0.11829594127857723, + "grad_norm": 2.2458651065826416, + "learning_rate": 3.5644929617925884e-05, + "loss": 2.1978, + "step": 416 + }, + { + "epoch": 0.11858030652203534, + "grad_norm": 2.636418581008911, + "learning_rate": 3.56334386670497e-05, + "loss": 3.3172, + "step": 417 + }, + { + "epoch": 0.11886467176549347, + "grad_norm": 2.1322054862976074, + "learning_rate": 3.562194771617351e-05, + "loss": 3.2257, + "step": 418 + }, + { + "epoch": 0.11914903700895157, + "grad_norm": 2.089116334915161, + "learning_rate": 3.561045676529733e-05, + "loss": 3.1797, + "step": 419 + }, + { + "epoch": 0.1194334022524097, + "grad_norm": 2.1360151767730713, + "learning_rate": 3.559896581442114e-05, + "loss": 2.8716, + "step": 420 + }, + { + "epoch": 0.11971776749586782, + "grad_norm": 2.4757323265075684, + "learning_rate": 3.558747486354496e-05, + "loss": 2.9043, + "step": 421 + }, + { + "epoch": 0.12000213273932593, + "grad_norm": 2.3627967834472656, + "learning_rate": 3.557598391266877e-05, + "loss": 2.4626, + "step": 422 + }, + { + "epoch": 0.12028649798278405, + "grad_norm": 2.1213998794555664, + "learning_rate": 3.556449296179259e-05, + "loss": 2.5062, + "step": 423 + }, + { + "epoch": 0.12057086322624218, + "grad_norm": 2.1772518157958984, + "learning_rate": 3.555300201091641e-05, + "loss": 2.3489, + "step": 424 + }, + { + "epoch": 0.12085522846970029, + "grad_norm": 3.624419689178467, + "learning_rate": 3.554151106004022e-05, + "loss": 3.6556, + "step": 425 + }, + { + "epoch": 0.12113959371315841, + "grad_norm": 2.4470715522766113, + "learning_rate": 3.5530020109164036e-05, + "loss": 3.1925, + "step": 426 + }, + { + "epoch": 0.12142395895661652, + "grad_norm": 2.2214372158050537, + "learning_rate": 3.5518529158287854e-05, + "loss": 2.9998, + "step": 427 + }, + { + "epoch": 0.12170832420007464, + "grad_norm": 2.347656011581421, + "learning_rate": 3.5507038207411665e-05, + "loss": 2.8243, + "step": 428 + }, + { + "epoch": 0.12199268944353277, + "grad_norm": 2.0817830562591553, + "learning_rate": 3.549554725653548e-05, + "loss": 2.8815, + "step": 429 + }, + { + "epoch": 0.12227705468699088, + "grad_norm": 1.9819862842559814, + "learning_rate": 3.54840563056593e-05, + "loss": 2.751, + "step": 430 + }, + { + "epoch": 0.122561419930449, + "grad_norm": 2.2460458278656006, + "learning_rate": 3.547256535478311e-05, + "loss": 2.6482, + "step": 431 + }, + { + "epoch": 0.12284578517390712, + "grad_norm": 2.3552284240722656, + "learning_rate": 3.546107440390693e-05, + "loss": 2.5644, + "step": 432 + }, + { + "epoch": 0.12313015041736523, + "grad_norm": 2.6936285495758057, + "learning_rate": 3.544958345303074e-05, + "loss": 3.5913, + "step": 433 + }, + { + "epoch": 0.12341451566082336, + "grad_norm": 2.459479570388794, + "learning_rate": 3.543809250215456e-05, + "loss": 2.9851, + "step": 434 + }, + { + "epoch": 0.12369888090428148, + "grad_norm": 2.364466667175293, + "learning_rate": 3.542660155127837e-05, + "loss": 2.9548, + "step": 435 + }, + { + "epoch": 0.12398324614773959, + "grad_norm": 2.216580629348755, + "learning_rate": 3.541511060040219e-05, + "loss": 2.9091, + "step": 436 + }, + { + "epoch": 0.12426761139119771, + "grad_norm": 2.2435667514801025, + "learning_rate": 3.5403619649526e-05, + "loss": 2.8564, + "step": 437 + }, + { + "epoch": 0.12455197663465582, + "grad_norm": 2.2413361072540283, + "learning_rate": 3.539212869864982e-05, + "loss": 2.586, + "step": 438 + }, + { + "epoch": 0.12483634187811395, + "grad_norm": 2.0291380882263184, + "learning_rate": 3.5380637747773635e-05, + "loss": 2.4712, + "step": 439 + }, + { + "epoch": 0.12512070712157206, + "grad_norm": 2.161757707595825, + "learning_rate": 3.5369146796897446e-05, + "loss": 2.3247, + "step": 440 + }, + { + "epoch": 0.12540507236503018, + "grad_norm": 2.918247938156128, + "learning_rate": 3.5357655846021264e-05, + "loss": 3.5365, + "step": 441 + }, + { + "epoch": 0.1256894376084883, + "grad_norm": 2.381178855895996, + "learning_rate": 3.5346164895145075e-05, + "loss": 3.15, + "step": 442 + }, + { + "epoch": 0.12597380285194643, + "grad_norm": 2.349804162979126, + "learning_rate": 3.533467394426889e-05, + "loss": 3.0955, + "step": 443 + }, + { + "epoch": 0.12625816809540455, + "grad_norm": 2.0576179027557373, + "learning_rate": 3.5323182993392704e-05, + "loss": 2.8447, + "step": 444 + }, + { + "epoch": 0.12654253333886264, + "grad_norm": 2.2097201347351074, + "learning_rate": 3.531169204251652e-05, + "loss": 2.6086, + "step": 445 + }, + { + "epoch": 0.12682689858232077, + "grad_norm": 2.347313642501831, + "learning_rate": 3.530020109164033e-05, + "loss": 2.5803, + "step": 446 + }, + { + "epoch": 0.1271112638257789, + "grad_norm": 2.059239625930786, + "learning_rate": 3.528871014076415e-05, + "loss": 2.5247, + "step": 447 + }, + { + "epoch": 0.12739562906923702, + "grad_norm": 2.106543779373169, + "learning_rate": 3.527721918988797e-05, + "loss": 2.1429, + "step": 448 + }, + { + "epoch": 0.12767999431269514, + "grad_norm": 4.565354824066162, + "learning_rate": 3.526572823901178e-05, + "loss": 3.4247, + "step": 449 + }, + { + "epoch": 0.12796435955615326, + "grad_norm": 3.1165966987609863, + "learning_rate": 3.52542372881356e-05, + "loss": 3.3897, + "step": 450 + }, + { + "epoch": 0.12824872479961136, + "grad_norm": 2.431389808654785, + "learning_rate": 3.524274633725941e-05, + "loss": 2.8458, + "step": 451 + }, + { + "epoch": 0.12853309004306948, + "grad_norm": 2.352386713027954, + "learning_rate": 3.5231255386383226e-05, + "loss": 2.8419, + "step": 452 + }, + { + "epoch": 0.1288174552865276, + "grad_norm": 2.195875406265259, + "learning_rate": 3.521976443550704e-05, + "loss": 2.7576, + "step": 453 + }, + { + "epoch": 0.12910182052998573, + "grad_norm": 2.407545566558838, + "learning_rate": 3.5208273484630855e-05, + "loss": 2.4436, + "step": 454 + }, + { + "epoch": 0.12938618577344385, + "grad_norm": 2.3552427291870117, + "learning_rate": 3.5196782533754666e-05, + "loss": 2.3348, + "step": 455 + }, + { + "epoch": 0.12967055101690195, + "grad_norm": 2.576530933380127, + "learning_rate": 3.5185291582878484e-05, + "loss": 2.3373, + "step": 456 + }, + { + "epoch": 0.12995491626036007, + "grad_norm": 2.8271255493164062, + "learning_rate": 3.51738006320023e-05, + "loss": 3.4477, + "step": 457 + }, + { + "epoch": 0.1302392815038182, + "grad_norm": 2.433382987976074, + "learning_rate": 3.516230968112611e-05, + "loss": 3.0918, + "step": 458 + }, + { + "epoch": 0.13052364674727632, + "grad_norm": 2.547974109649658, + "learning_rate": 3.515081873024993e-05, + "loss": 3.0768, + "step": 459 + }, + { + "epoch": 0.13080801199073444, + "grad_norm": 2.379182815551758, + "learning_rate": 3.513932777937374e-05, + "loss": 2.886, + "step": 460 + }, + { + "epoch": 0.13109237723419256, + "grad_norm": 2.6086502075195312, + "learning_rate": 3.512783682849756e-05, + "loss": 2.7411, + "step": 461 + }, + { + "epoch": 0.13137674247765066, + "grad_norm": 2.6192541122436523, + "learning_rate": 3.511634587762137e-05, + "loss": 2.6317, + "step": 462 + }, + { + "epoch": 0.13166110772110878, + "grad_norm": 2.5311689376831055, + "learning_rate": 3.5104854926745196e-05, + "loss": 2.4297, + "step": 463 + }, + { + "epoch": 0.1319454729645669, + "grad_norm": 1.9520777463912964, + "learning_rate": 3.509336397586901e-05, + "loss": 2.3321, + "step": 464 + }, + { + "epoch": 0.13222983820802503, + "grad_norm": 3.1615819931030273, + "learning_rate": 3.5081873024992825e-05, + "loss": 3.519, + "step": 465 + }, + { + "epoch": 0.13251420345148315, + "grad_norm": 2.8588624000549316, + "learning_rate": 3.5070382074116636e-05, + "loss": 3.0476, + "step": 466 + }, + { + "epoch": 0.13279856869494125, + "grad_norm": 2.8191323280334473, + "learning_rate": 3.5058891123240454e-05, + "loss": 2.92, + "step": 467 + }, + { + "epoch": 0.13308293393839937, + "grad_norm": 2.7357499599456787, + "learning_rate": 3.5047400172364265e-05, + "loss": 2.8323, + "step": 468 + }, + { + "epoch": 0.1333672991818575, + "grad_norm": 2.820420026779175, + "learning_rate": 3.503590922148808e-05, + "loss": 2.6254, + "step": 469 + }, + { + "epoch": 0.13365166442531562, + "grad_norm": 2.1745669841766357, + "learning_rate": 3.5024418270611894e-05, + "loss": 2.5728, + "step": 470 + }, + { + "epoch": 0.13393602966877374, + "grad_norm": 1.995119571685791, + "learning_rate": 3.501292731973571e-05, + "loss": 2.3181, + "step": 471 + }, + { + "epoch": 0.13422039491223187, + "grad_norm": 2.149378776550293, + "learning_rate": 3.500143636885953e-05, + "loss": 2.1623, + "step": 472 + }, + { + "epoch": 0.13450476015568996, + "grad_norm": 3.25516414642334, + "learning_rate": 3.498994541798334e-05, + "loss": 3.3434, + "step": 473 + }, + { + "epoch": 0.13478912539914809, + "grad_norm": 2.725456714630127, + "learning_rate": 3.497845446710716e-05, + "loss": 3.1657, + "step": 474 + }, + { + "epoch": 0.1350734906426062, + "grad_norm": 2.5315914154052734, + "learning_rate": 3.496696351623097e-05, + "loss": 3.0051, + "step": 475 + }, + { + "epoch": 0.13535785588606433, + "grad_norm": 2.51139760017395, + "learning_rate": 3.495547256535479e-05, + "loss": 2.8382, + "step": 476 + }, + { + "epoch": 0.13564222112952246, + "grad_norm": 2.4551467895507812, + "learning_rate": 3.49439816144786e-05, + "loss": 2.6955, + "step": 477 + }, + { + "epoch": 0.13592658637298055, + "grad_norm": 2.2206430435180664, + "learning_rate": 3.493249066360242e-05, + "loss": 2.5027, + "step": 478 + }, + { + "epoch": 0.13621095161643867, + "grad_norm": 2.4770166873931885, + "learning_rate": 3.492099971272623e-05, + "loss": 2.2923, + "step": 479 + }, + { + "epoch": 0.1364953168598968, + "grad_norm": 2.7834012508392334, + "learning_rate": 3.4909508761850046e-05, + "loss": 2.3686, + "step": 480 + }, + { + "epoch": 0.13677968210335492, + "grad_norm": 3.358705520629883, + "learning_rate": 3.4898017810973864e-05, + "loss": 3.3837, + "step": 481 + }, + { + "epoch": 0.13706404734681304, + "grad_norm": 2.3323278427124023, + "learning_rate": 3.4886526860097675e-05, + "loss": 2.8571, + "step": 482 + }, + { + "epoch": 0.13734841259027114, + "grad_norm": 2.330890417098999, + "learning_rate": 3.487503590922149e-05, + "loss": 2.7978, + "step": 483 + }, + { + "epoch": 0.13763277783372926, + "grad_norm": 2.012991428375244, + "learning_rate": 3.4863544958345304e-05, + "loss": 2.8301, + "step": 484 + }, + { + "epoch": 0.1379171430771874, + "grad_norm": 2.383934736251831, + "learning_rate": 3.485205400746912e-05, + "loss": 2.6773, + "step": 485 + }, + { + "epoch": 0.1382015083206455, + "grad_norm": 2.576904773712158, + "learning_rate": 3.484056305659293e-05, + "loss": 2.5425, + "step": 486 + }, + { + "epoch": 0.13848587356410363, + "grad_norm": 2.482586145401001, + "learning_rate": 3.482907210571675e-05, + "loss": 2.3212, + "step": 487 + }, + { + "epoch": 0.13877023880756176, + "grad_norm": 2.269639253616333, + "learning_rate": 3.481758115484056e-05, + "loss": 2.244, + "step": 488 + }, + { + "epoch": 0.13905460405101985, + "grad_norm": 2.263761281967163, + "learning_rate": 3.480609020396438e-05, + "loss": 3.6028, + "step": 489 + }, + { + "epoch": 0.13933896929447798, + "grad_norm": 2.271108388900757, + "learning_rate": 3.47945992530882e-05, + "loss": 3.1406, + "step": 490 + }, + { + "epoch": 0.1396233345379361, + "grad_norm": 2.075087547302246, + "learning_rate": 3.478310830221201e-05, + "loss": 2.6854, + "step": 491 + }, + { + "epoch": 0.13990769978139422, + "grad_norm": 2.018319845199585, + "learning_rate": 3.4771617351335827e-05, + "loss": 2.8959, + "step": 492 + }, + { + "epoch": 0.14019206502485235, + "grad_norm": 2.369109630584717, + "learning_rate": 3.476012640045964e-05, + "loss": 2.7621, + "step": 493 + }, + { + "epoch": 0.14047643026831044, + "grad_norm": 2.4382967948913574, + "learning_rate": 3.4748635449583456e-05, + "loss": 2.4256, + "step": 494 + }, + { + "epoch": 0.14076079551176857, + "grad_norm": 2.1346540451049805, + "learning_rate": 3.473714449870727e-05, + "loss": 2.4076, + "step": 495 + }, + { + "epoch": 0.1410451607552267, + "grad_norm": 2.194206953048706, + "learning_rate": 3.4725653547831085e-05, + "loss": 2.1643, + "step": 496 + }, + { + "epoch": 0.1413295259986848, + "grad_norm": 3.0039777755737305, + "learning_rate": 3.4714162596954896e-05, + "loss": 3.4954, + "step": 497 + }, + { + "epoch": 0.14161389124214294, + "grad_norm": 2.4674746990203857, + "learning_rate": 3.4702671646078714e-05, + "loss": 3.1891, + "step": 498 + }, + { + "epoch": 0.14189825648560106, + "grad_norm": 2.319490432739258, + "learning_rate": 3.469118069520253e-05, + "loss": 2.7782, + "step": 499 + }, + { + "epoch": 0.14218262172905916, + "grad_norm": 2.3277428150177, + "learning_rate": 3.467968974432635e-05, + "loss": 2.6973, + "step": 500 + }, + { + "epoch": 0.14246698697251728, + "grad_norm": 2.28387188911438, + "learning_rate": 3.466819879345016e-05, + "loss": 2.8054, + "step": 501 + }, + { + "epoch": 0.1427513522159754, + "grad_norm": 2.186497688293457, + "learning_rate": 3.465670784257398e-05, + "loss": 2.7287, + "step": 502 + }, + { + "epoch": 0.14303571745943353, + "grad_norm": 2.455045700073242, + "learning_rate": 3.464521689169779e-05, + "loss": 2.4847, + "step": 503 + }, + { + "epoch": 0.14332008270289165, + "grad_norm": 2.292936086654663, + "learning_rate": 3.463372594082161e-05, + "loss": 2.3205, + "step": 504 + }, + { + "epoch": 0.14360444794634974, + "grad_norm": 2.924741744995117, + "learning_rate": 3.4622234989945425e-05, + "loss": 3.244, + "step": 505 + }, + { + "epoch": 0.14388881318980787, + "grad_norm": 2.4970264434814453, + "learning_rate": 3.4610744039069236e-05, + "loss": 3.0944, + "step": 506 + }, + { + "epoch": 0.144173178433266, + "grad_norm": 2.490755319595337, + "learning_rate": 3.4599253088193054e-05, + "loss": 3.0244, + "step": 507 + }, + { + "epoch": 0.14445754367672412, + "grad_norm": 2.4618477821350098, + "learning_rate": 3.4587762137316865e-05, + "loss": 2.6005, + "step": 508 + }, + { + "epoch": 0.14474190892018224, + "grad_norm": 2.505988121032715, + "learning_rate": 3.457627118644068e-05, + "loss": 2.5832, + "step": 509 + }, + { + "epoch": 0.14502627416364036, + "grad_norm": 2.5353870391845703, + "learning_rate": 3.4564780235564494e-05, + "loss": 2.3534, + "step": 510 + }, + { + "epoch": 0.14531063940709846, + "grad_norm": 2.2250514030456543, + "learning_rate": 3.455328928468831e-05, + "loss": 2.2805, + "step": 511 + }, + { + "epoch": 0.14559500465055658, + "grad_norm": 2.35164475440979, + "learning_rate": 3.454179833381212e-05, + "loss": 2.214, + "step": 512 + }, + { + "epoch": 0.1458793698940147, + "grad_norm": 2.7433321475982666, + "learning_rate": 3.453030738293594e-05, + "loss": 3.4267, + "step": 513 + }, + { + "epoch": 0.14616373513747283, + "grad_norm": 2.7333459854125977, + "learning_rate": 3.451881643205976e-05, + "loss": 3.1427, + "step": 514 + }, + { + "epoch": 0.14644810038093095, + "grad_norm": 2.3346779346466064, + "learning_rate": 3.450732548118357e-05, + "loss": 2.7976, + "step": 515 + }, + { + "epoch": 0.14673246562438905, + "grad_norm": 2.188016176223755, + "learning_rate": 3.449583453030739e-05, + "loss": 2.7736, + "step": 516 + }, + { + "epoch": 0.14701683086784717, + "grad_norm": 2.3532261848449707, + "learning_rate": 3.44843435794312e-05, + "loss": 2.6379, + "step": 517 + }, + { + "epoch": 0.1473011961113053, + "grad_norm": 2.1835358142852783, + "learning_rate": 3.447285262855502e-05, + "loss": 2.2907, + "step": 518 + }, + { + "epoch": 0.14758556135476342, + "grad_norm": 1.9752788543701172, + "learning_rate": 3.446136167767883e-05, + "loss": 2.1688, + "step": 519 + }, + { + "epoch": 0.14786992659822154, + "grad_norm": 2.0870256423950195, + "learning_rate": 3.4449870726802646e-05, + "loss": 2.1826, + "step": 520 + }, + { + "epoch": 0.14815429184167966, + "grad_norm": 3.138823986053467, + "learning_rate": 3.443837977592646e-05, + "loss": 3.3426, + "step": 521 + }, + { + "epoch": 0.14843865708513776, + "grad_norm": 2.220222234725952, + "learning_rate": 3.4426888825050275e-05, + "loss": 2.9125, + "step": 522 + }, + { + "epoch": 0.14872302232859588, + "grad_norm": 2.0182180404663086, + "learning_rate": 3.441539787417409e-05, + "loss": 2.54, + "step": 523 + }, + { + "epoch": 0.149007387572054, + "grad_norm": 2.3546602725982666, + "learning_rate": 3.4403906923297904e-05, + "loss": 2.7855, + "step": 524 + }, + { + "epoch": 0.14929175281551213, + "grad_norm": 2.4429872035980225, + "learning_rate": 3.439241597242172e-05, + "loss": 2.6199, + "step": 525 + }, + { + "epoch": 0.14957611805897025, + "grad_norm": 2.0123140811920166, + "learning_rate": 3.438092502154553e-05, + "loss": 2.4555, + "step": 526 + }, + { + "epoch": 0.14986048330242835, + "grad_norm": 2.059921979904175, + "learning_rate": 3.436943407066935e-05, + "loss": 2.3095, + "step": 527 + }, + { + "epoch": 0.15014484854588647, + "grad_norm": 2.259676456451416, + "learning_rate": 3.435794311979316e-05, + "loss": 2.32, + "step": 528 + }, + { + "epoch": 0.1504292137893446, + "grad_norm": 2.4949564933776855, + "learning_rate": 3.434645216891698e-05, + "loss": 3.1353, + "step": 529 + }, + { + "epoch": 0.15071357903280272, + "grad_norm": 2.08988881111145, + "learning_rate": 3.433496121804079e-05, + "loss": 3.0615, + "step": 530 + }, + { + "epoch": 0.15099794427626084, + "grad_norm": 2.143328905105591, + "learning_rate": 3.432347026716461e-05, + "loss": 2.7796, + "step": 531 + }, + { + "epoch": 0.15128230951971897, + "grad_norm": 2.148768663406372, + "learning_rate": 3.431197931628843e-05, + "loss": 2.6519, + "step": 532 + }, + { + "epoch": 0.15156667476317706, + "grad_norm": 2.143799066543579, + "learning_rate": 3.430048836541224e-05, + "loss": 2.7152, + "step": 533 + }, + { + "epoch": 0.15185104000663519, + "grad_norm": 2.04683780670166, + "learning_rate": 3.4288997414536056e-05, + "loss": 2.3695, + "step": 534 + }, + { + "epoch": 0.1521354052500933, + "grad_norm": 2.0776238441467285, + "learning_rate": 3.427750646365987e-05, + "loss": 2.2031, + "step": 535 + }, + { + "epoch": 0.15241977049355143, + "grad_norm": 2.1074013710021973, + "learning_rate": 3.4266015512783685e-05, + "loss": 2.3794, + "step": 536 + }, + { + "epoch": 0.15270413573700956, + "grad_norm": 3.231952667236328, + "learning_rate": 3.42545245619075e-05, + "loss": 3.2961, + "step": 537 + }, + { + "epoch": 0.15298850098046765, + "grad_norm": 2.548482656478882, + "learning_rate": 3.424303361103132e-05, + "loss": 2.9752, + "step": 538 + }, + { + "epoch": 0.15327286622392577, + "grad_norm": 2.271571397781372, + "learning_rate": 3.423154266015513e-05, + "loss": 2.9183, + "step": 539 + }, + { + "epoch": 0.1535572314673839, + "grad_norm": 2.0318918228149414, + "learning_rate": 3.422005170927895e-05, + "loss": 2.6897, + "step": 540 + }, + { + "epoch": 0.15384159671084202, + "grad_norm": 2.172313690185547, + "learning_rate": 3.420856075840276e-05, + "loss": 2.6676, + "step": 541 + }, + { + "epoch": 0.15412596195430014, + "grad_norm": 1.9842435121536255, + "learning_rate": 3.419706980752658e-05, + "loss": 2.3229, + "step": 542 + }, + { + "epoch": 0.15441032719775824, + "grad_norm": 1.9959310293197632, + "learning_rate": 3.418557885665039e-05, + "loss": 2.1866, + "step": 543 + }, + { + "epoch": 0.15469469244121636, + "grad_norm": 2.2947375774383545, + "learning_rate": 3.417408790577421e-05, + "loss": 2.3691, + "step": 544 + }, + { + "epoch": 0.1549790576846745, + "grad_norm": 3.7838504314422607, + "learning_rate": 3.416259695489802e-05, + "loss": 3.3556, + "step": 545 + }, + { + "epoch": 0.1552634229281326, + "grad_norm": 2.482161045074463, + "learning_rate": 3.4151106004021837e-05, + "loss": 3.0544, + "step": 546 + }, + { + "epoch": 0.15554778817159073, + "grad_norm": 2.1925017833709717, + "learning_rate": 3.4139615053145654e-05, + "loss": 2.8413, + "step": 547 + }, + { + "epoch": 0.15583215341504886, + "grad_norm": 1.944459080696106, + "learning_rate": 3.4128124102269466e-05, + "loss": 2.7084, + "step": 548 + }, + { + "epoch": 0.15611651865850695, + "grad_norm": 1.926611065864563, + "learning_rate": 3.4116633151393283e-05, + "loss": 2.5604, + "step": 549 + }, + { + "epoch": 0.15640088390196508, + "grad_norm": 1.8744678497314453, + "learning_rate": 3.4105142200517095e-05, + "loss": 2.2331, + "step": 550 + }, + { + "epoch": 0.1566852491454232, + "grad_norm": 2.163674831390381, + "learning_rate": 3.409365124964091e-05, + "loss": 2.2382, + "step": 551 + }, + { + "epoch": 0.15696961438888132, + "grad_norm": 2.251648426055908, + "learning_rate": 3.4082160298764724e-05, + "loss": 2.0852, + "step": 552 + }, + { + "epoch": 0.15725397963233945, + "grad_norm": 2.419774055480957, + "learning_rate": 3.407066934788854e-05, + "loss": 3.393, + "step": 553 + }, + { + "epoch": 0.15753834487579754, + "grad_norm": 1.9137029647827148, + "learning_rate": 3.405917839701235e-05, + "loss": 3.2236, + "step": 554 + }, + { + "epoch": 0.15782271011925567, + "grad_norm": 1.9087616205215454, + "learning_rate": 3.404768744613617e-05, + "loss": 2.869, + "step": 555 + }, + { + "epoch": 0.1581070753627138, + "grad_norm": 1.9507688283920288, + "learning_rate": 3.403619649525999e-05, + "loss": 2.833, + "step": 556 + }, + { + "epoch": 0.1583914406061719, + "grad_norm": 2.030203342437744, + "learning_rate": 3.40247055443838e-05, + "loss": 2.6474, + "step": 557 + }, + { + "epoch": 0.15867580584963004, + "grad_norm": 2.2038793563842773, + "learning_rate": 3.401321459350762e-05, + "loss": 2.3888, + "step": 558 + }, + { + "epoch": 0.15896017109308816, + "grad_norm": 2.0459814071655273, + "learning_rate": 3.400172364263143e-05, + "loss": 2.2976, + "step": 559 + }, + { + "epoch": 0.15924453633654626, + "grad_norm": 2.089040517807007, + "learning_rate": 3.3990232691755246e-05, + "loss": 2.3395, + "step": 560 + }, + { + "epoch": 0.15952890158000438, + "grad_norm": 2.3878250122070312, + "learning_rate": 3.397874174087906e-05, + "loss": 3.3155, + "step": 561 + }, + { + "epoch": 0.1598132668234625, + "grad_norm": 2.1346888542175293, + "learning_rate": 3.3967250790002875e-05, + "loss": 3.1624, + "step": 562 + }, + { + "epoch": 0.16009763206692063, + "grad_norm": 2.1869750022888184, + "learning_rate": 3.3955759839126686e-05, + "loss": 2.6892, + "step": 563 + }, + { + "epoch": 0.16038199731037875, + "grad_norm": 2.045581102371216, + "learning_rate": 3.3944268888250504e-05, + "loss": 2.4891, + "step": 564 + }, + { + "epoch": 0.16066636255383684, + "grad_norm": 2.3556089401245117, + "learning_rate": 3.393277793737432e-05, + "loss": 2.7332, + "step": 565 + }, + { + "epoch": 0.16095072779729497, + "grad_norm": 2.4558780193328857, + "learning_rate": 3.392128698649813e-05, + "loss": 2.4167, + "step": 566 + }, + { + "epoch": 0.1612350930407531, + "grad_norm": 2.398458957672119, + "learning_rate": 3.390979603562195e-05, + "loss": 2.336, + "step": 567 + }, + { + "epoch": 0.16151945828421121, + "grad_norm": 2.256385564804077, + "learning_rate": 3.389830508474576e-05, + "loss": 2.3393, + "step": 568 + }, + { + "epoch": 0.16180382352766934, + "grad_norm": 2.4298534393310547, + "learning_rate": 3.388681413386958e-05, + "loss": 3.3351, + "step": 569 + }, + { + "epoch": 0.16208818877112746, + "grad_norm": 2.2216269969940186, + "learning_rate": 3.387532318299339e-05, + "loss": 3.1062, + "step": 570 + }, + { + "epoch": 0.16237255401458556, + "grad_norm": 2.279859781265259, + "learning_rate": 3.386383223211721e-05, + "loss": 2.7871, + "step": 571 + }, + { + "epoch": 0.16265691925804368, + "grad_norm": 2.4930570125579834, + "learning_rate": 3.385234128124102e-05, + "loss": 2.6784, + "step": 572 + }, + { + "epoch": 0.1629412845015018, + "grad_norm": 3.0474438667297363, + "learning_rate": 3.384085033036484e-05, + "loss": 2.7209, + "step": 573 + }, + { + "epoch": 0.16322564974495993, + "grad_norm": 2.276129961013794, + "learning_rate": 3.3829359379488656e-05, + "loss": 2.3874, + "step": 574 + }, + { + "epoch": 0.16351001498841805, + "grad_norm": 2.0157735347747803, + "learning_rate": 3.3817868428612474e-05, + "loss": 2.3545, + "step": 575 + }, + { + "epoch": 0.16379438023187615, + "grad_norm": 2.2889609336853027, + "learning_rate": 3.3806377477736285e-05, + "loss": 2.1654, + "step": 576 + }, + { + "epoch": 0.16407874547533427, + "grad_norm": 2.966151714324951, + "learning_rate": 3.37948865268601e-05, + "loss": 3.3948, + "step": 577 + }, + { + "epoch": 0.1643631107187924, + "grad_norm": 2.7245430946350098, + "learning_rate": 3.3783395575983914e-05, + "loss": 3.0326, + "step": 578 + }, + { + "epoch": 0.16464747596225052, + "grad_norm": 2.4125325679779053, + "learning_rate": 3.377190462510773e-05, + "loss": 2.7461, + "step": 579 + }, + { + "epoch": 0.16493184120570864, + "grad_norm": 2.4555015563964844, + "learning_rate": 3.376041367423155e-05, + "loss": 2.6876, + "step": 580 + }, + { + "epoch": 0.16521620644916676, + "grad_norm": 2.3780357837677, + "learning_rate": 3.374892272335536e-05, + "loss": 2.549, + "step": 581 + }, + { + "epoch": 0.16550057169262486, + "grad_norm": 1.9920421838760376, + "learning_rate": 3.373743177247918e-05, + "loss": 2.3843, + "step": 582 + }, + { + "epoch": 0.16578493693608298, + "grad_norm": 1.880908489227295, + "learning_rate": 3.372594082160299e-05, + "loss": 2.1699, + "step": 583 + }, + { + "epoch": 0.1660693021795411, + "grad_norm": 2.317730188369751, + "learning_rate": 3.371444987072681e-05, + "loss": 2.3338, + "step": 584 + }, + { + "epoch": 0.16635366742299923, + "grad_norm": 3.1581149101257324, + "learning_rate": 3.370295891985062e-05, + "loss": 3.3314, + "step": 585 + }, + { + "epoch": 0.16663803266645735, + "grad_norm": 2.4569809436798096, + "learning_rate": 3.369146796897444e-05, + "loss": 2.8549, + "step": 586 + }, + { + "epoch": 0.16692239790991545, + "grad_norm": 2.0873701572418213, + "learning_rate": 3.367997701809825e-05, + "loss": 2.6875, + "step": 587 + }, + { + "epoch": 0.16720676315337357, + "grad_norm": 2.1216537952423096, + "learning_rate": 3.3668486067222066e-05, + "loss": 2.7253, + "step": 588 + }, + { + "epoch": 0.1674911283968317, + "grad_norm": 1.9596827030181885, + "learning_rate": 3.3656995116345884e-05, + "loss": 2.4127, + "step": 589 + }, + { + "epoch": 0.16777549364028982, + "grad_norm": 2.244654417037964, + "learning_rate": 3.3645504165469695e-05, + "loss": 2.4795, + "step": 590 + }, + { + "epoch": 0.16805985888374794, + "grad_norm": 2.387822389602661, + "learning_rate": 3.363401321459351e-05, + "loss": 2.413, + "step": 591 + }, + { + "epoch": 0.16834422412720607, + "grad_norm": 2.2715797424316406, + "learning_rate": 3.3622522263717324e-05, + "loss": 2.3535, + "step": 592 + }, + { + "epoch": 0.16862858937066416, + "grad_norm": 3.2900567054748535, + "learning_rate": 3.361103131284114e-05, + "loss": 3.2949, + "step": 593 + }, + { + "epoch": 0.16891295461412228, + "grad_norm": 2.340487241744995, + "learning_rate": 3.359954036196495e-05, + "loss": 3.0242, + "step": 594 + }, + { + "epoch": 0.1691973198575804, + "grad_norm": 1.9996750354766846, + "learning_rate": 3.358804941108877e-05, + "loss": 2.7541, + "step": 595 + }, + { + "epoch": 0.16948168510103853, + "grad_norm": 1.8965973854064941, + "learning_rate": 3.357655846021258e-05, + "loss": 2.6425, + "step": 596 + }, + { + "epoch": 0.16976605034449666, + "grad_norm": 2.097822427749634, + "learning_rate": 3.35650675093364e-05, + "loss": 2.6672, + "step": 597 + }, + { + "epoch": 0.17005041558795475, + "grad_norm": 2.171520471572876, + "learning_rate": 3.355357655846022e-05, + "loss": 2.2089, + "step": 598 + }, + { + "epoch": 0.17033478083141287, + "grad_norm": 2.1892077922821045, + "learning_rate": 3.354208560758403e-05, + "loss": 2.2999, + "step": 599 + }, + { + "epoch": 0.170619146074871, + "grad_norm": 1.8929511308670044, + "learning_rate": 3.3530594656707847e-05, + "loss": 2.0265, + "step": 600 + }, + { + "epoch": 0.17090351131832912, + "grad_norm": 2.2485220432281494, + "learning_rate": 3.351910370583166e-05, + "loss": 3.4067, + "step": 601 + }, + { + "epoch": 0.17118787656178724, + "grad_norm": 2.0794124603271484, + "learning_rate": 3.3507612754955476e-05, + "loss": 3.0856, + "step": 602 + }, + { + "epoch": 0.17147224180524534, + "grad_norm": 1.9198224544525146, + "learning_rate": 3.349612180407929e-05, + "loss": 2.7309, + "step": 603 + }, + { + "epoch": 0.17175660704870346, + "grad_norm": 1.9311753511428833, + "learning_rate": 3.3484630853203105e-05, + "loss": 2.5093, + "step": 604 + }, + { + "epoch": 0.1720409722921616, + "grad_norm": 2.2155637741088867, + "learning_rate": 3.3473139902326916e-05, + "loss": 2.4131, + "step": 605 + }, + { + "epoch": 0.1723253375356197, + "grad_norm": 2.4310479164123535, + "learning_rate": 3.3461648951450733e-05, + "loss": 2.3546, + "step": 606 + }, + { + "epoch": 0.17260970277907783, + "grad_norm": 2.170459747314453, + "learning_rate": 3.345015800057455e-05, + "loss": 2.1606, + "step": 607 + }, + { + "epoch": 0.17289406802253596, + "grad_norm": 2.082836627960205, + "learning_rate": 3.343866704969836e-05, + "loss": 2.1429, + "step": 608 + }, + { + "epoch": 0.17317843326599405, + "grad_norm": 2.6757774353027344, + "learning_rate": 3.342717609882218e-05, + "loss": 3.3129, + "step": 609 + }, + { + "epoch": 0.17346279850945218, + "grad_norm": 1.9752081632614136, + "learning_rate": 3.341568514794599e-05, + "loss": 2.8585, + "step": 610 + }, + { + "epoch": 0.1737471637529103, + "grad_norm": 2.269960880279541, + "learning_rate": 3.340419419706981e-05, + "loss": 2.5409, + "step": 611 + }, + { + "epoch": 0.17403152899636842, + "grad_norm": 2.2884328365325928, + "learning_rate": 3.339270324619363e-05, + "loss": 2.6172, + "step": 612 + }, + { + "epoch": 0.17431589423982655, + "grad_norm": 2.2851369380950928, + "learning_rate": 3.3381212295317445e-05, + "loss": 2.6341, + "step": 613 + }, + { + "epoch": 0.17460025948328464, + "grad_norm": 1.8490828275680542, + "learning_rate": 3.3369721344441256e-05, + "loss": 2.2735, + "step": 614 + }, + { + "epoch": 0.17488462472674277, + "grad_norm": 1.9733513593673706, + "learning_rate": 3.3358230393565074e-05, + "loss": 2.2607, + "step": 615 + }, + { + "epoch": 0.1751689899702009, + "grad_norm": 1.9823896884918213, + "learning_rate": 3.3346739442688885e-05, + "loss": 2.0564, + "step": 616 + }, + { + "epoch": 0.175453355213659, + "grad_norm": 2.479914665222168, + "learning_rate": 3.33352484918127e-05, + "loss": 3.352, + "step": 617 + }, + { + "epoch": 0.17573772045711714, + "grad_norm": 2.3320770263671875, + "learning_rate": 3.3323757540936514e-05, + "loss": 2.8721, + "step": 618 + }, + { + "epoch": 0.17602208570057526, + "grad_norm": 2.451782703399658, + "learning_rate": 3.331226659006033e-05, + "loss": 2.8098, + "step": 619 + }, + { + "epoch": 0.17630645094403335, + "grad_norm": 2.2985215187072754, + "learning_rate": 3.330077563918414e-05, + "loss": 2.7563, + "step": 620 + }, + { + "epoch": 0.17659081618749148, + "grad_norm": 2.119804859161377, + "learning_rate": 3.328928468830796e-05, + "loss": 2.5972, + "step": 621 + }, + { + "epoch": 0.1768751814309496, + "grad_norm": 2.0667002201080322, + "learning_rate": 3.327779373743178e-05, + "loss": 2.2254, + "step": 622 + }, + { + "epoch": 0.17715954667440773, + "grad_norm": 1.9303321838378906, + "learning_rate": 3.326630278655559e-05, + "loss": 2.4747, + "step": 623 + }, + { + "epoch": 0.17744391191786585, + "grad_norm": 2.147676706314087, + "learning_rate": 3.325481183567941e-05, + "loss": 1.9882, + "step": 624 + }, + { + "epoch": 0.17772827716132394, + "grad_norm": 2.619396448135376, + "learning_rate": 3.324332088480322e-05, + "loss": 3.2464, + "step": 625 + }, + { + "epoch": 0.17801264240478207, + "grad_norm": 2.6688385009765625, + "learning_rate": 3.323182993392704e-05, + "loss": 3.0559, + "step": 626 + }, + { + "epoch": 0.1782970076482402, + "grad_norm": 1.9910519123077393, + "learning_rate": 3.322033898305085e-05, + "loss": 2.7364, + "step": 627 + }, + { + "epoch": 0.17858137289169831, + "grad_norm": 2.037667989730835, + "learning_rate": 3.3208848032174666e-05, + "loss": 2.3946, + "step": 628 + }, + { + "epoch": 0.17886573813515644, + "grad_norm": 2.0019378662109375, + "learning_rate": 3.319735708129848e-05, + "loss": 2.4888, + "step": 629 + }, + { + "epoch": 0.17915010337861456, + "grad_norm": 1.999667763710022, + "learning_rate": 3.3185866130422295e-05, + "loss": 2.5945, + "step": 630 + }, + { + "epoch": 0.17943446862207266, + "grad_norm": 2.3381664752960205, + "learning_rate": 3.317437517954611e-05, + "loss": 2.0536, + "step": 631 + }, + { + "epoch": 0.17971883386553078, + "grad_norm": 2.1102583408355713, + "learning_rate": 3.3162884228669924e-05, + "loss": 2.1843, + "step": 632 + }, + { + "epoch": 0.1800031991089889, + "grad_norm": 4.009433269500732, + "learning_rate": 3.315139327779374e-05, + "loss": 3.122, + "step": 633 + }, + { + "epoch": 0.18028756435244703, + "grad_norm": 2.4775569438934326, + "learning_rate": 3.313990232691755e-05, + "loss": 3.1036, + "step": 634 + }, + { + "epoch": 0.18057192959590515, + "grad_norm": 2.1339893341064453, + "learning_rate": 3.312841137604137e-05, + "loss": 2.8175, + "step": 635 + }, + { + "epoch": 0.18085629483936325, + "grad_norm": 1.9820502996444702, + "learning_rate": 3.311692042516518e-05, + "loss": 2.8805, + "step": 636 + }, + { + "epoch": 0.18114066008282137, + "grad_norm": 1.9298404455184937, + "learning_rate": 3.3105429474289e-05, + "loss": 2.5823, + "step": 637 + }, + { + "epoch": 0.1814250253262795, + "grad_norm": 1.915541172027588, + "learning_rate": 3.309393852341281e-05, + "loss": 2.4167, + "step": 638 + }, + { + "epoch": 0.18170939056973762, + "grad_norm": 2.0158815383911133, + "learning_rate": 3.308244757253663e-05, + "loss": 2.2251, + "step": 639 + }, + { + "epoch": 0.18199375581319574, + "grad_norm": 1.840295672416687, + "learning_rate": 3.307095662166045e-05, + "loss": 2.2033, + "step": 640 + }, + { + "epoch": 0.18227812105665386, + "grad_norm": 2.2416555881500244, + "learning_rate": 3.305946567078426e-05, + "loss": 3.302, + "step": 641 + }, + { + "epoch": 0.18256248630011196, + "grad_norm": 2.0106818675994873, + "learning_rate": 3.3047974719908076e-05, + "loss": 2.8823, + "step": 642 + }, + { + "epoch": 0.18284685154357008, + "grad_norm": 2.0697238445281982, + "learning_rate": 3.303648376903189e-05, + "loss": 2.8577, + "step": 643 + }, + { + "epoch": 0.1831312167870282, + "grad_norm": 1.8144038915634155, + "learning_rate": 3.3024992818155705e-05, + "loss": 2.5007, + "step": 644 + }, + { + "epoch": 0.18341558203048633, + "grad_norm": 1.9172033071517944, + "learning_rate": 3.3013501867279516e-05, + "loss": 2.5931, + "step": 645 + }, + { + "epoch": 0.18369994727394445, + "grad_norm": 2.0230233669281006, + "learning_rate": 3.3002010916403334e-05, + "loss": 2.3733, + "step": 646 + }, + { + "epoch": 0.18398431251740255, + "grad_norm": 1.8354929685592651, + "learning_rate": 3.2990519965527145e-05, + "loss": 2.1317, + "step": 647 + }, + { + "epoch": 0.18426867776086067, + "grad_norm": 2.032435894012451, + "learning_rate": 3.297902901465096e-05, + "loss": 1.9827, + "step": 648 + }, + { + "epoch": 0.1845530430043188, + "grad_norm": 2.6801607608795166, + "learning_rate": 3.296753806377478e-05, + "loss": 3.3537, + "step": 649 + }, + { + "epoch": 0.18483740824777692, + "grad_norm": 2.1917881965637207, + "learning_rate": 3.29560471128986e-05, + "loss": 3.0921, + "step": 650 + }, + { + "epoch": 0.18512177349123504, + "grad_norm": 1.906441330909729, + "learning_rate": 3.294455616202241e-05, + "loss": 2.8275, + "step": 651 + }, + { + "epoch": 0.18540613873469317, + "grad_norm": 1.8117749691009521, + "learning_rate": 3.293306521114623e-05, + "loss": 2.5992, + "step": 652 + }, + { + "epoch": 0.18569050397815126, + "grad_norm": 1.962047815322876, + "learning_rate": 3.292157426027004e-05, + "loss": 2.4936, + "step": 653 + }, + { + "epoch": 0.18597486922160938, + "grad_norm": 1.9997988939285278, + "learning_rate": 3.2910083309393856e-05, + "loss": 2.3512, + "step": 654 + }, + { + "epoch": 0.1862592344650675, + "grad_norm": 2.2575020790100098, + "learning_rate": 3.2898592358517674e-05, + "loss": 2.3052, + "step": 655 + }, + { + "epoch": 0.18654359970852563, + "grad_norm": 2.3353512287139893, + "learning_rate": 3.2887101407641485e-05, + "loss": 2.1795, + "step": 656 + }, + { + "epoch": 0.18682796495198375, + "grad_norm": 2.2830419540405273, + "learning_rate": 3.28756104567653e-05, + "loss": 3.2798, + "step": 657 + }, + { + "epoch": 0.18711233019544185, + "grad_norm": 2.1097829341888428, + "learning_rate": 3.2864119505889114e-05, + "loss": 3.1461, + "step": 658 + }, + { + "epoch": 0.18739669543889997, + "grad_norm": 2.0392520427703857, + "learning_rate": 3.285262855501293e-05, + "loss": 2.865, + "step": 659 + }, + { + "epoch": 0.1876810606823581, + "grad_norm": 1.9128637313842773, + "learning_rate": 3.2841137604136743e-05, + "loss": 2.4607, + "step": 660 + }, + { + "epoch": 0.18796542592581622, + "grad_norm": 2.3386354446411133, + "learning_rate": 3.282964665326056e-05, + "loss": 2.5321, + "step": 661 + }, + { + "epoch": 0.18824979116927434, + "grad_norm": 2.1804091930389404, + "learning_rate": 3.281815570238437e-05, + "loss": 2.372, + "step": 662 + }, + { + "epoch": 0.18853415641273247, + "grad_norm": 2.139617919921875, + "learning_rate": 3.280666475150819e-05, + "loss": 2.2559, + "step": 663 + }, + { + "epoch": 0.18881852165619056, + "grad_norm": 2.0755178928375244, + "learning_rate": 3.279517380063201e-05, + "loss": 2.3333, + "step": 664 + }, + { + "epoch": 0.1891028868996487, + "grad_norm": 2.1971609592437744, + "learning_rate": 3.278368284975582e-05, + "loss": 3.1885, + "step": 665 + }, + { + "epoch": 0.1893872521431068, + "grad_norm": 2.080216884613037, + "learning_rate": 3.277219189887964e-05, + "loss": 2.8508, + "step": 666 + }, + { + "epoch": 0.18967161738656493, + "grad_norm": 2.1171205043792725, + "learning_rate": 3.276070094800345e-05, + "loss": 2.5981, + "step": 667 + }, + { + "epoch": 0.18995598263002306, + "grad_norm": 2.2514328956604004, + "learning_rate": 3.2749209997127266e-05, + "loss": 2.8185, + "step": 668 + }, + { + "epoch": 0.19024034787348115, + "grad_norm": 2.328221082687378, + "learning_rate": 3.273771904625108e-05, + "loss": 2.6257, + "step": 669 + }, + { + "epoch": 0.19052471311693928, + "grad_norm": 2.0298240184783936, + "learning_rate": 3.2726228095374895e-05, + "loss": 2.2706, + "step": 670 + }, + { + "epoch": 0.1908090783603974, + "grad_norm": 1.9545693397521973, + "learning_rate": 3.2714737144498706e-05, + "loss": 2.1073, + "step": 671 + }, + { + "epoch": 0.19109344360385552, + "grad_norm": 2.0932371616363525, + "learning_rate": 3.2703246193622524e-05, + "loss": 2.1358, + "step": 672 + }, + { + "epoch": 0.19137780884731365, + "grad_norm": 2.68083119392395, + "learning_rate": 3.269175524274634e-05, + "loss": 3.3797, + "step": 673 + }, + { + "epoch": 0.19166217409077174, + "grad_norm": 2.340952157974243, + "learning_rate": 3.268026429187015e-05, + "loss": 2.7555, + "step": 674 + }, + { + "epoch": 0.19194653933422987, + "grad_norm": 2.199805736541748, + "learning_rate": 3.266877334099397e-05, + "loss": 2.6694, + "step": 675 + }, + { + "epoch": 0.192230904577688, + "grad_norm": 2.1734702587127686, + "learning_rate": 3.265728239011778e-05, + "loss": 2.8257, + "step": 676 + }, + { + "epoch": 0.1925152698211461, + "grad_norm": 2.420964479446411, + "learning_rate": 3.26457914392416e-05, + "loss": 2.4486, + "step": 677 + }, + { + "epoch": 0.19279963506460424, + "grad_norm": 1.9791522026062012, + "learning_rate": 3.263430048836541e-05, + "loss": 2.3201, + "step": 678 + }, + { + "epoch": 0.19308400030806236, + "grad_norm": 2.2870163917541504, + "learning_rate": 3.262280953748923e-05, + "loss": 2.1877, + "step": 679 + }, + { + "epoch": 0.19336836555152045, + "grad_norm": 2.085327625274658, + "learning_rate": 3.261131858661304e-05, + "loss": 2.0054, + "step": 680 + }, + { + "epoch": 0.19365273079497858, + "grad_norm": 3.2444210052490234, + "learning_rate": 3.259982763573686e-05, + "loss": 3.2208, + "step": 681 + }, + { + "epoch": 0.1939370960384367, + "grad_norm": 2.47835111618042, + "learning_rate": 3.2588336684860676e-05, + "loss": 3.1121, + "step": 682 + }, + { + "epoch": 0.19422146128189482, + "grad_norm": 2.056704521179199, + "learning_rate": 3.257684573398449e-05, + "loss": 2.8531, + "step": 683 + }, + { + "epoch": 0.19450582652535295, + "grad_norm": 1.9477993249893188, + "learning_rate": 3.2565354783108305e-05, + "loss": 2.4773, + "step": 684 + }, + { + "epoch": 0.19479019176881104, + "grad_norm": 2.019652843475342, + "learning_rate": 3.2553863832232116e-05, + "loss": 2.5318, + "step": 685 + }, + { + "epoch": 0.19507455701226917, + "grad_norm": 2.164877414703369, + "learning_rate": 3.2542372881355934e-05, + "loss": 2.2676, + "step": 686 + }, + { + "epoch": 0.1953589222557273, + "grad_norm": 1.9417436122894287, + "learning_rate": 3.253088193047975e-05, + "loss": 2.1097, + "step": 687 + }, + { + "epoch": 0.19564328749918541, + "grad_norm": 2.0217719078063965, + "learning_rate": 3.251939097960357e-05, + "loss": 1.9464, + "step": 688 + }, + { + "epoch": 0.19592765274264354, + "grad_norm": 2.3088855743408203, + "learning_rate": 3.250790002872738e-05, + "loss": 3.1466, + "step": 689 + }, + { + "epoch": 0.19621201798610166, + "grad_norm": 2.2107951641082764, + "learning_rate": 3.24964090778512e-05, + "loss": 2.898, + "step": 690 + }, + { + "epoch": 0.19649638322955976, + "grad_norm": 1.8729802370071411, + "learning_rate": 3.248491812697501e-05, + "loss": 2.5574, + "step": 691 + }, + { + "epoch": 0.19678074847301788, + "grad_norm": 1.948519229888916, + "learning_rate": 3.247342717609883e-05, + "loss": 2.644, + "step": 692 + }, + { + "epoch": 0.197065113716476, + "grad_norm": 2.479282855987549, + "learning_rate": 3.246193622522264e-05, + "loss": 2.6382, + "step": 693 + }, + { + "epoch": 0.19734947895993413, + "grad_norm": 2.2494328022003174, + "learning_rate": 3.245044527434646e-05, + "loss": 2.0895, + "step": 694 + }, + { + "epoch": 0.19763384420339225, + "grad_norm": 1.9501408338546753, + "learning_rate": 3.243895432347027e-05, + "loss": 2.2874, + "step": 695 + }, + { + "epoch": 0.19791820944685035, + "grad_norm": 1.865615963935852, + "learning_rate": 3.2427463372594086e-05, + "loss": 2.0848, + "step": 696 + }, + { + "epoch": 0.19820257469030847, + "grad_norm": 2.376199722290039, + "learning_rate": 3.2415972421717904e-05, + "loss": 3.3114, + "step": 697 + }, + { + "epoch": 0.1984869399337666, + "grad_norm": 1.978826880455017, + "learning_rate": 3.2404481470841715e-05, + "loss": 2.6896, + "step": 698 + }, + { + "epoch": 0.19877130517722472, + "grad_norm": 1.8862276077270508, + "learning_rate": 3.239299051996553e-05, + "loss": 2.4757, + "step": 699 + }, + { + "epoch": 0.19905567042068284, + "grad_norm": 2.253225564956665, + "learning_rate": 3.2381499569089344e-05, + "loss": 2.7394, + "step": 700 + }, + { + "epoch": 0.19934003566414096, + "grad_norm": 2.258186101913452, + "learning_rate": 3.237000861821316e-05, + "loss": 2.6104, + "step": 701 + }, + { + "epoch": 0.19962440090759906, + "grad_norm": 1.78932785987854, + "learning_rate": 3.235851766733697e-05, + "loss": 2.2367, + "step": 702 + }, + { + "epoch": 0.19990876615105718, + "grad_norm": 1.7839140892028809, + "learning_rate": 3.234702671646079e-05, + "loss": 2.0307, + "step": 703 + }, + { + "epoch": 0.2001931313945153, + "grad_norm": 1.9078693389892578, + "learning_rate": 3.23355357655846e-05, + "loss": 1.8974, + "step": 704 + }, + { + "epoch": 0.20047749663797343, + "grad_norm": 2.489147901535034, + "learning_rate": 3.232404481470842e-05, + "loss": 3.0721, + "step": 705 + }, + { + "epoch": 0.20076186188143155, + "grad_norm": 2.5214760303497314, + "learning_rate": 3.231255386383224e-05, + "loss": 2.9676, + "step": 706 + }, + { + "epoch": 0.20104622712488965, + "grad_norm": 2.182615280151367, + "learning_rate": 3.230106291295605e-05, + "loss": 2.9474, + "step": 707 + }, + { + "epoch": 0.20133059236834777, + "grad_norm": 1.98862624168396, + "learning_rate": 3.2289571962079866e-05, + "loss": 2.7995, + "step": 708 + }, + { + "epoch": 0.2016149576118059, + "grad_norm": 1.9838632345199585, + "learning_rate": 3.227808101120368e-05, + "loss": 2.4476, + "step": 709 + }, + { + "epoch": 0.20189932285526402, + "grad_norm": 1.9730520248413086, + "learning_rate": 3.2266590060327495e-05, + "loss": 2.3272, + "step": 710 + }, + { + "epoch": 0.20218368809872214, + "grad_norm": 1.880563497543335, + "learning_rate": 3.2255099109451307e-05, + "loss": 2.0332, + "step": 711 + }, + { + "epoch": 0.20246805334218027, + "grad_norm": 1.99998140335083, + "learning_rate": 3.2243608158575124e-05, + "loss": 2.2419, + "step": 712 + }, + { + "epoch": 0.20275241858563836, + "grad_norm": 2.910835027694702, + "learning_rate": 3.2232117207698936e-05, + "loss": 3.2948, + "step": 713 + }, + { + "epoch": 0.20303678382909648, + "grad_norm": 2.386836528778076, + "learning_rate": 3.2220626256822753e-05, + "loss": 2.7419, + "step": 714 + }, + { + "epoch": 0.2033211490725546, + "grad_norm": 1.9169079065322876, + "learning_rate": 3.220913530594657e-05, + "loss": 2.6171, + "step": 715 + }, + { + "epoch": 0.20360551431601273, + "grad_norm": 1.7654900550842285, + "learning_rate": 3.219764435507038e-05, + "loss": 2.7043, + "step": 716 + }, + { + "epoch": 0.20388987955947085, + "grad_norm": 1.8636305332183838, + "learning_rate": 3.21861534041942e-05, + "loss": 2.4612, + "step": 717 + }, + { + "epoch": 0.20417424480292895, + "grad_norm": 2.11191463470459, + "learning_rate": 3.217466245331801e-05, + "loss": 2.3006, + "step": 718 + }, + { + "epoch": 0.20445861004638707, + "grad_norm": 2.05869460105896, + "learning_rate": 3.216317150244183e-05, + "loss": 2.2266, + "step": 719 + }, + { + "epoch": 0.2047429752898452, + "grad_norm": 1.921918511390686, + "learning_rate": 3.215168055156564e-05, + "loss": 1.9656, + "step": 720 + }, + { + "epoch": 0.20502734053330332, + "grad_norm": 3.3702752590179443, + "learning_rate": 3.214018960068946e-05, + "loss": 3.208, + "step": 721 + }, + { + "epoch": 0.20531170577676144, + "grad_norm": 2.2111897468566895, + "learning_rate": 3.212869864981327e-05, + "loss": 2.8252, + "step": 722 + }, + { + "epoch": 0.20559607102021957, + "grad_norm": 1.9135637283325195, + "learning_rate": 3.2117207698937094e-05, + "loss": 2.3325, + "step": 723 + }, + { + "epoch": 0.20588043626367766, + "grad_norm": 1.9720232486724854, + "learning_rate": 3.2105716748060905e-05, + "loss": 2.5609, + "step": 724 + }, + { + "epoch": 0.2061648015071358, + "grad_norm": 2.031601667404175, + "learning_rate": 3.209422579718472e-05, + "loss": 2.6579, + "step": 725 + }, + { + "epoch": 0.2064491667505939, + "grad_norm": 1.992948055267334, + "learning_rate": 3.2082734846308534e-05, + "loss": 2.4377, + "step": 726 + }, + { + "epoch": 0.20673353199405203, + "grad_norm": 2.026014566421509, + "learning_rate": 3.207124389543235e-05, + "loss": 1.9672, + "step": 727 + }, + { + "epoch": 0.20701789723751016, + "grad_norm": 1.9836902618408203, + "learning_rate": 3.205975294455616e-05, + "loss": 2.2928, + "step": 728 + }, + { + "epoch": 0.20730226248096825, + "grad_norm": 2.541131019592285, + "learning_rate": 3.204826199367998e-05, + "loss": 3.3255, + "step": 729 + }, + { + "epoch": 0.20758662772442638, + "grad_norm": 2.0298244953155518, + "learning_rate": 3.20367710428038e-05, + "loss": 2.6907, + "step": 730 + }, + { + "epoch": 0.2078709929678845, + "grad_norm": 1.8890761137008667, + "learning_rate": 3.202528009192761e-05, + "loss": 2.5107, + "step": 731 + }, + { + "epoch": 0.20815535821134262, + "grad_norm": 1.8663610219955444, + "learning_rate": 3.201378914105143e-05, + "loss": 2.4509, + "step": 732 + }, + { + "epoch": 0.20843972345480075, + "grad_norm": 1.906890630722046, + "learning_rate": 3.200229819017524e-05, + "loss": 2.5638, + "step": 733 + }, + { + "epoch": 0.20872408869825884, + "grad_norm": 1.9237983226776123, + "learning_rate": 3.199080723929906e-05, + "loss": 2.4034, + "step": 734 + }, + { + "epoch": 0.20900845394171697, + "grad_norm": 2.0110905170440674, + "learning_rate": 3.197931628842287e-05, + "loss": 2.242, + "step": 735 + }, + { + "epoch": 0.2092928191851751, + "grad_norm": 2.0842487812042236, + "learning_rate": 3.1967825337546686e-05, + "loss": 2.1287, + "step": 736 + }, + { + "epoch": 0.2095771844286332, + "grad_norm": 2.772614002227783, + "learning_rate": 3.19563343866705e-05, + "loss": 3.0786, + "step": 737 + }, + { + "epoch": 0.20986154967209134, + "grad_norm": 2.3587253093719482, + "learning_rate": 3.1944843435794315e-05, + "loss": 2.7391, + "step": 738 + }, + { + "epoch": 0.21014591491554946, + "grad_norm": 1.897799015045166, + "learning_rate": 3.193335248491813e-05, + "loss": 2.5878, + "step": 739 + }, + { + "epoch": 0.21043028015900755, + "grad_norm": 1.919022560119629, + "learning_rate": 3.1921861534041944e-05, + "loss": 2.6763, + "step": 740 + }, + { + "epoch": 0.21071464540246568, + "grad_norm": 2.1270596981048584, + "learning_rate": 3.191037058316576e-05, + "loss": 2.3583, + "step": 741 + }, + { + "epoch": 0.2109990106459238, + "grad_norm": 1.8899205923080444, + "learning_rate": 3.189887963228957e-05, + "loss": 2.4652, + "step": 742 + }, + { + "epoch": 0.21128337588938192, + "grad_norm": 1.9961659908294678, + "learning_rate": 3.188738868141339e-05, + "loss": 2.275, + "step": 743 + }, + { + "epoch": 0.21156774113284005, + "grad_norm": 2.287783622741699, + "learning_rate": 3.18758977305372e-05, + "loss": 2.0559, + "step": 744 + }, + { + "epoch": 0.21185210637629814, + "grad_norm": 2.3738584518432617, + "learning_rate": 3.186440677966102e-05, + "loss": 3.1947, + "step": 745 + }, + { + "epoch": 0.21213647161975627, + "grad_norm": 2.055588960647583, + "learning_rate": 3.185291582878483e-05, + "loss": 2.881, + "step": 746 + }, + { + "epoch": 0.2124208368632144, + "grad_norm": 2.0361130237579346, + "learning_rate": 3.184142487790865e-05, + "loss": 2.6499, + "step": 747 + }, + { + "epoch": 0.21270520210667251, + "grad_norm": 1.918145775794983, + "learning_rate": 3.182993392703247e-05, + "loss": 2.3748, + "step": 748 + }, + { + "epoch": 0.21298956735013064, + "grad_norm": 2.2371819019317627, + "learning_rate": 3.181844297615628e-05, + "loss": 2.5316, + "step": 749 + }, + { + "epoch": 0.21327393259358876, + "grad_norm": 2.0490219593048096, + "learning_rate": 3.1806952025280096e-05, + "loss": 2.3496, + "step": 750 + }, + { + "epoch": 0.21355829783704686, + "grad_norm": 1.884697437286377, + "learning_rate": 3.179546107440391e-05, + "loss": 2.168, + "step": 751 + }, + { + "epoch": 0.21384266308050498, + "grad_norm": 1.9147650003433228, + "learning_rate": 3.1783970123527725e-05, + "loss": 2.1791, + "step": 752 + }, + { + "epoch": 0.2141270283239631, + "grad_norm": 2.1511213779449463, + "learning_rate": 3.1772479172651536e-05, + "loss": 3.073, + "step": 753 + }, + { + "epoch": 0.21441139356742123, + "grad_norm": 1.8281089067459106, + "learning_rate": 3.1760988221775354e-05, + "loss": 2.7645, + "step": 754 + }, + { + "epoch": 0.21469575881087935, + "grad_norm": 1.7798327207565308, + "learning_rate": 3.1749497270899165e-05, + "loss": 2.6769, + "step": 755 + }, + { + "epoch": 0.21498012405433745, + "grad_norm": 1.9437909126281738, + "learning_rate": 3.173800632002298e-05, + "loss": 2.7385, + "step": 756 + }, + { + "epoch": 0.21526448929779557, + "grad_norm": 2.2421510219573975, + "learning_rate": 3.17265153691468e-05, + "loss": 2.3904, + "step": 757 + }, + { + "epoch": 0.2155488545412537, + "grad_norm": 2.002697467803955, + "learning_rate": 3.171502441827061e-05, + "loss": 2.1965, + "step": 758 + }, + { + "epoch": 0.21583321978471182, + "grad_norm": 2.018341541290283, + "learning_rate": 3.170353346739443e-05, + "loss": 2.1783, + "step": 759 + }, + { + "epoch": 0.21611758502816994, + "grad_norm": 1.9171587228775024, + "learning_rate": 3.169204251651825e-05, + "loss": 1.9632, + "step": 760 + }, + { + "epoch": 0.21640195027162806, + "grad_norm": 2.128800868988037, + "learning_rate": 3.168055156564206e-05, + "loss": 3.1176, + "step": 761 + }, + { + "epoch": 0.21668631551508616, + "grad_norm": 2.2135539054870605, + "learning_rate": 3.1669060614765876e-05, + "loss": 2.9083, + "step": 762 + }, + { + "epoch": 0.21697068075854428, + "grad_norm": 2.01832914352417, + "learning_rate": 3.1657569663889694e-05, + "loss": 2.5822, + "step": 763 + }, + { + "epoch": 0.2172550460020024, + "grad_norm": 1.827118992805481, + "learning_rate": 3.1646078713013505e-05, + "loss": 2.5527, + "step": 764 + }, + { + "epoch": 0.21753941124546053, + "grad_norm": 1.9703270196914673, + "learning_rate": 3.163458776213732e-05, + "loss": 2.61, + "step": 765 + }, + { + "epoch": 0.21782377648891865, + "grad_norm": 2.0175693035125732, + "learning_rate": 3.1623096811261134e-05, + "loss": 2.1853, + "step": 766 + }, + { + "epoch": 0.21810814173237675, + "grad_norm": 2.11944580078125, + "learning_rate": 3.161160586038495e-05, + "loss": 1.7576, + "step": 767 + }, + { + "epoch": 0.21839250697583487, + "grad_norm": 2.100249767303467, + "learning_rate": 3.1600114909508763e-05, + "loss": 1.9592, + "step": 768 + }, + { + "epoch": 0.218676872219293, + "grad_norm": 2.8305320739746094, + "learning_rate": 3.158862395863258e-05, + "loss": 3.3594, + "step": 769 + }, + { + "epoch": 0.21896123746275112, + "grad_norm": 2.1420650482177734, + "learning_rate": 3.157713300775639e-05, + "loss": 2.847, + "step": 770 + }, + { + "epoch": 0.21924560270620924, + "grad_norm": 1.9816900491714478, + "learning_rate": 3.156564205688021e-05, + "loss": 2.5702, + "step": 771 + }, + { + "epoch": 0.21952996794966737, + "grad_norm": 1.9326121807098389, + "learning_rate": 3.155415110600403e-05, + "loss": 2.6882, + "step": 772 + }, + { + "epoch": 0.21981433319312546, + "grad_norm": 2.1733944416046143, + "learning_rate": 3.154266015512784e-05, + "loss": 2.7475, + "step": 773 + }, + { + "epoch": 0.22009869843658358, + "grad_norm": 1.7192537784576416, + "learning_rate": 3.153116920425166e-05, + "loss": 2.3554, + "step": 774 + }, + { + "epoch": 0.2203830636800417, + "grad_norm": 1.9640390872955322, + "learning_rate": 3.151967825337547e-05, + "loss": 2.1575, + "step": 775 + }, + { + "epoch": 0.22066742892349983, + "grad_norm": 2.0537362098693848, + "learning_rate": 3.1508187302499286e-05, + "loss": 2.2478, + "step": 776 + }, + { + "epoch": 0.22095179416695795, + "grad_norm": 2.381430149078369, + "learning_rate": 3.14966963516231e-05, + "loss": 2.936, + "step": 777 + }, + { + "epoch": 0.22123615941041605, + "grad_norm": 2.202801465988159, + "learning_rate": 3.1485205400746915e-05, + "loss": 2.9745, + "step": 778 + }, + { + "epoch": 0.22152052465387417, + "grad_norm": 1.809591293334961, + "learning_rate": 3.1473714449870726e-05, + "loss": 2.6893, + "step": 779 + }, + { + "epoch": 0.2218048898973323, + "grad_norm": 1.792960524559021, + "learning_rate": 3.1462223498994544e-05, + "loss": 2.4085, + "step": 780 + }, + { + "epoch": 0.22208925514079042, + "grad_norm": 1.961672067642212, + "learning_rate": 3.145073254811836e-05, + "loss": 2.5024, + "step": 781 + }, + { + "epoch": 0.22237362038424854, + "grad_norm": 1.8171442747116089, + "learning_rate": 3.143924159724217e-05, + "loss": 2.3142, + "step": 782 + }, + { + "epoch": 0.22265798562770667, + "grad_norm": 1.9773929119110107, + "learning_rate": 3.142775064636599e-05, + "loss": 2.1619, + "step": 783 + }, + { + "epoch": 0.22294235087116476, + "grad_norm": 1.815152645111084, + "learning_rate": 3.14162596954898e-05, + "loss": 2.0866, + "step": 784 + }, + { + "epoch": 0.2232267161146229, + "grad_norm": 2.7723894119262695, + "learning_rate": 3.140476874461362e-05, + "loss": 3.3017, + "step": 785 + }, + { + "epoch": 0.223511081358081, + "grad_norm": 1.9993256330490112, + "learning_rate": 3.139327779373743e-05, + "loss": 2.6683, + "step": 786 + }, + { + "epoch": 0.22379544660153913, + "grad_norm": 1.7609732151031494, + "learning_rate": 3.138178684286125e-05, + "loss": 2.5893, + "step": 787 + }, + { + "epoch": 0.22407981184499726, + "grad_norm": 1.7786259651184082, + "learning_rate": 3.137029589198506e-05, + "loss": 2.4328, + "step": 788 + }, + { + "epoch": 0.22436417708845535, + "grad_norm": 1.887364149093628, + "learning_rate": 3.135880494110888e-05, + "loss": 2.5221, + "step": 789 + }, + { + "epoch": 0.22464854233191348, + "grad_norm": 1.8501337766647339, + "learning_rate": 3.1347313990232696e-05, + "loss": 2.2003, + "step": 790 + }, + { + "epoch": 0.2249329075753716, + "grad_norm": 1.7945717573165894, + "learning_rate": 3.133582303935651e-05, + "loss": 2.1568, + "step": 791 + }, + { + "epoch": 0.22521727281882972, + "grad_norm": 1.8720512390136719, + "learning_rate": 3.1324332088480325e-05, + "loss": 2.104, + "step": 792 + }, + { + "epoch": 0.22550163806228785, + "grad_norm": 2.1769962310791016, + "learning_rate": 3.1312841137604136e-05, + "loss": 3.2416, + "step": 793 + }, + { + "epoch": 0.22578600330574594, + "grad_norm": 2.1001853942871094, + "learning_rate": 3.1301350186727954e-05, + "loss": 2.6237, + "step": 794 + }, + { + "epoch": 0.22607036854920406, + "grad_norm": 1.819381594657898, + "learning_rate": 3.1289859235851765e-05, + "loss": 2.5273, + "step": 795 + }, + { + "epoch": 0.2263547337926622, + "grad_norm": 1.706929326057434, + "learning_rate": 3.127836828497558e-05, + "loss": 2.5795, + "step": 796 + }, + { + "epoch": 0.2266390990361203, + "grad_norm": 2.0382096767425537, + "learning_rate": 3.12668773340994e-05, + "loss": 2.5978, + "step": 797 + }, + { + "epoch": 0.22692346427957844, + "grad_norm": 1.7794004678726196, + "learning_rate": 3.125538638322322e-05, + "loss": 2.2474, + "step": 798 + }, + { + "epoch": 0.22720782952303656, + "grad_norm": 1.8751369714736938, + "learning_rate": 3.124389543234703e-05, + "loss": 2.0745, + "step": 799 + }, + { + "epoch": 0.22749219476649465, + "grad_norm": 1.9768788814544678, + "learning_rate": 3.123240448147085e-05, + "loss": 2.2523, + "step": 800 + }, + { + "epoch": 0.22777656000995278, + "grad_norm": 2.430659532546997, + "learning_rate": 3.122091353059466e-05, + "loss": 3.2516, + "step": 801 + }, + { + "epoch": 0.2280609252534109, + "grad_norm": 1.9983506202697754, + "learning_rate": 3.120942257971848e-05, + "loss": 2.7283, + "step": 802 + }, + { + "epoch": 0.22834529049686902, + "grad_norm": 1.8245633840560913, + "learning_rate": 3.119793162884229e-05, + "loss": 2.4896, + "step": 803 + }, + { + "epoch": 0.22862965574032715, + "grad_norm": 1.724178433418274, + "learning_rate": 3.1186440677966106e-05, + "loss": 2.5445, + "step": 804 + }, + { + "epoch": 0.22891402098378524, + "grad_norm": 1.851968765258789, + "learning_rate": 3.1174949727089924e-05, + "loss": 2.3454, + "step": 805 + }, + { + "epoch": 0.22919838622724337, + "grad_norm": 1.7947938442230225, + "learning_rate": 3.1163458776213735e-05, + "loss": 2.2357, + "step": 806 + }, + { + "epoch": 0.2294827514707015, + "grad_norm": 2.108400821685791, + "learning_rate": 3.115196782533755e-05, + "loss": 2.2438, + "step": 807 + }, + { + "epoch": 0.2297671167141596, + "grad_norm": 2.0238804817199707, + "learning_rate": 3.1140476874461364e-05, + "loss": 1.9718, + "step": 808 + }, + { + "epoch": 0.23005148195761774, + "grad_norm": 2.4201109409332275, + "learning_rate": 3.112898592358518e-05, + "loss": 3.2521, + "step": 809 + }, + { + "epoch": 0.23033584720107586, + "grad_norm": 1.9365209341049194, + "learning_rate": 3.111749497270899e-05, + "loss": 2.9715, + "step": 810 + }, + { + "epoch": 0.23062021244453396, + "grad_norm": 1.670932650566101, + "learning_rate": 3.110600402183281e-05, + "loss": 2.6555, + "step": 811 + }, + { + "epoch": 0.23090457768799208, + "grad_norm": 1.8441979885101318, + "learning_rate": 3.109451307095662e-05, + "loss": 2.6097, + "step": 812 + }, + { + "epoch": 0.2311889429314502, + "grad_norm": 2.0533034801483154, + "learning_rate": 3.108302212008044e-05, + "loss": 2.4527, + "step": 813 + }, + { + "epoch": 0.23147330817490833, + "grad_norm": 1.9839503765106201, + "learning_rate": 3.107153116920426e-05, + "loss": 2.3322, + "step": 814 + }, + { + "epoch": 0.23175767341836645, + "grad_norm": 1.8730031251907349, + "learning_rate": 3.106004021832807e-05, + "loss": 2.0321, + "step": 815 + }, + { + "epoch": 0.23204203866182455, + "grad_norm": 2.0080573558807373, + "learning_rate": 3.1048549267451886e-05, + "loss": 2.295, + "step": 816 + }, + { + "epoch": 0.23232640390528267, + "grad_norm": 2.2020275592803955, + "learning_rate": 3.10370583165757e-05, + "loss": 3.0295, + "step": 817 + }, + { + "epoch": 0.2326107691487408, + "grad_norm": 1.7750929594039917, + "learning_rate": 3.1025567365699515e-05, + "loss": 2.8698, + "step": 818 + }, + { + "epoch": 0.23289513439219892, + "grad_norm": 1.6771564483642578, + "learning_rate": 3.1014076414823326e-05, + "loss": 2.417, + "step": 819 + }, + { + "epoch": 0.23317949963565704, + "grad_norm": 1.8319196701049805, + "learning_rate": 3.1002585463947144e-05, + "loss": 2.3378, + "step": 820 + }, + { + "epoch": 0.23346386487911516, + "grad_norm": 2.1067543029785156, + "learning_rate": 3.0991094513070955e-05, + "loss": 2.434, + "step": 821 + }, + { + "epoch": 0.23374823012257326, + "grad_norm": 2.0174291133880615, + "learning_rate": 3.097960356219477e-05, + "loss": 2.2196, + "step": 822 + }, + { + "epoch": 0.23403259536603138, + "grad_norm": 1.8617424964904785, + "learning_rate": 3.096811261131859e-05, + "loss": 2.0332, + "step": 823 + }, + { + "epoch": 0.2343169606094895, + "grad_norm": 1.8981695175170898, + "learning_rate": 3.09566216604424e-05, + "loss": 2.0747, + "step": 824 + }, + { + "epoch": 0.23460132585294763, + "grad_norm": 2.3870060443878174, + "learning_rate": 3.094513070956622e-05, + "loss": 3.1643, + "step": 825 + }, + { + "epoch": 0.23488569109640575, + "grad_norm": 2.0108344554901123, + "learning_rate": 3.093363975869003e-05, + "loss": 2.8178, + "step": 826 + }, + { + "epoch": 0.23517005633986385, + "grad_norm": 2.198622941970825, + "learning_rate": 3.092214880781385e-05, + "loss": 2.7069, + "step": 827 + }, + { + "epoch": 0.23545442158332197, + "grad_norm": 2.032430648803711, + "learning_rate": 3.091065785693766e-05, + "loss": 2.4352, + "step": 828 + }, + { + "epoch": 0.2357387868267801, + "grad_norm": 1.968604564666748, + "learning_rate": 3.089916690606148e-05, + "loss": 2.2391, + "step": 829 + }, + { + "epoch": 0.23602315207023822, + "grad_norm": 1.6530208587646484, + "learning_rate": 3.088767595518529e-05, + "loss": 2.223, + "step": 830 + }, + { + "epoch": 0.23630751731369634, + "grad_norm": 2.004181385040283, + "learning_rate": 3.087618500430911e-05, + "loss": 2.0482, + "step": 831 + }, + { + "epoch": 0.23659188255715446, + "grad_norm": 2.0865046977996826, + "learning_rate": 3.0864694053432925e-05, + "loss": 2.1199, + "step": 832 + }, + { + "epoch": 0.23687624780061256, + "grad_norm": 2.4281649589538574, + "learning_rate": 3.0853203102556736e-05, + "loss": 3.1147, + "step": 833 + }, + { + "epoch": 0.23716061304407068, + "grad_norm": 2.327183246612549, + "learning_rate": 3.0841712151680554e-05, + "loss": 2.7999, + "step": 834 + }, + { + "epoch": 0.2374449782875288, + "grad_norm": 1.8897318840026855, + "learning_rate": 3.083022120080437e-05, + "loss": 2.6127, + "step": 835 + }, + { + "epoch": 0.23772934353098693, + "grad_norm": 1.755764126777649, + "learning_rate": 3.081873024992818e-05, + "loss": 2.5684, + "step": 836 + }, + { + "epoch": 0.23801370877444505, + "grad_norm": 1.9734392166137695, + "learning_rate": 3.0807239299052e-05, + "loss": 2.3049, + "step": 837 + }, + { + "epoch": 0.23829807401790315, + "grad_norm": 1.9231913089752197, + "learning_rate": 3.079574834817582e-05, + "loss": 2.3946, + "step": 838 + }, + { + "epoch": 0.23858243926136127, + "grad_norm": 2.0235962867736816, + "learning_rate": 3.078425739729963e-05, + "loss": 2.2068, + "step": 839 + }, + { + "epoch": 0.2388668045048194, + "grad_norm": 2.155137538909912, + "learning_rate": 3.077276644642345e-05, + "loss": 2.0929, + "step": 840 + }, + { + "epoch": 0.23915116974827752, + "grad_norm": 2.36690092086792, + "learning_rate": 3.076127549554726e-05, + "loss": 3.1167, + "step": 841 + }, + { + "epoch": 0.23943553499173564, + "grad_norm": 2.195228099822998, + "learning_rate": 3.074978454467108e-05, + "loss": 3.0185, + "step": 842 + }, + { + "epoch": 0.23971990023519377, + "grad_norm": 1.9150617122650146, + "learning_rate": 3.073829359379489e-05, + "loss": 2.6971, + "step": 843 + }, + { + "epoch": 0.24000426547865186, + "grad_norm": 1.8287773132324219, + "learning_rate": 3.0726802642918706e-05, + "loss": 2.6606, + "step": 844 + }, + { + "epoch": 0.24028863072210999, + "grad_norm": 1.9123808145523071, + "learning_rate": 3.071531169204252e-05, + "loss": 2.3256, + "step": 845 + }, + { + "epoch": 0.2405729959655681, + "grad_norm": 2.077585458755493, + "learning_rate": 3.0703820741166335e-05, + "loss": 2.1924, + "step": 846 + }, + { + "epoch": 0.24085736120902623, + "grad_norm": 2.0052852630615234, + "learning_rate": 3.069232979029015e-05, + "loss": 1.979, + "step": 847 + }, + { + "epoch": 0.24114172645248436, + "grad_norm": 2.1259374618530273, + "learning_rate": 3.0680838839413964e-05, + "loss": 2.0226, + "step": 848 + }, + { + "epoch": 0.24142609169594245, + "grad_norm": 2.7498273849487305, + "learning_rate": 3.066934788853778e-05, + "loss": 3.2752, + "step": 849 + }, + { + "epoch": 0.24171045693940058, + "grad_norm": 1.9522055387496948, + "learning_rate": 3.065785693766159e-05, + "loss": 2.7302, + "step": 850 + }, + { + "epoch": 0.2419948221828587, + "grad_norm": 1.9767372608184814, + "learning_rate": 3.064636598678541e-05, + "loss": 2.7091, + "step": 851 + }, + { + "epoch": 0.24227918742631682, + "grad_norm": 1.8098385334014893, + "learning_rate": 3.063487503590922e-05, + "loss": 2.5372, + "step": 852 + }, + { + "epoch": 0.24256355266977495, + "grad_norm": 1.995309829711914, + "learning_rate": 3.062338408503304e-05, + "loss": 2.3491, + "step": 853 + }, + { + "epoch": 0.24284791791323304, + "grad_norm": 1.838243842124939, + "learning_rate": 3.061189313415685e-05, + "loss": 2.3286, + "step": 854 + }, + { + "epoch": 0.24313228315669116, + "grad_norm": 1.8721081018447876, + "learning_rate": 3.060040218328067e-05, + "loss": 2.1154, + "step": 855 + }, + { + "epoch": 0.2434166484001493, + "grad_norm": 2.050607919692993, + "learning_rate": 3.0588911232404487e-05, + "loss": 1.938, + "step": 856 + }, + { + "epoch": 0.2437010136436074, + "grad_norm": 2.1715247631073, + "learning_rate": 3.05774202815283e-05, + "loss": 2.9436, + "step": 857 + }, + { + "epoch": 0.24398537888706553, + "grad_norm": 2.1600615978240967, + "learning_rate": 3.0565929330652116e-05, + "loss": 2.6224, + "step": 858 + }, + { + "epoch": 0.24426974413052366, + "grad_norm": 1.890982747077942, + "learning_rate": 3.055443837977593e-05, + "loss": 2.6999, + "step": 859 + }, + { + "epoch": 0.24455410937398175, + "grad_norm": 1.9753085374832153, + "learning_rate": 3.0542947428899745e-05, + "loss": 2.4645, + "step": 860 + }, + { + "epoch": 0.24483847461743988, + "grad_norm": 2.096250295639038, + "learning_rate": 3.0531456478023556e-05, + "loss": 2.534, + "step": 861 + }, + { + "epoch": 0.245122839860898, + "grad_norm": 1.9475681781768799, + "learning_rate": 3.0519965527147374e-05, + "loss": 2.1234, + "step": 862 + }, + { + "epoch": 0.24540720510435612, + "grad_norm": 1.8136141300201416, + "learning_rate": 3.0508474576271188e-05, + "loss": 1.917, + "step": 863 + }, + { + "epoch": 0.24569157034781425, + "grad_norm": 2.0293166637420654, + "learning_rate": 3.0496983625395003e-05, + "loss": 2.2178, + "step": 864 + }, + { + "epoch": 0.24597593559127234, + "grad_norm": 2.072183847427368, + "learning_rate": 3.0485492674518817e-05, + "loss": 2.9106, + "step": 865 + }, + { + "epoch": 0.24626030083473047, + "grad_norm": 1.7826175689697266, + "learning_rate": 3.047400172364263e-05, + "loss": 2.9296, + "step": 866 + }, + { + "epoch": 0.2465446660781886, + "grad_norm": 1.876105546951294, + "learning_rate": 3.0462510772766446e-05, + "loss": 2.6027, + "step": 867 + }, + { + "epoch": 0.2468290313216467, + "grad_norm": 1.8935751914978027, + "learning_rate": 3.045101982189026e-05, + "loss": 2.4412, + "step": 868 + }, + { + "epoch": 0.24711339656510484, + "grad_norm": 2.0665159225463867, + "learning_rate": 3.043952887101408e-05, + "loss": 2.271, + "step": 869 + }, + { + "epoch": 0.24739776180856296, + "grad_norm": 1.7189526557922363, + "learning_rate": 3.0428037920137893e-05, + "loss": 2.2023, + "step": 870 + }, + { + "epoch": 0.24768212705202106, + "grad_norm": 1.9814810752868652, + "learning_rate": 3.041654696926171e-05, + "loss": 2.1293, + "step": 871 + }, + { + "epoch": 0.24796649229547918, + "grad_norm": 1.9453988075256348, + "learning_rate": 3.0405056018385525e-05, + "loss": 2.0083, + "step": 872 + }, + { + "epoch": 0.2482508575389373, + "grad_norm": 2.7860076427459717, + "learning_rate": 3.039356506750934e-05, + "loss": 3.0749, + "step": 873 + }, + { + "epoch": 0.24853522278239543, + "grad_norm": 1.9733988046646118, + "learning_rate": 3.0382074116633154e-05, + "loss": 2.6634, + "step": 874 + }, + { + "epoch": 0.24881958802585355, + "grad_norm": 1.8551710844039917, + "learning_rate": 3.0370583165756972e-05, + "loss": 2.4293, + "step": 875 + }, + { + "epoch": 0.24910395326931165, + "grad_norm": 1.7632685899734497, + "learning_rate": 3.0359092214880787e-05, + "loss": 2.2907, + "step": 876 + }, + { + "epoch": 0.24938831851276977, + "grad_norm": 1.9409010410308838, + "learning_rate": 3.03476012640046e-05, + "loss": 2.4988, + "step": 877 + }, + { + "epoch": 0.2496726837562279, + "grad_norm": 1.7946302890777588, + "learning_rate": 3.0336110313128416e-05, + "loss": 2.0384, + "step": 878 + }, + { + "epoch": 0.24995704899968602, + "grad_norm": 2.018123149871826, + "learning_rate": 3.032461936225223e-05, + "loss": 1.9721, + "step": 879 + }, + { + "epoch": 0.2502414142431441, + "grad_norm": 1.960157036781311, + "learning_rate": 3.0313128411376045e-05, + "loss": 2.194, + "step": 880 + }, + { + "epoch": 0.25052577948660226, + "grad_norm": 2.0886595249176025, + "learning_rate": 3.030163746049986e-05, + "loss": 2.9749, + "step": 881 + }, + { + "epoch": 0.25081014473006036, + "grad_norm": 1.888434886932373, + "learning_rate": 3.0290146509623674e-05, + "loss": 2.8315, + "step": 882 + }, + { + "epoch": 0.2510945099735185, + "grad_norm": 1.9592381715774536, + "learning_rate": 3.0278655558747488e-05, + "loss": 2.4951, + "step": 883 + }, + { + "epoch": 0.2513788752169766, + "grad_norm": 1.7165327072143555, + "learning_rate": 3.0267164607871306e-05, + "loss": 2.5657, + "step": 884 + }, + { + "epoch": 0.2516632404604347, + "grad_norm": 2.0365653038024902, + "learning_rate": 3.025567365699512e-05, + "loss": 2.5552, + "step": 885 + }, + { + "epoch": 0.25194760570389285, + "grad_norm": 1.9661747217178345, + "learning_rate": 3.0244182706118935e-05, + "loss": 2.2448, + "step": 886 + }, + { + "epoch": 0.25223197094735095, + "grad_norm": 1.8233864307403564, + "learning_rate": 3.023269175524275e-05, + "loss": 2.0185, + "step": 887 + }, + { + "epoch": 0.2525163361908091, + "grad_norm": 2.1719167232513428, + "learning_rate": 3.0221200804366564e-05, + "loss": 2.224, + "step": 888 + }, + { + "epoch": 0.2528007014342672, + "grad_norm": 2.498225450515747, + "learning_rate": 3.020970985349038e-05, + "loss": 3.2098, + "step": 889 + }, + { + "epoch": 0.2530850666777253, + "grad_norm": 1.9225410223007202, + "learning_rate": 3.0198218902614193e-05, + "loss": 2.5611, + "step": 890 + }, + { + "epoch": 0.25336943192118344, + "grad_norm": 1.8655143976211548, + "learning_rate": 3.0186727951738008e-05, + "loss": 2.6871, + "step": 891 + }, + { + "epoch": 0.25365379716464154, + "grad_norm": 1.7953119277954102, + "learning_rate": 3.0175237000861822e-05, + "loss": 2.6092, + "step": 892 + }, + { + "epoch": 0.2539381624080997, + "grad_norm": 2.0147297382354736, + "learning_rate": 3.016374604998564e-05, + "loss": 2.45, + "step": 893 + }, + { + "epoch": 0.2542225276515578, + "grad_norm": 1.827158808708191, + "learning_rate": 3.0152255099109454e-05, + "loss": 2.0963, + "step": 894 + }, + { + "epoch": 0.2545068928950159, + "grad_norm": 1.9638267755508423, + "learning_rate": 3.014076414823327e-05, + "loss": 1.9395, + "step": 895 + }, + { + "epoch": 0.25479125813847403, + "grad_norm": 1.9481165409088135, + "learning_rate": 3.0129273197357083e-05, + "loss": 2.171, + "step": 896 + }, + { + "epoch": 0.2550756233819321, + "grad_norm": 2.1484596729278564, + "learning_rate": 3.0117782246480898e-05, + "loss": 3.081, + "step": 897 + }, + { + "epoch": 0.2553599886253903, + "grad_norm": 1.8102747201919556, + "learning_rate": 3.0106291295604712e-05, + "loss": 2.6534, + "step": 898 + }, + { + "epoch": 0.2556443538688484, + "grad_norm": 1.7093149423599243, + "learning_rate": 3.0094800344728527e-05, + "loss": 2.5904, + "step": 899 + }, + { + "epoch": 0.2559287191123065, + "grad_norm": 1.7110258340835571, + "learning_rate": 3.008330939385234e-05, + "loss": 2.6272, + "step": 900 + }, + { + "epoch": 0.2562130843557646, + "grad_norm": 1.8184210062026978, + "learning_rate": 3.0071818442976156e-05, + "loss": 2.3048, + "step": 901 + }, + { + "epoch": 0.2564974495992227, + "grad_norm": 1.7951569557189941, + "learning_rate": 3.0060327492099974e-05, + "loss": 2.1765, + "step": 902 + }, + { + "epoch": 0.25678181484268087, + "grad_norm": 1.8752012252807617, + "learning_rate": 3.0048836541223788e-05, + "loss": 2.065, + "step": 903 + }, + { + "epoch": 0.25706618008613896, + "grad_norm": 1.838429570198059, + "learning_rate": 3.0037345590347603e-05, + "loss": 1.9967, + "step": 904 + }, + { + "epoch": 0.2573505453295971, + "grad_norm": 2.226714849472046, + "learning_rate": 3.0025854639471417e-05, + "loss": 3.2629, + "step": 905 + }, + { + "epoch": 0.2576349105730552, + "grad_norm": 1.8551560640335083, + "learning_rate": 3.0014363688595232e-05, + "loss": 2.7787, + "step": 906 + }, + { + "epoch": 0.2579192758165133, + "grad_norm": 1.9678069353103638, + "learning_rate": 3.0002872737719046e-05, + "loss": 2.7359, + "step": 907 + }, + { + "epoch": 0.25820364105997146, + "grad_norm": 1.7606141567230225, + "learning_rate": 2.9991381786842864e-05, + "loss": 2.4614, + "step": 908 + }, + { + "epoch": 0.25848800630342955, + "grad_norm": 1.9030214548110962, + "learning_rate": 2.9979890835966682e-05, + "loss": 2.6742, + "step": 909 + }, + { + "epoch": 0.2587723715468877, + "grad_norm": 1.7596023082733154, + "learning_rate": 2.9968399885090497e-05, + "loss": 2.4495, + "step": 910 + }, + { + "epoch": 0.2590567367903458, + "grad_norm": 1.7681137323379517, + "learning_rate": 2.995690893421431e-05, + "loss": 1.8863, + "step": 911 + }, + { + "epoch": 0.2593411020338039, + "grad_norm": 1.8547093868255615, + "learning_rate": 2.9945417983338126e-05, + "loss": 1.8526, + "step": 912 + }, + { + "epoch": 0.25962546727726205, + "grad_norm": 2.3479535579681396, + "learning_rate": 2.993392703246194e-05, + "loss": 3.0925, + "step": 913 + }, + { + "epoch": 0.25990983252072014, + "grad_norm": 2.0483782291412354, + "learning_rate": 2.9922436081585755e-05, + "loss": 2.8808, + "step": 914 + }, + { + "epoch": 0.2601941977641783, + "grad_norm": 1.8194479942321777, + "learning_rate": 2.991094513070957e-05, + "loss": 2.7473, + "step": 915 + }, + { + "epoch": 0.2604785630076364, + "grad_norm": 1.7477372884750366, + "learning_rate": 2.9899454179833384e-05, + "loss": 2.5109, + "step": 916 + }, + { + "epoch": 0.2607629282510945, + "grad_norm": 1.8002467155456543, + "learning_rate": 2.9887963228957198e-05, + "loss": 2.3975, + "step": 917 + }, + { + "epoch": 0.26104729349455263, + "grad_norm": 1.7082115411758423, + "learning_rate": 2.9876472278081016e-05, + "loss": 2.0252, + "step": 918 + }, + { + "epoch": 0.26133165873801073, + "grad_norm": 1.8353215456008911, + "learning_rate": 2.986498132720483e-05, + "loss": 1.9635, + "step": 919 + }, + { + "epoch": 0.2616160239814689, + "grad_norm": 1.9129843711853027, + "learning_rate": 2.9853490376328645e-05, + "loss": 1.8942, + "step": 920 + }, + { + "epoch": 0.261900389224927, + "grad_norm": 2.272021770477295, + "learning_rate": 2.984199942545246e-05, + "loss": 3.1266, + "step": 921 + }, + { + "epoch": 0.26218475446838513, + "grad_norm": 1.8906854391098022, + "learning_rate": 2.9830508474576274e-05, + "loss": 2.7551, + "step": 922 + }, + { + "epoch": 0.2624691197118432, + "grad_norm": 1.786133050918579, + "learning_rate": 2.981901752370009e-05, + "loss": 2.529, + "step": 923 + }, + { + "epoch": 0.2627534849553013, + "grad_norm": 1.7091472148895264, + "learning_rate": 2.9807526572823903e-05, + "loss": 2.4381, + "step": 924 + }, + { + "epoch": 0.26303785019875947, + "grad_norm": 2.077052354812622, + "learning_rate": 2.9796035621947717e-05, + "loss": 2.4781, + "step": 925 + }, + { + "epoch": 0.26332221544221757, + "grad_norm": 1.774610996246338, + "learning_rate": 2.9784544671071532e-05, + "loss": 2.156, + "step": 926 + }, + { + "epoch": 0.2636065806856757, + "grad_norm": 1.7777446508407593, + "learning_rate": 2.977305372019535e-05, + "loss": 2.1573, + "step": 927 + }, + { + "epoch": 0.2638909459291338, + "grad_norm": 1.9106909036636353, + "learning_rate": 2.9761562769319164e-05, + "loss": 1.9771, + "step": 928 + }, + { + "epoch": 0.2641753111725919, + "grad_norm": 2.073526620864868, + "learning_rate": 2.975007181844298e-05, + "loss": 3.0332, + "step": 929 + }, + { + "epoch": 0.26445967641605006, + "grad_norm": 1.7309014797210693, + "learning_rate": 2.9738580867566793e-05, + "loss": 2.7578, + "step": 930 + }, + { + "epoch": 0.26474404165950816, + "grad_norm": 1.7269665002822876, + "learning_rate": 2.9727089916690608e-05, + "loss": 2.5512, + "step": 931 + }, + { + "epoch": 0.2650284069029663, + "grad_norm": 1.8180932998657227, + "learning_rate": 2.9715598965814422e-05, + "loss": 2.6815, + "step": 932 + }, + { + "epoch": 0.2653127721464244, + "grad_norm": 1.8024221658706665, + "learning_rate": 2.9704108014938237e-05, + "loss": 2.306, + "step": 933 + }, + { + "epoch": 0.2655971373898825, + "grad_norm": 1.8379640579223633, + "learning_rate": 2.969261706406205e-05, + "loss": 2.1043, + "step": 934 + }, + { + "epoch": 0.26588150263334065, + "grad_norm": 1.738338828086853, + "learning_rate": 2.9681126113185866e-05, + "loss": 1.9482, + "step": 935 + }, + { + "epoch": 0.26616586787679875, + "grad_norm": 1.7984474897384644, + "learning_rate": 2.9669635162309684e-05, + "loss": 2.0764, + "step": 936 + }, + { + "epoch": 0.2664502331202569, + "grad_norm": 2.280787229537964, + "learning_rate": 2.9658144211433498e-05, + "loss": 2.9034, + "step": 937 + }, + { + "epoch": 0.266734598363715, + "grad_norm": 1.7945760488510132, + "learning_rate": 2.9646653260557313e-05, + "loss": 2.5243, + "step": 938 + }, + { + "epoch": 0.2670189636071731, + "grad_norm": 1.812246322631836, + "learning_rate": 2.9635162309681127e-05, + "loss": 2.4984, + "step": 939 + }, + { + "epoch": 0.26730332885063124, + "grad_norm": 1.9260566234588623, + "learning_rate": 2.962367135880494e-05, + "loss": 2.5072, + "step": 940 + }, + { + "epoch": 0.26758769409408933, + "grad_norm": 2.1809868812561035, + "learning_rate": 2.9612180407928756e-05, + "loss": 2.2971, + "step": 941 + }, + { + "epoch": 0.2678720593375475, + "grad_norm": 2.036618232727051, + "learning_rate": 2.960068945705257e-05, + "loss": 2.3523, + "step": 942 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 2.0626561641693115, + "learning_rate": 2.9589198506176385e-05, + "loss": 2.1757, + "step": 943 + }, + { + "epoch": 0.26844078982446373, + "grad_norm": 1.8505395650863647, + "learning_rate": 2.95777075553002e-05, + "loss": 2.0444, + "step": 944 + }, + { + "epoch": 0.26872515506792183, + "grad_norm": 2.1572868824005127, + "learning_rate": 2.956621660442402e-05, + "loss": 3.1699, + "step": 945 + }, + { + "epoch": 0.2690095203113799, + "grad_norm": 2.3714983463287354, + "learning_rate": 2.9554725653547835e-05, + "loss": 2.6934, + "step": 946 + }, + { + "epoch": 0.2692938855548381, + "grad_norm": 2.13543701171875, + "learning_rate": 2.954323470267165e-05, + "loss": 2.4942, + "step": 947 + }, + { + "epoch": 0.26957825079829617, + "grad_norm": 2.133861541748047, + "learning_rate": 2.9531743751795464e-05, + "loss": 2.5744, + "step": 948 + }, + { + "epoch": 0.2698626160417543, + "grad_norm": 2.4103362560272217, + "learning_rate": 2.952025280091928e-05, + "loss": 2.4989, + "step": 949 + }, + { + "epoch": 0.2701469812852124, + "grad_norm": 1.9884144067764282, + "learning_rate": 2.9508761850043093e-05, + "loss": 2.3324, + "step": 950 + }, + { + "epoch": 0.2704313465286705, + "grad_norm": 1.9855095148086548, + "learning_rate": 2.949727089916691e-05, + "loss": 2.2511, + "step": 951 + }, + { + "epoch": 0.27071571177212866, + "grad_norm": 1.8973934650421143, + "learning_rate": 2.9485779948290726e-05, + "loss": 2.0523, + "step": 952 + }, + { + "epoch": 0.27100007701558676, + "grad_norm": 2.413694143295288, + "learning_rate": 2.947428899741454e-05, + "loss": 3.0049, + "step": 953 + }, + { + "epoch": 0.2712844422590449, + "grad_norm": 2.2881429195404053, + "learning_rate": 2.9462798046538355e-05, + "loss": 2.5529, + "step": 954 + }, + { + "epoch": 0.271568807502503, + "grad_norm": 1.9285370111465454, + "learning_rate": 2.945130709566217e-05, + "loss": 2.3945, + "step": 955 + }, + { + "epoch": 0.2718531727459611, + "grad_norm": 1.9740569591522217, + "learning_rate": 2.9439816144785984e-05, + "loss": 2.4983, + "step": 956 + }, + { + "epoch": 0.27213753798941925, + "grad_norm": 1.8763717412948608, + "learning_rate": 2.9428325193909798e-05, + "loss": 2.4825, + "step": 957 + }, + { + "epoch": 0.27242190323287735, + "grad_norm": 1.8496977090835571, + "learning_rate": 2.9416834243033613e-05, + "loss": 2.1357, + "step": 958 + }, + { + "epoch": 0.2727062684763355, + "grad_norm": 1.9461162090301514, + "learning_rate": 2.9405343292157427e-05, + "loss": 2.1733, + "step": 959 + }, + { + "epoch": 0.2729906337197936, + "grad_norm": 2.133432388305664, + "learning_rate": 2.9393852341281245e-05, + "loss": 2.145, + "step": 960 + }, + { + "epoch": 0.2732749989632517, + "grad_norm": 2.7687673568725586, + "learning_rate": 2.938236139040506e-05, + "loss": 3.0316, + "step": 961 + }, + { + "epoch": 0.27355936420670984, + "grad_norm": 1.9343767166137695, + "learning_rate": 2.9370870439528874e-05, + "loss": 2.6818, + "step": 962 + }, + { + "epoch": 0.27384372945016794, + "grad_norm": 1.7821241617202759, + "learning_rate": 2.935937948865269e-05, + "loss": 2.5806, + "step": 963 + }, + { + "epoch": 0.2741280946936261, + "grad_norm": 1.7667675018310547, + "learning_rate": 2.9347888537776503e-05, + "loss": 2.6895, + "step": 964 + }, + { + "epoch": 0.2744124599370842, + "grad_norm": 1.863686203956604, + "learning_rate": 2.9336397586900318e-05, + "loss": 2.3492, + "step": 965 + }, + { + "epoch": 0.2746968251805423, + "grad_norm": 1.9857769012451172, + "learning_rate": 2.9324906636024132e-05, + "loss": 2.3327, + "step": 966 + }, + { + "epoch": 0.27498119042400043, + "grad_norm": 1.7995944023132324, + "learning_rate": 2.9313415685147947e-05, + "loss": 2.0556, + "step": 967 + }, + { + "epoch": 0.27526555566745853, + "grad_norm": 1.8642942905426025, + "learning_rate": 2.930192473427176e-05, + "loss": 1.875, + "step": 968 + }, + { + "epoch": 0.2755499209109167, + "grad_norm": 1.9679466485977173, + "learning_rate": 2.929043378339558e-05, + "loss": 3.0327, + "step": 969 + }, + { + "epoch": 0.2758342861543748, + "grad_norm": 1.868360161781311, + "learning_rate": 2.9278942832519394e-05, + "loss": 2.4991, + "step": 970 + }, + { + "epoch": 0.2761186513978329, + "grad_norm": 1.9371778964996338, + "learning_rate": 2.9267451881643208e-05, + "loss": 2.651, + "step": 971 + }, + { + "epoch": 0.276403016641291, + "grad_norm": 1.7306244373321533, + "learning_rate": 2.9255960930767022e-05, + "loss": 2.4744, + "step": 972 + }, + { + "epoch": 0.2766873818847491, + "grad_norm": 1.9607441425323486, + "learning_rate": 2.9244469979890837e-05, + "loss": 2.3596, + "step": 973 + }, + { + "epoch": 0.27697174712820727, + "grad_norm": 1.9397883415222168, + "learning_rate": 2.923297902901465e-05, + "loss": 2.1509, + "step": 974 + }, + { + "epoch": 0.27725611237166536, + "grad_norm": 1.9228720664978027, + "learning_rate": 2.9221488078138466e-05, + "loss": 1.9686, + "step": 975 + }, + { + "epoch": 0.2775404776151235, + "grad_norm": 1.9440864324569702, + "learning_rate": 2.920999712726228e-05, + "loss": 2.0284, + "step": 976 + }, + { + "epoch": 0.2778248428585816, + "grad_norm": 2.285125970840454, + "learning_rate": 2.9198506176386095e-05, + "loss": 2.8307, + "step": 977 + }, + { + "epoch": 0.2781092081020397, + "grad_norm": 1.86907160282135, + "learning_rate": 2.9187015225509913e-05, + "loss": 2.6328, + "step": 978 + }, + { + "epoch": 0.27839357334549786, + "grad_norm": 1.7151695489883423, + "learning_rate": 2.9175524274633727e-05, + "loss": 2.7284, + "step": 979 + }, + { + "epoch": 0.27867793858895595, + "grad_norm": 1.6392958164215088, + "learning_rate": 2.9164033323757542e-05, + "loss": 2.6941, + "step": 980 + }, + { + "epoch": 0.2789623038324141, + "grad_norm": 1.9084131717681885, + "learning_rate": 2.9152542372881356e-05, + "loss": 2.2339, + "step": 981 + }, + { + "epoch": 0.2792466690758722, + "grad_norm": 1.7988026142120361, + "learning_rate": 2.9141051422005174e-05, + "loss": 2.1509, + "step": 982 + }, + { + "epoch": 0.2795310343193303, + "grad_norm": 1.8712517023086548, + "learning_rate": 2.912956047112899e-05, + "loss": 1.9558, + "step": 983 + }, + { + "epoch": 0.27981539956278845, + "grad_norm": 2.0422396659851074, + "learning_rate": 2.9118069520252807e-05, + "loss": 2.1704, + "step": 984 + }, + { + "epoch": 0.28009976480624654, + "grad_norm": 2.0253920555114746, + "learning_rate": 2.910657856937662e-05, + "loss": 3.0636, + "step": 985 + }, + { + "epoch": 0.2803841300497047, + "grad_norm": 2.0420773029327393, + "learning_rate": 2.9095087618500436e-05, + "loss": 2.556, + "step": 986 + }, + { + "epoch": 0.2806684952931628, + "grad_norm": 1.864678978919983, + "learning_rate": 2.908359666762425e-05, + "loss": 2.4779, + "step": 987 + }, + { + "epoch": 0.2809528605366209, + "grad_norm": 1.7852369546890259, + "learning_rate": 2.9072105716748065e-05, + "loss": 2.409, + "step": 988 + }, + { + "epoch": 0.28123722578007904, + "grad_norm": 1.8626235723495483, + "learning_rate": 2.906061476587188e-05, + "loss": 2.2681, + "step": 989 + }, + { + "epoch": 0.28152159102353713, + "grad_norm": 2.052464485168457, + "learning_rate": 2.9049123814995694e-05, + "loss": 2.3266, + "step": 990 + }, + { + "epoch": 0.2818059562669953, + "grad_norm": 1.991090178489685, + "learning_rate": 2.9037632864119508e-05, + "loss": 2.1501, + "step": 991 + }, + { + "epoch": 0.2820903215104534, + "grad_norm": 1.8865472078323364, + "learning_rate": 2.9026141913243323e-05, + "loss": 2.0682, + "step": 992 + }, + { + "epoch": 0.28237468675391153, + "grad_norm": 2.1116745471954346, + "learning_rate": 2.901465096236714e-05, + "loss": 2.9731, + "step": 993 + }, + { + "epoch": 0.2826590519973696, + "grad_norm": 1.8774727582931519, + "learning_rate": 2.9003160011490955e-05, + "loss": 2.7532, + "step": 994 + }, + { + "epoch": 0.2829434172408277, + "grad_norm": 1.7747597694396973, + "learning_rate": 2.899166906061477e-05, + "loss": 2.6523, + "step": 995 + }, + { + "epoch": 0.2832277824842859, + "grad_norm": 1.8037197589874268, + "learning_rate": 2.8980178109738584e-05, + "loss": 2.5828, + "step": 996 + }, + { + "epoch": 0.28351214772774397, + "grad_norm": 2.067134141921997, + "learning_rate": 2.89686871588624e-05, + "loss": 2.5429, + "step": 997 + }, + { + "epoch": 0.2837965129712021, + "grad_norm": 2.0115020275115967, + "learning_rate": 2.8957196207986213e-05, + "loss": 2.1842, + "step": 998 + }, + { + "epoch": 0.2840808782146602, + "grad_norm": 1.8412772417068481, + "learning_rate": 2.8945705257110027e-05, + "loss": 2.1457, + "step": 999 + }, + { + "epoch": 0.2843652434581183, + "grad_norm": 1.7536171674728394, + "learning_rate": 2.8934214306233842e-05, + "loss": 1.9325, + "step": 1000 + }, + { + "epoch": 0.28464960870157646, + "grad_norm": 2.1407527923583984, + "learning_rate": 2.8922723355357656e-05, + "loss": 3.1672, + "step": 1001 + }, + { + "epoch": 0.28493397394503456, + "grad_norm": 1.750623345375061, + "learning_rate": 2.8911232404481474e-05, + "loss": 2.5675, + "step": 1002 + }, + { + "epoch": 0.2852183391884927, + "grad_norm": 1.8202438354492188, + "learning_rate": 2.889974145360529e-05, + "loss": 2.5414, + "step": 1003 + }, + { + "epoch": 0.2855027044319508, + "grad_norm": 1.8455865383148193, + "learning_rate": 2.8888250502729103e-05, + "loss": 2.4263, + "step": 1004 + }, + { + "epoch": 0.2857870696754089, + "grad_norm": 1.9973098039627075, + "learning_rate": 2.8876759551852918e-05, + "loss": 2.2655, + "step": 1005 + }, + { + "epoch": 0.28607143491886705, + "grad_norm": 1.8605165481567383, + "learning_rate": 2.8865268600976732e-05, + "loss": 2.0906, + "step": 1006 + }, + { + "epoch": 0.28635580016232515, + "grad_norm": 1.9424145221710205, + "learning_rate": 2.8853777650100547e-05, + "loss": 2.1295, + "step": 1007 + }, + { + "epoch": 0.2866401654057833, + "grad_norm": 1.8717995882034302, + "learning_rate": 2.884228669922436e-05, + "loss": 2.0872, + "step": 1008 + }, + { + "epoch": 0.2869245306492414, + "grad_norm": 1.9393534660339355, + "learning_rate": 2.8830795748348176e-05, + "loss": 3.0324, + "step": 1009 + }, + { + "epoch": 0.2872088958926995, + "grad_norm": 2.0312883853912354, + "learning_rate": 2.881930479747199e-05, + "loss": 2.7265, + "step": 1010 + }, + { + "epoch": 0.28749326113615764, + "grad_norm": 2.081376791000366, + "learning_rate": 2.8807813846595808e-05, + "loss": 2.5069, + "step": 1011 + }, + { + "epoch": 0.28777762637961574, + "grad_norm": 2.1162948608398438, + "learning_rate": 2.8796322895719623e-05, + "loss": 2.4216, + "step": 1012 + }, + { + "epoch": 0.2880619916230739, + "grad_norm": 1.9769059419631958, + "learning_rate": 2.8784831944843437e-05, + "loss": 2.3386, + "step": 1013 + }, + { + "epoch": 0.288346356866532, + "grad_norm": 1.8281911611557007, + "learning_rate": 2.8773340993967252e-05, + "loss": 2.152, + "step": 1014 + }, + { + "epoch": 0.2886307221099901, + "grad_norm": 1.8548706769943237, + "learning_rate": 2.8761850043091066e-05, + "loss": 1.9768, + "step": 1015 + }, + { + "epoch": 0.28891508735344823, + "grad_norm": 1.9629120826721191, + "learning_rate": 2.875035909221488e-05, + "loss": 2.0301, + "step": 1016 + }, + { + "epoch": 0.2891994525969063, + "grad_norm": 2.21577525138855, + "learning_rate": 2.8738868141338695e-05, + "loss": 2.7658, + "step": 1017 + }, + { + "epoch": 0.2894838178403645, + "grad_norm": 1.955876111984253, + "learning_rate": 2.872737719046251e-05, + "loss": 2.5863, + "step": 1018 + }, + { + "epoch": 0.2897681830838226, + "grad_norm": 1.8049885034561157, + "learning_rate": 2.871588623958633e-05, + "loss": 2.7557, + "step": 1019 + }, + { + "epoch": 0.2900525483272807, + "grad_norm": 1.851650357246399, + "learning_rate": 2.8704395288710145e-05, + "loss": 2.3583, + "step": 1020 + }, + { + "epoch": 0.2903369135707388, + "grad_norm": 1.8646175861358643, + "learning_rate": 2.869290433783396e-05, + "loss": 2.3485, + "step": 1021 + }, + { + "epoch": 0.2906212788141969, + "grad_norm": 1.6461886167526245, + "learning_rate": 2.8681413386957774e-05, + "loss": 2.2649, + "step": 1022 + }, + { + "epoch": 0.29090564405765507, + "grad_norm": 1.8911197185516357, + "learning_rate": 2.866992243608159e-05, + "loss": 2.0709, + "step": 1023 + }, + { + "epoch": 0.29119000930111316, + "grad_norm": 1.8676636219024658, + "learning_rate": 2.8658431485205403e-05, + "loss": 2.1007, + "step": 1024 + }, + { + "epoch": 0.2914743745445713, + "grad_norm": 2.4858603477478027, + "learning_rate": 2.8646940534329218e-05, + "loss": 3.0081, + "step": 1025 + }, + { + "epoch": 0.2917587397880294, + "grad_norm": 1.9424095153808594, + "learning_rate": 2.8635449583453036e-05, + "loss": 2.6527, + "step": 1026 + }, + { + "epoch": 0.2920431050314875, + "grad_norm": 1.7152304649353027, + "learning_rate": 2.862395863257685e-05, + "loss": 2.6888, + "step": 1027 + }, + { + "epoch": 0.29232747027494566, + "grad_norm": 1.6697314977645874, + "learning_rate": 2.8612467681700665e-05, + "loss": 2.5852, + "step": 1028 + }, + { + "epoch": 0.29261183551840375, + "grad_norm": 1.8688958883285522, + "learning_rate": 2.860097673082448e-05, + "loss": 2.4024, + "step": 1029 + }, + { + "epoch": 0.2928962007618619, + "grad_norm": 1.8187808990478516, + "learning_rate": 2.8589485779948294e-05, + "loss": 2.1133, + "step": 1030 + }, + { + "epoch": 0.29318056600532, + "grad_norm": 1.6755000352859497, + "learning_rate": 2.857799482907211e-05, + "loss": 1.9874, + "step": 1031 + }, + { + "epoch": 0.2934649312487781, + "grad_norm": 1.9520643949508667, + "learning_rate": 2.8566503878195923e-05, + "loss": 2.0306, + "step": 1032 + }, + { + "epoch": 0.29374929649223624, + "grad_norm": 2.2317845821380615, + "learning_rate": 2.8555012927319737e-05, + "loss": 2.956, + "step": 1033 + }, + { + "epoch": 0.29403366173569434, + "grad_norm": 1.9723296165466309, + "learning_rate": 2.8543521976443552e-05, + "loss": 2.6379, + "step": 1034 + }, + { + "epoch": 0.2943180269791525, + "grad_norm": 1.8846988677978516, + "learning_rate": 2.853203102556737e-05, + "loss": 2.7342, + "step": 1035 + }, + { + "epoch": 0.2946023922226106, + "grad_norm": 1.7443634271621704, + "learning_rate": 2.8520540074691184e-05, + "loss": 2.437, + "step": 1036 + }, + { + "epoch": 0.2948867574660687, + "grad_norm": 1.8220562934875488, + "learning_rate": 2.8509049123815e-05, + "loss": 2.1899, + "step": 1037 + }, + { + "epoch": 0.29517112270952683, + "grad_norm": 1.8991121053695679, + "learning_rate": 2.8497558172938813e-05, + "loss": 2.087, + "step": 1038 + }, + { + "epoch": 0.29545548795298493, + "grad_norm": 1.7543880939483643, + "learning_rate": 2.8486067222062628e-05, + "loss": 1.7512, + "step": 1039 + }, + { + "epoch": 0.2957398531964431, + "grad_norm": 1.95134699344635, + "learning_rate": 2.8474576271186442e-05, + "loss": 2.0684, + "step": 1040 + }, + { + "epoch": 0.2960242184399012, + "grad_norm": 2.655040979385376, + "learning_rate": 2.8463085320310257e-05, + "loss": 2.9242, + "step": 1041 + }, + { + "epoch": 0.29630858368335933, + "grad_norm": 1.837572693824768, + "learning_rate": 2.845159436943407e-05, + "loss": 2.4314, + "step": 1042 + }, + { + "epoch": 0.2965929489268174, + "grad_norm": 1.903134822845459, + "learning_rate": 2.8440103418557886e-05, + "loss": 2.6958, + "step": 1043 + }, + { + "epoch": 0.2968773141702755, + "grad_norm": 1.6535474061965942, + "learning_rate": 2.8428612467681704e-05, + "loss": 2.301, + "step": 1044 + }, + { + "epoch": 0.29716167941373367, + "grad_norm": 2.0542876720428467, + "learning_rate": 2.8417121516805518e-05, + "loss": 2.4347, + "step": 1045 + }, + { + "epoch": 0.29744604465719177, + "grad_norm": 1.8558207750320435, + "learning_rate": 2.8405630565929333e-05, + "loss": 2.2441, + "step": 1046 + }, + { + "epoch": 0.2977304099006499, + "grad_norm": 1.7878432273864746, + "learning_rate": 2.8394139615053147e-05, + "loss": 2.0426, + "step": 1047 + }, + { + "epoch": 0.298014775144108, + "grad_norm": 1.9130539894104004, + "learning_rate": 2.838264866417696e-05, + "loss": 2.1265, + "step": 1048 + }, + { + "epoch": 0.2982991403875661, + "grad_norm": 2.4959192276000977, + "learning_rate": 2.8371157713300776e-05, + "loss": 2.8223, + "step": 1049 + }, + { + "epoch": 0.29858350563102426, + "grad_norm": 1.7817506790161133, + "learning_rate": 2.835966676242459e-05, + "loss": 2.6125, + "step": 1050 + }, + { + "epoch": 0.29886787087448236, + "grad_norm": 1.8295172452926636, + "learning_rate": 2.8348175811548405e-05, + "loss": 2.5712, + "step": 1051 + }, + { + "epoch": 0.2991522361179405, + "grad_norm": 1.658846139907837, + "learning_rate": 2.833668486067222e-05, + "loss": 2.4154, + "step": 1052 + }, + { + "epoch": 0.2994366013613986, + "grad_norm": 1.8340063095092773, + "learning_rate": 2.8325193909796037e-05, + "loss": 2.3476, + "step": 1053 + }, + { + "epoch": 0.2997209666048567, + "grad_norm": 1.722588062286377, + "learning_rate": 2.8313702958919852e-05, + "loss": 2.0986, + "step": 1054 + }, + { + "epoch": 0.30000533184831485, + "grad_norm": 1.6516780853271484, + "learning_rate": 2.8302212008043666e-05, + "loss": 2.1311, + "step": 1055 + }, + { + "epoch": 0.30028969709177294, + "grad_norm": 2.02481746673584, + "learning_rate": 2.8290721057167484e-05, + "loss": 1.7839, + "step": 1056 + }, + { + "epoch": 0.3005740623352311, + "grad_norm": 2.038386821746826, + "learning_rate": 2.82792301062913e-05, + "loss": 2.9095, + "step": 1057 + }, + { + "epoch": 0.3008584275786892, + "grad_norm": 1.7752598524093628, + "learning_rate": 2.8267739155415113e-05, + "loss": 2.9031, + "step": 1058 + }, + { + "epoch": 0.3011427928221473, + "grad_norm": 1.6934603452682495, + "learning_rate": 2.825624820453893e-05, + "loss": 2.4481, + "step": 1059 + }, + { + "epoch": 0.30142715806560544, + "grad_norm": 1.6980534791946411, + "learning_rate": 2.8244757253662746e-05, + "loss": 2.4492, + "step": 1060 + }, + { + "epoch": 0.30171152330906353, + "grad_norm": 1.7166688442230225, + "learning_rate": 2.823326630278656e-05, + "loss": 2.2513, + "step": 1061 + }, + { + "epoch": 0.3019958885525217, + "grad_norm": 1.8027641773223877, + "learning_rate": 2.8221775351910375e-05, + "loss": 2.2973, + "step": 1062 + }, + { + "epoch": 0.3022802537959798, + "grad_norm": 2.2114596366882324, + "learning_rate": 2.821028440103419e-05, + "loss": 1.9347, + "step": 1063 + }, + { + "epoch": 0.30256461903943793, + "grad_norm": 1.8748873472213745, + "learning_rate": 2.8198793450158004e-05, + "loss": 1.8972, + "step": 1064 + }, + { + "epoch": 0.302848984282896, + "grad_norm": 1.924533486366272, + "learning_rate": 2.8187302499281818e-05, + "loss": 3.2114, + "step": 1065 + }, + { + "epoch": 0.3031333495263541, + "grad_norm": 1.7401739358901978, + "learning_rate": 2.8175811548405633e-05, + "loss": 2.7883, + "step": 1066 + }, + { + "epoch": 0.3034177147698123, + "grad_norm": 1.6388260126113892, + "learning_rate": 2.8164320597529447e-05, + "loss": 2.5342, + "step": 1067 + }, + { + "epoch": 0.30370208001327037, + "grad_norm": 1.66873037815094, + "learning_rate": 2.8152829646653265e-05, + "loss": 2.348, + "step": 1068 + }, + { + "epoch": 0.3039864452567285, + "grad_norm": 1.798418402671814, + "learning_rate": 2.814133869577708e-05, + "loss": 2.3285, + "step": 1069 + }, + { + "epoch": 0.3042708105001866, + "grad_norm": 1.6459908485412598, + "learning_rate": 2.8129847744900894e-05, + "loss": 1.9097, + "step": 1070 + }, + { + "epoch": 0.3045551757436447, + "grad_norm": 1.6589969396591187, + "learning_rate": 2.811835679402471e-05, + "loss": 1.9083, + "step": 1071 + }, + { + "epoch": 0.30483954098710286, + "grad_norm": 1.7330818176269531, + "learning_rate": 2.8106865843148523e-05, + "loss": 2.0022, + "step": 1072 + }, + { + "epoch": 0.30512390623056096, + "grad_norm": 2.162515878677368, + "learning_rate": 2.8095374892272338e-05, + "loss": 3.2133, + "step": 1073 + }, + { + "epoch": 0.3054082714740191, + "grad_norm": 1.890426516532898, + "learning_rate": 2.8083883941396152e-05, + "loss": 2.5098, + "step": 1074 + }, + { + "epoch": 0.3056926367174772, + "grad_norm": 1.9428486824035645, + "learning_rate": 2.8072392990519967e-05, + "loss": 2.8393, + "step": 1075 + }, + { + "epoch": 0.3059770019609353, + "grad_norm": 1.7772703170776367, + "learning_rate": 2.806090203964378e-05, + "loss": 2.5238, + "step": 1076 + }, + { + "epoch": 0.30626136720439345, + "grad_norm": 2.0038602352142334, + "learning_rate": 2.80494110887676e-05, + "loss": 2.5622, + "step": 1077 + }, + { + "epoch": 0.30654573244785155, + "grad_norm": 1.7946076393127441, + "learning_rate": 2.8037920137891413e-05, + "loss": 2.1265, + "step": 1078 + }, + { + "epoch": 0.3068300976913097, + "grad_norm": 1.694143533706665, + "learning_rate": 2.8026429187015228e-05, + "loss": 2.1898, + "step": 1079 + }, + { + "epoch": 0.3071144629347678, + "grad_norm": 1.8604297637939453, + "learning_rate": 2.8014938236139042e-05, + "loss": 1.8618, + "step": 1080 + }, + { + "epoch": 0.3073988281782259, + "grad_norm": 2.2564175128936768, + "learning_rate": 2.8003447285262857e-05, + "loss": 2.8106, + "step": 1081 + }, + { + "epoch": 0.30768319342168404, + "grad_norm": 2.0401079654693604, + "learning_rate": 2.799195633438667e-05, + "loss": 2.7885, + "step": 1082 + }, + { + "epoch": 0.30796755866514214, + "grad_norm": 1.7820488214492798, + "learning_rate": 2.7980465383510486e-05, + "loss": 2.4126, + "step": 1083 + }, + { + "epoch": 0.3082519239086003, + "grad_norm": 1.7269959449768066, + "learning_rate": 2.79689744326343e-05, + "loss": 2.3303, + "step": 1084 + }, + { + "epoch": 0.3085362891520584, + "grad_norm": 1.8052058219909668, + "learning_rate": 2.7957483481758115e-05, + "loss": 2.3413, + "step": 1085 + }, + { + "epoch": 0.3088206543955165, + "grad_norm": 1.7034716606140137, + "learning_rate": 2.7945992530881933e-05, + "loss": 2.2142, + "step": 1086 + }, + { + "epoch": 0.30910501963897463, + "grad_norm": 1.924057960510254, + "learning_rate": 2.7934501580005747e-05, + "loss": 2.0875, + "step": 1087 + }, + { + "epoch": 0.3093893848824327, + "grad_norm": 1.9003182649612427, + "learning_rate": 2.7923010629129562e-05, + "loss": 1.9661, + "step": 1088 + }, + { + "epoch": 0.3096737501258909, + "grad_norm": 2.4958977699279785, + "learning_rate": 2.7911519678253376e-05, + "loss": 3.0476, + "step": 1089 + }, + { + "epoch": 0.309958115369349, + "grad_norm": 2.0897626876831055, + "learning_rate": 2.790002872737719e-05, + "loss": 2.5666, + "step": 1090 + }, + { + "epoch": 0.3102424806128071, + "grad_norm": 1.7425907850265503, + "learning_rate": 2.7888537776501005e-05, + "loss": 2.3865, + "step": 1091 + }, + { + "epoch": 0.3105268458562652, + "grad_norm": 1.892893671989441, + "learning_rate": 2.787704682562482e-05, + "loss": 2.4982, + "step": 1092 + }, + { + "epoch": 0.3108112110997233, + "grad_norm": 2.0055623054504395, + "learning_rate": 2.786555587474864e-05, + "loss": 2.3453, + "step": 1093 + }, + { + "epoch": 0.31109557634318147, + "grad_norm": 1.7691258192062378, + "learning_rate": 2.7854064923872456e-05, + "loss": 2.1304, + "step": 1094 + }, + { + "epoch": 0.31137994158663956, + "grad_norm": 1.7920550107955933, + "learning_rate": 2.784257397299627e-05, + "loss": 2.0597, + "step": 1095 + }, + { + "epoch": 0.3116643068300977, + "grad_norm": 1.657732367515564, + "learning_rate": 2.7831083022120085e-05, + "loss": 1.8743, + "step": 1096 + }, + { + "epoch": 0.3119486720735558, + "grad_norm": 2.2476625442504883, + "learning_rate": 2.78195920712439e-05, + "loss": 3.1486, + "step": 1097 + }, + { + "epoch": 0.3122330373170139, + "grad_norm": 1.9806352853775024, + "learning_rate": 2.7808101120367714e-05, + "loss": 2.6852, + "step": 1098 + }, + { + "epoch": 0.31251740256047206, + "grad_norm": 1.7334058284759521, + "learning_rate": 2.7796610169491528e-05, + "loss": 2.5363, + "step": 1099 + }, + { + "epoch": 0.31280176780393015, + "grad_norm": 1.769322395324707, + "learning_rate": 2.7785119218615343e-05, + "loss": 2.3318, + "step": 1100 + }, + { + "epoch": 0.3130861330473883, + "grad_norm": 1.7031893730163574, + "learning_rate": 2.777362826773916e-05, + "loss": 2.4197, + "step": 1101 + }, + { + "epoch": 0.3133704982908464, + "grad_norm": 1.877233862876892, + "learning_rate": 2.7762137316862975e-05, + "loss": 2.049, + "step": 1102 + }, + { + "epoch": 0.3136548635343045, + "grad_norm": 1.7407324314117432, + "learning_rate": 2.775064636598679e-05, + "loss": 1.8772, + "step": 1103 + }, + { + "epoch": 0.31393922877776265, + "grad_norm": 1.7369098663330078, + "learning_rate": 2.7739155415110604e-05, + "loss": 1.9947, + "step": 1104 + }, + { + "epoch": 0.31422359402122074, + "grad_norm": 2.291818141937256, + "learning_rate": 2.772766446423442e-05, + "loss": 2.9378, + "step": 1105 + }, + { + "epoch": 0.3145079592646789, + "grad_norm": 1.7521897554397583, + "learning_rate": 2.7716173513358233e-05, + "loss": 2.6269, + "step": 1106 + }, + { + "epoch": 0.314792324508137, + "grad_norm": 1.6815153360366821, + "learning_rate": 2.7704682562482047e-05, + "loss": 2.3316, + "step": 1107 + }, + { + "epoch": 0.3150766897515951, + "grad_norm": 1.6132457256317139, + "learning_rate": 2.7693191611605862e-05, + "loss": 2.1689, + "step": 1108 + }, + { + "epoch": 0.31536105499505324, + "grad_norm": 1.8604917526245117, + "learning_rate": 2.7681700660729676e-05, + "loss": 2.3652, + "step": 1109 + }, + { + "epoch": 0.31564542023851133, + "grad_norm": 1.6801247596740723, + "learning_rate": 2.7670209709853494e-05, + "loss": 2.0579, + "step": 1110 + }, + { + "epoch": 0.3159297854819695, + "grad_norm": 1.8979250192642212, + "learning_rate": 2.765871875897731e-05, + "loss": 2.0514, + "step": 1111 + }, + { + "epoch": 0.3162141507254276, + "grad_norm": 1.8273488283157349, + "learning_rate": 2.7647227808101123e-05, + "loss": 2.0111, + "step": 1112 + }, + { + "epoch": 0.31649851596888573, + "grad_norm": 2.283964157104492, + "learning_rate": 2.7635736857224938e-05, + "loss": 2.9822, + "step": 1113 + }, + { + "epoch": 0.3167828812123438, + "grad_norm": 1.8019022941589355, + "learning_rate": 2.7624245906348752e-05, + "loss": 2.7109, + "step": 1114 + }, + { + "epoch": 0.3170672464558019, + "grad_norm": 1.7521454095840454, + "learning_rate": 2.7612754955472567e-05, + "loss": 2.5455, + "step": 1115 + }, + { + "epoch": 0.31735161169926007, + "grad_norm": 1.7717288732528687, + "learning_rate": 2.760126400459638e-05, + "loss": 2.2537, + "step": 1116 + }, + { + "epoch": 0.31763597694271817, + "grad_norm": 1.9506962299346924, + "learning_rate": 2.7589773053720196e-05, + "loss": 2.2884, + "step": 1117 + }, + { + "epoch": 0.3179203421861763, + "grad_norm": 1.6748727560043335, + "learning_rate": 2.757828210284401e-05, + "loss": 2.1113, + "step": 1118 + }, + { + "epoch": 0.3182047074296344, + "grad_norm": 1.8400766849517822, + "learning_rate": 2.7566791151967828e-05, + "loss": 2.0633, + "step": 1119 + }, + { + "epoch": 0.3184890726730925, + "grad_norm": 1.9170554876327515, + "learning_rate": 2.7555300201091643e-05, + "loss": 1.9511, + "step": 1120 + }, + { + "epoch": 0.31877343791655066, + "grad_norm": 2.18106746673584, + "learning_rate": 2.7543809250215457e-05, + "loss": 3.0597, + "step": 1121 + }, + { + "epoch": 0.31905780316000876, + "grad_norm": 1.7323399782180786, + "learning_rate": 2.753231829933927e-05, + "loss": 2.3877, + "step": 1122 + }, + { + "epoch": 0.3193421684034669, + "grad_norm": 1.7470673322677612, + "learning_rate": 2.7520827348463086e-05, + "loss": 2.5492, + "step": 1123 + }, + { + "epoch": 0.319626533646925, + "grad_norm": 1.627826452255249, + "learning_rate": 2.75093363975869e-05, + "loss": 2.3832, + "step": 1124 + }, + { + "epoch": 0.3199108988903831, + "grad_norm": 1.8971792459487915, + "learning_rate": 2.7497845446710715e-05, + "loss": 2.2562, + "step": 1125 + }, + { + "epoch": 0.32019526413384125, + "grad_norm": 1.8873558044433594, + "learning_rate": 2.748635449583453e-05, + "loss": 2.1615, + "step": 1126 + }, + { + "epoch": 0.32047962937729935, + "grad_norm": 1.8183141946792603, + "learning_rate": 2.7474863544958344e-05, + "loss": 2.0914, + "step": 1127 + }, + { + "epoch": 0.3207639946207575, + "grad_norm": 1.8885418176651, + "learning_rate": 2.7463372594082162e-05, + "loss": 1.9792, + "step": 1128 + }, + { + "epoch": 0.3210483598642156, + "grad_norm": 2.204162359237671, + "learning_rate": 2.7451881643205977e-05, + "loss": 2.9695, + "step": 1129 + }, + { + "epoch": 0.3213327251076737, + "grad_norm": 1.727662444114685, + "learning_rate": 2.7440390692329794e-05, + "loss": 2.5204, + "step": 1130 + }, + { + "epoch": 0.32161709035113184, + "grad_norm": 1.721366286277771, + "learning_rate": 2.742889974145361e-05, + "loss": 2.5624, + "step": 1131 + }, + { + "epoch": 0.32190145559458994, + "grad_norm": 1.598515510559082, + "learning_rate": 2.7417408790577423e-05, + "loss": 2.3068, + "step": 1132 + }, + { + "epoch": 0.3221858208380481, + "grad_norm": 1.7761754989624023, + "learning_rate": 2.7405917839701238e-05, + "loss": 2.3844, + "step": 1133 + }, + { + "epoch": 0.3224701860815062, + "grad_norm": 1.6555432081222534, + "learning_rate": 2.7394426888825056e-05, + "loss": 2.0163, + "step": 1134 + }, + { + "epoch": 0.32275455132496433, + "grad_norm": 1.6981431245803833, + "learning_rate": 2.738293593794887e-05, + "loss": 1.9993, + "step": 1135 + }, + { + "epoch": 0.32303891656842243, + "grad_norm": 1.702057957649231, + "learning_rate": 2.7371444987072685e-05, + "loss": 1.8252, + "step": 1136 + }, + { + "epoch": 0.3233232818118805, + "grad_norm": 1.9638831615447998, + "learning_rate": 2.73599540361965e-05, + "loss": 3.1115, + "step": 1137 + }, + { + "epoch": 0.3236076470553387, + "grad_norm": 1.7202662229537964, + "learning_rate": 2.7348463085320314e-05, + "loss": 2.5926, + "step": 1138 + }, + { + "epoch": 0.32389201229879677, + "grad_norm": 1.6885807514190674, + "learning_rate": 2.7336972134444128e-05, + "loss": 2.5377, + "step": 1139 + }, + { + "epoch": 0.3241763775422549, + "grad_norm": 1.6235450506210327, + "learning_rate": 2.7325481183567943e-05, + "loss": 2.5253, + "step": 1140 + }, + { + "epoch": 0.324460742785713, + "grad_norm": 1.7628118991851807, + "learning_rate": 2.7313990232691757e-05, + "loss": 2.4275, + "step": 1141 + }, + { + "epoch": 0.3247451080291711, + "grad_norm": 1.8225659132003784, + "learning_rate": 2.7302499281815572e-05, + "loss": 2.0278, + "step": 1142 + }, + { + "epoch": 0.32502947327262927, + "grad_norm": 1.8690272569656372, + "learning_rate": 2.729100833093939e-05, + "loss": 2.0703, + "step": 1143 + }, + { + "epoch": 0.32531383851608736, + "grad_norm": 1.951603889465332, + "learning_rate": 2.7279517380063204e-05, + "loss": 1.7402, + "step": 1144 + }, + { + "epoch": 0.3255982037595455, + "grad_norm": 2.100116491317749, + "learning_rate": 2.726802642918702e-05, + "loss": 3.0271, + "step": 1145 + }, + { + "epoch": 0.3258825690030036, + "grad_norm": 1.7133632898330688, + "learning_rate": 2.7256535478310833e-05, + "loss": 2.5711, + "step": 1146 + }, + { + "epoch": 0.3261669342464617, + "grad_norm": 1.6197803020477295, + "learning_rate": 2.7245044527434648e-05, + "loss": 2.4896, + "step": 1147 + }, + { + "epoch": 0.32645129948991986, + "grad_norm": 1.7283875942230225, + "learning_rate": 2.7233553576558462e-05, + "loss": 2.3986, + "step": 1148 + }, + { + "epoch": 0.32673566473337795, + "grad_norm": 1.7507110834121704, + "learning_rate": 2.7222062625682277e-05, + "loss": 2.1887, + "step": 1149 + }, + { + "epoch": 0.3270200299768361, + "grad_norm": 1.7281140089035034, + "learning_rate": 2.721057167480609e-05, + "loss": 2.2903, + "step": 1150 + }, + { + "epoch": 0.3273043952202942, + "grad_norm": 1.7815526723861694, + "learning_rate": 2.7199080723929906e-05, + "loss": 2.0782, + "step": 1151 + }, + { + "epoch": 0.3275887604637523, + "grad_norm": 1.8577396869659424, + "learning_rate": 2.7187589773053724e-05, + "loss": 1.7293, + "step": 1152 + }, + { + "epoch": 0.32787312570721044, + "grad_norm": 2.047935962677002, + "learning_rate": 2.7176098822177538e-05, + "loss": 2.8512, + "step": 1153 + }, + { + "epoch": 0.32815749095066854, + "grad_norm": 1.804479718208313, + "learning_rate": 2.7164607871301353e-05, + "loss": 2.7431, + "step": 1154 + }, + { + "epoch": 0.3284418561941267, + "grad_norm": 1.7864207029342651, + "learning_rate": 2.7153116920425167e-05, + "loss": 2.4874, + "step": 1155 + }, + { + "epoch": 0.3287262214375848, + "grad_norm": 1.6858755350112915, + "learning_rate": 2.714162596954898e-05, + "loss": 2.3767, + "step": 1156 + }, + { + "epoch": 0.3290105866810429, + "grad_norm": 1.7890626192092896, + "learning_rate": 2.7130135018672796e-05, + "loss": 2.2339, + "step": 1157 + }, + { + "epoch": 0.32929495192450103, + "grad_norm": 1.7149113416671753, + "learning_rate": 2.711864406779661e-05, + "loss": 2.2292, + "step": 1158 + }, + { + "epoch": 0.32957931716795913, + "grad_norm": 1.7810120582580566, + "learning_rate": 2.7107153116920425e-05, + "loss": 2.0133, + "step": 1159 + }, + { + "epoch": 0.3298636824114173, + "grad_norm": 1.7565312385559082, + "learning_rate": 2.709566216604424e-05, + "loss": 2.0087, + "step": 1160 + }, + { + "epoch": 0.3301480476548754, + "grad_norm": 2.1705124378204346, + "learning_rate": 2.7084171215168057e-05, + "loss": 3.0282, + "step": 1161 + }, + { + "epoch": 0.3304324128983335, + "grad_norm": 1.7916361093521118, + "learning_rate": 2.7072680264291872e-05, + "loss": 2.7158, + "step": 1162 + }, + { + "epoch": 0.3307167781417916, + "grad_norm": 1.6537439823150635, + "learning_rate": 2.7061189313415686e-05, + "loss": 2.4969, + "step": 1163 + }, + { + "epoch": 0.3310011433852497, + "grad_norm": 1.7181519269943237, + "learning_rate": 2.70496983625395e-05, + "loss": 2.3481, + "step": 1164 + }, + { + "epoch": 0.33128550862870787, + "grad_norm": 1.7014647722244263, + "learning_rate": 2.7038207411663315e-05, + "loss": 2.2512, + "step": 1165 + }, + { + "epoch": 0.33156987387216597, + "grad_norm": 1.7426505088806152, + "learning_rate": 2.702671646078713e-05, + "loss": 2.1619, + "step": 1166 + }, + { + "epoch": 0.3318542391156241, + "grad_norm": 1.7927415370941162, + "learning_rate": 2.701522550991095e-05, + "loss": 1.9217, + "step": 1167 + }, + { + "epoch": 0.3321386043590822, + "grad_norm": 1.9813625812530518, + "learning_rate": 2.7003734559034766e-05, + "loss": 1.89, + "step": 1168 + }, + { + "epoch": 0.3324229696025403, + "grad_norm": 2.2803986072540283, + "learning_rate": 2.699224360815858e-05, + "loss": 3.1151, + "step": 1169 + }, + { + "epoch": 0.33270733484599846, + "grad_norm": 1.8023364543914795, + "learning_rate": 2.6980752657282395e-05, + "loss": 2.5497, + "step": 1170 + }, + { + "epoch": 0.33299170008945655, + "grad_norm": 1.5802347660064697, + "learning_rate": 2.696926170640621e-05, + "loss": 2.3567, + "step": 1171 + }, + { + "epoch": 0.3332760653329147, + "grad_norm": 1.7448047399520874, + "learning_rate": 2.6957770755530024e-05, + "loss": 2.4281, + "step": 1172 + }, + { + "epoch": 0.3335604305763728, + "grad_norm": 1.9900225400924683, + "learning_rate": 2.6946279804653838e-05, + "loss": 2.4395, + "step": 1173 + }, + { + "epoch": 0.3338447958198309, + "grad_norm": 1.7050385475158691, + "learning_rate": 2.6934788853777653e-05, + "loss": 2.0546, + "step": 1174 + }, + { + "epoch": 0.33412916106328905, + "grad_norm": 1.9430030584335327, + "learning_rate": 2.6923297902901467e-05, + "loss": 2.0789, + "step": 1175 + }, + { + "epoch": 0.33441352630674714, + "grad_norm": 1.910507321357727, + "learning_rate": 2.6911806952025285e-05, + "loss": 1.9512, + "step": 1176 + }, + { + "epoch": 0.3346978915502053, + "grad_norm": 2.156999349594116, + "learning_rate": 2.69003160011491e-05, + "loss": 2.7457, + "step": 1177 + }, + { + "epoch": 0.3349822567936634, + "grad_norm": 1.7482166290283203, + "learning_rate": 2.6888825050272914e-05, + "loss": 2.7449, + "step": 1178 + }, + { + "epoch": 0.3352666220371215, + "grad_norm": 1.692886233329773, + "learning_rate": 2.687733409939673e-05, + "loss": 2.4689, + "step": 1179 + }, + { + "epoch": 0.33555098728057964, + "grad_norm": 1.6204313039779663, + "learning_rate": 2.6865843148520543e-05, + "loss": 2.3802, + "step": 1180 + }, + { + "epoch": 0.33583535252403773, + "grad_norm": 1.7569348812103271, + "learning_rate": 2.6854352197644357e-05, + "loss": 2.2957, + "step": 1181 + }, + { + "epoch": 0.3361197177674959, + "grad_norm": 1.6739596128463745, + "learning_rate": 2.6842861246768172e-05, + "loss": 1.9977, + "step": 1182 + }, + { + "epoch": 0.336404083010954, + "grad_norm": 1.858077883720398, + "learning_rate": 2.6831370295891986e-05, + "loss": 2.0055, + "step": 1183 + }, + { + "epoch": 0.33668844825441213, + "grad_norm": 1.8774821758270264, + "learning_rate": 2.68198793450158e-05, + "loss": 1.8794, + "step": 1184 + }, + { + "epoch": 0.3369728134978702, + "grad_norm": 2.0827038288116455, + "learning_rate": 2.680838839413962e-05, + "loss": 2.8639, + "step": 1185 + }, + { + "epoch": 0.3372571787413283, + "grad_norm": 1.8128411769866943, + "learning_rate": 2.6796897443263433e-05, + "loss": 2.4063, + "step": 1186 + }, + { + "epoch": 0.3375415439847865, + "grad_norm": 1.9702905416488647, + "learning_rate": 2.6785406492387248e-05, + "loss": 2.4151, + "step": 1187 + }, + { + "epoch": 0.33782590922824457, + "grad_norm": 1.90627121925354, + "learning_rate": 2.6773915541511062e-05, + "loss": 2.4171, + "step": 1188 + }, + { + "epoch": 0.3381102744717027, + "grad_norm": 2.10098934173584, + "learning_rate": 2.6762424590634877e-05, + "loss": 2.3142, + "step": 1189 + }, + { + "epoch": 0.3383946397151608, + "grad_norm": 1.7690738439559937, + "learning_rate": 2.675093363975869e-05, + "loss": 1.9259, + "step": 1190 + }, + { + "epoch": 0.3386790049586189, + "grad_norm": 1.794833779335022, + "learning_rate": 2.6739442688882506e-05, + "loss": 1.9992, + "step": 1191 + }, + { + "epoch": 0.33896337020207706, + "grad_norm": 1.848533034324646, + "learning_rate": 2.672795173800632e-05, + "loss": 1.9893, + "step": 1192 + }, + { + "epoch": 0.33924773544553516, + "grad_norm": 2.1067445278167725, + "learning_rate": 2.6716460787130135e-05, + "loss": 3.1749, + "step": 1193 + }, + { + "epoch": 0.3395321006889933, + "grad_norm": 1.9302208423614502, + "learning_rate": 2.6704969836253953e-05, + "loss": 2.6356, + "step": 1194 + }, + { + "epoch": 0.3398164659324514, + "grad_norm": 1.7722642421722412, + "learning_rate": 2.6693478885377767e-05, + "loss": 2.4753, + "step": 1195 + }, + { + "epoch": 0.3401008311759095, + "grad_norm": 1.7334580421447754, + "learning_rate": 2.6681987934501582e-05, + "loss": 2.2498, + "step": 1196 + }, + { + "epoch": 0.34038519641936765, + "grad_norm": 1.7633260488510132, + "learning_rate": 2.6670496983625396e-05, + "loss": 2.2229, + "step": 1197 + }, + { + "epoch": 0.34066956166282575, + "grad_norm": 1.721509337425232, + "learning_rate": 2.665900603274921e-05, + "loss": 2.2436, + "step": 1198 + }, + { + "epoch": 0.3409539269062839, + "grad_norm": 1.93727445602417, + "learning_rate": 2.6647515081873025e-05, + "loss": 1.9182, + "step": 1199 + }, + { + "epoch": 0.341238292149742, + "grad_norm": 1.9112025499343872, + "learning_rate": 2.663602413099684e-05, + "loss": 1.8985, + "step": 1200 + }, + { + "epoch": 0.3415226573932001, + "grad_norm": 2.4720232486724854, + "learning_rate": 2.6624533180120654e-05, + "loss": 2.9957, + "step": 1201 + }, + { + "epoch": 0.34180702263665824, + "grad_norm": 2.0485682487487793, + "learning_rate": 2.661304222924447e-05, + "loss": 2.5158, + "step": 1202 + }, + { + "epoch": 0.34209138788011634, + "grad_norm": 1.8902740478515625, + "learning_rate": 2.6601551278368287e-05, + "loss": 2.3761, + "step": 1203 + }, + { + "epoch": 0.3423757531235745, + "grad_norm": 1.7648299932479858, + "learning_rate": 2.6590060327492104e-05, + "loss": 2.0841, + "step": 1204 + }, + { + "epoch": 0.3426601183670326, + "grad_norm": 1.8326659202575684, + "learning_rate": 2.657856937661592e-05, + "loss": 2.3462, + "step": 1205 + }, + { + "epoch": 0.3429444836104907, + "grad_norm": 1.8549422025680542, + "learning_rate": 2.6567078425739733e-05, + "loss": 2.1884, + "step": 1206 + }, + { + "epoch": 0.34322884885394883, + "grad_norm": 1.930391550064087, + "learning_rate": 2.6555587474863548e-05, + "loss": 1.9793, + "step": 1207 + }, + { + "epoch": 0.3435132140974069, + "grad_norm": 1.8481113910675049, + "learning_rate": 2.6544096523987362e-05, + "loss": 2.0828, + "step": 1208 + }, + { + "epoch": 0.3437975793408651, + "grad_norm": 2.0146400928497314, + "learning_rate": 2.653260557311118e-05, + "loss": 2.8335, + "step": 1209 + }, + { + "epoch": 0.3440819445843232, + "grad_norm": 1.820291519165039, + "learning_rate": 2.6521114622234995e-05, + "loss": 2.7538, + "step": 1210 + }, + { + "epoch": 0.3443663098277813, + "grad_norm": 1.721548080444336, + "learning_rate": 2.650962367135881e-05, + "loss": 2.4637, + "step": 1211 + }, + { + "epoch": 0.3446506750712394, + "grad_norm": 1.7482106685638428, + "learning_rate": 2.6498132720482624e-05, + "loss": 2.2598, + "step": 1212 + }, + { + "epoch": 0.3449350403146975, + "grad_norm": 1.8595374822616577, + "learning_rate": 2.648664176960644e-05, + "loss": 2.4933, + "step": 1213 + }, + { + "epoch": 0.34521940555815567, + "grad_norm": 1.690114974975586, + "learning_rate": 2.6475150818730253e-05, + "loss": 2.1184, + "step": 1214 + }, + { + "epoch": 0.34550377080161376, + "grad_norm": 1.6892638206481934, + "learning_rate": 2.6463659867854067e-05, + "loss": 1.9942, + "step": 1215 + }, + { + "epoch": 0.3457881360450719, + "grad_norm": 1.8123127222061157, + "learning_rate": 2.6452168916977882e-05, + "loss": 2.1055, + "step": 1216 + }, + { + "epoch": 0.34607250128853, + "grad_norm": 2.2426178455352783, + "learning_rate": 2.6440677966101696e-05, + "loss": 3.0519, + "step": 1217 + }, + { + "epoch": 0.3463568665319881, + "grad_norm": 1.7458500862121582, + "learning_rate": 2.6429187015225514e-05, + "loss": 2.6327, + "step": 1218 + }, + { + "epoch": 0.34664123177544626, + "grad_norm": 1.7254029512405396, + "learning_rate": 2.641769606434933e-05, + "loss": 2.4534, + "step": 1219 + }, + { + "epoch": 0.34692559701890435, + "grad_norm": 1.6477024555206299, + "learning_rate": 2.6406205113473143e-05, + "loss": 2.3594, + "step": 1220 + }, + { + "epoch": 0.3472099622623625, + "grad_norm": 1.6863490343093872, + "learning_rate": 2.6394714162596958e-05, + "loss": 2.2598, + "step": 1221 + }, + { + "epoch": 0.3474943275058206, + "grad_norm": 1.8328365087509155, + "learning_rate": 2.6383223211720772e-05, + "loss": 2.1452, + "step": 1222 + }, + { + "epoch": 0.3477786927492787, + "grad_norm": 1.7846546173095703, + "learning_rate": 2.6371732260844587e-05, + "loss": 2.0761, + "step": 1223 + }, + { + "epoch": 0.34806305799273685, + "grad_norm": 2.078010082244873, + "learning_rate": 2.63602413099684e-05, + "loss": 1.9126, + "step": 1224 + }, + { + "epoch": 0.34834742323619494, + "grad_norm": 2.1503591537475586, + "learning_rate": 2.6348750359092216e-05, + "loss": 2.875, + "step": 1225 + }, + { + "epoch": 0.3486317884796531, + "grad_norm": 1.839574933052063, + "learning_rate": 2.633725940821603e-05, + "loss": 2.4719, + "step": 1226 + }, + { + "epoch": 0.3489161537231112, + "grad_norm": 1.8120521306991577, + "learning_rate": 2.6325768457339848e-05, + "loss": 2.2545, + "step": 1227 + }, + { + "epoch": 0.3492005189665693, + "grad_norm": 1.7394510507583618, + "learning_rate": 2.6314277506463663e-05, + "loss": 2.2625, + "step": 1228 + }, + { + "epoch": 0.34948488421002744, + "grad_norm": 1.7700285911560059, + "learning_rate": 2.6302786555587477e-05, + "loss": 2.2649, + "step": 1229 + }, + { + "epoch": 0.34976924945348553, + "grad_norm": 1.6092114448547363, + "learning_rate": 2.629129560471129e-05, + "loss": 1.9294, + "step": 1230 + }, + { + "epoch": 0.3500536146969437, + "grad_norm": 1.8207448720932007, + "learning_rate": 2.6279804653835106e-05, + "loss": 1.9099, + "step": 1231 + }, + { + "epoch": 0.3503379799404018, + "grad_norm": 1.8957747220993042, + "learning_rate": 2.626831370295892e-05, + "loss": 1.8944, + "step": 1232 + }, + { + "epoch": 0.35062234518385993, + "grad_norm": 2.1414735317230225, + "learning_rate": 2.6256822752082735e-05, + "loss": 2.878, + "step": 1233 + }, + { + "epoch": 0.350906710427318, + "grad_norm": 1.7470128536224365, + "learning_rate": 2.624533180120655e-05, + "loss": 2.6301, + "step": 1234 + }, + { + "epoch": 0.3511910756707761, + "grad_norm": 1.71774423122406, + "learning_rate": 2.6233840850330364e-05, + "loss": 2.5042, + "step": 1235 + }, + { + "epoch": 0.35147544091423427, + "grad_norm": 1.7391661405563354, + "learning_rate": 2.6222349899454182e-05, + "loss": 2.5183, + "step": 1236 + }, + { + "epoch": 0.35175980615769237, + "grad_norm": 1.9395679235458374, + "learning_rate": 2.6210858948577996e-05, + "loss": 2.2172, + "step": 1237 + }, + { + "epoch": 0.3520441714011505, + "grad_norm": 1.9249560832977295, + "learning_rate": 2.619936799770181e-05, + "loss": 2.0831, + "step": 1238 + }, + { + "epoch": 0.3523285366446086, + "grad_norm": 1.7382254600524902, + "learning_rate": 2.6187877046825625e-05, + "loss": 2.1057, + "step": 1239 + }, + { + "epoch": 0.3526129018880667, + "grad_norm": 1.7688852548599243, + "learning_rate": 2.617638609594944e-05, + "loss": 1.9215, + "step": 1240 + }, + { + "epoch": 0.35289726713152486, + "grad_norm": 2.368236541748047, + "learning_rate": 2.6164895145073258e-05, + "loss": 2.7489, + "step": 1241 + }, + { + "epoch": 0.35318163237498296, + "grad_norm": 1.797239065170288, + "learning_rate": 2.6153404194197072e-05, + "loss": 2.568, + "step": 1242 + }, + { + "epoch": 0.3534659976184411, + "grad_norm": 1.586428165435791, + "learning_rate": 2.614191324332089e-05, + "loss": 2.2321, + "step": 1243 + }, + { + "epoch": 0.3537503628618992, + "grad_norm": 1.7970319986343384, + "learning_rate": 2.6130422292444705e-05, + "loss": 2.405, + "step": 1244 + }, + { + "epoch": 0.3540347281053573, + "grad_norm": 2.1091277599334717, + "learning_rate": 2.611893134156852e-05, + "loss": 2.5763, + "step": 1245 + }, + { + "epoch": 0.35431909334881545, + "grad_norm": 1.7261741161346436, + "learning_rate": 2.6107440390692334e-05, + "loss": 2.3919, + "step": 1246 + }, + { + "epoch": 0.35460345859227355, + "grad_norm": 1.808210015296936, + "learning_rate": 2.6095949439816148e-05, + "loss": 2.0507, + "step": 1247 + }, + { + "epoch": 0.3548878238357317, + "grad_norm": 2.0044949054718018, + "learning_rate": 2.6084458488939963e-05, + "loss": 1.9141, + "step": 1248 + }, + { + "epoch": 0.3551721890791898, + "grad_norm": 2.0011098384857178, + "learning_rate": 2.6072967538063777e-05, + "loss": 2.7051, + "step": 1249 + }, + { + "epoch": 0.3554565543226479, + "grad_norm": 1.7383923530578613, + "learning_rate": 2.606147658718759e-05, + "loss": 2.5117, + "step": 1250 + }, + { + "epoch": 0.35574091956610604, + "grad_norm": 1.8515959978103638, + "learning_rate": 2.6049985636311406e-05, + "loss": 2.5653, + "step": 1251 + }, + { + "epoch": 0.35602528480956414, + "grad_norm": 1.7738157510757446, + "learning_rate": 2.6038494685435224e-05, + "loss": 2.4249, + "step": 1252 + }, + { + "epoch": 0.3563096500530223, + "grad_norm": 1.834094524383545, + "learning_rate": 2.602700373455904e-05, + "loss": 2.3869, + "step": 1253 + }, + { + "epoch": 0.3565940152964804, + "grad_norm": 1.6823465824127197, + "learning_rate": 2.6015512783682853e-05, + "loss": 1.9128, + "step": 1254 + }, + { + "epoch": 0.35687838053993853, + "grad_norm": 1.706059217453003, + "learning_rate": 2.6004021832806668e-05, + "loss": 1.8869, + "step": 1255 + }, + { + "epoch": 0.35716274578339663, + "grad_norm": 1.8522998094558716, + "learning_rate": 2.5992530881930482e-05, + "loss": 1.9591, + "step": 1256 + }, + { + "epoch": 0.3574471110268547, + "grad_norm": 2.1196982860565186, + "learning_rate": 2.5981039931054297e-05, + "loss": 2.9377, + "step": 1257 + }, + { + "epoch": 0.3577314762703129, + "grad_norm": 1.8649749755859375, + "learning_rate": 2.596954898017811e-05, + "loss": 2.5061, + "step": 1258 + }, + { + "epoch": 0.35801584151377097, + "grad_norm": 1.6296436786651611, + "learning_rate": 2.5958058029301926e-05, + "loss": 2.5032, + "step": 1259 + }, + { + "epoch": 0.3583002067572291, + "grad_norm": 1.5869815349578857, + "learning_rate": 2.594656707842574e-05, + "loss": 2.4272, + "step": 1260 + }, + { + "epoch": 0.3585845720006872, + "grad_norm": 1.7618229389190674, + "learning_rate": 2.5935076127549558e-05, + "loss": 2.3934, + "step": 1261 + }, + { + "epoch": 0.3588689372441453, + "grad_norm": 1.716230034828186, + "learning_rate": 2.5923585176673372e-05, + "loss": 2.0693, + "step": 1262 + }, + { + "epoch": 0.35915330248760347, + "grad_norm": 1.8889720439910889, + "learning_rate": 2.5912094225797187e-05, + "loss": 1.9207, + "step": 1263 + }, + { + "epoch": 0.35943766773106156, + "grad_norm": 1.825950264930725, + "learning_rate": 2.5900603274921e-05, + "loss": 1.867, + "step": 1264 + }, + { + "epoch": 0.3597220329745197, + "grad_norm": 2.1356546878814697, + "learning_rate": 2.5889112324044816e-05, + "loss": 3.0785, + "step": 1265 + }, + { + "epoch": 0.3600063982179778, + "grad_norm": 1.845746636390686, + "learning_rate": 2.587762137316863e-05, + "loss": 2.608, + "step": 1266 + }, + { + "epoch": 0.3602907634614359, + "grad_norm": 1.7854721546173096, + "learning_rate": 2.5866130422292445e-05, + "loss": 2.3848, + "step": 1267 + }, + { + "epoch": 0.36057512870489405, + "grad_norm": 1.6726619005203247, + "learning_rate": 2.585463947141626e-05, + "loss": 2.5518, + "step": 1268 + }, + { + "epoch": 0.36085949394835215, + "grad_norm": 1.716640830039978, + "learning_rate": 2.5843148520540077e-05, + "loss": 2.3346, + "step": 1269 + }, + { + "epoch": 0.3611438591918103, + "grad_norm": 1.5924497842788696, + "learning_rate": 2.5831657569663892e-05, + "loss": 1.9842, + "step": 1270 + }, + { + "epoch": 0.3614282244352684, + "grad_norm": 1.7810031175613403, + "learning_rate": 2.5820166618787706e-05, + "loss": 1.8548, + "step": 1271 + }, + { + "epoch": 0.3617125896787265, + "grad_norm": 1.8115458488464355, + "learning_rate": 2.580867566791152e-05, + "loss": 2.0252, + "step": 1272 + }, + { + "epoch": 0.36199695492218464, + "grad_norm": 2.0684995651245117, + "learning_rate": 2.5797184717035335e-05, + "loss": 2.9993, + "step": 1273 + }, + { + "epoch": 0.36228132016564274, + "grad_norm": 1.792303204536438, + "learning_rate": 2.578569376615915e-05, + "loss": 2.5244, + "step": 1274 + }, + { + "epoch": 0.3625656854091009, + "grad_norm": 1.5862210988998413, + "learning_rate": 2.5774202815282964e-05, + "loss": 2.4175, + "step": 1275 + }, + { + "epoch": 0.362850050652559, + "grad_norm": 1.706787109375, + "learning_rate": 2.576271186440678e-05, + "loss": 2.4619, + "step": 1276 + }, + { + "epoch": 0.3631344158960171, + "grad_norm": 1.7666358947753906, + "learning_rate": 2.5751220913530593e-05, + "loss": 2.4914, + "step": 1277 + }, + { + "epoch": 0.36341878113947523, + "grad_norm": 1.6655855178833008, + "learning_rate": 2.5739729962654415e-05, + "loss": 2.0598, + "step": 1278 + }, + { + "epoch": 0.36370314638293333, + "grad_norm": 1.6447235345840454, + "learning_rate": 2.572823901177823e-05, + "loss": 1.734, + "step": 1279 + }, + { + "epoch": 0.3639875116263915, + "grad_norm": 1.7836151123046875, + "learning_rate": 2.5716748060902044e-05, + "loss": 1.9249, + "step": 1280 + }, + { + "epoch": 0.3642718768698496, + "grad_norm": 1.9049650430679321, + "learning_rate": 2.5705257110025858e-05, + "loss": 2.6863, + "step": 1281 + }, + { + "epoch": 0.3645562421133077, + "grad_norm": 1.6191275119781494, + "learning_rate": 2.5693766159149673e-05, + "loss": 2.4752, + "step": 1282 + }, + { + "epoch": 0.3648406073567658, + "grad_norm": 1.6854596138000488, + "learning_rate": 2.5682275208273487e-05, + "loss": 2.4672, + "step": 1283 + }, + { + "epoch": 0.3651249726002239, + "grad_norm": 1.725872278213501, + "learning_rate": 2.56707842573973e-05, + "loss": 2.4114, + "step": 1284 + }, + { + "epoch": 0.36540933784368207, + "grad_norm": 1.7640068531036377, + "learning_rate": 2.565929330652112e-05, + "loss": 2.1844, + "step": 1285 + }, + { + "epoch": 0.36569370308714017, + "grad_norm": 1.6957398653030396, + "learning_rate": 2.5647802355644934e-05, + "loss": 2.1487, + "step": 1286 + }, + { + "epoch": 0.3659780683305983, + "grad_norm": 1.8824515342712402, + "learning_rate": 2.563631140476875e-05, + "loss": 2.0372, + "step": 1287 + }, + { + "epoch": 0.3662624335740564, + "grad_norm": 1.8746623992919922, + "learning_rate": 2.5624820453892563e-05, + "loss": 1.9418, + "step": 1288 + }, + { + "epoch": 0.3665467988175145, + "grad_norm": 2.1677398681640625, + "learning_rate": 2.5613329503016377e-05, + "loss": 3.0094, + "step": 1289 + }, + { + "epoch": 0.36683116406097266, + "grad_norm": 1.7507233619689941, + "learning_rate": 2.5601838552140192e-05, + "loss": 2.5547, + "step": 1290 + }, + { + "epoch": 0.36711552930443075, + "grad_norm": 1.6727383136749268, + "learning_rate": 2.5590347601264006e-05, + "loss": 2.49, + "step": 1291 + }, + { + "epoch": 0.3673998945478889, + "grad_norm": 1.6071473360061646, + "learning_rate": 2.557885665038782e-05, + "loss": 2.5057, + "step": 1292 + }, + { + "epoch": 0.367684259791347, + "grad_norm": 1.853559970855713, + "learning_rate": 2.5567365699511635e-05, + "loss": 2.135, + "step": 1293 + }, + { + "epoch": 0.3679686250348051, + "grad_norm": 1.656827449798584, + "learning_rate": 2.5555874748635453e-05, + "loss": 1.9428, + "step": 1294 + }, + { + "epoch": 0.36825299027826325, + "grad_norm": 1.7858695983886719, + "learning_rate": 2.5544383797759268e-05, + "loss": 2.0289, + "step": 1295 + }, + { + "epoch": 0.36853735552172134, + "grad_norm": 1.7706724405288696, + "learning_rate": 2.5532892846883082e-05, + "loss": 1.7794, + "step": 1296 + }, + { + "epoch": 0.3688217207651795, + "grad_norm": 1.8838144540786743, + "learning_rate": 2.5521401896006897e-05, + "loss": 2.7027, + "step": 1297 + }, + { + "epoch": 0.3691060860086376, + "grad_norm": 1.6255005598068237, + "learning_rate": 2.550991094513071e-05, + "loss": 2.7461, + "step": 1298 + }, + { + "epoch": 0.3693904512520957, + "grad_norm": 1.5868300199508667, + "learning_rate": 2.5498419994254526e-05, + "loss": 2.4882, + "step": 1299 + }, + { + "epoch": 0.36967481649555384, + "grad_norm": 1.6790016889572144, + "learning_rate": 2.548692904337834e-05, + "loss": 2.1273, + "step": 1300 + }, + { + "epoch": 0.36995918173901193, + "grad_norm": 1.8113175630569458, + "learning_rate": 2.5475438092502155e-05, + "loss": 2.2047, + "step": 1301 + }, + { + "epoch": 0.3702435469824701, + "grad_norm": 1.5671573877334595, + "learning_rate": 2.546394714162597e-05, + "loss": 1.9255, + "step": 1302 + }, + { + "epoch": 0.3705279122259282, + "grad_norm": 1.6399245262145996, + "learning_rate": 2.5452456190749787e-05, + "loss": 1.8351, + "step": 1303 + }, + { + "epoch": 0.37081227746938633, + "grad_norm": 1.7190747261047363, + "learning_rate": 2.54409652398736e-05, + "loss": 2.0176, + "step": 1304 + }, + { + "epoch": 0.3710966427128444, + "grad_norm": 2.0205039978027344, + "learning_rate": 2.5429474288997416e-05, + "loss": 2.9932, + "step": 1305 + }, + { + "epoch": 0.3713810079563025, + "grad_norm": 1.7755295038223267, + "learning_rate": 2.541798333812123e-05, + "loss": 2.5928, + "step": 1306 + }, + { + "epoch": 0.3716653731997607, + "grad_norm": 1.6791408061981201, + "learning_rate": 2.5406492387245045e-05, + "loss": 2.4596, + "step": 1307 + }, + { + "epoch": 0.37194973844321877, + "grad_norm": 1.682665467262268, + "learning_rate": 2.539500143636886e-05, + "loss": 2.284, + "step": 1308 + }, + { + "epoch": 0.3722341036866769, + "grad_norm": 1.661361813545227, + "learning_rate": 2.5383510485492674e-05, + "loss": 2.4567, + "step": 1309 + }, + { + "epoch": 0.372518468930135, + "grad_norm": 1.7297680377960205, + "learning_rate": 2.537201953461649e-05, + "loss": 1.993, + "step": 1310 + }, + { + "epoch": 0.3728028341735931, + "grad_norm": 1.7310127019882202, + "learning_rate": 2.5360528583740303e-05, + "loss": 2.0113, + "step": 1311 + }, + { + "epoch": 0.37308719941705126, + "grad_norm": 1.7791327238082886, + "learning_rate": 2.534903763286412e-05, + "loss": 1.8896, + "step": 1312 + }, + { + "epoch": 0.37337156466050936, + "grad_norm": 2.11208438873291, + "learning_rate": 2.5337546681987936e-05, + "loss": 3.0813, + "step": 1313 + }, + { + "epoch": 0.3736559299039675, + "grad_norm": 1.8183382749557495, + "learning_rate": 2.532605573111175e-05, + "loss": 2.7019, + "step": 1314 + }, + { + "epoch": 0.3739402951474256, + "grad_norm": 1.6248207092285156, + "learning_rate": 2.5314564780235568e-05, + "loss": 2.422, + "step": 1315 + }, + { + "epoch": 0.3742246603908837, + "grad_norm": 1.640504002571106, + "learning_rate": 2.5303073829359382e-05, + "loss": 2.3868, + "step": 1316 + }, + { + "epoch": 0.37450902563434185, + "grad_norm": 1.7609598636627197, + "learning_rate": 2.5291582878483197e-05, + "loss": 2.263, + "step": 1317 + }, + { + "epoch": 0.37479339087779995, + "grad_norm": 1.7517386674880981, + "learning_rate": 2.5280091927607015e-05, + "loss": 2.024, + "step": 1318 + }, + { + "epoch": 0.3750777561212581, + "grad_norm": 1.6678249835968018, + "learning_rate": 2.526860097673083e-05, + "loss": 1.8084, + "step": 1319 + }, + { + "epoch": 0.3753621213647162, + "grad_norm": 1.7585467100143433, + "learning_rate": 2.5257110025854644e-05, + "loss": 1.7552, + "step": 1320 + }, + { + "epoch": 0.3756464866081743, + "grad_norm": 2.2945332527160645, + "learning_rate": 2.5245619074978458e-05, + "loss": 3.0485, + "step": 1321 + }, + { + "epoch": 0.37593085185163244, + "grad_norm": 1.6256428956985474, + "learning_rate": 2.5234128124102273e-05, + "loss": 2.3769, + "step": 1322 + }, + { + "epoch": 0.37621521709509054, + "grad_norm": 1.6193193197250366, + "learning_rate": 2.5222637173226087e-05, + "loss": 2.3007, + "step": 1323 + }, + { + "epoch": 0.3764995823385487, + "grad_norm": 1.6534637212753296, + "learning_rate": 2.5211146222349902e-05, + "loss": 2.5241, + "step": 1324 + }, + { + "epoch": 0.3767839475820068, + "grad_norm": 1.7041131258010864, + "learning_rate": 2.5199655271473716e-05, + "loss": 2.3242, + "step": 1325 + }, + { + "epoch": 0.37706831282546494, + "grad_norm": 1.697442889213562, + "learning_rate": 2.518816432059753e-05, + "loss": 1.9187, + "step": 1326 + }, + { + "epoch": 0.37735267806892303, + "grad_norm": 1.7257381677627563, + "learning_rate": 2.517667336972135e-05, + "loss": 1.7997, + "step": 1327 + }, + { + "epoch": 0.3776370433123811, + "grad_norm": 1.8041011095046997, + "learning_rate": 2.5165182418845163e-05, + "loss": 1.9322, + "step": 1328 + }, + { + "epoch": 0.3779214085558393, + "grad_norm": 2.060734748840332, + "learning_rate": 2.5153691467968978e-05, + "loss": 2.7627, + "step": 1329 + }, + { + "epoch": 0.3782057737992974, + "grad_norm": 1.7511411905288696, + "learning_rate": 2.5142200517092792e-05, + "loss": 2.2934, + "step": 1330 + }, + { + "epoch": 0.3784901390427555, + "grad_norm": 1.6538666486740112, + "learning_rate": 2.5130709566216607e-05, + "loss": 2.2186, + "step": 1331 + }, + { + "epoch": 0.3787745042862136, + "grad_norm": 1.672150731086731, + "learning_rate": 2.511921861534042e-05, + "loss": 2.3287, + "step": 1332 + }, + { + "epoch": 0.3790588695296717, + "grad_norm": 1.8601032495498657, + "learning_rate": 2.5107727664464236e-05, + "loss": 2.1725, + "step": 1333 + }, + { + "epoch": 0.37934323477312987, + "grad_norm": 1.6767491102218628, + "learning_rate": 2.509623671358805e-05, + "loss": 2.1525, + "step": 1334 + }, + { + "epoch": 0.37962760001658796, + "grad_norm": 1.72711980342865, + "learning_rate": 2.5084745762711865e-05, + "loss": 2.1467, + "step": 1335 + }, + { + "epoch": 0.3799119652600461, + "grad_norm": 1.7311689853668213, + "learning_rate": 2.5073254811835683e-05, + "loss": 1.8984, + "step": 1336 + }, + { + "epoch": 0.3801963305035042, + "grad_norm": 2.0441644191741943, + "learning_rate": 2.5061763860959497e-05, + "loss": 3.0045, + "step": 1337 + }, + { + "epoch": 0.3804806957469623, + "grad_norm": 1.9976049661636353, + "learning_rate": 2.505027291008331e-05, + "loss": 2.6461, + "step": 1338 + }, + { + "epoch": 0.38076506099042046, + "grad_norm": 1.6794681549072266, + "learning_rate": 2.5038781959207126e-05, + "loss": 2.4184, + "step": 1339 + }, + { + "epoch": 0.38104942623387855, + "grad_norm": 1.573089599609375, + "learning_rate": 2.502729100833094e-05, + "loss": 2.3171, + "step": 1340 + }, + { + "epoch": 0.3813337914773367, + "grad_norm": 1.8047806024551392, + "learning_rate": 2.5015800057454755e-05, + "loss": 2.4302, + "step": 1341 + }, + { + "epoch": 0.3816181567207948, + "grad_norm": 1.679032564163208, + "learning_rate": 2.500430910657857e-05, + "loss": 2.0206, + "step": 1342 + }, + { + "epoch": 0.3819025219642529, + "grad_norm": 1.6827946901321411, + "learning_rate": 2.4992818155702384e-05, + "loss": 1.8739, + "step": 1343 + }, + { + "epoch": 0.38218688720771105, + "grad_norm": 2.0795507431030273, + "learning_rate": 2.49813272048262e-05, + "loss": 1.9664, + "step": 1344 + }, + { + "epoch": 0.38247125245116914, + "grad_norm": 2.012922763824463, + "learning_rate": 2.4969836253950016e-05, + "loss": 2.757, + "step": 1345 + }, + { + "epoch": 0.3827556176946273, + "grad_norm": 1.62966787815094, + "learning_rate": 2.495834530307383e-05, + "loss": 2.5295, + "step": 1346 + }, + { + "epoch": 0.3830399829380854, + "grad_norm": 1.6679270267486572, + "learning_rate": 2.4946854352197645e-05, + "loss": 2.459, + "step": 1347 + }, + { + "epoch": 0.3833243481815435, + "grad_norm": 1.615654468536377, + "learning_rate": 2.493536340132146e-05, + "loss": 2.1912, + "step": 1348 + }, + { + "epoch": 0.38360871342500164, + "grad_norm": 1.8527601957321167, + "learning_rate": 2.4923872450445274e-05, + "loss": 2.2239, + "step": 1349 + }, + { + "epoch": 0.38389307866845973, + "grad_norm": 1.7967239618301392, + "learning_rate": 2.491238149956909e-05, + "loss": 1.9359, + "step": 1350 + }, + { + "epoch": 0.3841774439119179, + "grad_norm": 1.7077158689498901, + "learning_rate": 2.4900890548692903e-05, + "loss": 1.737, + "step": 1351 + }, + { + "epoch": 0.384461809155376, + "grad_norm": 1.7377227544784546, + "learning_rate": 2.4889399597816725e-05, + "loss": 1.934, + "step": 1352 + }, + { + "epoch": 0.38474617439883413, + "grad_norm": 1.9111387729644775, + "learning_rate": 2.487790864694054e-05, + "loss": 2.776, + "step": 1353 + }, + { + "epoch": 0.3850305396422922, + "grad_norm": 1.6393592357635498, + "learning_rate": 2.4866417696064354e-05, + "loss": 2.4337, + "step": 1354 + }, + { + "epoch": 0.3853149048857503, + "grad_norm": 1.7476547956466675, + "learning_rate": 2.4854926745188168e-05, + "loss": 2.377, + "step": 1355 + }, + { + "epoch": 0.38559927012920847, + "grad_norm": 1.8083293437957764, + "learning_rate": 2.4843435794311983e-05, + "loss": 2.3762, + "step": 1356 + }, + { + "epoch": 0.38588363537266657, + "grad_norm": 1.9728158712387085, + "learning_rate": 2.4831944843435797e-05, + "loss": 2.242, + "step": 1357 + }, + { + "epoch": 0.3861680006161247, + "grad_norm": 1.5531713962554932, + "learning_rate": 2.482045389255961e-05, + "loss": 2.0673, + "step": 1358 + }, + { + "epoch": 0.3864523658595828, + "grad_norm": 1.5641489028930664, + "learning_rate": 2.4808962941683426e-05, + "loss": 1.7698, + "step": 1359 + }, + { + "epoch": 0.3867367311030409, + "grad_norm": 1.7081369161605835, + "learning_rate": 2.4797471990807244e-05, + "loss": 1.7401, + "step": 1360 + }, + { + "epoch": 0.38702109634649906, + "grad_norm": 1.9915111064910889, + "learning_rate": 2.478598103993106e-05, + "loss": 2.9381, + "step": 1361 + }, + { + "epoch": 0.38730546158995716, + "grad_norm": 1.6150346994400024, + "learning_rate": 2.4774490089054873e-05, + "loss": 2.5681, + "step": 1362 + }, + { + "epoch": 0.3875898268334153, + "grad_norm": 1.6368330717086792, + "learning_rate": 2.4762999138178687e-05, + "loss": 2.5464, + "step": 1363 + }, + { + "epoch": 0.3878741920768734, + "grad_norm": 1.6457427740097046, + "learning_rate": 2.4751508187302502e-05, + "loss": 2.1136, + "step": 1364 + }, + { + "epoch": 0.3881585573203315, + "grad_norm": 1.6669577360153198, + "learning_rate": 2.4740017236426316e-05, + "loss": 2.3689, + "step": 1365 + }, + { + "epoch": 0.38844292256378965, + "grad_norm": 1.6207609176635742, + "learning_rate": 2.472852628555013e-05, + "loss": 2.0466, + "step": 1366 + }, + { + "epoch": 0.38872728780724775, + "grad_norm": 1.6581090688705444, + "learning_rate": 2.4717035334673945e-05, + "loss": 2.1255, + "step": 1367 + }, + { + "epoch": 0.3890116530507059, + "grad_norm": 1.7444984912872314, + "learning_rate": 2.470554438379776e-05, + "loss": 1.9976, + "step": 1368 + }, + { + "epoch": 0.389296018294164, + "grad_norm": 2.0723159313201904, + "learning_rate": 2.4694053432921578e-05, + "loss": 3.1322, + "step": 1369 + }, + { + "epoch": 0.3895803835376221, + "grad_norm": 1.8740094900131226, + "learning_rate": 2.4682562482045392e-05, + "loss": 2.346, + "step": 1370 + }, + { + "epoch": 0.38986474878108024, + "grad_norm": 1.7882744073867798, + "learning_rate": 2.4671071531169207e-05, + "loss": 2.3347, + "step": 1371 + }, + { + "epoch": 0.39014911402453833, + "grad_norm": 1.7288506031036377, + "learning_rate": 2.465958058029302e-05, + "loss": 2.2182, + "step": 1372 + }, + { + "epoch": 0.3904334792679965, + "grad_norm": 1.6869946718215942, + "learning_rate": 2.4648089629416836e-05, + "loss": 2.203, + "step": 1373 + }, + { + "epoch": 0.3907178445114546, + "grad_norm": 1.6515893936157227, + "learning_rate": 2.463659867854065e-05, + "loss": 2.1165, + "step": 1374 + }, + { + "epoch": 0.39100220975491273, + "grad_norm": 1.7650010585784912, + "learning_rate": 2.4625107727664465e-05, + "loss": 1.8461, + "step": 1375 + }, + { + "epoch": 0.39128657499837083, + "grad_norm": 1.602225661277771, + "learning_rate": 2.461361677678828e-05, + "loss": 1.8419, + "step": 1376 + }, + { + "epoch": 0.3915709402418289, + "grad_norm": 1.930778980255127, + "learning_rate": 2.4602125825912094e-05, + "loss": 2.7933, + "step": 1377 + }, + { + "epoch": 0.3918553054852871, + "grad_norm": 1.921008586883545, + "learning_rate": 2.4590634875035912e-05, + "loss": 2.6368, + "step": 1378 + }, + { + "epoch": 0.39213967072874517, + "grad_norm": 1.7024277448654175, + "learning_rate": 2.4579143924159726e-05, + "loss": 2.3727, + "step": 1379 + }, + { + "epoch": 0.3924240359722033, + "grad_norm": 1.522916316986084, + "learning_rate": 2.456765297328354e-05, + "loss": 2.2389, + "step": 1380 + }, + { + "epoch": 0.3927084012156614, + "grad_norm": 1.8298519849777222, + "learning_rate": 2.4556162022407355e-05, + "loss": 2.3436, + "step": 1381 + }, + { + "epoch": 0.3929927664591195, + "grad_norm": 1.6771541833877563, + "learning_rate": 2.454467107153117e-05, + "loss": 1.8835, + "step": 1382 + }, + { + "epoch": 0.39327713170257766, + "grad_norm": 1.8293532133102417, + "learning_rate": 2.4533180120654984e-05, + "loss": 2.0321, + "step": 1383 + }, + { + "epoch": 0.39356149694603576, + "grad_norm": 1.8741817474365234, + "learning_rate": 2.45216891697788e-05, + "loss": 1.9599, + "step": 1384 + }, + { + "epoch": 0.3938458621894939, + "grad_norm": 2.148267984390259, + "learning_rate": 2.4510198218902613e-05, + "loss": 2.7614, + "step": 1385 + }, + { + "epoch": 0.394130227432952, + "grad_norm": 1.771562933921814, + "learning_rate": 2.4498707268026428e-05, + "loss": 2.6609, + "step": 1386 + }, + { + "epoch": 0.3944145926764101, + "grad_norm": 1.6484909057617188, + "learning_rate": 2.4487216317150246e-05, + "loss": 2.3871, + "step": 1387 + }, + { + "epoch": 0.39469895791986825, + "grad_norm": 1.6296532154083252, + "learning_rate": 2.447572536627406e-05, + "loss": 2.3645, + "step": 1388 + }, + { + "epoch": 0.39498332316332635, + "grad_norm": 1.7516010999679565, + "learning_rate": 2.4464234415397878e-05, + "loss": 2.1963, + "step": 1389 + }, + { + "epoch": 0.3952676884067845, + "grad_norm": 1.6003243923187256, + "learning_rate": 2.4452743464521692e-05, + "loss": 1.9164, + "step": 1390 + }, + { + "epoch": 0.3955520536502426, + "grad_norm": 1.8290302753448486, + "learning_rate": 2.4441252513645507e-05, + "loss": 1.9689, + "step": 1391 + }, + { + "epoch": 0.3958364188937007, + "grad_norm": 1.694347858428955, + "learning_rate": 2.442976156276932e-05, + "loss": 1.7561, + "step": 1392 + }, + { + "epoch": 0.39612078413715884, + "grad_norm": 1.91927969455719, + "learning_rate": 2.441827061189314e-05, + "loss": 2.8435, + "step": 1393 + }, + { + "epoch": 0.39640514938061694, + "grad_norm": 1.766818642616272, + "learning_rate": 2.4406779661016954e-05, + "loss": 2.6195, + "step": 1394 + }, + { + "epoch": 0.3966895146240751, + "grad_norm": 1.7529215812683105, + "learning_rate": 2.439528871014077e-05, + "loss": 2.455, + "step": 1395 + }, + { + "epoch": 0.3969738798675332, + "grad_norm": 1.6795532703399658, + "learning_rate": 2.4383797759264583e-05, + "loss": 2.4504, + "step": 1396 + }, + { + "epoch": 0.3972582451109913, + "grad_norm": 1.8063428401947021, + "learning_rate": 2.4372306808388397e-05, + "loss": 2.3136, + "step": 1397 + }, + { + "epoch": 0.39754261035444943, + "grad_norm": 1.8748013973236084, + "learning_rate": 2.4360815857512212e-05, + "loss": 1.9549, + "step": 1398 + }, + { + "epoch": 0.39782697559790753, + "grad_norm": 1.702768087387085, + "learning_rate": 2.4349324906636026e-05, + "loss": 1.9597, + "step": 1399 + }, + { + "epoch": 0.3981113408413657, + "grad_norm": 1.8588896989822388, + "learning_rate": 2.433783395575984e-05, + "loss": 1.9016, + "step": 1400 + }, + { + "epoch": 0.3983957060848238, + "grad_norm": 1.9412899017333984, + "learning_rate": 2.4326343004883655e-05, + "loss": 2.6964, + "step": 1401 + }, + { + "epoch": 0.3986800713282819, + "grad_norm": 1.590203046798706, + "learning_rate": 2.4314852054007473e-05, + "loss": 2.5482, + "step": 1402 + }, + { + "epoch": 0.39896443657174, + "grad_norm": 1.613073468208313, + "learning_rate": 2.4303361103131288e-05, + "loss": 2.3643, + "step": 1403 + }, + { + "epoch": 0.3992488018151981, + "grad_norm": 1.7016730308532715, + "learning_rate": 2.4291870152255102e-05, + "loss": 2.179, + "step": 1404 + }, + { + "epoch": 0.39953316705865627, + "grad_norm": 1.7055073976516724, + "learning_rate": 2.4280379201378917e-05, + "loss": 2.3988, + "step": 1405 + }, + { + "epoch": 0.39981753230211436, + "grad_norm": 1.6752086877822876, + "learning_rate": 2.426888825050273e-05, + "loss": 2.3758, + "step": 1406 + }, + { + "epoch": 0.4001018975455725, + "grad_norm": 1.6053355932235718, + "learning_rate": 2.4257397299626546e-05, + "loss": 1.9306, + "step": 1407 + }, + { + "epoch": 0.4003862627890306, + "grad_norm": 1.7850687503814697, + "learning_rate": 2.424590634875036e-05, + "loss": 1.8416, + "step": 1408 + }, + { + "epoch": 0.4006706280324887, + "grad_norm": 2.1916213035583496, + "learning_rate": 2.4234415397874175e-05, + "loss": 2.8873, + "step": 1409 + }, + { + "epoch": 0.40095499327594686, + "grad_norm": 1.7736766338348389, + "learning_rate": 2.422292444699799e-05, + "loss": 2.5443, + "step": 1410 + }, + { + "epoch": 0.40123935851940495, + "grad_norm": 1.630659818649292, + "learning_rate": 2.4211433496121807e-05, + "loss": 2.4075, + "step": 1411 + }, + { + "epoch": 0.4015237237628631, + "grad_norm": 1.5184587240219116, + "learning_rate": 2.419994254524562e-05, + "loss": 2.2805, + "step": 1412 + }, + { + "epoch": 0.4018080890063212, + "grad_norm": 1.721583366394043, + "learning_rate": 2.4188451594369436e-05, + "loss": 2.3186, + "step": 1413 + }, + { + "epoch": 0.4020924542497793, + "grad_norm": 1.7377300262451172, + "learning_rate": 2.417696064349325e-05, + "loss": 1.9795, + "step": 1414 + }, + { + "epoch": 0.40237681949323745, + "grad_norm": 1.7117558717727661, + "learning_rate": 2.4165469692617065e-05, + "loss": 2.2045, + "step": 1415 + }, + { + "epoch": 0.40266118473669554, + "grad_norm": 1.7114864587783813, + "learning_rate": 2.415397874174088e-05, + "loss": 1.799, + "step": 1416 + }, + { + "epoch": 0.4029455499801537, + "grad_norm": 1.7981421947479248, + "learning_rate": 2.4142487790864694e-05, + "loss": 3.078, + "step": 1417 + }, + { + "epoch": 0.4032299152236118, + "grad_norm": 1.600596308708191, + "learning_rate": 2.413099683998851e-05, + "loss": 2.3393, + "step": 1418 + }, + { + "epoch": 0.4035142804670699, + "grad_norm": 1.4729454517364502, + "learning_rate": 2.4119505889112323e-05, + "loss": 2.3691, + "step": 1419 + }, + { + "epoch": 0.40379864571052804, + "grad_norm": 1.6638671159744263, + "learning_rate": 2.410801493823614e-05, + "loss": 2.2835, + "step": 1420 + }, + { + "epoch": 0.40408301095398613, + "grad_norm": 1.79701566696167, + "learning_rate": 2.4096523987359955e-05, + "loss": 2.2159, + "step": 1421 + }, + { + "epoch": 0.4043673761974443, + "grad_norm": 1.6714825630187988, + "learning_rate": 2.408503303648377e-05, + "loss": 1.8111, + "step": 1422 + }, + { + "epoch": 0.4046517414409024, + "grad_norm": 1.736799716949463, + "learning_rate": 2.4073542085607584e-05, + "loss": 1.9677, + "step": 1423 + }, + { + "epoch": 0.40493610668436053, + "grad_norm": 1.7546263933181763, + "learning_rate": 2.40620511347314e-05, + "loss": 1.8836, + "step": 1424 + }, + { + "epoch": 0.4052204719278186, + "grad_norm": 1.9755287170410156, + "learning_rate": 2.4050560183855213e-05, + "loss": 2.9325, + "step": 1425 + }, + { + "epoch": 0.4055048371712767, + "grad_norm": 1.6950325965881348, + "learning_rate": 2.4039069232979035e-05, + "loss": 2.7484, + "step": 1426 + }, + { + "epoch": 0.4057892024147349, + "grad_norm": 1.787856936454773, + "learning_rate": 2.402757828210285e-05, + "loss": 2.5154, + "step": 1427 + }, + { + "epoch": 0.40607356765819297, + "grad_norm": 1.7529258728027344, + "learning_rate": 2.4016087331226664e-05, + "loss": 2.3995, + "step": 1428 + }, + { + "epoch": 0.4063579329016511, + "grad_norm": 1.8270007371902466, + "learning_rate": 2.4004596380350478e-05, + "loss": 2.2466, + "step": 1429 + }, + { + "epoch": 0.4066422981451092, + "grad_norm": 1.6671589612960815, + "learning_rate": 2.3993105429474293e-05, + "loss": 2.0727, + "step": 1430 + }, + { + "epoch": 0.4069266633885673, + "grad_norm": 1.8417056798934937, + "learning_rate": 2.3981614478598107e-05, + "loss": 1.9906, + "step": 1431 + }, + { + "epoch": 0.40721102863202546, + "grad_norm": 1.794356346130371, + "learning_rate": 2.397012352772192e-05, + "loss": 1.9631, + "step": 1432 + }, + { + "epoch": 0.40749539387548356, + "grad_norm": 1.9737939834594727, + "learning_rate": 2.3958632576845736e-05, + "loss": 2.8793, + "step": 1433 + }, + { + "epoch": 0.4077797591189417, + "grad_norm": 1.7797672748565674, + "learning_rate": 2.394714162596955e-05, + "loss": 2.6237, + "step": 1434 + }, + { + "epoch": 0.4080641243623998, + "grad_norm": 1.782353401184082, + "learning_rate": 2.393565067509337e-05, + "loss": 2.3761, + "step": 1435 + }, + { + "epoch": 0.4083484896058579, + "grad_norm": 1.662298560142517, + "learning_rate": 2.3924159724217183e-05, + "loss": 2.3679, + "step": 1436 + }, + { + "epoch": 0.40863285484931605, + "grad_norm": 1.7160699367523193, + "learning_rate": 2.3912668773340998e-05, + "loss": 2.0721, + "step": 1437 + }, + { + "epoch": 0.40891722009277415, + "grad_norm": 1.5304116010665894, + "learning_rate": 2.3901177822464812e-05, + "loss": 1.9679, + "step": 1438 + }, + { + "epoch": 0.4092015853362323, + "grad_norm": 1.6749613285064697, + "learning_rate": 2.3889686871588627e-05, + "loss": 1.9677, + "step": 1439 + }, + { + "epoch": 0.4094859505796904, + "grad_norm": 1.8116540908813477, + "learning_rate": 2.387819592071244e-05, + "loss": 1.7566, + "step": 1440 + }, + { + "epoch": 0.4097703158231485, + "grad_norm": 2.061837911605835, + "learning_rate": 2.3866704969836256e-05, + "loss": 2.768, + "step": 1441 + }, + { + "epoch": 0.41005468106660664, + "grad_norm": 1.6977015733718872, + "learning_rate": 2.385521401896007e-05, + "loss": 2.4958, + "step": 1442 + }, + { + "epoch": 0.41033904631006474, + "grad_norm": 1.6543117761611938, + "learning_rate": 2.3843723068083885e-05, + "loss": 2.4396, + "step": 1443 + }, + { + "epoch": 0.4106234115535229, + "grad_norm": 1.5568379163742065, + "learning_rate": 2.3832232117207702e-05, + "loss": 2.2158, + "step": 1444 + }, + { + "epoch": 0.410907776796981, + "grad_norm": 1.6220964193344116, + "learning_rate": 2.3820741166331517e-05, + "loss": 2.3038, + "step": 1445 + }, + { + "epoch": 0.41119214204043913, + "grad_norm": 1.6945890188217163, + "learning_rate": 2.380925021545533e-05, + "loss": 1.9376, + "step": 1446 + }, + { + "epoch": 0.41147650728389723, + "grad_norm": 1.5313851833343506, + "learning_rate": 2.3797759264579146e-05, + "loss": 1.888, + "step": 1447 + }, + { + "epoch": 0.4117608725273553, + "grad_norm": 1.7938929796218872, + "learning_rate": 2.378626831370296e-05, + "loss": 1.9618, + "step": 1448 + }, + { + "epoch": 0.4120452377708135, + "grad_norm": 2.1637260913848877, + "learning_rate": 2.3774777362826775e-05, + "loss": 2.864, + "step": 1449 + }, + { + "epoch": 0.4123296030142716, + "grad_norm": 1.8949286937713623, + "learning_rate": 2.376328641195059e-05, + "loss": 2.675, + "step": 1450 + }, + { + "epoch": 0.4126139682577297, + "grad_norm": 1.711017370223999, + "learning_rate": 2.3751795461074404e-05, + "loss": 2.4241, + "step": 1451 + }, + { + "epoch": 0.4128983335011878, + "grad_norm": 1.6025915145874023, + "learning_rate": 2.374030451019822e-05, + "loss": 2.3166, + "step": 1452 + }, + { + "epoch": 0.4131826987446459, + "grad_norm": 1.7555315494537354, + "learning_rate": 2.3728813559322036e-05, + "loss": 2.0809, + "step": 1453 + }, + { + "epoch": 0.41346706398810407, + "grad_norm": 1.5702033042907715, + "learning_rate": 2.371732260844585e-05, + "loss": 1.981, + "step": 1454 + }, + { + "epoch": 0.41375142923156216, + "grad_norm": 1.6825945377349854, + "learning_rate": 2.3705831657569665e-05, + "loss": 1.766, + "step": 1455 + }, + { + "epoch": 0.4140357944750203, + "grad_norm": 1.711393117904663, + "learning_rate": 2.369434070669348e-05, + "loss": 2.0005, + "step": 1456 + }, + { + "epoch": 0.4143201597184784, + "grad_norm": 1.917224407196045, + "learning_rate": 2.3682849755817294e-05, + "loss": 2.8846, + "step": 1457 + }, + { + "epoch": 0.4146045249619365, + "grad_norm": 1.728300929069519, + "learning_rate": 2.367135880494111e-05, + "loss": 2.63, + "step": 1458 + }, + { + "epoch": 0.41488889020539466, + "grad_norm": 1.6932635307312012, + "learning_rate": 2.3659867854064923e-05, + "loss": 2.3097, + "step": 1459 + }, + { + "epoch": 0.41517325544885275, + "grad_norm": 1.6644999980926514, + "learning_rate": 2.3648376903188738e-05, + "loss": 2.3912, + "step": 1460 + }, + { + "epoch": 0.4154576206923109, + "grad_norm": 1.7875523567199707, + "learning_rate": 2.3636885952312552e-05, + "loss": 2.3286, + "step": 1461 + }, + { + "epoch": 0.415741985935769, + "grad_norm": 1.7267717123031616, + "learning_rate": 2.362539500143637e-05, + "loss": 2.021, + "step": 1462 + }, + { + "epoch": 0.4160263511792271, + "grad_norm": 1.854394555091858, + "learning_rate": 2.3613904050560188e-05, + "loss": 2.0849, + "step": 1463 + }, + { + "epoch": 0.41631071642268525, + "grad_norm": 1.7931166887283325, + "learning_rate": 2.3602413099684003e-05, + "loss": 1.9079, + "step": 1464 + }, + { + "epoch": 0.41659508166614334, + "grad_norm": 1.9061031341552734, + "learning_rate": 2.3590922148807817e-05, + "loss": 2.7233, + "step": 1465 + }, + { + "epoch": 0.4168794469096015, + "grad_norm": 1.6702359914779663, + "learning_rate": 2.357943119793163e-05, + "loss": 2.6487, + "step": 1466 + }, + { + "epoch": 0.4171638121530596, + "grad_norm": 1.5894525051116943, + "learning_rate": 2.3567940247055446e-05, + "loss": 2.3472, + "step": 1467 + }, + { + "epoch": 0.4174481773965177, + "grad_norm": 1.6212420463562012, + "learning_rate": 2.3556449296179264e-05, + "loss": 2.0305, + "step": 1468 + }, + { + "epoch": 0.41773254263997583, + "grad_norm": 1.8889355659484863, + "learning_rate": 2.354495834530308e-05, + "loss": 2.5149, + "step": 1469 + }, + { + "epoch": 0.41801690788343393, + "grad_norm": 1.4953631162643433, + "learning_rate": 2.3533467394426893e-05, + "loss": 1.877, + "step": 1470 + }, + { + "epoch": 0.4183012731268921, + "grad_norm": 1.6321781873703003, + "learning_rate": 2.3521976443550707e-05, + "loss": 1.9186, + "step": 1471 + }, + { + "epoch": 0.4185856383703502, + "grad_norm": 1.6362193822860718, + "learning_rate": 2.3510485492674522e-05, + "loss": 1.7191, + "step": 1472 + }, + { + "epoch": 0.41887000361380833, + "grad_norm": 2.1711504459381104, + "learning_rate": 2.3498994541798336e-05, + "loss": 2.9604, + "step": 1473 + }, + { + "epoch": 0.4191543688572664, + "grad_norm": 1.5688064098358154, + "learning_rate": 2.348750359092215e-05, + "loss": 2.4455, + "step": 1474 + }, + { + "epoch": 0.4194387341007245, + "grad_norm": 1.5791515111923218, + "learning_rate": 2.3476012640045965e-05, + "loss": 2.3827, + "step": 1475 + }, + { + "epoch": 0.41972309934418267, + "grad_norm": 1.5574970245361328, + "learning_rate": 2.346452168916978e-05, + "loss": 2.2648, + "step": 1476 + }, + { + "epoch": 0.42000746458764077, + "grad_norm": 1.607318639755249, + "learning_rate": 2.3453030738293598e-05, + "loss": 2.1887, + "step": 1477 + }, + { + "epoch": 0.4202918298310989, + "grad_norm": 1.6023167371749878, + "learning_rate": 2.3441539787417412e-05, + "loss": 1.8484, + "step": 1478 + }, + { + "epoch": 0.420576195074557, + "grad_norm": 1.614344835281372, + "learning_rate": 2.3430048836541227e-05, + "loss": 1.7518, + "step": 1479 + }, + { + "epoch": 0.4208605603180151, + "grad_norm": 1.7779884338378906, + "learning_rate": 2.341855788566504e-05, + "loss": 1.9386, + "step": 1480 + }, + { + "epoch": 0.42114492556147326, + "grad_norm": 1.913327932357788, + "learning_rate": 2.3407066934788856e-05, + "loss": 2.8805, + "step": 1481 + }, + { + "epoch": 0.42142929080493136, + "grad_norm": 1.888013482093811, + "learning_rate": 2.339557598391267e-05, + "loss": 2.4296, + "step": 1482 + }, + { + "epoch": 0.4217136560483895, + "grad_norm": 1.841597080230713, + "learning_rate": 2.3384085033036485e-05, + "loss": 2.4056, + "step": 1483 + }, + { + "epoch": 0.4219980212918476, + "grad_norm": 1.6443198919296265, + "learning_rate": 2.33725940821603e-05, + "loss": 2.2189, + "step": 1484 + }, + { + "epoch": 0.4222823865353057, + "grad_norm": 1.7513799667358398, + "learning_rate": 2.3361103131284114e-05, + "loss": 2.3345, + "step": 1485 + }, + { + "epoch": 0.42256675177876385, + "grad_norm": 1.6593313217163086, + "learning_rate": 2.334961218040793e-05, + "loss": 2.0602, + "step": 1486 + }, + { + "epoch": 0.42285111702222195, + "grad_norm": 1.7148724794387817, + "learning_rate": 2.3338121229531746e-05, + "loss": 1.6755, + "step": 1487 + }, + { + "epoch": 0.4231354822656801, + "grad_norm": 1.6917473077774048, + "learning_rate": 2.332663027865556e-05, + "loss": 1.8464, + "step": 1488 + }, + { + "epoch": 0.4234198475091382, + "grad_norm": 2.1567392349243164, + "learning_rate": 2.3315139327779375e-05, + "loss": 2.9124, + "step": 1489 + }, + { + "epoch": 0.4237042127525963, + "grad_norm": 1.7503013610839844, + "learning_rate": 2.330364837690319e-05, + "loss": 2.5927, + "step": 1490 + }, + { + "epoch": 0.42398857799605444, + "grad_norm": 1.6987637281417847, + "learning_rate": 2.3292157426027004e-05, + "loss": 2.4439, + "step": 1491 + }, + { + "epoch": 0.42427294323951253, + "grad_norm": 1.6804696321487427, + "learning_rate": 2.328066647515082e-05, + "loss": 2.3747, + "step": 1492 + }, + { + "epoch": 0.4245573084829707, + "grad_norm": 1.7630577087402344, + "learning_rate": 2.3269175524274633e-05, + "loss": 2.4236, + "step": 1493 + }, + { + "epoch": 0.4248416737264288, + "grad_norm": 1.5913448333740234, + "learning_rate": 2.3257684573398448e-05, + "loss": 2.0831, + "step": 1494 + }, + { + "epoch": 0.42512603896988693, + "grad_norm": 1.6789977550506592, + "learning_rate": 2.3246193622522266e-05, + "loss": 1.9894, + "step": 1495 + }, + { + "epoch": 0.42541040421334503, + "grad_norm": 1.732582688331604, + "learning_rate": 2.323470267164608e-05, + "loss": 1.7747, + "step": 1496 + }, + { + "epoch": 0.4256947694568031, + "grad_norm": 1.926916241645813, + "learning_rate": 2.3223211720769895e-05, + "loss": 2.9434, + "step": 1497 + }, + { + "epoch": 0.4259791347002613, + "grad_norm": 1.6622836589813232, + "learning_rate": 2.321172076989371e-05, + "loss": 2.4302, + "step": 1498 + }, + { + "epoch": 0.42626349994371937, + "grad_norm": 1.64722740650177, + "learning_rate": 2.3200229819017524e-05, + "loss": 2.1876, + "step": 1499 + }, + { + "epoch": 0.4265478651871775, + "grad_norm": 1.7458999156951904, + "learning_rate": 2.318873886814134e-05, + "loss": 2.4106, + "step": 1500 + }, + { + "epoch": 0.4268322304306356, + "grad_norm": 1.6324485540390015, + "learning_rate": 2.317724791726516e-05, + "loss": 2.3124, + "step": 1501 + }, + { + "epoch": 0.4271165956740937, + "grad_norm": 1.6437077522277832, + "learning_rate": 2.3165756966388974e-05, + "loss": 1.9083, + "step": 1502 + }, + { + "epoch": 0.42740096091755186, + "grad_norm": 1.6657003164291382, + "learning_rate": 2.3154266015512788e-05, + "loss": 1.7993, + "step": 1503 + }, + { + "epoch": 0.42768532616100996, + "grad_norm": 1.6173607110977173, + "learning_rate": 2.3142775064636603e-05, + "loss": 1.7787, + "step": 1504 + }, + { + "epoch": 0.4279696914044681, + "grad_norm": 2.0600807666778564, + "learning_rate": 2.3131284113760417e-05, + "loss": 2.8575, + "step": 1505 + }, + { + "epoch": 0.4282540566479262, + "grad_norm": 1.759519338607788, + "learning_rate": 2.3119793162884232e-05, + "loss": 2.4221, + "step": 1506 + }, + { + "epoch": 0.4285384218913843, + "grad_norm": 1.6098270416259766, + "learning_rate": 2.3108302212008046e-05, + "loss": 2.4158, + "step": 1507 + }, + { + "epoch": 0.42882278713484245, + "grad_norm": 1.5840624570846558, + "learning_rate": 2.309681126113186e-05, + "loss": 2.1282, + "step": 1508 + }, + { + "epoch": 0.42910715237830055, + "grad_norm": 1.7449572086334229, + "learning_rate": 2.3085320310255675e-05, + "loss": 2.289, + "step": 1509 + }, + { + "epoch": 0.4293915176217587, + "grad_norm": 1.5223720073699951, + "learning_rate": 2.3073829359379493e-05, + "loss": 1.7867, + "step": 1510 + }, + { + "epoch": 0.4296758828652168, + "grad_norm": 1.6522331237792969, + "learning_rate": 2.3062338408503308e-05, + "loss": 2.1099, + "step": 1511 + }, + { + "epoch": 0.4299602481086749, + "grad_norm": 1.652082085609436, + "learning_rate": 2.3050847457627122e-05, + "loss": 1.9209, + "step": 1512 + }, + { + "epoch": 0.43024461335213304, + "grad_norm": 1.941834568977356, + "learning_rate": 2.3039356506750937e-05, + "loss": 2.6694, + "step": 1513 + }, + { + "epoch": 0.43052897859559114, + "grad_norm": 1.7142715454101562, + "learning_rate": 2.302786555587475e-05, + "loss": 2.4722, + "step": 1514 + }, + { + "epoch": 0.4308133438390493, + "grad_norm": 1.7237780094146729, + "learning_rate": 2.3016374604998566e-05, + "loss": 2.2415, + "step": 1515 + }, + { + "epoch": 0.4310977090825074, + "grad_norm": 1.696518063545227, + "learning_rate": 2.300488365412238e-05, + "loss": 2.1774, + "step": 1516 + }, + { + "epoch": 0.43138207432596554, + "grad_norm": 1.6398987770080566, + "learning_rate": 2.2993392703246195e-05, + "loss": 2.2672, + "step": 1517 + }, + { + "epoch": 0.43166643956942363, + "grad_norm": 1.7152602672576904, + "learning_rate": 2.298190175237001e-05, + "loss": 2.1277, + "step": 1518 + }, + { + "epoch": 0.43195080481288173, + "grad_norm": 1.612593173980713, + "learning_rate": 2.2970410801493827e-05, + "loss": 2.1015, + "step": 1519 + }, + { + "epoch": 0.4322351700563399, + "grad_norm": 1.6528387069702148, + "learning_rate": 2.295891985061764e-05, + "loss": 1.7369, + "step": 1520 + }, + { + "epoch": 0.432519535299798, + "grad_norm": 2.0139312744140625, + "learning_rate": 2.2947428899741456e-05, + "loss": 2.8552, + "step": 1521 + }, + { + "epoch": 0.4328039005432561, + "grad_norm": 1.6151010990142822, + "learning_rate": 2.293593794886527e-05, + "loss": 2.1954, + "step": 1522 + }, + { + "epoch": 0.4330882657867142, + "grad_norm": 1.5266865491867065, + "learning_rate": 2.2924446997989085e-05, + "loss": 2.2809, + "step": 1523 + }, + { + "epoch": 0.4333726310301723, + "grad_norm": 1.5820162296295166, + "learning_rate": 2.29129560471129e-05, + "loss": 2.2162, + "step": 1524 + }, + { + "epoch": 0.43365699627363047, + "grad_norm": 1.7275466918945312, + "learning_rate": 2.2901465096236714e-05, + "loss": 2.2792, + "step": 1525 + }, + { + "epoch": 0.43394136151708856, + "grad_norm": 1.6316412687301636, + "learning_rate": 2.288997414536053e-05, + "loss": 1.9186, + "step": 1526 + }, + { + "epoch": 0.4342257267605467, + "grad_norm": 1.7064950466156006, + "learning_rate": 2.2878483194484343e-05, + "loss": 1.7606, + "step": 1527 + }, + { + "epoch": 0.4345100920040048, + "grad_norm": 1.7663518190383911, + "learning_rate": 2.286699224360816e-05, + "loss": 1.7999, + "step": 1528 + }, + { + "epoch": 0.4347944572474629, + "grad_norm": 1.942549705505371, + "learning_rate": 2.2855501292731975e-05, + "loss": 3.1272, + "step": 1529 + }, + { + "epoch": 0.43507882249092106, + "grad_norm": 1.546832799911499, + "learning_rate": 2.284401034185579e-05, + "loss": 2.4118, + "step": 1530 + }, + { + "epoch": 0.43536318773437915, + "grad_norm": 1.5846697092056274, + "learning_rate": 2.2832519390979604e-05, + "loss": 2.3054, + "step": 1531 + }, + { + "epoch": 0.4356475529778373, + "grad_norm": 1.6705207824707031, + "learning_rate": 2.282102844010342e-05, + "loss": 2.465, + "step": 1532 + }, + { + "epoch": 0.4359319182212954, + "grad_norm": 1.7256457805633545, + "learning_rate": 2.2809537489227233e-05, + "loss": 2.2058, + "step": 1533 + }, + { + "epoch": 0.4362162834647535, + "grad_norm": 1.6743327379226685, + "learning_rate": 2.2798046538351048e-05, + "loss": 1.999, + "step": 1534 + }, + { + "epoch": 0.43650064870821165, + "grad_norm": 1.8469429016113281, + "learning_rate": 2.2786555587474862e-05, + "loss": 2.0113, + "step": 1535 + }, + { + "epoch": 0.43678501395166974, + "grad_norm": 1.6663111448287964, + "learning_rate": 2.2775064636598677e-05, + "loss": 1.9054, + "step": 1536 + }, + { + "epoch": 0.4370693791951279, + "grad_norm": 1.980746865272522, + "learning_rate": 2.2763573685722498e-05, + "loss": 2.5987, + "step": 1537 + }, + { + "epoch": 0.437353744438586, + "grad_norm": 1.5249035358428955, + "learning_rate": 2.2752082734846313e-05, + "loss": 2.5415, + "step": 1538 + }, + { + "epoch": 0.4376381096820441, + "grad_norm": 1.5359803438186646, + "learning_rate": 2.2740591783970127e-05, + "loss": 2.2511, + "step": 1539 + }, + { + "epoch": 0.43792247492550224, + "grad_norm": 1.671136498451233, + "learning_rate": 2.272910083309394e-05, + "loss": 2.264, + "step": 1540 + }, + { + "epoch": 0.43820684016896033, + "grad_norm": 1.6809831857681274, + "learning_rate": 2.2717609882217756e-05, + "loss": 2.2934, + "step": 1541 + }, + { + "epoch": 0.4384912054124185, + "grad_norm": 1.785244107246399, + "learning_rate": 2.270611893134157e-05, + "loss": 1.8482, + "step": 1542 + }, + { + "epoch": 0.4387755706558766, + "grad_norm": 1.749173641204834, + "learning_rate": 2.269462798046539e-05, + "loss": 2.0663, + "step": 1543 + }, + { + "epoch": 0.43905993589933473, + "grad_norm": 1.7575947046279907, + "learning_rate": 2.2683137029589203e-05, + "loss": 1.9117, + "step": 1544 + }, + { + "epoch": 0.4393443011427928, + "grad_norm": 1.795770287513733, + "learning_rate": 2.2671646078713017e-05, + "loss": 2.9625, + "step": 1545 + }, + { + "epoch": 0.4396286663862509, + "grad_norm": 1.6358546018600464, + "learning_rate": 2.2660155127836832e-05, + "loss": 2.4055, + "step": 1546 + }, + { + "epoch": 0.4399130316297091, + "grad_norm": 1.5694764852523804, + "learning_rate": 2.2648664176960646e-05, + "loss": 2.5108, + "step": 1547 + }, + { + "epoch": 0.44019739687316717, + "grad_norm": 1.6293818950653076, + "learning_rate": 2.263717322608446e-05, + "loss": 2.3389, + "step": 1548 + }, + { + "epoch": 0.4404817621166253, + "grad_norm": 1.683681845664978, + "learning_rate": 2.2625682275208275e-05, + "loss": 2.1871, + "step": 1549 + }, + { + "epoch": 0.4407661273600834, + "grad_norm": 1.668007254600525, + "learning_rate": 2.261419132433209e-05, + "loss": 1.8618, + "step": 1550 + }, + { + "epoch": 0.4410504926035415, + "grad_norm": 1.5370334386825562, + "learning_rate": 2.2602700373455904e-05, + "loss": 1.6422, + "step": 1551 + }, + { + "epoch": 0.44133485784699966, + "grad_norm": 1.6079347133636475, + "learning_rate": 2.2591209422579722e-05, + "loss": 1.7268, + "step": 1552 + }, + { + "epoch": 0.44161922309045776, + "grad_norm": 1.7942856550216675, + "learning_rate": 2.2579718471703537e-05, + "loss": 2.8017, + "step": 1553 + }, + { + "epoch": 0.4419035883339159, + "grad_norm": 1.694395899772644, + "learning_rate": 2.256822752082735e-05, + "loss": 2.4659, + "step": 1554 + }, + { + "epoch": 0.442187953577374, + "grad_norm": 1.6033554077148438, + "learning_rate": 2.2556736569951166e-05, + "loss": 2.321, + "step": 1555 + }, + { + "epoch": 0.4424723188208321, + "grad_norm": 1.4896838665008545, + "learning_rate": 2.254524561907498e-05, + "loss": 2.3603, + "step": 1556 + }, + { + "epoch": 0.44275668406429025, + "grad_norm": 1.5675745010375977, + "learning_rate": 2.2533754668198795e-05, + "loss": 2.199, + "step": 1557 + }, + { + "epoch": 0.44304104930774835, + "grad_norm": 1.5907286405563354, + "learning_rate": 2.252226371732261e-05, + "loss": 2.0474, + "step": 1558 + }, + { + "epoch": 0.4433254145512065, + "grad_norm": 1.67164146900177, + "learning_rate": 2.2510772766446424e-05, + "loss": 1.9384, + "step": 1559 + }, + { + "epoch": 0.4436097797946646, + "grad_norm": 1.6073744297027588, + "learning_rate": 2.249928181557024e-05, + "loss": 1.5771, + "step": 1560 + }, + { + "epoch": 0.4438941450381227, + "grad_norm": 1.9481167793273926, + "learning_rate": 2.2487790864694056e-05, + "loss": 2.8547, + "step": 1561 + }, + { + "epoch": 0.44417851028158084, + "grad_norm": 1.627387523651123, + "learning_rate": 2.247629991381787e-05, + "loss": 2.661, + "step": 1562 + }, + { + "epoch": 0.44446287552503894, + "grad_norm": 1.665747046470642, + "learning_rate": 2.2464808962941685e-05, + "loss": 2.2633, + "step": 1563 + }, + { + "epoch": 0.4447472407684971, + "grad_norm": 1.6608874797821045, + "learning_rate": 2.24533180120655e-05, + "loss": 2.3801, + "step": 1564 + }, + { + "epoch": 0.4450316060119552, + "grad_norm": 1.778861165046692, + "learning_rate": 2.2441827061189314e-05, + "loss": 2.1286, + "step": 1565 + }, + { + "epoch": 0.44531597125541333, + "grad_norm": 1.678720474243164, + "learning_rate": 2.243033611031313e-05, + "loss": 2.0441, + "step": 1566 + }, + { + "epoch": 0.44560033649887143, + "grad_norm": 1.6838959455490112, + "learning_rate": 2.2418845159436943e-05, + "loss": 1.8895, + "step": 1567 + }, + { + "epoch": 0.4458847017423295, + "grad_norm": 1.7138351202011108, + "learning_rate": 2.2407354208560758e-05, + "loss": 1.8344, + "step": 1568 + }, + { + "epoch": 0.4461690669857877, + "grad_norm": 1.9342299699783325, + "learning_rate": 2.2395863257684572e-05, + "loss": 2.7738, + "step": 1569 + }, + { + "epoch": 0.4464534322292458, + "grad_norm": 1.6085205078125, + "learning_rate": 2.238437230680839e-05, + "loss": 2.4156, + "step": 1570 + }, + { + "epoch": 0.4467377974727039, + "grad_norm": 1.598381757736206, + "learning_rate": 2.2372881355932205e-05, + "loss": 2.323, + "step": 1571 + }, + { + "epoch": 0.447022162716162, + "grad_norm": 1.5285613536834717, + "learning_rate": 2.236139040505602e-05, + "loss": 2.3501, + "step": 1572 + }, + { + "epoch": 0.4473065279596201, + "grad_norm": 1.6888141632080078, + "learning_rate": 2.2349899454179834e-05, + "loss": 2.3968, + "step": 1573 + }, + { + "epoch": 0.44759089320307827, + "grad_norm": 1.6421995162963867, + "learning_rate": 2.233840850330365e-05, + "loss": 1.7312, + "step": 1574 + }, + { + "epoch": 0.44787525844653636, + "grad_norm": 1.7016944885253906, + "learning_rate": 2.2326917552427466e-05, + "loss": 1.8975, + "step": 1575 + }, + { + "epoch": 0.4481596236899945, + "grad_norm": 1.8153733015060425, + "learning_rate": 2.231542660155128e-05, + "loss": 2.0029, + "step": 1576 + }, + { + "epoch": 0.4484439889334526, + "grad_norm": 1.941930890083313, + "learning_rate": 2.23039356506751e-05, + "loss": 3.027, + "step": 1577 + }, + { + "epoch": 0.4487283541769107, + "grad_norm": 1.643111228942871, + "learning_rate": 2.2292444699798913e-05, + "loss": 2.5447, + "step": 1578 + }, + { + "epoch": 0.44901271942036886, + "grad_norm": 1.5642459392547607, + "learning_rate": 2.2280953748922727e-05, + "loss": 2.3714, + "step": 1579 + }, + { + "epoch": 0.44929708466382695, + "grad_norm": 1.6498128175735474, + "learning_rate": 2.2269462798046542e-05, + "loss": 2.3655, + "step": 1580 + }, + { + "epoch": 0.4495814499072851, + "grad_norm": 1.7358462810516357, + "learning_rate": 2.2257971847170356e-05, + "loss": 2.2202, + "step": 1581 + }, + { + "epoch": 0.4498658151507432, + "grad_norm": 1.718059778213501, + "learning_rate": 2.224648089629417e-05, + "loss": 1.8404, + "step": 1582 + }, + { + "epoch": 0.4501501803942013, + "grad_norm": 1.6849524974822998, + "learning_rate": 2.2234989945417985e-05, + "loss": 1.857, + "step": 1583 + }, + { + "epoch": 0.45043454563765944, + "grad_norm": 1.6961820125579834, + "learning_rate": 2.22234989945418e-05, + "loss": 1.8101, + "step": 1584 + }, + { + "epoch": 0.45071891088111754, + "grad_norm": 1.9128514528274536, + "learning_rate": 2.2212008043665618e-05, + "loss": 2.9717, + "step": 1585 + }, + { + "epoch": 0.4510032761245757, + "grad_norm": 1.4966126680374146, + "learning_rate": 2.2200517092789432e-05, + "loss": 2.6604, + "step": 1586 + }, + { + "epoch": 0.4512876413680338, + "grad_norm": 1.5449092388153076, + "learning_rate": 2.2189026141913247e-05, + "loss": 2.1829, + "step": 1587 + }, + { + "epoch": 0.4515720066114919, + "grad_norm": 1.6908026933670044, + "learning_rate": 2.217753519103706e-05, + "loss": 2.2731, + "step": 1588 + }, + { + "epoch": 0.45185637185495003, + "grad_norm": 1.561723232269287, + "learning_rate": 2.2166044240160876e-05, + "loss": 2.1934, + "step": 1589 + }, + { + "epoch": 0.45214073709840813, + "grad_norm": 1.6748071908950806, + "learning_rate": 2.215455328928469e-05, + "loss": 2.112, + "step": 1590 + }, + { + "epoch": 0.4524251023418663, + "grad_norm": 1.5931808948516846, + "learning_rate": 2.2143062338408505e-05, + "loss": 1.853, + "step": 1591 + }, + { + "epoch": 0.4527094675853244, + "grad_norm": 1.7409586906433105, + "learning_rate": 2.213157138753232e-05, + "loss": 1.7871, + "step": 1592 + }, + { + "epoch": 0.45299383282878253, + "grad_norm": 1.953231930732727, + "learning_rate": 2.2120080436656134e-05, + "loss": 2.8715, + "step": 1593 + }, + { + "epoch": 0.4532781980722406, + "grad_norm": 1.6238800287246704, + "learning_rate": 2.210858948577995e-05, + "loss": 2.5115, + "step": 1594 + }, + { + "epoch": 0.4535625633156987, + "grad_norm": 1.643713116645813, + "learning_rate": 2.2097098534903766e-05, + "loss": 2.2356, + "step": 1595 + }, + { + "epoch": 0.45384692855915687, + "grad_norm": 1.5235264301300049, + "learning_rate": 2.208560758402758e-05, + "loss": 2.348, + "step": 1596 + }, + { + "epoch": 0.45413129380261497, + "grad_norm": 1.6755728721618652, + "learning_rate": 2.2074116633151395e-05, + "loss": 2.2908, + "step": 1597 + }, + { + "epoch": 0.4544156590460731, + "grad_norm": 1.6341913938522339, + "learning_rate": 2.206262568227521e-05, + "loss": 1.9315, + "step": 1598 + }, + { + "epoch": 0.4547000242895312, + "grad_norm": 1.8000038862228394, + "learning_rate": 2.2051134731399024e-05, + "loss": 1.8148, + "step": 1599 + }, + { + "epoch": 0.4549843895329893, + "grad_norm": 1.7715755701065063, + "learning_rate": 2.203964378052284e-05, + "loss": 1.9843, + "step": 1600 + }, + { + "epoch": 0.45526875477644746, + "grad_norm": 1.9607915878295898, + "learning_rate": 2.2028152829646653e-05, + "loss": 3.0164, + "step": 1601 + }, + { + "epoch": 0.45555312001990556, + "grad_norm": 1.5718859434127808, + "learning_rate": 2.2016661878770468e-05, + "loss": 2.5762, + "step": 1602 + }, + { + "epoch": 0.4558374852633637, + "grad_norm": 1.6624130010604858, + "learning_rate": 2.2005170927894285e-05, + "loss": 2.3932, + "step": 1603 + }, + { + "epoch": 0.4561218505068218, + "grad_norm": 1.6082323789596558, + "learning_rate": 2.19936799770181e-05, + "loss": 2.4261, + "step": 1604 + }, + { + "epoch": 0.4564062157502799, + "grad_norm": 1.7497981786727905, + "learning_rate": 2.1982189026141914e-05, + "loss": 2.3493, + "step": 1605 + }, + { + "epoch": 0.45669058099373805, + "grad_norm": 1.559987187385559, + "learning_rate": 2.197069807526573e-05, + "loss": 1.8154, + "step": 1606 + }, + { + "epoch": 0.45697494623719614, + "grad_norm": 1.724812388420105, + "learning_rate": 2.1959207124389543e-05, + "loss": 1.6993, + "step": 1607 + }, + { + "epoch": 0.4572593114806543, + "grad_norm": 1.6445400714874268, + "learning_rate": 2.1947716173513358e-05, + "loss": 1.7773, + "step": 1608 + }, + { + "epoch": 0.4575436767241124, + "grad_norm": 1.8340013027191162, + "learning_rate": 2.1936225222637172e-05, + "loss": 2.8774, + "step": 1609 + }, + { + "epoch": 0.4578280419675705, + "grad_norm": 1.6000295877456665, + "learning_rate": 2.1924734271760987e-05, + "loss": 2.6376, + "step": 1610 + }, + { + "epoch": 0.45811240721102864, + "grad_norm": 1.6122288703918457, + "learning_rate": 2.1913243320884808e-05, + "loss": 2.4566, + "step": 1611 + }, + { + "epoch": 0.45839677245448673, + "grad_norm": 1.62675940990448, + "learning_rate": 2.1901752370008623e-05, + "loss": 2.4088, + "step": 1612 + }, + { + "epoch": 0.4586811376979449, + "grad_norm": 1.7197331190109253, + "learning_rate": 2.1890261419132437e-05, + "loss": 2.2432, + "step": 1613 + }, + { + "epoch": 0.458965502941403, + "grad_norm": 1.5948972702026367, + "learning_rate": 2.187877046825625e-05, + "loss": 1.9617, + "step": 1614 + }, + { + "epoch": 0.45924986818486113, + "grad_norm": 1.689054012298584, + "learning_rate": 2.1867279517380066e-05, + "loss": 1.9137, + "step": 1615 + }, + { + "epoch": 0.4595342334283192, + "grad_norm": 1.829690933227539, + "learning_rate": 2.185578856650388e-05, + "loss": 1.8653, + "step": 1616 + }, + { + "epoch": 0.4598185986717773, + "grad_norm": 1.973512887954712, + "learning_rate": 2.1844297615627695e-05, + "loss": 2.8129, + "step": 1617 + }, + { + "epoch": 0.4601029639152355, + "grad_norm": 1.612326979637146, + "learning_rate": 2.183280666475151e-05, + "loss": 2.6233, + "step": 1618 + }, + { + "epoch": 0.46038732915869357, + "grad_norm": 1.473063588142395, + "learning_rate": 2.1821315713875328e-05, + "loss": 2.3022, + "step": 1619 + }, + { + "epoch": 0.4606716944021517, + "grad_norm": 1.552682876586914, + "learning_rate": 2.1809824762999142e-05, + "loss": 2.1985, + "step": 1620 + }, + { + "epoch": 0.4609560596456098, + "grad_norm": 1.7268232107162476, + "learning_rate": 2.1798333812122957e-05, + "loss": 2.2878, + "step": 1621 + }, + { + "epoch": 0.4612404248890679, + "grad_norm": 1.5425605773925781, + "learning_rate": 2.178684286124677e-05, + "loss": 1.8743, + "step": 1622 + }, + { + "epoch": 0.46152479013252606, + "grad_norm": 1.7569302320480347, + "learning_rate": 2.1775351910370586e-05, + "loss": 1.7704, + "step": 1623 + }, + { + "epoch": 0.46180915537598416, + "grad_norm": 1.6393110752105713, + "learning_rate": 2.17638609594944e-05, + "loss": 1.8674, + "step": 1624 + }, + { + "epoch": 0.4620935206194423, + "grad_norm": 1.9522907733917236, + "learning_rate": 2.1752370008618215e-05, + "loss": 2.9953, + "step": 1625 + }, + { + "epoch": 0.4623778858629004, + "grad_norm": 1.7476742267608643, + "learning_rate": 2.174087905774203e-05, + "loss": 2.4966, + "step": 1626 + }, + { + "epoch": 0.4626622511063585, + "grad_norm": 1.642991065979004, + "learning_rate": 2.1729388106865844e-05, + "loss": 2.4291, + "step": 1627 + }, + { + "epoch": 0.46294661634981665, + "grad_norm": 1.6043994426727295, + "learning_rate": 2.171789715598966e-05, + "loss": 2.1552, + "step": 1628 + }, + { + "epoch": 0.46323098159327475, + "grad_norm": 1.703909158706665, + "learning_rate": 2.1706406205113476e-05, + "loss": 2.2067, + "step": 1629 + }, + { + "epoch": 0.4635153468367329, + "grad_norm": 1.5352064371109009, + "learning_rate": 2.169491525423729e-05, + "loss": 2.0239, + "step": 1630 + }, + { + "epoch": 0.463799712080191, + "grad_norm": 1.6908349990844727, + "learning_rate": 2.1683424303361105e-05, + "loss": 1.9427, + "step": 1631 + }, + { + "epoch": 0.4640840773236491, + "grad_norm": 1.7095677852630615, + "learning_rate": 2.167193335248492e-05, + "loss": 1.7988, + "step": 1632 + }, + { + "epoch": 0.46436844256710724, + "grad_norm": 2.046193838119507, + "learning_rate": 2.1660442401608734e-05, + "loss": 3.045, + "step": 1633 + }, + { + "epoch": 0.46465280781056534, + "grad_norm": 1.6458848714828491, + "learning_rate": 2.164895145073255e-05, + "loss": 2.6204, + "step": 1634 + }, + { + "epoch": 0.4649371730540235, + "grad_norm": 1.6441450119018555, + "learning_rate": 2.1637460499856363e-05, + "loss": 2.482, + "step": 1635 + }, + { + "epoch": 0.4652215382974816, + "grad_norm": 1.6394556760787964, + "learning_rate": 2.1625969548980177e-05, + "loss": 2.2559, + "step": 1636 + }, + { + "epoch": 0.46550590354093974, + "grad_norm": 1.8199498653411865, + "learning_rate": 2.1614478598103995e-05, + "loss": 2.2969, + "step": 1637 + }, + { + "epoch": 0.46579026878439783, + "grad_norm": 1.5875297784805298, + "learning_rate": 2.160298764722781e-05, + "loss": 1.9558, + "step": 1638 + }, + { + "epoch": 0.4660746340278559, + "grad_norm": 1.6678173542022705, + "learning_rate": 2.1591496696351624e-05, + "loss": 1.9319, + "step": 1639 + }, + { + "epoch": 0.4663589992713141, + "grad_norm": 1.7809865474700928, + "learning_rate": 2.158000574547544e-05, + "loss": 1.8821, + "step": 1640 + }, + { + "epoch": 0.4666433645147722, + "grad_norm": 2.026603937149048, + "learning_rate": 2.1568514794599253e-05, + "loss": 2.7958, + "step": 1641 + }, + { + "epoch": 0.4669277297582303, + "grad_norm": 1.7228740453720093, + "learning_rate": 2.1557023843723068e-05, + "loss": 2.591, + "step": 1642 + }, + { + "epoch": 0.4672120950016884, + "grad_norm": 1.6805092096328735, + "learning_rate": 2.1545532892846882e-05, + "loss": 2.4039, + "step": 1643 + }, + { + "epoch": 0.4674964602451465, + "grad_norm": 1.605502724647522, + "learning_rate": 2.1534041941970697e-05, + "loss": 2.2221, + "step": 1644 + }, + { + "epoch": 0.46778082548860467, + "grad_norm": 1.750071406364441, + "learning_rate": 2.152255099109451e-05, + "loss": 2.2253, + "step": 1645 + }, + { + "epoch": 0.46806519073206276, + "grad_norm": 1.907105565071106, + "learning_rate": 2.151106004021833e-05, + "loss": 1.9662, + "step": 1646 + }, + { + "epoch": 0.4683495559755209, + "grad_norm": 1.6502389907836914, + "learning_rate": 2.1499569089342144e-05, + "loss": 1.8562, + "step": 1647 + }, + { + "epoch": 0.468633921218979, + "grad_norm": 1.72910737991333, + "learning_rate": 2.148807813846596e-05, + "loss": 1.8048, + "step": 1648 + }, + { + "epoch": 0.4689182864624371, + "grad_norm": 1.7769256830215454, + "learning_rate": 2.1476587187589776e-05, + "loss": 2.9131, + "step": 1649 + }, + { + "epoch": 0.46920265170589526, + "grad_norm": 1.667056918144226, + "learning_rate": 2.146509623671359e-05, + "loss": 2.3479, + "step": 1650 + }, + { + "epoch": 0.46948701694935335, + "grad_norm": 1.6939767599105835, + "learning_rate": 2.1453605285837405e-05, + "loss": 2.4278, + "step": 1651 + }, + { + "epoch": 0.4697713821928115, + "grad_norm": 1.723417043685913, + "learning_rate": 2.1442114334961223e-05, + "loss": 2.2906, + "step": 1652 + }, + { + "epoch": 0.4700557474362696, + "grad_norm": 1.7457711696624756, + "learning_rate": 2.1430623384085037e-05, + "loss": 2.1299, + "step": 1653 + }, + { + "epoch": 0.4703401126797277, + "grad_norm": 1.643261432647705, + "learning_rate": 2.1419132433208852e-05, + "loss": 1.99, + "step": 1654 + }, + { + "epoch": 0.47062447792318585, + "grad_norm": 1.7526366710662842, + "learning_rate": 2.1407641482332666e-05, + "loss": 1.8781, + "step": 1655 + }, + { + "epoch": 0.47090884316664394, + "grad_norm": 1.6480202674865723, + "learning_rate": 2.139615053145648e-05, + "loss": 1.7935, + "step": 1656 + }, + { + "epoch": 0.4711932084101021, + "grad_norm": 1.997900128364563, + "learning_rate": 2.1384659580580295e-05, + "loss": 3.0299, + "step": 1657 + }, + { + "epoch": 0.4714775736535602, + "grad_norm": 1.7013146877288818, + "learning_rate": 2.137316862970411e-05, + "loss": 2.548, + "step": 1658 + }, + { + "epoch": 0.4717619388970183, + "grad_norm": 1.5773371458053589, + "learning_rate": 2.1361677678827924e-05, + "loss": 2.2042, + "step": 1659 + }, + { + "epoch": 0.47204630414047644, + "grad_norm": 1.5602200031280518, + "learning_rate": 2.135018672795174e-05, + "loss": 2.1936, + "step": 1660 + }, + { + "epoch": 0.47233066938393453, + "grad_norm": 1.7694247961044312, + "learning_rate": 2.1338695777075557e-05, + "loss": 2.2457, + "step": 1661 + }, + { + "epoch": 0.4726150346273927, + "grad_norm": 1.5969568490982056, + "learning_rate": 2.132720482619937e-05, + "loss": 2.0625, + "step": 1662 + }, + { + "epoch": 0.4728993998708508, + "grad_norm": 1.6549208164215088, + "learning_rate": 2.1315713875323186e-05, + "loss": 1.9321, + "step": 1663 + }, + { + "epoch": 0.47318376511430893, + "grad_norm": 1.806251049041748, + "learning_rate": 2.1304222924447e-05, + "loss": 2.003, + "step": 1664 + }, + { + "epoch": 0.473468130357767, + "grad_norm": 2.0554563999176025, + "learning_rate": 2.1292731973570815e-05, + "loss": 2.967, + "step": 1665 + }, + { + "epoch": 0.4737524956012251, + "grad_norm": 1.6516426801681519, + "learning_rate": 2.128124102269463e-05, + "loss": 2.5175, + "step": 1666 + }, + { + "epoch": 0.47403686084468327, + "grad_norm": 1.5493863821029663, + "learning_rate": 2.1269750071818444e-05, + "loss": 2.385, + "step": 1667 + }, + { + "epoch": 0.47432122608814137, + "grad_norm": 1.759292483329773, + "learning_rate": 2.1258259120942258e-05, + "loss": 2.3617, + "step": 1668 + }, + { + "epoch": 0.4746055913315995, + "grad_norm": 1.6405413150787354, + "learning_rate": 2.1246768170066073e-05, + "loss": 2.2015, + "step": 1669 + }, + { + "epoch": 0.4748899565750576, + "grad_norm": 1.686164140701294, + "learning_rate": 2.123527721918989e-05, + "loss": 2.0404, + "step": 1670 + }, + { + "epoch": 0.4751743218185157, + "grad_norm": 1.6657214164733887, + "learning_rate": 2.1223786268313705e-05, + "loss": 1.705, + "step": 1671 + }, + { + "epoch": 0.47545868706197386, + "grad_norm": 1.5510362386703491, + "learning_rate": 2.121229531743752e-05, + "loss": 1.8379, + "step": 1672 + }, + { + "epoch": 0.47574305230543196, + "grad_norm": 1.9575543403625488, + "learning_rate": 2.1200804366561334e-05, + "loss": 2.9591, + "step": 1673 + }, + { + "epoch": 0.4760274175488901, + "grad_norm": 1.660929799079895, + "learning_rate": 2.118931341568515e-05, + "loss": 2.4613, + "step": 1674 + }, + { + "epoch": 0.4763117827923482, + "grad_norm": 1.6097886562347412, + "learning_rate": 2.1177822464808963e-05, + "loss": 2.3457, + "step": 1675 + }, + { + "epoch": 0.4765961480358063, + "grad_norm": 1.6404685974121094, + "learning_rate": 2.1166331513932778e-05, + "loss": 2.2377, + "step": 1676 + }, + { + "epoch": 0.47688051327926445, + "grad_norm": 1.6690386533737183, + "learning_rate": 2.1154840563056592e-05, + "loss": 2.088, + "step": 1677 + }, + { + "epoch": 0.47716487852272255, + "grad_norm": 1.7172131538391113, + "learning_rate": 2.1143349612180407e-05, + "loss": 2.1428, + "step": 1678 + }, + { + "epoch": 0.4774492437661807, + "grad_norm": 1.645805835723877, + "learning_rate": 2.1131858661304225e-05, + "loss": 1.7262, + "step": 1679 + }, + { + "epoch": 0.4777336090096388, + "grad_norm": 1.6956250667572021, + "learning_rate": 2.112036771042804e-05, + "loss": 1.7153, + "step": 1680 + }, + { + "epoch": 0.4780179742530969, + "grad_norm": 1.8658703565597534, + "learning_rate": 2.1108876759551854e-05, + "loss": 2.9007, + "step": 1681 + }, + { + "epoch": 0.47830233949655504, + "grad_norm": 1.5524179935455322, + "learning_rate": 2.1097385808675668e-05, + "loss": 2.5446, + "step": 1682 + }, + { + "epoch": 0.47858670474001314, + "grad_norm": 1.4524405002593994, + "learning_rate": 2.1085894857799483e-05, + "loss": 2.2187, + "step": 1683 + }, + { + "epoch": 0.4788710699834713, + "grad_norm": 1.5633457899093628, + "learning_rate": 2.1074403906923297e-05, + "loss": 2.3211, + "step": 1684 + }, + { + "epoch": 0.4791554352269294, + "grad_norm": 1.6988346576690674, + "learning_rate": 2.1062912956047118e-05, + "loss": 2.3174, + "step": 1685 + }, + { + "epoch": 0.47943980047038753, + "grad_norm": 1.747249722480774, + "learning_rate": 2.1051422005170933e-05, + "loss": 1.9955, + "step": 1686 + }, + { + "epoch": 0.47972416571384563, + "grad_norm": 1.5953621864318848, + "learning_rate": 2.1039931054294747e-05, + "loss": 1.939, + "step": 1687 + }, + { + "epoch": 0.4800085309573037, + "grad_norm": 1.6366169452667236, + "learning_rate": 2.1028440103418562e-05, + "loss": 1.7721, + "step": 1688 + }, + { + "epoch": 0.4802928962007619, + "grad_norm": 1.9023154973983765, + "learning_rate": 2.1016949152542376e-05, + "loss": 2.8568, + "step": 1689 + }, + { + "epoch": 0.48057726144421997, + "grad_norm": 1.4789276123046875, + "learning_rate": 2.100545820166619e-05, + "loss": 2.4112, + "step": 1690 + }, + { + "epoch": 0.4808616266876781, + "grad_norm": 1.5606416463851929, + "learning_rate": 2.0993967250790005e-05, + "loss": 2.2638, + "step": 1691 + }, + { + "epoch": 0.4811459919311362, + "grad_norm": 1.5675312280654907, + "learning_rate": 2.098247629991382e-05, + "loss": 2.3888, + "step": 1692 + }, + { + "epoch": 0.4814303571745943, + "grad_norm": 1.6891237497329712, + "learning_rate": 2.0970985349037634e-05, + "loss": 2.1829, + "step": 1693 + }, + { + "epoch": 0.48171472241805247, + "grad_norm": 1.6167731285095215, + "learning_rate": 2.0959494398161452e-05, + "loss": 1.9001, + "step": 1694 + }, + { + "epoch": 0.48199908766151056, + "grad_norm": 1.6935869455337524, + "learning_rate": 2.0948003447285267e-05, + "loss": 1.8639, + "step": 1695 + }, + { + "epoch": 0.4822834529049687, + "grad_norm": 1.6257230043411255, + "learning_rate": 2.093651249640908e-05, + "loss": 1.78, + "step": 1696 + }, + { + "epoch": 0.4825678181484268, + "grad_norm": 2.0507266521453857, + "learning_rate": 2.0925021545532896e-05, + "loss": 2.8274, + "step": 1697 + }, + { + "epoch": 0.4828521833918849, + "grad_norm": 1.5421440601348877, + "learning_rate": 2.091353059465671e-05, + "loss": 2.5799, + "step": 1698 + }, + { + "epoch": 0.48313654863534305, + "grad_norm": 1.5370677709579468, + "learning_rate": 2.0902039643780525e-05, + "loss": 2.2449, + "step": 1699 + }, + { + "epoch": 0.48342091387880115, + "grad_norm": 1.6264358758926392, + "learning_rate": 2.089054869290434e-05, + "loss": 2.3207, + "step": 1700 + }, + { + "epoch": 0.4837052791222593, + "grad_norm": 1.8033897876739502, + "learning_rate": 2.0879057742028154e-05, + "loss": 2.2928, + "step": 1701 + }, + { + "epoch": 0.4839896443657174, + "grad_norm": 1.610359787940979, + "learning_rate": 2.0867566791151968e-05, + "loss": 1.896, + "step": 1702 + }, + { + "epoch": 0.4842740096091755, + "grad_norm": 1.8859881162643433, + "learning_rate": 2.0856075840275786e-05, + "loss": 1.648, + "step": 1703 + }, + { + "epoch": 0.48455837485263364, + "grad_norm": 1.7698290348052979, + "learning_rate": 2.08445848893996e-05, + "loss": 1.6407, + "step": 1704 + }, + { + "epoch": 0.48484274009609174, + "grad_norm": 1.8778108358383179, + "learning_rate": 2.0833093938523415e-05, + "loss": 2.8316, + "step": 1705 + }, + { + "epoch": 0.4851271053395499, + "grad_norm": 1.6026455163955688, + "learning_rate": 2.082160298764723e-05, + "loss": 2.4976, + "step": 1706 + }, + { + "epoch": 0.485411470583008, + "grad_norm": 1.5125036239624023, + "learning_rate": 2.0810112036771044e-05, + "loss": 2.2848, + "step": 1707 + }, + { + "epoch": 0.4856958358264661, + "grad_norm": 1.6800127029418945, + "learning_rate": 2.079862108589486e-05, + "loss": 2.1576, + "step": 1708 + }, + { + "epoch": 0.48598020106992423, + "grad_norm": 1.6857738494873047, + "learning_rate": 2.0787130135018673e-05, + "loss": 2.1469, + "step": 1709 + }, + { + "epoch": 0.48626456631338233, + "grad_norm": 1.5208419561386108, + "learning_rate": 2.0775639184142487e-05, + "loss": 1.9286, + "step": 1710 + }, + { + "epoch": 0.4865489315568405, + "grad_norm": 1.5637043714523315, + "learning_rate": 2.0764148233266302e-05, + "loss": 1.7151, + "step": 1711 + }, + { + "epoch": 0.4868332968002986, + "grad_norm": 1.6756120920181274, + "learning_rate": 2.075265728239012e-05, + "loss": 1.8036, + "step": 1712 + }, + { + "epoch": 0.4871176620437567, + "grad_norm": 1.8630014657974243, + "learning_rate": 2.0741166331513934e-05, + "loss": 2.9317, + "step": 1713 + }, + { + "epoch": 0.4874020272872148, + "grad_norm": 1.732492208480835, + "learning_rate": 2.072967538063775e-05, + "loss": 2.4387, + "step": 1714 + }, + { + "epoch": 0.4876863925306729, + "grad_norm": 1.6385549306869507, + "learning_rate": 2.0718184429761563e-05, + "loss": 2.1674, + "step": 1715 + }, + { + "epoch": 0.48797075777413107, + "grad_norm": 1.5367069244384766, + "learning_rate": 2.0706693478885378e-05, + "loss": 2.2549, + "step": 1716 + }, + { + "epoch": 0.48825512301758917, + "grad_norm": 1.765271782875061, + "learning_rate": 2.0695202528009192e-05, + "loss": 2.2325, + "step": 1717 + }, + { + "epoch": 0.4885394882610473, + "grad_norm": 1.6616159677505493, + "learning_rate": 2.0683711577133007e-05, + "loss": 1.806, + "step": 1718 + }, + { + "epoch": 0.4888238535045054, + "grad_norm": 1.7250800132751465, + "learning_rate": 2.067222062625682e-05, + "loss": 1.8495, + "step": 1719 + }, + { + "epoch": 0.4891082187479635, + "grad_norm": 1.7794257402420044, + "learning_rate": 2.0660729675380636e-05, + "loss": 1.7158, + "step": 1720 + }, + { + "epoch": 0.48939258399142166, + "grad_norm": 1.9646224975585938, + "learning_rate": 2.0649238724504454e-05, + "loss": 2.9298, + "step": 1721 + }, + { + "epoch": 0.48967694923487975, + "grad_norm": 1.6093361377716064, + "learning_rate": 2.063774777362827e-05, + "loss": 2.4871, + "step": 1722 + }, + { + "epoch": 0.4899613144783379, + "grad_norm": 1.5131632089614868, + "learning_rate": 2.0626256822752086e-05, + "loss": 2.2751, + "step": 1723 + }, + { + "epoch": 0.490245679721796, + "grad_norm": 1.5674437284469604, + "learning_rate": 2.06147658718759e-05, + "loss": 2.2159, + "step": 1724 + }, + { + "epoch": 0.4905300449652541, + "grad_norm": 1.5765401124954224, + "learning_rate": 2.0603274920999715e-05, + "loss": 2.2252, + "step": 1725 + }, + { + "epoch": 0.49081441020871225, + "grad_norm": 1.6332764625549316, + "learning_rate": 2.059178397012353e-05, + "loss": 1.9848, + "step": 1726 + }, + { + "epoch": 0.49109877545217034, + "grad_norm": 1.555993914604187, + "learning_rate": 2.0580293019247347e-05, + "loss": 1.8058, + "step": 1727 + }, + { + "epoch": 0.4913831406956285, + "grad_norm": 1.792938470840454, + "learning_rate": 2.0568802068371162e-05, + "loss": 1.7801, + "step": 1728 + }, + { + "epoch": 0.4916675059390866, + "grad_norm": 1.881766676902771, + "learning_rate": 2.0557311117494976e-05, + "loss": 2.734, + "step": 1729 + }, + { + "epoch": 0.4919518711825447, + "grad_norm": 1.5868914127349854, + "learning_rate": 2.054582016661879e-05, + "loss": 2.5361, + "step": 1730 + }, + { + "epoch": 0.49223623642600284, + "grad_norm": 1.59299898147583, + "learning_rate": 2.0534329215742605e-05, + "loss": 2.3963, + "step": 1731 + }, + { + "epoch": 0.49252060166946093, + "grad_norm": 1.7048550844192505, + "learning_rate": 2.052283826486642e-05, + "loss": 2.1933, + "step": 1732 + }, + { + "epoch": 0.4928049669129191, + "grad_norm": 1.5875484943389893, + "learning_rate": 2.0511347313990234e-05, + "loss": 2.2065, + "step": 1733 + }, + { + "epoch": 0.4930893321563772, + "grad_norm": 1.7140761613845825, + "learning_rate": 2.049985636311405e-05, + "loss": 1.912, + "step": 1734 + }, + { + "epoch": 0.49337369739983533, + "grad_norm": 1.6499665975570679, + "learning_rate": 2.0488365412237863e-05, + "loss": 1.9668, + "step": 1735 + }, + { + "epoch": 0.4936580626432934, + "grad_norm": 1.7199233770370483, + "learning_rate": 2.047687446136168e-05, + "loss": 1.7098, + "step": 1736 + }, + { + "epoch": 0.4939424278867515, + "grad_norm": 1.7340093851089478, + "learning_rate": 2.0465383510485496e-05, + "loss": 2.9317, + "step": 1737 + }, + { + "epoch": 0.4942267931302097, + "grad_norm": 1.4972059726715088, + "learning_rate": 2.045389255960931e-05, + "loss": 2.4092, + "step": 1738 + }, + { + "epoch": 0.49451115837366777, + "grad_norm": 1.5227432250976562, + "learning_rate": 2.0442401608733125e-05, + "loss": 2.3194, + "step": 1739 + }, + { + "epoch": 0.4947955236171259, + "grad_norm": 1.5438145399093628, + "learning_rate": 2.043091065785694e-05, + "loss": 2.4278, + "step": 1740 + }, + { + "epoch": 0.495079888860584, + "grad_norm": 1.767443060874939, + "learning_rate": 2.0419419706980754e-05, + "loss": 2.2614, + "step": 1741 + }, + { + "epoch": 0.4953642541040421, + "grad_norm": 1.471556544303894, + "learning_rate": 2.040792875610457e-05, + "loss": 1.8055, + "step": 1742 + }, + { + "epoch": 0.49564861934750026, + "grad_norm": 1.977705478668213, + "learning_rate": 2.0396437805228383e-05, + "loss": 1.8515, + "step": 1743 + }, + { + "epoch": 0.49593298459095836, + "grad_norm": 1.7835551500320435, + "learning_rate": 2.0384946854352197e-05, + "loss": 1.8965, + "step": 1744 + }, + { + "epoch": 0.4962173498344165, + "grad_norm": 1.7893133163452148, + "learning_rate": 2.0373455903476015e-05, + "loss": 2.6412, + "step": 1745 + }, + { + "epoch": 0.4965017150778746, + "grad_norm": 1.5060399770736694, + "learning_rate": 2.036196495259983e-05, + "loss": 2.176, + "step": 1746 + }, + { + "epoch": 0.4967860803213327, + "grad_norm": 1.6172282695770264, + "learning_rate": 2.0350474001723644e-05, + "loss": 2.2377, + "step": 1747 + }, + { + "epoch": 0.49707044556479085, + "grad_norm": 1.5684025287628174, + "learning_rate": 2.033898305084746e-05, + "loss": 2.0984, + "step": 1748 + }, + { + "epoch": 0.49735481080824895, + "grad_norm": 1.6685534715652466, + "learning_rate": 2.0327492099971273e-05, + "loss": 2.2496, + "step": 1749 + }, + { + "epoch": 0.4976391760517071, + "grad_norm": 1.6324350833892822, + "learning_rate": 2.0316001149095088e-05, + "loss": 1.854, + "step": 1750 + }, + { + "epoch": 0.4979235412951652, + "grad_norm": 1.642563819885254, + "learning_rate": 2.0304510198218902e-05, + "loss": 1.8248, + "step": 1751 + }, + { + "epoch": 0.4982079065386233, + "grad_norm": 1.6866960525512695, + "learning_rate": 2.0293019247342717e-05, + "loss": 2.0292, + "step": 1752 + }, + { + "epoch": 0.49849227178208144, + "grad_norm": 1.9261857271194458, + "learning_rate": 2.028152829646653e-05, + "loss": 2.8267, + "step": 1753 + }, + { + "epoch": 0.49877663702553954, + "grad_norm": 1.5823674201965332, + "learning_rate": 2.027003734559035e-05, + "loss": 2.3225, + "step": 1754 + }, + { + "epoch": 0.4990610022689977, + "grad_norm": 1.5094738006591797, + "learning_rate": 2.0258546394714164e-05, + "loss": 2.2377, + "step": 1755 + }, + { + "epoch": 0.4993453675124558, + "grad_norm": 1.5388329029083252, + "learning_rate": 2.0247055443837978e-05, + "loss": 2.4092, + "step": 1756 + }, + { + "epoch": 0.49962973275591394, + "grad_norm": 1.5663154125213623, + "learning_rate": 2.0235564492961793e-05, + "loss": 2.1407, + "step": 1757 + }, + { + "epoch": 0.49991409799937203, + "grad_norm": 1.633455753326416, + "learning_rate": 2.0224073542085607e-05, + "loss": 2.0007, + "step": 1758 + }, + { + "epoch": 0.5001984632428301, + "grad_norm": 1.7223169803619385, + "learning_rate": 2.0212582591209425e-05, + "loss": 2.0154, + "step": 1759 + }, + { + "epoch": 0.5004828284862882, + "grad_norm": 1.8982787132263184, + "learning_rate": 2.0201091640333243e-05, + "loss": 1.9042, + "step": 1760 + }, + { + "epoch": 0.5007671937297464, + "grad_norm": 1.855883240699768, + "learning_rate": 2.0189600689457057e-05, + "loss": 2.6409, + "step": 1761 + }, + { + "epoch": 0.5010515589732045, + "grad_norm": 1.5711190700531006, + "learning_rate": 2.0178109738580872e-05, + "loss": 2.5989, + "step": 1762 + }, + { + "epoch": 0.5013359242166626, + "grad_norm": 1.565077781677246, + "learning_rate": 2.0166618787704686e-05, + "loss": 2.5103, + "step": 1763 + }, + { + "epoch": 0.5016202894601207, + "grad_norm": 1.5790541172027588, + "learning_rate": 2.01551278368285e-05, + "loss": 2.2145, + "step": 1764 + }, + { + "epoch": 0.5019046547035788, + "grad_norm": 1.712709665298462, + "learning_rate": 2.0143636885952315e-05, + "loss": 2.2995, + "step": 1765 + }, + { + "epoch": 0.502189019947037, + "grad_norm": 1.6883087158203125, + "learning_rate": 2.013214593507613e-05, + "loss": 1.8596, + "step": 1766 + }, + { + "epoch": 0.5024733851904951, + "grad_norm": 1.6610372066497803, + "learning_rate": 2.0120654984199944e-05, + "loss": 1.9617, + "step": 1767 + }, + { + "epoch": 0.5027577504339532, + "grad_norm": 1.5222752094268799, + "learning_rate": 2.010916403332376e-05, + "loss": 1.8691, + "step": 1768 + }, + { + "epoch": 0.5030421156774113, + "grad_norm": 1.8320504426956177, + "learning_rate": 2.0097673082447577e-05, + "loss": 2.722, + "step": 1769 + }, + { + "epoch": 0.5033264809208694, + "grad_norm": 1.5912829637527466, + "learning_rate": 2.008618213157139e-05, + "loss": 2.4662, + "step": 1770 + }, + { + "epoch": 0.5036108461643276, + "grad_norm": 1.522308588027954, + "learning_rate": 2.0074691180695206e-05, + "loss": 2.3028, + "step": 1771 + }, + { + "epoch": 0.5038952114077857, + "grad_norm": 1.5681263208389282, + "learning_rate": 2.006320022981902e-05, + "loss": 2.2298, + "step": 1772 + }, + { + "epoch": 0.5041795766512438, + "grad_norm": 1.6516104936599731, + "learning_rate": 2.0051709278942835e-05, + "loss": 2.2985, + "step": 1773 + }, + { + "epoch": 0.5044639418947019, + "grad_norm": 1.5645509958267212, + "learning_rate": 2.004021832806665e-05, + "loss": 1.942, + "step": 1774 + }, + { + "epoch": 0.50474830713816, + "grad_norm": 1.656414270401001, + "learning_rate": 2.0028727377190464e-05, + "loss": 1.7982, + "step": 1775 + }, + { + "epoch": 0.5050326723816182, + "grad_norm": 1.6322808265686035, + "learning_rate": 2.0017236426314278e-05, + "loss": 1.9504, + "step": 1776 + }, + { + "epoch": 0.5053170376250763, + "grad_norm": 1.7643145322799683, + "learning_rate": 2.0005745475438093e-05, + "loss": 2.765, + "step": 1777 + }, + { + "epoch": 0.5056014028685344, + "grad_norm": 1.471491813659668, + "learning_rate": 1.999425452456191e-05, + "loss": 2.2441, + "step": 1778 + }, + { + "epoch": 0.5058857681119925, + "grad_norm": 1.5488718748092651, + "learning_rate": 1.9982763573685725e-05, + "loss": 2.3885, + "step": 1779 + }, + { + "epoch": 0.5061701333554506, + "grad_norm": 1.5598701238632202, + "learning_rate": 1.997127262280954e-05, + "loss": 2.3161, + "step": 1780 + }, + { + "epoch": 0.5064544985989088, + "grad_norm": 1.5675969123840332, + "learning_rate": 1.9959781671933354e-05, + "loss": 2.3845, + "step": 1781 + }, + { + "epoch": 0.5067388638423669, + "grad_norm": 1.645579218864441, + "learning_rate": 1.994829072105717e-05, + "loss": 1.949, + "step": 1782 + }, + { + "epoch": 0.507023229085825, + "grad_norm": 1.514928936958313, + "learning_rate": 1.9936799770180983e-05, + "loss": 1.8749, + "step": 1783 + }, + { + "epoch": 0.5073075943292831, + "grad_norm": 1.7080934047698975, + "learning_rate": 1.9925308819304798e-05, + "loss": 1.6213, + "step": 1784 + }, + { + "epoch": 0.5075919595727412, + "grad_norm": 2.008892297744751, + "learning_rate": 1.9913817868428612e-05, + "loss": 2.8028, + "step": 1785 + }, + { + "epoch": 0.5078763248161994, + "grad_norm": 1.6706644296646118, + "learning_rate": 1.9902326917552427e-05, + "loss": 2.1027, + "step": 1786 + }, + { + "epoch": 0.5081606900596575, + "grad_norm": 1.5543371438980103, + "learning_rate": 1.9890835966676244e-05, + "loss": 2.4243, + "step": 1787 + }, + { + "epoch": 0.5084450553031156, + "grad_norm": 1.555815577507019, + "learning_rate": 1.987934501580006e-05, + "loss": 2.3077, + "step": 1788 + }, + { + "epoch": 0.5087294205465737, + "grad_norm": 1.6928585767745972, + "learning_rate": 1.9867854064923873e-05, + "loss": 2.0664, + "step": 1789 + }, + { + "epoch": 0.5090137857900318, + "grad_norm": 1.5728840827941895, + "learning_rate": 1.985636311404769e-05, + "loss": 2.0858, + "step": 1790 + }, + { + "epoch": 0.50929815103349, + "grad_norm": 1.7079205513000488, + "learning_rate": 1.9844872163171506e-05, + "loss": 1.8084, + "step": 1791 + }, + { + "epoch": 0.5095825162769481, + "grad_norm": 1.6424134969711304, + "learning_rate": 1.983338121229532e-05, + "loss": 1.7926, + "step": 1792 + }, + { + "epoch": 0.5098668815204062, + "grad_norm": 1.9183549880981445, + "learning_rate": 1.9821890261419135e-05, + "loss": 2.7518, + "step": 1793 + }, + { + "epoch": 0.5101512467638643, + "grad_norm": 1.5652717351913452, + "learning_rate": 1.981039931054295e-05, + "loss": 2.2392, + "step": 1794 + }, + { + "epoch": 0.5104356120073225, + "grad_norm": 1.5693566799163818, + "learning_rate": 1.9798908359666764e-05, + "loss": 2.5368, + "step": 1795 + }, + { + "epoch": 0.5107199772507806, + "grad_norm": 1.57747220993042, + "learning_rate": 1.978741740879058e-05, + "loss": 2.2448, + "step": 1796 + }, + { + "epoch": 0.5110043424942387, + "grad_norm": 1.6220982074737549, + "learning_rate": 1.9775926457914393e-05, + "loss": 1.9995, + "step": 1797 + }, + { + "epoch": 0.5112887077376967, + "grad_norm": 1.5088688135147095, + "learning_rate": 1.9764435507038207e-05, + "loss": 1.9295, + "step": 1798 + }, + { + "epoch": 0.5115730729811548, + "grad_norm": 1.5756409168243408, + "learning_rate": 1.9752944556162025e-05, + "loss": 1.8758, + "step": 1799 + }, + { + "epoch": 0.511857438224613, + "grad_norm": 1.6049522161483765, + "learning_rate": 1.974145360528584e-05, + "loss": 1.6545, + "step": 1800 + }, + { + "epoch": 0.5121418034680711, + "grad_norm": 1.9730896949768066, + "learning_rate": 1.9729962654409654e-05, + "loss": 2.7769, + "step": 1801 + }, + { + "epoch": 0.5124261687115292, + "grad_norm": 1.4819504022598267, + "learning_rate": 1.971847170353347e-05, + "loss": 2.2833, + "step": 1802 + }, + { + "epoch": 0.5127105339549873, + "grad_norm": 1.6456480026245117, + "learning_rate": 1.9706980752657283e-05, + "loss": 2.3185, + "step": 1803 + }, + { + "epoch": 0.5129948991984454, + "grad_norm": 1.5290210247039795, + "learning_rate": 1.9695489801781098e-05, + "loss": 2.4907, + "step": 1804 + }, + { + "epoch": 0.5132792644419036, + "grad_norm": 1.5902469158172607, + "learning_rate": 1.9683998850904912e-05, + "loss": 2.2612, + "step": 1805 + }, + { + "epoch": 0.5135636296853617, + "grad_norm": 1.622529149055481, + "learning_rate": 1.967250790002873e-05, + "loss": 1.9592, + "step": 1806 + }, + { + "epoch": 0.5138479949288198, + "grad_norm": 1.5335959196090698, + "learning_rate": 1.9661016949152545e-05, + "loss": 1.7958, + "step": 1807 + }, + { + "epoch": 0.5141323601722779, + "grad_norm": 1.5710722208023071, + "learning_rate": 1.964952599827636e-05, + "loss": 1.7861, + "step": 1808 + }, + { + "epoch": 0.514416725415736, + "grad_norm": 1.9415019750595093, + "learning_rate": 1.9638035047400174e-05, + "loss": 2.8067, + "step": 1809 + }, + { + "epoch": 0.5147010906591942, + "grad_norm": 1.686766266822815, + "learning_rate": 1.9626544096523988e-05, + "loss": 2.4224, + "step": 1810 + }, + { + "epoch": 0.5149854559026523, + "grad_norm": 1.6373934745788574, + "learning_rate": 1.9615053145647806e-05, + "loss": 2.3926, + "step": 1811 + }, + { + "epoch": 0.5152698211461104, + "grad_norm": 1.5860379934310913, + "learning_rate": 1.960356219477162e-05, + "loss": 2.213, + "step": 1812 + }, + { + "epoch": 0.5155541863895685, + "grad_norm": 1.5629751682281494, + "learning_rate": 1.9592071243895435e-05, + "loss": 2.2401, + "step": 1813 + }, + { + "epoch": 0.5158385516330266, + "grad_norm": 1.6226354837417603, + "learning_rate": 1.958058029301925e-05, + "loss": 1.8855, + "step": 1814 + }, + { + "epoch": 0.5161229168764848, + "grad_norm": 1.6399784088134766, + "learning_rate": 1.9569089342143064e-05, + "loss": 1.9354, + "step": 1815 + }, + { + "epoch": 0.5164072821199429, + "grad_norm": 1.8511910438537598, + "learning_rate": 1.955759839126688e-05, + "loss": 1.7383, + "step": 1816 + }, + { + "epoch": 0.516691647363401, + "grad_norm": 1.8263779878616333, + "learning_rate": 1.9546107440390693e-05, + "loss": 2.7295, + "step": 1817 + }, + { + "epoch": 0.5169760126068591, + "grad_norm": 1.721635341644287, + "learning_rate": 1.9534616489514507e-05, + "loss": 2.4588, + "step": 1818 + }, + { + "epoch": 0.5172603778503172, + "grad_norm": 1.6186027526855469, + "learning_rate": 1.9523125538638322e-05, + "loss": 2.4666, + "step": 1819 + }, + { + "epoch": 0.5175447430937754, + "grad_norm": 1.6999125480651855, + "learning_rate": 1.951163458776214e-05, + "loss": 2.21, + "step": 1820 + }, + { + "epoch": 0.5178291083372335, + "grad_norm": 1.6916403770446777, + "learning_rate": 1.9500143636885954e-05, + "loss": 2.037, + "step": 1821 + }, + { + "epoch": 0.5181134735806916, + "grad_norm": 1.541050672531128, + "learning_rate": 1.948865268600977e-05, + "loss": 1.8888, + "step": 1822 + }, + { + "epoch": 0.5183978388241497, + "grad_norm": 1.5618538856506348, + "learning_rate": 1.9477161735133583e-05, + "loss": 1.8478, + "step": 1823 + }, + { + "epoch": 0.5186822040676078, + "grad_norm": 1.6209863424301147, + "learning_rate": 1.94656707842574e-05, + "loss": 1.6114, + "step": 1824 + }, + { + "epoch": 0.518966569311066, + "grad_norm": 1.7759110927581787, + "learning_rate": 1.9454179833381216e-05, + "loss": 2.6206, + "step": 1825 + }, + { + "epoch": 0.5192509345545241, + "grad_norm": 1.5494519472122192, + "learning_rate": 1.944268888250503e-05, + "loss": 2.3457, + "step": 1826 + }, + { + "epoch": 0.5195352997979822, + "grad_norm": 1.6379374265670776, + "learning_rate": 1.9431197931628845e-05, + "loss": 2.581, + "step": 1827 + }, + { + "epoch": 0.5198196650414403, + "grad_norm": 1.5918887853622437, + "learning_rate": 1.941970698075266e-05, + "loss": 2.1037, + "step": 1828 + }, + { + "epoch": 0.5201040302848984, + "grad_norm": 1.5805288553237915, + "learning_rate": 1.9408216029876474e-05, + "loss": 2.2555, + "step": 1829 + }, + { + "epoch": 0.5203883955283566, + "grad_norm": 1.653382420539856, + "learning_rate": 1.9396725079000288e-05, + "loss": 1.7606, + "step": 1830 + }, + { + "epoch": 0.5206727607718147, + "grad_norm": 1.5923447608947754, + "learning_rate": 1.9385234128124103e-05, + "loss": 1.6076, + "step": 1831 + }, + { + "epoch": 0.5209571260152728, + "grad_norm": 1.7855467796325684, + "learning_rate": 1.937374317724792e-05, + "loss": 1.8589, + "step": 1832 + }, + { + "epoch": 0.5212414912587309, + "grad_norm": 1.8567280769348145, + "learning_rate": 1.9362252226371735e-05, + "loss": 2.7604, + "step": 1833 + }, + { + "epoch": 0.521525856502189, + "grad_norm": 1.6221263408660889, + "learning_rate": 1.935076127549555e-05, + "loss": 2.4281, + "step": 1834 + }, + { + "epoch": 0.5218102217456472, + "grad_norm": 1.6806731224060059, + "learning_rate": 1.9339270324619364e-05, + "loss": 2.4162, + "step": 1835 + }, + { + "epoch": 0.5220945869891053, + "grad_norm": 1.55227792263031, + "learning_rate": 1.932777937374318e-05, + "loss": 2.0863, + "step": 1836 + }, + { + "epoch": 0.5223789522325634, + "grad_norm": 1.5924606323242188, + "learning_rate": 1.9316288422866993e-05, + "loss": 2.004, + "step": 1837 + }, + { + "epoch": 0.5226633174760215, + "grad_norm": 1.6249966621398926, + "learning_rate": 1.9304797471990808e-05, + "loss": 2.0434, + "step": 1838 + }, + { + "epoch": 0.5229476827194796, + "grad_norm": 1.702721357345581, + "learning_rate": 1.9293306521114622e-05, + "loss": 1.8394, + "step": 1839 + }, + { + "epoch": 0.5232320479629378, + "grad_norm": 1.7455133199691772, + "learning_rate": 1.9281815570238437e-05, + "loss": 1.5562, + "step": 1840 + }, + { + "epoch": 0.5235164132063959, + "grad_norm": 1.8886913061141968, + "learning_rate": 1.9270324619362254e-05, + "loss": 2.6912, + "step": 1841 + }, + { + "epoch": 0.523800778449854, + "grad_norm": 1.5527037382125854, + "learning_rate": 1.925883366848607e-05, + "loss": 2.4175, + "step": 1842 + }, + { + "epoch": 0.524085143693312, + "grad_norm": 1.5094817876815796, + "learning_rate": 1.9247342717609883e-05, + "loss": 2.299, + "step": 1843 + }, + { + "epoch": 0.5243695089367703, + "grad_norm": 1.5052844285964966, + "learning_rate": 1.92358517667337e-05, + "loss": 2.3148, + "step": 1844 + }, + { + "epoch": 0.5246538741802284, + "grad_norm": 1.5606803894042969, + "learning_rate": 1.9224360815857516e-05, + "loss": 2.2779, + "step": 1845 + }, + { + "epoch": 0.5249382394236864, + "grad_norm": 1.6374552249908447, + "learning_rate": 1.921286986498133e-05, + "loss": 1.9273, + "step": 1846 + }, + { + "epoch": 0.5252226046671445, + "grad_norm": 1.6898576021194458, + "learning_rate": 1.9201378914105145e-05, + "loss": 1.8248, + "step": 1847 + }, + { + "epoch": 0.5255069699106026, + "grad_norm": 1.6066820621490479, + "learning_rate": 1.918988796322896e-05, + "loss": 1.8573, + "step": 1848 + }, + { + "epoch": 0.5257913351540608, + "grad_norm": 1.8908296823501587, + "learning_rate": 1.9178397012352774e-05, + "loss": 2.9586, + "step": 1849 + }, + { + "epoch": 0.5260757003975189, + "grad_norm": 1.7448999881744385, + "learning_rate": 1.9166906061476588e-05, + "loss": 2.5243, + "step": 1850 + }, + { + "epoch": 0.526360065640977, + "grad_norm": 1.5253217220306396, + "learning_rate": 1.9155415110600403e-05, + "loss": 2.3295, + "step": 1851 + }, + { + "epoch": 0.5266444308844351, + "grad_norm": 1.5445467233657837, + "learning_rate": 1.9143924159724217e-05, + "loss": 2.3908, + "step": 1852 + }, + { + "epoch": 0.5269287961278932, + "grad_norm": 1.622870922088623, + "learning_rate": 1.9132433208848035e-05, + "loss": 2.0477, + "step": 1853 + }, + { + "epoch": 0.5272131613713514, + "grad_norm": 1.7335789203643799, + "learning_rate": 1.912094225797185e-05, + "loss": 1.821, + "step": 1854 + }, + { + "epoch": 0.5274975266148095, + "grad_norm": 1.7417482137680054, + "learning_rate": 1.9109451307095664e-05, + "loss": 1.7534, + "step": 1855 + }, + { + "epoch": 0.5277818918582676, + "grad_norm": 1.7831518650054932, + "learning_rate": 1.909796035621948e-05, + "loss": 2.0581, + "step": 1856 + }, + { + "epoch": 0.5280662571017257, + "grad_norm": 1.9567564725875854, + "learning_rate": 1.9086469405343293e-05, + "loss": 2.7799, + "step": 1857 + }, + { + "epoch": 0.5283506223451838, + "grad_norm": 1.6343289613723755, + "learning_rate": 1.9074978454467108e-05, + "loss": 2.5412, + "step": 1858 + }, + { + "epoch": 0.528634987588642, + "grad_norm": 1.4926238059997559, + "learning_rate": 1.9063487503590922e-05, + "loss": 2.3056, + "step": 1859 + }, + { + "epoch": 0.5289193528321001, + "grad_norm": 1.5979171991348267, + "learning_rate": 1.9051996552714737e-05, + "loss": 2.4987, + "step": 1860 + }, + { + "epoch": 0.5292037180755582, + "grad_norm": 1.6570512056350708, + "learning_rate": 1.9040505601838555e-05, + "loss": 2.0305, + "step": 1861 + }, + { + "epoch": 0.5294880833190163, + "grad_norm": 1.517991065979004, + "learning_rate": 1.902901465096237e-05, + "loss": 1.672, + "step": 1862 + }, + { + "epoch": 0.5297724485624744, + "grad_norm": 1.5992649793624878, + "learning_rate": 1.9017523700086184e-05, + "loss": 1.8708, + "step": 1863 + }, + { + "epoch": 0.5300568138059326, + "grad_norm": 1.6488360166549683, + "learning_rate": 1.9006032749209998e-05, + "loss": 1.7175, + "step": 1864 + }, + { + "epoch": 0.5303411790493907, + "grad_norm": 1.810335397720337, + "learning_rate": 1.8994541798333816e-05, + "loss": 2.8107, + "step": 1865 + }, + { + "epoch": 0.5306255442928488, + "grad_norm": 1.5411492586135864, + "learning_rate": 1.898305084745763e-05, + "loss": 2.4042, + "step": 1866 + }, + { + "epoch": 0.5309099095363069, + "grad_norm": 1.6033616065979004, + "learning_rate": 1.8971559896581445e-05, + "loss": 2.5207, + "step": 1867 + }, + { + "epoch": 0.531194274779765, + "grad_norm": 1.451252818107605, + "learning_rate": 1.896006894570526e-05, + "loss": 2.2705, + "step": 1868 + }, + { + "epoch": 0.5314786400232232, + "grad_norm": 1.614965558052063, + "learning_rate": 1.8948577994829074e-05, + "loss": 2.3188, + "step": 1869 + }, + { + "epoch": 0.5317630052666813, + "grad_norm": 1.570432186126709, + "learning_rate": 1.893708704395289e-05, + "loss": 1.901, + "step": 1870 + }, + { + "epoch": 0.5320473705101394, + "grad_norm": 1.603621244430542, + "learning_rate": 1.8925596093076703e-05, + "loss": 1.811, + "step": 1871 + }, + { + "epoch": 0.5323317357535975, + "grad_norm": 1.7074507474899292, + "learning_rate": 1.8914105142200517e-05, + "loss": 1.7383, + "step": 1872 + }, + { + "epoch": 0.5326161009970556, + "grad_norm": 1.7993121147155762, + "learning_rate": 1.8902614191324332e-05, + "loss": 2.8412, + "step": 1873 + }, + { + "epoch": 0.5329004662405138, + "grad_norm": 1.4372435808181763, + "learning_rate": 1.889112324044815e-05, + "loss": 2.4849, + "step": 1874 + }, + { + "epoch": 0.5331848314839719, + "grad_norm": 1.4756051301956177, + "learning_rate": 1.8879632289571964e-05, + "loss": 2.3527, + "step": 1875 + }, + { + "epoch": 0.53346919672743, + "grad_norm": 1.54763925075531, + "learning_rate": 1.886814133869578e-05, + "loss": 2.1188, + "step": 1876 + }, + { + "epoch": 0.5337535619708881, + "grad_norm": 1.6387181282043457, + "learning_rate": 1.8856650387819593e-05, + "loss": 2.2232, + "step": 1877 + }, + { + "epoch": 0.5340379272143462, + "grad_norm": 1.5445959568023682, + "learning_rate": 1.8845159436943408e-05, + "loss": 1.96, + "step": 1878 + }, + { + "epoch": 0.5343222924578044, + "grad_norm": 1.6059736013412476, + "learning_rate": 1.8833668486067222e-05, + "loss": 1.9384, + "step": 1879 + }, + { + "epoch": 0.5346066577012625, + "grad_norm": 1.5326387882232666, + "learning_rate": 1.882217753519104e-05, + "loss": 1.8572, + "step": 1880 + }, + { + "epoch": 0.5348910229447206, + "grad_norm": 1.9614207744598389, + "learning_rate": 1.8810686584314855e-05, + "loss": 2.8123, + "step": 1881 + }, + { + "epoch": 0.5351753881881787, + "grad_norm": 1.532706618309021, + "learning_rate": 1.879919563343867e-05, + "loss": 2.3232, + "step": 1882 + }, + { + "epoch": 0.5354597534316368, + "grad_norm": 1.612025499343872, + "learning_rate": 1.8787704682562484e-05, + "loss": 2.4058, + "step": 1883 + }, + { + "epoch": 0.535744118675095, + "grad_norm": 1.535540223121643, + "learning_rate": 1.8776213731686298e-05, + "loss": 2.2764, + "step": 1884 + }, + { + "epoch": 0.5360284839185531, + "grad_norm": 1.7752254009246826, + "learning_rate": 1.8764722780810113e-05, + "loss": 2.1374, + "step": 1885 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 1.6558598279953003, + "learning_rate": 1.875323182993393e-05, + "loss": 1.9204, + "step": 1886 + }, + { + "epoch": 0.5365972144054693, + "grad_norm": 1.5669152736663818, + "learning_rate": 1.8741740879057745e-05, + "loss": 1.9425, + "step": 1887 + }, + { + "epoch": 0.5368815796489275, + "grad_norm": 1.711891531944275, + "learning_rate": 1.873024992818156e-05, + "loss": 1.8079, + "step": 1888 + }, + { + "epoch": 0.5371659448923856, + "grad_norm": 1.939569115638733, + "learning_rate": 1.8718758977305374e-05, + "loss": 2.7149, + "step": 1889 + }, + { + "epoch": 0.5374503101358437, + "grad_norm": 1.6168116331100464, + "learning_rate": 1.870726802642919e-05, + "loss": 2.6034, + "step": 1890 + }, + { + "epoch": 0.5377346753793018, + "grad_norm": 1.5140300989151, + "learning_rate": 1.8695777075553003e-05, + "loss": 2.2716, + "step": 1891 + }, + { + "epoch": 0.5380190406227598, + "grad_norm": 1.440793752670288, + "learning_rate": 1.8684286124676817e-05, + "loss": 2.2802, + "step": 1892 + }, + { + "epoch": 0.538303405866218, + "grad_norm": 1.5787031650543213, + "learning_rate": 1.8672795173800632e-05, + "loss": 2.3291, + "step": 1893 + }, + { + "epoch": 0.5385877711096761, + "grad_norm": 1.5322247743606567, + "learning_rate": 1.8661304222924446e-05, + "loss": 2.0667, + "step": 1894 + }, + { + "epoch": 0.5388721363531342, + "grad_norm": 1.6390894651412964, + "learning_rate": 1.8649813272048264e-05, + "loss": 1.8755, + "step": 1895 + }, + { + "epoch": 0.5391565015965923, + "grad_norm": 1.7358583211898804, + "learning_rate": 1.863832232117208e-05, + "loss": 1.8302, + "step": 1896 + }, + { + "epoch": 0.5394408668400504, + "grad_norm": 1.8744136095046997, + "learning_rate": 1.8626831370295893e-05, + "loss": 2.8263, + "step": 1897 + }, + { + "epoch": 0.5397252320835086, + "grad_norm": 1.5263346433639526, + "learning_rate": 1.861534041941971e-05, + "loss": 2.1965, + "step": 1898 + }, + { + "epoch": 0.5400095973269667, + "grad_norm": 1.5258394479751587, + "learning_rate": 1.8603849468543526e-05, + "loss": 2.2005, + "step": 1899 + }, + { + "epoch": 0.5402939625704248, + "grad_norm": 1.5305837392807007, + "learning_rate": 1.859235851766734e-05, + "loss": 2.2732, + "step": 1900 + }, + { + "epoch": 0.5405783278138829, + "grad_norm": 1.7153434753417969, + "learning_rate": 1.8580867566791155e-05, + "loss": 2.2237, + "step": 1901 + }, + { + "epoch": 0.540862693057341, + "grad_norm": 1.5634775161743164, + "learning_rate": 1.856937661591497e-05, + "loss": 1.9519, + "step": 1902 + }, + { + "epoch": 0.5411470583007992, + "grad_norm": 1.6455775499343872, + "learning_rate": 1.8557885665038784e-05, + "loss": 1.8184, + "step": 1903 + }, + { + "epoch": 0.5414314235442573, + "grad_norm": 1.5984224081039429, + "learning_rate": 1.8546394714162598e-05, + "loss": 1.7093, + "step": 1904 + }, + { + "epoch": 0.5417157887877154, + "grad_norm": 2.2204861640930176, + "learning_rate": 1.8534903763286413e-05, + "loss": 2.7596, + "step": 1905 + }, + { + "epoch": 0.5420001540311735, + "grad_norm": 1.8560435771942139, + "learning_rate": 1.8523412812410227e-05, + "loss": 2.5364, + "step": 1906 + }, + { + "epoch": 0.5422845192746316, + "grad_norm": 1.5567165613174438, + "learning_rate": 1.8511921861534045e-05, + "loss": 2.2509, + "step": 1907 + }, + { + "epoch": 0.5425688845180898, + "grad_norm": 1.4999059438705444, + "learning_rate": 1.850043091065786e-05, + "loss": 2.153, + "step": 1908 + }, + { + "epoch": 0.5428532497615479, + "grad_norm": 1.6572489738464355, + "learning_rate": 1.8488939959781674e-05, + "loss": 1.9657, + "step": 1909 + }, + { + "epoch": 0.543137615005006, + "grad_norm": 1.5145512819290161, + "learning_rate": 1.847744900890549e-05, + "loss": 1.7628, + "step": 1910 + }, + { + "epoch": 0.5434219802484641, + "grad_norm": 1.5486762523651123, + "learning_rate": 1.8465958058029303e-05, + "loss": 1.8377, + "step": 1911 + }, + { + "epoch": 0.5437063454919222, + "grad_norm": 1.6072850227355957, + "learning_rate": 1.8454467107153118e-05, + "loss": 1.9599, + "step": 1912 + }, + { + "epoch": 0.5439907107353804, + "grad_norm": 1.9221049547195435, + "learning_rate": 1.8442976156276932e-05, + "loss": 2.8359, + "step": 1913 + }, + { + "epoch": 0.5442750759788385, + "grad_norm": 1.6474133729934692, + "learning_rate": 1.8431485205400747e-05, + "loss": 2.495, + "step": 1914 + }, + { + "epoch": 0.5445594412222966, + "grad_norm": 1.5209523439407349, + "learning_rate": 1.841999425452456e-05, + "loss": 2.2543, + "step": 1915 + }, + { + "epoch": 0.5448438064657547, + "grad_norm": 1.592429280281067, + "learning_rate": 1.840850330364838e-05, + "loss": 2.2111, + "step": 1916 + }, + { + "epoch": 0.5451281717092128, + "grad_norm": 1.5777971744537354, + "learning_rate": 1.8397012352772193e-05, + "loss": 2.1635, + "step": 1917 + }, + { + "epoch": 0.545412536952671, + "grad_norm": 1.5663307905197144, + "learning_rate": 1.8385521401896008e-05, + "loss": 1.8126, + "step": 1918 + }, + { + "epoch": 0.5456969021961291, + "grad_norm": 1.643549919128418, + "learning_rate": 1.8374030451019826e-05, + "loss": 1.9639, + "step": 1919 + }, + { + "epoch": 0.5459812674395872, + "grad_norm": 1.7581907510757446, + "learning_rate": 1.836253950014364e-05, + "loss": 1.5631, + "step": 1920 + }, + { + "epoch": 0.5462656326830453, + "grad_norm": 1.8245182037353516, + "learning_rate": 1.8351048549267455e-05, + "loss": 2.8506, + "step": 1921 + }, + { + "epoch": 0.5465499979265034, + "grad_norm": 1.53452730178833, + "learning_rate": 1.833955759839127e-05, + "loss": 2.3347, + "step": 1922 + }, + { + "epoch": 0.5468343631699616, + "grad_norm": 1.4857192039489746, + "learning_rate": 1.8328066647515084e-05, + "loss": 2.3888, + "step": 1923 + }, + { + "epoch": 0.5471187284134197, + "grad_norm": 1.5653821229934692, + "learning_rate": 1.83165756966389e-05, + "loss": 2.2097, + "step": 1924 + }, + { + "epoch": 0.5474030936568778, + "grad_norm": 1.7162553071975708, + "learning_rate": 1.8305084745762713e-05, + "loss": 1.9651, + "step": 1925 + }, + { + "epoch": 0.5476874589003359, + "grad_norm": 1.5265828371047974, + "learning_rate": 1.8293593794886527e-05, + "loss": 1.8044, + "step": 1926 + }, + { + "epoch": 0.547971824143794, + "grad_norm": 1.5719175338745117, + "learning_rate": 1.8282102844010342e-05, + "loss": 1.8203, + "step": 1927 + }, + { + "epoch": 0.5482561893872522, + "grad_norm": 1.6208674907684326, + "learning_rate": 1.827061189313416e-05, + "loss": 1.777, + "step": 1928 + }, + { + "epoch": 0.5485405546307103, + "grad_norm": 1.8437464237213135, + "learning_rate": 1.8259120942257974e-05, + "loss": 2.8108, + "step": 1929 + }, + { + "epoch": 0.5488249198741684, + "grad_norm": 1.4935005903244019, + "learning_rate": 1.824762999138179e-05, + "loss": 2.4553, + "step": 1930 + }, + { + "epoch": 0.5491092851176265, + "grad_norm": 1.4502531290054321, + "learning_rate": 1.8236139040505603e-05, + "loss": 2.1391, + "step": 1931 + }, + { + "epoch": 0.5493936503610846, + "grad_norm": 1.467806100845337, + "learning_rate": 1.8224648089629418e-05, + "loss": 2.0988, + "step": 1932 + }, + { + "epoch": 0.5496780156045428, + "grad_norm": 1.5642067193984985, + "learning_rate": 1.8213157138753232e-05, + "loss": 2.1998, + "step": 1933 + }, + { + "epoch": 0.5499623808480009, + "grad_norm": 1.6454665660858154, + "learning_rate": 1.8201666187877047e-05, + "loss": 1.8338, + "step": 1934 + }, + { + "epoch": 0.550246746091459, + "grad_norm": 1.665605068206787, + "learning_rate": 1.8190175237000865e-05, + "loss": 1.8698, + "step": 1935 + }, + { + "epoch": 0.5505311113349171, + "grad_norm": 1.661318063735962, + "learning_rate": 1.817868428612468e-05, + "loss": 1.7389, + "step": 1936 + }, + { + "epoch": 0.5508154765783753, + "grad_norm": 1.778904676437378, + "learning_rate": 1.8167193335248494e-05, + "loss": 2.6794, + "step": 1937 + }, + { + "epoch": 0.5510998418218334, + "grad_norm": 1.7070825099945068, + "learning_rate": 1.8155702384372308e-05, + "loss": 2.4678, + "step": 1938 + }, + { + "epoch": 0.5513842070652915, + "grad_norm": 1.5696340799331665, + "learning_rate": 1.8144211433496123e-05, + "loss": 2.2423, + "step": 1939 + }, + { + "epoch": 0.5516685723087495, + "grad_norm": 1.664480209350586, + "learning_rate": 1.813272048261994e-05, + "loss": 2.2692, + "step": 1940 + }, + { + "epoch": 0.5519529375522076, + "grad_norm": 1.7250727415084839, + "learning_rate": 1.8121229531743755e-05, + "loss": 1.8935, + "step": 1941 + }, + { + "epoch": 0.5522373027956659, + "grad_norm": 1.5921610593795776, + "learning_rate": 1.810973858086757e-05, + "loss": 1.9107, + "step": 1942 + }, + { + "epoch": 0.552521668039124, + "grad_norm": 1.5735629796981812, + "learning_rate": 1.8098247629991384e-05, + "loss": 1.8625, + "step": 1943 + }, + { + "epoch": 0.552806033282582, + "grad_norm": 1.7604540586471558, + "learning_rate": 1.80867566791152e-05, + "loss": 1.9191, + "step": 1944 + }, + { + "epoch": 0.5530903985260401, + "grad_norm": 1.894787311553955, + "learning_rate": 1.8075265728239013e-05, + "loss": 2.8069, + "step": 1945 + }, + { + "epoch": 0.5533747637694982, + "grad_norm": 1.5562604665756226, + "learning_rate": 1.8063774777362827e-05, + "loss": 2.5382, + "step": 1946 + }, + { + "epoch": 0.5536591290129564, + "grad_norm": 1.5096544027328491, + "learning_rate": 1.8052283826486642e-05, + "loss": 2.174, + "step": 1947 + }, + { + "epoch": 0.5539434942564145, + "grad_norm": 1.5471742153167725, + "learning_rate": 1.8040792875610456e-05, + "loss": 2.3403, + "step": 1948 + }, + { + "epoch": 0.5542278594998726, + "grad_norm": 1.6458371877670288, + "learning_rate": 1.8029301924734274e-05, + "loss": 2.0733, + "step": 1949 + }, + { + "epoch": 0.5545122247433307, + "grad_norm": 1.7109332084655762, + "learning_rate": 1.801781097385809e-05, + "loss": 1.8824, + "step": 1950 + }, + { + "epoch": 0.5547965899867888, + "grad_norm": 1.7361643314361572, + "learning_rate": 1.8006320022981903e-05, + "loss": 1.8698, + "step": 1951 + }, + { + "epoch": 0.555080955230247, + "grad_norm": 1.6732168197631836, + "learning_rate": 1.7994829072105718e-05, + "loss": 1.774, + "step": 1952 + }, + { + "epoch": 0.5553653204737051, + "grad_norm": 1.7529971599578857, + "learning_rate": 1.7983338121229532e-05, + "loss": 2.8649, + "step": 1953 + }, + { + "epoch": 0.5556496857171632, + "grad_norm": 1.5565195083618164, + "learning_rate": 1.797184717035335e-05, + "loss": 2.4457, + "step": 1954 + }, + { + "epoch": 0.5559340509606213, + "grad_norm": 1.5663349628448486, + "learning_rate": 1.7960356219477165e-05, + "loss": 2.2654, + "step": 1955 + }, + { + "epoch": 0.5562184162040794, + "grad_norm": 1.509903907775879, + "learning_rate": 1.794886526860098e-05, + "loss": 2.1682, + "step": 1956 + }, + { + "epoch": 0.5565027814475376, + "grad_norm": 1.7437077760696411, + "learning_rate": 1.7937374317724794e-05, + "loss": 2.1682, + "step": 1957 + }, + { + "epoch": 0.5567871466909957, + "grad_norm": 1.532533049583435, + "learning_rate": 1.7925883366848608e-05, + "loss": 1.7354, + "step": 1958 + }, + { + "epoch": 0.5570715119344538, + "grad_norm": 1.6096084117889404, + "learning_rate": 1.7914392415972423e-05, + "loss": 1.8484, + "step": 1959 + }, + { + "epoch": 0.5573558771779119, + "grad_norm": 1.8258168697357178, + "learning_rate": 1.7902901465096237e-05, + "loss": 1.807, + "step": 1960 + }, + { + "epoch": 0.55764024242137, + "grad_norm": 2.004221200942993, + "learning_rate": 1.789141051422005e-05, + "loss": 2.7515, + "step": 1961 + }, + { + "epoch": 0.5579246076648282, + "grad_norm": 1.593041181564331, + "learning_rate": 1.787991956334387e-05, + "loss": 2.3786, + "step": 1962 + }, + { + "epoch": 0.5582089729082863, + "grad_norm": 1.5780787467956543, + "learning_rate": 1.7868428612467684e-05, + "loss": 2.3528, + "step": 1963 + }, + { + "epoch": 0.5584933381517444, + "grad_norm": 1.5928815603256226, + "learning_rate": 1.78569376615915e-05, + "loss": 2.2974, + "step": 1964 + }, + { + "epoch": 0.5587777033952025, + "grad_norm": 1.7977780103683472, + "learning_rate": 1.7845446710715313e-05, + "loss": 2.2807, + "step": 1965 + }, + { + "epoch": 0.5590620686386606, + "grad_norm": 1.5917166471481323, + "learning_rate": 1.7833955759839128e-05, + "loss": 2.0174, + "step": 1966 + }, + { + "epoch": 0.5593464338821188, + "grad_norm": 1.55897057056427, + "learning_rate": 1.7822464808962942e-05, + "loss": 1.8759, + "step": 1967 + }, + { + "epoch": 0.5596307991255769, + "grad_norm": 1.6804914474487305, + "learning_rate": 1.7810973858086757e-05, + "loss": 1.7898, + "step": 1968 + }, + { + "epoch": 0.559915164369035, + "grad_norm": 1.7221648693084717, + "learning_rate": 1.779948290721057e-05, + "loss": 2.6948, + "step": 1969 + }, + { + "epoch": 0.5601995296124931, + "grad_norm": 1.5615862607955933, + "learning_rate": 1.7787991956334386e-05, + "loss": 2.2914, + "step": 1970 + }, + { + "epoch": 0.5604838948559512, + "grad_norm": 1.5630532503128052, + "learning_rate": 1.7776501005458203e-05, + "loss": 2.4075, + "step": 1971 + }, + { + "epoch": 0.5607682600994094, + "grad_norm": 1.6465175151824951, + "learning_rate": 1.7765010054582018e-05, + "loss": 2.2431, + "step": 1972 + }, + { + "epoch": 0.5610526253428675, + "grad_norm": 1.6053401231765747, + "learning_rate": 1.7753519103705832e-05, + "loss": 1.9444, + "step": 1973 + }, + { + "epoch": 0.5613369905863256, + "grad_norm": 1.5108147859573364, + "learning_rate": 1.774202815282965e-05, + "loss": 1.8403, + "step": 1974 + }, + { + "epoch": 0.5616213558297837, + "grad_norm": 1.814316987991333, + "learning_rate": 1.7730537201953465e-05, + "loss": 1.7395, + "step": 1975 + }, + { + "epoch": 0.5619057210732418, + "grad_norm": 1.6270887851715088, + "learning_rate": 1.771904625107728e-05, + "loss": 1.6755, + "step": 1976 + }, + { + "epoch": 0.5621900863167, + "grad_norm": 1.9045475721359253, + "learning_rate": 1.7707555300201094e-05, + "loss": 2.8776, + "step": 1977 + }, + { + "epoch": 0.5624744515601581, + "grad_norm": 1.6713111400604248, + "learning_rate": 1.769606434932491e-05, + "loss": 2.3999, + "step": 1978 + }, + { + "epoch": 0.5627588168036162, + "grad_norm": 1.4755371809005737, + "learning_rate": 1.7684573398448723e-05, + "loss": 2.344, + "step": 1979 + }, + { + "epoch": 0.5630431820470743, + "grad_norm": 1.6609379053115845, + "learning_rate": 1.7673082447572537e-05, + "loss": 2.2334, + "step": 1980 + }, + { + "epoch": 0.5633275472905324, + "grad_norm": 1.750351071357727, + "learning_rate": 1.7661591496696352e-05, + "loss": 2.2961, + "step": 1981 + }, + { + "epoch": 0.5636119125339906, + "grad_norm": 1.47465181350708, + "learning_rate": 1.7650100545820166e-05, + "loss": 1.7603, + "step": 1982 + }, + { + "epoch": 0.5638962777774487, + "grad_norm": 1.627179503440857, + "learning_rate": 1.7638609594943984e-05, + "loss": 1.8822, + "step": 1983 + }, + { + "epoch": 0.5641806430209068, + "grad_norm": 1.570872187614441, + "learning_rate": 1.76271186440678e-05, + "loss": 1.8734, + "step": 1984 + }, + { + "epoch": 0.5644650082643649, + "grad_norm": 1.7837358713150024, + "learning_rate": 1.7615627693191613e-05, + "loss": 2.7609, + "step": 1985 + }, + { + "epoch": 0.5647493735078231, + "grad_norm": 1.5438477993011475, + "learning_rate": 1.7604136742315428e-05, + "loss": 2.2791, + "step": 1986 + }, + { + "epoch": 0.5650337387512812, + "grad_norm": 1.561362385749817, + "learning_rate": 1.7592645791439242e-05, + "loss": 2.1921, + "step": 1987 + }, + { + "epoch": 0.5653181039947393, + "grad_norm": 1.4173671007156372, + "learning_rate": 1.7581154840563057e-05, + "loss": 2.0708, + "step": 1988 + }, + { + "epoch": 0.5656024692381973, + "grad_norm": 1.5849429368972778, + "learning_rate": 1.756966388968687e-05, + "loss": 2.1582, + "step": 1989 + }, + { + "epoch": 0.5658868344816554, + "grad_norm": 1.5969016551971436, + "learning_rate": 1.7558172938810686e-05, + "loss": 1.9153, + "step": 1990 + }, + { + "epoch": 0.5661711997251136, + "grad_norm": 1.72384774684906, + "learning_rate": 1.7546681987934504e-05, + "loss": 2.1121, + "step": 1991 + }, + { + "epoch": 0.5664555649685717, + "grad_norm": 1.6643520593643188, + "learning_rate": 1.7535191037058318e-05, + "loss": 1.7982, + "step": 1992 + }, + { + "epoch": 0.5667399302120298, + "grad_norm": 1.7195711135864258, + "learning_rate": 1.7523700086182133e-05, + "loss": 2.8548, + "step": 1993 + }, + { + "epoch": 0.5670242954554879, + "grad_norm": 1.5349563360214233, + "learning_rate": 1.7512209135305947e-05, + "loss": 2.5764, + "step": 1994 + }, + { + "epoch": 0.567308660698946, + "grad_norm": 1.562687873840332, + "learning_rate": 1.7500718184429765e-05, + "loss": 2.1619, + "step": 1995 + }, + { + "epoch": 0.5675930259424042, + "grad_norm": 1.4912785291671753, + "learning_rate": 1.748922723355358e-05, + "loss": 2.2925, + "step": 1996 + }, + { + "epoch": 0.5678773911858623, + "grad_norm": 1.5238652229309082, + "learning_rate": 1.7477736282677394e-05, + "loss": 2.0571, + "step": 1997 + }, + { + "epoch": 0.5681617564293204, + "grad_norm": 1.570028305053711, + "learning_rate": 1.746624533180121e-05, + "loss": 2.0634, + "step": 1998 + }, + { + "epoch": 0.5684461216727785, + "grad_norm": 1.6997617483139038, + "learning_rate": 1.7454754380925023e-05, + "loss": 1.7017, + "step": 1999 + }, + { + "epoch": 0.5687304869162366, + "grad_norm": 1.747353434562683, + "learning_rate": 1.7443263430048837e-05, + "loss": 1.8186, + "step": 2000 + }, + { + "epoch": 0.5690148521596948, + "grad_norm": 1.8693517446517944, + "learning_rate": 1.7431772479172652e-05, + "loss": 2.8259, + "step": 2001 + }, + { + "epoch": 0.5692992174031529, + "grad_norm": 1.6010723114013672, + "learning_rate": 1.7420281528296466e-05, + "loss": 2.4006, + "step": 2002 + }, + { + "epoch": 0.569583582646611, + "grad_norm": 1.4531267881393433, + "learning_rate": 1.740879057742028e-05, + "loss": 2.1439, + "step": 2003 + }, + { + "epoch": 0.5698679478900691, + "grad_norm": 1.6961573362350464, + "learning_rate": 1.73972996265441e-05, + "loss": 2.2823, + "step": 2004 + }, + { + "epoch": 0.5701523131335272, + "grad_norm": 1.5460631847381592, + "learning_rate": 1.7385808675667913e-05, + "loss": 2.1329, + "step": 2005 + }, + { + "epoch": 0.5704366783769854, + "grad_norm": 1.644081950187683, + "learning_rate": 1.7374317724791728e-05, + "loss": 2.0236, + "step": 2006 + }, + { + "epoch": 0.5707210436204435, + "grad_norm": 1.6294313669204712, + "learning_rate": 1.7362826773915542e-05, + "loss": 2.004, + "step": 2007 + }, + { + "epoch": 0.5710054088639016, + "grad_norm": 1.7416433095932007, + "learning_rate": 1.7351335823039357e-05, + "loss": 1.9497, + "step": 2008 + }, + { + "epoch": 0.5712897741073597, + "grad_norm": 1.8446623086929321, + "learning_rate": 1.7339844872163175e-05, + "loss": 2.8675, + "step": 2009 + }, + { + "epoch": 0.5715741393508178, + "grad_norm": 1.5892642736434937, + "learning_rate": 1.732835392128699e-05, + "loss": 2.3443, + "step": 2010 + }, + { + "epoch": 0.571858504594276, + "grad_norm": 1.5367027521133423, + "learning_rate": 1.7316862970410804e-05, + "loss": 2.2238, + "step": 2011 + }, + { + "epoch": 0.5721428698377341, + "grad_norm": 1.5910507440567017, + "learning_rate": 1.7305372019534618e-05, + "loss": 2.2285, + "step": 2012 + }, + { + "epoch": 0.5724272350811922, + "grad_norm": 1.6503397226333618, + "learning_rate": 1.7293881068658433e-05, + "loss": 2.1693, + "step": 2013 + }, + { + "epoch": 0.5727116003246503, + "grad_norm": 1.5087186098098755, + "learning_rate": 1.7282390117782247e-05, + "loss": 1.896, + "step": 2014 + }, + { + "epoch": 0.5729959655681084, + "grad_norm": 1.6095649003982544, + "learning_rate": 1.727089916690606e-05, + "loss": 1.773, + "step": 2015 + }, + { + "epoch": 0.5732803308115666, + "grad_norm": 1.5037708282470703, + "learning_rate": 1.725940821602988e-05, + "loss": 1.7249, + "step": 2016 + }, + { + "epoch": 0.5735646960550247, + "grad_norm": 1.788989543914795, + "learning_rate": 1.7247917265153694e-05, + "loss": 2.8071, + "step": 2017 + }, + { + "epoch": 0.5738490612984828, + "grad_norm": 1.5129423141479492, + "learning_rate": 1.723642631427751e-05, + "loss": 2.3458, + "step": 2018 + }, + { + "epoch": 0.5741334265419409, + "grad_norm": 1.4829000234603882, + "learning_rate": 1.7224935363401323e-05, + "loss": 2.2329, + "step": 2019 + }, + { + "epoch": 0.574417791785399, + "grad_norm": 1.5166312456130981, + "learning_rate": 1.7213444412525138e-05, + "loss": 2.1336, + "step": 2020 + }, + { + "epoch": 0.5747021570288572, + "grad_norm": 1.6192970275878906, + "learning_rate": 1.7201953461648952e-05, + "loss": 2.2265, + "step": 2021 + }, + { + "epoch": 0.5749865222723153, + "grad_norm": 1.7159581184387207, + "learning_rate": 1.7190462510772767e-05, + "loss": 1.9326, + "step": 2022 + }, + { + "epoch": 0.5752708875157734, + "grad_norm": 1.5626370906829834, + "learning_rate": 1.717897155989658e-05, + "loss": 1.7684, + "step": 2023 + }, + { + "epoch": 0.5755552527592315, + "grad_norm": 1.6683768033981323, + "learning_rate": 1.7167480609020396e-05, + "loss": 1.9421, + "step": 2024 + }, + { + "epoch": 0.5758396180026896, + "grad_norm": 1.9297045469284058, + "learning_rate": 1.7155989658144213e-05, + "loss": 2.9594, + "step": 2025 + }, + { + "epoch": 0.5761239832461478, + "grad_norm": 1.5986518859863281, + "learning_rate": 1.7144498707268028e-05, + "loss": 2.334, + "step": 2026 + }, + { + "epoch": 0.5764083484896059, + "grad_norm": 1.5038700103759766, + "learning_rate": 1.7133007756391842e-05, + "loss": 2.326, + "step": 2027 + }, + { + "epoch": 0.576692713733064, + "grad_norm": 1.4739896059036255, + "learning_rate": 1.712151680551566e-05, + "loss": 2.2516, + "step": 2028 + }, + { + "epoch": 0.5769770789765221, + "grad_norm": 1.5917458534240723, + "learning_rate": 1.7110025854639475e-05, + "loss": 2.0803, + "step": 2029 + }, + { + "epoch": 0.5772614442199802, + "grad_norm": 1.491292953491211, + "learning_rate": 1.709853490376329e-05, + "loss": 1.8907, + "step": 2030 + }, + { + "epoch": 0.5775458094634384, + "grad_norm": 1.5379160642623901, + "learning_rate": 1.7087043952887104e-05, + "loss": 1.8301, + "step": 2031 + }, + { + "epoch": 0.5778301747068965, + "grad_norm": 1.6341428756713867, + "learning_rate": 1.7075553002010918e-05, + "loss": 1.7232, + "step": 2032 + }, + { + "epoch": 0.5781145399503546, + "grad_norm": 1.6281121969223022, + "learning_rate": 1.7064062051134733e-05, + "loss": 2.7037, + "step": 2033 + }, + { + "epoch": 0.5783989051938127, + "grad_norm": 1.5146305561065674, + "learning_rate": 1.7052571100258547e-05, + "loss": 2.4446, + "step": 2034 + }, + { + "epoch": 0.5786832704372709, + "grad_norm": 1.5931079387664795, + "learning_rate": 1.7041080149382362e-05, + "loss": 2.2561, + "step": 2035 + }, + { + "epoch": 0.578967635680729, + "grad_norm": 1.4791065454483032, + "learning_rate": 1.7029589198506176e-05, + "loss": 2.3593, + "step": 2036 + }, + { + "epoch": 0.579252000924187, + "grad_norm": 1.6104813814163208, + "learning_rate": 1.7018098247629994e-05, + "loss": 2.1088, + "step": 2037 + }, + { + "epoch": 0.5795363661676451, + "grad_norm": 1.5709761381149292, + "learning_rate": 1.700660729675381e-05, + "loss": 1.7519, + "step": 2038 + }, + { + "epoch": 0.5798207314111032, + "grad_norm": 1.6428778171539307, + "learning_rate": 1.6995116345877623e-05, + "loss": 1.9199, + "step": 2039 + }, + { + "epoch": 0.5801050966545614, + "grad_norm": 1.7062665224075317, + "learning_rate": 1.6983625395001438e-05, + "loss": 1.9117, + "step": 2040 + }, + { + "epoch": 0.5803894618980195, + "grad_norm": 1.9232088327407837, + "learning_rate": 1.6972134444125252e-05, + "loss": 2.822, + "step": 2041 + }, + { + "epoch": 0.5806738271414776, + "grad_norm": 1.44762122631073, + "learning_rate": 1.6960643493249067e-05, + "loss": 2.3914, + "step": 2042 + }, + { + "epoch": 0.5809581923849357, + "grad_norm": 1.5696653127670288, + "learning_rate": 1.694915254237288e-05, + "loss": 2.2531, + "step": 2043 + }, + { + "epoch": 0.5812425576283938, + "grad_norm": 1.4691784381866455, + "learning_rate": 1.6937661591496696e-05, + "loss": 2.225, + "step": 2044 + }, + { + "epoch": 0.581526922871852, + "grad_norm": 1.6073601245880127, + "learning_rate": 1.692617064062051e-05, + "loss": 1.9158, + "step": 2045 + }, + { + "epoch": 0.5818112881153101, + "grad_norm": 1.5055420398712158, + "learning_rate": 1.6914679689744328e-05, + "loss": 2.0171, + "step": 2046 + }, + { + "epoch": 0.5820956533587682, + "grad_norm": 1.6332578659057617, + "learning_rate": 1.6903188738868143e-05, + "loss": 1.7642, + "step": 2047 + }, + { + "epoch": 0.5823800186022263, + "grad_norm": 1.577101230621338, + "learning_rate": 1.6891697787991957e-05, + "loss": 1.8056, + "step": 2048 + }, + { + "epoch": 0.5826643838456844, + "grad_norm": 1.963454008102417, + "learning_rate": 1.6880206837115775e-05, + "loss": 2.8995, + "step": 2049 + }, + { + "epoch": 0.5829487490891426, + "grad_norm": 1.5220768451690674, + "learning_rate": 1.686871588623959e-05, + "loss": 2.2895, + "step": 2050 + }, + { + "epoch": 0.5832331143326007, + "grad_norm": 1.5367721319198608, + "learning_rate": 1.6857224935363404e-05, + "loss": 2.2004, + "step": 2051 + }, + { + "epoch": 0.5835174795760588, + "grad_norm": 1.5274112224578857, + "learning_rate": 1.684573398448722e-05, + "loss": 2.2675, + "step": 2052 + }, + { + "epoch": 0.5838018448195169, + "grad_norm": 1.7082741260528564, + "learning_rate": 1.6834243033611033e-05, + "loss": 2.1209, + "step": 2053 + }, + { + "epoch": 0.584086210062975, + "grad_norm": 1.5683553218841553, + "learning_rate": 1.6822752082734847e-05, + "loss": 1.9012, + "step": 2054 + }, + { + "epoch": 0.5843705753064332, + "grad_norm": 1.6328856945037842, + "learning_rate": 1.6811261131858662e-05, + "loss": 1.9311, + "step": 2055 + }, + { + "epoch": 0.5846549405498913, + "grad_norm": 1.782463550567627, + "learning_rate": 1.6799770180982476e-05, + "loss": 1.8888, + "step": 2056 + }, + { + "epoch": 0.5849393057933494, + "grad_norm": 1.9230624437332153, + "learning_rate": 1.678827923010629e-05, + "loss": 3.0453, + "step": 2057 + }, + { + "epoch": 0.5852236710368075, + "grad_norm": 1.576603651046753, + "learning_rate": 1.677678827923011e-05, + "loss": 2.2623, + "step": 2058 + }, + { + "epoch": 0.5855080362802656, + "grad_norm": 1.4937617778778076, + "learning_rate": 1.6765297328353923e-05, + "loss": 2.1262, + "step": 2059 + }, + { + "epoch": 0.5857924015237238, + "grad_norm": 1.6465567350387573, + "learning_rate": 1.6753806377477738e-05, + "loss": 2.2345, + "step": 2060 + }, + { + "epoch": 0.5860767667671819, + "grad_norm": 1.6568986177444458, + "learning_rate": 1.6742315426601552e-05, + "loss": 2.1957, + "step": 2061 + }, + { + "epoch": 0.58636113201064, + "grad_norm": 1.6795825958251953, + "learning_rate": 1.6730824475725367e-05, + "loss": 1.8565, + "step": 2062 + }, + { + "epoch": 0.5866454972540981, + "grad_norm": 1.547038197517395, + "learning_rate": 1.671933352484918e-05, + "loss": 1.8675, + "step": 2063 + }, + { + "epoch": 0.5869298624975562, + "grad_norm": 1.557686686515808, + "learning_rate": 1.6707842573972996e-05, + "loss": 1.7547, + "step": 2064 + }, + { + "epoch": 0.5872142277410144, + "grad_norm": 1.8700066804885864, + "learning_rate": 1.6696351623096814e-05, + "loss": 2.7392, + "step": 2065 + }, + { + "epoch": 0.5874985929844725, + "grad_norm": 1.6624739170074463, + "learning_rate": 1.6684860672220628e-05, + "loss": 2.343, + "step": 2066 + }, + { + "epoch": 0.5877829582279306, + "grad_norm": 1.5265758037567139, + "learning_rate": 1.6673369721344443e-05, + "loss": 2.1868, + "step": 2067 + }, + { + "epoch": 0.5880673234713887, + "grad_norm": 1.5248119831085205, + "learning_rate": 1.6661878770468257e-05, + "loss": 2.1564, + "step": 2068 + }, + { + "epoch": 0.5883516887148468, + "grad_norm": 1.6401458978652954, + "learning_rate": 1.665038781959207e-05, + "loss": 2.2195, + "step": 2069 + }, + { + "epoch": 0.588636053958305, + "grad_norm": 1.5524535179138184, + "learning_rate": 1.663889686871589e-05, + "loss": 1.9903, + "step": 2070 + }, + { + "epoch": 0.5889204192017631, + "grad_norm": 1.6761585474014282, + "learning_rate": 1.6627405917839704e-05, + "loss": 1.9134, + "step": 2071 + }, + { + "epoch": 0.5892047844452212, + "grad_norm": 1.6284050941467285, + "learning_rate": 1.661591496696352e-05, + "loss": 1.8622, + "step": 2072 + }, + { + "epoch": 0.5894891496886793, + "grad_norm": 1.7775750160217285, + "learning_rate": 1.6604424016087333e-05, + "loss": 2.7387, + "step": 2073 + }, + { + "epoch": 0.5897735149321374, + "grad_norm": 1.5672471523284912, + "learning_rate": 1.6592933065211147e-05, + "loss": 2.13, + "step": 2074 + }, + { + "epoch": 0.5900578801755956, + "grad_norm": 2.415464401245117, + "learning_rate": 1.6581442114334962e-05, + "loss": 2.58, + "step": 2075 + }, + { + "epoch": 0.5903422454190537, + "grad_norm": 1.550248622894287, + "learning_rate": 1.6569951163458776e-05, + "loss": 2.2516, + "step": 2076 + }, + { + "epoch": 0.5906266106625118, + "grad_norm": 1.564470887184143, + "learning_rate": 1.655846021258259e-05, + "loss": 1.9016, + "step": 2077 + }, + { + "epoch": 0.5909109759059699, + "grad_norm": 1.5428235530853271, + "learning_rate": 1.6546969261706405e-05, + "loss": 1.9438, + "step": 2078 + }, + { + "epoch": 0.5911953411494281, + "grad_norm": 1.5634739398956299, + "learning_rate": 1.6535478310830223e-05, + "loss": 2.0286, + "step": 2079 + }, + { + "epoch": 0.5914797063928862, + "grad_norm": 1.6420611143112183, + "learning_rate": 1.6523987359954038e-05, + "loss": 1.7369, + "step": 2080 + }, + { + "epoch": 0.5917640716363443, + "grad_norm": 1.8491171598434448, + "learning_rate": 1.6512496409077852e-05, + "loss": 2.7506, + "step": 2081 + }, + { + "epoch": 0.5920484368798024, + "grad_norm": 1.6593526601791382, + "learning_rate": 1.6501005458201667e-05, + "loss": 2.4749, + "step": 2082 + }, + { + "epoch": 0.5923328021232604, + "grad_norm": 1.6108300685882568, + "learning_rate": 1.648951450732548e-05, + "loss": 2.4418, + "step": 2083 + }, + { + "epoch": 0.5926171673667187, + "grad_norm": 1.489844799041748, + "learning_rate": 1.64780235564493e-05, + "loss": 2.2861, + "step": 2084 + }, + { + "epoch": 0.5929015326101768, + "grad_norm": 1.5152863264083862, + "learning_rate": 1.6466532605573114e-05, + "loss": 2.1108, + "step": 2085 + }, + { + "epoch": 0.5931858978536348, + "grad_norm": 1.5476469993591309, + "learning_rate": 1.6455041654696928e-05, + "loss": 1.9956, + "step": 2086 + }, + { + "epoch": 0.5934702630970929, + "grad_norm": 1.568286418914795, + "learning_rate": 1.6443550703820743e-05, + "loss": 1.7746, + "step": 2087 + }, + { + "epoch": 0.593754628340551, + "grad_norm": 1.673620581626892, + "learning_rate": 1.6432059752944557e-05, + "loss": 1.8101, + "step": 2088 + }, + { + "epoch": 0.5940389935840092, + "grad_norm": 1.8000257015228271, + "learning_rate": 1.6420568802068372e-05, + "loss": 2.8105, + "step": 2089 + }, + { + "epoch": 0.5943233588274673, + "grad_norm": 1.4337655305862427, + "learning_rate": 1.6409077851192186e-05, + "loss": 2.507, + "step": 2090 + }, + { + "epoch": 0.5946077240709254, + "grad_norm": 1.5374799966812134, + "learning_rate": 1.6397586900316004e-05, + "loss": 2.4551, + "step": 2091 + }, + { + "epoch": 0.5948920893143835, + "grad_norm": 1.586155891418457, + "learning_rate": 1.638609594943982e-05, + "loss": 2.0942, + "step": 2092 + }, + { + "epoch": 0.5951764545578416, + "grad_norm": 1.6072989702224731, + "learning_rate": 1.6374604998563633e-05, + "loss": 1.9915, + "step": 2093 + }, + { + "epoch": 0.5954608198012998, + "grad_norm": 1.4546177387237549, + "learning_rate": 1.6363114047687448e-05, + "loss": 1.8326, + "step": 2094 + }, + { + "epoch": 0.5957451850447579, + "grad_norm": 1.5891377925872803, + "learning_rate": 1.6351623096811262e-05, + "loss": 1.7843, + "step": 2095 + }, + { + "epoch": 0.596029550288216, + "grad_norm": 1.6567867994308472, + "learning_rate": 1.6340132145935077e-05, + "loss": 1.622, + "step": 2096 + }, + { + "epoch": 0.5963139155316741, + "grad_norm": 1.8327213525772095, + "learning_rate": 1.632864119505889e-05, + "loss": 2.7612, + "step": 2097 + }, + { + "epoch": 0.5965982807751322, + "grad_norm": 1.5466746091842651, + "learning_rate": 1.6317150244182706e-05, + "loss": 2.2606, + "step": 2098 + }, + { + "epoch": 0.5968826460185904, + "grad_norm": 1.4704080820083618, + "learning_rate": 1.630565929330652e-05, + "loss": 2.2202, + "step": 2099 + }, + { + "epoch": 0.5971670112620485, + "grad_norm": 1.5352089405059814, + "learning_rate": 1.6294168342430338e-05, + "loss": 2.1893, + "step": 2100 + }, + { + "epoch": 0.5974513765055066, + "grad_norm": 1.6276265382766724, + "learning_rate": 1.6282677391554152e-05, + "loss": 2.0548, + "step": 2101 + }, + { + "epoch": 0.5977357417489647, + "grad_norm": 1.4630635976791382, + "learning_rate": 1.6271186440677967e-05, + "loss": 1.799, + "step": 2102 + }, + { + "epoch": 0.5980201069924228, + "grad_norm": 1.538522720336914, + "learning_rate": 1.6259695489801785e-05, + "loss": 1.6713, + "step": 2103 + }, + { + "epoch": 0.598304472235881, + "grad_norm": 1.6037315130233765, + "learning_rate": 1.62482045389256e-05, + "loss": 1.6671, + "step": 2104 + }, + { + "epoch": 0.5985888374793391, + "grad_norm": 1.940868854522705, + "learning_rate": 1.6236713588049414e-05, + "loss": 2.8063, + "step": 2105 + }, + { + "epoch": 0.5988732027227972, + "grad_norm": 1.5579652786254883, + "learning_rate": 1.622522263717323e-05, + "loss": 2.3295, + "step": 2106 + }, + { + "epoch": 0.5991575679662553, + "grad_norm": 1.6156672239303589, + "learning_rate": 1.6213731686297043e-05, + "loss": 2.2363, + "step": 2107 + }, + { + "epoch": 0.5994419332097134, + "grad_norm": 1.485302209854126, + "learning_rate": 1.6202240735420857e-05, + "loss": 2.0772, + "step": 2108 + }, + { + "epoch": 0.5997262984531716, + "grad_norm": 1.7746816873550415, + "learning_rate": 1.6190749784544672e-05, + "loss": 2.1031, + "step": 2109 + }, + { + "epoch": 0.6000106636966297, + "grad_norm": 1.6292321681976318, + "learning_rate": 1.6179258833668486e-05, + "loss": 1.9516, + "step": 2110 + }, + { + "epoch": 0.6002950289400878, + "grad_norm": 1.6927971839904785, + "learning_rate": 1.61677678827923e-05, + "loss": 1.9776, + "step": 2111 + }, + { + "epoch": 0.6005793941835459, + "grad_norm": 1.7361969947814941, + "learning_rate": 1.615627693191612e-05, + "loss": 1.6757, + "step": 2112 + }, + { + "epoch": 0.600863759427004, + "grad_norm": 1.7664377689361572, + "learning_rate": 1.6144785981039933e-05, + "loss": 2.7698, + "step": 2113 + }, + { + "epoch": 0.6011481246704622, + "grad_norm": 1.4889873266220093, + "learning_rate": 1.6133295030163748e-05, + "loss": 2.3977, + "step": 2114 + }, + { + "epoch": 0.6014324899139203, + "grad_norm": 1.5173231363296509, + "learning_rate": 1.6121804079287562e-05, + "loss": 2.2699, + "step": 2115 + }, + { + "epoch": 0.6017168551573784, + "grad_norm": 1.5664979219436646, + "learning_rate": 1.6110313128411377e-05, + "loss": 2.2895, + "step": 2116 + }, + { + "epoch": 0.6020012204008365, + "grad_norm": 1.7025697231292725, + "learning_rate": 1.609882217753519e-05, + "loss": 1.9624, + "step": 2117 + }, + { + "epoch": 0.6022855856442946, + "grad_norm": 1.7116023302078247, + "learning_rate": 1.6087331226659006e-05, + "loss": 2.1481, + "step": 2118 + }, + { + "epoch": 0.6025699508877528, + "grad_norm": 1.53733491897583, + "learning_rate": 1.607584027578282e-05, + "loss": 1.6645, + "step": 2119 + }, + { + "epoch": 0.6028543161312109, + "grad_norm": 1.602069616317749, + "learning_rate": 1.6064349324906635e-05, + "loss": 1.7552, + "step": 2120 + }, + { + "epoch": 0.603138681374669, + "grad_norm": 1.7021903991699219, + "learning_rate": 1.6052858374030453e-05, + "loss": 2.7208, + "step": 2121 + }, + { + "epoch": 0.6034230466181271, + "grad_norm": 1.5884177684783936, + "learning_rate": 1.6041367423154267e-05, + "loss": 2.4558, + "step": 2122 + }, + { + "epoch": 0.6037074118615852, + "grad_norm": 1.600091814994812, + "learning_rate": 1.602987647227808e-05, + "loss": 2.3824, + "step": 2123 + }, + { + "epoch": 0.6039917771050434, + "grad_norm": 1.636725902557373, + "learning_rate": 1.60183855214019e-05, + "loss": 2.2555, + "step": 2124 + }, + { + "epoch": 0.6042761423485015, + "grad_norm": 1.5665066242218018, + "learning_rate": 1.6006894570525714e-05, + "loss": 2.2357, + "step": 2125 + }, + { + "epoch": 0.6045605075919596, + "grad_norm": 1.6503568887710571, + "learning_rate": 1.599540361964953e-05, + "loss": 1.9319, + "step": 2126 + }, + { + "epoch": 0.6048448728354177, + "grad_norm": 1.774845838546753, + "learning_rate": 1.5983912668773343e-05, + "loss": 1.8753, + "step": 2127 + }, + { + "epoch": 0.6051292380788759, + "grad_norm": 1.626198649406433, + "learning_rate": 1.5972421717897157e-05, + "loss": 1.9712, + "step": 2128 + }, + { + "epoch": 0.605413603322334, + "grad_norm": 1.8184738159179688, + "learning_rate": 1.5960930767020972e-05, + "loss": 2.7941, + "step": 2129 + }, + { + "epoch": 0.605697968565792, + "grad_norm": 1.5862571001052856, + "learning_rate": 1.5949439816144786e-05, + "loss": 2.421, + "step": 2130 + }, + { + "epoch": 0.6059823338092502, + "grad_norm": 1.5402880907058716, + "learning_rate": 1.59379488652686e-05, + "loss": 2.462, + "step": 2131 + }, + { + "epoch": 0.6062666990527082, + "grad_norm": 1.606608510017395, + "learning_rate": 1.5926457914392415e-05, + "loss": 2.1879, + "step": 2132 + }, + { + "epoch": 0.6065510642961665, + "grad_norm": 1.640724539756775, + "learning_rate": 1.5914966963516233e-05, + "loss": 1.9824, + "step": 2133 + }, + { + "epoch": 0.6068354295396245, + "grad_norm": 1.5415105819702148, + "learning_rate": 1.5903476012640048e-05, + "loss": 1.8615, + "step": 2134 + }, + { + "epoch": 0.6071197947830826, + "grad_norm": 1.5723168849945068, + "learning_rate": 1.5891985061763862e-05, + "loss": 1.8926, + "step": 2135 + }, + { + "epoch": 0.6074041600265407, + "grad_norm": 1.752859115600586, + "learning_rate": 1.5880494110887677e-05, + "loss": 1.9364, + "step": 2136 + }, + { + "epoch": 0.6076885252699988, + "grad_norm": 1.7561215162277222, + "learning_rate": 1.586900316001149e-05, + "loss": 2.7265, + "step": 2137 + }, + { + "epoch": 0.607972890513457, + "grad_norm": 1.5687557458877563, + "learning_rate": 1.5857512209135306e-05, + "loss": 2.1813, + "step": 2138 + }, + { + "epoch": 0.6082572557569151, + "grad_norm": 1.5815242528915405, + "learning_rate": 1.5846021258259124e-05, + "loss": 2.4614, + "step": 2139 + }, + { + "epoch": 0.6085416210003732, + "grad_norm": 1.4205738306045532, + "learning_rate": 1.5834530307382938e-05, + "loss": 2.0676, + "step": 2140 + }, + { + "epoch": 0.6088259862438313, + "grad_norm": 1.630874752998352, + "learning_rate": 1.5823039356506753e-05, + "loss": 2.286, + "step": 2141 + }, + { + "epoch": 0.6091103514872894, + "grad_norm": 1.5306339263916016, + "learning_rate": 1.5811548405630567e-05, + "loss": 2.0203, + "step": 2142 + }, + { + "epoch": 0.6093947167307476, + "grad_norm": 1.5277224779129028, + "learning_rate": 1.5800057454754382e-05, + "loss": 1.8091, + "step": 2143 + }, + { + "epoch": 0.6096790819742057, + "grad_norm": 1.5925323963165283, + "learning_rate": 1.5788566503878196e-05, + "loss": 1.7929, + "step": 2144 + }, + { + "epoch": 0.6099634472176638, + "grad_norm": 1.7087624073028564, + "learning_rate": 1.5777075553002014e-05, + "loss": 2.8013, + "step": 2145 + }, + { + "epoch": 0.6102478124611219, + "grad_norm": 1.4798260927200317, + "learning_rate": 1.576558460212583e-05, + "loss": 2.6274, + "step": 2146 + }, + { + "epoch": 0.61053217770458, + "grad_norm": 1.5136479139328003, + "learning_rate": 1.5754093651249643e-05, + "loss": 2.2147, + "step": 2147 + }, + { + "epoch": 0.6108165429480382, + "grad_norm": 1.4526771306991577, + "learning_rate": 1.5742602700373458e-05, + "loss": 2.3316, + "step": 2148 + }, + { + "epoch": 0.6111009081914963, + "grad_norm": 1.671984076499939, + "learning_rate": 1.5731111749497272e-05, + "loss": 2.0726, + "step": 2149 + }, + { + "epoch": 0.6113852734349544, + "grad_norm": 1.5075078010559082, + "learning_rate": 1.5719620798621087e-05, + "loss": 2.0302, + "step": 2150 + }, + { + "epoch": 0.6116696386784125, + "grad_norm": 1.594140648841858, + "learning_rate": 1.57081298477449e-05, + "loss": 1.9607, + "step": 2151 + }, + { + "epoch": 0.6119540039218706, + "grad_norm": 1.7576320171356201, + "learning_rate": 1.5696638896868716e-05, + "loss": 1.9199, + "step": 2152 + }, + { + "epoch": 0.6122383691653288, + "grad_norm": 1.8919973373413086, + "learning_rate": 1.568514794599253e-05, + "loss": 2.8498, + "step": 2153 + }, + { + "epoch": 0.6125227344087869, + "grad_norm": 1.564929723739624, + "learning_rate": 1.5673656995116348e-05, + "loss": 2.2753, + "step": 2154 + }, + { + "epoch": 0.612807099652245, + "grad_norm": 1.453860878944397, + "learning_rate": 1.5662166044240162e-05, + "loss": 2.2765, + "step": 2155 + }, + { + "epoch": 0.6130914648957031, + "grad_norm": 1.5218002796173096, + "learning_rate": 1.5650675093363977e-05, + "loss": 2.1331, + "step": 2156 + }, + { + "epoch": 0.6133758301391612, + "grad_norm": 1.625298261642456, + "learning_rate": 1.563918414248779e-05, + "loss": 2.1687, + "step": 2157 + }, + { + "epoch": 0.6136601953826194, + "grad_norm": 1.6260716915130615, + "learning_rate": 1.562769319161161e-05, + "loss": 1.9181, + "step": 2158 + }, + { + "epoch": 0.6139445606260775, + "grad_norm": 1.7083520889282227, + "learning_rate": 1.5616202240735424e-05, + "loss": 1.9319, + "step": 2159 + }, + { + "epoch": 0.6142289258695356, + "grad_norm": 1.6045297384262085, + "learning_rate": 1.560471128985924e-05, + "loss": 1.6412, + "step": 2160 + }, + { + "epoch": 0.6145132911129937, + "grad_norm": 1.6960153579711914, + "learning_rate": 1.5593220338983053e-05, + "loss": 2.6496, + "step": 2161 + }, + { + "epoch": 0.6147976563564518, + "grad_norm": 1.5479867458343506, + "learning_rate": 1.5581729388106867e-05, + "loss": 2.3291, + "step": 2162 + }, + { + "epoch": 0.61508202159991, + "grad_norm": 1.4145187139511108, + "learning_rate": 1.5570238437230682e-05, + "loss": 2.2698, + "step": 2163 + }, + { + "epoch": 0.6153663868433681, + "grad_norm": 1.6084132194519043, + "learning_rate": 1.5558747486354496e-05, + "loss": 2.3009, + "step": 2164 + }, + { + "epoch": 0.6156507520868262, + "grad_norm": 1.6727561950683594, + "learning_rate": 1.554725653547831e-05, + "loss": 2.2078, + "step": 2165 + }, + { + "epoch": 0.6159351173302843, + "grad_norm": 1.6095800399780273, + "learning_rate": 1.553576558460213e-05, + "loss": 1.9408, + "step": 2166 + }, + { + "epoch": 0.6162194825737424, + "grad_norm": 1.6844741106033325, + "learning_rate": 1.5524274633725943e-05, + "loss": 1.9492, + "step": 2167 + }, + { + "epoch": 0.6165038478172006, + "grad_norm": 1.600579857826233, + "learning_rate": 1.5512783682849758e-05, + "loss": 1.6956, + "step": 2168 + }, + { + "epoch": 0.6167882130606587, + "grad_norm": 1.7465009689331055, + "learning_rate": 1.5501292731973572e-05, + "loss": 2.7124, + "step": 2169 + }, + { + "epoch": 0.6170725783041168, + "grad_norm": 1.466794729232788, + "learning_rate": 1.5489801781097387e-05, + "loss": 2.2909, + "step": 2170 + }, + { + "epoch": 0.6173569435475749, + "grad_norm": 1.493714451789856, + "learning_rate": 1.54783108302212e-05, + "loss": 2.1008, + "step": 2171 + }, + { + "epoch": 0.617641308791033, + "grad_norm": 1.5686396360397339, + "learning_rate": 1.5466819879345016e-05, + "loss": 2.1567, + "step": 2172 + }, + { + "epoch": 0.6179256740344912, + "grad_norm": 1.6050747632980347, + "learning_rate": 1.545532892846883e-05, + "loss": 2.2713, + "step": 2173 + }, + { + "epoch": 0.6182100392779493, + "grad_norm": 1.5931973457336426, + "learning_rate": 1.5443837977592645e-05, + "loss": 1.797, + "step": 2174 + }, + { + "epoch": 0.6184944045214074, + "grad_norm": 1.662131667137146, + "learning_rate": 1.5432347026716463e-05, + "loss": 1.7863, + "step": 2175 + }, + { + "epoch": 0.6187787697648655, + "grad_norm": 1.6877639293670654, + "learning_rate": 1.5420856075840277e-05, + "loss": 1.9392, + "step": 2176 + }, + { + "epoch": 0.6190631350083237, + "grad_norm": 1.8865476846694946, + "learning_rate": 1.540936512496409e-05, + "loss": 2.7452, + "step": 2177 + }, + { + "epoch": 0.6193475002517818, + "grad_norm": 1.5432252883911133, + "learning_rate": 1.539787417408791e-05, + "loss": 2.5342, + "step": 2178 + }, + { + "epoch": 0.6196318654952399, + "grad_norm": 1.513899564743042, + "learning_rate": 1.5386383223211724e-05, + "loss": 2.1821, + "step": 2179 + }, + { + "epoch": 0.619916230738698, + "grad_norm": 1.5674052238464355, + "learning_rate": 1.537489227233554e-05, + "loss": 2.2672, + "step": 2180 + }, + { + "epoch": 0.620200595982156, + "grad_norm": 1.5859272480010986, + "learning_rate": 1.5363401321459353e-05, + "loss": 2.1825, + "step": 2181 + }, + { + "epoch": 0.6204849612256143, + "grad_norm": 1.5028396844863892, + "learning_rate": 1.5351910370583167e-05, + "loss": 1.7984, + "step": 2182 + }, + { + "epoch": 0.6207693264690723, + "grad_norm": 1.5256812572479248, + "learning_rate": 1.5340419419706982e-05, + "loss": 1.8812, + "step": 2183 + }, + { + "epoch": 0.6210536917125304, + "grad_norm": 1.6204417943954468, + "learning_rate": 1.5328928468830796e-05, + "loss": 1.8866, + "step": 2184 + }, + { + "epoch": 0.6213380569559885, + "grad_norm": 1.6233776807785034, + "learning_rate": 1.531743751795461e-05, + "loss": 2.7469, + "step": 2185 + }, + { + "epoch": 0.6216224221994466, + "grad_norm": 1.5075547695159912, + "learning_rate": 1.5305946567078425e-05, + "loss": 2.4263, + "step": 2186 + }, + { + "epoch": 0.6219067874429048, + "grad_norm": 1.5008249282836914, + "learning_rate": 1.5294455616202243e-05, + "loss": 2.1231, + "step": 2187 + }, + { + "epoch": 0.6221911526863629, + "grad_norm": 1.5601019859313965, + "learning_rate": 1.5282964665326058e-05, + "loss": 2.3066, + "step": 2188 + }, + { + "epoch": 0.622475517929821, + "grad_norm": 1.6898422241210938, + "learning_rate": 1.5271473714449872e-05, + "loss": 2.2774, + "step": 2189 + }, + { + "epoch": 0.6227598831732791, + "grad_norm": 1.5225181579589844, + "learning_rate": 1.5259982763573687e-05, + "loss": 2.0043, + "step": 2190 + }, + { + "epoch": 0.6230442484167372, + "grad_norm": 1.565322756767273, + "learning_rate": 1.5248491812697501e-05, + "loss": 1.7469, + "step": 2191 + }, + { + "epoch": 0.6233286136601954, + "grad_norm": 1.6591078042984009, + "learning_rate": 1.5237000861821316e-05, + "loss": 1.4503, + "step": 2192 + }, + { + "epoch": 0.6236129789036535, + "grad_norm": 1.8940781354904175, + "learning_rate": 1.522550991094513e-05, + "loss": 2.7695, + "step": 2193 + }, + { + "epoch": 0.6238973441471116, + "grad_norm": 1.5857594013214111, + "learning_rate": 1.5214018960068946e-05, + "loss": 2.3266, + "step": 2194 + }, + { + "epoch": 0.6241817093905697, + "grad_norm": 1.531599998474121, + "learning_rate": 1.5202528009192763e-05, + "loss": 2.3562, + "step": 2195 + }, + { + "epoch": 0.6244660746340278, + "grad_norm": 1.4814765453338623, + "learning_rate": 1.5191037058316577e-05, + "loss": 2.1034, + "step": 2196 + }, + { + "epoch": 0.624750439877486, + "grad_norm": 1.5699845552444458, + "learning_rate": 1.5179546107440393e-05, + "loss": 2.18, + "step": 2197 + }, + { + "epoch": 0.6250348051209441, + "grad_norm": 1.572761058807373, + "learning_rate": 1.5168055156564208e-05, + "loss": 1.9689, + "step": 2198 + }, + { + "epoch": 0.6253191703644022, + "grad_norm": 1.5882624387741089, + "learning_rate": 1.5156564205688022e-05, + "loss": 1.9178, + "step": 2199 + }, + { + "epoch": 0.6256035356078603, + "grad_norm": 1.979588508605957, + "learning_rate": 1.5145073254811837e-05, + "loss": 1.8587, + "step": 2200 + }, + { + "epoch": 0.6258879008513184, + "grad_norm": 1.7905445098876953, + "learning_rate": 1.5133582303935653e-05, + "loss": 2.6646, + "step": 2201 + }, + { + "epoch": 0.6261722660947766, + "grad_norm": 1.5020902156829834, + "learning_rate": 1.5122091353059468e-05, + "loss": 2.4876, + "step": 2202 + }, + { + "epoch": 0.6264566313382347, + "grad_norm": 1.551763653755188, + "learning_rate": 1.5110600402183282e-05, + "loss": 2.347, + "step": 2203 + }, + { + "epoch": 0.6267409965816928, + "grad_norm": 1.5747159719467163, + "learning_rate": 1.5099109451307097e-05, + "loss": 2.3735, + "step": 2204 + }, + { + "epoch": 0.6270253618251509, + "grad_norm": 1.567586064338684, + "learning_rate": 1.5087618500430911e-05, + "loss": 2.1383, + "step": 2205 + }, + { + "epoch": 0.627309727068609, + "grad_norm": 1.5105654001235962, + "learning_rate": 1.5076127549554727e-05, + "loss": 1.7974, + "step": 2206 + }, + { + "epoch": 0.6275940923120672, + "grad_norm": 1.600792646408081, + "learning_rate": 1.5064636598678542e-05, + "loss": 1.8991, + "step": 2207 + }, + { + "epoch": 0.6278784575555253, + "grad_norm": 1.6282237768173218, + "learning_rate": 1.5053145647802356e-05, + "loss": 1.7965, + "step": 2208 + }, + { + "epoch": 0.6281628227989834, + "grad_norm": 1.7487695217132568, + "learning_rate": 1.504165469692617e-05, + "loss": 2.6253, + "step": 2209 + }, + { + "epoch": 0.6284471880424415, + "grad_norm": 1.627800703048706, + "learning_rate": 1.5030163746049987e-05, + "loss": 2.3377, + "step": 2210 + }, + { + "epoch": 0.6287315532858996, + "grad_norm": 1.490409255027771, + "learning_rate": 1.5018672795173801e-05, + "loss": 2.1716, + "step": 2211 + }, + { + "epoch": 0.6290159185293578, + "grad_norm": 1.5944234132766724, + "learning_rate": 1.5007181844297616e-05, + "loss": 2.2221, + "step": 2212 + }, + { + "epoch": 0.6293002837728159, + "grad_norm": 1.5915591716766357, + "learning_rate": 1.4995690893421432e-05, + "loss": 2.2536, + "step": 2213 + }, + { + "epoch": 0.629584649016274, + "grad_norm": 1.5468132495880127, + "learning_rate": 1.4984199942545248e-05, + "loss": 2.0741, + "step": 2214 + }, + { + "epoch": 0.6298690142597321, + "grad_norm": 1.5128260850906372, + "learning_rate": 1.4972708991669063e-05, + "loss": 1.7709, + "step": 2215 + }, + { + "epoch": 0.6301533795031902, + "grad_norm": 1.6346701383590698, + "learning_rate": 1.4961218040792877e-05, + "loss": 1.7228, + "step": 2216 + }, + { + "epoch": 0.6304377447466484, + "grad_norm": 1.812436580657959, + "learning_rate": 1.4949727089916692e-05, + "loss": 2.7526, + "step": 2217 + }, + { + "epoch": 0.6307221099901065, + "grad_norm": 1.562059998512268, + "learning_rate": 1.4938236139040508e-05, + "loss": 2.4138, + "step": 2218 + }, + { + "epoch": 0.6310064752335646, + "grad_norm": 1.6043922901153564, + "learning_rate": 1.4926745188164322e-05, + "loss": 2.2918, + "step": 2219 + }, + { + "epoch": 0.6312908404770227, + "grad_norm": 1.5691092014312744, + "learning_rate": 1.4915254237288137e-05, + "loss": 2.2282, + "step": 2220 + }, + { + "epoch": 0.6315752057204808, + "grad_norm": 1.5854427814483643, + "learning_rate": 1.4903763286411951e-05, + "loss": 2.2005, + "step": 2221 + }, + { + "epoch": 0.631859570963939, + "grad_norm": 1.552298903465271, + "learning_rate": 1.4892272335535766e-05, + "loss": 1.7499, + "step": 2222 + }, + { + "epoch": 0.6321439362073971, + "grad_norm": 1.7358125448226929, + "learning_rate": 1.4880781384659582e-05, + "loss": 1.886, + "step": 2223 + }, + { + "epoch": 0.6324283014508552, + "grad_norm": 1.6180943250656128, + "learning_rate": 1.4869290433783397e-05, + "loss": 1.7801, + "step": 2224 + }, + { + "epoch": 0.6327126666943133, + "grad_norm": 1.9027990102767944, + "learning_rate": 1.4857799482907211e-05, + "loss": 2.6241, + "step": 2225 + }, + { + "epoch": 0.6329970319377715, + "grad_norm": 1.5672250986099243, + "learning_rate": 1.4846308532031026e-05, + "loss": 2.4785, + "step": 2226 + }, + { + "epoch": 0.6332813971812296, + "grad_norm": 1.5182520151138306, + "learning_rate": 1.4834817581154842e-05, + "loss": 2.0982, + "step": 2227 + }, + { + "epoch": 0.6335657624246877, + "grad_norm": 1.4269053936004639, + "learning_rate": 1.4823326630278656e-05, + "loss": 2.3107, + "step": 2228 + }, + { + "epoch": 0.6338501276681457, + "grad_norm": 1.6807674169540405, + "learning_rate": 1.481183567940247e-05, + "loss": 2.0994, + "step": 2229 + }, + { + "epoch": 0.6341344929116038, + "grad_norm": 1.5306531190872192, + "learning_rate": 1.4800344728526285e-05, + "loss": 1.8668, + "step": 2230 + }, + { + "epoch": 0.634418858155062, + "grad_norm": 1.683131456375122, + "learning_rate": 1.47888537776501e-05, + "loss": 1.7841, + "step": 2231 + }, + { + "epoch": 0.6347032233985201, + "grad_norm": 1.6257297992706299, + "learning_rate": 1.4777362826773918e-05, + "loss": 1.7767, + "step": 2232 + }, + { + "epoch": 0.6349875886419782, + "grad_norm": 1.8746527433395386, + "learning_rate": 1.4765871875897732e-05, + "loss": 2.8278, + "step": 2233 + }, + { + "epoch": 0.6352719538854363, + "grad_norm": 1.5843089818954468, + "learning_rate": 1.4754380925021547e-05, + "loss": 2.3801, + "step": 2234 + }, + { + "epoch": 0.6355563191288944, + "grad_norm": 1.465401291847229, + "learning_rate": 1.4742889974145363e-05, + "loss": 2.0188, + "step": 2235 + }, + { + "epoch": 0.6358406843723526, + "grad_norm": 1.4717556238174438, + "learning_rate": 1.4731399023269177e-05, + "loss": 2.1638, + "step": 2236 + }, + { + "epoch": 0.6361250496158107, + "grad_norm": 1.733504056930542, + "learning_rate": 1.4719908072392992e-05, + "loss": 2.2512, + "step": 2237 + }, + { + "epoch": 0.6364094148592688, + "grad_norm": 1.602497935295105, + "learning_rate": 1.4708417121516806e-05, + "loss": 1.9198, + "step": 2238 + }, + { + "epoch": 0.6366937801027269, + "grad_norm": 1.6287175416946411, + "learning_rate": 1.4696926170640623e-05, + "loss": 1.8728, + "step": 2239 + }, + { + "epoch": 0.636978145346185, + "grad_norm": 1.640389084815979, + "learning_rate": 1.4685435219764437e-05, + "loss": 1.6453, + "step": 2240 + }, + { + "epoch": 0.6372625105896432, + "grad_norm": 1.8420612812042236, + "learning_rate": 1.4673944268888252e-05, + "loss": 2.6964, + "step": 2241 + }, + { + "epoch": 0.6375468758331013, + "grad_norm": 1.4961106777191162, + "learning_rate": 1.4662453318012066e-05, + "loss": 2.2855, + "step": 2242 + }, + { + "epoch": 0.6378312410765594, + "grad_norm": 1.5498943328857422, + "learning_rate": 1.465096236713588e-05, + "loss": 2.3792, + "step": 2243 + }, + { + "epoch": 0.6381156063200175, + "grad_norm": 1.5839142799377441, + "learning_rate": 1.4639471416259697e-05, + "loss": 2.2614, + "step": 2244 + }, + { + "epoch": 0.6383999715634756, + "grad_norm": 1.598946452140808, + "learning_rate": 1.4627980465383511e-05, + "loss": 2.2075, + "step": 2245 + }, + { + "epoch": 0.6386843368069338, + "grad_norm": 1.521291732788086, + "learning_rate": 1.4616489514507326e-05, + "loss": 2.1571, + "step": 2246 + }, + { + "epoch": 0.6389687020503919, + "grad_norm": 1.5280981063842773, + "learning_rate": 1.460499856363114e-05, + "loss": 1.6476, + "step": 2247 + }, + { + "epoch": 0.63925306729385, + "grad_norm": 1.6235344409942627, + "learning_rate": 1.4593507612754956e-05, + "loss": 1.8018, + "step": 2248 + }, + { + "epoch": 0.6395374325373081, + "grad_norm": 1.906637191772461, + "learning_rate": 1.4582016661878771e-05, + "loss": 2.8116, + "step": 2249 + }, + { + "epoch": 0.6398217977807662, + "grad_norm": 1.4746451377868652, + "learning_rate": 1.4570525711002587e-05, + "loss": 2.4155, + "step": 2250 + }, + { + "epoch": 0.6401061630242244, + "grad_norm": 1.536787748336792, + "learning_rate": 1.4559034760126403e-05, + "loss": 2.2476, + "step": 2251 + }, + { + "epoch": 0.6403905282676825, + "grad_norm": 1.4829517602920532, + "learning_rate": 1.4547543809250218e-05, + "loss": 2.1656, + "step": 2252 + }, + { + "epoch": 0.6406748935111406, + "grad_norm": 1.6278111934661865, + "learning_rate": 1.4536052858374032e-05, + "loss": 2.025, + "step": 2253 + }, + { + "epoch": 0.6409592587545987, + "grad_norm": 1.5534253120422363, + "learning_rate": 1.4524561907497847e-05, + "loss": 1.9709, + "step": 2254 + }, + { + "epoch": 0.6412436239980568, + "grad_norm": 1.5600004196166992, + "learning_rate": 1.4513070956621661e-05, + "loss": 1.8011, + "step": 2255 + }, + { + "epoch": 0.641527989241515, + "grad_norm": 1.7009464502334595, + "learning_rate": 1.4501580005745477e-05, + "loss": 1.7419, + "step": 2256 + }, + { + "epoch": 0.6418123544849731, + "grad_norm": 1.6545665264129639, + "learning_rate": 1.4490089054869292e-05, + "loss": 2.6367, + "step": 2257 + }, + { + "epoch": 0.6420967197284312, + "grad_norm": 1.542686939239502, + "learning_rate": 1.4478598103993106e-05, + "loss": 2.5222, + "step": 2258 + }, + { + "epoch": 0.6423810849718893, + "grad_norm": 1.5785605907440186, + "learning_rate": 1.4467107153116921e-05, + "loss": 2.2131, + "step": 2259 + }, + { + "epoch": 0.6426654502153474, + "grad_norm": 1.6065642833709717, + "learning_rate": 1.4455616202240737e-05, + "loss": 2.1774, + "step": 2260 + }, + { + "epoch": 0.6429498154588056, + "grad_norm": 1.5818967819213867, + "learning_rate": 1.4444125251364552e-05, + "loss": 2.0904, + "step": 2261 + }, + { + "epoch": 0.6432341807022637, + "grad_norm": 1.6355520486831665, + "learning_rate": 1.4432634300488366e-05, + "loss": 1.8927, + "step": 2262 + }, + { + "epoch": 0.6435185459457218, + "grad_norm": 1.528612732887268, + "learning_rate": 1.442114334961218e-05, + "loss": 1.8309, + "step": 2263 + }, + { + "epoch": 0.6438029111891799, + "grad_norm": 1.5770155191421509, + "learning_rate": 1.4409652398735995e-05, + "loss": 1.6619, + "step": 2264 + }, + { + "epoch": 0.644087276432638, + "grad_norm": 1.7672600746154785, + "learning_rate": 1.4398161447859811e-05, + "loss": 2.5805, + "step": 2265 + }, + { + "epoch": 0.6443716416760962, + "grad_norm": 1.5915486812591553, + "learning_rate": 1.4386670496983626e-05, + "loss": 2.2712, + "step": 2266 + }, + { + "epoch": 0.6446560069195543, + "grad_norm": 1.3617949485778809, + "learning_rate": 1.437517954610744e-05, + "loss": 2.1554, + "step": 2267 + }, + { + "epoch": 0.6449403721630124, + "grad_norm": 1.487271785736084, + "learning_rate": 1.4363688595231255e-05, + "loss": 2.3376, + "step": 2268 + }, + { + "epoch": 0.6452247374064705, + "grad_norm": 1.9369791746139526, + "learning_rate": 1.4352197644355073e-05, + "loss": 2.1762, + "step": 2269 + }, + { + "epoch": 0.6455091026499287, + "grad_norm": 1.513728380203247, + "learning_rate": 1.4340706693478887e-05, + "loss": 1.8318, + "step": 2270 + }, + { + "epoch": 0.6457934678933868, + "grad_norm": 1.602360725402832, + "learning_rate": 1.4329215742602702e-05, + "loss": 1.8081, + "step": 2271 + }, + { + "epoch": 0.6460778331368449, + "grad_norm": 1.706312656402588, + "learning_rate": 1.4317724791726518e-05, + "loss": 1.8961, + "step": 2272 + }, + { + "epoch": 0.646362198380303, + "grad_norm": 1.8679949045181274, + "learning_rate": 1.4306233840850332e-05, + "loss": 2.7414, + "step": 2273 + }, + { + "epoch": 0.646646563623761, + "grad_norm": 1.5807439088821411, + "learning_rate": 1.4294742889974147e-05, + "loss": 2.4749, + "step": 2274 + }, + { + "epoch": 0.6469309288672193, + "grad_norm": 1.6467163562774658, + "learning_rate": 1.4283251939097961e-05, + "loss": 2.4233, + "step": 2275 + }, + { + "epoch": 0.6472152941106774, + "grad_norm": 1.611040711402893, + "learning_rate": 1.4271760988221776e-05, + "loss": 2.148, + "step": 2276 + }, + { + "epoch": 0.6474996593541354, + "grad_norm": 1.6380242109298706, + "learning_rate": 1.4260270037345592e-05, + "loss": 2.1739, + "step": 2277 + }, + { + "epoch": 0.6477840245975935, + "grad_norm": 1.6218241453170776, + "learning_rate": 1.4248779086469407e-05, + "loss": 2.0744, + "step": 2278 + }, + { + "epoch": 0.6480683898410516, + "grad_norm": 1.4787720441818237, + "learning_rate": 1.4237288135593221e-05, + "loss": 1.6869, + "step": 2279 + }, + { + "epoch": 0.6483527550845098, + "grad_norm": 1.682263731956482, + "learning_rate": 1.4225797184717036e-05, + "loss": 1.7572, + "step": 2280 + }, + { + "epoch": 0.6486371203279679, + "grad_norm": 1.9067274332046509, + "learning_rate": 1.4214306233840852e-05, + "loss": 2.7824, + "step": 2281 + }, + { + "epoch": 0.648921485571426, + "grad_norm": 1.4960787296295166, + "learning_rate": 1.4202815282964666e-05, + "loss": 2.3718, + "step": 2282 + }, + { + "epoch": 0.6492058508148841, + "grad_norm": 1.5059878826141357, + "learning_rate": 1.419132433208848e-05, + "loss": 2.1535, + "step": 2283 + }, + { + "epoch": 0.6494902160583422, + "grad_norm": 1.685180902481079, + "learning_rate": 1.4179833381212295e-05, + "loss": 2.2137, + "step": 2284 + }, + { + "epoch": 0.6497745813018004, + "grad_norm": 1.5441550016403198, + "learning_rate": 1.416834243033611e-05, + "loss": 2.3031, + "step": 2285 + }, + { + "epoch": 0.6500589465452585, + "grad_norm": 1.5095455646514893, + "learning_rate": 1.4156851479459926e-05, + "loss": 1.8482, + "step": 2286 + }, + { + "epoch": 0.6503433117887166, + "grad_norm": 1.5256438255310059, + "learning_rate": 1.4145360528583742e-05, + "loss": 1.7711, + "step": 2287 + }, + { + "epoch": 0.6506276770321747, + "grad_norm": 1.6489698886871338, + "learning_rate": 1.4133869577707557e-05, + "loss": 1.64, + "step": 2288 + }, + { + "epoch": 0.6509120422756328, + "grad_norm": 1.7157350778579712, + "learning_rate": 1.4122378626831373e-05, + "loss": 2.8341, + "step": 2289 + }, + { + "epoch": 0.651196407519091, + "grad_norm": 1.5686113834381104, + "learning_rate": 1.4110887675955187e-05, + "loss": 2.6388, + "step": 2290 + }, + { + "epoch": 0.6514807727625491, + "grad_norm": 1.576197624206543, + "learning_rate": 1.4099396725079002e-05, + "loss": 2.2588, + "step": 2291 + }, + { + "epoch": 0.6517651380060072, + "grad_norm": 1.5347895622253418, + "learning_rate": 1.4087905774202816e-05, + "loss": 2.1164, + "step": 2292 + }, + { + "epoch": 0.6520495032494653, + "grad_norm": 1.6401748657226562, + "learning_rate": 1.4076414823326633e-05, + "loss": 2.1258, + "step": 2293 + }, + { + "epoch": 0.6523338684929234, + "grad_norm": 1.5072190761566162, + "learning_rate": 1.4064923872450447e-05, + "loss": 1.9321, + "step": 2294 + }, + { + "epoch": 0.6526182337363816, + "grad_norm": 1.5900171995162964, + "learning_rate": 1.4053432921574262e-05, + "loss": 1.8916, + "step": 2295 + }, + { + "epoch": 0.6529025989798397, + "grad_norm": 1.6210306882858276, + "learning_rate": 1.4041941970698076e-05, + "loss": 1.6101, + "step": 2296 + }, + { + "epoch": 0.6531869642232978, + "grad_norm": 1.8477280139923096, + "learning_rate": 1.403045101982189e-05, + "loss": 2.6045, + "step": 2297 + }, + { + "epoch": 0.6534713294667559, + "grad_norm": 1.5514870882034302, + "learning_rate": 1.4018960068945707e-05, + "loss": 2.4303, + "step": 2298 + }, + { + "epoch": 0.653755694710214, + "grad_norm": 1.5255701541900635, + "learning_rate": 1.4007469118069521e-05, + "loss": 2.3312, + "step": 2299 + }, + { + "epoch": 0.6540400599536722, + "grad_norm": 1.5493617057800293, + "learning_rate": 1.3995978167193336e-05, + "loss": 2.3112, + "step": 2300 + }, + { + "epoch": 0.6543244251971303, + "grad_norm": 1.477663278579712, + "learning_rate": 1.398448721631715e-05, + "loss": 2.0823, + "step": 2301 + }, + { + "epoch": 0.6546087904405884, + "grad_norm": 1.534895420074463, + "learning_rate": 1.3972996265440966e-05, + "loss": 1.9711, + "step": 2302 + }, + { + "epoch": 0.6548931556840465, + "grad_norm": 1.559821605682373, + "learning_rate": 1.3961505314564781e-05, + "loss": 1.9114, + "step": 2303 + }, + { + "epoch": 0.6551775209275046, + "grad_norm": 1.5802251100540161, + "learning_rate": 1.3950014363688595e-05, + "loss": 1.7692, + "step": 2304 + }, + { + "epoch": 0.6554618861709628, + "grad_norm": 1.8919142484664917, + "learning_rate": 1.393852341281241e-05, + "loss": 2.8514, + "step": 2305 + }, + { + "epoch": 0.6557462514144209, + "grad_norm": 1.5178377628326416, + "learning_rate": 1.3927032461936228e-05, + "loss": 2.2836, + "step": 2306 + }, + { + "epoch": 0.656030616657879, + "grad_norm": 1.4845335483551025, + "learning_rate": 1.3915541511060042e-05, + "loss": 2.4409, + "step": 2307 + }, + { + "epoch": 0.6563149819013371, + "grad_norm": 1.55623459815979, + "learning_rate": 1.3904050560183857e-05, + "loss": 2.245, + "step": 2308 + }, + { + "epoch": 0.6565993471447952, + "grad_norm": 1.5961564779281616, + "learning_rate": 1.3892559609307671e-05, + "loss": 2.1436, + "step": 2309 + }, + { + "epoch": 0.6568837123882534, + "grad_norm": 1.593917727470398, + "learning_rate": 1.3881068658431487e-05, + "loss": 2.1096, + "step": 2310 + }, + { + "epoch": 0.6571680776317115, + "grad_norm": 1.4851388931274414, + "learning_rate": 1.3869577707555302e-05, + "loss": 1.7558, + "step": 2311 + }, + { + "epoch": 0.6574524428751696, + "grad_norm": 1.7430486679077148, + "learning_rate": 1.3858086756679116e-05, + "loss": 1.7607, + "step": 2312 + }, + { + "epoch": 0.6577368081186277, + "grad_norm": 1.8176724910736084, + "learning_rate": 1.3846595805802931e-05, + "loss": 2.7509, + "step": 2313 + }, + { + "epoch": 0.6580211733620858, + "grad_norm": 1.513588309288025, + "learning_rate": 1.3835104854926747e-05, + "loss": 2.405, + "step": 2314 + }, + { + "epoch": 0.658305538605544, + "grad_norm": 1.621985912322998, + "learning_rate": 1.3823613904050562e-05, + "loss": 2.3489, + "step": 2315 + }, + { + "epoch": 0.6585899038490021, + "grad_norm": 1.4367046356201172, + "learning_rate": 1.3812122953174376e-05, + "loss": 2.2808, + "step": 2316 + }, + { + "epoch": 0.6588742690924602, + "grad_norm": 1.577368974685669, + "learning_rate": 1.380063200229819e-05, + "loss": 2.1214, + "step": 2317 + }, + { + "epoch": 0.6591586343359183, + "grad_norm": 1.4581912755966187, + "learning_rate": 1.3789141051422005e-05, + "loss": 2.0159, + "step": 2318 + }, + { + "epoch": 0.6594429995793765, + "grad_norm": 1.500917911529541, + "learning_rate": 1.3777650100545821e-05, + "loss": 1.7401, + "step": 2319 + }, + { + "epoch": 0.6597273648228346, + "grad_norm": 1.6946115493774414, + "learning_rate": 1.3766159149669636e-05, + "loss": 1.9459, + "step": 2320 + }, + { + "epoch": 0.6600117300662927, + "grad_norm": 1.7423439025878906, + "learning_rate": 1.375466819879345e-05, + "loss": 2.8295, + "step": 2321 + }, + { + "epoch": 0.6602960953097508, + "grad_norm": 1.5395139455795288, + "learning_rate": 1.3743177247917265e-05, + "loss": 2.4043, + "step": 2322 + }, + { + "epoch": 0.6605804605532088, + "grad_norm": 1.6088899374008179, + "learning_rate": 1.3731686297041081e-05, + "loss": 2.396, + "step": 2323 + }, + { + "epoch": 0.660864825796667, + "grad_norm": 1.542049765586853, + "learning_rate": 1.3720195346164897e-05, + "loss": 2.2012, + "step": 2324 + }, + { + "epoch": 0.6611491910401252, + "grad_norm": 1.5389901399612427, + "learning_rate": 1.3708704395288712e-05, + "loss": 2.02, + "step": 2325 + }, + { + "epoch": 0.6614335562835832, + "grad_norm": 1.6238247156143188, + "learning_rate": 1.3697213444412528e-05, + "loss": 1.9875, + "step": 2326 + }, + { + "epoch": 0.6617179215270413, + "grad_norm": 1.5157362222671509, + "learning_rate": 1.3685722493536342e-05, + "loss": 1.7811, + "step": 2327 + }, + { + "epoch": 0.6620022867704994, + "grad_norm": 1.6152019500732422, + "learning_rate": 1.3674231542660157e-05, + "loss": 1.7777, + "step": 2328 + }, + { + "epoch": 0.6622866520139576, + "grad_norm": 1.740449070930481, + "learning_rate": 1.3662740591783971e-05, + "loss": 2.7021, + "step": 2329 + }, + { + "epoch": 0.6625710172574157, + "grad_norm": 1.5307239294052124, + "learning_rate": 1.3651249640907786e-05, + "loss": 2.3465, + "step": 2330 + }, + { + "epoch": 0.6628553825008738, + "grad_norm": 1.5852240324020386, + "learning_rate": 1.3639758690031602e-05, + "loss": 2.477, + "step": 2331 + }, + { + "epoch": 0.6631397477443319, + "grad_norm": 1.6011126041412354, + "learning_rate": 1.3628267739155417e-05, + "loss": 2.2583, + "step": 2332 + }, + { + "epoch": 0.66342411298779, + "grad_norm": 1.549425482749939, + "learning_rate": 1.3616776788279231e-05, + "loss": 2.2773, + "step": 2333 + }, + { + "epoch": 0.6637084782312482, + "grad_norm": 1.449255347251892, + "learning_rate": 1.3605285837403046e-05, + "loss": 1.7077, + "step": 2334 + }, + { + "epoch": 0.6639928434747063, + "grad_norm": 1.624582290649414, + "learning_rate": 1.3593794886526862e-05, + "loss": 1.7985, + "step": 2335 + }, + { + "epoch": 0.6642772087181644, + "grad_norm": 1.6954373121261597, + "learning_rate": 1.3582303935650676e-05, + "loss": 1.7301, + "step": 2336 + }, + { + "epoch": 0.6645615739616225, + "grad_norm": 1.8019691705703735, + "learning_rate": 1.357081298477449e-05, + "loss": 2.6443, + "step": 2337 + }, + { + "epoch": 0.6648459392050806, + "grad_norm": 1.6157490015029907, + "learning_rate": 1.3559322033898305e-05, + "loss": 2.248, + "step": 2338 + }, + { + "epoch": 0.6651303044485388, + "grad_norm": 1.6532044410705566, + "learning_rate": 1.354783108302212e-05, + "loss": 2.4353, + "step": 2339 + }, + { + "epoch": 0.6654146696919969, + "grad_norm": 1.4396045207977295, + "learning_rate": 1.3536340132145936e-05, + "loss": 2.1747, + "step": 2340 + }, + { + "epoch": 0.665699034935455, + "grad_norm": 1.4797991514205933, + "learning_rate": 1.352484918126975e-05, + "loss": 2.0431, + "step": 2341 + }, + { + "epoch": 0.6659834001789131, + "grad_norm": 1.3988265991210938, + "learning_rate": 1.3513358230393565e-05, + "loss": 1.858, + "step": 2342 + }, + { + "epoch": 0.6662677654223712, + "grad_norm": 1.6098911762237549, + "learning_rate": 1.3501867279517383e-05, + "loss": 1.735, + "step": 2343 + }, + { + "epoch": 0.6665521306658294, + "grad_norm": 1.6491178274154663, + "learning_rate": 1.3490376328641197e-05, + "loss": 1.6631, + "step": 2344 + }, + { + "epoch": 0.6668364959092875, + "grad_norm": 1.7857693433761597, + "learning_rate": 1.3478885377765012e-05, + "loss": 2.6628, + "step": 2345 + }, + { + "epoch": 0.6671208611527456, + "grad_norm": 1.5431671142578125, + "learning_rate": 1.3467394426888826e-05, + "loss": 2.4461, + "step": 2346 + }, + { + "epoch": 0.6674052263962037, + "grad_norm": 1.5646437406539917, + "learning_rate": 1.3455903476012642e-05, + "loss": 2.3102, + "step": 2347 + }, + { + "epoch": 0.6676895916396618, + "grad_norm": 1.4588571786880493, + "learning_rate": 1.3444412525136457e-05, + "loss": 2.3354, + "step": 2348 + }, + { + "epoch": 0.66797395688312, + "grad_norm": 1.5933592319488525, + "learning_rate": 1.3432921574260271e-05, + "loss": 2.2924, + "step": 2349 + }, + { + "epoch": 0.6682583221265781, + "grad_norm": 1.7044545412063599, + "learning_rate": 1.3421430623384086e-05, + "loss": 1.9148, + "step": 2350 + }, + { + "epoch": 0.6685426873700362, + "grad_norm": 1.6366909742355347, + "learning_rate": 1.34099396725079e-05, + "loss": 1.6891, + "step": 2351 + }, + { + "epoch": 0.6688270526134943, + "grad_norm": 1.6043225526809692, + "learning_rate": 1.3398448721631717e-05, + "loss": 1.895, + "step": 2352 + }, + { + "epoch": 0.6691114178569524, + "grad_norm": 1.8012562990188599, + "learning_rate": 1.3386957770755531e-05, + "loss": 2.7992, + "step": 2353 + }, + { + "epoch": 0.6693957831004106, + "grad_norm": 1.4551502466201782, + "learning_rate": 1.3375466819879346e-05, + "loss": 2.3551, + "step": 2354 + }, + { + "epoch": 0.6696801483438687, + "grad_norm": 1.5812779664993286, + "learning_rate": 1.336397586900316e-05, + "loss": 2.2085, + "step": 2355 + }, + { + "epoch": 0.6699645135873268, + "grad_norm": 1.529133677482605, + "learning_rate": 1.3352484918126976e-05, + "loss": 2.4194, + "step": 2356 + }, + { + "epoch": 0.6702488788307849, + "grad_norm": 1.595837116241455, + "learning_rate": 1.3340993967250791e-05, + "loss": 2.1076, + "step": 2357 + }, + { + "epoch": 0.670533244074243, + "grad_norm": 1.6244161128997803, + "learning_rate": 1.3329503016374605e-05, + "loss": 1.8285, + "step": 2358 + }, + { + "epoch": 0.6708176093177012, + "grad_norm": 1.5791698694229126, + "learning_rate": 1.331801206549842e-05, + "loss": 1.8041, + "step": 2359 + }, + { + "epoch": 0.6711019745611593, + "grad_norm": 1.7883336544036865, + "learning_rate": 1.3306521114622234e-05, + "loss": 1.7612, + "step": 2360 + }, + { + "epoch": 0.6713863398046174, + "grad_norm": 1.7180142402648926, + "learning_rate": 1.3295030163746052e-05, + "loss": 2.9109, + "step": 2361 + }, + { + "epoch": 0.6716707050480755, + "grad_norm": 1.4666606187820435, + "learning_rate": 1.3283539212869867e-05, + "loss": 2.3537, + "step": 2362 + }, + { + "epoch": 0.6719550702915336, + "grad_norm": 1.450372338294983, + "learning_rate": 1.3272048261993681e-05, + "loss": 2.4139, + "step": 2363 + }, + { + "epoch": 0.6722394355349918, + "grad_norm": 1.5204261541366577, + "learning_rate": 1.3260557311117497e-05, + "loss": 2.2118, + "step": 2364 + }, + { + "epoch": 0.6725238007784499, + "grad_norm": 1.5298410654067993, + "learning_rate": 1.3249066360241312e-05, + "loss": 2.2147, + "step": 2365 + }, + { + "epoch": 0.672808166021908, + "grad_norm": 1.5501554012298584, + "learning_rate": 1.3237575409365126e-05, + "loss": 1.9778, + "step": 2366 + }, + { + "epoch": 0.6730925312653661, + "grad_norm": 1.5184673070907593, + "learning_rate": 1.3226084458488941e-05, + "loss": 1.7726, + "step": 2367 + }, + { + "epoch": 0.6733768965088243, + "grad_norm": 1.6755380630493164, + "learning_rate": 1.3214593507612757e-05, + "loss": 1.7409, + "step": 2368 + }, + { + "epoch": 0.6736612617522824, + "grad_norm": 1.799485683441162, + "learning_rate": 1.3203102556736572e-05, + "loss": 2.698, + "step": 2369 + }, + { + "epoch": 0.6739456269957405, + "grad_norm": 1.6598178148269653, + "learning_rate": 1.3191611605860386e-05, + "loss": 2.3039, + "step": 2370 + }, + { + "epoch": 0.6742299922391986, + "grad_norm": 1.4246594905853271, + "learning_rate": 1.31801206549842e-05, + "loss": 2.1442, + "step": 2371 + }, + { + "epoch": 0.6745143574826566, + "grad_norm": 1.5940912961959839, + "learning_rate": 1.3168629704108015e-05, + "loss": 2.3078, + "step": 2372 + }, + { + "epoch": 0.6747987227261149, + "grad_norm": 1.6379321813583374, + "learning_rate": 1.3157138753231831e-05, + "loss": 2.1717, + "step": 2373 + }, + { + "epoch": 0.675083087969573, + "grad_norm": 1.4846510887145996, + "learning_rate": 1.3145647802355646e-05, + "loss": 1.9535, + "step": 2374 + }, + { + "epoch": 0.675367453213031, + "grad_norm": 1.5312621593475342, + "learning_rate": 1.313415685147946e-05, + "loss": 1.7979, + "step": 2375 + }, + { + "epoch": 0.6756518184564891, + "grad_norm": 1.6713464260101318, + "learning_rate": 1.3122665900603275e-05, + "loss": 1.9716, + "step": 2376 + }, + { + "epoch": 0.6759361836999472, + "grad_norm": 1.6956110000610352, + "learning_rate": 1.3111174949727091e-05, + "loss": 2.5694, + "step": 2377 + }, + { + "epoch": 0.6762205489434054, + "grad_norm": 1.5432319641113281, + "learning_rate": 1.3099683998850905e-05, + "loss": 2.3722, + "step": 2378 + }, + { + "epoch": 0.6765049141868635, + "grad_norm": 1.5072269439697266, + "learning_rate": 1.308819304797472e-05, + "loss": 2.347, + "step": 2379 + }, + { + "epoch": 0.6767892794303216, + "grad_norm": 1.498325228691101, + "learning_rate": 1.3076702097098536e-05, + "loss": 2.2973, + "step": 2380 + }, + { + "epoch": 0.6770736446737797, + "grad_norm": 1.5500154495239258, + "learning_rate": 1.3065211146222352e-05, + "loss": 2.1524, + "step": 2381 + }, + { + "epoch": 0.6773580099172378, + "grad_norm": 1.6011747121810913, + "learning_rate": 1.3053720195346167e-05, + "loss": 1.7051, + "step": 2382 + }, + { + "epoch": 0.677642375160696, + "grad_norm": 1.5794252157211304, + "learning_rate": 1.3042229244469981e-05, + "loss": 1.6992, + "step": 2383 + }, + { + "epoch": 0.6779267404041541, + "grad_norm": 1.5260065793991089, + "learning_rate": 1.3030738293593796e-05, + "loss": 1.7597, + "step": 2384 + }, + { + "epoch": 0.6782111056476122, + "grad_norm": 1.744638204574585, + "learning_rate": 1.3019247342717612e-05, + "loss": 2.9247, + "step": 2385 + }, + { + "epoch": 0.6784954708910703, + "grad_norm": 1.6044799089431763, + "learning_rate": 1.3007756391841427e-05, + "loss": 2.3649, + "step": 2386 + }, + { + "epoch": 0.6787798361345284, + "grad_norm": 1.4609475135803223, + "learning_rate": 1.2996265440965241e-05, + "loss": 2.2682, + "step": 2387 + }, + { + "epoch": 0.6790642013779866, + "grad_norm": 1.4124441146850586, + "learning_rate": 1.2984774490089056e-05, + "loss": 2.2928, + "step": 2388 + }, + { + "epoch": 0.6793485666214447, + "grad_norm": 1.5999109745025635, + "learning_rate": 1.297328353921287e-05, + "loss": 2.0884, + "step": 2389 + }, + { + "epoch": 0.6796329318649028, + "grad_norm": 1.541373372077942, + "learning_rate": 1.2961792588336686e-05, + "loss": 1.8306, + "step": 2390 + }, + { + "epoch": 0.6799172971083609, + "grad_norm": 1.6004866361618042, + "learning_rate": 1.29503016374605e-05, + "loss": 1.7566, + "step": 2391 + }, + { + "epoch": 0.680201662351819, + "grad_norm": 1.6891027688980103, + "learning_rate": 1.2938810686584315e-05, + "loss": 1.5669, + "step": 2392 + }, + { + "epoch": 0.6804860275952772, + "grad_norm": 1.9808518886566162, + "learning_rate": 1.292731973570813e-05, + "loss": 2.8488, + "step": 2393 + }, + { + "epoch": 0.6807703928387353, + "grad_norm": 1.5064984560012817, + "learning_rate": 1.2915828784831946e-05, + "loss": 2.449, + "step": 2394 + }, + { + "epoch": 0.6810547580821934, + "grad_norm": 1.4997645616531372, + "learning_rate": 1.290433783395576e-05, + "loss": 2.2506, + "step": 2395 + }, + { + "epoch": 0.6813391233256515, + "grad_norm": 1.476811170578003, + "learning_rate": 1.2892846883079575e-05, + "loss": 2.0701, + "step": 2396 + }, + { + "epoch": 0.6816234885691096, + "grad_norm": 1.593111515045166, + "learning_rate": 1.288135593220339e-05, + "loss": 2.0792, + "step": 2397 + }, + { + "epoch": 0.6819078538125678, + "grad_norm": 1.5982271432876587, + "learning_rate": 1.2869864981327207e-05, + "loss": 1.815, + "step": 2398 + }, + { + "epoch": 0.6821922190560259, + "grad_norm": 1.615859866142273, + "learning_rate": 1.2858374030451022e-05, + "loss": 1.8881, + "step": 2399 + }, + { + "epoch": 0.682476584299484, + "grad_norm": 1.6525177955627441, + "learning_rate": 1.2846883079574836e-05, + "loss": 1.9245, + "step": 2400 + }, + { + "epoch": 0.6827609495429421, + "grad_norm": 1.8938461542129517, + "learning_rate": 1.283539212869865e-05, + "loss": 2.9311, + "step": 2401 + }, + { + "epoch": 0.6830453147864002, + "grad_norm": 1.5215036869049072, + "learning_rate": 1.2823901177822467e-05, + "loss": 2.1695, + "step": 2402 + }, + { + "epoch": 0.6833296800298584, + "grad_norm": 1.4118622541427612, + "learning_rate": 1.2812410226946281e-05, + "loss": 2.3208, + "step": 2403 + }, + { + "epoch": 0.6836140452733165, + "grad_norm": 1.486366868019104, + "learning_rate": 1.2800919276070096e-05, + "loss": 2.3337, + "step": 2404 + }, + { + "epoch": 0.6838984105167746, + "grad_norm": 1.5969897508621216, + "learning_rate": 1.278942832519391e-05, + "loss": 2.1099, + "step": 2405 + }, + { + "epoch": 0.6841827757602327, + "grad_norm": 1.4879251718521118, + "learning_rate": 1.2777937374317727e-05, + "loss": 1.9579, + "step": 2406 + }, + { + "epoch": 0.6844671410036908, + "grad_norm": 1.6099344491958618, + "learning_rate": 1.2766446423441541e-05, + "loss": 2.0261, + "step": 2407 + }, + { + "epoch": 0.684751506247149, + "grad_norm": 1.8013091087341309, + "learning_rate": 1.2754955472565356e-05, + "loss": 1.7967, + "step": 2408 + }, + { + "epoch": 0.6850358714906071, + "grad_norm": 1.6463696956634521, + "learning_rate": 1.274346452168917e-05, + "loss": 2.7894, + "step": 2409 + }, + { + "epoch": 0.6853202367340652, + "grad_norm": 1.5260536670684814, + "learning_rate": 1.2731973570812985e-05, + "loss": 2.5281, + "step": 2410 + }, + { + "epoch": 0.6856046019775233, + "grad_norm": 1.5363825559616089, + "learning_rate": 1.27204826199368e-05, + "loss": 1.9302, + "step": 2411 + }, + { + "epoch": 0.6858889672209814, + "grad_norm": 1.5252505540847778, + "learning_rate": 1.2708991669060615e-05, + "loss": 2.3634, + "step": 2412 + }, + { + "epoch": 0.6861733324644396, + "grad_norm": 1.609386682510376, + "learning_rate": 1.269750071818443e-05, + "loss": 1.9163, + "step": 2413 + }, + { + "epoch": 0.6864576977078977, + "grad_norm": 1.467077612876892, + "learning_rate": 1.2686009767308244e-05, + "loss": 1.8251, + "step": 2414 + }, + { + "epoch": 0.6867420629513558, + "grad_norm": 1.5783828496932983, + "learning_rate": 1.267451881643206e-05, + "loss": 1.7815, + "step": 2415 + }, + { + "epoch": 0.6870264281948139, + "grad_norm": 1.699668049812317, + "learning_rate": 1.2663027865555875e-05, + "loss": 1.9008, + "step": 2416 + }, + { + "epoch": 0.6873107934382721, + "grad_norm": 1.7475955486297607, + "learning_rate": 1.2651536914679691e-05, + "loss": 2.4304, + "step": 2417 + }, + { + "epoch": 0.6875951586817302, + "grad_norm": 1.4119583368301392, + "learning_rate": 1.2640045963803507e-05, + "loss": 2.3749, + "step": 2418 + }, + { + "epoch": 0.6878795239251883, + "grad_norm": 1.54697847366333, + "learning_rate": 1.2628555012927322e-05, + "loss": 2.1809, + "step": 2419 + }, + { + "epoch": 0.6881638891686463, + "grad_norm": 1.5237066745758057, + "learning_rate": 1.2617064062051136e-05, + "loss": 2.2084, + "step": 2420 + }, + { + "epoch": 0.6884482544121044, + "grad_norm": 1.586302399635315, + "learning_rate": 1.2605573111174951e-05, + "loss": 1.9825, + "step": 2421 + }, + { + "epoch": 0.6887326196555627, + "grad_norm": 1.619920253753662, + "learning_rate": 1.2594082160298765e-05, + "loss": 1.9729, + "step": 2422 + }, + { + "epoch": 0.6890169848990207, + "grad_norm": 1.5657825469970703, + "learning_rate": 1.2582591209422582e-05, + "loss": 1.8286, + "step": 2423 + }, + { + "epoch": 0.6893013501424788, + "grad_norm": 1.5684531927108765, + "learning_rate": 1.2571100258546396e-05, + "loss": 1.7035, + "step": 2424 + }, + { + "epoch": 0.6895857153859369, + "grad_norm": 1.7014023065567017, + "learning_rate": 1.255960930767021e-05, + "loss": 2.7972, + "step": 2425 + }, + { + "epoch": 0.689870080629395, + "grad_norm": 1.52834951877594, + "learning_rate": 1.2548118356794025e-05, + "loss": 2.3543, + "step": 2426 + }, + { + "epoch": 0.6901544458728532, + "grad_norm": 1.3747142553329468, + "learning_rate": 1.2536627405917841e-05, + "loss": 2.1109, + "step": 2427 + }, + { + "epoch": 0.6904388111163113, + "grad_norm": 1.534885287284851, + "learning_rate": 1.2525136455041656e-05, + "loss": 2.2004, + "step": 2428 + }, + { + "epoch": 0.6907231763597694, + "grad_norm": 1.6728806495666504, + "learning_rate": 1.251364550416547e-05, + "loss": 2.1012, + "step": 2429 + }, + { + "epoch": 0.6910075416032275, + "grad_norm": 1.5786833763122559, + "learning_rate": 1.2502154553289285e-05, + "loss": 2.0211, + "step": 2430 + }, + { + "epoch": 0.6912919068466856, + "grad_norm": 1.4964499473571777, + "learning_rate": 1.24906636024131e-05, + "loss": 1.7223, + "step": 2431 + }, + { + "epoch": 0.6915762720901438, + "grad_norm": 1.5604283809661865, + "learning_rate": 1.2479172651536915e-05, + "loss": 1.73, + "step": 2432 + }, + { + "epoch": 0.6918606373336019, + "grad_norm": 2.041099786758423, + "learning_rate": 1.246768170066073e-05, + "loss": 2.7647, + "step": 2433 + }, + { + "epoch": 0.69214500257706, + "grad_norm": 1.5619685649871826, + "learning_rate": 1.2456190749784544e-05, + "loss": 2.2241, + "step": 2434 + }, + { + "epoch": 0.6924293678205181, + "grad_norm": 1.4527276754379272, + "learning_rate": 1.2444699798908362e-05, + "loss": 2.2705, + "step": 2435 + }, + { + "epoch": 0.6927137330639762, + "grad_norm": 1.5282810926437378, + "learning_rate": 1.2433208848032177e-05, + "loss": 2.0866, + "step": 2436 + }, + { + "epoch": 0.6929980983074344, + "grad_norm": 1.5058673620224, + "learning_rate": 1.2421717897155991e-05, + "loss": 1.8953, + "step": 2437 + }, + { + "epoch": 0.6932824635508925, + "grad_norm": 1.6031697988510132, + "learning_rate": 1.2410226946279806e-05, + "loss": 1.9201, + "step": 2438 + }, + { + "epoch": 0.6935668287943506, + "grad_norm": 1.544476866722107, + "learning_rate": 1.2398735995403622e-05, + "loss": 1.7643, + "step": 2439 + }, + { + "epoch": 0.6938511940378087, + "grad_norm": 1.6882764101028442, + "learning_rate": 1.2387245044527436e-05, + "loss": 1.7574, + "step": 2440 + }, + { + "epoch": 0.6941355592812668, + "grad_norm": 1.8056814670562744, + "learning_rate": 1.2375754093651251e-05, + "loss": 2.5871, + "step": 2441 + }, + { + "epoch": 0.694419924524725, + "grad_norm": 1.553879976272583, + "learning_rate": 1.2364263142775065e-05, + "loss": 2.5545, + "step": 2442 + }, + { + "epoch": 0.6947042897681831, + "grad_norm": 1.5317288637161255, + "learning_rate": 1.235277219189888e-05, + "loss": 2.2901, + "step": 2443 + }, + { + "epoch": 0.6949886550116412, + "grad_norm": 1.6215896606445312, + "learning_rate": 1.2341281241022696e-05, + "loss": 2.2079, + "step": 2444 + }, + { + "epoch": 0.6952730202550993, + "grad_norm": 1.5378239154815674, + "learning_rate": 1.232979029014651e-05, + "loss": 2.0399, + "step": 2445 + }, + { + "epoch": 0.6955573854985574, + "grad_norm": 1.4927102327346802, + "learning_rate": 1.2318299339270325e-05, + "loss": 1.8721, + "step": 2446 + }, + { + "epoch": 0.6958417507420156, + "grad_norm": 1.48208749294281, + "learning_rate": 1.230680838839414e-05, + "loss": 1.8783, + "step": 2447 + }, + { + "epoch": 0.6961261159854737, + "grad_norm": 1.5241432189941406, + "learning_rate": 1.2295317437517956e-05, + "loss": 1.7324, + "step": 2448 + }, + { + "epoch": 0.6964104812289318, + "grad_norm": 1.8973292112350464, + "learning_rate": 1.228382648664177e-05, + "loss": 2.7123, + "step": 2449 + }, + { + "epoch": 0.6966948464723899, + "grad_norm": 1.539310097694397, + "learning_rate": 1.2272335535765585e-05, + "loss": 2.3683, + "step": 2450 + }, + { + "epoch": 0.696979211715848, + "grad_norm": 1.5558347702026367, + "learning_rate": 1.22608445848894e-05, + "loss": 2.0833, + "step": 2451 + }, + { + "epoch": 0.6972635769593062, + "grad_norm": 1.690713882446289, + "learning_rate": 1.2249353634013214e-05, + "loss": 2.1543, + "step": 2452 + }, + { + "epoch": 0.6975479422027643, + "grad_norm": 1.7084331512451172, + "learning_rate": 1.223786268313703e-05, + "loss": 2.2718, + "step": 2453 + }, + { + "epoch": 0.6978323074462224, + "grad_norm": 1.5845692157745361, + "learning_rate": 1.2226371732260846e-05, + "loss": 1.7258, + "step": 2454 + }, + { + "epoch": 0.6981166726896805, + "grad_norm": 1.7029811143875122, + "learning_rate": 1.221488078138466e-05, + "loss": 1.7476, + "step": 2455 + }, + { + "epoch": 0.6984010379331386, + "grad_norm": 1.6282069683074951, + "learning_rate": 1.2203389830508477e-05, + "loss": 1.7149, + "step": 2456 + }, + { + "epoch": 0.6986854031765968, + "grad_norm": 1.8195720911026, + "learning_rate": 1.2191898879632291e-05, + "loss": 2.6883, + "step": 2457 + }, + { + "epoch": 0.6989697684200549, + "grad_norm": 1.4874529838562012, + "learning_rate": 1.2180407928756106e-05, + "loss": 2.3012, + "step": 2458 + }, + { + "epoch": 0.699254133663513, + "grad_norm": 1.4426288604736328, + "learning_rate": 1.216891697787992e-05, + "loss": 2.2598, + "step": 2459 + }, + { + "epoch": 0.6995384989069711, + "grad_norm": 1.4771324396133423, + "learning_rate": 1.2157426027003737e-05, + "loss": 2.1432, + "step": 2460 + }, + { + "epoch": 0.6998228641504293, + "grad_norm": 1.6694068908691406, + "learning_rate": 1.2145935076127551e-05, + "loss": 2.1884, + "step": 2461 + }, + { + "epoch": 0.7001072293938874, + "grad_norm": 1.5652145147323608, + "learning_rate": 1.2134444125251366e-05, + "loss": 1.8736, + "step": 2462 + }, + { + "epoch": 0.7003915946373455, + "grad_norm": 1.6377602815628052, + "learning_rate": 1.212295317437518e-05, + "loss": 1.7648, + "step": 2463 + }, + { + "epoch": 0.7006759598808036, + "grad_norm": 1.671975016593933, + "learning_rate": 1.2111462223498995e-05, + "loss": 1.8393, + "step": 2464 + }, + { + "epoch": 0.7009603251242617, + "grad_norm": 1.6467212438583374, + "learning_rate": 1.209997127262281e-05, + "loss": 2.8314, + "step": 2465 + }, + { + "epoch": 0.7012446903677199, + "grad_norm": 1.6222443580627441, + "learning_rate": 1.2088480321746625e-05, + "loss": 2.4483, + "step": 2466 + }, + { + "epoch": 0.701529055611178, + "grad_norm": 1.5044670104980469, + "learning_rate": 1.207698937087044e-05, + "loss": 2.4326, + "step": 2467 + }, + { + "epoch": 0.701813420854636, + "grad_norm": 1.5599663257598877, + "learning_rate": 1.2065498419994254e-05, + "loss": 2.0106, + "step": 2468 + }, + { + "epoch": 0.7020977860980941, + "grad_norm": 1.4764515161514282, + "learning_rate": 1.205400746911807e-05, + "loss": 2.0588, + "step": 2469 + }, + { + "epoch": 0.7023821513415522, + "grad_norm": 1.4903563261032104, + "learning_rate": 1.2042516518241885e-05, + "loss": 1.8838, + "step": 2470 + }, + { + "epoch": 0.7026665165850104, + "grad_norm": 1.5704025030136108, + "learning_rate": 1.20310255673657e-05, + "loss": 1.8783, + "step": 2471 + }, + { + "epoch": 0.7029508818284685, + "grad_norm": 1.671426773071289, + "learning_rate": 1.2019534616489517e-05, + "loss": 1.6243, + "step": 2472 + }, + { + "epoch": 0.7032352470719266, + "grad_norm": 1.8858857154846191, + "learning_rate": 1.2008043665613332e-05, + "loss": 2.6409, + "step": 2473 + }, + { + "epoch": 0.7035196123153847, + "grad_norm": 1.4502723217010498, + "learning_rate": 1.1996552714737146e-05, + "loss": 2.4368, + "step": 2474 + }, + { + "epoch": 0.7038039775588428, + "grad_norm": 1.4333308935165405, + "learning_rate": 1.198506176386096e-05, + "loss": 2.2744, + "step": 2475 + }, + { + "epoch": 0.704088342802301, + "grad_norm": 1.4810882806777954, + "learning_rate": 1.1973570812984775e-05, + "loss": 2.105, + "step": 2476 + }, + { + "epoch": 0.7043727080457591, + "grad_norm": 1.6196544170379639, + "learning_rate": 1.1962079862108592e-05, + "loss": 2.1092, + "step": 2477 + }, + { + "epoch": 0.7046570732892172, + "grad_norm": 1.455788254737854, + "learning_rate": 1.1950588911232406e-05, + "loss": 1.75, + "step": 2478 + }, + { + "epoch": 0.7049414385326753, + "grad_norm": 1.5926382541656494, + "learning_rate": 1.193909796035622e-05, + "loss": 1.8092, + "step": 2479 + }, + { + "epoch": 0.7052258037761334, + "grad_norm": 1.7150598764419556, + "learning_rate": 1.1927607009480035e-05, + "loss": 1.7442, + "step": 2480 + }, + { + "epoch": 0.7055101690195916, + "grad_norm": 1.7297258377075195, + "learning_rate": 1.1916116058603851e-05, + "loss": 2.6048, + "step": 2481 + }, + { + "epoch": 0.7057945342630497, + "grad_norm": 1.6914091110229492, + "learning_rate": 1.1904625107727666e-05, + "loss": 2.3913, + "step": 2482 + }, + { + "epoch": 0.7060788995065078, + "grad_norm": 1.6037940979003906, + "learning_rate": 1.189313415685148e-05, + "loss": 2.2043, + "step": 2483 + }, + { + "epoch": 0.7063632647499659, + "grad_norm": 1.5348221063613892, + "learning_rate": 1.1881643205975295e-05, + "loss": 2.2177, + "step": 2484 + }, + { + "epoch": 0.706647629993424, + "grad_norm": 1.5680232048034668, + "learning_rate": 1.187015225509911e-05, + "loss": 2.2766, + "step": 2485 + }, + { + "epoch": 0.7069319952368822, + "grad_norm": 1.5366462469100952, + "learning_rate": 1.1858661304222925e-05, + "loss": 1.98, + "step": 2486 + }, + { + "epoch": 0.7072163604803403, + "grad_norm": 1.4476945400238037, + "learning_rate": 1.184717035334674e-05, + "loss": 1.8068, + "step": 2487 + }, + { + "epoch": 0.7075007257237984, + "grad_norm": 1.6980016231536865, + "learning_rate": 1.1835679402470554e-05, + "loss": 1.6514, + "step": 2488 + }, + { + "epoch": 0.7077850909672565, + "grad_norm": 1.753934383392334, + "learning_rate": 1.1824188451594369e-05, + "loss": 2.7271, + "step": 2489 + }, + { + "epoch": 0.7080694562107146, + "grad_norm": 1.530096411705017, + "learning_rate": 1.1812697500718185e-05, + "loss": 2.0594, + "step": 2490 + }, + { + "epoch": 0.7083538214541728, + "grad_norm": 1.539100170135498, + "learning_rate": 1.1801206549842001e-05, + "loss": 2.0438, + "step": 2491 + }, + { + "epoch": 0.7086381866976309, + "grad_norm": 1.5810004472732544, + "learning_rate": 1.1789715598965816e-05, + "loss": 2.2391, + "step": 2492 + }, + { + "epoch": 0.708922551941089, + "grad_norm": 1.5543794631958008, + "learning_rate": 1.1778224648089632e-05, + "loss": 2.0453, + "step": 2493 + }, + { + "epoch": 0.7092069171845471, + "grad_norm": 1.462825894355774, + "learning_rate": 1.1766733697213446e-05, + "loss": 1.8446, + "step": 2494 + }, + { + "epoch": 0.7094912824280052, + "grad_norm": 1.5893888473510742, + "learning_rate": 1.1755242746337261e-05, + "loss": 1.6117, + "step": 2495 + }, + { + "epoch": 0.7097756476714634, + "grad_norm": 1.5945665836334229, + "learning_rate": 1.1743751795461075e-05, + "loss": 1.6783, + "step": 2496 + }, + { + "epoch": 0.7100600129149215, + "grad_norm": 1.7469823360443115, + "learning_rate": 1.173226084458489e-05, + "loss": 2.6954, + "step": 2497 + }, + { + "epoch": 0.7103443781583796, + "grad_norm": 1.4920471906661987, + "learning_rate": 1.1720769893708706e-05, + "loss": 2.2442, + "step": 2498 + }, + { + "epoch": 0.7106287434018377, + "grad_norm": 1.5059243440628052, + "learning_rate": 1.170927894283252e-05, + "loss": 2.3696, + "step": 2499 + }, + { + "epoch": 0.7109131086452958, + "grad_norm": 1.5132187604904175, + "learning_rate": 1.1697787991956335e-05, + "loss": 2.0736, + "step": 2500 + }, + { + "epoch": 0.711197473888754, + "grad_norm": 1.6220301389694214, + "learning_rate": 1.168629704108015e-05, + "loss": 2.0143, + "step": 2501 + }, + { + "epoch": 0.7114818391322121, + "grad_norm": 1.5194025039672852, + "learning_rate": 1.1674806090203966e-05, + "loss": 1.9679, + "step": 2502 + }, + { + "epoch": 0.7117662043756702, + "grad_norm": 1.4100217819213867, + "learning_rate": 1.166331513932778e-05, + "loss": 1.7964, + "step": 2503 + }, + { + "epoch": 0.7120505696191283, + "grad_norm": 1.6122227907180786, + "learning_rate": 1.1651824188451595e-05, + "loss": 1.7849, + "step": 2504 + }, + { + "epoch": 0.7123349348625864, + "grad_norm": 1.7167197465896606, + "learning_rate": 1.164033323757541e-05, + "loss": 2.6167, + "step": 2505 + }, + { + "epoch": 0.7126193001060446, + "grad_norm": 1.4899656772613525, + "learning_rate": 1.1628842286699224e-05, + "loss": 2.3321, + "step": 2506 + }, + { + "epoch": 0.7129036653495027, + "grad_norm": 1.5323848724365234, + "learning_rate": 1.161735133582304e-05, + "loss": 2.3636, + "step": 2507 + }, + { + "epoch": 0.7131880305929608, + "grad_norm": 1.7162936925888062, + "learning_rate": 1.1605860384946855e-05, + "loss": 2.2844, + "step": 2508 + }, + { + "epoch": 0.7134723958364189, + "grad_norm": 1.5394136905670166, + "learning_rate": 1.159436943407067e-05, + "loss": 2.2002, + "step": 2509 + }, + { + "epoch": 0.7137567610798771, + "grad_norm": 1.561732292175293, + "learning_rate": 1.1582878483194487e-05, + "loss": 1.8321, + "step": 2510 + }, + { + "epoch": 0.7140411263233352, + "grad_norm": 1.4912663698196411, + "learning_rate": 1.1571387532318301e-05, + "loss": 1.6438, + "step": 2511 + }, + { + "epoch": 0.7143254915667933, + "grad_norm": 1.7454664707183838, + "learning_rate": 1.1559896581442116e-05, + "loss": 2.0865, + "step": 2512 + }, + { + "epoch": 0.7146098568102514, + "grad_norm": 1.7953159809112549, + "learning_rate": 1.154840563056593e-05, + "loss": 2.5611, + "step": 2513 + }, + { + "epoch": 0.7148942220537094, + "grad_norm": 1.570483922958374, + "learning_rate": 1.1536914679689747e-05, + "loss": 2.1698, + "step": 2514 + }, + { + "epoch": 0.7151785872971677, + "grad_norm": 1.4322417974472046, + "learning_rate": 1.1525423728813561e-05, + "loss": 2.1472, + "step": 2515 + }, + { + "epoch": 0.7154629525406258, + "grad_norm": 1.5140742063522339, + "learning_rate": 1.1513932777937376e-05, + "loss": 2.2549, + "step": 2516 + }, + { + "epoch": 0.7157473177840838, + "grad_norm": 1.485280990600586, + "learning_rate": 1.150244182706119e-05, + "loss": 2.0911, + "step": 2517 + }, + { + "epoch": 0.7160316830275419, + "grad_norm": 1.444021224975586, + "learning_rate": 1.1490950876185005e-05, + "loss": 1.8068, + "step": 2518 + }, + { + "epoch": 0.716316048271, + "grad_norm": 1.5001165866851807, + "learning_rate": 1.147945992530882e-05, + "loss": 1.7745, + "step": 2519 + }, + { + "epoch": 0.7166004135144582, + "grad_norm": 1.7177432775497437, + "learning_rate": 1.1467968974432635e-05, + "loss": 1.7376, + "step": 2520 + }, + { + "epoch": 0.7168847787579163, + "grad_norm": 1.9216365814208984, + "learning_rate": 1.145647802355645e-05, + "loss": 2.783, + "step": 2521 + }, + { + "epoch": 0.7171691440013744, + "grad_norm": 1.6360090970993042, + "learning_rate": 1.1444987072680264e-05, + "loss": 2.4359, + "step": 2522 + }, + { + "epoch": 0.7174535092448325, + "grad_norm": 1.5208964347839355, + "learning_rate": 1.143349612180408e-05, + "loss": 2.3083, + "step": 2523 + }, + { + "epoch": 0.7177378744882906, + "grad_norm": 1.394752860069275, + "learning_rate": 1.1422005170927895e-05, + "loss": 2.1679, + "step": 2524 + }, + { + "epoch": 0.7180222397317488, + "grad_norm": 1.508109211921692, + "learning_rate": 1.141051422005171e-05, + "loss": 2.1873, + "step": 2525 + }, + { + "epoch": 0.7183066049752069, + "grad_norm": 1.7108025550842285, + "learning_rate": 1.1399023269175524e-05, + "loss": 1.8141, + "step": 2526 + }, + { + "epoch": 0.718590970218665, + "grad_norm": 1.727073311805725, + "learning_rate": 1.1387532318299338e-05, + "loss": 1.7839, + "step": 2527 + }, + { + "epoch": 0.7188753354621231, + "grad_norm": 1.7177026271820068, + "learning_rate": 1.1376041367423156e-05, + "loss": 1.8421, + "step": 2528 + }, + { + "epoch": 0.7191597007055812, + "grad_norm": 1.9634953737258911, + "learning_rate": 1.136455041654697e-05, + "loss": 2.5981, + "step": 2529 + }, + { + "epoch": 0.7194440659490394, + "grad_norm": 1.5275763273239136, + "learning_rate": 1.1353059465670785e-05, + "loss": 2.3648, + "step": 2530 + }, + { + "epoch": 0.7197284311924975, + "grad_norm": 1.5566397905349731, + "learning_rate": 1.1341568514794601e-05, + "loss": 2.3055, + "step": 2531 + }, + { + "epoch": 0.7200127964359556, + "grad_norm": 1.6546510457992554, + "learning_rate": 1.1330077563918416e-05, + "loss": 2.1899, + "step": 2532 + }, + { + "epoch": 0.7202971616794137, + "grad_norm": 1.5946438312530518, + "learning_rate": 1.131858661304223e-05, + "loss": 2.1639, + "step": 2533 + }, + { + "epoch": 0.7205815269228718, + "grad_norm": 1.4848626852035522, + "learning_rate": 1.1307095662166045e-05, + "loss": 2.0955, + "step": 2534 + }, + { + "epoch": 0.72086589216633, + "grad_norm": 1.4589171409606934, + "learning_rate": 1.1295604711289861e-05, + "loss": 1.7393, + "step": 2535 + }, + { + "epoch": 0.7211502574097881, + "grad_norm": 1.6969494819641113, + "learning_rate": 1.1284113760413676e-05, + "loss": 1.3365, + "step": 2536 + }, + { + "epoch": 0.7214346226532462, + "grad_norm": 1.7486660480499268, + "learning_rate": 1.127262280953749e-05, + "loss": 2.695, + "step": 2537 + }, + { + "epoch": 0.7217189878967043, + "grad_norm": 1.4772002696990967, + "learning_rate": 1.1261131858661305e-05, + "loss": 2.4941, + "step": 2538 + }, + { + "epoch": 0.7220033531401624, + "grad_norm": 1.4565849304199219, + "learning_rate": 1.124964090778512e-05, + "loss": 2.2789, + "step": 2539 + }, + { + "epoch": 0.7222877183836206, + "grad_norm": 1.3858977556228638, + "learning_rate": 1.1238149956908935e-05, + "loss": 2.2516, + "step": 2540 + }, + { + "epoch": 0.7225720836270787, + "grad_norm": 1.4847400188446045, + "learning_rate": 1.122665900603275e-05, + "loss": 2.1125, + "step": 2541 + }, + { + "epoch": 0.7228564488705368, + "grad_norm": 1.653690218925476, + "learning_rate": 1.1215168055156564e-05, + "loss": 1.8551, + "step": 2542 + }, + { + "epoch": 0.7231408141139949, + "grad_norm": 1.6189554929733276, + "learning_rate": 1.1203677104280379e-05, + "loss": 1.6745, + "step": 2543 + }, + { + "epoch": 0.723425179357453, + "grad_norm": 1.6225018501281738, + "learning_rate": 1.1192186153404195e-05, + "loss": 1.881, + "step": 2544 + }, + { + "epoch": 0.7237095446009112, + "grad_norm": 1.7607388496398926, + "learning_rate": 1.118069520252801e-05, + "loss": 2.5443, + "step": 2545 + }, + { + "epoch": 0.7239939098443693, + "grad_norm": 1.4778530597686768, + "learning_rate": 1.1169204251651826e-05, + "loss": 2.347, + "step": 2546 + }, + { + "epoch": 0.7242782750878274, + "grad_norm": 1.4404771327972412, + "learning_rate": 1.115771330077564e-05, + "loss": 2.4381, + "step": 2547 + }, + { + "epoch": 0.7245626403312855, + "grad_norm": 1.7989895343780518, + "learning_rate": 1.1146222349899456e-05, + "loss": 2.2025, + "step": 2548 + }, + { + "epoch": 0.7248470055747436, + "grad_norm": 1.5871349573135376, + "learning_rate": 1.1134731399023271e-05, + "loss": 2.1601, + "step": 2549 + }, + { + "epoch": 0.7251313708182018, + "grad_norm": 1.521226167678833, + "learning_rate": 1.1123240448147085e-05, + "loss": 1.7278, + "step": 2550 + }, + { + "epoch": 0.7254157360616599, + "grad_norm": 1.6469875574111938, + "learning_rate": 1.11117494972709e-05, + "loss": 1.6268, + "step": 2551 + }, + { + "epoch": 0.725700101305118, + "grad_norm": 1.732232689857483, + "learning_rate": 1.1100258546394716e-05, + "loss": 1.616, + "step": 2552 + }, + { + "epoch": 0.7259844665485761, + "grad_norm": 1.7754976749420166, + "learning_rate": 1.108876759551853e-05, + "loss": 2.5168, + "step": 2553 + }, + { + "epoch": 0.7262688317920342, + "grad_norm": 1.471461296081543, + "learning_rate": 1.1077276644642345e-05, + "loss": 2.3877, + "step": 2554 + }, + { + "epoch": 0.7265531970354924, + "grad_norm": 1.427992820739746, + "learning_rate": 1.106578569376616e-05, + "loss": 2.1713, + "step": 2555 + }, + { + "epoch": 0.7268375622789505, + "grad_norm": 1.6102306842803955, + "learning_rate": 1.1054294742889976e-05, + "loss": 2.1996, + "step": 2556 + }, + { + "epoch": 0.7271219275224086, + "grad_norm": 1.6785222291946411, + "learning_rate": 1.104280379201379e-05, + "loss": 2.0858, + "step": 2557 + }, + { + "epoch": 0.7274062927658667, + "grad_norm": 1.556443214416504, + "learning_rate": 1.1031312841137605e-05, + "loss": 1.9964, + "step": 2558 + }, + { + "epoch": 0.7276906580093249, + "grad_norm": 1.5980945825576782, + "learning_rate": 1.101982189026142e-05, + "loss": 1.9399, + "step": 2559 + }, + { + "epoch": 0.727975023252783, + "grad_norm": 1.6788307428359985, + "learning_rate": 1.1008330939385234e-05, + "loss": 1.7525, + "step": 2560 + }, + { + "epoch": 0.7282593884962411, + "grad_norm": 1.6237465143203735, + "learning_rate": 1.099683998850905e-05, + "loss": 2.7299, + "step": 2561 + }, + { + "epoch": 0.7285437537396992, + "grad_norm": 1.56391179561615, + "learning_rate": 1.0985349037632864e-05, + "loss": 2.5252, + "step": 2562 + }, + { + "epoch": 0.7288281189831572, + "grad_norm": 1.4667418003082275, + "learning_rate": 1.0973858086756679e-05, + "loss": 2.1989, + "step": 2563 + }, + { + "epoch": 0.7291124842266155, + "grad_norm": 1.421413540840149, + "learning_rate": 1.0962367135880493e-05, + "loss": 2.199, + "step": 2564 + }, + { + "epoch": 0.7293968494700735, + "grad_norm": 1.588476538658142, + "learning_rate": 1.0950876185004311e-05, + "loss": 2.1981, + "step": 2565 + }, + { + "epoch": 0.7296812147135316, + "grad_norm": 1.5892815589904785, + "learning_rate": 1.0939385234128126e-05, + "loss": 1.884, + "step": 2566 + }, + { + "epoch": 0.7299655799569897, + "grad_norm": 1.5493581295013428, + "learning_rate": 1.092789428325194e-05, + "loss": 1.8578, + "step": 2567 + }, + { + "epoch": 0.7302499452004478, + "grad_norm": 1.7328555583953857, + "learning_rate": 1.0916403332375755e-05, + "loss": 1.6838, + "step": 2568 + }, + { + "epoch": 0.730534310443906, + "grad_norm": 1.8126568794250488, + "learning_rate": 1.0904912381499571e-05, + "loss": 2.7335, + "step": 2569 + }, + { + "epoch": 0.7308186756873641, + "grad_norm": 1.505807638168335, + "learning_rate": 1.0893421430623386e-05, + "loss": 2.3911, + "step": 2570 + }, + { + "epoch": 0.7311030409308222, + "grad_norm": 1.6133103370666504, + "learning_rate": 1.08819304797472e-05, + "loss": 2.2435, + "step": 2571 + }, + { + "epoch": 0.7313874061742803, + "grad_norm": 1.4566494226455688, + "learning_rate": 1.0870439528871015e-05, + "loss": 2.0838, + "step": 2572 + }, + { + "epoch": 0.7316717714177384, + "grad_norm": 1.5375003814697266, + "learning_rate": 1.085894857799483e-05, + "loss": 1.9548, + "step": 2573 + }, + { + "epoch": 0.7319561366611966, + "grad_norm": 1.5447187423706055, + "learning_rate": 1.0847457627118645e-05, + "loss": 2.0166, + "step": 2574 + }, + { + "epoch": 0.7322405019046547, + "grad_norm": 1.5107059478759766, + "learning_rate": 1.083596667624246e-05, + "loss": 1.6546, + "step": 2575 + }, + { + "epoch": 0.7325248671481128, + "grad_norm": 1.5727344751358032, + "learning_rate": 1.0824475725366274e-05, + "loss": 1.6626, + "step": 2576 + }, + { + "epoch": 0.7328092323915709, + "grad_norm": 1.8032009601593018, + "learning_rate": 1.0812984774490089e-05, + "loss": 2.8995, + "step": 2577 + }, + { + "epoch": 0.733093597635029, + "grad_norm": 1.4891507625579834, + "learning_rate": 1.0801493823613905e-05, + "loss": 2.3765, + "step": 2578 + }, + { + "epoch": 0.7333779628784872, + "grad_norm": 1.535091519355774, + "learning_rate": 1.079000287273772e-05, + "loss": 2.2866, + "step": 2579 + }, + { + "epoch": 0.7336623281219453, + "grad_norm": 1.510955572128296, + "learning_rate": 1.0778511921861534e-05, + "loss": 2.2064, + "step": 2580 + }, + { + "epoch": 0.7339466933654034, + "grad_norm": 1.803688406944275, + "learning_rate": 1.0767020970985348e-05, + "loss": 2.2372, + "step": 2581 + }, + { + "epoch": 0.7342310586088615, + "grad_norm": 1.6097159385681152, + "learning_rate": 1.0755530020109165e-05, + "loss": 1.8109, + "step": 2582 + }, + { + "epoch": 0.7345154238523196, + "grad_norm": 1.478907585144043, + "learning_rate": 1.074403906923298e-05, + "loss": 1.6932, + "step": 2583 + }, + { + "epoch": 0.7347997890957778, + "grad_norm": 1.6126059293746948, + "learning_rate": 1.0732548118356795e-05, + "loss": 1.6228, + "step": 2584 + }, + { + "epoch": 0.7350841543392359, + "grad_norm": 1.7208243608474731, + "learning_rate": 1.0721057167480611e-05, + "loss": 2.7996, + "step": 2585 + }, + { + "epoch": 0.735368519582694, + "grad_norm": 1.4276622533798218, + "learning_rate": 1.0709566216604426e-05, + "loss": 2.3561, + "step": 2586 + }, + { + "epoch": 0.7356528848261521, + "grad_norm": 1.5668575763702393, + "learning_rate": 1.069807526572824e-05, + "loss": 2.2804, + "step": 2587 + }, + { + "epoch": 0.7359372500696102, + "grad_norm": 1.4144526720046997, + "learning_rate": 1.0686584314852055e-05, + "loss": 2.1127, + "step": 2588 + }, + { + "epoch": 0.7362216153130684, + "grad_norm": 1.6393625736236572, + "learning_rate": 1.067509336397587e-05, + "loss": 2.2331, + "step": 2589 + }, + { + "epoch": 0.7365059805565265, + "grad_norm": 1.552177906036377, + "learning_rate": 1.0663602413099686e-05, + "loss": 1.876, + "step": 2590 + }, + { + "epoch": 0.7367903457999846, + "grad_norm": 1.5180331468582153, + "learning_rate": 1.06521114622235e-05, + "loss": 1.9381, + "step": 2591 + }, + { + "epoch": 0.7370747110434427, + "grad_norm": 1.8120715618133545, + "learning_rate": 1.0640620511347315e-05, + "loss": 1.7459, + "step": 2592 + }, + { + "epoch": 0.7373590762869008, + "grad_norm": 1.9727323055267334, + "learning_rate": 1.0629129560471129e-05, + "loss": 2.7591, + "step": 2593 + }, + { + "epoch": 0.737643441530359, + "grad_norm": 1.6499255895614624, + "learning_rate": 1.0617638609594945e-05, + "loss": 2.582, + "step": 2594 + }, + { + "epoch": 0.7379278067738171, + "grad_norm": 1.4931328296661377, + "learning_rate": 1.060614765871876e-05, + "loss": 2.2723, + "step": 2595 + }, + { + "epoch": 0.7382121720172752, + "grad_norm": 1.4535672664642334, + "learning_rate": 1.0594656707842574e-05, + "loss": 2.1424, + "step": 2596 + }, + { + "epoch": 0.7384965372607333, + "grad_norm": 1.6324405670166016, + "learning_rate": 1.0583165756966389e-05, + "loss": 2.2187, + "step": 2597 + }, + { + "epoch": 0.7387809025041914, + "grad_norm": 1.4527398347854614, + "learning_rate": 1.0571674806090203e-05, + "loss": 1.8947, + "step": 2598 + }, + { + "epoch": 0.7390652677476496, + "grad_norm": 1.51231050491333, + "learning_rate": 1.056018385521402e-05, + "loss": 1.6234, + "step": 2599 + }, + { + "epoch": 0.7393496329911077, + "grad_norm": 1.6000384092330933, + "learning_rate": 1.0548692904337834e-05, + "loss": 1.8085, + "step": 2600 + }, + { + "epoch": 0.7396339982345658, + "grad_norm": 1.772000789642334, + "learning_rate": 1.0537201953461649e-05, + "loss": 2.6206, + "step": 2601 + }, + { + "epoch": 0.7399183634780239, + "grad_norm": 1.5927568674087524, + "learning_rate": 1.0525711002585466e-05, + "loss": 2.4873, + "step": 2602 + }, + { + "epoch": 0.740202728721482, + "grad_norm": 1.451069951057434, + "learning_rate": 1.0514220051709281e-05, + "loss": 2.0906, + "step": 2603 + }, + { + "epoch": 0.7404870939649402, + "grad_norm": 1.6845619678497314, + "learning_rate": 1.0502729100833095e-05, + "loss": 2.1397, + "step": 2604 + }, + { + "epoch": 0.7407714592083983, + "grad_norm": 1.546721339225769, + "learning_rate": 1.049123814995691e-05, + "loss": 1.8256, + "step": 2605 + }, + { + "epoch": 0.7410558244518564, + "grad_norm": 1.4821118116378784, + "learning_rate": 1.0479747199080726e-05, + "loss": 1.842, + "step": 2606 + }, + { + "epoch": 0.7413401896953145, + "grad_norm": 1.5237010717391968, + "learning_rate": 1.046825624820454e-05, + "loss": 1.6978, + "step": 2607 + }, + { + "epoch": 0.7416245549387727, + "grad_norm": 1.6505897045135498, + "learning_rate": 1.0456765297328355e-05, + "loss": 1.6463, + "step": 2608 + }, + { + "epoch": 0.7419089201822308, + "grad_norm": 1.657488226890564, + "learning_rate": 1.044527434645217e-05, + "loss": 2.9463, + "step": 2609 + }, + { + "epoch": 0.7421932854256889, + "grad_norm": 1.5707652568817139, + "learning_rate": 1.0433783395575984e-05, + "loss": 2.5416, + "step": 2610 + }, + { + "epoch": 0.742477650669147, + "grad_norm": 1.4029492139816284, + "learning_rate": 1.04222924446998e-05, + "loss": 2.2495, + "step": 2611 + }, + { + "epoch": 0.742762015912605, + "grad_norm": 1.707994818687439, + "learning_rate": 1.0410801493823615e-05, + "loss": 2.0821, + "step": 2612 + }, + { + "epoch": 0.7430463811560633, + "grad_norm": 1.5762234926223755, + "learning_rate": 1.039931054294743e-05, + "loss": 2.0927, + "step": 2613 + }, + { + "epoch": 0.7433307463995213, + "grad_norm": 1.567622423171997, + "learning_rate": 1.0387819592071244e-05, + "loss": 1.8988, + "step": 2614 + }, + { + "epoch": 0.7436151116429794, + "grad_norm": 1.5860332250595093, + "learning_rate": 1.037632864119506e-05, + "loss": 1.9538, + "step": 2615 + }, + { + "epoch": 0.7438994768864375, + "grad_norm": 1.6096969842910767, + "learning_rate": 1.0364837690318874e-05, + "loss": 1.8601, + "step": 2616 + }, + { + "epoch": 0.7441838421298956, + "grad_norm": 1.703778862953186, + "learning_rate": 1.0353346739442689e-05, + "loss": 2.7728, + "step": 2617 + }, + { + "epoch": 0.7444682073733538, + "grad_norm": 1.4619109630584717, + "learning_rate": 1.0341855788566503e-05, + "loss": 2.373, + "step": 2618 + }, + { + "epoch": 0.7447525726168119, + "grad_norm": 1.4133814573287964, + "learning_rate": 1.0330364837690318e-05, + "loss": 2.2617, + "step": 2619 + }, + { + "epoch": 0.74503693786027, + "grad_norm": 1.5261253118515015, + "learning_rate": 1.0318873886814136e-05, + "loss": 2.1614, + "step": 2620 + }, + { + "epoch": 0.7453213031037281, + "grad_norm": 1.649577021598816, + "learning_rate": 1.030738293593795e-05, + "loss": 2.1312, + "step": 2621 + }, + { + "epoch": 0.7456056683471862, + "grad_norm": 1.4830260276794434, + "learning_rate": 1.0295891985061765e-05, + "loss": 1.6617, + "step": 2622 + }, + { + "epoch": 0.7458900335906444, + "grad_norm": 1.6154065132141113, + "learning_rate": 1.0284401034185581e-05, + "loss": 1.8633, + "step": 2623 + }, + { + "epoch": 0.7461743988341025, + "grad_norm": 1.634304404258728, + "learning_rate": 1.0272910083309395e-05, + "loss": 1.88, + "step": 2624 + }, + { + "epoch": 0.7464587640775606, + "grad_norm": 1.7923297882080078, + "learning_rate": 1.026141913243321e-05, + "loss": 2.8546, + "step": 2625 + }, + { + "epoch": 0.7467431293210187, + "grad_norm": 1.4359058141708374, + "learning_rate": 1.0249928181557024e-05, + "loss": 2.2801, + "step": 2626 + }, + { + "epoch": 0.7470274945644768, + "grad_norm": 1.4793764352798462, + "learning_rate": 1.023843723068084e-05, + "loss": 2.3711, + "step": 2627 + }, + { + "epoch": 0.747311859807935, + "grad_norm": 1.487067699432373, + "learning_rate": 1.0226946279804655e-05, + "loss": 2.1701, + "step": 2628 + }, + { + "epoch": 0.7475962250513931, + "grad_norm": 1.7713996171951294, + "learning_rate": 1.021545532892847e-05, + "loss": 2.283, + "step": 2629 + }, + { + "epoch": 0.7478805902948512, + "grad_norm": 1.474770188331604, + "learning_rate": 1.0203964378052284e-05, + "loss": 1.8782, + "step": 2630 + }, + { + "epoch": 0.7481649555383093, + "grad_norm": 1.6031092405319214, + "learning_rate": 1.0192473427176099e-05, + "loss": 1.89, + "step": 2631 + }, + { + "epoch": 0.7484493207817674, + "grad_norm": 1.7248785495758057, + "learning_rate": 1.0180982476299915e-05, + "loss": 1.8929, + "step": 2632 + }, + { + "epoch": 0.7487336860252256, + "grad_norm": 1.6073254346847534, + "learning_rate": 1.016949152542373e-05, + "loss": 2.6679, + "step": 2633 + }, + { + "epoch": 0.7490180512686837, + "grad_norm": 1.4250975847244263, + "learning_rate": 1.0158000574547544e-05, + "loss": 2.393, + "step": 2634 + }, + { + "epoch": 0.7493024165121418, + "grad_norm": 1.3941985368728638, + "learning_rate": 1.0146509623671358e-05, + "loss": 2.203, + "step": 2635 + }, + { + "epoch": 0.7495867817555999, + "grad_norm": 1.540351390838623, + "learning_rate": 1.0135018672795175e-05, + "loss": 2.0514, + "step": 2636 + }, + { + "epoch": 0.749871146999058, + "grad_norm": 1.4758607149124146, + "learning_rate": 1.0123527721918989e-05, + "loss": 2.1415, + "step": 2637 + }, + { + "epoch": 0.7501555122425162, + "grad_norm": 1.5170818567276, + "learning_rate": 1.0112036771042804e-05, + "loss": 1.7861, + "step": 2638 + }, + { + "epoch": 0.7504398774859743, + "grad_norm": 1.5630824565887451, + "learning_rate": 1.0100545820166621e-05, + "loss": 1.7773, + "step": 2639 + }, + { + "epoch": 0.7507242427294324, + "grad_norm": 1.5273665189743042, + "learning_rate": 1.0089054869290436e-05, + "loss": 1.7984, + "step": 2640 + }, + { + "epoch": 0.7510086079728905, + "grad_norm": 1.8208073377609253, + "learning_rate": 1.007756391841425e-05, + "loss": 2.9004, + "step": 2641 + }, + { + "epoch": 0.7512929732163486, + "grad_norm": 1.5931475162506104, + "learning_rate": 1.0066072967538065e-05, + "loss": 2.4002, + "step": 2642 + }, + { + "epoch": 0.7515773384598068, + "grad_norm": 1.410474181175232, + "learning_rate": 1.005458201666188e-05, + "loss": 2.0302, + "step": 2643 + }, + { + "epoch": 0.7518617037032649, + "grad_norm": 1.5305248498916626, + "learning_rate": 1.0043091065785696e-05, + "loss": 2.0728, + "step": 2644 + }, + { + "epoch": 0.752146068946723, + "grad_norm": 1.7503458261489868, + "learning_rate": 1.003160011490951e-05, + "loss": 2.1175, + "step": 2645 + }, + { + "epoch": 0.7524304341901811, + "grad_norm": 1.5125000476837158, + "learning_rate": 1.0020109164033325e-05, + "loss": 1.8522, + "step": 2646 + }, + { + "epoch": 0.7527147994336392, + "grad_norm": 1.5034544467926025, + "learning_rate": 1.0008618213157139e-05, + "loss": 1.7503, + "step": 2647 + }, + { + "epoch": 0.7529991646770974, + "grad_norm": 1.5701968669891357, + "learning_rate": 9.997127262280955e-06, + "loss": 1.7253, + "step": 2648 + }, + { + "epoch": 0.7532835299205555, + "grad_norm": 1.7175260782241821, + "learning_rate": 9.98563631140477e-06, + "loss": 2.6342, + "step": 2649 + }, + { + "epoch": 0.7535678951640136, + "grad_norm": 1.4930824041366577, + "learning_rate": 9.974145360528584e-06, + "loss": 2.4192, + "step": 2650 + }, + { + "epoch": 0.7538522604074717, + "grad_norm": 1.5410807132720947, + "learning_rate": 9.962654409652399e-06, + "loss": 2.3138, + "step": 2651 + }, + { + "epoch": 0.7541366256509299, + "grad_norm": 1.4877644777297974, + "learning_rate": 9.951163458776213e-06, + "loss": 2.0459, + "step": 2652 + }, + { + "epoch": 0.754420990894388, + "grad_norm": 1.711830973625183, + "learning_rate": 9.93967250790003e-06, + "loss": 2.1824, + "step": 2653 + }, + { + "epoch": 0.7547053561378461, + "grad_norm": 1.4872764348983765, + "learning_rate": 9.928181557023846e-06, + "loss": 1.7254, + "step": 2654 + }, + { + "epoch": 0.7549897213813042, + "grad_norm": 1.5591570138931274, + "learning_rate": 9.91669060614766e-06, + "loss": 1.9, + "step": 2655 + }, + { + "epoch": 0.7552740866247623, + "grad_norm": 1.5367933511734009, + "learning_rate": 9.905199655271475e-06, + "loss": 1.6445, + "step": 2656 + }, + { + "epoch": 0.7555584518682205, + "grad_norm": 1.752350091934204, + "learning_rate": 9.89370870439529e-06, + "loss": 2.8917, + "step": 2657 + }, + { + "epoch": 0.7558428171116786, + "grad_norm": 1.5216292142868042, + "learning_rate": 9.882217753519104e-06, + "loss": 2.4243, + "step": 2658 + }, + { + "epoch": 0.7561271823551367, + "grad_norm": 1.5508999824523926, + "learning_rate": 9.87072680264292e-06, + "loss": 2.1663, + "step": 2659 + }, + { + "epoch": 0.7564115475985947, + "grad_norm": 1.4459530115127563, + "learning_rate": 9.859235851766734e-06, + "loss": 2.1307, + "step": 2660 + }, + { + "epoch": 0.7566959128420528, + "grad_norm": 1.627680778503418, + "learning_rate": 9.847744900890549e-06, + "loss": 2.0607, + "step": 2661 + }, + { + "epoch": 0.756980278085511, + "grad_norm": 1.4881747961044312, + "learning_rate": 9.836253950014365e-06, + "loss": 1.9483, + "step": 2662 + }, + { + "epoch": 0.7572646433289691, + "grad_norm": 1.6278051137924194, + "learning_rate": 9.82476299913818e-06, + "loss": 1.794, + "step": 2663 + }, + { + "epoch": 0.7575490085724272, + "grad_norm": 1.6977025270462036, + "learning_rate": 9.813272048261994e-06, + "loss": 1.6891, + "step": 2664 + }, + { + "epoch": 0.7578333738158853, + "grad_norm": 1.795324444770813, + "learning_rate": 9.80178109738581e-06, + "loss": 2.4871, + "step": 2665 + }, + { + "epoch": 0.7581177390593434, + "grad_norm": 1.48861563205719, + "learning_rate": 9.790290146509625e-06, + "loss": 2.4192, + "step": 2666 + }, + { + "epoch": 0.7584021043028016, + "grad_norm": 1.4271656274795532, + "learning_rate": 9.77879919563344e-06, + "loss": 2.1096, + "step": 2667 + }, + { + "epoch": 0.7586864695462597, + "grad_norm": 1.4701792001724243, + "learning_rate": 9.767308244757254e-06, + "loss": 2.0263, + "step": 2668 + }, + { + "epoch": 0.7589708347897178, + "grad_norm": 1.5513591766357422, + "learning_rate": 9.75581729388107e-06, + "loss": 2.2207, + "step": 2669 + }, + { + "epoch": 0.7592552000331759, + "grad_norm": 1.4835832118988037, + "learning_rate": 9.744326343004884e-06, + "loss": 1.9584, + "step": 2670 + }, + { + "epoch": 0.759539565276634, + "grad_norm": 1.5424950122833252, + "learning_rate": 9.7328353921287e-06, + "loss": 2.0076, + "step": 2671 + }, + { + "epoch": 0.7598239305200922, + "grad_norm": 1.5847523212432861, + "learning_rate": 9.721344441252515e-06, + "loss": 1.712, + "step": 2672 + }, + { + "epoch": 0.7601082957635503, + "grad_norm": 1.8330718278884888, + "learning_rate": 9.70985349037633e-06, + "loss": 2.7035, + "step": 2673 + }, + { + "epoch": 0.7603926610070084, + "grad_norm": 1.533942699432373, + "learning_rate": 9.698362539500144e-06, + "loss": 2.1846, + "step": 2674 + }, + { + "epoch": 0.7606770262504665, + "grad_norm": 1.5470412969589233, + "learning_rate": 9.68687158862396e-06, + "loss": 2.2428, + "step": 2675 + }, + { + "epoch": 0.7609613914939246, + "grad_norm": 1.4431195259094238, + "learning_rate": 9.675380637747775e-06, + "loss": 2.2741, + "step": 2676 + }, + { + "epoch": 0.7612457567373828, + "grad_norm": 1.7669216394424438, + "learning_rate": 9.66388968687159e-06, + "loss": 2.05, + "step": 2677 + }, + { + "epoch": 0.7615301219808409, + "grad_norm": 1.4941767454147339, + "learning_rate": 9.652398735995404e-06, + "loss": 2.0978, + "step": 2678 + }, + { + "epoch": 0.761814487224299, + "grad_norm": 1.4513996839523315, + "learning_rate": 9.640907785119218e-06, + "loss": 1.7291, + "step": 2679 + }, + { + "epoch": 0.7620988524677571, + "grad_norm": 1.507140874862671, + "learning_rate": 9.629416834243034e-06, + "loss": 1.7039, + "step": 2680 + }, + { + "epoch": 0.7623832177112152, + "grad_norm": 1.6555514335632324, + "learning_rate": 9.61792588336685e-06, + "loss": 2.5856, + "step": 2681 + }, + { + "epoch": 0.7626675829546734, + "grad_norm": 1.4625468254089355, + "learning_rate": 9.606434932490665e-06, + "loss": 2.4481, + "step": 2682 + }, + { + "epoch": 0.7629519481981315, + "grad_norm": 1.55473792552948, + "learning_rate": 9.59494398161448e-06, + "loss": 2.0131, + "step": 2683 + }, + { + "epoch": 0.7632363134415896, + "grad_norm": 1.5071228742599487, + "learning_rate": 9.583453030738294e-06, + "loss": 2.2608, + "step": 2684 + }, + { + "epoch": 0.7635206786850477, + "grad_norm": 1.4757564067840576, + "learning_rate": 9.571962079862109e-06, + "loss": 1.9793, + "step": 2685 + }, + { + "epoch": 0.7638050439285058, + "grad_norm": 1.4441161155700684, + "learning_rate": 9.560471128985925e-06, + "loss": 1.7435, + "step": 2686 + }, + { + "epoch": 0.764089409171964, + "grad_norm": 1.4009804725646973, + "learning_rate": 9.54898017810974e-06, + "loss": 1.7192, + "step": 2687 + }, + { + "epoch": 0.7643737744154221, + "grad_norm": 1.5814915895462036, + "learning_rate": 9.537489227233554e-06, + "loss": 1.6818, + "step": 2688 + }, + { + "epoch": 0.7646581396588802, + "grad_norm": 1.7841424942016602, + "learning_rate": 9.525998276357368e-06, + "loss": 2.8207, + "step": 2689 + }, + { + "epoch": 0.7649425049023383, + "grad_norm": 1.4650540351867676, + "learning_rate": 9.514507325481185e-06, + "loss": 2.3041, + "step": 2690 + }, + { + "epoch": 0.7652268701457964, + "grad_norm": 1.519219994544983, + "learning_rate": 9.503016374604999e-06, + "loss": 2.0641, + "step": 2691 + }, + { + "epoch": 0.7655112353892546, + "grad_norm": 1.5591269731521606, + "learning_rate": 9.491525423728815e-06, + "loss": 2.3594, + "step": 2692 + }, + { + "epoch": 0.7657956006327127, + "grad_norm": 1.680830717086792, + "learning_rate": 9.48003447285263e-06, + "loss": 2.3668, + "step": 2693 + }, + { + "epoch": 0.7660799658761708, + "grad_norm": 1.5440500974655151, + "learning_rate": 9.468543521976444e-06, + "loss": 1.9979, + "step": 2694 + }, + { + "epoch": 0.7663643311196289, + "grad_norm": 1.6357736587524414, + "learning_rate": 9.457052571100259e-06, + "loss": 1.729, + "step": 2695 + }, + { + "epoch": 0.766648696363087, + "grad_norm": 1.5347177982330322, + "learning_rate": 9.445561620224075e-06, + "loss": 1.8021, + "step": 2696 + }, + { + "epoch": 0.7669330616065452, + "grad_norm": 1.7074389457702637, + "learning_rate": 9.43407066934789e-06, + "loss": 2.7968, + "step": 2697 + }, + { + "epoch": 0.7672174268500033, + "grad_norm": 1.4788157939910889, + "learning_rate": 9.422579718471704e-06, + "loss": 2.5411, + "step": 2698 + }, + { + "epoch": 0.7675017920934614, + "grad_norm": 1.428257703781128, + "learning_rate": 9.41108876759552e-06, + "loss": 2.3632, + "step": 2699 + }, + { + "epoch": 0.7677861573369195, + "grad_norm": 1.4511548280715942, + "learning_rate": 9.399597816719335e-06, + "loss": 2.102, + "step": 2700 + }, + { + "epoch": 0.7680705225803777, + "grad_norm": 1.6660867929458618, + "learning_rate": 9.388106865843149e-06, + "loss": 1.9556, + "step": 2701 + }, + { + "epoch": 0.7683548878238358, + "grad_norm": 1.6087857484817505, + "learning_rate": 9.376615914966965e-06, + "loss": 1.9751, + "step": 2702 + }, + { + "epoch": 0.7686392530672939, + "grad_norm": 1.5220601558685303, + "learning_rate": 9.36512496409078e-06, + "loss": 1.8027, + "step": 2703 + }, + { + "epoch": 0.768923618310752, + "grad_norm": 1.727441668510437, + "learning_rate": 9.353634013214594e-06, + "loss": 1.7694, + "step": 2704 + }, + { + "epoch": 0.76920798355421, + "grad_norm": 1.7294251918792725, + "learning_rate": 9.342143062338409e-06, + "loss": 2.6689, + "step": 2705 + }, + { + "epoch": 0.7694923487976683, + "grad_norm": 1.4868618249893188, + "learning_rate": 9.330652111462223e-06, + "loss": 2.4436, + "step": 2706 + }, + { + "epoch": 0.7697767140411264, + "grad_norm": 1.4418073892593384, + "learning_rate": 9.31916116058604e-06, + "loss": 2.1285, + "step": 2707 + }, + { + "epoch": 0.7700610792845844, + "grad_norm": 1.4674290418624878, + "learning_rate": 9.307670209709856e-06, + "loss": 1.9759, + "step": 2708 + }, + { + "epoch": 0.7703454445280425, + "grad_norm": 1.6296724081039429, + "learning_rate": 9.29617925883367e-06, + "loss": 2.1011, + "step": 2709 + }, + { + "epoch": 0.7706298097715006, + "grad_norm": 1.5555256605148315, + "learning_rate": 9.284688307957485e-06, + "loss": 1.7846, + "step": 2710 + }, + { + "epoch": 0.7709141750149588, + "grad_norm": 1.4738327264785767, + "learning_rate": 9.273197357081299e-06, + "loss": 1.5781, + "step": 2711 + }, + { + "epoch": 0.7711985402584169, + "grad_norm": 1.5029698610305786, + "learning_rate": 9.261706406205114e-06, + "loss": 1.5587, + "step": 2712 + }, + { + "epoch": 0.771482905501875, + "grad_norm": 1.7328534126281738, + "learning_rate": 9.25021545532893e-06, + "loss": 2.6136, + "step": 2713 + }, + { + "epoch": 0.7717672707453331, + "grad_norm": 1.5154101848602295, + "learning_rate": 9.238724504452744e-06, + "loss": 2.4293, + "step": 2714 + }, + { + "epoch": 0.7720516359887912, + "grad_norm": 1.420172929763794, + "learning_rate": 9.227233553576559e-06, + "loss": 2.1027, + "step": 2715 + }, + { + "epoch": 0.7723360012322494, + "grad_norm": 1.5309847593307495, + "learning_rate": 9.215742602700373e-06, + "loss": 2.3935, + "step": 2716 + }, + { + "epoch": 0.7726203664757075, + "grad_norm": 1.5749589204788208, + "learning_rate": 9.20425165182419e-06, + "loss": 1.9977, + "step": 2717 + }, + { + "epoch": 0.7729047317191656, + "grad_norm": 1.5268434286117554, + "learning_rate": 9.192760700948004e-06, + "loss": 1.7747, + "step": 2718 + }, + { + "epoch": 0.7731890969626237, + "grad_norm": 1.5345548391342163, + "learning_rate": 9.18126975007182e-06, + "loss": 1.7534, + "step": 2719 + }, + { + "epoch": 0.7734734622060818, + "grad_norm": 1.646863341331482, + "learning_rate": 9.169778799195635e-06, + "loss": 1.7113, + "step": 2720 + }, + { + "epoch": 0.77375782744954, + "grad_norm": 1.6630712747573853, + "learning_rate": 9.15828784831945e-06, + "loss": 2.6318, + "step": 2721 + }, + { + "epoch": 0.7740421926929981, + "grad_norm": 1.4465360641479492, + "learning_rate": 9.146796897443264e-06, + "loss": 2.5043, + "step": 2722 + }, + { + "epoch": 0.7743265579364562, + "grad_norm": 1.4710736274719238, + "learning_rate": 9.13530594656708e-06, + "loss": 2.1993, + "step": 2723 + }, + { + "epoch": 0.7746109231799143, + "grad_norm": 1.5619592666625977, + "learning_rate": 9.123814995690894e-06, + "loss": 2.1542, + "step": 2724 + }, + { + "epoch": 0.7748952884233724, + "grad_norm": 1.507546305656433, + "learning_rate": 9.112324044814709e-06, + "loss": 2.048, + "step": 2725 + }, + { + "epoch": 0.7751796536668306, + "grad_norm": 1.5877704620361328, + "learning_rate": 9.100833093938523e-06, + "loss": 1.8837, + "step": 2726 + }, + { + "epoch": 0.7754640189102887, + "grad_norm": 1.5766810178756714, + "learning_rate": 9.08934214306234e-06, + "loss": 1.7013, + "step": 2727 + }, + { + "epoch": 0.7757483841537468, + "grad_norm": 1.535407304763794, + "learning_rate": 9.077851192186154e-06, + "loss": 1.726, + "step": 2728 + }, + { + "epoch": 0.7760327493972049, + "grad_norm": 1.723103404045105, + "learning_rate": 9.06636024130997e-06, + "loss": 2.5775, + "step": 2729 + }, + { + "epoch": 0.776317114640663, + "grad_norm": 1.5206818580627441, + "learning_rate": 9.054869290433785e-06, + "loss": 2.4745, + "step": 2730 + }, + { + "epoch": 0.7766014798841212, + "grad_norm": 1.486801266670227, + "learning_rate": 9.0433783395576e-06, + "loss": 2.3211, + "step": 2731 + }, + { + "epoch": 0.7768858451275793, + "grad_norm": 1.546639323234558, + "learning_rate": 9.031887388681414e-06, + "loss": 2.2065, + "step": 2732 + }, + { + "epoch": 0.7771702103710374, + "grad_norm": 1.6100459098815918, + "learning_rate": 9.020396437805228e-06, + "loss": 2.1838, + "step": 2733 + }, + { + "epoch": 0.7774545756144955, + "grad_norm": 1.537714958190918, + "learning_rate": 9.008905486929044e-06, + "loss": 1.845, + "step": 2734 + }, + { + "epoch": 0.7777389408579536, + "grad_norm": 1.5195116996765137, + "learning_rate": 8.997414536052859e-06, + "loss": 1.8668, + "step": 2735 + }, + { + "epoch": 0.7780233061014118, + "grad_norm": 1.630244255065918, + "learning_rate": 8.985923585176675e-06, + "loss": 1.8279, + "step": 2736 + }, + { + "epoch": 0.7783076713448699, + "grad_norm": 1.72090744972229, + "learning_rate": 8.97443263430049e-06, + "loss": 2.7764, + "step": 2737 + }, + { + "epoch": 0.778592036588328, + "grad_norm": 1.5594855546951294, + "learning_rate": 8.962941683424304e-06, + "loss": 2.2368, + "step": 2738 + }, + { + "epoch": 0.7788764018317861, + "grad_norm": 1.4783302545547485, + "learning_rate": 8.951450732548119e-06, + "loss": 2.2075, + "step": 2739 + }, + { + "epoch": 0.7791607670752442, + "grad_norm": 1.5036635398864746, + "learning_rate": 8.939959781671935e-06, + "loss": 2.1962, + "step": 2740 + }, + { + "epoch": 0.7794451323187024, + "grad_norm": 1.511878252029419, + "learning_rate": 8.92846883079575e-06, + "loss": 1.9705, + "step": 2741 + }, + { + "epoch": 0.7797294975621605, + "grad_norm": 1.6692416667938232, + "learning_rate": 8.916977879919564e-06, + "loss": 2.0001, + "step": 2742 + }, + { + "epoch": 0.7800138628056186, + "grad_norm": 1.5080974102020264, + "learning_rate": 8.905486929043378e-06, + "loss": 1.5839, + "step": 2743 + }, + { + "epoch": 0.7802982280490767, + "grad_norm": 1.500978946685791, + "learning_rate": 8.893995978167193e-06, + "loss": 1.7545, + "step": 2744 + }, + { + "epoch": 0.7805825932925348, + "grad_norm": 1.763393521308899, + "learning_rate": 8.882505027291009e-06, + "loss": 2.7148, + "step": 2745 + }, + { + "epoch": 0.780866958535993, + "grad_norm": 1.525657057762146, + "learning_rate": 8.871014076414825e-06, + "loss": 2.4186, + "step": 2746 + }, + { + "epoch": 0.7811513237794511, + "grad_norm": 1.4492183923721313, + "learning_rate": 8.85952312553864e-06, + "loss": 2.1651, + "step": 2747 + }, + { + "epoch": 0.7814356890229092, + "grad_norm": 1.5383533239364624, + "learning_rate": 8.848032174662454e-06, + "loss": 2.1932, + "step": 2748 + }, + { + "epoch": 0.7817200542663673, + "grad_norm": 1.5642298460006714, + "learning_rate": 8.836541223786269e-06, + "loss": 2.1424, + "step": 2749 + }, + { + "epoch": 0.7820044195098255, + "grad_norm": 1.4735430479049683, + "learning_rate": 8.825050272910083e-06, + "loss": 1.7743, + "step": 2750 + }, + { + "epoch": 0.7822887847532836, + "grad_norm": 1.368130087852478, + "learning_rate": 8.8135593220339e-06, + "loss": 1.8134, + "step": 2751 + }, + { + "epoch": 0.7825731499967417, + "grad_norm": 1.6192513704299927, + "learning_rate": 8.802068371157714e-06, + "loss": 1.7028, + "step": 2752 + }, + { + "epoch": 0.7828575152401998, + "grad_norm": 1.9017709493637085, + "learning_rate": 8.790577420281528e-06, + "loss": 2.8195, + "step": 2753 + }, + { + "epoch": 0.7831418804836578, + "grad_norm": 1.5791484117507935, + "learning_rate": 8.779086469405343e-06, + "loss": 2.4216, + "step": 2754 + }, + { + "epoch": 0.783426245727116, + "grad_norm": 1.4580869674682617, + "learning_rate": 8.767595518529159e-06, + "loss": 2.0641, + "step": 2755 + }, + { + "epoch": 0.7837106109705742, + "grad_norm": 1.4634650945663452, + "learning_rate": 8.756104567652974e-06, + "loss": 2.1984, + "step": 2756 + }, + { + "epoch": 0.7839949762140322, + "grad_norm": 1.5937823057174683, + "learning_rate": 8.74461361677679e-06, + "loss": 1.9328, + "step": 2757 + }, + { + "epoch": 0.7842793414574903, + "grad_norm": 1.5349475145339966, + "learning_rate": 8.733122665900604e-06, + "loss": 1.7728, + "step": 2758 + }, + { + "epoch": 0.7845637067009484, + "grad_norm": 1.4689794778823853, + "learning_rate": 8.721631715024419e-06, + "loss": 1.7538, + "step": 2759 + }, + { + "epoch": 0.7848480719444066, + "grad_norm": 1.59420645236969, + "learning_rate": 8.710140764148233e-06, + "loss": 1.587, + "step": 2760 + }, + { + "epoch": 0.7851324371878647, + "grad_norm": 1.7142568826675415, + "learning_rate": 8.69864981327205e-06, + "loss": 2.6669, + "step": 2761 + }, + { + "epoch": 0.7854168024313228, + "grad_norm": 1.4829474687576294, + "learning_rate": 8.687158862395864e-06, + "loss": 2.5419, + "step": 2762 + }, + { + "epoch": 0.7857011676747809, + "grad_norm": 1.5407049655914307, + "learning_rate": 8.675667911519678e-06, + "loss": 2.2923, + "step": 2763 + }, + { + "epoch": 0.785985532918239, + "grad_norm": 1.489302158355713, + "learning_rate": 8.664176960643495e-06, + "loss": 2.3303, + "step": 2764 + }, + { + "epoch": 0.7862698981616972, + "grad_norm": 1.5373653173446655, + "learning_rate": 8.652686009767309e-06, + "loss": 2.0711, + "step": 2765 + }, + { + "epoch": 0.7865542634051553, + "grad_norm": 1.6663075685501099, + "learning_rate": 8.641195058891124e-06, + "loss": 1.8473, + "step": 2766 + }, + { + "epoch": 0.7868386286486134, + "grad_norm": 1.595422387123108, + "learning_rate": 8.62970410801494e-06, + "loss": 1.7362, + "step": 2767 + }, + { + "epoch": 0.7871229938920715, + "grad_norm": 1.601948857307434, + "learning_rate": 8.618213157138754e-06, + "loss": 1.6511, + "step": 2768 + }, + { + "epoch": 0.7874073591355296, + "grad_norm": 1.8554433584213257, + "learning_rate": 8.606722206262569e-06, + "loss": 2.7364, + "step": 2769 + }, + { + "epoch": 0.7876917243789878, + "grad_norm": 1.4515372514724731, + "learning_rate": 8.595231255386383e-06, + "loss": 2.6112, + "step": 2770 + }, + { + "epoch": 0.7879760896224459, + "grad_norm": 1.4031703472137451, + "learning_rate": 8.583740304510198e-06, + "loss": 2.2281, + "step": 2771 + }, + { + "epoch": 0.788260454865904, + "grad_norm": 1.527431845664978, + "learning_rate": 8.572249353634014e-06, + "loss": 2.2233, + "step": 2772 + }, + { + "epoch": 0.7885448201093621, + "grad_norm": 1.635292649269104, + "learning_rate": 8.56075840275783e-06, + "loss": 2.1587, + "step": 2773 + }, + { + "epoch": 0.7888291853528202, + "grad_norm": 1.6808167695999146, + "learning_rate": 8.549267451881645e-06, + "loss": 1.8091, + "step": 2774 + }, + { + "epoch": 0.7891135505962784, + "grad_norm": 1.5428953170776367, + "learning_rate": 8.537776501005459e-06, + "loss": 1.9059, + "step": 2775 + }, + { + "epoch": 0.7893979158397365, + "grad_norm": 1.5486587285995483, + "learning_rate": 8.526285550129274e-06, + "loss": 1.6714, + "step": 2776 + }, + { + "epoch": 0.7896822810831946, + "grad_norm": 1.6755555868148804, + "learning_rate": 8.514794599253088e-06, + "loss": 2.6297, + "step": 2777 + }, + { + "epoch": 0.7899666463266527, + "grad_norm": 1.4705214500427246, + "learning_rate": 8.503303648376904e-06, + "loss": 2.2428, + "step": 2778 + }, + { + "epoch": 0.7902510115701108, + "grad_norm": 1.4628565311431885, + "learning_rate": 8.491812697500719e-06, + "loss": 2.0421, + "step": 2779 + }, + { + "epoch": 0.790535376813569, + "grad_norm": 1.5100350379943848, + "learning_rate": 8.480321746624533e-06, + "loss": 2.2767, + "step": 2780 + }, + { + "epoch": 0.7908197420570271, + "grad_norm": 1.6801701784133911, + "learning_rate": 8.468830795748348e-06, + "loss": 1.9363, + "step": 2781 + }, + { + "epoch": 0.7911041073004852, + "grad_norm": 1.6616177558898926, + "learning_rate": 8.457339844872164e-06, + "loss": 2.0598, + "step": 2782 + }, + { + "epoch": 0.7913884725439433, + "grad_norm": 1.5380514860153198, + "learning_rate": 8.445848893995979e-06, + "loss": 1.7708, + "step": 2783 + }, + { + "epoch": 0.7916728377874014, + "grad_norm": 1.5327441692352295, + "learning_rate": 8.434357943119795e-06, + "loss": 1.7078, + "step": 2784 + }, + { + "epoch": 0.7919572030308596, + "grad_norm": 1.597601294517517, + "learning_rate": 8.42286699224361e-06, + "loss": 2.5618, + "step": 2785 + }, + { + "epoch": 0.7922415682743177, + "grad_norm": 1.663575291633606, + "learning_rate": 8.411376041367424e-06, + "loss": 2.422, + "step": 2786 + }, + { + "epoch": 0.7925259335177758, + "grad_norm": 1.4837387800216675, + "learning_rate": 8.399885090491238e-06, + "loss": 2.3795, + "step": 2787 + }, + { + "epoch": 0.7928102987612339, + "grad_norm": 1.405079960823059, + "learning_rate": 8.388394139615054e-06, + "loss": 2.2017, + "step": 2788 + }, + { + "epoch": 0.793094664004692, + "grad_norm": 1.6203116178512573, + "learning_rate": 8.376903188738869e-06, + "loss": 2.1513, + "step": 2789 + }, + { + "epoch": 0.7933790292481502, + "grad_norm": 1.4136089086532593, + "learning_rate": 8.365412237862683e-06, + "loss": 1.6959, + "step": 2790 + }, + { + "epoch": 0.7936633944916083, + "grad_norm": 1.5652225017547607, + "learning_rate": 8.353921286986498e-06, + "loss": 1.7965, + "step": 2791 + }, + { + "epoch": 0.7939477597350664, + "grad_norm": 1.5910707712173462, + "learning_rate": 8.342430336110314e-06, + "loss": 1.6998, + "step": 2792 + }, + { + "epoch": 0.7942321249785245, + "grad_norm": 1.7475180625915527, + "learning_rate": 8.330939385234129e-06, + "loss": 2.6038, + "step": 2793 + }, + { + "epoch": 0.7945164902219826, + "grad_norm": 1.538881540298462, + "learning_rate": 8.319448434357945e-06, + "loss": 2.217, + "step": 2794 + }, + { + "epoch": 0.7948008554654408, + "grad_norm": 1.5075610876083374, + "learning_rate": 8.30795748348176e-06, + "loss": 2.4162, + "step": 2795 + }, + { + "epoch": 0.7950852207088989, + "grad_norm": 1.4764316082000732, + "learning_rate": 8.296466532605574e-06, + "loss": 2.1782, + "step": 2796 + }, + { + "epoch": 0.795369585952357, + "grad_norm": 1.5661760568618774, + "learning_rate": 8.284975581729388e-06, + "loss": 2.1276, + "step": 2797 + }, + { + "epoch": 0.7956539511958151, + "grad_norm": 1.4711788892745972, + "learning_rate": 8.273484630853203e-06, + "loss": 1.7211, + "step": 2798 + }, + { + "epoch": 0.7959383164392733, + "grad_norm": 1.5930848121643066, + "learning_rate": 8.261993679977019e-06, + "loss": 1.6124, + "step": 2799 + }, + { + "epoch": 0.7962226816827314, + "grad_norm": 1.6392720937728882, + "learning_rate": 8.250502729100833e-06, + "loss": 1.8286, + "step": 2800 + }, + { + "epoch": 0.7965070469261895, + "grad_norm": 1.7386542558670044, + "learning_rate": 8.23901177822465e-06, + "loss": 2.7341, + "step": 2801 + }, + { + "epoch": 0.7967914121696476, + "grad_norm": 1.4948952198028564, + "learning_rate": 8.227520827348464e-06, + "loss": 2.4624, + "step": 2802 + }, + { + "epoch": 0.7970757774131056, + "grad_norm": 1.3838168382644653, + "learning_rate": 8.216029876472279e-06, + "loss": 2.3202, + "step": 2803 + }, + { + "epoch": 0.7973601426565639, + "grad_norm": 1.4264721870422363, + "learning_rate": 8.204538925596093e-06, + "loss": 2.2274, + "step": 2804 + }, + { + "epoch": 0.797644507900022, + "grad_norm": 1.5542453527450562, + "learning_rate": 8.19304797471991e-06, + "loss": 2.0876, + "step": 2805 + }, + { + "epoch": 0.79792887314348, + "grad_norm": 1.5131659507751465, + "learning_rate": 8.181557023843724e-06, + "loss": 1.8612, + "step": 2806 + }, + { + "epoch": 0.7982132383869381, + "grad_norm": 1.5086268186569214, + "learning_rate": 8.170066072967538e-06, + "loss": 1.7961, + "step": 2807 + }, + { + "epoch": 0.7984976036303962, + "grad_norm": 1.5793956518173218, + "learning_rate": 8.158575122091353e-06, + "loss": 1.6068, + "step": 2808 + }, + { + "epoch": 0.7987819688738544, + "grad_norm": 1.7621535062789917, + "learning_rate": 8.147084171215169e-06, + "loss": 2.4022, + "step": 2809 + }, + { + "epoch": 0.7990663341173125, + "grad_norm": 1.5189746618270874, + "learning_rate": 8.135593220338983e-06, + "loss": 2.3452, + "step": 2810 + }, + { + "epoch": 0.7993506993607706, + "grad_norm": 1.418518304824829, + "learning_rate": 8.1241022694628e-06, + "loss": 1.9866, + "step": 2811 + }, + { + "epoch": 0.7996350646042287, + "grad_norm": 1.5955320596694946, + "learning_rate": 8.112611318586614e-06, + "loss": 2.1267, + "step": 2812 + }, + { + "epoch": 0.7999194298476868, + "grad_norm": 1.7210025787353516, + "learning_rate": 8.101120367710429e-06, + "loss": 1.9749, + "step": 2813 + }, + { + "epoch": 0.800203795091145, + "grad_norm": 1.5319161415100098, + "learning_rate": 8.089629416834243e-06, + "loss": 1.9589, + "step": 2814 + }, + { + "epoch": 0.8004881603346031, + "grad_norm": 1.6114243268966675, + "learning_rate": 8.07813846595806e-06, + "loss": 1.6376, + "step": 2815 + }, + { + "epoch": 0.8007725255780612, + "grad_norm": 1.6235712766647339, + "learning_rate": 8.066647515081874e-06, + "loss": 1.7676, + "step": 2816 + }, + { + "epoch": 0.8010568908215193, + "grad_norm": 1.5523582696914673, + "learning_rate": 8.055156564205688e-06, + "loss": 2.7133, + "step": 2817 + }, + { + "epoch": 0.8013412560649774, + "grad_norm": 1.4934026002883911, + "learning_rate": 8.043665613329503e-06, + "loss": 2.5263, + "step": 2818 + }, + { + "epoch": 0.8016256213084356, + "grad_norm": 1.4201688766479492, + "learning_rate": 8.032174662453317e-06, + "loss": 2.214, + "step": 2819 + }, + { + "epoch": 0.8019099865518937, + "grad_norm": 1.5365192890167236, + "learning_rate": 8.020683711577134e-06, + "loss": 2.2509, + "step": 2820 + }, + { + "epoch": 0.8021943517953518, + "grad_norm": 1.5041007995605469, + "learning_rate": 8.00919276070095e-06, + "loss": 2.1733, + "step": 2821 + }, + { + "epoch": 0.8024787170388099, + "grad_norm": 1.4607468843460083, + "learning_rate": 7.997701809824764e-06, + "loss": 1.8385, + "step": 2822 + }, + { + "epoch": 0.802763082282268, + "grad_norm": 1.6158239841461182, + "learning_rate": 7.986210858948579e-06, + "loss": 1.8724, + "step": 2823 + }, + { + "epoch": 0.8030474475257262, + "grad_norm": 1.6089496612548828, + "learning_rate": 7.974719908072393e-06, + "loss": 1.746, + "step": 2824 + }, + { + "epoch": 0.8033318127691843, + "grad_norm": 1.6502569913864136, + "learning_rate": 7.963228957196208e-06, + "loss": 2.6797, + "step": 2825 + }, + { + "epoch": 0.8036161780126424, + "grad_norm": 1.4397003650665283, + "learning_rate": 7.951738006320024e-06, + "loss": 2.2595, + "step": 2826 + }, + { + "epoch": 0.8039005432561005, + "grad_norm": 1.5017975568771362, + "learning_rate": 7.940247055443838e-06, + "loss": 2.3404, + "step": 2827 + }, + { + "epoch": 0.8041849084995586, + "grad_norm": 1.4788340330123901, + "learning_rate": 7.928756104567653e-06, + "loss": 2.2218, + "step": 2828 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 1.5507515668869019, + "learning_rate": 7.917265153691469e-06, + "loss": 2.0377, + "step": 2829 + }, + { + "epoch": 0.8047536389864749, + "grad_norm": 1.5333271026611328, + "learning_rate": 7.905774202815284e-06, + "loss": 1.948, + "step": 2830 + }, + { + "epoch": 0.805038004229933, + "grad_norm": 1.4439997673034668, + "learning_rate": 7.894283251939098e-06, + "loss": 1.7283, + "step": 2831 + }, + { + "epoch": 0.8053223694733911, + "grad_norm": 1.5981254577636719, + "learning_rate": 7.882792301062914e-06, + "loss": 1.899, + "step": 2832 + }, + { + "epoch": 0.8056067347168492, + "grad_norm": 1.6823999881744385, + "learning_rate": 7.871301350186729e-06, + "loss": 2.8227, + "step": 2833 + }, + { + "epoch": 0.8058910999603074, + "grad_norm": 1.593544840812683, + "learning_rate": 7.859810399310543e-06, + "loss": 2.4295, + "step": 2834 + }, + { + "epoch": 0.8061754652037655, + "grad_norm": 1.3846356868743896, + "learning_rate": 7.848319448434358e-06, + "loss": 2.0042, + "step": 2835 + }, + { + "epoch": 0.8064598304472236, + "grad_norm": 1.4523168802261353, + "learning_rate": 7.836828497558174e-06, + "loss": 2.0895, + "step": 2836 + }, + { + "epoch": 0.8067441956906817, + "grad_norm": 1.5731124877929688, + "learning_rate": 7.825337546681988e-06, + "loss": 2.2216, + "step": 2837 + }, + { + "epoch": 0.8070285609341398, + "grad_norm": 1.5912824869155884, + "learning_rate": 7.813846595805805e-06, + "loss": 2.0229, + "step": 2838 + }, + { + "epoch": 0.807312926177598, + "grad_norm": 1.5180526971817017, + "learning_rate": 7.80235564492962e-06, + "loss": 1.8176, + "step": 2839 + }, + { + "epoch": 0.8075972914210561, + "grad_norm": 1.5623652935028076, + "learning_rate": 7.790864694053434e-06, + "loss": 1.8123, + "step": 2840 + }, + { + "epoch": 0.8078816566645142, + "grad_norm": 1.7535358667373657, + "learning_rate": 7.779373743177248e-06, + "loss": 2.7814, + "step": 2841 + }, + { + "epoch": 0.8081660219079723, + "grad_norm": 1.5314868688583374, + "learning_rate": 7.767882792301064e-06, + "loss": 2.2653, + "step": 2842 + }, + { + "epoch": 0.8084503871514305, + "grad_norm": 1.4723985195159912, + "learning_rate": 7.756391841424879e-06, + "loss": 2.2257, + "step": 2843 + }, + { + "epoch": 0.8087347523948886, + "grad_norm": 1.5310198068618774, + "learning_rate": 7.744900890548693e-06, + "loss": 2.0548, + "step": 2844 + }, + { + "epoch": 0.8090191176383467, + "grad_norm": 1.6289122104644775, + "learning_rate": 7.733409939672508e-06, + "loss": 2.1181, + "step": 2845 + }, + { + "epoch": 0.8093034828818048, + "grad_norm": 1.669812798500061, + "learning_rate": 7.721918988796322e-06, + "loss": 1.6431, + "step": 2846 + }, + { + "epoch": 0.8095878481252629, + "grad_norm": 1.6252498626708984, + "learning_rate": 7.710428037920139e-06, + "loss": 1.7344, + "step": 2847 + }, + { + "epoch": 0.8098722133687211, + "grad_norm": 1.6311345100402832, + "learning_rate": 7.698937087043955e-06, + "loss": 1.8725, + "step": 2848 + }, + { + "epoch": 0.8101565786121792, + "grad_norm": 1.6725844144821167, + "learning_rate": 7.68744613616777e-06, + "loss": 2.5774, + "step": 2849 + }, + { + "epoch": 0.8104409438556373, + "grad_norm": 1.5067769289016724, + "learning_rate": 7.675955185291584e-06, + "loss": 2.2774, + "step": 2850 + }, + { + "epoch": 0.8107253090990953, + "grad_norm": 1.5363556146621704, + "learning_rate": 7.664464234415398e-06, + "loss": 2.1134, + "step": 2851 + }, + { + "epoch": 0.8110096743425534, + "grad_norm": 1.4846501350402832, + "learning_rate": 7.652973283539213e-06, + "loss": 2.0919, + "step": 2852 + }, + { + "epoch": 0.8112940395860117, + "grad_norm": 1.6034198999404907, + "learning_rate": 7.641482332663029e-06, + "loss": 2.0541, + "step": 2853 + }, + { + "epoch": 0.8115784048294697, + "grad_norm": 1.49429452419281, + "learning_rate": 7.629991381786843e-06, + "loss": 1.6646, + "step": 2854 + }, + { + "epoch": 0.8118627700729278, + "grad_norm": 1.4957629442214966, + "learning_rate": 7.618500430910658e-06, + "loss": 1.8006, + "step": 2855 + }, + { + "epoch": 0.8121471353163859, + "grad_norm": 1.5566238164901733, + "learning_rate": 7.607009480034473e-06, + "loss": 1.7759, + "step": 2856 + }, + { + "epoch": 0.812431500559844, + "grad_norm": 1.7334346771240234, + "learning_rate": 7.595518529158289e-06, + "loss": 2.607, + "step": 2857 + }, + { + "epoch": 0.8127158658033022, + "grad_norm": 1.5797719955444336, + "learning_rate": 7.584027578282104e-06, + "loss": 2.4472, + "step": 2858 + }, + { + "epoch": 0.8130002310467603, + "grad_norm": 1.4985685348510742, + "learning_rate": 7.572536627405918e-06, + "loss": 2.4235, + "step": 2859 + }, + { + "epoch": 0.8132845962902184, + "grad_norm": 1.4722380638122559, + "learning_rate": 7.561045676529734e-06, + "loss": 2.1274, + "step": 2860 + }, + { + "epoch": 0.8135689615336765, + "grad_norm": 1.4994633197784424, + "learning_rate": 7.549554725653548e-06, + "loss": 2.074, + "step": 2861 + }, + { + "epoch": 0.8138533267771346, + "grad_norm": 1.689273476600647, + "learning_rate": 7.538063774777364e-06, + "loss": 2.0419, + "step": 2862 + }, + { + "epoch": 0.8141376920205928, + "grad_norm": 1.5861849784851074, + "learning_rate": 7.526572823901178e-06, + "loss": 1.9478, + "step": 2863 + }, + { + "epoch": 0.8144220572640509, + "grad_norm": 1.5588481426239014, + "learning_rate": 7.5150818730249935e-06, + "loss": 1.7538, + "step": 2864 + }, + { + "epoch": 0.814706422507509, + "grad_norm": 1.675533413887024, + "learning_rate": 7.503590922148808e-06, + "loss": 2.7183, + "step": 2865 + }, + { + "epoch": 0.8149907877509671, + "grad_norm": 1.4782207012176514, + "learning_rate": 7.492099971272624e-06, + "loss": 2.4106, + "step": 2866 + }, + { + "epoch": 0.8152751529944252, + "grad_norm": 1.4612501859664917, + "learning_rate": 7.480609020396439e-06, + "loss": 2.4035, + "step": 2867 + }, + { + "epoch": 0.8155595182378834, + "grad_norm": 1.5237157344818115, + "learning_rate": 7.469118069520254e-06, + "loss": 2.203, + "step": 2868 + }, + { + "epoch": 0.8158438834813415, + "grad_norm": 1.601366639137268, + "learning_rate": 7.4576271186440685e-06, + "loss": 2.0569, + "step": 2869 + }, + { + "epoch": 0.8161282487247996, + "grad_norm": 1.5333240032196045, + "learning_rate": 7.446136167767883e-06, + "loss": 1.932, + "step": 2870 + }, + { + "epoch": 0.8164126139682577, + "grad_norm": 1.6135165691375732, + "learning_rate": 7.434645216891698e-06, + "loss": 1.879, + "step": 2871 + }, + { + "epoch": 0.8166969792117158, + "grad_norm": 1.542605996131897, + "learning_rate": 7.423154266015513e-06, + "loss": 1.7374, + "step": 2872 + }, + { + "epoch": 0.816981344455174, + "grad_norm": 1.7298074960708618, + "learning_rate": 7.411663315139328e-06, + "loss": 2.7893, + "step": 2873 + }, + { + "epoch": 0.8172657096986321, + "grad_norm": 1.439632773399353, + "learning_rate": 7.400172364263143e-06, + "loss": 2.1772, + "step": 2874 + }, + { + "epoch": 0.8175500749420902, + "grad_norm": 1.3654167652130127, + "learning_rate": 7.388681413386959e-06, + "loss": 2.2241, + "step": 2875 + }, + { + "epoch": 0.8178344401855483, + "grad_norm": 1.4207921028137207, + "learning_rate": 7.377190462510773e-06, + "loss": 2.1142, + "step": 2876 + }, + { + "epoch": 0.8181188054290064, + "grad_norm": 1.5437437295913696, + "learning_rate": 7.365699511634589e-06, + "loss": 2.0925, + "step": 2877 + }, + { + "epoch": 0.8184031706724646, + "grad_norm": 1.6208105087280273, + "learning_rate": 7.354208560758403e-06, + "loss": 1.9039, + "step": 2878 + }, + { + "epoch": 0.8186875359159227, + "grad_norm": 1.5807172060012817, + "learning_rate": 7.3427176098822185e-06, + "loss": 1.8186, + "step": 2879 + }, + { + "epoch": 0.8189719011593808, + "grad_norm": 1.6478073596954346, + "learning_rate": 7.331226659006033e-06, + "loss": 1.7717, + "step": 2880 + }, + { + "epoch": 0.8192562664028389, + "grad_norm": 1.6527962684631348, + "learning_rate": 7.319735708129848e-06, + "loss": 2.7834, + "step": 2881 + }, + { + "epoch": 0.819540631646297, + "grad_norm": 1.4940738677978516, + "learning_rate": 7.308244757253663e-06, + "loss": 2.2835, + "step": 2882 + }, + { + "epoch": 0.8198249968897552, + "grad_norm": 1.490037202835083, + "learning_rate": 7.296753806377478e-06, + "loss": 2.1039, + "step": 2883 + }, + { + "epoch": 0.8201093621332133, + "grad_norm": 1.4153006076812744, + "learning_rate": 7.2852628555012936e-06, + "loss": 2.1536, + "step": 2884 + }, + { + "epoch": 0.8203937273766714, + "grad_norm": 1.5724902153015137, + "learning_rate": 7.273771904625109e-06, + "loss": 1.9262, + "step": 2885 + }, + { + "epoch": 0.8206780926201295, + "grad_norm": 1.4987521171569824, + "learning_rate": 7.262280953748923e-06, + "loss": 1.9364, + "step": 2886 + }, + { + "epoch": 0.8209624578635876, + "grad_norm": 1.650689721107483, + "learning_rate": 7.250790002872739e-06, + "loss": 1.7343, + "step": 2887 + }, + { + "epoch": 0.8212468231070458, + "grad_norm": 1.5347603559494019, + "learning_rate": 7.239299051996553e-06, + "loss": 1.9424, + "step": 2888 + }, + { + "epoch": 0.8215311883505039, + "grad_norm": 1.7783067226409912, + "learning_rate": 7.227808101120369e-06, + "loss": 2.81, + "step": 2889 + }, + { + "epoch": 0.821815553593962, + "grad_norm": 1.4985435009002686, + "learning_rate": 7.216317150244183e-06, + "loss": 2.3066, + "step": 2890 + }, + { + "epoch": 0.8220999188374201, + "grad_norm": 1.5006333589553833, + "learning_rate": 7.204826199367998e-06, + "loss": 2.191, + "step": 2891 + }, + { + "epoch": 0.8223842840808783, + "grad_norm": 1.4915610551834106, + "learning_rate": 7.193335248491813e-06, + "loss": 2.0556, + "step": 2892 + }, + { + "epoch": 0.8226686493243364, + "grad_norm": 1.5882519483566284, + "learning_rate": 7.1818442976156274e-06, + "loss": 2.0908, + "step": 2893 + }, + { + "epoch": 0.8229530145677945, + "grad_norm": 1.4619516134262085, + "learning_rate": 7.170353346739444e-06, + "loss": 1.9773, + "step": 2894 + }, + { + "epoch": 0.8232373798112526, + "grad_norm": 1.5004616975784302, + "learning_rate": 7.158862395863259e-06, + "loss": 1.9638, + "step": 2895 + }, + { + "epoch": 0.8235217450547107, + "grad_norm": 1.5177069902420044, + "learning_rate": 7.1473714449870735e-06, + "loss": 1.6566, + "step": 2896 + }, + { + "epoch": 0.8238061102981689, + "grad_norm": 1.724021315574646, + "learning_rate": 7.135880494110888e-06, + "loss": 2.7129, + "step": 2897 + }, + { + "epoch": 0.824090475541627, + "grad_norm": 1.537143349647522, + "learning_rate": 7.124389543234703e-06, + "loss": 2.3473, + "step": 2898 + }, + { + "epoch": 0.824374840785085, + "grad_norm": 1.3727134466171265, + "learning_rate": 7.112898592358518e-06, + "loss": 2.4072, + "step": 2899 + }, + { + "epoch": 0.8246592060285431, + "grad_norm": 1.4793521165847778, + "learning_rate": 7.101407641482333e-06, + "loss": 2.1391, + "step": 2900 + }, + { + "epoch": 0.8249435712720012, + "grad_norm": 1.4474788904190063, + "learning_rate": 7.089916690606148e-06, + "loss": 2.084, + "step": 2901 + }, + { + "epoch": 0.8252279365154594, + "grad_norm": 1.6021233797073364, + "learning_rate": 7.078425739729963e-06, + "loss": 1.8724, + "step": 2902 + }, + { + "epoch": 0.8255123017589175, + "grad_norm": 1.5263599157333374, + "learning_rate": 7.066934788853778e-06, + "loss": 1.7535, + "step": 2903 + }, + { + "epoch": 0.8257966670023756, + "grad_norm": 1.5314630270004272, + "learning_rate": 7.055443837977594e-06, + "loss": 1.7294, + "step": 2904 + }, + { + "epoch": 0.8260810322458337, + "grad_norm": 1.6846193075180054, + "learning_rate": 7.043952887101408e-06, + "loss": 2.6378, + "step": 2905 + }, + { + "epoch": 0.8263653974892918, + "grad_norm": 1.6511030197143555, + "learning_rate": 7.0324619362252235e-06, + "loss": 2.4237, + "step": 2906 + }, + { + "epoch": 0.82664976273275, + "grad_norm": 1.453942894935608, + "learning_rate": 7.020970985349038e-06, + "loss": 2.3405, + "step": 2907 + }, + { + "epoch": 0.8269341279762081, + "grad_norm": 1.4035779237747192, + "learning_rate": 7.009480034472853e-06, + "loss": 2.0887, + "step": 2908 + }, + { + "epoch": 0.8272184932196662, + "grad_norm": 1.5787320137023926, + "learning_rate": 6.997989083596668e-06, + "loss": 2.1067, + "step": 2909 + }, + { + "epoch": 0.8275028584631243, + "grad_norm": 1.6145730018615723, + "learning_rate": 6.986498132720483e-06, + "loss": 1.9487, + "step": 2910 + }, + { + "epoch": 0.8277872237065824, + "grad_norm": 1.4792077541351318, + "learning_rate": 6.975007181844298e-06, + "loss": 1.8149, + "step": 2911 + }, + { + "epoch": 0.8280715889500406, + "grad_norm": 1.5759203433990479, + "learning_rate": 6.963516230968114e-06, + "loss": 1.8977, + "step": 2912 + }, + { + "epoch": 0.8283559541934987, + "grad_norm": 1.6006420850753784, + "learning_rate": 6.952025280091928e-06, + "loss": 2.8743, + "step": 2913 + }, + { + "epoch": 0.8286403194369568, + "grad_norm": 1.4774819612503052, + "learning_rate": 6.940534329215744e-06, + "loss": 2.3842, + "step": 2914 + }, + { + "epoch": 0.8289246846804149, + "grad_norm": 1.4915317296981812, + "learning_rate": 6.929043378339558e-06, + "loss": 2.2381, + "step": 2915 + }, + { + "epoch": 0.829209049923873, + "grad_norm": 1.4956070184707642, + "learning_rate": 6.9175524274633736e-06, + "loss": 2.2751, + "step": 2916 + }, + { + "epoch": 0.8294934151673312, + "grad_norm": 1.5358397960662842, + "learning_rate": 6.906061476587188e-06, + "loss": 2.0869, + "step": 2917 + }, + { + "epoch": 0.8297777804107893, + "grad_norm": 1.4759082794189453, + "learning_rate": 6.8945705257110026e-06, + "loss": 1.6791, + "step": 2918 + }, + { + "epoch": 0.8300621456542474, + "grad_norm": 1.5984234809875488, + "learning_rate": 6.883079574834818e-06, + "loss": 1.7364, + "step": 2919 + }, + { + "epoch": 0.8303465108977055, + "grad_norm": 1.6775093078613281, + "learning_rate": 6.871588623958632e-06, + "loss": 1.734, + "step": 2920 + }, + { + "epoch": 0.8306308761411636, + "grad_norm": 1.687727928161621, + "learning_rate": 6.860097673082449e-06, + "loss": 2.5652, + "step": 2921 + }, + { + "epoch": 0.8309152413846218, + "grad_norm": 1.4916083812713623, + "learning_rate": 6.848606722206264e-06, + "loss": 2.2731, + "step": 2922 + }, + { + "epoch": 0.8311996066280799, + "grad_norm": 1.407136082649231, + "learning_rate": 6.8371157713300784e-06, + "loss": 2.1255, + "step": 2923 + }, + { + "epoch": 0.831483971871538, + "grad_norm": 1.390185832977295, + "learning_rate": 6.825624820453893e-06, + "loss": 2.043, + "step": 2924 + }, + { + "epoch": 0.8317683371149961, + "grad_norm": 1.599908471107483, + "learning_rate": 6.814133869577708e-06, + "loss": 2.3225, + "step": 2925 + }, + { + "epoch": 0.8320527023584542, + "grad_norm": 1.5953835248947144, + "learning_rate": 6.802642918701523e-06, + "loss": 1.7706, + "step": 2926 + }, + { + "epoch": 0.8323370676019124, + "grad_norm": 1.5901626348495483, + "learning_rate": 6.791151967825338e-06, + "loss": 1.7223, + "step": 2927 + }, + { + "epoch": 0.8326214328453705, + "grad_norm": 1.617637038230896, + "learning_rate": 6.779661016949153e-06, + "loss": 1.9098, + "step": 2928 + }, + { + "epoch": 0.8329057980888286, + "grad_norm": 1.7057212591171265, + "learning_rate": 6.768170066072968e-06, + "loss": 2.6873, + "step": 2929 + }, + { + "epoch": 0.8331901633322867, + "grad_norm": 1.514479637145996, + "learning_rate": 6.7566791151967825e-06, + "loss": 2.2949, + "step": 2930 + }, + { + "epoch": 0.8334745285757448, + "grad_norm": 1.5152493715286255, + "learning_rate": 6.745188164320599e-06, + "loss": 2.2854, + "step": 2931 + }, + { + "epoch": 0.833758893819203, + "grad_norm": 1.4838507175445557, + "learning_rate": 6.733697213444413e-06, + "loss": 1.9852, + "step": 2932 + }, + { + "epoch": 0.8340432590626611, + "grad_norm": 1.5288348197937012, + "learning_rate": 6.7222062625682285e-06, + "loss": 1.994, + "step": 2933 + }, + { + "epoch": 0.8343276243061192, + "grad_norm": 1.4767318964004517, + "learning_rate": 6.710715311692043e-06, + "loss": 1.6974, + "step": 2934 + }, + { + "epoch": 0.8346119895495773, + "grad_norm": 1.5459532737731934, + "learning_rate": 6.699224360815858e-06, + "loss": 1.9026, + "step": 2935 + }, + { + "epoch": 0.8348963547930354, + "grad_norm": 1.5643906593322754, + "learning_rate": 6.687733409939673e-06, + "loss": 1.7265, + "step": 2936 + }, + { + "epoch": 0.8351807200364936, + "grad_norm": 1.6530026197433472, + "learning_rate": 6.676242459063488e-06, + "loss": 2.6701, + "step": 2937 + }, + { + "epoch": 0.8354650852799517, + "grad_norm": 1.464538812637329, + "learning_rate": 6.664751508187303e-06, + "loss": 2.3489, + "step": 2938 + }, + { + "epoch": 0.8357494505234098, + "grad_norm": 1.469451904296875, + "learning_rate": 6.653260557311117e-06, + "loss": 2.242, + "step": 2939 + }, + { + "epoch": 0.8360338157668679, + "grad_norm": 1.4956061840057373, + "learning_rate": 6.641769606434933e-06, + "loss": 2.0947, + "step": 2940 + }, + { + "epoch": 0.8363181810103261, + "grad_norm": 1.5736510753631592, + "learning_rate": 6.630278655558749e-06, + "loss": 2.1535, + "step": 2941 + }, + { + "epoch": 0.8366025462537842, + "grad_norm": 1.4406914710998535, + "learning_rate": 6.618787704682563e-06, + "loss": 1.8535, + "step": 2942 + }, + { + "epoch": 0.8368869114972423, + "grad_norm": 1.5601533651351929, + "learning_rate": 6.6072967538063786e-06, + "loss": 1.9428, + "step": 2943 + }, + { + "epoch": 0.8371712767407004, + "grad_norm": 1.4611488580703735, + "learning_rate": 6.595805802930193e-06, + "loss": 1.6424, + "step": 2944 + }, + { + "epoch": 0.8374556419841584, + "grad_norm": 1.6867340803146362, + "learning_rate": 6.5843148520540075e-06, + "loss": 2.6731, + "step": 2945 + }, + { + "epoch": 0.8377400072276167, + "grad_norm": 1.4799604415893555, + "learning_rate": 6.572823901177823e-06, + "loss": 2.3161, + "step": 2946 + }, + { + "epoch": 0.8380243724710748, + "grad_norm": 1.4663941860198975, + "learning_rate": 6.561332950301637e-06, + "loss": 2.2613, + "step": 2947 + }, + { + "epoch": 0.8383087377145328, + "grad_norm": 1.496065616607666, + "learning_rate": 6.549841999425453e-06, + "loss": 1.9605, + "step": 2948 + }, + { + "epoch": 0.8385931029579909, + "grad_norm": 1.416529893875122, + "learning_rate": 6.538351048549268e-06, + "loss": 2.041, + "step": 2949 + }, + { + "epoch": 0.838877468201449, + "grad_norm": 1.5188525915145874, + "learning_rate": 6.5268600976730834e-06, + "loss": 2.0049, + "step": 2950 + }, + { + "epoch": 0.8391618334449072, + "grad_norm": 1.6561343669891357, + "learning_rate": 6.515369146796898e-06, + "loss": 1.7761, + "step": 2951 + }, + { + "epoch": 0.8394461986883653, + "grad_norm": 1.8842443227767944, + "learning_rate": 6.503878195920713e-06, + "loss": 1.8444, + "step": 2952 + }, + { + "epoch": 0.8397305639318234, + "grad_norm": 1.5928751230239868, + "learning_rate": 6.492387245044528e-06, + "loss": 2.7766, + "step": 2953 + }, + { + "epoch": 0.8400149291752815, + "grad_norm": 1.551367998123169, + "learning_rate": 6.480896294168343e-06, + "loss": 2.3994, + "step": 2954 + }, + { + "epoch": 0.8402992944187396, + "grad_norm": 1.4871084690093994, + "learning_rate": 6.469405343292158e-06, + "loss": 2.2015, + "step": 2955 + }, + { + "epoch": 0.8405836596621978, + "grad_norm": 1.6148115396499634, + "learning_rate": 6.457914392415973e-06, + "loss": 2.1799, + "step": 2956 + }, + { + "epoch": 0.8408680249056559, + "grad_norm": 1.5156315565109253, + "learning_rate": 6.4464234415397874e-06, + "loss": 2.031, + "step": 2957 + }, + { + "epoch": 0.841152390149114, + "grad_norm": 1.4778313636779785, + "learning_rate": 6.434932490663604e-06, + "loss": 1.9324, + "step": 2958 + }, + { + "epoch": 0.8414367553925721, + "grad_norm": 1.5155595541000366, + "learning_rate": 6.423441539787418e-06, + "loss": 1.7774, + "step": 2959 + }, + { + "epoch": 0.8417211206360302, + "grad_norm": 1.6561177968978882, + "learning_rate": 6.4119505889112335e-06, + "loss": 1.724, + "step": 2960 + }, + { + "epoch": 0.8420054858794884, + "grad_norm": 2.0607149600982666, + "learning_rate": 6.400459638035048e-06, + "loss": 2.829, + "step": 2961 + }, + { + "epoch": 0.8422898511229465, + "grad_norm": 1.4684460163116455, + "learning_rate": 6.388968687158863e-06, + "loss": 2.5184, + "step": 2962 + }, + { + "epoch": 0.8425742163664046, + "grad_norm": 1.4178595542907715, + "learning_rate": 6.377477736282678e-06, + "loss": 2.165, + "step": 2963 + }, + { + "epoch": 0.8428585816098627, + "grad_norm": 1.4885823726654053, + "learning_rate": 6.365986785406492e-06, + "loss": 2.027, + "step": 2964 + }, + { + "epoch": 0.8431429468533208, + "grad_norm": 1.5183528661727905, + "learning_rate": 6.354495834530308e-06, + "loss": 1.9819, + "step": 2965 + }, + { + "epoch": 0.843427312096779, + "grad_norm": 1.3794087171554565, + "learning_rate": 6.343004883654122e-06, + "loss": 1.5737, + "step": 2966 + }, + { + "epoch": 0.8437116773402371, + "grad_norm": 1.5426974296569824, + "learning_rate": 6.3315139327779375e-06, + "loss": 1.6998, + "step": 2967 + }, + { + "epoch": 0.8439960425836952, + "grad_norm": 1.6509177684783936, + "learning_rate": 6.320022981901754e-06, + "loss": 1.8888, + "step": 2968 + }, + { + "epoch": 0.8442804078271533, + "grad_norm": 1.6451570987701416, + "learning_rate": 6.308532031025568e-06, + "loss": 2.8388, + "step": 2969 + }, + { + "epoch": 0.8445647730706114, + "grad_norm": 1.47929048538208, + "learning_rate": 6.297041080149383e-06, + "loss": 2.4065, + "step": 2970 + }, + { + "epoch": 0.8448491383140696, + "grad_norm": 1.3882609605789185, + "learning_rate": 6.285550129273198e-06, + "loss": 2.1397, + "step": 2971 + }, + { + "epoch": 0.8451335035575277, + "grad_norm": 1.4089328050613403, + "learning_rate": 6.2740591783970125e-06, + "loss": 2.13, + "step": 2972 + }, + { + "epoch": 0.8454178688009858, + "grad_norm": 1.5933607816696167, + "learning_rate": 6.262568227520828e-06, + "loss": 2.1391, + "step": 2973 + }, + { + "epoch": 0.8457022340444439, + "grad_norm": 1.4820233583450317, + "learning_rate": 6.251077276644642e-06, + "loss": 1.9708, + "step": 2974 + }, + { + "epoch": 0.845986599287902, + "grad_norm": 1.4918299913406372, + "learning_rate": 6.239586325768458e-06, + "loss": 1.8649, + "step": 2975 + }, + { + "epoch": 0.8462709645313602, + "grad_norm": 1.7036802768707275, + "learning_rate": 6.228095374892272e-06, + "loss": 1.8209, + "step": 2976 + }, + { + "epoch": 0.8465553297748183, + "grad_norm": 1.7224177122116089, + "learning_rate": 6.216604424016088e-06, + "loss": 2.6908, + "step": 2977 + }, + { + "epoch": 0.8468396950182764, + "grad_norm": 1.5237011909484863, + "learning_rate": 6.205113473139903e-06, + "loss": 2.4184, + "step": 2978 + }, + { + "epoch": 0.8471240602617345, + "grad_norm": 1.4036983251571655, + "learning_rate": 6.193622522263718e-06, + "loss": 2.0392, + "step": 2979 + }, + { + "epoch": 0.8474084255051926, + "grad_norm": 1.3745940923690796, + "learning_rate": 6.182131571387533e-06, + "loss": 2.136, + "step": 2980 + }, + { + "epoch": 0.8476927907486508, + "grad_norm": 1.3899210691452026, + "learning_rate": 6.170640620511348e-06, + "loss": 1.8753, + "step": 2981 + }, + { + "epoch": 0.8479771559921089, + "grad_norm": 1.5072089433670044, + "learning_rate": 6.159149669635163e-06, + "loss": 1.8412, + "step": 2982 + }, + { + "epoch": 0.848261521235567, + "grad_norm": 1.4439475536346436, + "learning_rate": 6.147658718758978e-06, + "loss": 1.5971, + "step": 2983 + }, + { + "epoch": 0.8485458864790251, + "grad_norm": 1.7163138389587402, + "learning_rate": 6.1361677678827924e-06, + "loss": 1.6792, + "step": 2984 + }, + { + "epoch": 0.8488302517224832, + "grad_norm": 1.6258455514907837, + "learning_rate": 6.124676817006607e-06, + "loss": 2.6053, + "step": 2985 + }, + { + "epoch": 0.8491146169659414, + "grad_norm": 1.390105962753296, + "learning_rate": 6.113185866130423e-06, + "loss": 2.3118, + "step": 2986 + }, + { + "epoch": 0.8493989822093995, + "grad_norm": 1.5217338800430298, + "learning_rate": 6.1016949152542385e-06, + "loss": 2.1152, + "step": 2987 + }, + { + "epoch": 0.8496833474528576, + "grad_norm": 1.5048413276672363, + "learning_rate": 6.090203964378053e-06, + "loss": 2.2262, + "step": 2988 + }, + { + "epoch": 0.8499677126963157, + "grad_norm": 1.4718562364578247, + "learning_rate": 6.078713013501868e-06, + "loss": 2.1621, + "step": 2989 + }, + { + "epoch": 0.8502520779397739, + "grad_norm": 1.3779963254928589, + "learning_rate": 6.067222062625683e-06, + "loss": 1.6434, + "step": 2990 + }, + { + "epoch": 0.850536443183232, + "grad_norm": 1.5905683040618896, + "learning_rate": 6.055731111749497e-06, + "loss": 1.7679, + "step": 2991 + }, + { + "epoch": 0.8508208084266901, + "grad_norm": 1.6249622106552124, + "learning_rate": 6.044240160873313e-06, + "loss": 1.661, + "step": 2992 + }, + { + "epoch": 0.8511051736701482, + "grad_norm": 1.6707121133804321, + "learning_rate": 6.032749209997127e-06, + "loss": 2.5263, + "step": 2993 + }, + { + "epoch": 0.8513895389136062, + "grad_norm": 1.5046759843826294, + "learning_rate": 6.0212582591209425e-06, + "loss": 2.2967, + "step": 2994 + }, + { + "epoch": 0.8516739041570645, + "grad_norm": 1.5674413442611694, + "learning_rate": 6.009767308244759e-06, + "loss": 2.0386, + "step": 2995 + }, + { + "epoch": 0.8519582694005225, + "grad_norm": 1.4231250286102295, + "learning_rate": 5.998276357368573e-06, + "loss": 2.318, + "step": 2996 + }, + { + "epoch": 0.8522426346439806, + "grad_norm": 1.5579756498336792, + "learning_rate": 5.986785406492388e-06, + "loss": 2.0154, + "step": 2997 + }, + { + "epoch": 0.8525269998874387, + "grad_norm": 1.404220700263977, + "learning_rate": 5.975294455616203e-06, + "loss": 1.7704, + "step": 2998 + }, + { + "epoch": 0.8528113651308968, + "grad_norm": 1.4892497062683105, + "learning_rate": 5.9638035047400175e-06, + "loss": 1.7247, + "step": 2999 + }, + { + "epoch": 0.853095730374355, + "grad_norm": 1.6335406303405762, + "learning_rate": 5.952312553863833e-06, + "loss": 1.5733, + "step": 3000 + }, + { + "epoch": 0.8533800956178131, + "grad_norm": 1.781394124031067, + "learning_rate": 5.940821602987647e-06, + "loss": 2.7558, + "step": 3001 + }, + { + "epoch": 0.8536644608612712, + "grad_norm": 1.4314653873443604, + "learning_rate": 5.929330652111463e-06, + "loss": 2.3129, + "step": 3002 + }, + { + "epoch": 0.8539488261047293, + "grad_norm": 1.5230185985565186, + "learning_rate": 5.917839701235277e-06, + "loss": 2.3062, + "step": 3003 + }, + { + "epoch": 0.8542331913481874, + "grad_norm": 1.4675681591033936, + "learning_rate": 5.9063487503590925e-06, + "loss": 2.0741, + "step": 3004 + }, + { + "epoch": 0.8545175565916456, + "grad_norm": 1.5174206495285034, + "learning_rate": 5.894857799482908e-06, + "loss": 2.0766, + "step": 3005 + }, + { + "epoch": 0.8548019218351037, + "grad_norm": 1.480103850364685, + "learning_rate": 5.883366848606723e-06, + "loss": 1.6913, + "step": 3006 + }, + { + "epoch": 0.8550862870785618, + "grad_norm": 1.5968501567840576, + "learning_rate": 5.871875897730538e-06, + "loss": 1.7072, + "step": 3007 + }, + { + "epoch": 0.8553706523220199, + "grad_norm": 1.528976321220398, + "learning_rate": 5.860384946854353e-06, + "loss": 1.6712, + "step": 3008 + }, + { + "epoch": 0.855655017565478, + "grad_norm": 1.6280978918075562, + "learning_rate": 5.8488939959781676e-06, + "loss": 2.822, + "step": 3009 + }, + { + "epoch": 0.8559393828089362, + "grad_norm": 1.4882813692092896, + "learning_rate": 5.837403045101983e-06, + "loss": 2.0284, + "step": 3010 + }, + { + "epoch": 0.8562237480523943, + "grad_norm": 1.4951153993606567, + "learning_rate": 5.825912094225797e-06, + "loss": 2.2519, + "step": 3011 + }, + { + "epoch": 0.8565081132958524, + "grad_norm": 1.4803876876831055, + "learning_rate": 5.814421143349612e-06, + "loss": 2.2105, + "step": 3012 + }, + { + "epoch": 0.8567924785393105, + "grad_norm": 1.4944387674331665, + "learning_rate": 5.802930192473427e-06, + "loss": 2.0987, + "step": 3013 + }, + { + "epoch": 0.8570768437827686, + "grad_norm": 1.4236083030700684, + "learning_rate": 5.7914392415972434e-06, + "loss": 1.8932, + "step": 3014 + }, + { + "epoch": 0.8573612090262268, + "grad_norm": 1.4983583688735962, + "learning_rate": 5.779948290721058e-06, + "loss": 1.6304, + "step": 3015 + }, + { + "epoch": 0.8576455742696849, + "grad_norm": 1.554520606994629, + "learning_rate": 5.768457339844873e-06, + "loss": 1.5071, + "step": 3016 + }, + { + "epoch": 0.857929939513143, + "grad_norm": 1.8354604244232178, + "learning_rate": 5.756966388968688e-06, + "loss": 2.5812, + "step": 3017 + }, + { + "epoch": 0.8582143047566011, + "grad_norm": 1.4702214002609253, + "learning_rate": 5.745475438092502e-06, + "loss": 2.4053, + "step": 3018 + }, + { + "epoch": 0.8584986700000592, + "grad_norm": 1.5303442478179932, + "learning_rate": 5.733984487216318e-06, + "loss": 2.3453, + "step": 3019 + }, + { + "epoch": 0.8587830352435174, + "grad_norm": 1.5670809745788574, + "learning_rate": 5.722493536340132e-06, + "loss": 2.2886, + "step": 3020 + }, + { + "epoch": 0.8590674004869755, + "grad_norm": 1.6706324815750122, + "learning_rate": 5.7110025854639475e-06, + "loss": 1.9736, + "step": 3021 + }, + { + "epoch": 0.8593517657304336, + "grad_norm": 1.497307300567627, + "learning_rate": 5.699511634587762e-06, + "loss": 1.6649, + "step": 3022 + }, + { + "epoch": 0.8596361309738917, + "grad_norm": 1.4424810409545898, + "learning_rate": 5.688020683711578e-06, + "loss": 1.7273, + "step": 3023 + }, + { + "epoch": 0.8599204962173498, + "grad_norm": 1.5607413053512573, + "learning_rate": 5.676529732835393e-06, + "loss": 1.7178, + "step": 3024 + }, + { + "epoch": 0.860204861460808, + "grad_norm": 1.6604523658752441, + "learning_rate": 5.665038781959208e-06, + "loss": 2.5183, + "step": 3025 + }, + { + "epoch": 0.8604892267042661, + "grad_norm": 1.4753596782684326, + "learning_rate": 5.6535478310830225e-06, + "loss": 2.4315, + "step": 3026 + }, + { + "epoch": 0.8607735919477242, + "grad_norm": 1.5106394290924072, + "learning_rate": 5.642056880206838e-06, + "loss": 2.2461, + "step": 3027 + }, + { + "epoch": 0.8610579571911823, + "grad_norm": 1.4064358472824097, + "learning_rate": 5.630565929330652e-06, + "loss": 1.9892, + "step": 3028 + }, + { + "epoch": 0.8613423224346404, + "grad_norm": 1.5657269954681396, + "learning_rate": 5.619074978454468e-06, + "loss": 2.0241, + "step": 3029 + }, + { + "epoch": 0.8616266876780986, + "grad_norm": 1.3951325416564941, + "learning_rate": 5.607584027578282e-06, + "loss": 1.962, + "step": 3030 + }, + { + "epoch": 0.8619110529215567, + "grad_norm": 1.3956987857818604, + "learning_rate": 5.5960930767020975e-06, + "loss": 1.7142, + "step": 3031 + }, + { + "epoch": 0.8621954181650148, + "grad_norm": 1.8063596487045288, + "learning_rate": 5.584602125825913e-06, + "loss": 1.7746, + "step": 3032 + }, + { + "epoch": 0.8624797834084729, + "grad_norm": 1.7616422176361084, + "learning_rate": 5.573111174949728e-06, + "loss": 2.6338, + "step": 3033 + }, + { + "epoch": 0.8627641486519311, + "grad_norm": 1.576029896736145, + "learning_rate": 5.561620224073543e-06, + "loss": 2.487, + "step": 3034 + }, + { + "epoch": 0.8630485138953892, + "grad_norm": 1.469588279724121, + "learning_rate": 5.550129273197358e-06, + "loss": 2.3708, + "step": 3035 + }, + { + "epoch": 0.8633328791388473, + "grad_norm": 1.5429444313049316, + "learning_rate": 5.5386383223211726e-06, + "loss": 2.1626, + "step": 3036 + }, + { + "epoch": 0.8636172443823054, + "grad_norm": 1.5128809213638306, + "learning_rate": 5.527147371444988e-06, + "loss": 1.9956, + "step": 3037 + }, + { + "epoch": 0.8639016096257635, + "grad_norm": 1.458830714225769, + "learning_rate": 5.515656420568802e-06, + "loss": 1.9701, + "step": 3038 + }, + { + "epoch": 0.8641859748692217, + "grad_norm": 1.585025429725647, + "learning_rate": 5.504165469692617e-06, + "loss": 1.8178, + "step": 3039 + }, + { + "epoch": 0.8644703401126798, + "grad_norm": 1.6396241188049316, + "learning_rate": 5.492674518816432e-06, + "loss": 1.7052, + "step": 3040 + }, + { + "epoch": 0.8647547053561379, + "grad_norm": 1.7168043851852417, + "learning_rate": 5.481183567940247e-06, + "loss": 2.8315, + "step": 3041 + }, + { + "epoch": 0.865039070599596, + "grad_norm": 1.431821346282959, + "learning_rate": 5.469692617064063e-06, + "loss": 2.2904, + "step": 3042 + }, + { + "epoch": 0.865323435843054, + "grad_norm": 1.5637871026992798, + "learning_rate": 5.458201666187877e-06, + "loss": 2.2041, + "step": 3043 + }, + { + "epoch": 0.8656078010865123, + "grad_norm": 1.4506059885025024, + "learning_rate": 5.446710715311693e-06, + "loss": 2.2017, + "step": 3044 + }, + { + "epoch": 0.8658921663299703, + "grad_norm": 1.5410045385360718, + "learning_rate": 5.435219764435507e-06, + "loss": 1.9422, + "step": 3045 + }, + { + "epoch": 0.8661765315734284, + "grad_norm": 1.4460687637329102, + "learning_rate": 5.423728813559323e-06, + "loss": 1.7594, + "step": 3046 + }, + { + "epoch": 0.8664608968168865, + "grad_norm": 1.5550706386566162, + "learning_rate": 5.412237862683137e-06, + "loss": 1.7515, + "step": 3047 + }, + { + "epoch": 0.8667452620603446, + "grad_norm": 1.5752551555633545, + "learning_rate": 5.4007469118069524e-06, + "loss": 1.644, + "step": 3048 + }, + { + "epoch": 0.8670296273038028, + "grad_norm": 1.8624625205993652, + "learning_rate": 5.389255960930767e-06, + "loss": 2.817, + "step": 3049 + }, + { + "epoch": 0.8673139925472609, + "grad_norm": 1.416124939918518, + "learning_rate": 5.377765010054582e-06, + "loss": 2.1468, + "step": 3050 + }, + { + "epoch": 0.867598357790719, + "grad_norm": 1.4211061000823975, + "learning_rate": 5.366274059178398e-06, + "loss": 2.0979, + "step": 3051 + }, + { + "epoch": 0.8678827230341771, + "grad_norm": 1.4626717567443848, + "learning_rate": 5.354783108302213e-06, + "loss": 2.0164, + "step": 3052 + }, + { + "epoch": 0.8681670882776352, + "grad_norm": 1.486775279045105, + "learning_rate": 5.3432921574260275e-06, + "loss": 2.0579, + "step": 3053 + }, + { + "epoch": 0.8684514535210934, + "grad_norm": 1.5586199760437012, + "learning_rate": 5.331801206549843e-06, + "loss": 2.1097, + "step": 3054 + }, + { + "epoch": 0.8687358187645515, + "grad_norm": 1.5133928060531616, + "learning_rate": 5.320310255673657e-06, + "loss": 1.8172, + "step": 3055 + }, + { + "epoch": 0.8690201840080096, + "grad_norm": 1.5888675451278687, + "learning_rate": 5.308819304797473e-06, + "loss": 1.7316, + "step": 3056 + }, + { + "epoch": 0.8693045492514677, + "grad_norm": 1.7980897426605225, + "learning_rate": 5.297328353921287e-06, + "loss": 2.8514, + "step": 3057 + }, + { + "epoch": 0.8695889144949258, + "grad_norm": 1.4976806640625, + "learning_rate": 5.285837403045102e-06, + "loss": 2.5206, + "step": 3058 + }, + { + "epoch": 0.869873279738384, + "grad_norm": 1.5453336238861084, + "learning_rate": 5.274346452168917e-06, + "loss": 2.4075, + "step": 3059 + }, + { + "epoch": 0.8701576449818421, + "grad_norm": 1.3958678245544434, + "learning_rate": 5.262855501292733e-06, + "loss": 1.9729, + "step": 3060 + }, + { + "epoch": 0.8704420102253002, + "grad_norm": 1.5089367628097534, + "learning_rate": 5.251364550416548e-06, + "loss": 2.061, + "step": 3061 + }, + { + "epoch": 0.8707263754687583, + "grad_norm": 1.5372157096862793, + "learning_rate": 5.239873599540363e-06, + "loss": 1.8647, + "step": 3062 + }, + { + "epoch": 0.8710107407122164, + "grad_norm": 1.4671186208724976, + "learning_rate": 5.2283826486641775e-06, + "loss": 1.6414, + "step": 3063 + }, + { + "epoch": 0.8712951059556746, + "grad_norm": 1.5816073417663574, + "learning_rate": 5.216891697787992e-06, + "loss": 1.8357, + "step": 3064 + }, + { + "epoch": 0.8715794711991327, + "grad_norm": 1.703484296798706, + "learning_rate": 5.205400746911807e-06, + "loss": 2.8886, + "step": 3065 + }, + { + "epoch": 0.8718638364425908, + "grad_norm": 1.511811375617981, + "learning_rate": 5.193909796035622e-06, + "loss": 2.3373, + "step": 3066 + }, + { + "epoch": 0.8721482016860489, + "grad_norm": 1.5414661169052124, + "learning_rate": 5.182418845159437e-06, + "loss": 2.1505, + "step": 3067 + }, + { + "epoch": 0.872432566929507, + "grad_norm": 1.5230942964553833, + "learning_rate": 5.170927894283252e-06, + "loss": 2.1039, + "step": 3068 + }, + { + "epoch": 0.8727169321729652, + "grad_norm": 1.4703636169433594, + "learning_rate": 5.159436943407068e-06, + "loss": 2.0667, + "step": 3069 + }, + { + "epoch": 0.8730012974164233, + "grad_norm": 1.432626485824585, + "learning_rate": 5.147945992530882e-06, + "loss": 1.9027, + "step": 3070 + }, + { + "epoch": 0.8732856626598814, + "grad_norm": 1.5644500255584717, + "learning_rate": 5.136455041654698e-06, + "loss": 1.756, + "step": 3071 + }, + { + "epoch": 0.8735700279033395, + "grad_norm": 1.6351237297058105, + "learning_rate": 5.124964090778512e-06, + "loss": 1.7897, + "step": 3072 + }, + { + "epoch": 0.8738543931467976, + "grad_norm": 1.7098308801651, + "learning_rate": 5.113473139902328e-06, + "loss": 2.7434, + "step": 3073 + }, + { + "epoch": 0.8741387583902558, + "grad_norm": 1.4680440425872803, + "learning_rate": 5.101982189026142e-06, + "loss": 2.1812, + "step": 3074 + }, + { + "epoch": 0.8744231236337139, + "grad_norm": 1.4263336658477783, + "learning_rate": 5.0904912381499574e-06, + "loss": 2.2004, + "step": 3075 + }, + { + "epoch": 0.874707488877172, + "grad_norm": 1.4721750020980835, + "learning_rate": 5.079000287273772e-06, + "loss": 2.1114, + "step": 3076 + }, + { + "epoch": 0.8749918541206301, + "grad_norm": 1.6234413385391235, + "learning_rate": 5.067509336397587e-06, + "loss": 2.1707, + "step": 3077 + }, + { + "epoch": 0.8752762193640882, + "grad_norm": 1.5787984132766724, + "learning_rate": 5.056018385521402e-06, + "loss": 1.9247, + "step": 3078 + }, + { + "epoch": 0.8755605846075464, + "grad_norm": 1.4935133457183838, + "learning_rate": 5.044527434645218e-06, + "loss": 1.78, + "step": 3079 + }, + { + "epoch": 0.8758449498510045, + "grad_norm": 1.624784231185913, + "learning_rate": 5.0330364837690325e-06, + "loss": 1.7151, + "step": 3080 + }, + { + "epoch": 0.8761293150944626, + "grad_norm": 1.7499675750732422, + "learning_rate": 5.021545532892848e-06, + "loss": 2.8374, + "step": 3081 + }, + { + "epoch": 0.8764136803379207, + "grad_norm": 1.4725526571273804, + "learning_rate": 5.010054582016662e-06, + "loss": 2.3646, + "step": 3082 + }, + { + "epoch": 0.8766980455813789, + "grad_norm": 1.5129636526107788, + "learning_rate": 4.998563631140478e-06, + "loss": 2.2392, + "step": 3083 + }, + { + "epoch": 0.876982410824837, + "grad_norm": 1.4263098239898682, + "learning_rate": 4.987072680264292e-06, + "loss": 2.2247, + "step": 3084 + }, + { + "epoch": 0.8772667760682951, + "grad_norm": 1.5456427335739136, + "learning_rate": 4.975581729388107e-06, + "loss": 2.068, + "step": 3085 + }, + { + "epoch": 0.8775511413117532, + "grad_norm": 1.4494200944900513, + "learning_rate": 4.964090778511923e-06, + "loss": 1.893, + "step": 3086 + }, + { + "epoch": 0.8778355065552113, + "grad_norm": 1.5122965574264526, + "learning_rate": 4.952599827635737e-06, + "loss": 1.6924, + "step": 3087 + }, + { + "epoch": 0.8781198717986695, + "grad_norm": 1.593127965927124, + "learning_rate": 4.941108876759552e-06, + "loss": 1.7498, + "step": 3088 + }, + { + "epoch": 0.8784042370421276, + "grad_norm": 1.7108339071273804, + "learning_rate": 4.929617925883367e-06, + "loss": 2.7341, + "step": 3089 + }, + { + "epoch": 0.8786886022855857, + "grad_norm": 1.4968798160552979, + "learning_rate": 4.9181269750071825e-06, + "loss": 2.4878, + "step": 3090 + }, + { + "epoch": 0.8789729675290437, + "grad_norm": 1.514430284500122, + "learning_rate": 4.906636024130997e-06, + "loss": 2.2634, + "step": 3091 + }, + { + "epoch": 0.8792573327725018, + "grad_norm": 1.5105502605438232, + "learning_rate": 4.895145073254812e-06, + "loss": 2.2145, + "step": 3092 + }, + { + "epoch": 0.87954169801596, + "grad_norm": 1.5424399375915527, + "learning_rate": 4.883654122378627e-06, + "loss": 2.1025, + "step": 3093 + }, + { + "epoch": 0.8798260632594181, + "grad_norm": 1.4774196147918701, + "learning_rate": 4.872163171502442e-06, + "loss": 1.7497, + "step": 3094 + }, + { + "epoch": 0.8801104285028762, + "grad_norm": 1.5772838592529297, + "learning_rate": 4.8606722206262575e-06, + "loss": 1.6591, + "step": 3095 + }, + { + "epoch": 0.8803947937463343, + "grad_norm": 1.5816643238067627, + "learning_rate": 4.849181269750072e-06, + "loss": 1.6053, + "step": 3096 + }, + { + "epoch": 0.8806791589897924, + "grad_norm": 1.8602535724639893, + "learning_rate": 4.837690318873887e-06, + "loss": 2.8624, + "step": 3097 + }, + { + "epoch": 0.8809635242332506, + "grad_norm": 1.4142849445343018, + "learning_rate": 4.826199367997702e-06, + "loss": 2.1765, + "step": 3098 + }, + { + "epoch": 0.8812478894767087, + "grad_norm": 1.418143391609192, + "learning_rate": 4.814708417121517e-06, + "loss": 2.142, + "step": 3099 + }, + { + "epoch": 0.8815322547201668, + "grad_norm": 1.5086448192596436, + "learning_rate": 4.8032174662453326e-06, + "loss": 2.2362, + "step": 3100 + }, + { + "epoch": 0.8818166199636249, + "grad_norm": 1.5796101093292236, + "learning_rate": 4.791726515369147e-06, + "loss": 2.2314, + "step": 3101 + }, + { + "epoch": 0.882100985207083, + "grad_norm": 1.4776320457458496, + "learning_rate": 4.780235564492962e-06, + "loss": 2.0073, + "step": 3102 + }, + { + "epoch": 0.8823853504505412, + "grad_norm": 1.5670294761657715, + "learning_rate": 4.768744613616777e-06, + "loss": 1.7092, + "step": 3103 + }, + { + "epoch": 0.8826697156939993, + "grad_norm": 1.5772371292114258, + "learning_rate": 4.757253662740592e-06, + "loss": 1.6023, + "step": 3104 + }, + { + "epoch": 0.8829540809374574, + "grad_norm": 1.6653498411178589, + "learning_rate": 4.745762711864408e-06, + "loss": 2.7154, + "step": 3105 + }, + { + "epoch": 0.8832384461809155, + "grad_norm": 1.4277734756469727, + "learning_rate": 4.734271760988222e-06, + "loss": 2.4472, + "step": 3106 + }, + { + "epoch": 0.8835228114243736, + "grad_norm": 1.4376863241195679, + "learning_rate": 4.7227808101120374e-06, + "loss": 2.1116, + "step": 3107 + }, + { + "epoch": 0.8838071766678318, + "grad_norm": 1.4949778318405151, + "learning_rate": 4.711289859235852e-06, + "loss": 2.2258, + "step": 3108 + }, + { + "epoch": 0.8840915419112899, + "grad_norm": 1.6463311910629272, + "learning_rate": 4.699798908359667e-06, + "loss": 2.1172, + "step": 3109 + }, + { + "epoch": 0.884375907154748, + "grad_norm": 1.5631059408187866, + "learning_rate": 4.688307957483483e-06, + "loss": 1.8706, + "step": 3110 + }, + { + "epoch": 0.8846602723982061, + "grad_norm": 1.5554920434951782, + "learning_rate": 4.676817006607297e-06, + "loss": 1.8467, + "step": 3111 + }, + { + "epoch": 0.8849446376416642, + "grad_norm": 1.6312092542648315, + "learning_rate": 4.665326055731112e-06, + "loss": 1.6004, + "step": 3112 + }, + { + "epoch": 0.8852290028851224, + "grad_norm": 1.807760238647461, + "learning_rate": 4.653835104854928e-06, + "loss": 2.7044, + "step": 3113 + }, + { + "epoch": 0.8855133681285805, + "grad_norm": 1.6211071014404297, + "learning_rate": 4.642344153978742e-06, + "loss": 2.3121, + "step": 3114 + }, + { + "epoch": 0.8857977333720386, + "grad_norm": 1.6699950695037842, + "learning_rate": 4.630853203102557e-06, + "loss": 2.2832, + "step": 3115 + }, + { + "epoch": 0.8860820986154967, + "grad_norm": 1.495404839515686, + "learning_rate": 4.619362252226372e-06, + "loss": 2.0071, + "step": 3116 + }, + { + "epoch": 0.8863664638589548, + "grad_norm": 1.4680708646774292, + "learning_rate": 4.607871301350187e-06, + "loss": 2.1704, + "step": 3117 + }, + { + "epoch": 0.886650829102413, + "grad_norm": 1.5451794862747192, + "learning_rate": 4.596380350474002e-06, + "loss": 1.8087, + "step": 3118 + }, + { + "epoch": 0.8869351943458711, + "grad_norm": 1.470167636871338, + "learning_rate": 4.584889399597817e-06, + "loss": 1.7698, + "step": 3119 + }, + { + "epoch": 0.8872195595893292, + "grad_norm": 1.5515146255493164, + "learning_rate": 4.573398448721632e-06, + "loss": 1.6454, + "step": 3120 + }, + { + "epoch": 0.8875039248327873, + "grad_norm": 1.7271732091903687, + "learning_rate": 4.561907497845447e-06, + "loss": 2.6316, + "step": 3121 + }, + { + "epoch": 0.8877882900762454, + "grad_norm": 1.494756817817688, + "learning_rate": 4.550416546969262e-06, + "loss": 2.4271, + "step": 3122 + }, + { + "epoch": 0.8880726553197036, + "grad_norm": 1.4232012033462524, + "learning_rate": 4.538925596093077e-06, + "loss": 2.2557, + "step": 3123 + }, + { + "epoch": 0.8883570205631617, + "grad_norm": 1.4829121828079224, + "learning_rate": 4.527434645216892e-06, + "loss": 2.0524, + "step": 3124 + }, + { + "epoch": 0.8886413858066198, + "grad_norm": 1.5503113269805908, + "learning_rate": 4.515943694340707e-06, + "loss": 1.9053, + "step": 3125 + }, + { + "epoch": 0.8889257510500779, + "grad_norm": 1.4779075384140015, + "learning_rate": 4.504452743464522e-06, + "loss": 1.8903, + "step": 3126 + }, + { + "epoch": 0.889210116293536, + "grad_norm": 1.5646677017211914, + "learning_rate": 4.4929617925883376e-06, + "loss": 1.7303, + "step": 3127 + }, + { + "epoch": 0.8894944815369942, + "grad_norm": 1.709120273590088, + "learning_rate": 4.481470841712152e-06, + "loss": 1.7812, + "step": 3128 + }, + { + "epoch": 0.8897788467804523, + "grad_norm": 1.6658837795257568, + "learning_rate": 4.469979890835967e-06, + "loss": 2.8185, + "step": 3129 + }, + { + "epoch": 0.8900632120239104, + "grad_norm": 1.4445295333862305, + "learning_rate": 4.458488939959782e-06, + "loss": 2.2779, + "step": 3130 + }, + { + "epoch": 0.8903475772673685, + "grad_norm": 1.5250149965286255, + "learning_rate": 4.446997989083596e-06, + "loss": 2.2863, + "step": 3131 + }, + { + "epoch": 0.8906319425108267, + "grad_norm": 1.364835262298584, + "learning_rate": 4.435507038207413e-06, + "loss": 2.0326, + "step": 3132 + }, + { + "epoch": 0.8909163077542848, + "grad_norm": 1.4369089603424072, + "learning_rate": 4.424016087331227e-06, + "loss": 2.0469, + "step": 3133 + }, + { + "epoch": 0.8912006729977429, + "grad_norm": 1.4388717412948608, + "learning_rate": 4.4125251364550416e-06, + "loss": 1.7271, + "step": 3134 + }, + { + "epoch": 0.891485038241201, + "grad_norm": 1.51015305519104, + "learning_rate": 4.401034185578857e-06, + "loss": 1.7673, + "step": 3135 + }, + { + "epoch": 0.891769403484659, + "grad_norm": 1.6226184368133545, + "learning_rate": 4.389543234702671e-06, + "loss": 1.6646, + "step": 3136 + }, + { + "epoch": 0.8920537687281173, + "grad_norm": 1.579350233078003, + "learning_rate": 4.378052283826487e-06, + "loss": 2.6567, + "step": 3137 + }, + { + "epoch": 0.8923381339715754, + "grad_norm": 1.427051067352295, + "learning_rate": 4.366561332950302e-06, + "loss": 2.104, + "step": 3138 + }, + { + "epoch": 0.8926224992150334, + "grad_norm": 1.3372541666030884, + "learning_rate": 4.355070382074117e-06, + "loss": 2.2061, + "step": 3139 + }, + { + "epoch": 0.8929068644584915, + "grad_norm": 1.4767935276031494, + "learning_rate": 4.343579431197932e-06, + "loss": 2.2415, + "step": 3140 + }, + { + "epoch": 0.8931912297019496, + "grad_norm": 1.5126450061798096, + "learning_rate": 4.332088480321747e-06, + "loss": 2.0232, + "step": 3141 + }, + { + "epoch": 0.8934755949454078, + "grad_norm": 1.5773214101791382, + "learning_rate": 4.320597529445562e-06, + "loss": 2.0683, + "step": 3142 + }, + { + "epoch": 0.8937599601888659, + "grad_norm": 1.64472496509552, + "learning_rate": 4.309106578569377e-06, + "loss": 1.9053, + "step": 3143 + }, + { + "epoch": 0.894044325432324, + "grad_norm": 1.5076712369918823, + "learning_rate": 4.297615627693192e-06, + "loss": 1.7959, + "step": 3144 + }, + { + "epoch": 0.8943286906757821, + "grad_norm": 1.8122453689575195, + "learning_rate": 4.286124676817007e-06, + "loss": 2.8296, + "step": 3145 + }, + { + "epoch": 0.8946130559192402, + "grad_norm": 1.409722924232483, + "learning_rate": 4.274633725940822e-06, + "loss": 2.3145, + "step": 3146 + }, + { + "epoch": 0.8948974211626984, + "grad_norm": 1.43026602268219, + "learning_rate": 4.263142775064637e-06, + "loss": 2.2631, + "step": 3147 + }, + { + "epoch": 0.8951817864061565, + "grad_norm": 1.4173415899276733, + "learning_rate": 4.251651824188452e-06, + "loss": 2.1985, + "step": 3148 + }, + { + "epoch": 0.8954661516496146, + "grad_norm": 1.4719431400299072, + "learning_rate": 4.240160873312267e-06, + "loss": 1.9246, + "step": 3149 + }, + { + "epoch": 0.8957505168930727, + "grad_norm": 1.5066853761672974, + "learning_rate": 4.228669922436082e-06, + "loss": 1.8011, + "step": 3150 + }, + { + "epoch": 0.8960348821365308, + "grad_norm": 1.6725085973739624, + "learning_rate": 4.217178971559897e-06, + "loss": 1.608, + "step": 3151 + }, + { + "epoch": 0.896319247379989, + "grad_norm": 1.6424719095230103, + "learning_rate": 4.205688020683712e-06, + "loss": 1.7574, + "step": 3152 + }, + { + "epoch": 0.8966036126234471, + "grad_norm": 1.7039910554885864, + "learning_rate": 4.194197069807527e-06, + "loss": 2.7038, + "step": 3153 + }, + { + "epoch": 0.8968879778669052, + "grad_norm": 1.482321858406067, + "learning_rate": 4.182706118931342e-06, + "loss": 2.2022, + "step": 3154 + }, + { + "epoch": 0.8971723431103633, + "grad_norm": 1.4395989179611206, + "learning_rate": 4.171215168055157e-06, + "loss": 2.3795, + "step": 3155 + }, + { + "epoch": 0.8974567083538214, + "grad_norm": 1.381151795387268, + "learning_rate": 4.159724217178972e-06, + "loss": 2.2011, + "step": 3156 + }, + { + "epoch": 0.8977410735972796, + "grad_norm": 1.4382047653198242, + "learning_rate": 4.148233266302787e-06, + "loss": 1.9888, + "step": 3157 + }, + { + "epoch": 0.8980254388407377, + "grad_norm": 1.566701889038086, + "learning_rate": 4.136742315426601e-06, + "loss": 1.9158, + "step": 3158 + }, + { + "epoch": 0.8983098040841958, + "grad_norm": 1.452852725982666, + "learning_rate": 4.125251364550417e-06, + "loss": 1.7248, + "step": 3159 + }, + { + "epoch": 0.8985941693276539, + "grad_norm": 1.540461778640747, + "learning_rate": 4.113760413674232e-06, + "loss": 1.8487, + "step": 3160 + }, + { + "epoch": 0.898878534571112, + "grad_norm": 2.046682119369507, + "learning_rate": 4.1022694627980466e-06, + "loss": 2.6915, + "step": 3161 + }, + { + "epoch": 0.8991628998145702, + "grad_norm": 1.4611966609954834, + "learning_rate": 4.090778511921862e-06, + "loss": 2.324, + "step": 3162 + }, + { + "epoch": 0.8994472650580283, + "grad_norm": 1.5413190126419067, + "learning_rate": 4.079287561045676e-06, + "loss": 2.1069, + "step": 3163 + }, + { + "epoch": 0.8997316303014864, + "grad_norm": 1.4921023845672607, + "learning_rate": 4.067796610169492e-06, + "loss": 2.1568, + "step": 3164 + }, + { + "epoch": 0.9000159955449445, + "grad_norm": 1.4238756895065308, + "learning_rate": 4.056305659293307e-06, + "loss": 2.0992, + "step": 3165 + }, + { + "epoch": 0.9003003607884026, + "grad_norm": 1.2893463373184204, + "learning_rate": 4.044814708417122e-06, + "loss": 1.7327, + "step": 3166 + }, + { + "epoch": 0.9005847260318608, + "grad_norm": 1.4908276796340942, + "learning_rate": 4.033323757540937e-06, + "loss": 1.6974, + "step": 3167 + }, + { + "epoch": 0.9008690912753189, + "grad_norm": 1.6396807432174683, + "learning_rate": 4.0218328066647514e-06, + "loss": 1.8493, + "step": 3168 + }, + { + "epoch": 0.901153456518777, + "grad_norm": 1.881540298461914, + "learning_rate": 4.010341855788567e-06, + "loss": 2.7723, + "step": 3169 + }, + { + "epoch": 0.9014378217622351, + "grad_norm": 1.3765074014663696, + "learning_rate": 3.998850904912382e-06, + "loss": 2.4163, + "step": 3170 + }, + { + "epoch": 0.9017221870056932, + "grad_norm": 1.4607453346252441, + "learning_rate": 3.987359954036197e-06, + "loss": 2.1944, + "step": 3171 + }, + { + "epoch": 0.9020065522491514, + "grad_norm": 1.5134079456329346, + "learning_rate": 3.975869003160012e-06, + "loss": 2.1223, + "step": 3172 + }, + { + "epoch": 0.9022909174926095, + "grad_norm": 1.612130880355835, + "learning_rate": 3.9643780522838265e-06, + "loss": 2.0902, + "step": 3173 + }, + { + "epoch": 0.9025752827360676, + "grad_norm": 1.4546161890029907, + "learning_rate": 3.952887101407642e-06, + "loss": 1.6594, + "step": 3174 + }, + { + "epoch": 0.9028596479795257, + "grad_norm": 1.5486887693405151, + "learning_rate": 3.941396150531457e-06, + "loss": 1.7874, + "step": 3175 + }, + { + "epoch": 0.9031440132229838, + "grad_norm": 1.671090006828308, + "learning_rate": 3.929905199655272e-06, + "loss": 1.7745, + "step": 3176 + }, + { + "epoch": 0.903428378466442, + "grad_norm": 1.8405499458312988, + "learning_rate": 3.918414248779087e-06, + "loss": 2.6482, + "step": 3177 + }, + { + "epoch": 0.9037127437099001, + "grad_norm": 1.4907866716384888, + "learning_rate": 3.906923297902902e-06, + "loss": 2.3249, + "step": 3178 + }, + { + "epoch": 0.9039971089533582, + "grad_norm": 1.5165135860443115, + "learning_rate": 3.895432347026717e-06, + "loss": 2.4738, + "step": 3179 + }, + { + "epoch": 0.9042814741968163, + "grad_norm": 1.5014480352401733, + "learning_rate": 3.883941396150532e-06, + "loss": 2.1646, + "step": 3180 + }, + { + "epoch": 0.9045658394402745, + "grad_norm": 1.5247701406478882, + "learning_rate": 3.872450445274347e-06, + "loss": 1.9272, + "step": 3181 + }, + { + "epoch": 0.9048502046837326, + "grad_norm": 1.585174798965454, + "learning_rate": 3.860959494398161e-06, + "loss": 1.9112, + "step": 3182 + }, + { + "epoch": 0.9051345699271907, + "grad_norm": 1.5576118230819702, + "learning_rate": 3.849468543521977e-06, + "loss": 1.716, + "step": 3183 + }, + { + "epoch": 0.9054189351706488, + "grad_norm": 1.6155881881713867, + "learning_rate": 3.837977592645792e-06, + "loss": 1.7258, + "step": 3184 + }, + { + "epoch": 0.9057033004141068, + "grad_norm": 1.6325162649154663, + "learning_rate": 3.826486641769606e-06, + "loss": 2.4844, + "step": 3185 + }, + { + "epoch": 0.9059876656575651, + "grad_norm": 1.38741934299469, + "learning_rate": 3.814995690893422e-06, + "loss": 2.1399, + "step": 3186 + }, + { + "epoch": 0.9062720309010232, + "grad_norm": 1.4506083726882935, + "learning_rate": 3.8035047400172366e-06, + "loss": 2.076, + "step": 3187 + }, + { + "epoch": 0.9065563961444812, + "grad_norm": 1.423922061920166, + "learning_rate": 3.792013789141052e-06, + "loss": 2.0409, + "step": 3188 + }, + { + "epoch": 0.9068407613879393, + "grad_norm": 1.591650366783142, + "learning_rate": 3.780522838264867e-06, + "loss": 2.1596, + "step": 3189 + }, + { + "epoch": 0.9071251266313974, + "grad_norm": 1.5626674890518188, + "learning_rate": 3.769031887388682e-06, + "loss": 1.9831, + "step": 3190 + }, + { + "epoch": 0.9074094918748556, + "grad_norm": 1.5110690593719482, + "learning_rate": 3.7575409365124967e-06, + "loss": 1.9154, + "step": 3191 + }, + { + "epoch": 0.9076938571183137, + "grad_norm": 1.5417031049728394, + "learning_rate": 3.746049985636312e-06, + "loss": 1.704, + "step": 3192 + }, + { + "epoch": 0.9079782223617718, + "grad_norm": 1.6698137521743774, + "learning_rate": 3.734559034760127e-06, + "loss": 2.7173, + "step": 3193 + }, + { + "epoch": 0.9082625876052299, + "grad_norm": 1.58273446559906, + "learning_rate": 3.7230680838839415e-06, + "loss": 2.3419, + "step": 3194 + }, + { + "epoch": 0.908546952848688, + "grad_norm": 1.4195032119750977, + "learning_rate": 3.7115771330077564e-06, + "loss": 2.184, + "step": 3195 + }, + { + "epoch": 0.9088313180921462, + "grad_norm": 1.9811179637908936, + "learning_rate": 3.7000861821315713e-06, + "loss": 2.1592, + "step": 3196 + }, + { + "epoch": 0.9091156833356043, + "grad_norm": 1.8835865259170532, + "learning_rate": 3.6885952312553867e-06, + "loss": 2.1639, + "step": 3197 + }, + { + "epoch": 0.9094000485790624, + "grad_norm": 1.4895811080932617, + "learning_rate": 3.6771042803792016e-06, + "loss": 1.8653, + "step": 3198 + }, + { + "epoch": 0.9096844138225205, + "grad_norm": 1.5817967653274536, + "learning_rate": 3.6656133295030165e-06, + "loss": 1.8198, + "step": 3199 + }, + { + "epoch": 0.9099687790659786, + "grad_norm": 1.4373753070831299, + "learning_rate": 3.6541223786268314e-06, + "loss": 1.5907, + "step": 3200 + }, + { + "epoch": 0.9102531443094368, + "grad_norm": 1.6298754215240479, + "learning_rate": 3.6426314277506468e-06, + "loss": 2.7107, + "step": 3201 + }, + { + "epoch": 0.9105375095528949, + "grad_norm": 1.4616210460662842, + "learning_rate": 3.6311404768744617e-06, + "loss": 2.2934, + "step": 3202 + }, + { + "epoch": 0.910821874796353, + "grad_norm": 1.4932990074157715, + "learning_rate": 3.6196495259982766e-06, + "loss": 1.951, + "step": 3203 + }, + { + "epoch": 0.9111062400398111, + "grad_norm": 1.4640005826950073, + "learning_rate": 3.6081585751220915e-06, + "loss": 1.9661, + "step": 3204 + }, + { + "epoch": 0.9113906052832692, + "grad_norm": 1.583443284034729, + "learning_rate": 3.5966676242459065e-06, + "loss": 2.0104, + "step": 3205 + }, + { + "epoch": 0.9116749705267274, + "grad_norm": 1.4007854461669922, + "learning_rate": 3.585176673369722e-06, + "loss": 1.9239, + "step": 3206 + }, + { + "epoch": 0.9119593357701855, + "grad_norm": 1.590077519416809, + "learning_rate": 3.5736857224935367e-06, + "loss": 1.8776, + "step": 3207 + }, + { + "epoch": 0.9122437010136436, + "grad_norm": 1.5077543258666992, + "learning_rate": 3.5621947716173517e-06, + "loss": 1.6225, + "step": 3208 + }, + { + "epoch": 0.9125280662571017, + "grad_norm": 1.8528907299041748, + "learning_rate": 3.5507038207411666e-06, + "loss": 2.6378, + "step": 3209 + }, + { + "epoch": 0.9128124315005598, + "grad_norm": 1.5426160097122192, + "learning_rate": 3.5392128698649815e-06, + "loss": 2.4834, + "step": 3210 + }, + { + "epoch": 0.913096796744018, + "grad_norm": 1.3771498203277588, + "learning_rate": 3.527721918988797e-06, + "loss": 2.1898, + "step": 3211 + }, + { + "epoch": 0.9133811619874761, + "grad_norm": 1.4479544162750244, + "learning_rate": 3.5162309681126118e-06, + "loss": 2.2442, + "step": 3212 + }, + { + "epoch": 0.9136655272309342, + "grad_norm": 1.5544335842132568, + "learning_rate": 3.5047400172364267e-06, + "loss": 1.9227, + "step": 3213 + }, + { + "epoch": 0.9139498924743923, + "grad_norm": 1.4518970251083374, + "learning_rate": 3.4932490663602416e-06, + "loss": 1.8666, + "step": 3214 + }, + { + "epoch": 0.9142342577178504, + "grad_norm": 1.4834986925125122, + "learning_rate": 3.481758115484057e-06, + "loss": 1.5869, + "step": 3215 + }, + { + "epoch": 0.9145186229613086, + "grad_norm": 1.5631589889526367, + "learning_rate": 3.470267164607872e-06, + "loss": 1.6409, + "step": 3216 + }, + { + "epoch": 0.9148029882047667, + "grad_norm": 1.7660410404205322, + "learning_rate": 3.4587762137316868e-06, + "loss": 2.7967, + "step": 3217 + }, + { + "epoch": 0.9150873534482248, + "grad_norm": 1.4638726711273193, + "learning_rate": 3.4472852628555013e-06, + "loss": 2.276, + "step": 3218 + }, + { + "epoch": 0.9153717186916829, + "grad_norm": 1.5217621326446533, + "learning_rate": 3.435794311979316e-06, + "loss": 2.2588, + "step": 3219 + }, + { + "epoch": 0.915656083935141, + "grad_norm": 1.4932063817977905, + "learning_rate": 3.424303361103132e-06, + "loss": 2.138, + "step": 3220 + }, + { + "epoch": 0.9159404491785992, + "grad_norm": 1.5329244136810303, + "learning_rate": 3.4128124102269465e-06, + "loss": 1.9642, + "step": 3221 + }, + { + "epoch": 0.9162248144220573, + "grad_norm": 1.458665132522583, + "learning_rate": 3.4013214593507614e-06, + "loss": 1.961, + "step": 3222 + }, + { + "epoch": 0.9165091796655154, + "grad_norm": 1.5886058807373047, + "learning_rate": 3.3898305084745763e-06, + "loss": 1.9216, + "step": 3223 + }, + { + "epoch": 0.9167935449089735, + "grad_norm": 1.5155012607574463, + "learning_rate": 3.3783395575983912e-06, + "loss": 1.754, + "step": 3224 + }, + { + "epoch": 0.9170779101524317, + "grad_norm": 1.733697772026062, + "learning_rate": 3.3668486067222066e-06, + "loss": 2.5523, + "step": 3225 + }, + { + "epoch": 0.9173622753958898, + "grad_norm": 1.5041455030441284, + "learning_rate": 3.3553576558460215e-06, + "loss": 2.2183, + "step": 3226 + }, + { + "epoch": 0.9176466406393479, + "grad_norm": 1.443900465965271, + "learning_rate": 3.3438667049698364e-06, + "loss": 2.2949, + "step": 3227 + }, + { + "epoch": 0.917931005882806, + "grad_norm": 1.4563335180282593, + "learning_rate": 3.3323757540936513e-06, + "loss": 2.2165, + "step": 3228 + }, + { + "epoch": 0.9182153711262641, + "grad_norm": 1.5183465480804443, + "learning_rate": 3.3208848032174667e-06, + "loss": 2.1452, + "step": 3229 + }, + { + "epoch": 0.9184997363697223, + "grad_norm": 1.4686776399612427, + "learning_rate": 3.3093938523412816e-06, + "loss": 1.9584, + "step": 3230 + }, + { + "epoch": 0.9187841016131804, + "grad_norm": 1.4295648336410522, + "learning_rate": 3.2979029014650965e-06, + "loss": 1.7519, + "step": 3231 + }, + { + "epoch": 0.9190684668566385, + "grad_norm": 1.6376121044158936, + "learning_rate": 3.2864119505889114e-06, + "loss": 1.6658, + "step": 3232 + }, + { + "epoch": 0.9193528321000966, + "grad_norm": 1.892962098121643, + "learning_rate": 3.2749209997127264e-06, + "loss": 2.629, + "step": 3233 + }, + { + "epoch": 0.9196371973435546, + "grad_norm": 1.4390010833740234, + "learning_rate": 3.2634300488365417e-06, + "loss": 2.4514, + "step": 3234 + }, + { + "epoch": 0.9199215625870129, + "grad_norm": 1.4880425930023193, + "learning_rate": 3.2519390979603566e-06, + "loss": 2.1385, + "step": 3235 + }, + { + "epoch": 0.920205927830471, + "grad_norm": 1.5567518472671509, + "learning_rate": 3.2404481470841716e-06, + "loss": 2.1202, + "step": 3236 + }, + { + "epoch": 0.920490293073929, + "grad_norm": 1.4381585121154785, + "learning_rate": 3.2289571962079865e-06, + "loss": 1.9348, + "step": 3237 + }, + { + "epoch": 0.9207746583173871, + "grad_norm": 1.4756149053573608, + "learning_rate": 3.217466245331802e-06, + "loss": 1.8763, + "step": 3238 + }, + { + "epoch": 0.9210590235608452, + "grad_norm": 1.4677278995513916, + "learning_rate": 3.2059752944556167e-06, + "loss": 1.5907, + "step": 3239 + }, + { + "epoch": 0.9213433888043034, + "grad_norm": 1.5422618389129639, + "learning_rate": 3.1944843435794317e-06, + "loss": 1.6591, + "step": 3240 + }, + { + "epoch": 0.9216277540477615, + "grad_norm": 1.696669101715088, + "learning_rate": 3.182993392703246e-06, + "loss": 2.6402, + "step": 3241 + }, + { + "epoch": 0.9219121192912196, + "grad_norm": 1.6080213785171509, + "learning_rate": 3.171502441827061e-06, + "loss": 2.5319, + "step": 3242 + }, + { + "epoch": 0.9221964845346777, + "grad_norm": 1.4872413873672485, + "learning_rate": 3.160011490950877e-06, + "loss": 2.1528, + "step": 3243 + }, + { + "epoch": 0.9224808497781358, + "grad_norm": 1.3897939920425415, + "learning_rate": 3.1485205400746913e-06, + "loss": 2.2001, + "step": 3244 + }, + { + "epoch": 0.922765215021594, + "grad_norm": 1.5631881952285767, + "learning_rate": 3.1370295891985063e-06, + "loss": 2.1586, + "step": 3245 + }, + { + "epoch": 0.9230495802650521, + "grad_norm": 1.603376865386963, + "learning_rate": 3.125538638322321e-06, + "loss": 1.8584, + "step": 3246 + }, + { + "epoch": 0.9233339455085102, + "grad_norm": 1.4845857620239258, + "learning_rate": 3.114047687446136e-06, + "loss": 1.6781, + "step": 3247 + }, + { + "epoch": 0.9236183107519683, + "grad_norm": 1.5035042762756348, + "learning_rate": 3.1025567365699515e-06, + "loss": 1.5507, + "step": 3248 + }, + { + "epoch": 0.9239026759954264, + "grad_norm": 1.5952107906341553, + "learning_rate": 3.0910657856937664e-06, + "loss": 2.6295, + "step": 3249 + }, + { + "epoch": 0.9241870412388846, + "grad_norm": 1.4013127088546753, + "learning_rate": 3.0795748348175813e-06, + "loss": 2.59, + "step": 3250 + }, + { + "epoch": 0.9244714064823427, + "grad_norm": 1.4684206247329712, + "learning_rate": 3.0680838839413962e-06, + "loss": 2.1826, + "step": 3251 + }, + { + "epoch": 0.9247557717258008, + "grad_norm": 1.4950830936431885, + "learning_rate": 3.0565929330652116e-06, + "loss": 2.2631, + "step": 3252 + }, + { + "epoch": 0.9250401369692589, + "grad_norm": 1.56867253780365, + "learning_rate": 3.0451019821890265e-06, + "loss": 2.0641, + "step": 3253 + }, + { + "epoch": 0.925324502212717, + "grad_norm": 1.5613471269607544, + "learning_rate": 3.0336110313128414e-06, + "loss": 1.8448, + "step": 3254 + }, + { + "epoch": 0.9256088674561752, + "grad_norm": 1.413219690322876, + "learning_rate": 3.0221200804366563e-06, + "loss": 2.0414, + "step": 3255 + }, + { + "epoch": 0.9258932326996333, + "grad_norm": 1.5721766948699951, + "learning_rate": 3.0106291295604712e-06, + "loss": 1.4875, + "step": 3256 + }, + { + "epoch": 0.9261775979430914, + "grad_norm": 1.5911824703216553, + "learning_rate": 2.9991381786842866e-06, + "loss": 2.5628, + "step": 3257 + }, + { + "epoch": 0.9264619631865495, + "grad_norm": 1.427064299583435, + "learning_rate": 2.9876472278081015e-06, + "loss": 2.5189, + "step": 3258 + }, + { + "epoch": 0.9267463284300076, + "grad_norm": 1.47379732131958, + "learning_rate": 2.9761562769319164e-06, + "loss": 2.258, + "step": 3259 + }, + { + "epoch": 0.9270306936734658, + "grad_norm": 1.4227964878082275, + "learning_rate": 2.9646653260557313e-06, + "loss": 1.9227, + "step": 3260 + }, + { + "epoch": 0.9273150589169239, + "grad_norm": 1.488387107849121, + "learning_rate": 2.9531743751795463e-06, + "loss": 1.8924, + "step": 3261 + }, + { + "epoch": 0.927599424160382, + "grad_norm": 1.4256445169448853, + "learning_rate": 2.9416834243033616e-06, + "loss": 1.6688, + "step": 3262 + }, + { + "epoch": 0.9278837894038401, + "grad_norm": 1.5635275840759277, + "learning_rate": 2.9301924734271765e-06, + "loss": 1.7883, + "step": 3263 + }, + { + "epoch": 0.9281681546472982, + "grad_norm": 1.5392005443572998, + "learning_rate": 2.9187015225509915e-06, + "loss": 1.6098, + "step": 3264 + }, + { + "epoch": 0.9284525198907564, + "grad_norm": 1.5983314514160156, + "learning_rate": 2.907210571674806e-06, + "loss": 2.6295, + "step": 3265 + }, + { + "epoch": 0.9287368851342145, + "grad_norm": 1.55398690700531, + "learning_rate": 2.8957196207986217e-06, + "loss": 2.2437, + "step": 3266 + }, + { + "epoch": 0.9290212503776726, + "grad_norm": 1.477097511291504, + "learning_rate": 2.8842286699224366e-06, + "loss": 2.2931, + "step": 3267 + }, + { + "epoch": 0.9293056156211307, + "grad_norm": 1.5535800457000732, + "learning_rate": 2.872737719046251e-06, + "loss": 2.1373, + "step": 3268 + }, + { + "epoch": 0.9295899808645888, + "grad_norm": 1.570197582244873, + "learning_rate": 2.861246768170066e-06, + "loss": 2.0627, + "step": 3269 + }, + { + "epoch": 0.929874346108047, + "grad_norm": 1.4615015983581543, + "learning_rate": 2.849755817293881e-06, + "loss": 1.8171, + "step": 3270 + }, + { + "epoch": 0.9301587113515051, + "grad_norm": 1.552833080291748, + "learning_rate": 2.8382648664176963e-06, + "loss": 1.7411, + "step": 3271 + }, + { + "epoch": 0.9304430765949632, + "grad_norm": 1.5648410320281982, + "learning_rate": 2.8267739155415112e-06, + "loss": 1.6361, + "step": 3272 + }, + { + "epoch": 0.9307274418384213, + "grad_norm": 1.7800242900848389, + "learning_rate": 2.815282964665326e-06, + "loss": 2.5435, + "step": 3273 + }, + { + "epoch": 0.9310118070818795, + "grad_norm": 1.5224663019180298, + "learning_rate": 2.803792013789141e-06, + "loss": 2.364, + "step": 3274 + }, + { + "epoch": 0.9312961723253376, + "grad_norm": 1.4838275909423828, + "learning_rate": 2.7923010629129564e-06, + "loss": 2.1583, + "step": 3275 + }, + { + "epoch": 0.9315805375687957, + "grad_norm": 1.468105435371399, + "learning_rate": 2.7808101120367714e-06, + "loss": 2.3038, + "step": 3276 + }, + { + "epoch": 0.9318649028122538, + "grad_norm": 1.6450117826461792, + "learning_rate": 2.7693191611605863e-06, + "loss": 2.2681, + "step": 3277 + }, + { + "epoch": 0.9321492680557119, + "grad_norm": 1.4350481033325195, + "learning_rate": 2.757828210284401e-06, + "loss": 1.7551, + "step": 3278 + }, + { + "epoch": 0.9324336332991701, + "grad_norm": 1.4676655530929565, + "learning_rate": 2.746337259408216e-06, + "loss": 1.7063, + "step": 3279 + }, + { + "epoch": 0.9327179985426282, + "grad_norm": 1.562709927558899, + "learning_rate": 2.7348463085320315e-06, + "loss": 1.7359, + "step": 3280 + }, + { + "epoch": 0.9330023637860863, + "grad_norm": 1.739317536354065, + "learning_rate": 2.7233553576558464e-06, + "loss": 2.5635, + "step": 3281 + }, + { + "epoch": 0.9332867290295443, + "grad_norm": 1.480629801750183, + "learning_rate": 2.7118644067796613e-06, + "loss": 2.3922, + "step": 3282 + }, + { + "epoch": 0.9335710942730024, + "grad_norm": 1.4481021165847778, + "learning_rate": 2.7003734559034762e-06, + "loss": 2.2305, + "step": 3283 + }, + { + "epoch": 0.9338554595164607, + "grad_norm": 1.4051294326782227, + "learning_rate": 2.688882505027291e-06, + "loss": 1.8938, + "step": 3284 + }, + { + "epoch": 0.9341398247599187, + "grad_norm": 1.5220015048980713, + "learning_rate": 2.6773915541511065e-06, + "loss": 2.045, + "step": 3285 + }, + { + "epoch": 0.9344241900033768, + "grad_norm": 1.5136582851409912, + "learning_rate": 2.6659006032749214e-06, + "loss": 1.8875, + "step": 3286 + }, + { + "epoch": 0.9347085552468349, + "grad_norm": 1.5294973850250244, + "learning_rate": 2.6544096523987363e-06, + "loss": 1.8228, + "step": 3287 + }, + { + "epoch": 0.934992920490293, + "grad_norm": 1.4998856782913208, + "learning_rate": 2.642918701522551e-06, + "loss": 1.5929, + "step": 3288 + }, + { + "epoch": 0.9352772857337512, + "grad_norm": 1.751221776008606, + "learning_rate": 2.6314277506463666e-06, + "loss": 2.6353, + "step": 3289 + }, + { + "epoch": 0.9355616509772093, + "grad_norm": 1.5697683095932007, + "learning_rate": 2.6199367997701815e-06, + "loss": 2.68, + "step": 3290 + }, + { + "epoch": 0.9358460162206674, + "grad_norm": 1.5043424367904663, + "learning_rate": 2.608445848893996e-06, + "loss": 2.1733, + "step": 3291 + }, + { + "epoch": 0.9361303814641255, + "grad_norm": 1.4452601671218872, + "learning_rate": 2.596954898017811e-06, + "loss": 2.0397, + "step": 3292 + }, + { + "epoch": 0.9364147467075836, + "grad_norm": 1.478661298751831, + "learning_rate": 2.585463947141626e-06, + "loss": 1.8285, + "step": 3293 + }, + { + "epoch": 0.9366991119510418, + "grad_norm": 1.4804670810699463, + "learning_rate": 2.573972996265441e-06, + "loss": 1.9527, + "step": 3294 + }, + { + "epoch": 0.9369834771944999, + "grad_norm": 1.5798107385635376, + "learning_rate": 2.562482045389256e-06, + "loss": 1.8947, + "step": 3295 + }, + { + "epoch": 0.937267842437958, + "grad_norm": 1.7154254913330078, + "learning_rate": 2.550991094513071e-06, + "loss": 1.7322, + "step": 3296 + }, + { + "epoch": 0.9375522076814161, + "grad_norm": 1.6413829326629639, + "learning_rate": 2.539500143636886e-06, + "loss": 2.9358, + "step": 3297 + }, + { + "epoch": 0.9378365729248742, + "grad_norm": 1.5562938451766968, + "learning_rate": 2.528009192760701e-06, + "loss": 2.3359, + "step": 3298 + }, + { + "epoch": 0.9381209381683324, + "grad_norm": 1.3978956937789917, + "learning_rate": 2.5165182418845162e-06, + "loss": 2.2585, + "step": 3299 + }, + { + "epoch": 0.9384053034117905, + "grad_norm": 1.5323485136032104, + "learning_rate": 2.505027291008331e-06, + "loss": 2.1701, + "step": 3300 + }, + { + "epoch": 0.9386896686552486, + "grad_norm": 1.5247138738632202, + "learning_rate": 2.493536340132146e-06, + "loss": 2.2914, + "step": 3301 + }, + { + "epoch": 0.9389740338987067, + "grad_norm": 1.4950916767120361, + "learning_rate": 2.4820453892559614e-06, + "loss": 1.9327, + "step": 3302 + }, + { + "epoch": 0.9392583991421648, + "grad_norm": 1.5644844770431519, + "learning_rate": 2.470554438379776e-06, + "loss": 1.7878, + "step": 3303 + }, + { + "epoch": 0.939542764385623, + "grad_norm": 1.7486357688903809, + "learning_rate": 2.4590634875035913e-06, + "loss": 1.5255, + "step": 3304 + }, + { + "epoch": 0.9398271296290811, + "grad_norm": 1.6616047620773315, + "learning_rate": 2.447572536627406e-06, + "loss": 2.4573, + "step": 3305 + }, + { + "epoch": 0.9401114948725392, + "grad_norm": 1.5083410739898682, + "learning_rate": 2.436081585751221e-06, + "loss": 2.2363, + "step": 3306 + }, + { + "epoch": 0.9403958601159973, + "grad_norm": 1.4322084188461304, + "learning_rate": 2.424590634875036e-06, + "loss": 2.1424, + "step": 3307 + }, + { + "epoch": 0.9406802253594554, + "grad_norm": 1.466874122619629, + "learning_rate": 2.413099683998851e-06, + "loss": 2.1491, + "step": 3308 + }, + { + "epoch": 0.9409645906029136, + "grad_norm": 1.532874584197998, + "learning_rate": 2.4016087331226663e-06, + "loss": 1.9099, + "step": 3309 + }, + { + "epoch": 0.9412489558463717, + "grad_norm": 1.3527659177780151, + "learning_rate": 2.390117782246481e-06, + "loss": 1.8746, + "step": 3310 + }, + { + "epoch": 0.9415333210898298, + "grad_norm": 1.550384521484375, + "learning_rate": 2.378626831370296e-06, + "loss": 1.8256, + "step": 3311 + }, + { + "epoch": 0.9418176863332879, + "grad_norm": 1.514704704284668, + "learning_rate": 2.367135880494111e-06, + "loss": 1.5771, + "step": 3312 + }, + { + "epoch": 0.942102051576746, + "grad_norm": 1.789668083190918, + "learning_rate": 2.355644929617926e-06, + "loss": 2.8358, + "step": 3313 + }, + { + "epoch": 0.9423864168202042, + "grad_norm": 1.4652148485183716, + "learning_rate": 2.3441539787417413e-06, + "loss": 2.312, + "step": 3314 + }, + { + "epoch": 0.9426707820636623, + "grad_norm": 1.5131518840789795, + "learning_rate": 2.332663027865556e-06, + "loss": 2.0083, + "step": 3315 + }, + { + "epoch": 0.9429551473071204, + "grad_norm": 1.439961314201355, + "learning_rate": 2.321172076989371e-06, + "loss": 2.076, + "step": 3316 + }, + { + "epoch": 0.9432395125505785, + "grad_norm": 1.473046898841858, + "learning_rate": 2.309681126113186e-06, + "loss": 1.969, + "step": 3317 + }, + { + "epoch": 0.9435238777940366, + "grad_norm": 1.4626710414886475, + "learning_rate": 2.298190175237001e-06, + "loss": 1.7359, + "step": 3318 + }, + { + "epoch": 0.9438082430374948, + "grad_norm": 1.4370124340057373, + "learning_rate": 2.286699224360816e-06, + "loss": 1.7492, + "step": 3319 + }, + { + "epoch": 0.9440926082809529, + "grad_norm": 1.5686242580413818, + "learning_rate": 2.275208273484631e-06, + "loss": 1.683, + "step": 3320 + }, + { + "epoch": 0.944376973524411, + "grad_norm": 1.6691089868545532, + "learning_rate": 2.263717322608446e-06, + "loss": 2.6069, + "step": 3321 + }, + { + "epoch": 0.9446613387678691, + "grad_norm": 1.55553138256073, + "learning_rate": 2.252226371732261e-06, + "loss": 2.3096, + "step": 3322 + }, + { + "epoch": 0.9449457040113273, + "grad_norm": 1.4959852695465088, + "learning_rate": 2.240735420856076e-06, + "loss": 2.4402, + "step": 3323 + }, + { + "epoch": 0.9452300692547854, + "grad_norm": 1.4532426595687866, + "learning_rate": 2.229244469979891e-06, + "loss": 2.0286, + "step": 3324 + }, + { + "epoch": 0.9455144344982435, + "grad_norm": 1.565828561782837, + "learning_rate": 2.2177535191037063e-06, + "loss": 2.217, + "step": 3325 + }, + { + "epoch": 0.9457987997417016, + "grad_norm": 1.489469289779663, + "learning_rate": 2.2062625682275208e-06, + "loss": 1.8138, + "step": 3326 + }, + { + "epoch": 0.9460831649851597, + "grad_norm": 1.5402168035507202, + "learning_rate": 2.1947716173513357e-06, + "loss": 1.8234, + "step": 3327 + }, + { + "epoch": 0.9463675302286179, + "grad_norm": 1.4881329536437988, + "learning_rate": 2.183280666475151e-06, + "loss": 1.4388, + "step": 3328 + }, + { + "epoch": 0.946651895472076, + "grad_norm": 1.8732259273529053, + "learning_rate": 2.171789715598966e-06, + "loss": 2.7893, + "step": 3329 + }, + { + "epoch": 0.946936260715534, + "grad_norm": 1.452468752861023, + "learning_rate": 2.160298764722781e-06, + "loss": 2.2695, + "step": 3330 + }, + { + "epoch": 0.9472206259589921, + "grad_norm": 1.4889235496520996, + "learning_rate": 2.148807813846596e-06, + "loss": 2.2459, + "step": 3331 + }, + { + "epoch": 0.9475049912024502, + "grad_norm": 1.4310044050216675, + "learning_rate": 2.137316862970411e-06, + "loss": 2.1622, + "step": 3332 + }, + { + "epoch": 0.9477893564459084, + "grad_norm": 1.490521788597107, + "learning_rate": 2.125825912094226e-06, + "loss": 1.9426, + "step": 3333 + }, + { + "epoch": 0.9480737216893665, + "grad_norm": 1.4765541553497314, + "learning_rate": 2.114334961218041e-06, + "loss": 1.8035, + "step": 3334 + }, + { + "epoch": 0.9483580869328246, + "grad_norm": 1.5888854265213013, + "learning_rate": 2.102844010341856e-06, + "loss": 1.8977, + "step": 3335 + }, + { + "epoch": 0.9486424521762827, + "grad_norm": 1.5300711393356323, + "learning_rate": 2.091353059465671e-06, + "loss": 1.8379, + "step": 3336 + }, + { + "epoch": 0.9489268174197408, + "grad_norm": 1.5477615594863892, + "learning_rate": 2.079862108589486e-06, + "loss": 2.6793, + "step": 3337 + }, + { + "epoch": 0.949211182663199, + "grad_norm": 1.5292421579360962, + "learning_rate": 2.0683711577133007e-06, + "loss": 2.3593, + "step": 3338 + }, + { + "epoch": 0.9494955479066571, + "grad_norm": 1.4530328512191772, + "learning_rate": 2.056880206837116e-06, + "loss": 2.1165, + "step": 3339 + }, + { + "epoch": 0.9497799131501152, + "grad_norm": 1.475620150566101, + "learning_rate": 2.045389255960931e-06, + "loss": 2.1348, + "step": 3340 + }, + { + "epoch": 0.9500642783935733, + "grad_norm": 1.4726742506027222, + "learning_rate": 2.033898305084746e-06, + "loss": 2.1875, + "step": 3341 + }, + { + "epoch": 0.9503486436370314, + "grad_norm": 1.4152458906173706, + "learning_rate": 2.022407354208561e-06, + "loss": 1.9168, + "step": 3342 + }, + { + "epoch": 0.9506330088804896, + "grad_norm": 1.4417401552200317, + "learning_rate": 2.0109164033323757e-06, + "loss": 1.778, + "step": 3343 + }, + { + "epoch": 0.9509173741239477, + "grad_norm": 1.5183541774749756, + "learning_rate": 1.999425452456191e-06, + "loss": 1.6503, + "step": 3344 + }, + { + "epoch": 0.9512017393674058, + "grad_norm": 1.5476274490356445, + "learning_rate": 1.987934501580006e-06, + "loss": 2.781, + "step": 3345 + }, + { + "epoch": 0.9514861046108639, + "grad_norm": 1.5080674886703491, + "learning_rate": 1.976443550703821e-06, + "loss": 2.0026, + "step": 3346 + }, + { + "epoch": 0.951770469854322, + "grad_norm": 1.437429666519165, + "learning_rate": 1.964952599827636e-06, + "loss": 2.3271, + "step": 3347 + }, + { + "epoch": 0.9520548350977802, + "grad_norm": 1.526536226272583, + "learning_rate": 1.953461648951451e-06, + "loss": 2.0797, + "step": 3348 + }, + { + "epoch": 0.9523392003412383, + "grad_norm": 1.5311537981033325, + "learning_rate": 1.941970698075266e-06, + "loss": 1.9877, + "step": 3349 + }, + { + "epoch": 0.9526235655846964, + "grad_norm": 1.4898496866226196, + "learning_rate": 1.9304797471990806e-06, + "loss": 1.8399, + "step": 3350 + }, + { + "epoch": 0.9529079308281545, + "grad_norm": 1.5804930925369263, + "learning_rate": 1.918988796322896e-06, + "loss": 1.8401, + "step": 3351 + }, + { + "epoch": 0.9531922960716126, + "grad_norm": 1.619555950164795, + "learning_rate": 1.907497845446711e-06, + "loss": 1.7878, + "step": 3352 + }, + { + "epoch": 0.9534766613150708, + "grad_norm": 1.7056188583374023, + "learning_rate": 1.896006894570526e-06, + "loss": 2.5584, + "step": 3353 + }, + { + "epoch": 0.9537610265585289, + "grad_norm": 1.5180672407150269, + "learning_rate": 1.884515943694341e-06, + "loss": 2.3481, + "step": 3354 + }, + { + "epoch": 0.954045391801987, + "grad_norm": 1.4261457920074463, + "learning_rate": 1.873024992818156e-06, + "loss": 2.0368, + "step": 3355 + }, + { + "epoch": 0.9543297570454451, + "grad_norm": 1.4864054918289185, + "learning_rate": 1.8615340419419707e-06, + "loss": 2.1562, + "step": 3356 + }, + { + "epoch": 0.9546141222889032, + "grad_norm": 1.6229420900344849, + "learning_rate": 1.8500430910657857e-06, + "loss": 1.8096, + "step": 3357 + }, + { + "epoch": 0.9548984875323614, + "grad_norm": 1.468389630317688, + "learning_rate": 1.8385521401896008e-06, + "loss": 1.7469, + "step": 3358 + }, + { + "epoch": 0.9551828527758195, + "grad_norm": 2.151934862136841, + "learning_rate": 1.8270611893134157e-06, + "loss": 1.9153, + "step": 3359 + }, + { + "epoch": 0.9554672180192776, + "grad_norm": 1.6833430528640747, + "learning_rate": 1.8155702384372309e-06, + "loss": 1.7895, + "step": 3360 + }, + { + "epoch": 0.9557515832627357, + "grad_norm": 1.6929632425308228, + "learning_rate": 1.8040792875610458e-06, + "loss": 2.6418, + "step": 3361 + }, + { + "epoch": 0.9560359485061938, + "grad_norm": 1.4506680965423584, + "learning_rate": 1.792588336684861e-06, + "loss": 2.3191, + "step": 3362 + }, + { + "epoch": 0.956320313749652, + "grad_norm": 1.452488899230957, + "learning_rate": 1.7810973858086758e-06, + "loss": 2.2384, + "step": 3363 + }, + { + "epoch": 0.9566046789931101, + "grad_norm": 1.4387120008468628, + "learning_rate": 1.7696064349324907e-06, + "loss": 1.9809, + "step": 3364 + }, + { + "epoch": 0.9568890442365682, + "grad_norm": 1.4808820486068726, + "learning_rate": 1.7581154840563059e-06, + "loss": 1.9542, + "step": 3365 + }, + { + "epoch": 0.9571734094800263, + "grad_norm": 1.5310330390930176, + "learning_rate": 1.7466245331801208e-06, + "loss": 1.9475, + "step": 3366 + }, + { + "epoch": 0.9574577747234844, + "grad_norm": 1.628447413444519, + "learning_rate": 1.735133582303936e-06, + "loss": 1.8739, + "step": 3367 + }, + { + "epoch": 0.9577421399669426, + "grad_norm": 1.5938904285430908, + "learning_rate": 1.7236426314277506e-06, + "loss": 1.7342, + "step": 3368 + }, + { + "epoch": 0.9580265052104007, + "grad_norm": 1.7595031261444092, + "learning_rate": 1.712151680551566e-06, + "loss": 2.6314, + "step": 3369 + }, + { + "epoch": 0.9583108704538588, + "grad_norm": 1.4485570192337036, + "learning_rate": 1.7006607296753807e-06, + "loss": 2.3092, + "step": 3370 + }, + { + "epoch": 0.9585952356973169, + "grad_norm": 1.341435432434082, + "learning_rate": 1.6891697787991956e-06, + "loss": 2.1334, + "step": 3371 + }, + { + "epoch": 0.9588796009407751, + "grad_norm": 1.397361159324646, + "learning_rate": 1.6776788279230107e-06, + "loss": 2.1447, + "step": 3372 + }, + { + "epoch": 0.9591639661842332, + "grad_norm": 1.4863638877868652, + "learning_rate": 1.6661878770468257e-06, + "loss": 2.0255, + "step": 3373 + }, + { + "epoch": 0.9594483314276913, + "grad_norm": 1.468654751777649, + "learning_rate": 1.6546969261706408e-06, + "loss": 1.7851, + "step": 3374 + }, + { + "epoch": 0.9597326966711494, + "grad_norm": 1.470919132232666, + "learning_rate": 1.6432059752944557e-06, + "loss": 1.7603, + "step": 3375 + }, + { + "epoch": 0.9600170619146075, + "grad_norm": 1.625359296798706, + "learning_rate": 1.6317150244182709e-06, + "loss": 1.7318, + "step": 3376 + }, + { + "epoch": 0.9603014271580657, + "grad_norm": 1.6317858695983887, + "learning_rate": 1.6202240735420858e-06, + "loss": 2.6952, + "step": 3377 + }, + { + "epoch": 0.9605857924015238, + "grad_norm": 1.432524561882019, + "learning_rate": 1.608733122665901e-06, + "loss": 2.3517, + "step": 3378 + }, + { + "epoch": 0.9608701576449818, + "grad_norm": 1.5162678956985474, + "learning_rate": 1.5972421717897158e-06, + "loss": 2.374, + "step": 3379 + }, + { + "epoch": 0.9611545228884399, + "grad_norm": 1.4993191957473755, + "learning_rate": 1.5857512209135305e-06, + "loss": 2.2357, + "step": 3380 + }, + { + "epoch": 0.961438888131898, + "grad_norm": 1.4974647760391235, + "learning_rate": 1.5742602700373457e-06, + "loss": 1.9876, + "step": 3381 + }, + { + "epoch": 0.9617232533753562, + "grad_norm": 1.4593394994735718, + "learning_rate": 1.5627693191611606e-06, + "loss": 1.7961, + "step": 3382 + }, + { + "epoch": 0.9620076186188143, + "grad_norm": 1.453155517578125, + "learning_rate": 1.5512783682849757e-06, + "loss": 1.6868, + "step": 3383 + }, + { + "epoch": 0.9622919838622724, + "grad_norm": 1.6229658126831055, + "learning_rate": 1.5397874174087906e-06, + "loss": 1.8026, + "step": 3384 + }, + { + "epoch": 0.9625763491057305, + "grad_norm": 1.7281709909439087, + "learning_rate": 1.5282964665326058e-06, + "loss": 2.781, + "step": 3385 + }, + { + "epoch": 0.9628607143491886, + "grad_norm": 1.487370491027832, + "learning_rate": 1.5168055156564207e-06, + "loss": 2.3773, + "step": 3386 + }, + { + "epoch": 0.9631450795926468, + "grad_norm": 1.5010656118392944, + "learning_rate": 1.5053145647802356e-06, + "loss": 2.1523, + "step": 3387 + }, + { + "epoch": 0.9634294448361049, + "grad_norm": 1.5002316236495972, + "learning_rate": 1.4938236139040508e-06, + "loss": 2.0299, + "step": 3388 + }, + { + "epoch": 0.963713810079563, + "grad_norm": 1.5855457782745361, + "learning_rate": 1.4823326630278657e-06, + "loss": 2.1472, + "step": 3389 + }, + { + "epoch": 0.9639981753230211, + "grad_norm": 1.4458882808685303, + "learning_rate": 1.4708417121516808e-06, + "loss": 1.7629, + "step": 3390 + }, + { + "epoch": 0.9642825405664792, + "grad_norm": 1.5790014266967773, + "learning_rate": 1.4593507612754957e-06, + "loss": 1.6007, + "step": 3391 + }, + { + "epoch": 0.9645669058099374, + "grad_norm": 1.5922648906707764, + "learning_rate": 1.4478598103993109e-06, + "loss": 1.9121, + "step": 3392 + }, + { + "epoch": 0.9648512710533955, + "grad_norm": 1.733428955078125, + "learning_rate": 1.4363688595231256e-06, + "loss": 2.5225, + "step": 3393 + }, + { + "epoch": 0.9651356362968536, + "grad_norm": 1.4528639316558838, + "learning_rate": 1.4248779086469405e-06, + "loss": 2.1728, + "step": 3394 + }, + { + "epoch": 0.9654200015403117, + "grad_norm": 1.411819338798523, + "learning_rate": 1.4133869577707556e-06, + "loss": 2.1298, + "step": 3395 + }, + { + "epoch": 0.9657043667837698, + "grad_norm": 1.5605361461639404, + "learning_rate": 1.4018960068945705e-06, + "loss": 2.3136, + "step": 3396 + }, + { + "epoch": 0.965988732027228, + "grad_norm": 1.5014349222183228, + "learning_rate": 1.3904050560183857e-06, + "loss": 2.0917, + "step": 3397 + }, + { + "epoch": 0.9662730972706861, + "grad_norm": 1.4370369911193848, + "learning_rate": 1.3789141051422006e-06, + "loss": 1.8056, + "step": 3398 + }, + { + "epoch": 0.9665574625141442, + "grad_norm": 1.5899243354797363, + "learning_rate": 1.3674231542660157e-06, + "loss": 1.5606, + "step": 3399 + }, + { + "epoch": 0.9668418277576023, + "grad_norm": 1.4517974853515625, + "learning_rate": 1.3559322033898307e-06, + "loss": 1.5532, + "step": 3400 + }, + { + "epoch": 0.9671261930010604, + "grad_norm": 1.7034136056900024, + "learning_rate": 1.3444412525136456e-06, + "loss": 2.5908, + "step": 3401 + }, + { + "epoch": 0.9674105582445186, + "grad_norm": 1.504889726638794, + "learning_rate": 1.3329503016374607e-06, + "loss": 2.4146, + "step": 3402 + }, + { + "epoch": 0.9676949234879767, + "grad_norm": 1.5229851007461548, + "learning_rate": 1.3214593507612754e-06, + "loss": 2.2832, + "step": 3403 + }, + { + "epoch": 0.9679792887314348, + "grad_norm": 1.3611642122268677, + "learning_rate": 1.3099683998850908e-06, + "loss": 2.2432, + "step": 3404 + }, + { + "epoch": 0.9682636539748929, + "grad_norm": 1.4854189157485962, + "learning_rate": 1.2984774490089055e-06, + "loss": 2.1156, + "step": 3405 + }, + { + "epoch": 0.968548019218351, + "grad_norm": 1.5415984392166138, + "learning_rate": 1.2869864981327206e-06, + "loss": 1.859, + "step": 3406 + }, + { + "epoch": 0.9688323844618092, + "grad_norm": 1.4857087135314941, + "learning_rate": 1.2754955472565355e-06, + "loss": 1.6399, + "step": 3407 + }, + { + "epoch": 0.9691167497052673, + "grad_norm": 1.531732201576233, + "learning_rate": 1.2640045963803504e-06, + "loss": 1.6132, + "step": 3408 + }, + { + "epoch": 0.9694011149487254, + "grad_norm": 1.7313965559005737, + "learning_rate": 1.2525136455041656e-06, + "loss": 2.5503, + "step": 3409 + }, + { + "epoch": 0.9696854801921835, + "grad_norm": 1.430211067199707, + "learning_rate": 1.2410226946279807e-06, + "loss": 2.2446, + "step": 3410 + }, + { + "epoch": 0.9699698454356416, + "grad_norm": 1.432405948638916, + "learning_rate": 1.2295317437517956e-06, + "loss": 2.2009, + "step": 3411 + }, + { + "epoch": 0.9702542106790998, + "grad_norm": 1.5122909545898438, + "learning_rate": 1.2180407928756105e-06, + "loss": 2.2101, + "step": 3412 + }, + { + "epoch": 0.9705385759225579, + "grad_norm": 1.592177391052246, + "learning_rate": 1.2065498419994255e-06, + "loss": 2.1943, + "step": 3413 + }, + { + "epoch": 0.970822941166016, + "grad_norm": 1.4828754663467407, + "learning_rate": 1.1950588911232406e-06, + "loss": 2.0317, + "step": 3414 + }, + { + "epoch": 0.9711073064094741, + "grad_norm": 1.4396264553070068, + "learning_rate": 1.1835679402470555e-06, + "loss": 1.7245, + "step": 3415 + }, + { + "epoch": 0.9713916716529322, + "grad_norm": 1.5686695575714111, + "learning_rate": 1.1720769893708707e-06, + "loss": 1.8843, + "step": 3416 + }, + { + "epoch": 0.9716760368963904, + "grad_norm": 1.7028121948242188, + "learning_rate": 1.1605860384946856e-06, + "loss": 2.6639, + "step": 3417 + }, + { + "epoch": 0.9719604021398485, + "grad_norm": 1.5416810512542725, + "learning_rate": 1.1490950876185005e-06, + "loss": 2.38, + "step": 3418 + }, + { + "epoch": 0.9722447673833066, + "grad_norm": 1.492241621017456, + "learning_rate": 1.1376041367423154e-06, + "loss": 2.2302, + "step": 3419 + }, + { + "epoch": 0.9725291326267647, + "grad_norm": 1.4459635019302368, + "learning_rate": 1.1261131858661306e-06, + "loss": 2.0947, + "step": 3420 + }, + { + "epoch": 0.9728134978702229, + "grad_norm": 1.649179458618164, + "learning_rate": 1.1146222349899455e-06, + "loss": 2.1094, + "step": 3421 + }, + { + "epoch": 0.973097863113681, + "grad_norm": 1.3947135210037231, + "learning_rate": 1.1031312841137604e-06, + "loss": 1.8, + "step": 3422 + }, + { + "epoch": 0.9733822283571391, + "grad_norm": 1.423638105392456, + "learning_rate": 1.0916403332375755e-06, + "loss": 1.7245, + "step": 3423 + }, + { + "epoch": 0.9736665936005972, + "grad_norm": 1.5806896686553955, + "learning_rate": 1.0801493823613904e-06, + "loss": 1.8189, + "step": 3424 + }, + { + "epoch": 0.9739509588440552, + "grad_norm": 1.8673456907272339, + "learning_rate": 1.0686584314852056e-06, + "loss": 2.8048, + "step": 3425 + }, + { + "epoch": 0.9742353240875135, + "grad_norm": 1.4717233180999756, + "learning_rate": 1.0571674806090205e-06, + "loss": 2.0696, + "step": 3426 + }, + { + "epoch": 0.9745196893309716, + "grad_norm": 1.3976044654846191, + "learning_rate": 1.0456765297328354e-06, + "loss": 2.1881, + "step": 3427 + }, + { + "epoch": 0.9748040545744296, + "grad_norm": 1.4841516017913818, + "learning_rate": 1.0341855788566503e-06, + "loss": 2.1457, + "step": 3428 + }, + { + "epoch": 0.9750884198178877, + "grad_norm": 1.4041268825531006, + "learning_rate": 1.0226946279804655e-06, + "loss": 1.9074, + "step": 3429 + }, + { + "epoch": 0.9753727850613458, + "grad_norm": 1.5698847770690918, + "learning_rate": 1.0112036771042804e-06, + "loss": 1.8711, + "step": 3430 + }, + { + "epoch": 0.975657150304804, + "grad_norm": 1.568758249282837, + "learning_rate": 9.997127262280955e-07, + "loss": 1.7552, + "step": 3431 + }, + { + "epoch": 0.9759415155482621, + "grad_norm": 1.540923833847046, + "learning_rate": 9.882217753519104e-07, + "loss": 1.724, + "step": 3432 + }, + { + "epoch": 0.9762258807917202, + "grad_norm": 1.6241703033447266, + "learning_rate": 9.767308244757256e-07, + "loss": 2.583, + "step": 3433 + }, + { + "epoch": 0.9765102460351783, + "grad_norm": 1.4737218618392944, + "learning_rate": 9.652398735995403e-07, + "loss": 2.2108, + "step": 3434 + }, + { + "epoch": 0.9767946112786364, + "grad_norm": 1.477249026298523, + "learning_rate": 9.537489227233554e-07, + "loss": 2.4118, + "step": 3435 + }, + { + "epoch": 0.9770789765220946, + "grad_norm": 1.4675695896148682, + "learning_rate": 9.422579718471705e-07, + "loss": 2.2902, + "step": 3436 + }, + { + "epoch": 0.9773633417655527, + "grad_norm": 1.5511778593063354, + "learning_rate": 9.307670209709854e-07, + "loss": 2.0277, + "step": 3437 + }, + { + "epoch": 0.9776477070090108, + "grad_norm": 1.3725906610488892, + "learning_rate": 9.192760700948004e-07, + "loss": 1.8596, + "step": 3438 + }, + { + "epoch": 0.9779320722524689, + "grad_norm": 1.5125075578689575, + "learning_rate": 9.077851192186154e-07, + "loss": 1.551, + "step": 3439 + }, + { + "epoch": 0.978216437495927, + "grad_norm": 1.536952257156372, + "learning_rate": 8.962941683424305e-07, + "loss": 1.6953, + "step": 3440 + }, + { + "epoch": 0.9785008027393852, + "grad_norm": 1.8010717630386353, + "learning_rate": 8.848032174662454e-07, + "loss": 2.8914, + "step": 3441 + }, + { + "epoch": 0.9787851679828433, + "grad_norm": 1.4743620157241821, + "learning_rate": 8.733122665900604e-07, + "loss": 2.3008, + "step": 3442 + }, + { + "epoch": 0.9790695332263014, + "grad_norm": 1.5010536909103394, + "learning_rate": 8.618213157138753e-07, + "loss": 1.9155, + "step": 3443 + }, + { + "epoch": 0.9793538984697595, + "grad_norm": 1.4484606981277466, + "learning_rate": 8.503303648376903e-07, + "loss": 2.0419, + "step": 3444 + }, + { + "epoch": 0.9796382637132176, + "grad_norm": 1.5516077280044556, + "learning_rate": 8.388394139615054e-07, + "loss": 2.034, + "step": 3445 + }, + { + "epoch": 0.9799226289566758, + "grad_norm": 1.445167899131775, + "learning_rate": 8.273484630853204e-07, + "loss": 1.7347, + "step": 3446 + }, + { + "epoch": 0.9802069942001339, + "grad_norm": 1.5177215337753296, + "learning_rate": 8.158575122091354e-07, + "loss": 1.7245, + "step": 3447 + }, + { + "epoch": 0.980491359443592, + "grad_norm": 1.4989851713180542, + "learning_rate": 8.043665613329505e-07, + "loss": 1.7891, + "step": 3448 + }, + { + "epoch": 0.9807757246870501, + "grad_norm": 1.808464527130127, + "learning_rate": 7.928756104567653e-07, + "loss": 2.9027, + "step": 3449 + }, + { + "epoch": 0.9810600899305082, + "grad_norm": 1.4827134609222412, + "learning_rate": 7.813846595805803e-07, + "loss": 2.3215, + "step": 3450 + }, + { + "epoch": 0.9813444551739664, + "grad_norm": 1.4429081678390503, + "learning_rate": 7.698937087043953e-07, + "loss": 2.2759, + "step": 3451 + }, + { + "epoch": 0.9816288204174245, + "grad_norm": 1.4872095584869385, + "learning_rate": 7.584027578282104e-07, + "loss": 2.0688, + "step": 3452 + }, + { + "epoch": 0.9819131856608826, + "grad_norm": 1.5652166604995728, + "learning_rate": 7.469118069520254e-07, + "loss": 2.0771, + "step": 3453 + }, + { + "epoch": 0.9821975509043407, + "grad_norm": 1.380708932876587, + "learning_rate": 7.354208560758404e-07, + "loss": 1.616, + "step": 3454 + }, + { + "epoch": 0.9824819161477988, + "grad_norm": 1.489150047302246, + "learning_rate": 7.239299051996554e-07, + "loss": 1.7372, + "step": 3455 + }, + { + "epoch": 0.982766281391257, + "grad_norm": 1.7191890478134155, + "learning_rate": 7.124389543234702e-07, + "loss": 1.7137, + "step": 3456 + }, + { + "epoch": 0.9830506466347151, + "grad_norm": 1.6472156047821045, + "learning_rate": 7.009480034472853e-07, + "loss": 2.8431, + "step": 3457 + }, + { + "epoch": 0.9833350118781732, + "grad_norm": 1.4679713249206543, + "learning_rate": 6.894570525711003e-07, + "loss": 2.2365, + "step": 3458 + }, + { + "epoch": 0.9836193771216313, + "grad_norm": 1.43304443359375, + "learning_rate": 6.779661016949153e-07, + "loss": 2.2726, + "step": 3459 + }, + { + "epoch": 0.9839037423650894, + "grad_norm": 1.4966747760772705, + "learning_rate": 6.664751508187304e-07, + "loss": 1.9848, + "step": 3460 + }, + { + "epoch": 0.9841881076085476, + "grad_norm": 1.5169103145599365, + "learning_rate": 6.549841999425454e-07, + "loss": 2.0286, + "step": 3461 + }, + { + "epoch": 0.9844724728520057, + "grad_norm": 1.6073898077011108, + "learning_rate": 6.434932490663603e-07, + "loss": 1.8712, + "step": 3462 + }, + { + "epoch": 0.9847568380954638, + "grad_norm": 1.4505630731582642, + "learning_rate": 6.320022981901752e-07, + "loss": 1.6837, + "step": 3463 + }, + { + "epoch": 0.9850412033389219, + "grad_norm": 1.5838265419006348, + "learning_rate": 6.205113473139904e-07, + "loss": 1.7908, + "step": 3464 + }, + { + "epoch": 0.9853255685823801, + "grad_norm": 1.5671101808547974, + "learning_rate": 6.090203964378053e-07, + "loss": 2.6081, + "step": 3465 + }, + { + "epoch": 0.9856099338258382, + "grad_norm": 1.4666972160339355, + "learning_rate": 5.975294455616203e-07, + "loss": 2.4855, + "step": 3466 + }, + { + "epoch": 0.9858942990692963, + "grad_norm": 1.4629732370376587, + "learning_rate": 5.860384946854353e-07, + "loss": 2.1884, + "step": 3467 + }, + { + "epoch": 0.9861786643127544, + "grad_norm": 1.4864407777786255, + "learning_rate": 5.745475438092502e-07, + "loss": 2.1033, + "step": 3468 + }, + { + "epoch": 0.9864630295562125, + "grad_norm": 1.498170256614685, + "learning_rate": 5.630565929330653e-07, + "loss": 2.0259, + "step": 3469 + }, + { + "epoch": 0.9867473947996707, + "grad_norm": 1.402068018913269, + "learning_rate": 5.515656420568802e-07, + "loss": 1.8348, + "step": 3470 + }, + { + "epoch": 0.9870317600431288, + "grad_norm": 1.5257517099380493, + "learning_rate": 5.400746911806952e-07, + "loss": 1.8354, + "step": 3471 + }, + { + "epoch": 0.9873161252865869, + "grad_norm": 1.5966296195983887, + "learning_rate": 5.285837403045103e-07, + "loss": 1.7858, + "step": 3472 + }, + { + "epoch": 0.987600490530045, + "grad_norm": 1.67741060256958, + "learning_rate": 5.170927894283252e-07, + "loss": 2.6827, + "step": 3473 + }, + { + "epoch": 0.987884855773503, + "grad_norm": 1.4048272371292114, + "learning_rate": 5.056018385521402e-07, + "loss": 2.5054, + "step": 3474 + }, + { + "epoch": 0.9881692210169613, + "grad_norm": 1.3851925134658813, + "learning_rate": 4.941108876759552e-07, + "loss": 2.1778, + "step": 3475 + }, + { + "epoch": 0.9884535862604193, + "grad_norm": 1.344907283782959, + "learning_rate": 4.826199367997701e-07, + "loss": 2.2011, + "step": 3476 + }, + { + "epoch": 0.9887379515038774, + "grad_norm": 1.4560372829437256, + "learning_rate": 4.711289859235852e-07, + "loss": 2.0771, + "step": 3477 + }, + { + "epoch": 0.9890223167473355, + "grad_norm": 1.4100260734558105, + "learning_rate": 4.596380350474002e-07, + "loss": 1.7795, + "step": 3478 + }, + { + "epoch": 0.9893066819907936, + "grad_norm": 1.4562536478042603, + "learning_rate": 4.481470841712152e-07, + "loss": 1.7464, + "step": 3479 + }, + { + "epoch": 0.9895910472342518, + "grad_norm": 1.5763298273086548, + "learning_rate": 4.366561332950302e-07, + "loss": 1.6895, + "step": 3480 + }, + { + "epoch": 0.9898754124777099, + "grad_norm": 1.698847770690918, + "learning_rate": 4.251651824188452e-07, + "loss": 2.9543, + "step": 3481 + }, + { + "epoch": 0.990159777721168, + "grad_norm": 1.475989818572998, + "learning_rate": 4.136742315426602e-07, + "loss": 2.3288, + "step": 3482 + }, + { + "epoch": 0.9904441429646261, + "grad_norm": 1.4876697063446045, + "learning_rate": 4.0218328066647523e-07, + "loss": 2.1684, + "step": 3483 + }, + { + "epoch": 0.9907285082080842, + "grad_norm": 1.5078527927398682, + "learning_rate": 3.9069232979029015e-07, + "loss": 2.2732, + "step": 3484 + }, + { + "epoch": 0.9910128734515424, + "grad_norm": 1.5910695791244507, + "learning_rate": 3.792013789141052e-07, + "loss": 2.215, + "step": 3485 + }, + { + "epoch": 0.9912972386950005, + "grad_norm": 1.6093436479568481, + "learning_rate": 3.677104280379202e-07, + "loss": 1.9322, + "step": 3486 + }, + { + "epoch": 0.9915816039384586, + "grad_norm": 1.5375397205352783, + "learning_rate": 3.562194771617351e-07, + "loss": 1.7414, + "step": 3487 + }, + { + "epoch": 0.9918659691819167, + "grad_norm": 1.5303829908370972, + "learning_rate": 3.4472852628555015e-07, + "loss": 1.4854, + "step": 3488 + }, + { + "epoch": 0.9921503344253748, + "grad_norm": 1.6482408046722412, + "learning_rate": 3.332375754093652e-07, + "loss": 2.4752, + "step": 3489 + }, + { + "epoch": 0.992434699668833, + "grad_norm": 1.387835144996643, + "learning_rate": 3.2174662453318015e-07, + "loss": 2.0685, + "step": 3490 + }, + { + "epoch": 0.9927190649122911, + "grad_norm": 1.428237795829773, + "learning_rate": 3.102556736569952e-07, + "loss": 2.1544, + "step": 3491 + }, + { + "epoch": 0.9930034301557492, + "grad_norm": 1.4636019468307495, + "learning_rate": 2.9876472278081015e-07, + "loss": 2.2585, + "step": 3492 + }, + { + "epoch": 0.9932877953992073, + "grad_norm": 1.5067780017852783, + "learning_rate": 2.872737719046251e-07, + "loss": 1.8351, + "step": 3493 + }, + { + "epoch": 0.9935721606426654, + "grad_norm": 1.483904480934143, + "learning_rate": 2.757828210284401e-07, + "loss": 1.6935, + "step": 3494 + }, + { + "epoch": 0.9938565258861236, + "grad_norm": 1.4138548374176025, + "learning_rate": 2.642918701522551e-07, + "loss": 1.8169, + "step": 3495 + }, + { + "epoch": 0.9941408911295817, + "grad_norm": 1.464961290359497, + "learning_rate": 2.528009192760701e-07, + "loss": 1.7828, + "step": 3496 + }, + { + "epoch": 0.9944252563730398, + "grad_norm": 1.8348134756088257, + "learning_rate": 2.4130996839988507e-07, + "loss": 2.8307, + "step": 3497 + }, + { + "epoch": 0.9947096216164979, + "grad_norm": 1.4368764162063599, + "learning_rate": 2.298190175237001e-07, + "loss": 2.3679, + "step": 3498 + }, + { + "epoch": 0.994993986859956, + "grad_norm": 1.479984998703003, + "learning_rate": 2.183280666475151e-07, + "loss": 2.2969, + "step": 3499 + }, + { + "epoch": 0.9952783521034142, + "grad_norm": 1.4134024381637573, + "learning_rate": 2.068371157713301e-07, + "loss": 2.2069, + "step": 3500 + }, + { + "epoch": 0.9955627173468723, + "grad_norm": 1.640987515449524, + "learning_rate": 1.9534616489514507e-07, + "loss": 1.9712, + "step": 3501 + }, + { + "epoch": 0.9958470825903304, + "grad_norm": 1.4915003776550293, + "learning_rate": 1.838552140189601e-07, + "loss": 1.6915, + "step": 3502 + }, + { + "epoch": 0.9961314478337885, + "grad_norm": 1.5747153759002686, + "learning_rate": 1.7236426314277507e-07, + "loss": 1.5998, + "step": 3503 + }, + { + "epoch": 0.9964158130772466, + "grad_norm": 1.5918612480163574, + "learning_rate": 1.6087331226659008e-07, + "loss": 1.7691, + "step": 3504 + }, + { + "epoch": 0.9967001783207048, + "grad_norm": 1.6233141422271729, + "learning_rate": 1.4938236139040508e-07, + "loss": 2.538, + "step": 3505 + }, + { + "epoch": 0.9969845435641629, + "grad_norm": 1.5444676876068115, + "learning_rate": 1.3789141051422005e-07, + "loss": 2.3451, + "step": 3506 + }, + { + "epoch": 0.997268908807621, + "grad_norm": 1.4213060140609741, + "learning_rate": 1.2640045963803505e-07, + "loss": 2.1811, + "step": 3507 + }, + { + "epoch": 0.9975532740510791, + "grad_norm": 1.5667362213134766, + "learning_rate": 1.1490950876185005e-07, + "loss": 2.0772, + "step": 3508 + }, + { + "epoch": 0.9978376392945372, + "grad_norm": 1.6838546991348267, + "learning_rate": 1.0341855788566505e-07, + "loss": 1.8718, + "step": 3509 + }, + { + "epoch": 0.9981220045379954, + "grad_norm": 1.4781723022460938, + "learning_rate": 9.192760700948005e-08, + "loss": 1.916, + "step": 3510 + }, + { + "epoch": 0.9984063697814535, + "grad_norm": 1.4474114179611206, + "learning_rate": 8.043665613329504e-08, + "loss": 1.7148, + "step": 3511 + }, + { + "epoch": 0.9986907350249116, + "grad_norm": 1.7419354915618896, + "learning_rate": 6.894570525711002e-08, + "loss": 1.6992, + "step": 3512 + }, + { + "epoch": 0.9989751002683697, + "grad_norm": 1.6991440057754517, + "learning_rate": 5.7454754380925025e-08, + "loss": 2.6591, + "step": 3513 + }, + { + "epoch": 0.9992594655118279, + "grad_norm": 1.4011856317520142, + "learning_rate": 4.5963803504740025e-08, + "loss": 2.2933, + "step": 3514 + }, + { + "epoch": 0.999543830755286, + "grad_norm": 1.5167193412780762, + "learning_rate": 3.447285262855501e-08, + "loss": 2.3063, + "step": 3515 + }, + { + "epoch": 0.9998281959987441, + "grad_norm": 1.423835039138794, + "learning_rate": 2.2981901752370013e-08, + "loss": 2.2698, + "step": 3516 + }, + { + "epoch": 1.0, + "grad_norm": 1.7970972061157227, + "learning_rate": 1.1490950876185006e-08, + "loss": 2.0559, + "step": 3517 + } + ], + "logging_steps": 1, + "max_steps": 3517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.433231803074478e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}