{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.498591082380242, "eval_steps": 10, "global_step": 282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017680534836178794, "grad_norm": 107.08050537109375, "learning_rate": 2.6333333333333332e-06, "loss": 0.0511, "step": 1 }, { "epoch": 0.0035361069672357587, "grad_norm": 33.058345794677734, "learning_rate": 4.266666666666667e-06, "loss": 0.0073, "step": 2 }, { "epoch": 0.005304160450853639, "grad_norm": 11.785157203674316, "learning_rate": 5.9e-06, "loss": 0.0038, "step": 3 }, { "epoch": 0.0070722139344715175, "grad_norm": 3.531964063644409, "learning_rate": 7.533333333333334e-06, "loss": 0.002, "step": 4 }, { "epoch": 0.008840267418089397, "grad_norm": 1.3157356977462769, "learning_rate": 9.166666666666668e-06, "loss": 0.0012, "step": 5 }, { "epoch": 0.010608320901707277, "grad_norm": 0.22130531072616577, "learning_rate": 1.0800000000000002e-05, "loss": 0.0009, "step": 6 }, { "epoch": 0.012376374385325156, "grad_norm": 0.16746075451374054, "learning_rate": 1.2433333333333335e-05, "loss": 0.0009, "step": 7 }, { "epoch": 0.014144427868943035, "grad_norm": 0.2485613077878952, "learning_rate": 1.4066666666666669e-05, "loss": 0.0008, "step": 8 }, { "epoch": 0.015912481352560914, "grad_norm": 0.19209939241409302, "learning_rate": 1.5700000000000002e-05, "loss": 0.0008, "step": 9 }, { "epoch": 0.017680534836178794, "grad_norm": 2.578878402709961, "learning_rate": 1.7333333333333336e-05, "loss": 0.0008, "step": 10 }, { "epoch": 0.019448588319796675, "grad_norm": 19.695049285888672, "learning_rate": 1.896666666666667e-05, "loss": 0.0014, "step": 11 }, { "epoch": 0.021216641803414555, "grad_norm": 0.31556376814842224, "learning_rate": 2.0600000000000003e-05, "loss": 0.0007, "step": 12 }, { "epoch": 0.022984695287032432, "grad_norm": 34.17892074584961, "learning_rate": 2.2233333333333336e-05, "loss": 0.0008, "step": 13 }, { "epoch": 0.024752748770650312, "grad_norm": 0.9590745568275452, "learning_rate": 2.386666666666667e-05, "loss": 0.0002, "step": 14 }, { "epoch": 0.026520802254268193, "grad_norm": 7.85075044631958, "learning_rate": 2.5500000000000003e-05, "loss": 0.0008, "step": 15 }, { "epoch": 0.02828885573788607, "grad_norm": 1.8065848350524902, "learning_rate": 2.7133333333333337e-05, "loss": 0.0003, "step": 16 }, { "epoch": 0.03005690922150395, "grad_norm": 0.3425370454788208, "learning_rate": 2.876666666666667e-05, "loss": 0.0, "step": 17 }, { "epoch": 0.03182496270512183, "grad_norm": 1.7299435138702393, "learning_rate": 3.0400000000000004e-05, "loss": 0.0001, "step": 18 }, { "epoch": 0.03359301618873971, "grad_norm": 0.37139496207237244, "learning_rate": 3.203333333333333e-05, "loss": 0.0, "step": 19 }, { "epoch": 0.03536106967235759, "grad_norm": 1.1884018182754517, "learning_rate": 3.366666666666667e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.03712912315597547, "grad_norm": 0.018331818282604218, "learning_rate": 3.53e-05, "loss": 0.0, "step": 21 }, { "epoch": 0.03889717663959335, "grad_norm": 0.046326447278261185, "learning_rate": 3.6933333333333334e-05, "loss": 0.0, "step": 22 }, { "epoch": 0.04066523012321123, "grad_norm": 0.041994426399469376, "learning_rate": 3.856666666666667e-05, "loss": 0.0, "step": 23 }, { "epoch": 0.04243328360682911, "grad_norm": 5.286490440368652, "learning_rate": 4.02e-05, "loss": 0.0001, "step": 24 }, { "epoch": 0.044201337090446984, "grad_norm": 0.23539425432682037, "learning_rate": 4.183333333333334e-05, "loss": 0.0, "step": 25 }, { "epoch": 0.045969390574064864, "grad_norm": 22.727415084838867, "learning_rate": 4.346666666666667e-05, "loss": 0.0012, "step": 26 }, { "epoch": 0.047737444057682744, "grad_norm": 0.29610776901245117, "learning_rate": 4.5100000000000005e-05, "loss": 0.0001, "step": 27 }, { "epoch": 0.049505497541300625, "grad_norm": 319.07232666015625, "learning_rate": 4.6733333333333335e-05, "loss": 0.0006, "step": 28 }, { "epoch": 0.051273551024918505, "grad_norm": 430.6841125488281, "learning_rate": 4.836666666666667e-05, "loss": 0.0221, "step": 29 }, { "epoch": 0.053041604508536386, "grad_norm": 3.6946537494659424, "learning_rate": 5e-05, "loss": 0.0002, "step": 30 }, { "epoch": 0.05480965799215426, "grad_norm": 45.59299850463867, "learning_rate": 4.999834154609218e-05, "loss": 0.0035, "step": 31 }, { "epoch": 0.05657771147577214, "grad_norm": 32.24581527709961, "learning_rate": 4.999336640889681e-05, "loss": 0.0032, "step": 32 }, { "epoch": 0.05834576495939002, "grad_norm": 58.2963981628418, "learning_rate": 4.998507526196785e-05, "loss": 0.003, "step": 33 }, { "epoch": 0.0601138184430079, "grad_norm": 3.416010856628418, "learning_rate": 4.997346922779386e-05, "loss": 0.0005, "step": 34 }, { "epoch": 0.06188187192662578, "grad_norm": 0.8263393640518188, "learning_rate": 4.9958549877646073e-05, "loss": 0.0002, "step": 35 }, { "epoch": 0.06364992541024365, "grad_norm": 0.5320990681648254, "learning_rate": 4.994031923136569e-05, "loss": 0.0, "step": 36 }, { "epoch": 0.06541797889386154, "grad_norm": 0.006872811354696751, "learning_rate": 4.99187797570904e-05, "loss": 0.0, "step": 37 }, { "epoch": 0.06718603237747942, "grad_norm": 1.7324073314666748, "learning_rate": 4.9893934370920207e-05, "loss": 0.0001, "step": 38 }, { "epoch": 0.0689540858610973, "grad_norm": 17.383434295654297, "learning_rate": 4.98657864365227e-05, "loss": 0.0019, "step": 39 }, { "epoch": 0.07072213934471518, "grad_norm": 397.38128662109375, "learning_rate": 4.9834339764677606e-05, "loss": 0.0127, "step": 40 }, { "epoch": 0.07249019282833306, "grad_norm": 9.999519348144531, "learning_rate": 4.979959861276091e-05, "loss": 0.0006, "step": 41 }, { "epoch": 0.07425824631195094, "grad_norm": 1.2901943922042847, "learning_rate": 4.976156768416848e-05, "loss": 0.0001, "step": 42 }, { "epoch": 0.07602629979556881, "grad_norm": 0.0491662472486496, "learning_rate": 4.9720252127679233e-05, "loss": 0.0, "step": 43 }, { "epoch": 0.0777943532791867, "grad_norm": 0.012911037541925907, "learning_rate": 4.96756575367582e-05, "loss": 0.0, "step": 44 }, { "epoch": 0.07956240676280457, "grad_norm": 0.11798875033855438, "learning_rate": 4.96277899487991e-05, "loss": 0.0, "step": 45 }, { "epoch": 0.08133046024642246, "grad_norm": 4.875470161437988, "learning_rate": 4.957665584430713e-05, "loss": 0.0006, "step": 46 }, { "epoch": 0.08309851373004033, "grad_norm": 3.2761361598968506, "learning_rate": 4.9522262146021495e-05, "loss": 0.0002, "step": 47 }, { "epoch": 0.08486656721365822, "grad_norm": 1.1224943399429321, "learning_rate": 4.946461621797824e-05, "loss": 0.0, "step": 48 }, { "epoch": 0.0866346206972761, "grad_norm": 0.38790443539619446, "learning_rate": 4.940372586451325e-05, "loss": 0.0, "step": 49 }, { "epoch": 0.08840267418089397, "grad_norm": 0.6254971027374268, "learning_rate": 4.9339599329205686e-05, "loss": 0.0, "step": 50 }, { "epoch": 0.09017072766451185, "grad_norm": 0.11508096754550934, "learning_rate": 4.927224529376191e-05, "loss": 0.0, "step": 51 }, { "epoch": 0.09193878114812973, "grad_norm": 4.301520824432373, "learning_rate": 4.920167287684016e-05, "loss": 0.0001, "step": 52 }, { "epoch": 0.09370683463174762, "grad_norm": 0.008904535323381424, "learning_rate": 4.912789163281601e-05, "loss": 0.0, "step": 53 }, { "epoch": 0.09547488811536549, "grad_norm": 0.6658762097358704, "learning_rate": 4.905091155048882e-05, "loss": 0.0001, "step": 54 }, { "epoch": 0.09724294159898336, "grad_norm": 0.008907541632652283, "learning_rate": 4.897074305172948e-05, "loss": 0.0, "step": 55 }, { "epoch": 0.09901099508260125, "grad_norm": 1.161902904510498, "learning_rate": 4.8887396990069434e-05, "loss": 0.0, "step": 56 }, { "epoch": 0.10077904856621912, "grad_norm": 0.017062336206436157, "learning_rate": 4.8800884649231264e-05, "loss": 0.0, "step": 57 }, { "epoch": 0.10254710204983701, "grad_norm": 0.07215119153261185, "learning_rate": 4.871121774160107e-05, "loss": 0.0, "step": 58 }, { "epoch": 0.10431515553345488, "grad_norm": 0.29680925607681274, "learning_rate": 4.8618408406642795e-05, "loss": 0.0, "step": 59 }, { "epoch": 0.10608320901707277, "grad_norm": 1.7219104766845703, "learning_rate": 4.852246920925476e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.10785126250069065, "grad_norm": 0.017782438546419144, "learning_rate": 4.842341313806852e-05, "loss": 0.0, "step": 61 }, { "epoch": 0.10961931598430852, "grad_norm": 26.54680061340332, "learning_rate": 4.832125360369049e-05, "loss": 0.0015, "step": 62 }, { "epoch": 0.1113873694679264, "grad_norm": 0.6519933342933655, "learning_rate": 4.82160044368863e-05, "loss": 0.0, "step": 63 }, { "epoch": 0.11315542295154428, "grad_norm": 7.924159049987793, "learning_rate": 4.810767988670834e-05, "loss": 0.0035, "step": 64 }, { "epoch": 0.11492347643516217, "grad_norm": 0.13337986171245575, "learning_rate": 4.799629461856672e-05, "loss": 0.0, "step": 65 }, { "epoch": 0.11669152991878004, "grad_norm": 19.419836044311523, "learning_rate": 4.788186371224372e-05, "loss": 0.0012, "step": 66 }, { "epoch": 0.11845958340239793, "grad_norm": 0.5985578894615173, "learning_rate": 4.776440265985233e-05, "loss": 0.0001, "step": 67 }, { "epoch": 0.1202276368860158, "grad_norm": 0.4132179319858551, "learning_rate": 4.764392736373876e-05, "loss": 0.0, "step": 68 }, { "epoch": 0.12199569036963367, "grad_norm": 0.14033225178718567, "learning_rate": 4.7520454134329594e-05, "loss": 0.0, "step": 69 }, { "epoch": 0.12376374385325156, "grad_norm": 0.5785059332847595, "learning_rate": 4.73939996879236e-05, "loss": 0.0001, "step": 70 }, { "epoch": 0.12553179733686945, "grad_norm": 0.14392127096652985, "learning_rate": 4.72645811444286e-05, "loss": 0.0, "step": 71 }, { "epoch": 0.1272998508204873, "grad_norm": 1.70956552028656, "learning_rate": 4.7132216025043714e-05, "loss": 0.0001, "step": 72 }, { "epoch": 0.1290679043041052, "grad_norm": 0.11939554661512375, "learning_rate": 4.699692224988726e-05, "loss": 0.0, "step": 73 }, { "epoch": 0.13083595778772308, "grad_norm": 0.33141276240348816, "learning_rate": 4.685871813557068e-05, "loss": 0.0001, "step": 74 }, { "epoch": 0.13260401127134097, "grad_norm": 0.1561572402715683, "learning_rate": 4.671762239271875e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.13437206475495883, "grad_norm": 0.509835422039032, "learning_rate": 4.6573654123436456e-05, "loss": 0.0001, "step": 76 }, { "epoch": 0.13614011823857672, "grad_norm": 0.18685153126716614, "learning_rate": 4.642683281872288e-05, "loss": 0.0, "step": 77 }, { "epoch": 0.1379081717221946, "grad_norm": 0.10680101066827774, "learning_rate": 4.6277178355832434e-05, "loss": 0.0, "step": 78 }, { "epoch": 0.13967622520581247, "grad_norm": 0.0250838790088892, "learning_rate": 4.6124710995583807e-05, "loss": 0.0, "step": 79 }, { "epoch": 0.14144427868943035, "grad_norm": 0.04628512263298035, "learning_rate": 4.5969451379616945e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.14321233217304824, "grad_norm": 1.1579428911209106, "learning_rate": 4.581142052759852e-05, "loss": 0.0007, "step": 81 }, { "epoch": 0.14498038565666613, "grad_norm": 0.3260582685470581, "learning_rate": 4.565063983437623e-05, "loss": 0.0, "step": 82 }, { "epoch": 0.146748439140284, "grad_norm": 0.14016196131706238, "learning_rate": 4.548713106708222e-05, "loss": 0.0, "step": 83 }, { "epoch": 0.14851649262390187, "grad_norm": 0.013404465280473232, "learning_rate": 4.532091636218621e-05, "loss": 0.0, "step": 84 }, { "epoch": 0.15028454610751976, "grad_norm": 0.058658480644226074, "learning_rate": 4.5152018222498574e-05, "loss": 0.0, "step": 85 }, { "epoch": 0.15205259959113762, "grad_norm": 0.058295607566833496, "learning_rate": 4.498045951412377e-05, "loss": 0.0, "step": 86 }, { "epoch": 0.1538206530747555, "grad_norm": 0.040826935321092606, "learning_rate": 4.480626346336469e-05, "loss": 0.0, "step": 87 }, { "epoch": 0.1555887065583734, "grad_norm": 0.5249735116958618, "learning_rate": 4.462945365357815e-05, "loss": 0.0001, "step": 88 }, { "epoch": 0.15735676004199128, "grad_norm": 0.027441998943686485, "learning_rate": 4.4450054021982115e-05, "loss": 0.0, "step": 89 }, { "epoch": 0.15912481352560914, "grad_norm": 0.16359610855579376, "learning_rate": 4.426808885641496e-05, "loss": 0.0, "step": 90 }, { "epoch": 0.16089286700922703, "grad_norm": 0.01867711916565895, "learning_rate": 4.408358279204729e-05, "loss": 0.0, "step": 91 }, { "epoch": 0.16266092049284492, "grad_norm": 0.026866458356380463, "learning_rate": 4.389656080804674e-05, "loss": 0.0, "step": 92 }, { "epoch": 0.16442897397646278, "grad_norm": 0.007205847185105085, "learning_rate": 4.370704822419616e-05, "loss": 0.0, "step": 93 }, { "epoch": 0.16619702746008067, "grad_norm": 0.027548780664801598, "learning_rate": 4.3515070697465805e-05, "loss": 0.0, "step": 94 }, { "epoch": 0.16796508094369855, "grad_norm": 0.03899214044213295, "learning_rate": 4.33206542185397e-05, "loss": 0.0, "step": 95 }, { "epoch": 0.16973313442731644, "grad_norm": 0.006059037055820227, "learning_rate": 4.3123825108296954e-05, "loss": 0.0, "step": 96 }, { "epoch": 0.1715011879109343, "grad_norm": 0.0009920188458636403, "learning_rate": 4.292461001424836e-05, "loss": 0.0, "step": 97 }, { "epoch": 0.1732692413945522, "grad_norm": 0.15524977445602417, "learning_rate": 4.272303590692872e-05, "loss": 0.0, "step": 98 }, { "epoch": 0.17503729487817007, "grad_norm": 0.0014381350483745337, "learning_rate": 4.251913007624543e-05, "loss": 0.0, "step": 99 }, { "epoch": 0.17680534836178793, "grad_norm": 0.2695734202861786, "learning_rate": 4.231292012778398e-05, "loss": 0.0001, "step": 100 }, { "epoch": 0.17857340184540582, "grad_norm": 42.32940673828125, "learning_rate": 4.210443397907045e-05, "loss": 0.0004, "step": 101 }, { "epoch": 0.1803414553290237, "grad_norm": 0.18144920468330383, "learning_rate": 4.1893699855792026e-05, "loss": 0.0, "step": 102 }, { "epoch": 0.1821095088126416, "grad_norm": 0.00621533440425992, "learning_rate": 4.1680746287975686e-05, "loss": 0.0, "step": 103 }, { "epoch": 0.18387756229625946, "grad_norm": 0.011037588119506836, "learning_rate": 4.1465602106125656e-05, "loss": 0.0, "step": 104 }, { "epoch": 0.18564561577987734, "grad_norm": 0.11516346037387848, "learning_rate": 4.1248296437320216e-05, "loss": 0.0, "step": 105 }, { "epoch": 0.18741366926349523, "grad_norm": 0.013171572238206863, "learning_rate": 4.10288587012684e-05, "loss": 0.0, "step": 106 }, { "epoch": 0.1891817227471131, "grad_norm": 0.1711127609014511, "learning_rate": 4.080731860632702e-05, "loss": 0.0, "step": 107 }, { "epoch": 0.19094977623073098, "grad_norm": 0.14865125715732574, "learning_rate": 4.0583706145478626e-05, "loss": 0.0, "step": 108 }, { "epoch": 0.19271782971434887, "grad_norm": 0.03242873027920723, "learning_rate": 4.035805159227094e-05, "loss": 0.0, "step": 109 }, { "epoch": 0.19448588319796672, "grad_norm": 0.14900478720664978, "learning_rate": 4.0130385496718264e-05, "loss": 0.0, "step": 110 }, { "epoch": 0.1962539366815846, "grad_norm": 0.004072085022926331, "learning_rate": 3.990073868116559e-05, "loss": 0.0, "step": 111 }, { "epoch": 0.1980219901652025, "grad_norm": 0.11495672166347504, "learning_rate": 3.966914223611567e-05, "loss": 0.0001, "step": 112 }, { "epoch": 0.1997900436488204, "grad_norm": 0.1237805113196373, "learning_rate": 3.943562751601987e-05, "loss": 0.0, "step": 113 }, { "epoch": 0.20155809713243825, "grad_norm": 0.044289521872997284, "learning_rate": 3.92002261350333e-05, "loss": 0.0, "step": 114 }, { "epoch": 0.20332615061605613, "grad_norm": 0.3086288869380951, "learning_rate": 3.896296996273475e-05, "loss": 0.0, "step": 115 }, { "epoch": 0.20509420409967402, "grad_norm": 0.7167790532112122, "learning_rate": 3.872389111981207e-05, "loss": 0.0002, "step": 116 }, { "epoch": 0.20686225758329188, "grad_norm": 0.01117889303714037, "learning_rate": 3.848302197371352e-05, "loss": 0.0, "step": 117 }, { "epoch": 0.20863031106690977, "grad_norm": 0.9482950568199158, "learning_rate": 3.8240395134265775e-05, "loss": 0.0002, "step": 118 }, { "epoch": 0.21039836455052766, "grad_norm": 0.01158567052334547, "learning_rate": 3.799604344925904e-05, "loss": 0.0, "step": 119 }, { "epoch": 0.21216641803414554, "grad_norm": 0.001482126535847783, "learning_rate": 3.775e-05, "loss": 0.0, "step": 120 }, { "epoch": 0.2139344715177634, "grad_norm": 0.2988652288913727, "learning_rate": 3.7502298096833135e-05, "loss": 0.0, "step": 121 }, { "epoch": 0.2157025250013813, "grad_norm": 0.011216375045478344, "learning_rate": 3.725297127463104e-05, "loss": 0.0, "step": 122 }, { "epoch": 0.21747057848499918, "grad_norm": 1.6700996160507202, "learning_rate": 3.7002053288254325e-05, "loss": 0.0002, "step": 123 }, { "epoch": 0.21923863196861704, "grad_norm": 0.17890863120555878, "learning_rate": 3.6749578107981784e-05, "loss": 0.0, "step": 124 }, { "epoch": 0.22100668545223492, "grad_norm": 0.004812731873244047, "learning_rate": 3.649557991491133e-05, "loss": 0.0, "step": 125 }, { "epoch": 0.2227747389358528, "grad_norm": 0.0008214990375563502, "learning_rate": 3.62400930963324e-05, "loss": 0.0, "step": 126 }, { "epoch": 0.2245427924194707, "grad_norm": 0.005143096670508385, "learning_rate": 3.598315224107053e-05, "loss": 0.0, "step": 127 }, { "epoch": 0.22631084590308856, "grad_norm": 0.14021626114845276, "learning_rate": 3.572479213480454e-05, "loss": 0.0, "step": 128 }, { "epoch": 0.22807889938670645, "grad_norm": 0.0075095584616065025, "learning_rate": 3.546504775535711e-05, "loss": 0.0, "step": 129 }, { "epoch": 0.22984695287032433, "grad_norm": 0.0022524860687553883, "learning_rate": 3.5203954267959345e-05, "loss": 0.0, "step": 130 }, { "epoch": 0.2316150063539422, "grad_norm": 0.0018451135838404298, "learning_rate": 3.4941547020489984e-05, "loss": 0.0, "step": 131 }, { "epoch": 0.23338305983756008, "grad_norm": 0.0029321014881134033, "learning_rate": 3.467786153868985e-05, "loss": 0.0, "step": 132 }, { "epoch": 0.23515111332117797, "grad_norm": 0.0029901517555117607, "learning_rate": 3.4412933521352215e-05, "loss": 0.0, "step": 133 }, { "epoch": 0.23691916680479586, "grad_norm": 0.1540692150592804, "learning_rate": 3.414679883548978e-05, "loss": 0.0, "step": 134 }, { "epoch": 0.23868722028841372, "grad_norm": 0.001123814843595028, "learning_rate": 3.387949351147889e-05, "loss": 0.0, "step": 135 }, { "epoch": 0.2404552737720316, "grad_norm": 0.00034733908250927925, "learning_rate": 3.3611053738181474e-05, "loss": 0.0, "step": 136 }, { "epoch": 0.2422233272556495, "grad_norm": 0.018943598493933678, "learning_rate": 3.33415158580458e-05, "loss": 0.0, "step": 137 }, { "epoch": 0.24399138073926735, "grad_norm": 0.0012388058239594102, "learning_rate": 3.307091636218621e-05, "loss": 0.0, "step": 138 }, { "epoch": 0.24575943422288524, "grad_norm": 0.579187273979187, "learning_rate": 3.2799291885442806e-05, "loss": 0.0, "step": 139 }, { "epoch": 0.24752748770650312, "grad_norm": 0.06531547755002975, "learning_rate": 3.2526679201421715e-05, "loss": 0.0, "step": 140 }, { "epoch": 0.249295541190121, "grad_norm": 0.0009087176294997334, "learning_rate": 3.225311521751648e-05, "loss": 0.0, "step": 141 }, { "epoch": 0.2510635946737389, "grad_norm": 0.0007331356173381209, "learning_rate": 3.197863696991146e-05, "loss": 0.0, "step": 142 }, { "epoch": 0.25283164815735676, "grad_norm": 0.0016905225347727537, "learning_rate": 3.170328161856765e-05, "loss": 0.0, "step": 143 }, { "epoch": 0.2545997016409746, "grad_norm": 0.8500795364379883, "learning_rate": 3.142708644219186e-05, "loss": 0.0001, "step": 144 }, { "epoch": 0.25636775512459253, "grad_norm": 0.0008820121875032783, "learning_rate": 3.1150088833189785e-05, "loss": 0.0, "step": 145 }, { "epoch": 0.2581358086082104, "grad_norm": 0.0022942032665014267, "learning_rate": 3.087232629260363e-05, "loss": 0.0, "step": 146 }, { "epoch": 0.25990386209182825, "grad_norm": 0.00030348575091920793, "learning_rate": 3.05938364250351e-05, "loss": 0.0, "step": 147 }, { "epoch": 0.26167191557544617, "grad_norm": 0.00010618369560688734, "learning_rate": 3.0314656933554343e-05, "loss": 0.0, "step": 148 }, { "epoch": 0.263439969059064, "grad_norm": 0.0016436876030638814, "learning_rate": 3.003482561459551e-05, "loss": 0.0, "step": 149 }, { "epoch": 0.26520802254268194, "grad_norm": 0.025613779202103615, "learning_rate": 2.97543803528398e-05, "loss": 0.0, "step": 150 }, { "epoch": 0.2669760760262998, "grad_norm": 0.0002844675909727812, "learning_rate": 2.947335911608641e-05, "loss": 0.0, "step": 151 }, { "epoch": 0.26874412950991766, "grad_norm": 0.0015938935102894902, "learning_rate": 2.9191799950112342e-05, "loss": 0.0, "step": 152 }, { "epoch": 0.2705121829935356, "grad_norm": 0.0003039448638446629, "learning_rate": 2.8909740973521605e-05, "loss": 0.0, "step": 153 }, { "epoch": 0.27228023647715344, "grad_norm": 0.0007424139184877276, "learning_rate": 2.8627220372584576e-05, "loss": 0.0, "step": 154 }, { "epoch": 0.2740482899607713, "grad_norm": 0.002882719039916992, "learning_rate": 2.8344276396068144e-05, "loss": 0.0, "step": 155 }, { "epoch": 0.2758163434443892, "grad_norm": 0.005102601367980242, "learning_rate": 2.8060947350057514e-05, "loss": 0.0, "step": 156 }, { "epoch": 0.27758439692800707, "grad_norm": 0.02509269304573536, "learning_rate": 2.7777271592770116e-05, "loss": 0.0, "step": 157 }, { "epoch": 0.27935245041162493, "grad_norm": 0.009309419430792332, "learning_rate": 2.749328752936253e-05, "loss": 0.0, "step": 158 }, { "epoch": 0.28112050389524285, "grad_norm": 0.0033299459610134363, "learning_rate": 2.720903360673107e-05, "loss": 0.0, "step": 159 }, { "epoch": 0.2828885573788607, "grad_norm": 0.009038392454385757, "learning_rate": 2.692454830830666e-05, "loss": 0.0, "step": 160 }, { "epoch": 0.28465661086247857, "grad_norm": 0.001115231541916728, "learning_rate": 2.6639870148844804e-05, "loss": 0.0, "step": 161 }, { "epoch": 0.2864246643460965, "grad_norm": 0.0021960127633064985, "learning_rate": 2.635503766921128e-05, "loss": 0.0, "step": 162 }, { "epoch": 0.28819271782971434, "grad_norm": 0.0012615895830094814, "learning_rate": 2.607008943116438e-05, "loss": 0.0, "step": 163 }, { "epoch": 0.28996077131333226, "grad_norm": 0.00011055181676056236, "learning_rate": 2.5785064012134235e-05, "loss": 0.0, "step": 164 }, { "epoch": 0.2917288247969501, "grad_norm": 1.1846184730529785, "learning_rate": 2.5500000000000003e-05, "loss": 0.0001, "step": 165 }, { "epoch": 0.293496878280568, "grad_norm": 0.14394652843475342, "learning_rate": 2.5214935987865782e-05, "loss": 0.0, "step": 166 }, { "epoch": 0.2952649317641859, "grad_norm": 0.004770410712808371, "learning_rate": 2.4929910568835625e-05, "loss": 0.0, "step": 167 }, { "epoch": 0.29703298524780375, "grad_norm": 0.001502661150880158, "learning_rate": 2.4644962330788736e-05, "loss": 0.0, "step": 168 }, { "epoch": 0.2988010387314216, "grad_norm": 0.6833661794662476, "learning_rate": 2.436012985115521e-05, "loss": 0.0, "step": 169 }, { "epoch": 0.3005690922150395, "grad_norm": 0.000995868700556457, "learning_rate": 2.4075451691693345e-05, "loss": 0.0, "step": 170 }, { "epoch": 0.3023371456986574, "grad_norm": 0.28130462765693665, "learning_rate": 2.3790966393268934e-05, "loss": 0.0, "step": 171 }, { "epoch": 0.30410519918227524, "grad_norm": 0.0009763907874003053, "learning_rate": 2.3506712470637478e-05, "loss": 0.0, "step": 172 }, { "epoch": 0.30587325266589316, "grad_norm": 0.0002226334618171677, "learning_rate": 2.3222728407229894e-05, "loss": 0.0, "step": 173 }, { "epoch": 0.307641306149511, "grad_norm": 0.0005141818546690047, "learning_rate": 2.2939052649942496e-05, "loss": 0.0, "step": 174 }, { "epoch": 0.3094093596331289, "grad_norm": 0.000805897347163409, "learning_rate": 2.265572360393186e-05, "loss": 0.0, "step": 175 }, { "epoch": 0.3111774131167468, "grad_norm": 0.01516794040799141, "learning_rate": 2.2372779627415433e-05, "loss": 0.0, "step": 176 }, { "epoch": 0.31294546660036465, "grad_norm": 0.00015710192383266985, "learning_rate": 2.2090259026478405e-05, "loss": 0.0, "step": 177 }, { "epoch": 0.31471352008398257, "grad_norm": 0.0007961568771861494, "learning_rate": 2.1808200049887665e-05, "loss": 0.0, "step": 178 }, { "epoch": 0.3164815735676004, "grad_norm": 0.34228724241256714, "learning_rate": 2.1526640883913603e-05, "loss": 0.0001, "step": 179 }, { "epoch": 0.3182496270512183, "grad_norm": 0.00023374942247755826, "learning_rate": 2.124561964716021e-05, "loss": 0.0, "step": 180 }, { "epoch": 0.3200176805348362, "grad_norm": 0.0010924128582701087, "learning_rate": 2.09651743854045e-05, "loss": 0.0, "step": 181 }, { "epoch": 0.32178573401845406, "grad_norm": 0.0013544277753680944, "learning_rate": 2.0685343066445663e-05, "loss": 0.0, "step": 182 }, { "epoch": 0.3235537875020719, "grad_norm": 3.933921834686771e-05, "learning_rate": 2.0406163574964904e-05, "loss": 0.0, "step": 183 }, { "epoch": 0.32532184098568984, "grad_norm": 0.0008416796335950494, "learning_rate": 2.0127673707396367e-05, "loss": 0.0, "step": 184 }, { "epoch": 0.3270898944693077, "grad_norm": 8.908173185773194e-05, "learning_rate": 1.9849911166810218e-05, "loss": 0.0, "step": 185 }, { "epoch": 0.32885794795292556, "grad_norm": 7.215650839498267e-05, "learning_rate": 1.9572913557808146e-05, "loss": 0.0, "step": 186 }, { "epoch": 0.33062600143654347, "grad_norm": 0.009005086496472359, "learning_rate": 1.929671838143236e-05, "loss": 0.0, "step": 187 }, { "epoch": 0.33239405492016133, "grad_norm": 8.976953540695831e-05, "learning_rate": 1.9021363030088554e-05, "loss": 0.0, "step": 188 }, { "epoch": 0.3341621084037792, "grad_norm": 0.001722825225442648, "learning_rate": 1.8746884782483525e-05, "loss": 0.0, "step": 189 }, { "epoch": 0.3359301618873971, "grad_norm": 0.004825813230127096, "learning_rate": 1.8473320798578298e-05, "loss": 0.0, "step": 190 }, { "epoch": 0.33769821537101496, "grad_norm": 0.0007849527173675597, "learning_rate": 1.8200708114557193e-05, "loss": 0.0, "step": 191 }, { "epoch": 0.3394662688546329, "grad_norm": 0.000184956646990031, "learning_rate": 1.792908363781379e-05, "loss": 0.0, "step": 192 }, { "epoch": 0.34123432233825074, "grad_norm": 0.0004415436997078359, "learning_rate": 1.7658484141954197e-05, "loss": 0.0, "step": 193 }, { "epoch": 0.3430023758218686, "grad_norm": 0.0002726184611674398, "learning_rate": 1.7388946261818532e-05, "loss": 0.0, "step": 194 }, { "epoch": 0.3447704293054865, "grad_norm": 0.18765950202941895, "learning_rate": 1.712050648852112e-05, "loss": 0.0, "step": 195 }, { "epoch": 0.3465384827891044, "grad_norm": 5.5921867897268385e-05, "learning_rate": 1.6853201164510216e-05, "loss": 0.0, "step": 196 }, { "epoch": 0.34830653627272223, "grad_norm": 0.0006541347247548401, "learning_rate": 1.6587066478647798e-05, "loss": 0.0, "step": 197 }, { "epoch": 0.35007458975634015, "grad_norm": 0.06359551846981049, "learning_rate": 1.6322138461310157e-05, "loss": 0.0, "step": 198 }, { "epoch": 0.351842643239958, "grad_norm": 7.466107490472496e-05, "learning_rate": 1.6058452979510023e-05, "loss": 0.0, "step": 199 }, { "epoch": 0.35361069672357587, "grad_norm": 0.0012128509115427732, "learning_rate": 1.5796045732040658e-05, "loss": 0.0, "step": 200 }, { "epoch": 0.3553787502071938, "grad_norm": 0.0028565828688442707, "learning_rate": 1.55349522446429e-05, "loss": 0.0, "step": 201 }, { "epoch": 0.35714680369081164, "grad_norm": 0.0035199481062591076, "learning_rate": 1.5275207865195463e-05, "loss": 0.0, "step": 202 }, { "epoch": 0.3589148571744295, "grad_norm": 3.6958659620722756e-05, "learning_rate": 1.5016847758929478e-05, "loss": 0.0, "step": 203 }, { "epoch": 0.3606829106580474, "grad_norm": 0.013499101623892784, "learning_rate": 1.4759906903667603e-05, "loss": 0.0, "step": 204 }, { "epoch": 0.3624509641416653, "grad_norm": 0.00010528836719458923, "learning_rate": 1.4504420085088679e-05, "loss": 0.0, "step": 205 }, { "epoch": 0.3642190176252832, "grad_norm": 0.0005988091579638422, "learning_rate": 1.425042189201821e-05, "loss": 0.0, "step": 206 }, { "epoch": 0.36598707110890105, "grad_norm": 8.299437467940152e-05, "learning_rate": 1.3997946711745685e-05, "loss": 0.0, "step": 207 }, { "epoch": 0.3677551245925189, "grad_norm": 0.08108139038085938, "learning_rate": 1.3747028725368977e-05, "loss": 0.0, "step": 208 }, { "epoch": 0.3695231780761368, "grad_norm": 1.022919241222553e-05, "learning_rate": 1.3497701903166873e-05, "loss": 0.0, "step": 209 }, { "epoch": 0.3712912315597547, "grad_norm": 0.21460385620594025, "learning_rate": 1.3250000000000007e-05, "loss": 0.0001, "step": 210 }, { "epoch": 0.37305928504337255, "grad_norm": 0.00021967390784993768, "learning_rate": 1.3003956550740957e-05, "loss": 0.0, "step": 211 }, { "epoch": 0.37482733852699046, "grad_norm": 0.001171005773358047, "learning_rate": 1.2759604865734225e-05, "loss": 0.0, "step": 212 }, { "epoch": 0.3765953920106083, "grad_norm": 0.09991420805454254, "learning_rate": 1.2516978026286484e-05, "loss": 0.0, "step": 213 }, { "epoch": 0.3783634454942262, "grad_norm": 6.677448254777119e-05, "learning_rate": 1.2276108880187944e-05, "loss": 0.0, "step": 214 }, { "epoch": 0.3801314989778441, "grad_norm": 0.0013546834234148264, "learning_rate": 1.2037030037265254e-05, "loss": 0.0, "step": 215 }, { "epoch": 0.38189955246146196, "grad_norm": 0.0005404578987509012, "learning_rate": 1.1799773864966708e-05, "loss": 0.0, "step": 216 }, { "epoch": 0.3836676059450798, "grad_norm": 0.00040231485036201775, "learning_rate": 1.1564372483980137e-05, "loss": 0.0, "step": 217 }, { "epoch": 0.38543565942869773, "grad_norm": 1.2076519851689227e-05, "learning_rate": 1.1330857763884332e-05, "loss": 0.0, "step": 218 }, { "epoch": 0.3872037129123156, "grad_norm": 5.123095615999773e-05, "learning_rate": 1.1099261318834413e-05, "loss": 0.0, "step": 219 }, { "epoch": 0.38897176639593345, "grad_norm": 0.06918606162071228, "learning_rate": 1.0869614503281746e-05, "loss": 0.0, "step": 220 }, { "epoch": 0.39073981987955136, "grad_norm": 6.619005580432713e-05, "learning_rate": 1.064194840772908e-05, "loss": 0.0, "step": 221 }, { "epoch": 0.3925078733631692, "grad_norm": 0.00015157890447881073, "learning_rate": 1.0416293854521374e-05, "loss": 0.0, "step": 222 }, { "epoch": 0.39427592684678714, "grad_norm": 0.0037292453926056623, "learning_rate": 1.0192681393672982e-05, "loss": 0.0, "step": 223 }, { "epoch": 0.396043980330405, "grad_norm": 0.16160666942596436, "learning_rate": 9.971141298731597e-06, "loss": 0.0, "step": 224 }, { "epoch": 0.39781203381402286, "grad_norm": 0.06928429752588272, "learning_rate": 9.751703562679787e-06, "loss": 0.0, "step": 225 }, { "epoch": 0.3995800872976408, "grad_norm": 9.774271893547848e-05, "learning_rate": 9.53439789387435e-06, "loss": 0.0, "step": 226 }, { "epoch": 0.40134814078125863, "grad_norm": 0.0001532236929051578, "learning_rate": 9.319253712024317e-06, "loss": 0.0, "step": 227 }, { "epoch": 0.4031161942648765, "grad_norm": 0.00043412772356532514, "learning_rate": 9.106300144207982e-06, "loss": 0.0, "step": 228 }, { "epoch": 0.4048842477484944, "grad_norm": 0.0001928459678310901, "learning_rate": 8.895566020929561e-06, "loss": 0.0, "step": 229 }, { "epoch": 0.40665230123211227, "grad_norm": 0.00013222330017015338, "learning_rate": 8.68707987221603e-06, "loss": 0.0, "step": 230 }, { "epoch": 0.4084203547157301, "grad_norm": 0.00016468556714244187, "learning_rate": 8.480869923754565e-06, "loss": 0.0, "step": 231 }, { "epoch": 0.41018840819934804, "grad_norm": 1.7172653315356e-05, "learning_rate": 8.276964093071288e-06, "loss": 0.0, "step": 232 }, { "epoch": 0.4119564616829659, "grad_norm": 4.863995854975656e-05, "learning_rate": 8.075389985751642e-06, "loss": 0.0, "step": 233 }, { "epoch": 0.41372451516658376, "grad_norm": 1.6034869986469857e-05, "learning_rate": 7.876174891703052e-06, "loss": 0.0, "step": 234 }, { "epoch": 0.4154925686502017, "grad_norm": 0.020293768495321274, "learning_rate": 7.679345781460308e-06, "loss": 0.0, "step": 235 }, { "epoch": 0.41726062213381954, "grad_norm": 0.00036274161539040506, "learning_rate": 7.484929302534201e-06, "loss": 0.0, "step": 236 }, { "epoch": 0.41902867561743745, "grad_norm": 0.00016249797772616148, "learning_rate": 7.292951775803839e-06, "loss": 0.0, "step": 237 }, { "epoch": 0.4207967291010553, "grad_norm": 0.004641630221158266, "learning_rate": 7.10343919195327e-06, "loss": 0.0, "step": 238 }, { "epoch": 0.42256478258467317, "grad_norm": 0.0009545126813463867, "learning_rate": 6.916417207952713e-06, "loss": 0.0, "step": 239 }, { "epoch": 0.4243328360682911, "grad_norm": 0.00012078192958142608, "learning_rate": 6.731911143585042e-06, "loss": 0.0, "step": 240 }, { "epoch": 0.42610088955190895, "grad_norm": 0.0007262192084454, "learning_rate": 6.549945978017887e-06, "loss": 0.0, "step": 241 }, { "epoch": 0.4278689430355268, "grad_norm": 0.0009577589225955307, "learning_rate": 6.37054634642185e-06, "loss": 0.0, "step": 242 }, { "epoch": 0.4296369965191447, "grad_norm": 0.0585528202354908, "learning_rate": 6.193736536635314e-06, "loss": 0.0, "step": 243 }, { "epoch": 0.4314050500027626, "grad_norm": 7.095329783624038e-05, "learning_rate": 6.019540485876227e-06, "loss": 0.0, "step": 244 }, { "epoch": 0.43317310348638044, "grad_norm": 7.245037704706192e-05, "learning_rate": 5.847981777501426e-06, "loss": 0.0, "step": 245 }, { "epoch": 0.43494115696999835, "grad_norm": 5.6373453844571486e-05, "learning_rate": 5.6790836378137905e-06, "loss": 0.0, "step": 246 }, { "epoch": 0.4367092104536162, "grad_norm": 3.724493581103161e-05, "learning_rate": 5.5128689329177896e-06, "loss": 0.0, "step": 247 }, { "epoch": 0.4384772639372341, "grad_norm": 0.0006005808245390654, "learning_rate": 5.349360165623773e-06, "loss": 0.0, "step": 248 }, { "epoch": 0.440245317420852, "grad_norm": 3.0376237191376276e-05, "learning_rate": 5.188579472401481e-06, "loss": 0.0, "step": 249 }, { "epoch": 0.44201337090446985, "grad_norm": 0.00017754016153048724, "learning_rate": 5.0305486203830615e-06, "loss": 0.0, "step": 250 }, { "epoch": 0.44378142438808776, "grad_norm": 4.187566082691774e-05, "learning_rate": 4.8752890044161906e-06, "loss": 0.0, "step": 251 }, { "epoch": 0.4455494778717056, "grad_norm": 5.963912553852424e-05, "learning_rate": 4.722821644167564e-06, "loss": 0.0, "step": 252 }, { "epoch": 0.4473175313553235, "grad_norm": 0.00011405260738683864, "learning_rate": 4.573167181277125e-06, "loss": 0.0, "step": 253 }, { "epoch": 0.4490855848389414, "grad_norm": 7.98743076302344e-06, "learning_rate": 4.4263458765635525e-06, "loss": 0.0, "step": 254 }, { "epoch": 0.45085363832255926, "grad_norm": 0.00010863741772482172, "learning_rate": 4.282377607281252e-06, "loss": 0.0, "step": 255 }, { "epoch": 0.4526216918061771, "grad_norm": 9.043182217283174e-05, "learning_rate": 4.141281864429324e-06, "loss": 0.0, "step": 256 }, { "epoch": 0.45438974528979503, "grad_norm": 0.0006291704485192895, "learning_rate": 4.003077750112739e-06, "loss": 0.0, "step": 257 }, { "epoch": 0.4561577987734129, "grad_norm": 0.0001768024085322395, "learning_rate": 3.867783974956289e-06, "loss": 0.0, "step": 258 }, { "epoch": 0.45792585225703075, "grad_norm": 3.043546348635573e-05, "learning_rate": 3.735418855571405e-06, "loss": 0.0, "step": 259 }, { "epoch": 0.45969390574064867, "grad_norm": 1.1459591405582614e-05, "learning_rate": 3.6060003120764025e-06, "loss": 0.0, "step": 260 }, { "epoch": 0.4614619592242665, "grad_norm": 3.286511491751298e-05, "learning_rate": 3.479545865670413e-06, "loss": 0.0, "step": 261 }, { "epoch": 0.4632300127078844, "grad_norm": 1.5214385712170042e-05, "learning_rate": 3.356072636261248e-06, "loss": 0.0, "step": 262 }, { "epoch": 0.4649980661915023, "grad_norm": 2.3637854610569775e-05, "learning_rate": 3.235597340147676e-06, "loss": 0.0, "step": 263 }, { "epoch": 0.46676611967512016, "grad_norm": 0.00024508286151103675, "learning_rate": 3.1181362877562766e-06, "loss": 0.0, "step": 264 }, { "epoch": 0.4685341731587381, "grad_norm": 0.002545794239267707, "learning_rate": 3.00370538143329e-06, "loss": 0.0, "step": 265 }, { "epoch": 0.47030222664235594, "grad_norm": 2.2318992705550045e-05, "learning_rate": 2.8923201132916638e-06, "loss": 0.0, "step": 266 }, { "epoch": 0.4720702801259738, "grad_norm": 1.516962583991699e-05, "learning_rate": 2.783995563113711e-06, "loss": 0.0, "step": 267 }, { "epoch": 0.4738383336095917, "grad_norm": 0.000140973279485479, "learning_rate": 2.6787463963095105e-06, "loss": 0.0, "step": 268 }, { "epoch": 0.47560638709320957, "grad_norm": 0.01587841846048832, "learning_rate": 2.5765868619314814e-06, "loss": 0.0, "step": 269 }, { "epoch": 0.47737444057682743, "grad_norm": 1.07875666799373e-05, "learning_rate": 2.4775307907452464e-06, "loss": 0.0, "step": 270 }, { "epoch": 0.47914249406044535, "grad_norm": 0.0006835050880908966, "learning_rate": 2.381591593357204e-06, "loss": 0.0, "step": 271 }, { "epoch": 0.4809105475440632, "grad_norm": 1.7690043023321778e-05, "learning_rate": 2.2887822583989345e-06, "loss": 0.0, "step": 272 }, { "epoch": 0.48267860102768106, "grad_norm": 0.00046111404662951827, "learning_rate": 2.1991153507687385e-06, "loss": 0.0, "step": 273 }, { "epoch": 0.484446654511299, "grad_norm": 0.022714175283908844, "learning_rate": 2.104361506153305e-06, "loss": 0.0, "step": 274 }, { "epoch": 0.48621470799491684, "grad_norm": 4.023480505566113e-05, "learning_rate": 2.014008697185033e-06, "loss": 0.0, "step": 275 }, { "epoch": 0.4879827614785347, "grad_norm": 0.0869794487953186, "learning_rate": 1.9279975950776054e-06, "loss": 0.0, "step": 276 }, { "epoch": 0.4897508149621526, "grad_norm": 8.842632269079331e-06, "learning_rate": 1.8462681192674342e-06, "loss": 0.0, "step": 277 }, { "epoch": 0.4915188684457705, "grad_norm": 0.00010428605310153216, "learning_rate": 1.768759451931695e-06, "loss": 0.0, "step": 278 }, { "epoch": 0.4932869219293884, "grad_norm": 0.00014882163668517023, "learning_rate": 1.6954100526275048e-06, "loss": 0.0, "step": 279 }, { "epoch": 0.49505497541300625, "grad_norm": 2.4553623006795533e-05, "learning_rate": 1.6261576730494264e-06, "loss": 0.0, "step": 280 }, { "epoch": 0.4968230288966241, "grad_norm": 1.0228201063000597e-05, "learning_rate": 1.5609393719023522e-06, "loss": 0.0, "step": 281 }, { "epoch": 0.498591082380242, "grad_norm": 5.0795490096788853e-05, "learning_rate": 1.4996915298869766e-06, "loss": 0.0, "step": 282 } ], "logging_steps": 1, "max_steps": 282, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }