{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 210.173828125, "learning_rate": 6.000000000000001e-07, "loss": 8.791841125488281, "step": 10 }, { "epoch": 0.04, "grad_norm": 161.85198974609375, "learning_rate": 1.2666666666666669e-06, "loss": 6.186199951171875, "step": 20 }, { "epoch": 0.06, "grad_norm": 79.59968566894531, "learning_rate": 1.9333333333333336e-06, "loss": 5.037051010131836, "step": 30 }, { "epoch": 0.08, "grad_norm": 77.91535949707031, "learning_rate": 2.6e-06, "loss": 3.229383850097656, "step": 40 }, { "epoch": 0.1, "grad_norm": 45.42420959472656, "learning_rate": 3.266666666666667e-06, "loss": 2.6033956527709963, "step": 50 }, { "epoch": 0.12, "grad_norm": 41.46728515625, "learning_rate": 3.9333333333333335e-06, "loss": 2.942063903808594, "step": 60 }, { "epoch": 0.14, "grad_norm": 78.79148864746094, "learning_rate": 4.600000000000001e-06, "loss": 2.450759506225586, "step": 70 }, { "epoch": 0.16, "grad_norm": 62.13072967529297, "learning_rate": 5.2666666666666665e-06, "loss": 2.3624486923217773, "step": 80 }, { "epoch": 0.18, "grad_norm": 52.63612747192383, "learning_rate": 5.933333333333335e-06, "loss": 1.8120697021484375, "step": 90 }, { "epoch": 0.2, "grad_norm": 67.64952850341797, "learning_rate": 6.600000000000001e-06, "loss": 2.04521484375, "step": 100 }, { "epoch": 0.22, "grad_norm": 43.48116683959961, "learning_rate": 7.266666666666668e-06, "loss": 2.3016538619995117, "step": 110 }, { "epoch": 0.24, "grad_norm": 51.687042236328125, "learning_rate": 7.933333333333334e-06, "loss": 2.394601821899414, "step": 120 }, { "epoch": 0.26, "grad_norm": 66.71311950683594, "learning_rate": 8.6e-06, "loss": 2.0336360931396484, "step": 130 }, { "epoch": 0.28, "grad_norm": 44.05624771118164, "learning_rate": 9.266666666666667e-06, "loss": 2.5297100067138674, "step": 140 }, { "epoch": 0.3, "grad_norm": 43.44951629638672, "learning_rate": 9.933333333333334e-06, "loss": 2.532087707519531, "step": 150 }, { "epoch": 0.32, "grad_norm": 39.6384162902832, "learning_rate": 9.998903417374228e-06, "loss": 2.106338310241699, "step": 160 }, { "epoch": 0.34, "grad_norm": 38.4444694519043, "learning_rate": 9.995113378907791e-06, "loss": 1.8893505096435548, "step": 170 }, { "epoch": 0.36, "grad_norm": 40.845069885253906, "learning_rate": 9.988618398427495e-06, "loss": 2.187959671020508, "step": 180 }, { "epoch": 0.38, "grad_norm": 32.24748611450195, "learning_rate": 9.979421993079853e-06, "loss": 2.390948677062988, "step": 190 }, { "epoch": 0.4, "grad_norm": 47.72621536254883, "learning_rate": 9.967529142880592e-06, "loss": 2.4106063842773438, "step": 200 }, { "epoch": 0.42, "grad_norm": 25.50874137878418, "learning_rate": 9.952946288017899e-06, "loss": 2.082447624206543, "step": 210 }, { "epoch": 0.44, "grad_norm": 51.26837158203125, "learning_rate": 9.93568132536494e-06, "loss": 2.0642595291137695, "step": 220 }, { "epoch": 0.46, "grad_norm": 65.230712890625, "learning_rate": 9.915743604203563e-06, "loss": 2.5710067749023438, "step": 230 }, { "epoch": 0.48, "grad_norm": 37.009708404541016, "learning_rate": 9.893143921161501e-06, "loss": 2.123330307006836, "step": 240 }, { "epoch": 0.5, "grad_norm": 42.43490982055664, "learning_rate": 9.867894514365802e-06, "loss": 2.516407012939453, "step": 250 }, { "epoch": 0.52, "grad_norm": 33.466773986816406, "learning_rate": 9.840009056815674e-06, "loss": 2.5996524810791017, "step": 260 }, { "epoch": 0.54, "grad_norm": 38.300453186035156, "learning_rate": 9.809502648978311e-06, "loss": 2.2154964447021483, "step": 270 }, { "epoch": 0.56, "grad_norm": 41.977272033691406, "learning_rate": 9.776391810611719e-06, "loss": 2.7604133605957033, "step": 280 }, { "epoch": 0.58, "grad_norm": 39.470916748046875, "learning_rate": 9.740694471818988e-06, "loss": 2.422298240661621, "step": 290 }, { "epoch": 0.6, "grad_norm": 25.746841430664062, "learning_rate": 9.702429963338812e-06, "loss": 1.9244077682495118, "step": 300 }, { "epoch": 0.62, "grad_norm": 61.744041442871094, "learning_rate": 9.661619006077562e-06, "loss": 1.9998054504394531, "step": 310 }, { "epoch": 0.64, "grad_norm": 81.94461822509766, "learning_rate": 9.618283699888543e-06, "loss": 2.256130027770996, "step": 320 }, { "epoch": 0.66, "grad_norm": 35.797786712646484, "learning_rate": 9.572447511604536e-06, "loss": 2.06947021484375, "step": 330 }, { "epoch": 0.68, "grad_norm": 32.96639633178711, "learning_rate": 9.524135262330098e-06, "loss": 2.0332563400268553, "step": 340 }, { "epoch": 0.7, "grad_norm": 32.352699279785156, "learning_rate": 9.473373114000493e-06, "loss": 2.068455696105957, "step": 350 }, { "epoch": 0.72, "grad_norm": 38.09100341796875, "learning_rate": 9.420188555214537e-06, "loss": 2.32584171295166, "step": 360 }, { "epoch": 0.74, "grad_norm": 50.007049560546875, "learning_rate": 9.364610386349048e-06, "loss": 2.5027301788330076, "step": 370 }, { "epoch": 0.76, "grad_norm": 27.599956512451172, "learning_rate": 9.306668703962927e-06, "loss": 2.2121116638183596, "step": 380 }, { "epoch": 0.78, "grad_norm": 36.08063507080078, "learning_rate": 9.246394884499334e-06, "loss": 2.542919158935547, "step": 390 }, { "epoch": 0.8, "grad_norm": 49.41411209106445, "learning_rate": 9.18382156729481e-06, "loss": 1.9286565780639648, "step": 400 }, { "epoch": 0.82, "grad_norm": 54.925540924072266, "learning_rate": 9.118982636904476e-06, "loss": 2.313096809387207, "step": 410 }, { "epoch": 0.84, "grad_norm": 56.74051284790039, "learning_rate": 9.051913204752972e-06, "loss": 2.0929283142089843, "step": 420 }, { "epoch": 0.86, "grad_norm": 24.994333267211914, "learning_rate": 8.982649590120982e-06, "loss": 2.0113702774047852, "step": 430 }, { "epoch": 0.88, "grad_norm": 41.89128494262695, "learning_rate": 8.911229300477716e-06, "loss": 1.6521787643432617, "step": 440 }, { "epoch": 0.9, "grad_norm": 44.65105438232422, "learning_rate": 8.837691011169944e-06, "loss": 2.2836629867553713, "step": 450 }, { "epoch": 0.92, "grad_norm": 42.151737213134766, "learning_rate": 8.762074544478622e-06, "loss": 1.705312156677246, "step": 460 }, { "epoch": 0.94, "grad_norm": 36.330989837646484, "learning_rate": 8.68442084805442e-06, "loss": 1.4402057647705078, "step": 470 }, { "epoch": 0.96, "grad_norm": 27.495149612426758, "learning_rate": 8.604771972743848e-06, "loss": 1.9172407150268556, "step": 480 }, { "epoch": 0.98, "grad_norm": 17.782899856567383, "learning_rate": 8.523171049817974e-06, "loss": 1.5030399322509767, "step": 490 }, { "epoch": 1.0, "grad_norm": 34.83807373046875, "learning_rate": 8.439662267616093e-06, "loss": 1.9837615966796875, "step": 500 }, { "epoch": 1.02, "grad_norm": 13.142373085021973, "learning_rate": 8.354290847616954e-06, "loss": 1.2435311317443847, "step": 510 }, { "epoch": 1.04, "grad_norm": 37.417720794677734, "learning_rate": 8.267103019950529e-06, "loss": 1.23653564453125, "step": 520 }, { "epoch": 1.06, "grad_norm": 18.470767974853516, "learning_rate": 8.178145998363585e-06, "loss": 0.8462669372558593, "step": 530 }, { "epoch": 1.08, "grad_norm": 27.512048721313477, "learning_rate": 8.087467954652608e-06, "loss": 1.2925146102905274, "step": 540 }, { "epoch": 1.1, "grad_norm": 37.81305694580078, "learning_rate": 7.99511799257793e-06, "loss": 0.901913833618164, "step": 550 }, { "epoch": 1.12, "grad_norm": 37.69697570800781, "learning_rate": 7.901146121273165e-06, "loss": 1.3357799530029297, "step": 560 }, { "epoch": 1.1400000000000001, "grad_norm": 34.828433990478516, "learning_rate": 7.80560322816439e-06, "loss": 1.3306777954101563, "step": 570 }, { "epoch": 1.16, "grad_norm": 38.92500305175781, "learning_rate": 7.7085410514137e-06, "loss": 0.944632625579834, "step": 580 }, { "epoch": 1.18, "grad_norm": 19.071941375732422, "learning_rate": 7.610012151902091e-06, "loss": 1.3212141036987304, "step": 590 }, { "epoch": 1.2, "grad_norm": 34.952938079833984, "learning_rate": 7.510069884766802e-06, "loss": 1.2321388244628906, "step": 600 }, { "epoch": 1.22, "grad_norm": 32.1840934753418, "learning_rate": 7.408768370508577e-06, "loss": 1.3100163459777832, "step": 610 }, { "epoch": 1.24, "grad_norm": 23.4957275390625, "learning_rate": 7.3061624656844544e-06, "loss": 1.179959774017334, "step": 620 }, { "epoch": 1.26, "grad_norm": 40.54483413696289, "learning_rate": 7.2023077332019755e-06, "loss": 0.8590527534484863, "step": 630 }, { "epoch": 1.28, "grad_norm": 42.49688720703125, "learning_rate": 7.0972604122308865e-06, "loss": 1.3245353698730469, "step": 640 }, { "epoch": 1.3, "grad_norm": 38.857479095458984, "learning_rate": 6.991077387748643e-06, "loss": 0.9064340591430664, "step": 650 }, { "epoch": 1.32, "grad_norm": 24.489877700805664, "learning_rate": 6.883816159736187e-06, "loss": 1.3220383644104003, "step": 660 }, { "epoch": 1.34, "grad_norm": 18.46355628967285, "learning_rate": 6.775534812040686e-06, "loss": 0.7077834129333496, "step": 670 }, { "epoch": 1.3599999999999999, "grad_norm": 24.25245475769043, "learning_rate": 6.666291980922122e-06, "loss": 0.9919247627258301, "step": 680 }, { "epoch": 1.38, "grad_norm": 48.17041778564453, "learning_rate": 6.556146823300701e-06, "loss": 1.6220426559448242, "step": 690 }, { "epoch": 1.4, "grad_norm": 47.63260269165039, "learning_rate": 6.445158984722358e-06, "loss": 1.3412548065185548, "step": 700 }, { "epoch": 1.42, "grad_norm": 24.807323455810547, "learning_rate": 6.3333885670596285e-06, "loss": 1.2461795806884766, "step": 710 }, { "epoch": 1.44, "grad_norm": 11.098411560058594, "learning_rate": 6.220896095965428e-06, "loss": 1.0722078323364257, "step": 720 }, { "epoch": 1.46, "grad_norm": 36.12248611450195, "learning_rate": 6.107742488097338e-06, "loss": 0.83670015335083, "step": 730 }, { "epoch": 1.48, "grad_norm": 39.64963150024414, "learning_rate": 5.993989018130173e-06, "loss": 1.1005640029907227, "step": 740 }, { "epoch": 1.5, "grad_norm": 35.805389404296875, "learning_rate": 5.879697285574655e-06, "loss": 1.5326812744140625, "step": 750 }, { "epoch": 1.52, "grad_norm": 20.854694366455078, "learning_rate": 5.764929181420191e-06, "loss": 1.028077983856201, "step": 760 }, { "epoch": 1.54, "grad_norm": 19.16396141052246, "learning_rate": 5.649746854619814e-06, "loss": 1.1157980918884278, "step": 770 }, { "epoch": 1.56, "grad_norm": 26.597848892211914, "learning_rate": 5.5342126784354265e-06, "loss": 1.2585329055786132, "step": 780 }, { "epoch": 1.58, "grad_norm": 43.42577362060547, "learning_rate": 5.41838921666158e-06, "loss": 1.1203573226928711, "step": 790 }, { "epoch": 1.6, "grad_norm": 63.867942810058594, "learning_rate": 5.3023391897460715e-06, "loss": 1.0436135292053224, "step": 800 }, { "epoch": 1.62, "grad_norm": 36.27252960205078, "learning_rate": 5.18612544082573e-06, "loss": 0.9347926139831543, "step": 810 }, { "epoch": 1.6400000000000001, "grad_norm": 33.137611389160156, "learning_rate": 5.069810901695727e-06, "loss": 1.5266364097595215, "step": 820 }, { "epoch": 1.6600000000000001, "grad_norm": 46.118202209472656, "learning_rate": 4.953458558730917e-06, "loss": 1.1257129669189454, "step": 830 }, { "epoch": 1.6800000000000002, "grad_norm": 31.607257843017578, "learning_rate": 4.837131418777595e-06, "loss": 0.9643683433532715, "step": 840 }, { "epoch": 1.7, "grad_norm": 50.87643814086914, "learning_rate": 4.720892475034181e-06, "loss": 1.3242192268371582, "step": 850 }, { "epoch": 1.72, "grad_norm": 27.850143432617188, "learning_rate": 4.604804672939295e-06, "loss": 1.251343059539795, "step": 860 }, { "epoch": 1.74, "grad_norm": 45.26017379760742, "learning_rate": 4.4889308760856826e-06, "loss": 1.1028053283691406, "step": 870 }, { "epoch": 1.76, "grad_norm": 3.31717586517334, "learning_rate": 4.373333832178478e-06, "loss": 0.8769556045532226, "step": 880 }, { "epoch": 1.78, "grad_norm": 26.864580154418945, "learning_rate": 4.258076139056217e-06, "loss": 1.1681681632995606, "step": 890 }, { "epoch": 1.8, "grad_norm": 33.86524200439453, "learning_rate": 4.143220210792993e-06, "loss": 0.9344164848327636, "step": 900 }, { "epoch": 1.8199999999999998, "grad_norm": 25.299028396606445, "learning_rate": 4.028828243900141e-06, "loss": 1.634626579284668, "step": 910 }, { "epoch": 1.8399999999999999, "grad_norm": 28.403207778930664, "learning_rate": 3.914962183645733e-06, "loss": 1.0626420974731445, "step": 920 }, { "epoch": 1.8599999999999999, "grad_norm": 29.372901916503906, "learning_rate": 3.8016836905101157e-06, "loss": 1.375041389465332, "step": 930 }, { "epoch": 1.88, "grad_norm": 21.600330352783203, "learning_rate": 3.6890541067956775e-06, "loss": 0.9415129661560059, "step": 940 }, { "epoch": 1.9, "grad_norm": 20.263734817504883, "learning_rate": 3.577134423408906e-06, "loss": 0.8620157241821289, "step": 950 }, { "epoch": 1.92, "grad_norm": 37.88676834106445, "learning_rate": 3.465985246832739e-06, "loss": 1.1039237976074219, "step": 960 }, { "epoch": 1.94, "grad_norm": 8.769469261169434, "learning_rate": 3.355666766307084e-06, "loss": 0.8490506172180176, "step": 970 }, { "epoch": 1.96, "grad_norm": 44.77504348754883, "learning_rate": 3.246238721235283e-06, "loss": 1.2799974441528321, "step": 980 }, { "epoch": 1.98, "grad_norm": 15.338781356811523, "learning_rate": 3.137760368834169e-06, "loss": 0.8015480995178222, "step": 990 }, { "epoch": 2.0, "grad_norm": 33.2166748046875, "learning_rate": 3.030290452045245e-06, "loss": 1.091315746307373, "step": 1000 }, { "epoch": 2.02, "grad_norm": 4.909487247467041, "learning_rate": 2.9238871677243354e-06, "loss": 0.47315549850463867, "step": 1010 }, { "epoch": 2.04, "grad_norm": 39.29719161987305, "learning_rate": 2.818608135126967e-06, "loss": 0.3728752613067627, "step": 1020 }, { "epoch": 2.06, "grad_norm": 21.967544555664062, "learning_rate": 2.714510364706531e-06, "loss": 0.33084559440612793, "step": 1030 }, { "epoch": 2.08, "grad_norm": 25.645288467407227, "learning_rate": 2.611650227242102e-06, "loss": 0.32590973377227783, "step": 1040 }, { "epoch": 2.1, "grad_norm": 18.202743530273438, "learning_rate": 2.5100834233126827e-06, "loss": 0.4684451580047607, "step": 1050 }, { "epoch": 2.12, "grad_norm": 0.5234746336936951, "learning_rate": 2.40986495313435e-06, "loss": 0.26931188106536863, "step": 1060 }, { "epoch": 2.14, "grad_norm": 21.24897575378418, "learning_rate": 2.3110490867766644e-06, "loss": 0.499393892288208, "step": 1070 }, { "epoch": 2.16, "grad_norm": 13.816218376159668, "learning_rate": 2.213689334774479e-06, "loss": 0.3078450202941895, "step": 1080 }, { "epoch": 2.18, "grad_norm": 23.9191837310791, "learning_rate": 2.1178384191510344e-06, "loss": 0.47670416831970214, "step": 1090 }, { "epoch": 2.2, "grad_norm": 34.2762451171875, "learning_rate": 2.023548244868051e-06, "loss": 0.47382168769836425, "step": 1100 }, { "epoch": 2.22, "grad_norm": 19.34497833251953, "learning_rate": 1.9308698717182874e-06, "loss": 0.19442765712738036, "step": 1110 }, { "epoch": 2.24, "grad_norm": 28.95246124267578, "learning_rate": 1.8398534866757455e-06, "loss": 0.44660329818725586, "step": 1120 }, { "epoch": 2.26, "grad_norm": 1.9053665399551392, "learning_rate": 1.7505483767185583e-06, "loss": 0.36274888515472414, "step": 1130 }, { "epoch": 2.2800000000000002, "grad_norm": 31.379884719848633, "learning_rate": 1.6630029021392007e-06, "loss": 0.24480688571929932, "step": 1140 }, { "epoch": 2.3, "grad_norm": 17.360788345336914, "learning_rate": 1.5772644703565564e-06, "loss": 0.14373520612716675, "step": 1150 }, { "epoch": 2.32, "grad_norm": 36.79311752319336, "learning_rate": 1.4933795102439558e-06, "loss": 0.34253692626953125, "step": 1160 }, { "epoch": 2.34, "grad_norm": 9.673837661743164, "learning_rate": 1.4113934469871166e-06, "loss": 0.1978028893470764, "step": 1170 }, { "epoch": 2.36, "grad_norm": 0.5280386805534363, "learning_rate": 1.3313506774856177e-06, "loss": 0.29554598331451415, "step": 1180 }, { "epoch": 2.38, "grad_norm": 1.7119940519332886, "learning_rate": 1.2532945463111856e-06, "loss": 0.3423331260681152, "step": 1190 }, { "epoch": 2.4, "grad_norm": 9.490172386169434, "learning_rate": 1.1772673222358421e-06, "loss": 0.3274805784225464, "step": 1200 }, { "epoch": 2.42, "grad_norm": 2.7689626216888428, "learning_rate": 1.1033101753426285e-06, "loss": 0.29171273708343504, "step": 1210 }, { "epoch": 2.44, "grad_norm": 0.009632566943764687, "learning_rate": 1.0314631547312738e-06, "loss": 0.2250969171524048, "step": 1220 }, { "epoch": 2.46, "grad_norm": 46.948333740234375, "learning_rate": 9.617651668308914e-07, "loss": 0.2975991487503052, "step": 1230 }, { "epoch": 2.48, "grad_norm": 0.1608089804649353, "learning_rate": 8.942539543314799e-07, "loss": 0.3745620012283325, "step": 1240 }, { "epoch": 2.5, "grad_norm": 41.225494384765625, "learning_rate": 8.289660757455803e-07, "loss": 0.3142155885696411, "step": 1250 }, { "epoch": 2.52, "grad_norm": 38.09383010864258, "learning_rate": 7.659368856111926e-07, "loss": 0.4468146800994873, "step": 1260 }, { "epoch": 2.54, "grad_norm": 25.196218490600586, "learning_rate": 7.052005153466779e-07, "loss": 0.514287281036377, "step": 1270 }, { "epoch": 2.56, "grad_norm": 22.963035583496094, "learning_rate": 6.467898547679913e-07, "loss": 0.21953434944152833, "step": 1280 }, { "epoch": 2.58, "grad_norm": 17.07082176208496, "learning_rate": 5.9073653427826e-07, "loss": 0.3766503095626831, "step": 1290 }, { "epoch": 2.6, "grad_norm": 38.808040618896484, "learning_rate": 5.370709077393721e-07, "loss": 0.3407785892486572, "step": 1300 }, { "epoch": 2.62, "grad_norm": 52.602874755859375, "learning_rate": 4.858220360348187e-07, "loss": 0.2537382125854492, "step": 1310 }, { "epoch": 2.64, "grad_norm": 19.56964874267578, "learning_rate": 4.370176713327118e-07, "loss": 0.25959508419036864, "step": 1320 }, { "epoch": 2.66, "grad_norm": 5.28275203704834, "learning_rate": 3.90684242057498e-07, "loss": 0.43554534912109377, "step": 1330 }, { "epoch": 2.68, "grad_norm": 31.416179656982422, "learning_rate": 3.468468385785023e-07, "loss": 0.5232177257537842, "step": 1340 }, { "epoch": 2.7, "grad_norm": 46.08461380004883, "learning_rate": 3.055291996230492e-07, "loss": 0.2125147581100464, "step": 1350 }, { "epoch": 2.7199999999999998, "grad_norm": 11.485236167907715, "learning_rate": 2.6675369942151864e-07, "loss": 0.21367454528808594, "step": 1360 }, { "epoch": 2.74, "grad_norm": 31.298377990722656, "learning_rate": 2.3054133559131163e-07, "loss": 0.1693113088607788, "step": 1370 }, { "epoch": 2.76, "grad_norm": 30.015377044677734, "learning_rate": 1.9691171776626882e-07, "loss": 0.2792075157165527, "step": 1380 }, { "epoch": 2.7800000000000002, "grad_norm": 0.11436483263969421, "learning_rate": 1.6588305697770313e-07, "loss": 0.13291265964508056, "step": 1390 }, { "epoch": 2.8, "grad_norm": 54.67913818359375, "learning_rate": 1.374721557928116e-07, "loss": 0.4104594707489014, "step": 1400 }, { "epoch": 2.82, "grad_norm": 29.346342086791992, "learning_rate": 1.1169439921578485e-07, "loss": 0.5142350196838379, "step": 1410 }, { "epoch": 2.84, "grad_norm": 0.025704028084874153, "learning_rate": 8.856374635655696e-08, "loss": 0.31447272300720214, "step": 1420 }, { "epoch": 2.86, "grad_norm": 14.04995059967041, "learning_rate": 6.809272287169988e-08, "loss": 0.32525787353515623, "step": 1430 }, { "epoch": 2.88, "grad_norm": 5.467782020568848, "learning_rate": 5.029241418156139e-08, "loss": 0.1824193000793457, "step": 1440 }, { "epoch": 2.9, "grad_norm": 14.223469734191895, "learning_rate": 3.517245946731529e-08, "loss": 0.2836474895477295, "step": 1450 }, { "epoch": 2.92, "grad_norm": 18.699003219604492, "learning_rate": 2.27410464511707e-08, "loss": 0.20449333190917968, "step": 1460 }, { "epoch": 2.94, "grad_norm": 12.462102890014648, "learning_rate": 1.3004906962578723e-08, "loss": 0.30746448040008545, "step": 1470 }, { "epoch": 2.96, "grad_norm": 2.1692745685577393, "learning_rate": 5.969313292830126e-09, "loss": 0.2320650577545166, "step": 1480 }, { "epoch": 2.98, "grad_norm": 30.105022430419922, "learning_rate": 1.638075340010814e-09, "loss": 0.39888916015625, "step": 1490 }, { "epoch": 3.0, "grad_norm": 50.01400375366211, "learning_rate": 1.3538545881042198e-11, "loss": 0.3253091096878052, "step": 1500 }, { "epoch": 3.0, "step": 1500, "total_flos": 2.273097293733888e+16, "train_loss": 1.3163418625195822, "train_runtime": 1442.119, "train_samples_per_second": 2.08, "train_steps_per_second": 1.04 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.273097293733888e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }