{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6246719160104988, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013123359580052493, "grad_norm": 23.86959882889253, "learning_rate": 3.930131004366813e-07, "loss": 3.6671, "step": 10 }, { "epoch": 0.026246719160104987, "grad_norm": 14.462567513995324, "learning_rate": 8.296943231441049e-07, "loss": 3.5017, "step": 20 }, { "epoch": 0.03937007874015748, "grad_norm": 9.814656935541821, "learning_rate": 1.2663755458515283e-06, "loss": 3.0198, "step": 30 }, { "epoch": 0.05249343832020997, "grad_norm": 5.878057728224257, "learning_rate": 1.703056768558952e-06, "loss": 2.5872, "step": 40 }, { "epoch": 0.06561679790026247, "grad_norm": 3.906259383681401, "learning_rate": 2.1397379912663756e-06, "loss": 2.2781, "step": 50 }, { "epoch": 0.07874015748031496, "grad_norm": 4.4867149824774835, "learning_rate": 2.576419213973799e-06, "loss": 2.0297, "step": 60 }, { "epoch": 0.09186351706036745, "grad_norm": 4.808687465121692, "learning_rate": 3.0131004366812227e-06, "loss": 1.8012, "step": 70 }, { "epoch": 0.10498687664041995, "grad_norm": 4.8883051654567256, "learning_rate": 3.4497816593886467e-06, "loss": 1.6206, "step": 80 }, { "epoch": 0.11811023622047244, "grad_norm": 4.85572751158142, "learning_rate": 3.88646288209607e-06, "loss": 1.4667, "step": 90 }, { "epoch": 0.13123359580052493, "grad_norm": 4.149268447654452, "learning_rate": 4.323144104803494e-06, "loss": 1.3442, "step": 100 }, { "epoch": 0.14435695538057744, "grad_norm": 3.8514308742742105, "learning_rate": 4.759825327510917e-06, "loss": 1.2591, "step": 110 }, { "epoch": 0.15748031496062992, "grad_norm": 2.3342255673234478, "learning_rate": 5.196506550218341e-06, "loss": 1.1949, "step": 120 }, { "epoch": 0.17060367454068243, "grad_norm": 2.260928208792793, "learning_rate": 5.6331877729257645e-06, "loss": 1.1603, "step": 130 }, { "epoch": 0.1837270341207349, "grad_norm": 2.057320468921841, "learning_rate": 6.069868995633188e-06, "loss": 1.1186, "step": 140 }, { "epoch": 0.1968503937007874, "grad_norm": 1.8563167183407243, "learning_rate": 6.5065502183406116e-06, "loss": 1.079, "step": 150 }, { "epoch": 0.2099737532808399, "grad_norm": 1.8461471131985807, "learning_rate": 6.943231441048035e-06, "loss": 1.0464, "step": 160 }, { "epoch": 0.2230971128608924, "grad_norm": 2.034240834591269, "learning_rate": 7.3799126637554595e-06, "loss": 1.0376, "step": 170 }, { "epoch": 0.23622047244094488, "grad_norm": 1.7734341028229181, "learning_rate": 7.816593886462883e-06, "loss": 1.0199, "step": 180 }, { "epoch": 0.24934383202099739, "grad_norm": 1.9112436701720903, "learning_rate": 8.253275109170307e-06, "loss": 1.0007, "step": 190 }, { "epoch": 0.26246719160104987, "grad_norm": 1.8822173865457565, "learning_rate": 8.68995633187773e-06, "loss": 0.9955, "step": 200 }, { "epoch": 0.2755905511811024, "grad_norm": 1.7697350327707664, "learning_rate": 9.126637554585154e-06, "loss": 0.9728, "step": 210 }, { "epoch": 0.2887139107611549, "grad_norm": 1.817013692625474, "learning_rate": 9.563318777292577e-06, "loss": 0.9613, "step": 220 }, { "epoch": 0.30183727034120733, "grad_norm": 1.7196715695129818, "learning_rate": 1e-05, "loss": 0.9657, "step": 230 }, { "epoch": 0.31496062992125984, "grad_norm": 1.8850637599616171, "learning_rate": 9.999416873566297e-06, "loss": 0.9442, "step": 240 }, { "epoch": 0.32808398950131235, "grad_norm": 1.7285303777141385, "learning_rate": 9.997667630279758e-06, "loss": 0.932, "step": 250 }, { "epoch": 0.34120734908136485, "grad_norm": 1.7265089317333577, "learning_rate": 9.994752678152384e-06, "loss": 0.9357, "step": 260 }, { "epoch": 0.3543307086614173, "grad_norm": 1.8320254043398032, "learning_rate": 9.990672697098431e-06, "loss": 0.9222, "step": 270 }, { "epoch": 0.3674540682414698, "grad_norm": 1.7044024495030916, "learning_rate": 9.985428638775822e-06, "loss": 0.9216, "step": 280 }, { "epoch": 0.3805774278215223, "grad_norm": 2.2575206943279382, "learning_rate": 9.979021726364164e-06, "loss": 0.897, "step": 290 }, { "epoch": 0.3937007874015748, "grad_norm": 1.7729314927057351, "learning_rate": 9.971453454279454e-06, "loss": 0.8844, "step": 300 }, { "epoch": 0.4068241469816273, "grad_norm": 2.35007808422404, "learning_rate": 9.962725587825492e-06, "loss": 0.8832, "step": 310 }, { "epoch": 0.4199475065616798, "grad_norm": 2.2653008942496, "learning_rate": 9.95284016278214e-06, "loss": 0.8803, "step": 320 }, { "epoch": 0.4330708661417323, "grad_norm": 1.986794023664208, "learning_rate": 9.941799484930454e-06, "loss": 0.8741, "step": 330 }, { "epoch": 0.4461942257217848, "grad_norm": 1.5861501803724807, "learning_rate": 9.929606129514875e-06, "loss": 0.8636, "step": 340 }, { "epoch": 0.45931758530183725, "grad_norm": 1.8008905768440275, "learning_rate": 9.916262940642549e-06, "loss": 0.8656, "step": 350 }, { "epoch": 0.47244094488188976, "grad_norm": 1.765844088914314, "learning_rate": 9.90177303061993e-06, "loss": 0.8642, "step": 360 }, { "epoch": 0.48556430446194226, "grad_norm": 1.8318811699836053, "learning_rate": 9.88613977922684e-06, "loss": 0.8537, "step": 370 }, { "epoch": 0.49868766404199477, "grad_norm": 1.66695173639277, "learning_rate": 9.869366832928134e-06, "loss": 0.8535, "step": 380 }, { "epoch": 0.5118110236220472, "grad_norm": 1.8171270864493383, "learning_rate": 9.851458104023153e-06, "loss": 0.8367, "step": 390 }, { "epoch": 0.5249343832020997, "grad_norm": 1.773784462102006, "learning_rate": 9.832417769733185e-06, "loss": 0.8302, "step": 400 }, { "epoch": 0.5380577427821522, "grad_norm": 1.908924552643567, "learning_rate": 9.812250271227123e-06, "loss": 0.8403, "step": 410 }, { "epoch": 0.5511811023622047, "grad_norm": 1.673461507192779, "learning_rate": 9.790960312585561e-06, "loss": 0.8347, "step": 420 }, { "epoch": 0.5643044619422573, "grad_norm": 1.5727733661189518, "learning_rate": 9.76855285970356e-06, "loss": 0.8284, "step": 430 }, { "epoch": 0.5774278215223098, "grad_norm": 1.6740839799492047, "learning_rate": 9.745033139132352e-06, "loss": 0.828, "step": 440 }, { "epoch": 0.5905511811023622, "grad_norm": 1.7521499040025525, "learning_rate": 9.720406636860252e-06, "loss": 0.8164, "step": 450 }, { "epoch": 0.6036745406824147, "grad_norm": 1.5716458693651338, "learning_rate": 9.694679097033038e-06, "loss": 0.8158, "step": 460 }, { "epoch": 0.6167979002624672, "grad_norm": 1.869817821288055, "learning_rate": 9.667856520614128e-06, "loss": 0.8152, "step": 470 }, { "epoch": 0.6299212598425197, "grad_norm": 1.7933186182662169, "learning_rate": 9.639945163984852e-06, "loss": 0.8229, "step": 480 }, { "epoch": 0.6430446194225722, "grad_norm": 1.6201846678042537, "learning_rate": 9.610951537485152e-06, "loss": 0.8175, "step": 490 }, { "epoch": 0.6561679790026247, "grad_norm": 1.7811525807024158, "learning_rate": 9.580882403895038e-06, "loss": 0.8053, "step": 500 }, { "epoch": 0.6692913385826772, "grad_norm": 1.473703478611248, "learning_rate": 9.549744776857162e-06, "loss": 0.8086, "step": 510 }, { "epoch": 0.6824146981627297, "grad_norm": 1.5936730653642202, "learning_rate": 9.51754591924089e-06, "loss": 0.8102, "step": 520 }, { "epoch": 0.6955380577427821, "grad_norm": 1.6012042279794647, "learning_rate": 9.484293341448221e-06, "loss": 0.7944, "step": 530 }, { "epoch": 0.7086614173228346, "grad_norm": 1.5905348724723356, "learning_rate": 9.449994799662e-06, "loss": 0.7958, "step": 540 }, { "epoch": 0.7217847769028871, "grad_norm": 1.5342424313908625, "learning_rate": 9.414658294036768e-06, "loss": 0.8018, "step": 550 }, { "epoch": 0.7349081364829396, "grad_norm": 1.575920836396293, "learning_rate": 9.378292066832723e-06, "loss": 0.7928, "step": 560 }, { "epoch": 0.7480314960629921, "grad_norm": 1.524275013695747, "learning_rate": 9.34090460049322e-06, "loss": 0.7939, "step": 570 }, { "epoch": 0.7611548556430446, "grad_norm": 1.5915377546345706, "learning_rate": 9.302504615666222e-06, "loss": 0.7943, "step": 580 }, { "epoch": 0.7742782152230971, "grad_norm": 1.440116819392148, "learning_rate": 9.26310106917021e-06, "loss": 0.7963, "step": 590 }, { "epoch": 0.7874015748031497, "grad_norm": 1.3528278926960116, "learning_rate": 9.222703151905005e-06, "loss": 0.7914, "step": 600 }, { "epoch": 0.800524934383202, "grad_norm": 1.5465191371552307, "learning_rate": 9.181320286707974e-06, "loss": 0.7927, "step": 610 }, { "epoch": 0.8136482939632546, "grad_norm": 1.648064371427012, "learning_rate": 9.138962126156157e-06, "loss": 0.796, "step": 620 }, { "epoch": 0.8267716535433071, "grad_norm": 1.4758839650496245, "learning_rate": 9.095638550314794e-06, "loss": 0.7933, "step": 630 }, { "epoch": 0.8398950131233596, "grad_norm": 1.7226329473102002, "learning_rate": 9.051359664432795e-06, "loss": 0.7804, "step": 640 }, { "epoch": 0.8530183727034121, "grad_norm": 1.612826825222584, "learning_rate": 9.006135796585688e-06, "loss": 0.7836, "step": 650 }, { "epoch": 0.8661417322834646, "grad_norm": 1.4463880498485353, "learning_rate": 8.95997749526658e-06, "loss": 0.7811, "step": 660 }, { "epoch": 0.8792650918635171, "grad_norm": 1.604651553438199, "learning_rate": 8.912895526925726e-06, "loss": 0.7781, "step": 670 }, { "epoch": 0.8923884514435696, "grad_norm": 1.5684616487142087, "learning_rate": 8.86490087345924e-06, "loss": 0.7814, "step": 680 }, { "epoch": 0.905511811023622, "grad_norm": 1.4869728238675464, "learning_rate": 8.816004729647573e-06, "loss": 0.7827, "step": 690 }, { "epoch": 0.9186351706036745, "grad_norm": 1.50603165520279, "learning_rate": 8.766218500544305e-06, "loss": 0.7848, "step": 700 }, { "epoch": 0.931758530183727, "grad_norm": 1.5149074570283008, "learning_rate": 8.715553798815925e-06, "loss": 0.7769, "step": 710 }, { "epoch": 0.9448818897637795, "grad_norm": 1.544976291499801, "learning_rate": 8.66402244203317e-06, "loss": 0.7613, "step": 720 }, { "epoch": 0.958005249343832, "grad_norm": 1.555367156177007, "learning_rate": 8.611636449914563e-06, "loss": 0.7668, "step": 730 }, { "epoch": 0.9711286089238845, "grad_norm": 1.504586391232175, "learning_rate": 8.558408041522801e-06, "loss": 0.7654, "step": 740 }, { "epoch": 0.984251968503937, "grad_norm": 1.4898212640427306, "learning_rate": 8.504349632414675e-06, "loss": 0.764, "step": 750 }, { "epoch": 0.9973753280839895, "grad_norm": 1.401986546883637, "learning_rate": 8.449473831745106e-06, "loss": 0.7583, "step": 760 }, { "epoch": 1.010498687664042, "grad_norm": 1.4416543605053558, "learning_rate": 8.393793439326071e-06, "loss": 0.7103, "step": 770 }, { "epoch": 1.0236220472440944, "grad_norm": 1.6403171843794115, "learning_rate": 8.337321442641036e-06, "loss": 0.7034, "step": 780 }, { "epoch": 1.036745406824147, "grad_norm": 1.5388511360135515, "learning_rate": 8.28007101381561e-06, "loss": 0.7011, "step": 790 }, { "epoch": 1.0498687664041995, "grad_norm": 1.4334465678780328, "learning_rate": 8.22205550654515e-06, "loss": 0.7079, "step": 800 }, { "epoch": 1.0629921259842519, "grad_norm": 1.5587791264347304, "learning_rate": 8.16328845298e-06, "loss": 0.6971, "step": 810 }, { "epoch": 1.0761154855643045, "grad_norm": 1.5233569268398837, "learning_rate": 8.103783560569104e-06, "loss": 0.7088, "step": 820 }, { "epoch": 1.0892388451443569, "grad_norm": 1.4976958999381966, "learning_rate": 8.04355470886274e-06, "loss": 0.6934, "step": 830 }, { "epoch": 1.1023622047244095, "grad_norm": 1.3430067666073104, "learning_rate": 7.98261594627511e-06, "loss": 0.7011, "step": 840 }, { "epoch": 1.1154855643044619, "grad_norm": 1.4631861658373042, "learning_rate": 7.920981486807537e-06, "loss": 0.7053, "step": 850 }, { "epoch": 1.1286089238845145, "grad_norm": 1.3662308568396804, "learning_rate": 7.858665706733035e-06, "loss": 0.6999, "step": 860 }, { "epoch": 1.141732283464567, "grad_norm": 1.5745244501174307, "learning_rate": 7.795683141243046e-06, "loss": 0.7036, "step": 870 }, { "epoch": 1.1548556430446195, "grad_norm": 1.4028684322219405, "learning_rate": 7.732048481057088e-06, "loss": 0.7017, "step": 880 }, { "epoch": 1.167979002624672, "grad_norm": 1.684251036089208, "learning_rate": 7.667776568996143e-06, "loss": 0.6849, "step": 890 }, { "epoch": 1.1811023622047245, "grad_norm": 1.4835499788707966, "learning_rate": 7.602882396520559e-06, "loss": 0.6951, "step": 900 }, { "epoch": 1.194225721784777, "grad_norm": 1.5123572551106819, "learning_rate": 7.5373811002332785e-06, "loss": 0.6962, "step": 910 }, { "epoch": 1.2073490813648293, "grad_norm": 1.8825750275150834, "learning_rate": 7.47128795834923e-06, "loss": 0.7017, "step": 920 }, { "epoch": 1.220472440944882, "grad_norm": 1.334943117481765, "learning_rate": 7.4046183871316544e-06, "loss": 0.6973, "step": 930 }, { "epoch": 1.2335958005249343, "grad_norm": 1.5838501251475126, "learning_rate": 7.337387937296278e-06, "loss": 0.6944, "step": 940 }, { "epoch": 1.246719160104987, "grad_norm": 1.342799954640876, "learning_rate": 7.269612290384076e-06, "loss": 0.6822, "step": 950 }, { "epoch": 1.2598425196850394, "grad_norm": 1.3443879066347948, "learning_rate": 7.201307255103561e-06, "loss": 0.692, "step": 960 }, { "epoch": 1.272965879265092, "grad_norm": 1.586723729979877, "learning_rate": 7.132488763643384e-06, "loss": 0.6925, "step": 970 }, { "epoch": 1.2860892388451444, "grad_norm": 1.3370936442314167, "learning_rate": 7.063172867956143e-06, "loss": 0.6904, "step": 980 }, { "epoch": 1.2992125984251968, "grad_norm": 1.5378877632776058, "learning_rate": 6.993375736014259e-06, "loss": 0.6903, "step": 990 }, { "epoch": 1.3123359580052494, "grad_norm": 1.361410460657643, "learning_rate": 6.923113648038784e-06, "loss": 0.6943, "step": 1000 }, { "epoch": 1.3254593175853018, "grad_norm": 1.646369394476635, "learning_rate": 6.852402992702034e-06, "loss": 0.6788, "step": 1010 }, { "epoch": 1.3385826771653544, "grad_norm": 1.3176415658235974, "learning_rate": 6.781260263304918e-06, "loss": 0.6836, "step": 1020 }, { "epoch": 1.3517060367454068, "grad_norm": 1.3116825387663804, "learning_rate": 6.709702053929865e-06, "loss": 0.6804, "step": 1030 }, { "epoch": 1.3648293963254594, "grad_norm": 1.3926884194197895, "learning_rate": 6.6377450555702485e-06, "loss": 0.683, "step": 1040 }, { "epoch": 1.3779527559055118, "grad_norm": 1.4099706312703377, "learning_rate": 6.565406052237205e-06, "loss": 0.6793, "step": 1050 }, { "epoch": 1.3910761154855642, "grad_norm": 1.7784719464765815, "learning_rate": 6.4927019170447434e-06, "loss": 0.6922, "step": 1060 }, { "epoch": 1.4041994750656168, "grad_norm": 1.4629942474237903, "learning_rate": 6.419649608274096e-06, "loss": 0.6945, "step": 1070 }, { "epoch": 1.4173228346456692, "grad_norm": 1.3978321692556004, "learning_rate": 6.346266165418173e-06, "loss": 0.6794, "step": 1080 }, { "epoch": 1.4304461942257218, "grad_norm": 1.4239220003078308, "learning_rate": 6.272568705207109e-06, "loss": 0.6822, "step": 1090 }, { "epoch": 1.4435695538057742, "grad_norm": 1.5699876966221564, "learning_rate": 6.198574417615758e-06, "loss": 0.6837, "step": 1100 }, { "epoch": 1.4566929133858268, "grad_norm": 1.5574208469323227, "learning_rate": 6.124300561854139e-06, "loss": 0.6797, "step": 1110 }, { "epoch": 1.4698162729658792, "grad_norm": 1.4140951570795213, "learning_rate": 6.049764462341702e-06, "loss": 0.6741, "step": 1120 }, { "epoch": 1.4829396325459316, "grad_norm": 1.4432324570722699, "learning_rate": 5.974983504666402e-06, "loss": 0.6822, "step": 1130 }, { "epoch": 1.4960629921259843, "grad_norm": 1.3824079635846924, "learning_rate": 5.899975131529504e-06, "loss": 0.6821, "step": 1140 }, { "epoch": 1.5091863517060369, "grad_norm": 1.3639889335044162, "learning_rate": 5.824756838677057e-06, "loss": 0.6833, "step": 1150 }, { "epoch": 1.5223097112860893, "grad_norm": 1.3279921872652147, "learning_rate": 5.749346170819006e-06, "loss": 0.6745, "step": 1160 }, { "epoch": 1.5354330708661417, "grad_norm": 1.4234609859145475, "learning_rate": 5.6737607175368735e-06, "loss": 0.6845, "step": 1170 }, { "epoch": 1.5485564304461943, "grad_norm": 1.4045161720671306, "learning_rate": 5.598018109180988e-06, "loss": 0.6832, "step": 1180 }, { "epoch": 1.5616797900262467, "grad_norm": 1.5599767307463868, "learning_rate": 5.5221360127581815e-06, "loss": 0.673, "step": 1190 }, { "epoch": 1.574803149606299, "grad_norm": 1.3244762605195228, "learning_rate": 5.446132127810966e-06, "loss": 0.6714, "step": 1200 }, { "epoch": 1.5879265091863517, "grad_norm": 1.5834736978023898, "learning_rate": 5.370024182289087e-06, "loss": 0.6659, "step": 1210 }, { "epoch": 1.6010498687664043, "grad_norm": 1.4369156468567603, "learning_rate": 5.29382992841449e-06, "loss": 0.6796, "step": 1220 }, { "epoch": 1.6141732283464567, "grad_norm": 1.3895175396051678, "learning_rate": 5.217567138540581e-06, "loss": 0.6742, "step": 1230 }, { "epoch": 1.627296587926509, "grad_norm": 1.3970406816982954, "learning_rate": 5.141253601006841e-06, "loss": 0.6608, "step": 1240 }, { "epoch": 1.6404199475065617, "grad_norm": 1.31995657343019, "learning_rate": 5.064907115989655e-06, "loss": 0.6732, "step": 1250 }, { "epoch": 1.6535433070866141, "grad_norm": 1.3046889168284816, "learning_rate": 4.9885454913504435e-06, "loss": 0.6663, "step": 1260 }, { "epoch": 1.6666666666666665, "grad_norm": 1.2295984270005762, "learning_rate": 4.912186538481944e-06, "loss": 0.6619, "step": 1270 }, { "epoch": 1.6797900262467191, "grad_norm": 1.4667420849887025, "learning_rate": 4.835848068153702e-06, "loss": 0.6708, "step": 1280 }, { "epoch": 1.6929133858267718, "grad_norm": 1.3732477862949846, "learning_rate": 4.759547886357701e-06, "loss": 0.6699, "step": 1290 }, { "epoch": 1.7060367454068242, "grad_norm": 1.2525903026333147, "learning_rate": 4.683303790155103e-06, "loss": 0.6672, "step": 1300 }, { "epoch": 1.7191601049868765, "grad_norm": 1.4884566011924565, "learning_rate": 4.607133563525072e-06, "loss": 0.6657, "step": 1310 }, { "epoch": 1.7322834645669292, "grad_norm": 1.2651877837014813, "learning_rate": 4.531054973216648e-06, "loss": 0.6601, "step": 1320 }, { "epoch": 1.7454068241469818, "grad_norm": 1.2957136613776106, "learning_rate": 4.455085764604653e-06, "loss": 0.665, "step": 1330 }, { "epoch": 1.758530183727034, "grad_norm": 1.2914149104267014, "learning_rate": 4.3792436575505644e-06, "loss": 0.6711, "step": 1340 }, { "epoch": 1.7716535433070866, "grad_norm": 1.3087878612877044, "learning_rate": 4.303546342269344e-06, "loss": 0.6802, "step": 1350 }, { "epoch": 1.7847769028871392, "grad_norm": 1.354110047538368, "learning_rate": 4.228011475203191e-06, "loss": 0.6639, "step": 1360 }, { "epoch": 1.7979002624671916, "grad_norm": 1.264469178278457, "learning_rate": 4.152656674903169e-06, "loss": 0.6596, "step": 1370 }, { "epoch": 1.811023622047244, "grad_norm": 1.355906590548092, "learning_rate": 4.077499517919663e-06, "loss": 0.6657, "step": 1380 }, { "epoch": 1.8241469816272966, "grad_norm": 1.291362822136906, "learning_rate": 4.002557534702639e-06, "loss": 0.6616, "step": 1390 }, { "epoch": 1.8372703412073492, "grad_norm": 1.3281942700420335, "learning_rate": 3.927848205512659e-06, "loss": 0.6641, "step": 1400 }, { "epoch": 1.8503937007874016, "grad_norm": 1.2988138580887156, "learning_rate": 3.853388956343604e-06, "loss": 0.6673, "step": 1410 }, { "epoch": 1.863517060367454, "grad_norm": 1.386424735985052, "learning_rate": 3.779197154858044e-06, "loss": 0.6509, "step": 1420 }, { "epoch": 1.8766404199475066, "grad_norm": 1.335199751133558, "learning_rate": 3.705290106336221e-06, "loss": 0.6641, "step": 1430 }, { "epoch": 1.889763779527559, "grad_norm": 1.2373191854244718, "learning_rate": 3.6316850496395863e-06, "loss": 0.6632, "step": 1440 }, { "epoch": 1.9028871391076114, "grad_norm": 1.254937848225745, "learning_rate": 3.5583991531898276e-06, "loss": 0.6586, "step": 1450 }, { "epoch": 1.916010498687664, "grad_norm": 1.2586091439960605, "learning_rate": 3.4854495109643207e-06, "loss": 0.6555, "step": 1460 }, { "epoch": 1.9291338582677167, "grad_norm": 1.2691195643845157, "learning_rate": 3.412853138508947e-06, "loss": 0.6652, "step": 1470 }, { "epoch": 1.942257217847769, "grad_norm": 1.37789137126307, "learning_rate": 3.340626968969215e-06, "loss": 0.6578, "step": 1480 }, { "epoch": 1.9553805774278215, "grad_norm": 1.3680845584870929, "learning_rate": 3.2687878491405933e-06, "loss": 0.6613, "step": 1490 }, { "epoch": 1.968503937007874, "grad_norm": 1.344475700563475, "learning_rate": 3.197352535538978e-06, "loss": 0.6712, "step": 1500 }, { "epoch": 1.9816272965879265, "grad_norm": 1.3678562268271839, "learning_rate": 3.1263376904922318e-06, "loss": 0.6584, "step": 1510 }, { "epoch": 1.9947506561679789, "grad_norm": 1.346152280655493, "learning_rate": 3.0557598782536914e-06, "loss": 0.6551, "step": 1520 }, { "epoch": 2.0078740157480315, "grad_norm": 1.2899162269306206, "learning_rate": 2.9856355611385356e-06, "loss": 0.6218, "step": 1530 }, { "epoch": 2.020997375328084, "grad_norm": 1.34329640848473, "learning_rate": 2.915981095683943e-06, "loss": 0.5752, "step": 1540 }, { "epoch": 2.0341207349081363, "grad_norm": 1.24625682130632, "learning_rate": 2.846812728833931e-06, "loss": 0.578, "step": 1550 }, { "epoch": 2.047244094488189, "grad_norm": 1.3603958866479353, "learning_rate": 2.778146594149732e-06, "loss": 0.5909, "step": 1560 }, { "epoch": 2.0603674540682415, "grad_norm": 1.3130248810501666, "learning_rate": 2.7099987080466417e-06, "loss": 0.5779, "step": 1570 }, { "epoch": 2.073490813648294, "grad_norm": 1.2728229935348123, "learning_rate": 2.64238496605817e-06, "loss": 0.5874, "step": 1580 }, { "epoch": 2.0866141732283463, "grad_norm": 1.3312388632136043, "learning_rate": 2.5753211391284172e-06, "loss": 0.5788, "step": 1590 }, { "epoch": 2.099737532808399, "grad_norm": 1.2348457534841435, "learning_rate": 2.5088228699334717e-06, "loss": 0.5763, "step": 1600 }, { "epoch": 2.1128608923884515, "grad_norm": 1.3038144088885084, "learning_rate": 2.44290566923276e-06, "loss": 0.5799, "step": 1610 }, { "epoch": 2.1259842519685037, "grad_norm": 1.3164791802552103, "learning_rate": 2.3775849122511442e-06, "loss": 0.5769, "step": 1620 }, { "epoch": 2.1391076115485563, "grad_norm": 1.291734568095305, "learning_rate": 2.312875835092655e-06, "loss": 0.5721, "step": 1630 }, { "epoch": 2.152230971128609, "grad_norm": 1.4191185408274385, "learning_rate": 2.248793531186647e-06, "loss": 0.5768, "step": 1640 }, { "epoch": 2.1653543307086616, "grad_norm": 1.18784718220649, "learning_rate": 2.185352947767257e-06, "loss": 0.579, "step": 1650 }, { "epoch": 2.1784776902887137, "grad_norm": 1.2311199986468657, "learning_rate": 2.1225688823869494e-06, "loss": 0.5747, "step": 1660 }, { "epoch": 2.1916010498687664, "grad_norm": 1.2447404349206936, "learning_rate": 2.0604559794649793e-06, "loss": 0.5734, "step": 1670 }, { "epoch": 2.204724409448819, "grad_norm": 1.296578730192031, "learning_rate": 1.999028726871576e-06, "loss": 0.5755, "step": 1680 }, { "epoch": 2.2178477690288716, "grad_norm": 1.2677498871223802, "learning_rate": 1.9383014525486287e-06, "loss": 0.5728, "step": 1690 }, { "epoch": 2.2309711286089238, "grad_norm": 1.2522124484651282, "learning_rate": 1.8782883211677044e-06, "loss": 0.5713, "step": 1700 }, { "epoch": 2.2440944881889764, "grad_norm": 1.3344150468049227, "learning_rate": 1.8190033308261134e-06, "loss": 0.5723, "step": 1710 }, { "epoch": 2.257217847769029, "grad_norm": 1.2564213421314623, "learning_rate": 1.7604603097818523e-06, "loss": 0.5734, "step": 1720 }, { "epoch": 2.270341207349081, "grad_norm": 1.3130336081169207, "learning_rate": 1.7026729132281489e-06, "loss": 0.5787, "step": 1730 }, { "epoch": 2.283464566929134, "grad_norm": 1.1841727433165028, "learning_rate": 1.6456546201083934e-06, "loss": 0.5744, "step": 1740 }, { "epoch": 2.2965879265091864, "grad_norm": 1.3367937537125874, "learning_rate": 1.5894187299721535e-06, "loss": 0.5707, "step": 1750 }, { "epoch": 2.309711286089239, "grad_norm": 1.2242809577158813, "learning_rate": 1.5339783598730568e-06, "loss": 0.5692, "step": 1760 }, { "epoch": 2.322834645669291, "grad_norm": 1.2513286468060605, "learning_rate": 1.4793464413092161e-06, "loss": 0.574, "step": 1770 }, { "epoch": 2.335958005249344, "grad_norm": 1.259680151618769, "learning_rate": 1.4255357172069727e-06, "loss": 0.5786, "step": 1780 }, { "epoch": 2.3490813648293964, "grad_norm": 1.2315488553903935, "learning_rate": 1.3725587389485812e-06, "loss": 0.5671, "step": 1790 }, { "epoch": 2.362204724409449, "grad_norm": 1.2676834549720328, "learning_rate": 1.3204278634446028e-06, "loss": 0.5769, "step": 1800 }, { "epoch": 2.3753280839895012, "grad_norm": 1.3389962065747927, "learning_rate": 1.2691552502516414e-06, "loss": 0.5777, "step": 1810 }, { "epoch": 2.388451443569554, "grad_norm": 1.2653499838227833, "learning_rate": 1.2187528587361313e-06, "loss": 0.5723, "step": 1820 }, { "epoch": 2.4015748031496065, "grad_norm": 1.1672991280648781, "learning_rate": 1.1692324452847992e-06, "loss": 0.5699, "step": 1830 }, { "epoch": 2.4146981627296586, "grad_norm": 1.2893411655718872, "learning_rate": 1.1206055605624777e-06, "loss": 0.5732, "step": 1840 }, { "epoch": 2.4278215223097113, "grad_norm": 1.290721907760722, "learning_rate": 1.0728835468179183e-06, "loss": 0.5662, "step": 1850 }, { "epoch": 2.440944881889764, "grad_norm": 1.217997593835658, "learning_rate": 1.0260775352381934e-06, "loss": 0.5727, "step": 1860 }, { "epoch": 2.454068241469816, "grad_norm": 1.2440219130503634, "learning_rate": 9.801984433523483e-07, "loss": 0.5694, "step": 1870 }, { "epoch": 2.4671916010498687, "grad_norm": 1.2592327456759795, "learning_rate": 9.352569724848715e-07, "loss": 0.5735, "step": 1880 }, { "epoch": 2.4803149606299213, "grad_norm": 1.226880374562509, "learning_rate": 8.912636052596207e-07, "loss": 0.5816, "step": 1890 }, { "epoch": 2.493438320209974, "grad_norm": 1.2083486104350505, "learning_rate": 8.482286031547282e-07, "loss": 0.5638, "step": 1900 }, { "epoch": 2.506561679790026, "grad_norm": 1.14544648130176, "learning_rate": 8.061620041091172e-07, "loss": 0.5735, "step": 1910 }, { "epoch": 2.5196850393700787, "grad_norm": 1.2633309998706563, "learning_rate": 7.650736201811348e-07, "loss": 0.5768, "step": 1920 }, { "epoch": 2.5328083989501313, "grad_norm": 1.2681452803808053, "learning_rate": 7.249730352599e-07, "loss": 0.575, "step": 1930 }, { "epoch": 2.545931758530184, "grad_norm": 1.2826031568874492, "learning_rate": 6.858696028298412e-07, "loss": 0.5687, "step": 1940 }, { "epoch": 2.559055118110236, "grad_norm": 1.2627571245177376, "learning_rate": 6.477724437889988e-07, "loss": 0.564, "step": 1950 }, { "epoch": 2.5721784776902887, "grad_norm": 1.1880367657533304, "learning_rate": 6.106904443215639e-07, "loss": 0.5709, "step": 1960 }, { "epoch": 2.5853018372703414, "grad_norm": 1.234319028151925, "learning_rate": 5.746322538251814e-07, "loss": 0.5731, "step": 1970 }, { "epoch": 2.5984251968503935, "grad_norm": 1.2520017746520362, "learning_rate": 5.396062828934634e-07, "loss": 0.5765, "step": 1980 }, { "epoch": 2.611548556430446, "grad_norm": 1.2282113090629896, "learning_rate": 5.056207013542131e-07, "loss": 0.5723, "step": 1990 }, { "epoch": 2.6246719160104988, "grad_norm": 1.250069721424398, "learning_rate": 4.7268343636381774e-07, "loss": 0.5681, "step": 2000 } ], "logging_steps": 10, "max_steps": 2286, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1100109887569920.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }