| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.6246719160104988, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013123359580052493, |
| "grad_norm": 23.86959882889253, |
| "learning_rate": 3.930131004366813e-07, |
| "loss": 3.6671, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.026246719160104987, |
| "grad_norm": 14.462567513995324, |
| "learning_rate": 8.296943231441049e-07, |
| "loss": 3.5017, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03937007874015748, |
| "grad_norm": 9.814656935541821, |
| "learning_rate": 1.2663755458515283e-06, |
| "loss": 3.0198, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05249343832020997, |
| "grad_norm": 5.878057728224257, |
| "learning_rate": 1.703056768558952e-06, |
| "loss": 2.5872, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06561679790026247, |
| "grad_norm": 3.906259383681401, |
| "learning_rate": 2.1397379912663756e-06, |
| "loss": 2.2781, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07874015748031496, |
| "grad_norm": 4.4867149824774835, |
| "learning_rate": 2.576419213973799e-06, |
| "loss": 2.0297, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09186351706036745, |
| "grad_norm": 4.808687465121692, |
| "learning_rate": 3.0131004366812227e-06, |
| "loss": 1.8012, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10498687664041995, |
| "grad_norm": 4.8883051654567256, |
| "learning_rate": 3.4497816593886467e-06, |
| "loss": 1.6206, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.11811023622047244, |
| "grad_norm": 4.85572751158142, |
| "learning_rate": 3.88646288209607e-06, |
| "loss": 1.4667, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13123359580052493, |
| "grad_norm": 4.149268447654452, |
| "learning_rate": 4.323144104803494e-06, |
| "loss": 1.3442, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14435695538057744, |
| "grad_norm": 3.8514308742742105, |
| "learning_rate": 4.759825327510917e-06, |
| "loss": 1.2591, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.15748031496062992, |
| "grad_norm": 2.3342255673234478, |
| "learning_rate": 5.196506550218341e-06, |
| "loss": 1.1949, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17060367454068243, |
| "grad_norm": 2.260928208792793, |
| "learning_rate": 5.6331877729257645e-06, |
| "loss": 1.1603, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1837270341207349, |
| "grad_norm": 2.057320468921841, |
| "learning_rate": 6.069868995633188e-06, |
| "loss": 1.1186, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1968503937007874, |
| "grad_norm": 1.8563167183407243, |
| "learning_rate": 6.5065502183406116e-06, |
| "loss": 1.079, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2099737532808399, |
| "grad_norm": 1.8461471131985807, |
| "learning_rate": 6.943231441048035e-06, |
| "loss": 1.0464, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2230971128608924, |
| "grad_norm": 2.034240834591269, |
| "learning_rate": 7.3799126637554595e-06, |
| "loss": 1.0376, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.23622047244094488, |
| "grad_norm": 1.7734341028229181, |
| "learning_rate": 7.816593886462883e-06, |
| "loss": 1.0199, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.24934383202099739, |
| "grad_norm": 1.9112436701720903, |
| "learning_rate": 8.253275109170307e-06, |
| "loss": 1.0007, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.26246719160104987, |
| "grad_norm": 1.8822173865457565, |
| "learning_rate": 8.68995633187773e-06, |
| "loss": 0.9955, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2755905511811024, |
| "grad_norm": 1.7697350327707664, |
| "learning_rate": 9.126637554585154e-06, |
| "loss": 0.9728, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2887139107611549, |
| "grad_norm": 1.817013692625474, |
| "learning_rate": 9.563318777292577e-06, |
| "loss": 0.9613, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.30183727034120733, |
| "grad_norm": 1.7196715695129818, |
| "learning_rate": 1e-05, |
| "loss": 0.9657, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.31496062992125984, |
| "grad_norm": 1.8850637599616171, |
| "learning_rate": 9.999416873566297e-06, |
| "loss": 0.9442, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.32808398950131235, |
| "grad_norm": 1.7285303777141385, |
| "learning_rate": 9.997667630279758e-06, |
| "loss": 0.932, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.34120734908136485, |
| "grad_norm": 1.7265089317333577, |
| "learning_rate": 9.994752678152384e-06, |
| "loss": 0.9357, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3543307086614173, |
| "grad_norm": 1.8320254043398032, |
| "learning_rate": 9.990672697098431e-06, |
| "loss": 0.9222, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3674540682414698, |
| "grad_norm": 1.7044024495030916, |
| "learning_rate": 9.985428638775822e-06, |
| "loss": 0.9216, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3805774278215223, |
| "grad_norm": 2.2575206943279382, |
| "learning_rate": 9.979021726364164e-06, |
| "loss": 0.897, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3937007874015748, |
| "grad_norm": 1.7729314927057351, |
| "learning_rate": 9.971453454279454e-06, |
| "loss": 0.8844, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4068241469816273, |
| "grad_norm": 2.35007808422404, |
| "learning_rate": 9.962725587825492e-06, |
| "loss": 0.8832, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4199475065616798, |
| "grad_norm": 2.2653008942496, |
| "learning_rate": 9.95284016278214e-06, |
| "loss": 0.8803, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4330708661417323, |
| "grad_norm": 1.986794023664208, |
| "learning_rate": 9.941799484930454e-06, |
| "loss": 0.8741, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4461942257217848, |
| "grad_norm": 1.5861501803724807, |
| "learning_rate": 9.929606129514875e-06, |
| "loss": 0.8636, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.45931758530183725, |
| "grad_norm": 1.8008905768440275, |
| "learning_rate": 9.916262940642549e-06, |
| "loss": 0.8656, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.47244094488188976, |
| "grad_norm": 1.765844088914314, |
| "learning_rate": 9.90177303061993e-06, |
| "loss": 0.8642, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.48556430446194226, |
| "grad_norm": 1.8318811699836053, |
| "learning_rate": 9.88613977922684e-06, |
| "loss": 0.8537, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.49868766404199477, |
| "grad_norm": 1.66695173639277, |
| "learning_rate": 9.869366832928134e-06, |
| "loss": 0.8535, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5118110236220472, |
| "grad_norm": 1.8171270864493383, |
| "learning_rate": 9.851458104023153e-06, |
| "loss": 0.8367, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5249343832020997, |
| "grad_norm": 1.773784462102006, |
| "learning_rate": 9.832417769733185e-06, |
| "loss": 0.8302, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5380577427821522, |
| "grad_norm": 1.908924552643567, |
| "learning_rate": 9.812250271227123e-06, |
| "loss": 0.8403, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5511811023622047, |
| "grad_norm": 1.673461507192779, |
| "learning_rate": 9.790960312585561e-06, |
| "loss": 0.8347, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5643044619422573, |
| "grad_norm": 1.5727733661189518, |
| "learning_rate": 9.76855285970356e-06, |
| "loss": 0.8284, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5774278215223098, |
| "grad_norm": 1.6740839799492047, |
| "learning_rate": 9.745033139132352e-06, |
| "loss": 0.828, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5905511811023622, |
| "grad_norm": 1.7521499040025525, |
| "learning_rate": 9.720406636860252e-06, |
| "loss": 0.8164, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6036745406824147, |
| "grad_norm": 1.5716458693651338, |
| "learning_rate": 9.694679097033038e-06, |
| "loss": 0.8158, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6167979002624672, |
| "grad_norm": 1.869817821288055, |
| "learning_rate": 9.667856520614128e-06, |
| "loss": 0.8152, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6299212598425197, |
| "grad_norm": 1.7933186182662169, |
| "learning_rate": 9.639945163984852e-06, |
| "loss": 0.8229, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6430446194225722, |
| "grad_norm": 1.6201846678042537, |
| "learning_rate": 9.610951537485152e-06, |
| "loss": 0.8175, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6561679790026247, |
| "grad_norm": 1.7811525807024158, |
| "learning_rate": 9.580882403895038e-06, |
| "loss": 0.8053, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6692913385826772, |
| "grad_norm": 1.473703478611248, |
| "learning_rate": 9.549744776857162e-06, |
| "loss": 0.8086, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6824146981627297, |
| "grad_norm": 1.5936730653642202, |
| "learning_rate": 9.51754591924089e-06, |
| "loss": 0.8102, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6955380577427821, |
| "grad_norm": 1.6012042279794647, |
| "learning_rate": 9.484293341448221e-06, |
| "loss": 0.7944, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7086614173228346, |
| "grad_norm": 1.5905348724723356, |
| "learning_rate": 9.449994799662e-06, |
| "loss": 0.7958, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7217847769028871, |
| "grad_norm": 1.5342424313908625, |
| "learning_rate": 9.414658294036768e-06, |
| "loss": 0.8018, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7349081364829396, |
| "grad_norm": 1.575920836396293, |
| "learning_rate": 9.378292066832723e-06, |
| "loss": 0.7928, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7480314960629921, |
| "grad_norm": 1.524275013695747, |
| "learning_rate": 9.34090460049322e-06, |
| "loss": 0.7939, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7611548556430446, |
| "grad_norm": 1.5915377546345706, |
| "learning_rate": 9.302504615666222e-06, |
| "loss": 0.7943, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7742782152230971, |
| "grad_norm": 1.440116819392148, |
| "learning_rate": 9.26310106917021e-06, |
| "loss": 0.7963, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7874015748031497, |
| "grad_norm": 1.3528278926960116, |
| "learning_rate": 9.222703151905005e-06, |
| "loss": 0.7914, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.800524934383202, |
| "grad_norm": 1.5465191371552307, |
| "learning_rate": 9.181320286707974e-06, |
| "loss": 0.7927, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8136482939632546, |
| "grad_norm": 1.648064371427012, |
| "learning_rate": 9.138962126156157e-06, |
| "loss": 0.796, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8267716535433071, |
| "grad_norm": 1.4758839650496245, |
| "learning_rate": 9.095638550314794e-06, |
| "loss": 0.7933, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8398950131233596, |
| "grad_norm": 1.7226329473102002, |
| "learning_rate": 9.051359664432795e-06, |
| "loss": 0.7804, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8530183727034121, |
| "grad_norm": 1.612826825222584, |
| "learning_rate": 9.006135796585688e-06, |
| "loss": 0.7836, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.8661417322834646, |
| "grad_norm": 1.4463880498485353, |
| "learning_rate": 8.95997749526658e-06, |
| "loss": 0.7811, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.8792650918635171, |
| "grad_norm": 1.604651553438199, |
| "learning_rate": 8.912895526925726e-06, |
| "loss": 0.7781, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8923884514435696, |
| "grad_norm": 1.5684616487142087, |
| "learning_rate": 8.86490087345924e-06, |
| "loss": 0.7814, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.905511811023622, |
| "grad_norm": 1.4869728238675464, |
| "learning_rate": 8.816004729647573e-06, |
| "loss": 0.7827, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9186351706036745, |
| "grad_norm": 1.50603165520279, |
| "learning_rate": 8.766218500544305e-06, |
| "loss": 0.7848, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.931758530183727, |
| "grad_norm": 1.5149074570283008, |
| "learning_rate": 8.715553798815925e-06, |
| "loss": 0.7769, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9448818897637795, |
| "grad_norm": 1.544976291499801, |
| "learning_rate": 8.66402244203317e-06, |
| "loss": 0.7613, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.958005249343832, |
| "grad_norm": 1.555367156177007, |
| "learning_rate": 8.611636449914563e-06, |
| "loss": 0.7668, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.9711286089238845, |
| "grad_norm": 1.504586391232175, |
| "learning_rate": 8.558408041522801e-06, |
| "loss": 0.7654, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.984251968503937, |
| "grad_norm": 1.4898212640427306, |
| "learning_rate": 8.504349632414675e-06, |
| "loss": 0.764, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.9973753280839895, |
| "grad_norm": 1.401986546883637, |
| "learning_rate": 8.449473831745106e-06, |
| "loss": 0.7583, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.010498687664042, |
| "grad_norm": 1.4416543605053558, |
| "learning_rate": 8.393793439326071e-06, |
| "loss": 0.7103, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0236220472440944, |
| "grad_norm": 1.6403171843794115, |
| "learning_rate": 8.337321442641036e-06, |
| "loss": 0.7034, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.036745406824147, |
| "grad_norm": 1.5388511360135515, |
| "learning_rate": 8.28007101381561e-06, |
| "loss": 0.7011, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0498687664041995, |
| "grad_norm": 1.4334465678780328, |
| "learning_rate": 8.22205550654515e-06, |
| "loss": 0.7079, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.0629921259842519, |
| "grad_norm": 1.5587791264347304, |
| "learning_rate": 8.16328845298e-06, |
| "loss": 0.6971, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.0761154855643045, |
| "grad_norm": 1.5233569268398837, |
| "learning_rate": 8.103783560569104e-06, |
| "loss": 0.7088, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.0892388451443569, |
| "grad_norm": 1.4976958999381966, |
| "learning_rate": 8.04355470886274e-06, |
| "loss": 0.6934, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1023622047244095, |
| "grad_norm": 1.3430067666073104, |
| "learning_rate": 7.98261594627511e-06, |
| "loss": 0.7011, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1154855643044619, |
| "grad_norm": 1.4631861658373042, |
| "learning_rate": 7.920981486807537e-06, |
| "loss": 0.7053, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1286089238845145, |
| "grad_norm": 1.3662308568396804, |
| "learning_rate": 7.858665706733035e-06, |
| "loss": 0.6999, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.141732283464567, |
| "grad_norm": 1.5745244501174307, |
| "learning_rate": 7.795683141243046e-06, |
| "loss": 0.7036, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1548556430446195, |
| "grad_norm": 1.4028684322219405, |
| "learning_rate": 7.732048481057088e-06, |
| "loss": 0.7017, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.167979002624672, |
| "grad_norm": 1.684251036089208, |
| "learning_rate": 7.667776568996143e-06, |
| "loss": 0.6849, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.1811023622047245, |
| "grad_norm": 1.4835499788707966, |
| "learning_rate": 7.602882396520559e-06, |
| "loss": 0.6951, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.194225721784777, |
| "grad_norm": 1.5123572551106819, |
| "learning_rate": 7.5373811002332785e-06, |
| "loss": 0.6962, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.2073490813648293, |
| "grad_norm": 1.8825750275150834, |
| "learning_rate": 7.47128795834923e-06, |
| "loss": 0.7017, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.220472440944882, |
| "grad_norm": 1.334943117481765, |
| "learning_rate": 7.4046183871316544e-06, |
| "loss": 0.6973, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.2335958005249343, |
| "grad_norm": 1.5838501251475126, |
| "learning_rate": 7.337387937296278e-06, |
| "loss": 0.6944, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.246719160104987, |
| "grad_norm": 1.342799954640876, |
| "learning_rate": 7.269612290384076e-06, |
| "loss": 0.6822, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.2598425196850394, |
| "grad_norm": 1.3443879066347948, |
| "learning_rate": 7.201307255103561e-06, |
| "loss": 0.692, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.272965879265092, |
| "grad_norm": 1.586723729979877, |
| "learning_rate": 7.132488763643384e-06, |
| "loss": 0.6925, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.2860892388451444, |
| "grad_norm": 1.3370936442314167, |
| "learning_rate": 7.063172867956143e-06, |
| "loss": 0.6904, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.2992125984251968, |
| "grad_norm": 1.5378877632776058, |
| "learning_rate": 6.993375736014259e-06, |
| "loss": 0.6903, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.3123359580052494, |
| "grad_norm": 1.361410460657643, |
| "learning_rate": 6.923113648038784e-06, |
| "loss": 0.6943, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.3254593175853018, |
| "grad_norm": 1.646369394476635, |
| "learning_rate": 6.852402992702034e-06, |
| "loss": 0.6788, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.3385826771653544, |
| "grad_norm": 1.3176415658235974, |
| "learning_rate": 6.781260263304918e-06, |
| "loss": 0.6836, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.3517060367454068, |
| "grad_norm": 1.3116825387663804, |
| "learning_rate": 6.709702053929865e-06, |
| "loss": 0.6804, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.3648293963254594, |
| "grad_norm": 1.3926884194197895, |
| "learning_rate": 6.6377450555702485e-06, |
| "loss": 0.683, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.3779527559055118, |
| "grad_norm": 1.4099706312703377, |
| "learning_rate": 6.565406052237205e-06, |
| "loss": 0.6793, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.3910761154855642, |
| "grad_norm": 1.7784719464765815, |
| "learning_rate": 6.4927019170447434e-06, |
| "loss": 0.6922, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.4041994750656168, |
| "grad_norm": 1.4629942474237903, |
| "learning_rate": 6.419649608274096e-06, |
| "loss": 0.6945, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.4173228346456692, |
| "grad_norm": 1.3978321692556004, |
| "learning_rate": 6.346266165418173e-06, |
| "loss": 0.6794, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.4304461942257218, |
| "grad_norm": 1.4239220003078308, |
| "learning_rate": 6.272568705207109e-06, |
| "loss": 0.6822, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.4435695538057742, |
| "grad_norm": 1.5699876966221564, |
| "learning_rate": 6.198574417615758e-06, |
| "loss": 0.6837, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.4566929133858268, |
| "grad_norm": 1.5574208469323227, |
| "learning_rate": 6.124300561854139e-06, |
| "loss": 0.6797, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.4698162729658792, |
| "grad_norm": 1.4140951570795213, |
| "learning_rate": 6.049764462341702e-06, |
| "loss": 0.6741, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.4829396325459316, |
| "grad_norm": 1.4432324570722699, |
| "learning_rate": 5.974983504666402e-06, |
| "loss": 0.6822, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.4960629921259843, |
| "grad_norm": 1.3824079635846924, |
| "learning_rate": 5.899975131529504e-06, |
| "loss": 0.6821, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.5091863517060369, |
| "grad_norm": 1.3639889335044162, |
| "learning_rate": 5.824756838677057e-06, |
| "loss": 0.6833, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.5223097112860893, |
| "grad_norm": 1.3279921872652147, |
| "learning_rate": 5.749346170819006e-06, |
| "loss": 0.6745, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.5354330708661417, |
| "grad_norm": 1.4234609859145475, |
| "learning_rate": 5.6737607175368735e-06, |
| "loss": 0.6845, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.5485564304461943, |
| "grad_norm": 1.4045161720671306, |
| "learning_rate": 5.598018109180988e-06, |
| "loss": 0.6832, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.5616797900262467, |
| "grad_norm": 1.5599767307463868, |
| "learning_rate": 5.5221360127581815e-06, |
| "loss": 0.673, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.574803149606299, |
| "grad_norm": 1.3244762605195228, |
| "learning_rate": 5.446132127810966e-06, |
| "loss": 0.6714, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.5879265091863517, |
| "grad_norm": 1.5834736978023898, |
| "learning_rate": 5.370024182289087e-06, |
| "loss": 0.6659, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.6010498687664043, |
| "grad_norm": 1.4369156468567603, |
| "learning_rate": 5.29382992841449e-06, |
| "loss": 0.6796, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.6141732283464567, |
| "grad_norm": 1.3895175396051678, |
| "learning_rate": 5.217567138540581e-06, |
| "loss": 0.6742, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.627296587926509, |
| "grad_norm": 1.3970406816982954, |
| "learning_rate": 5.141253601006841e-06, |
| "loss": 0.6608, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.6404199475065617, |
| "grad_norm": 1.31995657343019, |
| "learning_rate": 5.064907115989655e-06, |
| "loss": 0.6732, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.6535433070866141, |
| "grad_norm": 1.3046889168284816, |
| "learning_rate": 4.9885454913504435e-06, |
| "loss": 0.6663, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.2295984270005762, |
| "learning_rate": 4.912186538481944e-06, |
| "loss": 0.6619, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.6797900262467191, |
| "grad_norm": 1.4667420849887025, |
| "learning_rate": 4.835848068153702e-06, |
| "loss": 0.6708, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.6929133858267718, |
| "grad_norm": 1.3732477862949846, |
| "learning_rate": 4.759547886357701e-06, |
| "loss": 0.6699, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.7060367454068242, |
| "grad_norm": 1.2525903026333147, |
| "learning_rate": 4.683303790155103e-06, |
| "loss": 0.6672, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.7191601049868765, |
| "grad_norm": 1.4884566011924565, |
| "learning_rate": 4.607133563525072e-06, |
| "loss": 0.6657, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.7322834645669292, |
| "grad_norm": 1.2651877837014813, |
| "learning_rate": 4.531054973216648e-06, |
| "loss": 0.6601, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.7454068241469818, |
| "grad_norm": 1.2957136613776106, |
| "learning_rate": 4.455085764604653e-06, |
| "loss": 0.665, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.758530183727034, |
| "grad_norm": 1.2914149104267014, |
| "learning_rate": 4.3792436575505644e-06, |
| "loss": 0.6711, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.7716535433070866, |
| "grad_norm": 1.3087878612877044, |
| "learning_rate": 4.303546342269344e-06, |
| "loss": 0.6802, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.7847769028871392, |
| "grad_norm": 1.354110047538368, |
| "learning_rate": 4.228011475203191e-06, |
| "loss": 0.6639, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.7979002624671916, |
| "grad_norm": 1.264469178278457, |
| "learning_rate": 4.152656674903169e-06, |
| "loss": 0.6596, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.811023622047244, |
| "grad_norm": 1.355906590548092, |
| "learning_rate": 4.077499517919663e-06, |
| "loss": 0.6657, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.8241469816272966, |
| "grad_norm": 1.291362822136906, |
| "learning_rate": 4.002557534702639e-06, |
| "loss": 0.6616, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.8372703412073492, |
| "grad_norm": 1.3281942700420335, |
| "learning_rate": 3.927848205512659e-06, |
| "loss": 0.6641, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.8503937007874016, |
| "grad_norm": 1.2988138580887156, |
| "learning_rate": 3.853388956343604e-06, |
| "loss": 0.6673, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.863517060367454, |
| "grad_norm": 1.386424735985052, |
| "learning_rate": 3.779197154858044e-06, |
| "loss": 0.6509, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.8766404199475066, |
| "grad_norm": 1.335199751133558, |
| "learning_rate": 3.705290106336221e-06, |
| "loss": 0.6641, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.889763779527559, |
| "grad_norm": 1.2373191854244718, |
| "learning_rate": 3.6316850496395863e-06, |
| "loss": 0.6632, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.9028871391076114, |
| "grad_norm": 1.254937848225745, |
| "learning_rate": 3.5583991531898276e-06, |
| "loss": 0.6586, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.916010498687664, |
| "grad_norm": 1.2586091439960605, |
| "learning_rate": 3.4854495109643207e-06, |
| "loss": 0.6555, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.9291338582677167, |
| "grad_norm": 1.2691195643845157, |
| "learning_rate": 3.412853138508947e-06, |
| "loss": 0.6652, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.942257217847769, |
| "grad_norm": 1.37789137126307, |
| "learning_rate": 3.340626968969215e-06, |
| "loss": 0.6578, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.9553805774278215, |
| "grad_norm": 1.3680845584870929, |
| "learning_rate": 3.2687878491405933e-06, |
| "loss": 0.6613, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.968503937007874, |
| "grad_norm": 1.344475700563475, |
| "learning_rate": 3.197352535538978e-06, |
| "loss": 0.6712, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.9816272965879265, |
| "grad_norm": 1.3678562268271839, |
| "learning_rate": 3.1263376904922318e-06, |
| "loss": 0.6584, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.9947506561679789, |
| "grad_norm": 1.346152280655493, |
| "learning_rate": 3.0557598782536914e-06, |
| "loss": 0.6551, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.0078740157480315, |
| "grad_norm": 1.2899162269306206, |
| "learning_rate": 2.9856355611385356e-06, |
| "loss": 0.6218, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.020997375328084, |
| "grad_norm": 1.34329640848473, |
| "learning_rate": 2.915981095683943e-06, |
| "loss": 0.5752, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.0341207349081363, |
| "grad_norm": 1.24625682130632, |
| "learning_rate": 2.846812728833931e-06, |
| "loss": 0.578, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.047244094488189, |
| "grad_norm": 1.3603958866479353, |
| "learning_rate": 2.778146594149732e-06, |
| "loss": 0.5909, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.0603674540682415, |
| "grad_norm": 1.3130248810501666, |
| "learning_rate": 2.7099987080466417e-06, |
| "loss": 0.5779, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.073490813648294, |
| "grad_norm": 1.2728229935348123, |
| "learning_rate": 2.64238496605817e-06, |
| "loss": 0.5874, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.0866141732283463, |
| "grad_norm": 1.3312388632136043, |
| "learning_rate": 2.5753211391284172e-06, |
| "loss": 0.5788, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.099737532808399, |
| "grad_norm": 1.2348457534841435, |
| "learning_rate": 2.5088228699334717e-06, |
| "loss": 0.5763, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.1128608923884515, |
| "grad_norm": 1.3038144088885084, |
| "learning_rate": 2.44290566923276e-06, |
| "loss": 0.5799, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.1259842519685037, |
| "grad_norm": 1.3164791802552103, |
| "learning_rate": 2.3775849122511442e-06, |
| "loss": 0.5769, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.1391076115485563, |
| "grad_norm": 1.291734568095305, |
| "learning_rate": 2.312875835092655e-06, |
| "loss": 0.5721, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.152230971128609, |
| "grad_norm": 1.4191185408274385, |
| "learning_rate": 2.248793531186647e-06, |
| "loss": 0.5768, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.1653543307086616, |
| "grad_norm": 1.18784718220649, |
| "learning_rate": 2.185352947767257e-06, |
| "loss": 0.579, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.1784776902887137, |
| "grad_norm": 1.2311199986468657, |
| "learning_rate": 2.1225688823869494e-06, |
| "loss": 0.5747, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.1916010498687664, |
| "grad_norm": 1.2447404349206936, |
| "learning_rate": 2.0604559794649793e-06, |
| "loss": 0.5734, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.204724409448819, |
| "grad_norm": 1.296578730192031, |
| "learning_rate": 1.999028726871576e-06, |
| "loss": 0.5755, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.2178477690288716, |
| "grad_norm": 1.2677498871223802, |
| "learning_rate": 1.9383014525486287e-06, |
| "loss": 0.5728, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.2309711286089238, |
| "grad_norm": 1.2522124484651282, |
| "learning_rate": 1.8782883211677044e-06, |
| "loss": 0.5713, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.2440944881889764, |
| "grad_norm": 1.3344150468049227, |
| "learning_rate": 1.8190033308261134e-06, |
| "loss": 0.5723, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.257217847769029, |
| "grad_norm": 1.2564213421314623, |
| "learning_rate": 1.7604603097818523e-06, |
| "loss": 0.5734, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.270341207349081, |
| "grad_norm": 1.3130336081169207, |
| "learning_rate": 1.7026729132281489e-06, |
| "loss": 0.5787, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.283464566929134, |
| "grad_norm": 1.1841727433165028, |
| "learning_rate": 1.6456546201083934e-06, |
| "loss": 0.5744, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.2965879265091864, |
| "grad_norm": 1.3367937537125874, |
| "learning_rate": 1.5894187299721535e-06, |
| "loss": 0.5707, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.309711286089239, |
| "grad_norm": 1.2242809577158813, |
| "learning_rate": 1.5339783598730568e-06, |
| "loss": 0.5692, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.322834645669291, |
| "grad_norm": 1.2513286468060605, |
| "learning_rate": 1.4793464413092161e-06, |
| "loss": 0.574, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.335958005249344, |
| "grad_norm": 1.259680151618769, |
| "learning_rate": 1.4255357172069727e-06, |
| "loss": 0.5786, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.3490813648293964, |
| "grad_norm": 1.2315488553903935, |
| "learning_rate": 1.3725587389485812e-06, |
| "loss": 0.5671, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.362204724409449, |
| "grad_norm": 1.2676834549720328, |
| "learning_rate": 1.3204278634446028e-06, |
| "loss": 0.5769, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.3753280839895012, |
| "grad_norm": 1.3389962065747927, |
| "learning_rate": 1.2691552502516414e-06, |
| "loss": 0.5777, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.388451443569554, |
| "grad_norm": 1.2653499838227833, |
| "learning_rate": 1.2187528587361313e-06, |
| "loss": 0.5723, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.4015748031496065, |
| "grad_norm": 1.1672991280648781, |
| "learning_rate": 1.1692324452847992e-06, |
| "loss": 0.5699, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.4146981627296586, |
| "grad_norm": 1.2893411655718872, |
| "learning_rate": 1.1206055605624777e-06, |
| "loss": 0.5732, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.4278215223097113, |
| "grad_norm": 1.290721907760722, |
| "learning_rate": 1.0728835468179183e-06, |
| "loss": 0.5662, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.440944881889764, |
| "grad_norm": 1.217997593835658, |
| "learning_rate": 1.0260775352381934e-06, |
| "loss": 0.5727, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.454068241469816, |
| "grad_norm": 1.2440219130503634, |
| "learning_rate": 9.801984433523483e-07, |
| "loss": 0.5694, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.4671916010498687, |
| "grad_norm": 1.2592327456759795, |
| "learning_rate": 9.352569724848715e-07, |
| "loss": 0.5735, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.4803149606299213, |
| "grad_norm": 1.226880374562509, |
| "learning_rate": 8.912636052596207e-07, |
| "loss": 0.5816, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.493438320209974, |
| "grad_norm": 1.2083486104350505, |
| "learning_rate": 8.482286031547282e-07, |
| "loss": 0.5638, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.506561679790026, |
| "grad_norm": 1.14544648130176, |
| "learning_rate": 8.061620041091172e-07, |
| "loss": 0.5735, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.5196850393700787, |
| "grad_norm": 1.2633309998706563, |
| "learning_rate": 7.650736201811348e-07, |
| "loss": 0.5768, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.5328083989501313, |
| "grad_norm": 1.2681452803808053, |
| "learning_rate": 7.249730352599e-07, |
| "loss": 0.575, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.545931758530184, |
| "grad_norm": 1.2826031568874492, |
| "learning_rate": 6.858696028298412e-07, |
| "loss": 0.5687, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.559055118110236, |
| "grad_norm": 1.2627571245177376, |
| "learning_rate": 6.477724437889988e-07, |
| "loss": 0.564, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.5721784776902887, |
| "grad_norm": 1.1880367657533304, |
| "learning_rate": 6.106904443215639e-07, |
| "loss": 0.5709, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.5853018372703414, |
| "grad_norm": 1.234319028151925, |
| "learning_rate": 5.746322538251814e-07, |
| "loss": 0.5731, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.5984251968503935, |
| "grad_norm": 1.2520017746520362, |
| "learning_rate": 5.396062828934634e-07, |
| "loss": 0.5765, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.611548556430446, |
| "grad_norm": 1.2282113090629896, |
| "learning_rate": 5.056207013542131e-07, |
| "loss": 0.5723, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.6246719160104988, |
| "grad_norm": 1.250069721424398, |
| "learning_rate": 4.7268343636381774e-07, |
| "loss": 0.5681, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2286, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1100109887569920.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|