InTool-SFT-stage2-7B / trainer_state.json
xytian1008's picture
Upload folder using huggingface_hub
3e53724 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6246719160104988,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013123359580052493,
"grad_norm": 23.86959882889253,
"learning_rate": 3.930131004366813e-07,
"loss": 3.6671,
"step": 10
},
{
"epoch": 0.026246719160104987,
"grad_norm": 14.462567513995324,
"learning_rate": 8.296943231441049e-07,
"loss": 3.5017,
"step": 20
},
{
"epoch": 0.03937007874015748,
"grad_norm": 9.814656935541821,
"learning_rate": 1.2663755458515283e-06,
"loss": 3.0198,
"step": 30
},
{
"epoch": 0.05249343832020997,
"grad_norm": 5.878057728224257,
"learning_rate": 1.703056768558952e-06,
"loss": 2.5872,
"step": 40
},
{
"epoch": 0.06561679790026247,
"grad_norm": 3.906259383681401,
"learning_rate": 2.1397379912663756e-06,
"loss": 2.2781,
"step": 50
},
{
"epoch": 0.07874015748031496,
"grad_norm": 4.4867149824774835,
"learning_rate": 2.576419213973799e-06,
"loss": 2.0297,
"step": 60
},
{
"epoch": 0.09186351706036745,
"grad_norm": 4.808687465121692,
"learning_rate": 3.0131004366812227e-06,
"loss": 1.8012,
"step": 70
},
{
"epoch": 0.10498687664041995,
"grad_norm": 4.8883051654567256,
"learning_rate": 3.4497816593886467e-06,
"loss": 1.6206,
"step": 80
},
{
"epoch": 0.11811023622047244,
"grad_norm": 4.85572751158142,
"learning_rate": 3.88646288209607e-06,
"loss": 1.4667,
"step": 90
},
{
"epoch": 0.13123359580052493,
"grad_norm": 4.149268447654452,
"learning_rate": 4.323144104803494e-06,
"loss": 1.3442,
"step": 100
},
{
"epoch": 0.14435695538057744,
"grad_norm": 3.8514308742742105,
"learning_rate": 4.759825327510917e-06,
"loss": 1.2591,
"step": 110
},
{
"epoch": 0.15748031496062992,
"grad_norm": 2.3342255673234478,
"learning_rate": 5.196506550218341e-06,
"loss": 1.1949,
"step": 120
},
{
"epoch": 0.17060367454068243,
"grad_norm": 2.260928208792793,
"learning_rate": 5.6331877729257645e-06,
"loss": 1.1603,
"step": 130
},
{
"epoch": 0.1837270341207349,
"grad_norm": 2.057320468921841,
"learning_rate": 6.069868995633188e-06,
"loss": 1.1186,
"step": 140
},
{
"epoch": 0.1968503937007874,
"grad_norm": 1.8563167183407243,
"learning_rate": 6.5065502183406116e-06,
"loss": 1.079,
"step": 150
},
{
"epoch": 0.2099737532808399,
"grad_norm": 1.8461471131985807,
"learning_rate": 6.943231441048035e-06,
"loss": 1.0464,
"step": 160
},
{
"epoch": 0.2230971128608924,
"grad_norm": 2.034240834591269,
"learning_rate": 7.3799126637554595e-06,
"loss": 1.0376,
"step": 170
},
{
"epoch": 0.23622047244094488,
"grad_norm": 1.7734341028229181,
"learning_rate": 7.816593886462883e-06,
"loss": 1.0199,
"step": 180
},
{
"epoch": 0.24934383202099739,
"grad_norm": 1.9112436701720903,
"learning_rate": 8.253275109170307e-06,
"loss": 1.0007,
"step": 190
},
{
"epoch": 0.26246719160104987,
"grad_norm": 1.8822173865457565,
"learning_rate": 8.68995633187773e-06,
"loss": 0.9955,
"step": 200
},
{
"epoch": 0.2755905511811024,
"grad_norm": 1.7697350327707664,
"learning_rate": 9.126637554585154e-06,
"loss": 0.9728,
"step": 210
},
{
"epoch": 0.2887139107611549,
"grad_norm": 1.817013692625474,
"learning_rate": 9.563318777292577e-06,
"loss": 0.9613,
"step": 220
},
{
"epoch": 0.30183727034120733,
"grad_norm": 1.7196715695129818,
"learning_rate": 1e-05,
"loss": 0.9657,
"step": 230
},
{
"epoch": 0.31496062992125984,
"grad_norm": 1.8850637599616171,
"learning_rate": 9.999416873566297e-06,
"loss": 0.9442,
"step": 240
},
{
"epoch": 0.32808398950131235,
"grad_norm": 1.7285303777141385,
"learning_rate": 9.997667630279758e-06,
"loss": 0.932,
"step": 250
},
{
"epoch": 0.34120734908136485,
"grad_norm": 1.7265089317333577,
"learning_rate": 9.994752678152384e-06,
"loss": 0.9357,
"step": 260
},
{
"epoch": 0.3543307086614173,
"grad_norm": 1.8320254043398032,
"learning_rate": 9.990672697098431e-06,
"loss": 0.9222,
"step": 270
},
{
"epoch": 0.3674540682414698,
"grad_norm": 1.7044024495030916,
"learning_rate": 9.985428638775822e-06,
"loss": 0.9216,
"step": 280
},
{
"epoch": 0.3805774278215223,
"grad_norm": 2.2575206943279382,
"learning_rate": 9.979021726364164e-06,
"loss": 0.897,
"step": 290
},
{
"epoch": 0.3937007874015748,
"grad_norm": 1.7729314927057351,
"learning_rate": 9.971453454279454e-06,
"loss": 0.8844,
"step": 300
},
{
"epoch": 0.4068241469816273,
"grad_norm": 2.35007808422404,
"learning_rate": 9.962725587825492e-06,
"loss": 0.8832,
"step": 310
},
{
"epoch": 0.4199475065616798,
"grad_norm": 2.2653008942496,
"learning_rate": 9.95284016278214e-06,
"loss": 0.8803,
"step": 320
},
{
"epoch": 0.4330708661417323,
"grad_norm": 1.986794023664208,
"learning_rate": 9.941799484930454e-06,
"loss": 0.8741,
"step": 330
},
{
"epoch": 0.4461942257217848,
"grad_norm": 1.5861501803724807,
"learning_rate": 9.929606129514875e-06,
"loss": 0.8636,
"step": 340
},
{
"epoch": 0.45931758530183725,
"grad_norm": 1.8008905768440275,
"learning_rate": 9.916262940642549e-06,
"loss": 0.8656,
"step": 350
},
{
"epoch": 0.47244094488188976,
"grad_norm": 1.765844088914314,
"learning_rate": 9.90177303061993e-06,
"loss": 0.8642,
"step": 360
},
{
"epoch": 0.48556430446194226,
"grad_norm": 1.8318811699836053,
"learning_rate": 9.88613977922684e-06,
"loss": 0.8537,
"step": 370
},
{
"epoch": 0.49868766404199477,
"grad_norm": 1.66695173639277,
"learning_rate": 9.869366832928134e-06,
"loss": 0.8535,
"step": 380
},
{
"epoch": 0.5118110236220472,
"grad_norm": 1.8171270864493383,
"learning_rate": 9.851458104023153e-06,
"loss": 0.8367,
"step": 390
},
{
"epoch": 0.5249343832020997,
"grad_norm": 1.773784462102006,
"learning_rate": 9.832417769733185e-06,
"loss": 0.8302,
"step": 400
},
{
"epoch": 0.5380577427821522,
"grad_norm": 1.908924552643567,
"learning_rate": 9.812250271227123e-06,
"loss": 0.8403,
"step": 410
},
{
"epoch": 0.5511811023622047,
"grad_norm": 1.673461507192779,
"learning_rate": 9.790960312585561e-06,
"loss": 0.8347,
"step": 420
},
{
"epoch": 0.5643044619422573,
"grad_norm": 1.5727733661189518,
"learning_rate": 9.76855285970356e-06,
"loss": 0.8284,
"step": 430
},
{
"epoch": 0.5774278215223098,
"grad_norm": 1.6740839799492047,
"learning_rate": 9.745033139132352e-06,
"loss": 0.828,
"step": 440
},
{
"epoch": 0.5905511811023622,
"grad_norm": 1.7521499040025525,
"learning_rate": 9.720406636860252e-06,
"loss": 0.8164,
"step": 450
},
{
"epoch": 0.6036745406824147,
"grad_norm": 1.5716458693651338,
"learning_rate": 9.694679097033038e-06,
"loss": 0.8158,
"step": 460
},
{
"epoch": 0.6167979002624672,
"grad_norm": 1.869817821288055,
"learning_rate": 9.667856520614128e-06,
"loss": 0.8152,
"step": 470
},
{
"epoch": 0.6299212598425197,
"grad_norm": 1.7933186182662169,
"learning_rate": 9.639945163984852e-06,
"loss": 0.8229,
"step": 480
},
{
"epoch": 0.6430446194225722,
"grad_norm": 1.6201846678042537,
"learning_rate": 9.610951537485152e-06,
"loss": 0.8175,
"step": 490
},
{
"epoch": 0.6561679790026247,
"grad_norm": 1.7811525807024158,
"learning_rate": 9.580882403895038e-06,
"loss": 0.8053,
"step": 500
},
{
"epoch": 0.6692913385826772,
"grad_norm": 1.473703478611248,
"learning_rate": 9.549744776857162e-06,
"loss": 0.8086,
"step": 510
},
{
"epoch": 0.6824146981627297,
"grad_norm": 1.5936730653642202,
"learning_rate": 9.51754591924089e-06,
"loss": 0.8102,
"step": 520
},
{
"epoch": 0.6955380577427821,
"grad_norm": 1.6012042279794647,
"learning_rate": 9.484293341448221e-06,
"loss": 0.7944,
"step": 530
},
{
"epoch": 0.7086614173228346,
"grad_norm": 1.5905348724723356,
"learning_rate": 9.449994799662e-06,
"loss": 0.7958,
"step": 540
},
{
"epoch": 0.7217847769028871,
"grad_norm": 1.5342424313908625,
"learning_rate": 9.414658294036768e-06,
"loss": 0.8018,
"step": 550
},
{
"epoch": 0.7349081364829396,
"grad_norm": 1.575920836396293,
"learning_rate": 9.378292066832723e-06,
"loss": 0.7928,
"step": 560
},
{
"epoch": 0.7480314960629921,
"grad_norm": 1.524275013695747,
"learning_rate": 9.34090460049322e-06,
"loss": 0.7939,
"step": 570
},
{
"epoch": 0.7611548556430446,
"grad_norm": 1.5915377546345706,
"learning_rate": 9.302504615666222e-06,
"loss": 0.7943,
"step": 580
},
{
"epoch": 0.7742782152230971,
"grad_norm": 1.440116819392148,
"learning_rate": 9.26310106917021e-06,
"loss": 0.7963,
"step": 590
},
{
"epoch": 0.7874015748031497,
"grad_norm": 1.3528278926960116,
"learning_rate": 9.222703151905005e-06,
"loss": 0.7914,
"step": 600
},
{
"epoch": 0.800524934383202,
"grad_norm": 1.5465191371552307,
"learning_rate": 9.181320286707974e-06,
"loss": 0.7927,
"step": 610
},
{
"epoch": 0.8136482939632546,
"grad_norm": 1.648064371427012,
"learning_rate": 9.138962126156157e-06,
"loss": 0.796,
"step": 620
},
{
"epoch": 0.8267716535433071,
"grad_norm": 1.4758839650496245,
"learning_rate": 9.095638550314794e-06,
"loss": 0.7933,
"step": 630
},
{
"epoch": 0.8398950131233596,
"grad_norm": 1.7226329473102002,
"learning_rate": 9.051359664432795e-06,
"loss": 0.7804,
"step": 640
},
{
"epoch": 0.8530183727034121,
"grad_norm": 1.612826825222584,
"learning_rate": 9.006135796585688e-06,
"loss": 0.7836,
"step": 650
},
{
"epoch": 0.8661417322834646,
"grad_norm": 1.4463880498485353,
"learning_rate": 8.95997749526658e-06,
"loss": 0.7811,
"step": 660
},
{
"epoch": 0.8792650918635171,
"grad_norm": 1.604651553438199,
"learning_rate": 8.912895526925726e-06,
"loss": 0.7781,
"step": 670
},
{
"epoch": 0.8923884514435696,
"grad_norm": 1.5684616487142087,
"learning_rate": 8.86490087345924e-06,
"loss": 0.7814,
"step": 680
},
{
"epoch": 0.905511811023622,
"grad_norm": 1.4869728238675464,
"learning_rate": 8.816004729647573e-06,
"loss": 0.7827,
"step": 690
},
{
"epoch": 0.9186351706036745,
"grad_norm": 1.50603165520279,
"learning_rate": 8.766218500544305e-06,
"loss": 0.7848,
"step": 700
},
{
"epoch": 0.931758530183727,
"grad_norm": 1.5149074570283008,
"learning_rate": 8.715553798815925e-06,
"loss": 0.7769,
"step": 710
},
{
"epoch": 0.9448818897637795,
"grad_norm": 1.544976291499801,
"learning_rate": 8.66402244203317e-06,
"loss": 0.7613,
"step": 720
},
{
"epoch": 0.958005249343832,
"grad_norm": 1.555367156177007,
"learning_rate": 8.611636449914563e-06,
"loss": 0.7668,
"step": 730
},
{
"epoch": 0.9711286089238845,
"grad_norm": 1.504586391232175,
"learning_rate": 8.558408041522801e-06,
"loss": 0.7654,
"step": 740
},
{
"epoch": 0.984251968503937,
"grad_norm": 1.4898212640427306,
"learning_rate": 8.504349632414675e-06,
"loss": 0.764,
"step": 750
},
{
"epoch": 0.9973753280839895,
"grad_norm": 1.401986546883637,
"learning_rate": 8.449473831745106e-06,
"loss": 0.7583,
"step": 760
},
{
"epoch": 1.010498687664042,
"grad_norm": 1.4416543605053558,
"learning_rate": 8.393793439326071e-06,
"loss": 0.7103,
"step": 770
},
{
"epoch": 1.0236220472440944,
"grad_norm": 1.6403171843794115,
"learning_rate": 8.337321442641036e-06,
"loss": 0.7034,
"step": 780
},
{
"epoch": 1.036745406824147,
"grad_norm": 1.5388511360135515,
"learning_rate": 8.28007101381561e-06,
"loss": 0.7011,
"step": 790
},
{
"epoch": 1.0498687664041995,
"grad_norm": 1.4334465678780328,
"learning_rate": 8.22205550654515e-06,
"loss": 0.7079,
"step": 800
},
{
"epoch": 1.0629921259842519,
"grad_norm": 1.5587791264347304,
"learning_rate": 8.16328845298e-06,
"loss": 0.6971,
"step": 810
},
{
"epoch": 1.0761154855643045,
"grad_norm": 1.5233569268398837,
"learning_rate": 8.103783560569104e-06,
"loss": 0.7088,
"step": 820
},
{
"epoch": 1.0892388451443569,
"grad_norm": 1.4976958999381966,
"learning_rate": 8.04355470886274e-06,
"loss": 0.6934,
"step": 830
},
{
"epoch": 1.1023622047244095,
"grad_norm": 1.3430067666073104,
"learning_rate": 7.98261594627511e-06,
"loss": 0.7011,
"step": 840
},
{
"epoch": 1.1154855643044619,
"grad_norm": 1.4631861658373042,
"learning_rate": 7.920981486807537e-06,
"loss": 0.7053,
"step": 850
},
{
"epoch": 1.1286089238845145,
"grad_norm": 1.3662308568396804,
"learning_rate": 7.858665706733035e-06,
"loss": 0.6999,
"step": 860
},
{
"epoch": 1.141732283464567,
"grad_norm": 1.5745244501174307,
"learning_rate": 7.795683141243046e-06,
"loss": 0.7036,
"step": 870
},
{
"epoch": 1.1548556430446195,
"grad_norm": 1.4028684322219405,
"learning_rate": 7.732048481057088e-06,
"loss": 0.7017,
"step": 880
},
{
"epoch": 1.167979002624672,
"grad_norm": 1.684251036089208,
"learning_rate": 7.667776568996143e-06,
"loss": 0.6849,
"step": 890
},
{
"epoch": 1.1811023622047245,
"grad_norm": 1.4835499788707966,
"learning_rate": 7.602882396520559e-06,
"loss": 0.6951,
"step": 900
},
{
"epoch": 1.194225721784777,
"grad_norm": 1.5123572551106819,
"learning_rate": 7.5373811002332785e-06,
"loss": 0.6962,
"step": 910
},
{
"epoch": 1.2073490813648293,
"grad_norm": 1.8825750275150834,
"learning_rate": 7.47128795834923e-06,
"loss": 0.7017,
"step": 920
},
{
"epoch": 1.220472440944882,
"grad_norm": 1.334943117481765,
"learning_rate": 7.4046183871316544e-06,
"loss": 0.6973,
"step": 930
},
{
"epoch": 1.2335958005249343,
"grad_norm": 1.5838501251475126,
"learning_rate": 7.337387937296278e-06,
"loss": 0.6944,
"step": 940
},
{
"epoch": 1.246719160104987,
"grad_norm": 1.342799954640876,
"learning_rate": 7.269612290384076e-06,
"loss": 0.6822,
"step": 950
},
{
"epoch": 1.2598425196850394,
"grad_norm": 1.3443879066347948,
"learning_rate": 7.201307255103561e-06,
"loss": 0.692,
"step": 960
},
{
"epoch": 1.272965879265092,
"grad_norm": 1.586723729979877,
"learning_rate": 7.132488763643384e-06,
"loss": 0.6925,
"step": 970
},
{
"epoch": 1.2860892388451444,
"grad_norm": 1.3370936442314167,
"learning_rate": 7.063172867956143e-06,
"loss": 0.6904,
"step": 980
},
{
"epoch": 1.2992125984251968,
"grad_norm": 1.5378877632776058,
"learning_rate": 6.993375736014259e-06,
"loss": 0.6903,
"step": 990
},
{
"epoch": 1.3123359580052494,
"grad_norm": 1.361410460657643,
"learning_rate": 6.923113648038784e-06,
"loss": 0.6943,
"step": 1000
},
{
"epoch": 1.3254593175853018,
"grad_norm": 1.646369394476635,
"learning_rate": 6.852402992702034e-06,
"loss": 0.6788,
"step": 1010
},
{
"epoch": 1.3385826771653544,
"grad_norm": 1.3176415658235974,
"learning_rate": 6.781260263304918e-06,
"loss": 0.6836,
"step": 1020
},
{
"epoch": 1.3517060367454068,
"grad_norm": 1.3116825387663804,
"learning_rate": 6.709702053929865e-06,
"loss": 0.6804,
"step": 1030
},
{
"epoch": 1.3648293963254594,
"grad_norm": 1.3926884194197895,
"learning_rate": 6.6377450555702485e-06,
"loss": 0.683,
"step": 1040
},
{
"epoch": 1.3779527559055118,
"grad_norm": 1.4099706312703377,
"learning_rate": 6.565406052237205e-06,
"loss": 0.6793,
"step": 1050
},
{
"epoch": 1.3910761154855642,
"grad_norm": 1.7784719464765815,
"learning_rate": 6.4927019170447434e-06,
"loss": 0.6922,
"step": 1060
},
{
"epoch": 1.4041994750656168,
"grad_norm": 1.4629942474237903,
"learning_rate": 6.419649608274096e-06,
"loss": 0.6945,
"step": 1070
},
{
"epoch": 1.4173228346456692,
"grad_norm": 1.3978321692556004,
"learning_rate": 6.346266165418173e-06,
"loss": 0.6794,
"step": 1080
},
{
"epoch": 1.4304461942257218,
"grad_norm": 1.4239220003078308,
"learning_rate": 6.272568705207109e-06,
"loss": 0.6822,
"step": 1090
},
{
"epoch": 1.4435695538057742,
"grad_norm": 1.5699876966221564,
"learning_rate": 6.198574417615758e-06,
"loss": 0.6837,
"step": 1100
},
{
"epoch": 1.4566929133858268,
"grad_norm": 1.5574208469323227,
"learning_rate": 6.124300561854139e-06,
"loss": 0.6797,
"step": 1110
},
{
"epoch": 1.4698162729658792,
"grad_norm": 1.4140951570795213,
"learning_rate": 6.049764462341702e-06,
"loss": 0.6741,
"step": 1120
},
{
"epoch": 1.4829396325459316,
"grad_norm": 1.4432324570722699,
"learning_rate": 5.974983504666402e-06,
"loss": 0.6822,
"step": 1130
},
{
"epoch": 1.4960629921259843,
"grad_norm": 1.3824079635846924,
"learning_rate": 5.899975131529504e-06,
"loss": 0.6821,
"step": 1140
},
{
"epoch": 1.5091863517060369,
"grad_norm": 1.3639889335044162,
"learning_rate": 5.824756838677057e-06,
"loss": 0.6833,
"step": 1150
},
{
"epoch": 1.5223097112860893,
"grad_norm": 1.3279921872652147,
"learning_rate": 5.749346170819006e-06,
"loss": 0.6745,
"step": 1160
},
{
"epoch": 1.5354330708661417,
"grad_norm": 1.4234609859145475,
"learning_rate": 5.6737607175368735e-06,
"loss": 0.6845,
"step": 1170
},
{
"epoch": 1.5485564304461943,
"grad_norm": 1.4045161720671306,
"learning_rate": 5.598018109180988e-06,
"loss": 0.6832,
"step": 1180
},
{
"epoch": 1.5616797900262467,
"grad_norm": 1.5599767307463868,
"learning_rate": 5.5221360127581815e-06,
"loss": 0.673,
"step": 1190
},
{
"epoch": 1.574803149606299,
"grad_norm": 1.3244762605195228,
"learning_rate": 5.446132127810966e-06,
"loss": 0.6714,
"step": 1200
},
{
"epoch": 1.5879265091863517,
"grad_norm": 1.5834736978023898,
"learning_rate": 5.370024182289087e-06,
"loss": 0.6659,
"step": 1210
},
{
"epoch": 1.6010498687664043,
"grad_norm": 1.4369156468567603,
"learning_rate": 5.29382992841449e-06,
"loss": 0.6796,
"step": 1220
},
{
"epoch": 1.6141732283464567,
"grad_norm": 1.3895175396051678,
"learning_rate": 5.217567138540581e-06,
"loss": 0.6742,
"step": 1230
},
{
"epoch": 1.627296587926509,
"grad_norm": 1.3970406816982954,
"learning_rate": 5.141253601006841e-06,
"loss": 0.6608,
"step": 1240
},
{
"epoch": 1.6404199475065617,
"grad_norm": 1.31995657343019,
"learning_rate": 5.064907115989655e-06,
"loss": 0.6732,
"step": 1250
},
{
"epoch": 1.6535433070866141,
"grad_norm": 1.3046889168284816,
"learning_rate": 4.9885454913504435e-06,
"loss": 0.6663,
"step": 1260
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.2295984270005762,
"learning_rate": 4.912186538481944e-06,
"loss": 0.6619,
"step": 1270
},
{
"epoch": 1.6797900262467191,
"grad_norm": 1.4667420849887025,
"learning_rate": 4.835848068153702e-06,
"loss": 0.6708,
"step": 1280
},
{
"epoch": 1.6929133858267718,
"grad_norm": 1.3732477862949846,
"learning_rate": 4.759547886357701e-06,
"loss": 0.6699,
"step": 1290
},
{
"epoch": 1.7060367454068242,
"grad_norm": 1.2525903026333147,
"learning_rate": 4.683303790155103e-06,
"loss": 0.6672,
"step": 1300
},
{
"epoch": 1.7191601049868765,
"grad_norm": 1.4884566011924565,
"learning_rate": 4.607133563525072e-06,
"loss": 0.6657,
"step": 1310
},
{
"epoch": 1.7322834645669292,
"grad_norm": 1.2651877837014813,
"learning_rate": 4.531054973216648e-06,
"loss": 0.6601,
"step": 1320
},
{
"epoch": 1.7454068241469818,
"grad_norm": 1.2957136613776106,
"learning_rate": 4.455085764604653e-06,
"loss": 0.665,
"step": 1330
},
{
"epoch": 1.758530183727034,
"grad_norm": 1.2914149104267014,
"learning_rate": 4.3792436575505644e-06,
"loss": 0.6711,
"step": 1340
},
{
"epoch": 1.7716535433070866,
"grad_norm": 1.3087878612877044,
"learning_rate": 4.303546342269344e-06,
"loss": 0.6802,
"step": 1350
},
{
"epoch": 1.7847769028871392,
"grad_norm": 1.354110047538368,
"learning_rate": 4.228011475203191e-06,
"loss": 0.6639,
"step": 1360
},
{
"epoch": 1.7979002624671916,
"grad_norm": 1.264469178278457,
"learning_rate": 4.152656674903169e-06,
"loss": 0.6596,
"step": 1370
},
{
"epoch": 1.811023622047244,
"grad_norm": 1.355906590548092,
"learning_rate": 4.077499517919663e-06,
"loss": 0.6657,
"step": 1380
},
{
"epoch": 1.8241469816272966,
"grad_norm": 1.291362822136906,
"learning_rate": 4.002557534702639e-06,
"loss": 0.6616,
"step": 1390
},
{
"epoch": 1.8372703412073492,
"grad_norm": 1.3281942700420335,
"learning_rate": 3.927848205512659e-06,
"loss": 0.6641,
"step": 1400
},
{
"epoch": 1.8503937007874016,
"grad_norm": 1.2988138580887156,
"learning_rate": 3.853388956343604e-06,
"loss": 0.6673,
"step": 1410
},
{
"epoch": 1.863517060367454,
"grad_norm": 1.386424735985052,
"learning_rate": 3.779197154858044e-06,
"loss": 0.6509,
"step": 1420
},
{
"epoch": 1.8766404199475066,
"grad_norm": 1.335199751133558,
"learning_rate": 3.705290106336221e-06,
"loss": 0.6641,
"step": 1430
},
{
"epoch": 1.889763779527559,
"grad_norm": 1.2373191854244718,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.6632,
"step": 1440
},
{
"epoch": 1.9028871391076114,
"grad_norm": 1.254937848225745,
"learning_rate": 3.5583991531898276e-06,
"loss": 0.6586,
"step": 1450
},
{
"epoch": 1.916010498687664,
"grad_norm": 1.2586091439960605,
"learning_rate": 3.4854495109643207e-06,
"loss": 0.6555,
"step": 1460
},
{
"epoch": 1.9291338582677167,
"grad_norm": 1.2691195643845157,
"learning_rate": 3.412853138508947e-06,
"loss": 0.6652,
"step": 1470
},
{
"epoch": 1.942257217847769,
"grad_norm": 1.37789137126307,
"learning_rate": 3.340626968969215e-06,
"loss": 0.6578,
"step": 1480
},
{
"epoch": 1.9553805774278215,
"grad_norm": 1.3680845584870929,
"learning_rate": 3.2687878491405933e-06,
"loss": 0.6613,
"step": 1490
},
{
"epoch": 1.968503937007874,
"grad_norm": 1.344475700563475,
"learning_rate": 3.197352535538978e-06,
"loss": 0.6712,
"step": 1500
},
{
"epoch": 1.9816272965879265,
"grad_norm": 1.3678562268271839,
"learning_rate": 3.1263376904922318e-06,
"loss": 0.6584,
"step": 1510
},
{
"epoch": 1.9947506561679789,
"grad_norm": 1.346152280655493,
"learning_rate": 3.0557598782536914e-06,
"loss": 0.6551,
"step": 1520
},
{
"epoch": 2.0078740157480315,
"grad_norm": 1.2899162269306206,
"learning_rate": 2.9856355611385356e-06,
"loss": 0.6218,
"step": 1530
},
{
"epoch": 2.020997375328084,
"grad_norm": 1.34329640848473,
"learning_rate": 2.915981095683943e-06,
"loss": 0.5752,
"step": 1540
},
{
"epoch": 2.0341207349081363,
"grad_norm": 1.24625682130632,
"learning_rate": 2.846812728833931e-06,
"loss": 0.578,
"step": 1550
},
{
"epoch": 2.047244094488189,
"grad_norm": 1.3603958866479353,
"learning_rate": 2.778146594149732e-06,
"loss": 0.5909,
"step": 1560
},
{
"epoch": 2.0603674540682415,
"grad_norm": 1.3130248810501666,
"learning_rate": 2.7099987080466417e-06,
"loss": 0.5779,
"step": 1570
},
{
"epoch": 2.073490813648294,
"grad_norm": 1.2728229935348123,
"learning_rate": 2.64238496605817e-06,
"loss": 0.5874,
"step": 1580
},
{
"epoch": 2.0866141732283463,
"grad_norm": 1.3312388632136043,
"learning_rate": 2.5753211391284172e-06,
"loss": 0.5788,
"step": 1590
},
{
"epoch": 2.099737532808399,
"grad_norm": 1.2348457534841435,
"learning_rate": 2.5088228699334717e-06,
"loss": 0.5763,
"step": 1600
},
{
"epoch": 2.1128608923884515,
"grad_norm": 1.3038144088885084,
"learning_rate": 2.44290566923276e-06,
"loss": 0.5799,
"step": 1610
},
{
"epoch": 2.1259842519685037,
"grad_norm": 1.3164791802552103,
"learning_rate": 2.3775849122511442e-06,
"loss": 0.5769,
"step": 1620
},
{
"epoch": 2.1391076115485563,
"grad_norm": 1.291734568095305,
"learning_rate": 2.312875835092655e-06,
"loss": 0.5721,
"step": 1630
},
{
"epoch": 2.152230971128609,
"grad_norm": 1.4191185408274385,
"learning_rate": 2.248793531186647e-06,
"loss": 0.5768,
"step": 1640
},
{
"epoch": 2.1653543307086616,
"grad_norm": 1.18784718220649,
"learning_rate": 2.185352947767257e-06,
"loss": 0.579,
"step": 1650
},
{
"epoch": 2.1784776902887137,
"grad_norm": 1.2311199986468657,
"learning_rate": 2.1225688823869494e-06,
"loss": 0.5747,
"step": 1660
},
{
"epoch": 2.1916010498687664,
"grad_norm": 1.2447404349206936,
"learning_rate": 2.0604559794649793e-06,
"loss": 0.5734,
"step": 1670
},
{
"epoch": 2.204724409448819,
"grad_norm": 1.296578730192031,
"learning_rate": 1.999028726871576e-06,
"loss": 0.5755,
"step": 1680
},
{
"epoch": 2.2178477690288716,
"grad_norm": 1.2677498871223802,
"learning_rate": 1.9383014525486287e-06,
"loss": 0.5728,
"step": 1690
},
{
"epoch": 2.2309711286089238,
"grad_norm": 1.2522124484651282,
"learning_rate": 1.8782883211677044e-06,
"loss": 0.5713,
"step": 1700
},
{
"epoch": 2.2440944881889764,
"grad_norm": 1.3344150468049227,
"learning_rate": 1.8190033308261134e-06,
"loss": 0.5723,
"step": 1710
},
{
"epoch": 2.257217847769029,
"grad_norm": 1.2564213421314623,
"learning_rate": 1.7604603097818523e-06,
"loss": 0.5734,
"step": 1720
},
{
"epoch": 2.270341207349081,
"grad_norm": 1.3130336081169207,
"learning_rate": 1.7026729132281489e-06,
"loss": 0.5787,
"step": 1730
},
{
"epoch": 2.283464566929134,
"grad_norm": 1.1841727433165028,
"learning_rate": 1.6456546201083934e-06,
"loss": 0.5744,
"step": 1740
},
{
"epoch": 2.2965879265091864,
"grad_norm": 1.3367937537125874,
"learning_rate": 1.5894187299721535e-06,
"loss": 0.5707,
"step": 1750
},
{
"epoch": 2.309711286089239,
"grad_norm": 1.2242809577158813,
"learning_rate": 1.5339783598730568e-06,
"loss": 0.5692,
"step": 1760
},
{
"epoch": 2.322834645669291,
"grad_norm": 1.2513286468060605,
"learning_rate": 1.4793464413092161e-06,
"loss": 0.574,
"step": 1770
},
{
"epoch": 2.335958005249344,
"grad_norm": 1.259680151618769,
"learning_rate": 1.4255357172069727e-06,
"loss": 0.5786,
"step": 1780
},
{
"epoch": 2.3490813648293964,
"grad_norm": 1.2315488553903935,
"learning_rate": 1.3725587389485812e-06,
"loss": 0.5671,
"step": 1790
},
{
"epoch": 2.362204724409449,
"grad_norm": 1.2676834549720328,
"learning_rate": 1.3204278634446028e-06,
"loss": 0.5769,
"step": 1800
},
{
"epoch": 2.3753280839895012,
"grad_norm": 1.3389962065747927,
"learning_rate": 1.2691552502516414e-06,
"loss": 0.5777,
"step": 1810
},
{
"epoch": 2.388451443569554,
"grad_norm": 1.2653499838227833,
"learning_rate": 1.2187528587361313e-06,
"loss": 0.5723,
"step": 1820
},
{
"epoch": 2.4015748031496065,
"grad_norm": 1.1672991280648781,
"learning_rate": 1.1692324452847992e-06,
"loss": 0.5699,
"step": 1830
},
{
"epoch": 2.4146981627296586,
"grad_norm": 1.2893411655718872,
"learning_rate": 1.1206055605624777e-06,
"loss": 0.5732,
"step": 1840
},
{
"epoch": 2.4278215223097113,
"grad_norm": 1.290721907760722,
"learning_rate": 1.0728835468179183e-06,
"loss": 0.5662,
"step": 1850
},
{
"epoch": 2.440944881889764,
"grad_norm": 1.217997593835658,
"learning_rate": 1.0260775352381934e-06,
"loss": 0.5727,
"step": 1860
},
{
"epoch": 2.454068241469816,
"grad_norm": 1.2440219130503634,
"learning_rate": 9.801984433523483e-07,
"loss": 0.5694,
"step": 1870
},
{
"epoch": 2.4671916010498687,
"grad_norm": 1.2592327456759795,
"learning_rate": 9.352569724848715e-07,
"loss": 0.5735,
"step": 1880
},
{
"epoch": 2.4803149606299213,
"grad_norm": 1.226880374562509,
"learning_rate": 8.912636052596207e-07,
"loss": 0.5816,
"step": 1890
},
{
"epoch": 2.493438320209974,
"grad_norm": 1.2083486104350505,
"learning_rate": 8.482286031547282e-07,
"loss": 0.5638,
"step": 1900
},
{
"epoch": 2.506561679790026,
"grad_norm": 1.14544648130176,
"learning_rate": 8.061620041091172e-07,
"loss": 0.5735,
"step": 1910
},
{
"epoch": 2.5196850393700787,
"grad_norm": 1.2633309998706563,
"learning_rate": 7.650736201811348e-07,
"loss": 0.5768,
"step": 1920
},
{
"epoch": 2.5328083989501313,
"grad_norm": 1.2681452803808053,
"learning_rate": 7.249730352599e-07,
"loss": 0.575,
"step": 1930
},
{
"epoch": 2.545931758530184,
"grad_norm": 1.2826031568874492,
"learning_rate": 6.858696028298412e-07,
"loss": 0.5687,
"step": 1940
},
{
"epoch": 2.559055118110236,
"grad_norm": 1.2627571245177376,
"learning_rate": 6.477724437889988e-07,
"loss": 0.564,
"step": 1950
},
{
"epoch": 2.5721784776902887,
"grad_norm": 1.1880367657533304,
"learning_rate": 6.106904443215639e-07,
"loss": 0.5709,
"step": 1960
},
{
"epoch": 2.5853018372703414,
"grad_norm": 1.234319028151925,
"learning_rate": 5.746322538251814e-07,
"loss": 0.5731,
"step": 1970
},
{
"epoch": 2.5984251968503935,
"grad_norm": 1.2520017746520362,
"learning_rate": 5.396062828934634e-07,
"loss": 0.5765,
"step": 1980
},
{
"epoch": 2.611548556430446,
"grad_norm": 1.2282113090629896,
"learning_rate": 5.056207013542131e-07,
"loss": 0.5723,
"step": 1990
},
{
"epoch": 2.6246719160104988,
"grad_norm": 1.250069721424398,
"learning_rate": 4.7268343636381774e-07,
"loss": 0.5681,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2286,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1100109887569920.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}