Qwen3.5-2B-metamath / checkpoint-2865 /trainer_state.json
WhiteGiverPlus's picture
Add files using upload-large-folder tool
bde1506 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 250,
"global_step": 2865,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010479434110558029,
"grad_norm": 0.19915591180324554,
"learning_rate": 1.0465116279069768e-05,
"loss": 1.1350045204162598,
"step": 10
},
{
"epoch": 0.020958868221116058,
"grad_norm": 0.18158815801143646,
"learning_rate": 2.2093023255813955e-05,
"loss": 1.0580164909362793,
"step": 20
},
{
"epoch": 0.03143830233167409,
"grad_norm": 0.16481591761112213,
"learning_rate": 3.372093023255814e-05,
"loss": 0.9252842903137207,
"step": 30
},
{
"epoch": 0.041917736442232116,
"grad_norm": 0.15599584579467773,
"learning_rate": 4.5348837209302326e-05,
"loss": 0.8342072486877441,
"step": 40
},
{
"epoch": 0.05239717055279015,
"grad_norm": 0.1804327368736267,
"learning_rate": 5.697674418604652e-05,
"loss": 0.7955524921417236,
"step": 50
},
{
"epoch": 0.06287660466334818,
"grad_norm": 0.16934047639369965,
"learning_rate": 6.86046511627907e-05,
"loss": 0.7358035087585449,
"step": 60
},
{
"epoch": 0.07335603877390622,
"grad_norm": 0.2234930843114853,
"learning_rate": 8.023255813953489e-05,
"loss": 0.6985861301422119,
"step": 70
},
{
"epoch": 0.08383547288446423,
"grad_norm": 0.16290400922298431,
"learning_rate": 9.186046511627907e-05,
"loss": 0.599607515335083,
"step": 80
},
{
"epoch": 0.09431490699502226,
"grad_norm": 0.1660464107990265,
"learning_rate": 9.999971245570617e-05,
"loss": 0.5886398315429687,
"step": 90
},
{
"epoch": 0.1047943411055803,
"grad_norm": 0.16978025436401367,
"learning_rate": 9.999460064915317e-05,
"loss": 0.5450529098510742,
"step": 100
},
{
"epoch": 0.11527377521613832,
"grad_norm": 0.21447990834712982,
"learning_rate": 9.998309972134645e-05,
"loss": 0.5072262287139893,
"step": 110
},
{
"epoch": 0.12575320932669637,
"grad_norm": 0.17418669164180756,
"learning_rate": 9.996521114206116e-05,
"loss": 0.49445347785949706,
"step": 120
},
{
"epoch": 0.13623264343725439,
"grad_norm": 0.22226351499557495,
"learning_rate": 9.994093719739023e-05,
"loss": 0.47142682075500486,
"step": 130
},
{
"epoch": 0.14671207754781243,
"grad_norm": 0.1745530068874359,
"learning_rate": 9.991028098945215e-05,
"loss": 0.46663532257080076,
"step": 140
},
{
"epoch": 0.15719151165837045,
"grad_norm": 0.17074695229530334,
"learning_rate": 9.987324643599459e-05,
"loss": 0.4508847236633301,
"step": 150
},
{
"epoch": 0.16767094576892846,
"grad_norm": 0.13428406417369843,
"learning_rate": 9.982983826989367e-05,
"loss": 0.40740265846252444,
"step": 160
},
{
"epoch": 0.1781503798794865,
"grad_norm": 0.17766578495502472,
"learning_rate": 9.978006203854918e-05,
"loss": 0.3998516321182251,
"step": 170
},
{
"epoch": 0.18862981399004453,
"grad_norm": 0.1672629565000534,
"learning_rate": 9.972392410317562e-05,
"loss": 0.41658673286437986,
"step": 180
},
{
"epoch": 0.19910924810060257,
"grad_norm": 0.1333673745393753,
"learning_rate": 9.96614316379892e-05,
"loss": 0.37024455070495604,
"step": 190
},
{
"epoch": 0.2095886822111606,
"grad_norm": 0.18037110567092896,
"learning_rate": 9.959259262929113e-05,
"loss": 0.35086841583251954,
"step": 200
},
{
"epoch": 0.22006811632171863,
"grad_norm": 0.14616410434246063,
"learning_rate": 9.951741587444683e-05,
"loss": 0.37918968200683595,
"step": 210
},
{
"epoch": 0.23054755043227665,
"grad_norm": 0.14523574709892273,
"learning_rate": 9.943591098076184e-05,
"loss": 0.32804527282714846,
"step": 220
},
{
"epoch": 0.2410269845428347,
"grad_norm": 0.14667049050331116,
"learning_rate": 9.934808836425393e-05,
"loss": 0.3480507850646973,
"step": 230
},
{
"epoch": 0.25150641865339274,
"grad_norm": 0.18156558275222778,
"learning_rate": 9.925395924832198e-05,
"loss": 0.3300448179244995,
"step": 240
},
{
"epoch": 0.26198585276395076,
"grad_norm": 0.13806430995464325,
"learning_rate": 9.91535356623117e-05,
"loss": 0.3127591609954834,
"step": 250
},
{
"epoch": 0.26198585276395076,
"eval_loss": 0.3132782578468323,
"eval_runtime": 94.8848,
"eval_samples_per_second": 3.278,
"eval_steps_per_second": 3.278,
"step": 250
},
{
"epoch": 0.27246528687450877,
"grad_norm": 0.17205959558486938,
"learning_rate": 9.904683043997835e-05,
"loss": 0.3306673288345337,
"step": 260
},
{
"epoch": 0.2829447209850668,
"grad_norm": 0.12620031833648682,
"learning_rate": 9.893385721784656e-05,
"loss": 0.3011106729507446,
"step": 270
},
{
"epoch": 0.29342415509562486,
"grad_norm": 0.11466006934642792,
"learning_rate": 9.881463043346768e-05,
"loss": 0.2951968669891357,
"step": 280
},
{
"epoch": 0.3039035892061829,
"grad_norm": 0.1671207845211029,
"learning_rate": 9.868916532357475e-05,
"loss": 0.2910990953445435,
"step": 290
},
{
"epoch": 0.3143830233167409,
"grad_norm": 0.1683349907398224,
"learning_rate": 9.855747792213521e-05,
"loss": 0.31409192085266113,
"step": 300
},
{
"epoch": 0.3248624574272989,
"grad_norm": 0.12934699654579163,
"learning_rate": 9.84195850583019e-05,
"loss": 0.27755858898162844,
"step": 310
},
{
"epoch": 0.33534189153785693,
"grad_norm": 0.13784605264663696,
"learning_rate": 9.827550435426234e-05,
"loss": 0.2809821605682373,
"step": 320
},
{
"epoch": 0.345821325648415,
"grad_norm": 0.18590271472930908,
"learning_rate": 9.812525422298664e-05,
"loss": 0.28698866367340087,
"step": 330
},
{
"epoch": 0.356300759758973,
"grad_norm": 0.1704522967338562,
"learning_rate": 9.796885386587447e-05,
"loss": 0.250814414024353,
"step": 340
},
{
"epoch": 0.36678019386953103,
"grad_norm": 0.1316167265176773,
"learning_rate": 9.780632327030112e-05,
"loss": 0.25458922386169436,
"step": 350
},
{
"epoch": 0.37725962798008905,
"grad_norm": 0.16226200759410858,
"learning_rate": 9.763768320706319e-05,
"loss": 0.26563262939453125,
"step": 360
},
{
"epoch": 0.3877390620906471,
"grad_norm": 0.1297195851802826,
"learning_rate": 9.746295522772424e-05,
"loss": 0.2632328748703003,
"step": 370
},
{
"epoch": 0.39821849620120514,
"grad_norm": 0.1286139190196991,
"learning_rate": 9.728216166186049e-05,
"loss": 0.2624588251113892,
"step": 380
},
{
"epoch": 0.40869793031176316,
"grad_norm": 0.1587965339422226,
"learning_rate": 9.709532561420725e-05,
"loss": 0.24741590023040771,
"step": 390
},
{
"epoch": 0.4191773644223212,
"grad_norm": 0.11963177472352982,
"learning_rate": 9.690247096170615e-05,
"loss": 0.22777397632598878,
"step": 400
},
{
"epoch": 0.42965679853287925,
"grad_norm": 0.13638927042484283,
"learning_rate": 9.670362235045387e-05,
"loss": 0.23324952125549317,
"step": 410
},
{
"epoch": 0.44013623264343726,
"grad_norm": 0.1514088362455368,
"learning_rate": 9.649880519255232e-05,
"loss": 0.2505915880203247,
"step": 420
},
{
"epoch": 0.4506156667539953,
"grad_norm": 0.10994207113981247,
"learning_rate": 9.62880456628612e-05,
"loss": 0.2078850269317627,
"step": 430
},
{
"epoch": 0.4610951008645533,
"grad_norm": 0.11983369290828705,
"learning_rate": 9.607137069565288e-05,
"loss": 0.21452484130859376,
"step": 440
},
{
"epoch": 0.47157453497511137,
"grad_norm": 0.12684305012226105,
"learning_rate": 9.58488079811703e-05,
"loss": 0.22002685070037842,
"step": 450
},
{
"epoch": 0.4820539690856694,
"grad_norm": 0.16841623187065125,
"learning_rate": 9.562038596208828e-05,
"loss": 0.21405396461486817,
"step": 460
},
{
"epoch": 0.4925334031962274,
"grad_norm": 0.1498555839061737,
"learning_rate": 9.538613382987865e-05,
"loss": 0.20534911155700683,
"step": 470
},
{
"epoch": 0.5030128373067855,
"grad_norm": 0.13913628458976746,
"learning_rate": 9.514608152107974e-05,
"loss": 0.22248730659484864,
"step": 480
},
{
"epoch": 0.5134922714173434,
"grad_norm": 0.14408951997756958,
"learning_rate": 9.490025971347047e-05,
"loss": 0.214866042137146,
"step": 490
},
{
"epoch": 0.5239717055279015,
"grad_norm": 0.1649770438671112,
"learning_rate": 9.464869982215001e-05,
"loss": 0.19965900182724,
"step": 500
},
{
"epoch": 0.5239717055279015,
"eval_loss": 0.19267401099205017,
"eval_runtime": 95.3374,
"eval_samples_per_second": 3.262,
"eval_steps_per_second": 3.262,
"step": 500
},
{
"epoch": 0.5344511396384595,
"grad_norm": 0.1305568665266037,
"learning_rate": 9.439143399552291e-05,
"loss": 0.21112546920776368,
"step": 510
},
{
"epoch": 0.5449305737490175,
"grad_norm": 0.11998175084590912,
"learning_rate": 9.412849511119074e-05,
"loss": 0.21422922611236572,
"step": 520
},
{
"epoch": 0.5554100078595756,
"grad_norm": 0.15220341086387634,
"learning_rate": 9.385991677175046e-05,
"loss": 0.20999882221221924,
"step": 530
},
{
"epoch": 0.5658894419701336,
"grad_norm": 0.13170023262500763,
"learning_rate": 9.358573330050004e-05,
"loss": 0.20208392143249512,
"step": 540
},
{
"epoch": 0.5763688760806917,
"grad_norm": 0.10457764565944672,
"learning_rate": 9.330597973705219e-05,
"loss": 0.1908803701400757,
"step": 550
},
{
"epoch": 0.5868483101912497,
"grad_norm": 0.12568537890911102,
"learning_rate": 9.302069183285637e-05,
"loss": 0.19316340684890748,
"step": 560
},
{
"epoch": 0.5973277443018077,
"grad_norm": 0.14824528992176056,
"learning_rate": 9.272990604662988e-05,
"loss": 0.18987581729888917,
"step": 570
},
{
"epoch": 0.6078071784123658,
"grad_norm": 0.14521734416484833,
"learning_rate": 9.243365953969861e-05,
"loss": 0.19232832193374633,
"step": 580
},
{
"epoch": 0.6182866125229237,
"grad_norm": 0.1335408091545105,
"learning_rate": 9.213199017124793e-05,
"loss": 0.1758212924003601,
"step": 590
},
{
"epoch": 0.6287660466334818,
"grad_norm": 0.11143071949481964,
"learning_rate": 9.182493649348447e-05,
"loss": 0.19117680788040162,
"step": 600
},
{
"epoch": 0.6392454807440399,
"grad_norm": 0.14789296686649323,
"learning_rate": 9.151253774670921e-05,
"loss": 0.184559965133667,
"step": 610
},
{
"epoch": 0.6497249148545978,
"grad_norm": 0.10541336238384247,
"learning_rate": 9.119483385430283e-05,
"loss": 0.1720304846763611,
"step": 620
},
{
"epoch": 0.6602043489651559,
"grad_norm": 0.12105975300073624,
"learning_rate": 9.087186541762358e-05,
"loss": 0.17654836177825928,
"step": 630
},
{
"epoch": 0.6706837830757139,
"grad_norm": 0.13114669919013977,
"learning_rate": 9.054367371081858e-05,
"loss": 0.1696592688560486,
"step": 640
},
{
"epoch": 0.6811632171862719,
"grad_norm": 0.13745592534542084,
"learning_rate": 9.021030067554919e-05,
"loss": 0.15404462814331055,
"step": 650
},
{
"epoch": 0.69164265129683,
"grad_norm": 0.15927442908287048,
"learning_rate": 8.987178891563094e-05,
"loss": 0.17024366855621337,
"step": 660
},
{
"epoch": 0.702122085407388,
"grad_norm": 0.13737429678440094,
"learning_rate": 8.952818169158903e-05,
"loss": 0.1602048397064209,
"step": 670
},
{
"epoch": 0.712601519517946,
"grad_norm": 0.13941751420497894,
"learning_rate": 8.91795229151297e-05,
"loss": 0.18057082891464232,
"step": 680
},
{
"epoch": 0.7230809536285041,
"grad_norm": 0.14242954552173615,
"learning_rate": 8.882585714352856e-05,
"loss": 0.14863334894180297,
"step": 690
},
{
"epoch": 0.7335603877390621,
"grad_norm": 0.15553542971611023,
"learning_rate": 8.846722957393626e-05,
"loss": 0.15701137781143187,
"step": 700
},
{
"epoch": 0.7440398218496201,
"grad_norm": 0.12901411950588226,
"learning_rate": 8.810368603760249e-05,
"loss": 0.15571318864822387,
"step": 710
},
{
"epoch": 0.7545192559601781,
"grad_norm": 0.13449430465698242,
"learning_rate": 8.773527299401902e-05,
"loss": 0.16418551206588744,
"step": 720
},
{
"epoch": 0.7649986900707362,
"grad_norm": 0.10630270838737488,
"learning_rate": 8.736203752498218e-05,
"loss": 0.16800801753997802,
"step": 730
},
{
"epoch": 0.7754781241812942,
"grad_norm": 0.11299935728311539,
"learning_rate": 8.698402732857611e-05,
"loss": 0.15700833797454833,
"step": 740
},
{
"epoch": 0.7859575582918522,
"grad_norm": 0.11920930445194244,
"learning_rate": 8.660129071307707e-05,
"loss": 0.15091001987457275,
"step": 750
},
{
"epoch": 0.7859575582918522,
"eval_loss": 0.1356429010629654,
"eval_runtime": 94.0557,
"eval_samples_per_second": 3.307,
"eval_steps_per_second": 3.307,
"step": 750
},
{
"epoch": 0.7964369924024103,
"grad_norm": 0.13870343565940857,
"learning_rate": 8.621387659077986e-05,
"loss": 0.1422027826309204,
"step": 760
},
{
"epoch": 0.8069164265129684,
"grad_norm": 0.12753477692604065,
"learning_rate": 8.582183447174697e-05,
"loss": 0.142450213432312,
"step": 770
},
{
"epoch": 0.8173958606235263,
"grad_norm": 0.11877496540546417,
"learning_rate": 8.542521445748141e-05,
"loss": 0.15361062288284302,
"step": 780
},
{
"epoch": 0.8278752947340844,
"grad_norm": 0.1200249195098877,
"learning_rate": 8.502406723452392e-05,
"loss": 0.14647477865219116,
"step": 790
},
{
"epoch": 0.8383547288446423,
"grad_norm": 0.12913794815540314,
"learning_rate": 8.461844406797543e-05,
"loss": 0.1591552734375,
"step": 800
},
{
"epoch": 0.8488341629552004,
"grad_norm": 0.17270176112651825,
"learning_rate": 8.420839679494558e-05,
"loss": 0.1495436668395996,
"step": 810
},
{
"epoch": 0.8593135970657585,
"grad_norm": 0.15545596182346344,
"learning_rate": 8.379397781792808e-05,
"loss": 0.15377395153045653,
"step": 820
},
{
"epoch": 0.8697930311763165,
"grad_norm": 0.12941111624240875,
"learning_rate": 8.337524009810395e-05,
"loss": 0.14733861684799193,
"step": 830
},
{
"epoch": 0.8802724652868745,
"grad_norm": 0.13152749836444855,
"learning_rate": 8.295223714857319e-05,
"loss": 0.13980752229690552,
"step": 840
},
{
"epoch": 0.8907518993974325,
"grad_norm": 0.11208872497081757,
"learning_rate": 8.252502302751612e-05,
"loss": 0.12019969224929809,
"step": 850
},
{
"epoch": 0.9012313335079906,
"grad_norm": 0.11118603497743607,
"learning_rate": 8.209365233128482e-05,
"loss": 0.13822466135025024,
"step": 860
},
{
"epoch": 0.9117107676185486,
"grad_norm": 0.11705653369426727,
"learning_rate": 8.165818018742605e-05,
"loss": 0.1439664840698242,
"step": 870
},
{
"epoch": 0.9221902017291066,
"grad_norm": 0.08817730098962784,
"learning_rate": 8.121866224763606e-05,
"loss": 0.13380355834960939,
"step": 880
},
{
"epoch": 0.9326696358396647,
"grad_norm": 0.1092257872223854,
"learning_rate": 8.077515468064851e-05,
"loss": 0.12982802391052245,
"step": 890
},
{
"epoch": 0.9431490699502227,
"grad_norm": 0.12680962681770325,
"learning_rate": 8.032771416505647e-05,
"loss": 0.1489071011543274,
"step": 900
},
{
"epoch": 0.9536285040607807,
"grad_norm": 0.11953219771385193,
"learning_rate": 7.987639788206888e-05,
"loss": 0.14020267724990845,
"step": 910
},
{
"epoch": 0.9641079381713388,
"grad_norm": 0.1041467934846878,
"learning_rate": 7.942126350820318e-05,
"loss": 0.1439213275909424,
"step": 920
},
{
"epoch": 0.9745873722818967,
"grad_norm": 0.1277916431427002,
"learning_rate": 7.896236920791442e-05,
"loss": 0.1468779683113098,
"step": 930
},
{
"epoch": 0.9850668063924548,
"grad_norm": 0.11245205253362656,
"learning_rate": 7.849977362616201e-05,
"loss": 0.12012372016906739,
"step": 940
},
{
"epoch": 0.9955462405030129,
"grad_norm": 0.12230483442544937,
"learning_rate": 7.803353588091522e-05,
"loss": 0.1488939881324768,
"step": 950
},
{
"epoch": 1.005239717055279,
"grad_norm": 0.14185865223407745,
"learning_rate": 7.7563715555598e-05,
"loss": 0.11488113403320313,
"step": 960
},
{
"epoch": 1.015719151165837,
"grad_norm": 0.10545773804187775,
"learning_rate": 7.709037269147459e-05,
"loss": 0.10712549686431885,
"step": 970
},
{
"epoch": 1.026198585276395,
"grad_norm": 0.10376274585723877,
"learning_rate": 7.661356777997631e-05,
"loss": 0.11428828239440918,
"step": 980
},
{
"epoch": 1.0366780193869531,
"grad_norm": 0.09950564056634903,
"learning_rate": 7.613336175497111e-05,
"loss": 0.09823058247566223,
"step": 990
},
{
"epoch": 1.0471574534975112,
"grad_norm": 0.10412753373384476,
"learning_rate": 7.564981598497643e-05,
"loss": 0.1106558084487915,
"step": 1000
},
{
"epoch": 1.0471574534975112,
"eval_loss": 0.11185819655656815,
"eval_runtime": 93.808,
"eval_samples_per_second": 3.315,
"eval_steps_per_second": 3.315,
"step": 1000
},
{
"epoch": 1.057636887608069,
"grad_norm": 0.10430868715047836,
"learning_rate": 7.516299226531645e-05,
"loss": 0.11168640851974487,
"step": 1010
},
{
"epoch": 1.0681163217186271,
"grad_norm": 0.09646806865930557,
"learning_rate": 7.467295281022501e-05,
"loss": 0.10711305141448975,
"step": 1020
},
{
"epoch": 1.0785957558291852,
"grad_norm": 0.13060614466667175,
"learning_rate": 7.417976024489474e-05,
"loss": 0.10001810789108276,
"step": 1030
},
{
"epoch": 1.0890751899397433,
"grad_norm": 0.10389085114002228,
"learning_rate": 7.368347759747393e-05,
"loss": 0.11893858909606933,
"step": 1040
},
{
"epoch": 1.0995546240503014,
"grad_norm": 0.11291550099849701,
"learning_rate": 7.318416829101164e-05,
"loss": 0.1079628586769104,
"step": 1050
},
{
"epoch": 1.1100340581608594,
"grad_norm": 0.10372598469257355,
"learning_rate": 7.268189613535255e-05,
"loss": 0.10332397222518921,
"step": 1060
},
{
"epoch": 1.1205134922714173,
"grad_norm": 0.12971536815166473,
"learning_rate": 7.217672531898225e-05,
"loss": 0.10804877281188965,
"step": 1070
},
{
"epoch": 1.1309929263819753,
"grad_norm": 0.10902425646781921,
"learning_rate": 7.166872040082431e-05,
"loss": 0.09947454929351807,
"step": 1080
},
{
"epoch": 1.1414723604925334,
"grad_norm": 0.09305932372808456,
"learning_rate": 7.11579463019897e-05,
"loss": 0.09406971335411071,
"step": 1090
},
{
"epoch": 1.1519517946030915,
"grad_norm": 0.11485275626182556,
"learning_rate": 7.064446829748034e-05,
"loss": 0.09943979978561401,
"step": 1100
},
{
"epoch": 1.1624312287136496,
"grad_norm": 0.09556467831134796,
"learning_rate": 7.0128352007847e-05,
"loss": 0.10862170457839966,
"step": 1110
},
{
"epoch": 1.1729106628242074,
"grad_norm": 0.11937833577394485,
"learning_rate": 6.96096633908034e-05,
"loss": 0.10385221242904663,
"step": 1120
},
{
"epoch": 1.1833900969347655,
"grad_norm": 0.11560507863759995,
"learning_rate": 6.908846873279691e-05,
"loss": 0.09252402186393738,
"step": 1130
},
{
"epoch": 1.1938695310453236,
"grad_norm": 0.11119654029607773,
"learning_rate": 6.856483464053758e-05,
"loss": 0.09637172818183899,
"step": 1140
},
{
"epoch": 1.2043489651558816,
"grad_norm": 0.11722644418478012,
"learning_rate": 6.803882803248585e-05,
"loss": 0.09078751802444458,
"step": 1150
},
{
"epoch": 1.2148283992664397,
"grad_norm": 0.10487739741802216,
"learning_rate": 6.751051613030082e-05,
"loss": 0.10334972143173218,
"step": 1160
},
{
"epoch": 1.2253078333769976,
"grad_norm": 0.10202383995056152,
"learning_rate": 6.697996645024937e-05,
"loss": 0.08661433458328247,
"step": 1170
},
{
"epoch": 1.2357872674875556,
"grad_norm": 0.11801143735647202,
"learning_rate": 6.644724679457804e-05,
"loss": 0.0997927188873291,
"step": 1180
},
{
"epoch": 1.2462667015981137,
"grad_norm": 0.10949107259511948,
"learning_rate": 6.591242524284802e-05,
"loss": 0.0977592945098877,
"step": 1190
},
{
"epoch": 1.2567461357086718,
"grad_norm": 0.10221222043037415,
"learning_rate": 6.537557014323487e-05,
"loss": 0.0970361053943634,
"step": 1200
},
{
"epoch": 1.2672255698192298,
"grad_norm": 0.10554748773574829,
"learning_rate": 6.483675010379393e-05,
"loss": 0.09007551074028015,
"step": 1210
},
{
"epoch": 1.2777050039297877,
"grad_norm": 0.11625627428293228,
"learning_rate": 6.429603398369242e-05,
"loss": 0.08734490275382996,
"step": 1220
},
{
"epoch": 1.2881844380403458,
"grad_norm": 0.10624277591705322,
"learning_rate": 6.37534908844095e-05,
"loss": 0.09858485460281372,
"step": 1230
},
{
"epoch": 1.2986638721509038,
"grad_norm": 0.10184557735919952,
"learning_rate": 6.320919014090534e-05,
"loss": 0.09335023164749146,
"step": 1240
},
{
"epoch": 1.309143306261462,
"grad_norm": 0.10787283629179001,
"learning_rate": 6.266320131276051e-05,
"loss": 0.08665563464164734,
"step": 1250
},
{
"epoch": 1.309143306261462,
"eval_loss": 0.08951585739850998,
"eval_runtime": 94.0567,
"eval_samples_per_second": 3.307,
"eval_steps_per_second": 3.307,
"step": 1250
},
{
"epoch": 1.31962274037202,
"grad_norm": 0.10836981981992722,
"learning_rate": 6.211559417528631e-05,
"loss": 0.0933380126953125,
"step": 1260
},
{
"epoch": 1.3301021744825778,
"grad_norm": 0.1397171914577484,
"learning_rate": 6.156643871060795e-05,
"loss": 0.09835371971130372,
"step": 1270
},
{
"epoch": 1.340581608593136,
"grad_norm": 0.11242218315601349,
"learning_rate": 6.101580509872097e-05,
"loss": 0.09398673176765442,
"step": 1280
},
{
"epoch": 1.351061042703694,
"grad_norm": 0.10235017538070679,
"learning_rate": 6.0463763708522536e-05,
"loss": 0.10350929498672486,
"step": 1290
},
{
"epoch": 1.361540476814252,
"grad_norm": 0.09327106177806854,
"learning_rate": 5.99103850888186e-05,
"loss": 0.09580238461494446,
"step": 1300
},
{
"epoch": 1.3720199109248101,
"grad_norm": 0.12995658814907074,
"learning_rate": 5.9355739959307976e-05,
"loss": 0.08437412977218628,
"step": 1310
},
{
"epoch": 1.382499345035368,
"grad_norm": 0.11962983757257462,
"learning_rate": 5.879989920154466e-05,
"loss": 0.08409937620162963,
"step": 1320
},
{
"epoch": 1.392978779145926,
"grad_norm": 0.09431737661361694,
"learning_rate": 5.824293384987941e-05,
"loss": 0.09504773020744324,
"step": 1330
},
{
"epoch": 1.4034582132564841,
"grad_norm": 0.13824374973773956,
"learning_rate": 5.768491508238188e-05,
"loss": 0.09193333983421326,
"step": 1340
},
{
"epoch": 1.4139376473670422,
"grad_norm": 0.10595858097076416,
"learning_rate": 5.712591421174422e-05,
"loss": 0.08976472616195678,
"step": 1350
},
{
"epoch": 1.4244170814776003,
"grad_norm": 0.09911809861660004,
"learning_rate": 5.6566002676167725e-05,
"loss": 0.07597061395645141,
"step": 1360
},
{
"epoch": 1.4348965155881581,
"grad_norm": 0.09723466634750366,
"learning_rate": 5.60052520302332e-05,
"loss": 0.10513757467269898,
"step": 1370
},
{
"epoch": 1.4453759496987162,
"grad_norm": 0.11331687867641449,
"learning_rate": 5.5443733935756615e-05,
"loss": 0.09019948840141297,
"step": 1380
},
{
"epoch": 1.4558553838092743,
"grad_norm": 0.13363589346408844,
"learning_rate": 5.4881520152630886e-05,
"loss": 0.08314153552055359,
"step": 1390
},
{
"epoch": 1.4663348179198323,
"grad_norm": 0.14111892879009247,
"learning_rate": 5.4318682529655404e-05,
"loss": 0.07892010807991028,
"step": 1400
},
{
"epoch": 1.4768142520303904,
"grad_norm": 0.13948485255241394,
"learning_rate": 5.3755292995353913e-05,
"loss": 0.0840128481388092,
"step": 1410
},
{
"epoch": 1.4872936861409483,
"grad_norm": 0.12535949051380157,
"learning_rate": 5.31914235487823e-05,
"loss": 0.07869629859924317,
"step": 1420
},
{
"epoch": 1.4977731202515066,
"grad_norm": 0.10041694343090057,
"learning_rate": 5.2627146250327484e-05,
"loss": 0.08074848055839538,
"step": 1430
},
{
"epoch": 1.5082525543620644,
"grad_norm": 0.10112891346216202,
"learning_rate": 5.2062533212498275e-05,
"loss": 0.0860810935497284,
"step": 1440
},
{
"epoch": 1.5187319884726225,
"grad_norm": 0.11297477036714554,
"learning_rate": 5.149765659070973e-05,
"loss": 0.08794642686843872,
"step": 1450
},
{
"epoch": 1.5292114225831805,
"grad_norm": 0.10511091351509094,
"learning_rate": 5.0932588574061945e-05,
"loss": 0.07854819297790527,
"step": 1460
},
{
"epoch": 1.5396908566937384,
"grad_norm": 0.09333530068397522,
"learning_rate": 5.036740137611453e-05,
"loss": 0.08821435570716858,
"step": 1470
},
{
"epoch": 1.5501702908042967,
"grad_norm": 0.11480343341827393,
"learning_rate": 4.980216722565804e-05,
"loss": 0.08062278628349304,
"step": 1480
},
{
"epoch": 1.5606497249148545,
"grad_norm": 0.08406255394220352,
"learning_rate": 4.923695835748338e-05,
"loss": 0.0940588355064392,
"step": 1490
},
{
"epoch": 1.5711291590254126,
"grad_norm": 0.12927693128585815,
"learning_rate": 4.8671847003150447e-05,
"loss": 0.0775177538394928,
"step": 1500
},
{
"epoch": 1.5711291590254126,
"eval_loss": 0.07877222448587418,
"eval_runtime": 34.4389,
"eval_samples_per_second": 9.03,
"eval_steps_per_second": 9.03,
"step": 1500
},
{
"epoch": 1.5816085931359707,
"grad_norm": 0.1255076378583908,
"learning_rate": 4.810690538175728e-05,
"loss": 0.09362970590591431,
"step": 1510
},
{
"epoch": 1.5920880272465285,
"grad_norm": 0.1326853185892105,
"learning_rate": 4.754220569071068e-05,
"loss": 0.08364834189414978,
"step": 1520
},
{
"epoch": 1.6025674613570868,
"grad_norm": 0.10229979455471039,
"learning_rate": 4.697782009649962e-05,
"loss": 0.0725843846797943,
"step": 1530
},
{
"epoch": 1.6130468954676447,
"grad_norm": 0.11407258361577988,
"learning_rate": 4.641382072547272e-05,
"loss": 0.07566151022911072,
"step": 1540
},
{
"epoch": 1.6235263295782028,
"grad_norm": 0.09398165345191956,
"learning_rate": 4.585027965462075e-05,
"loss": 0.087736576795578,
"step": 1550
},
{
"epoch": 1.6340057636887608,
"grad_norm": 0.11289424449205399,
"learning_rate": 4.528726890236544e-05,
"loss": 0.08366051316261292,
"step": 1560
},
{
"epoch": 1.6444851977993187,
"grad_norm": 0.09478718787431717,
"learning_rate": 4.4724860419355746e-05,
"loss": 0.0885531723499298,
"step": 1570
},
{
"epoch": 1.654964631909877,
"grad_norm": 0.09163404256105423,
"learning_rate": 4.416312607927295e-05,
"loss": 0.08392030596733094,
"step": 1580
},
{
"epoch": 1.6654440660204348,
"grad_norm": 0.11422222852706909,
"learning_rate": 4.360213766964542e-05,
"loss": 0.08059985041618348,
"step": 1590
},
{
"epoch": 1.675923500130993,
"grad_norm": 0.08131479471921921,
"learning_rate": 4.304196688267438e-05,
"loss": 0.07613803148269653,
"step": 1600
},
{
"epoch": 1.686402934241551,
"grad_norm": 0.09615079313516617,
"learning_rate": 4.248268530607199e-05,
"loss": 0.07764078378677368,
"step": 1610
},
{
"epoch": 1.696882368352109,
"grad_norm": 0.09730526059865952,
"learning_rate": 4.192436441391271e-05,
"loss": 0.07644452452659607,
"step": 1620
},
{
"epoch": 1.707361802462667,
"grad_norm": 0.09649327397346497,
"learning_rate": 4.136707555749907e-05,
"loss": 0.07866159081459045,
"step": 1630
},
{
"epoch": 1.717841236573225,
"grad_norm": 0.11804413050413132,
"learning_rate": 4.0810889956243415e-05,
"loss": 0.06996130347251892,
"step": 1640
},
{
"epoch": 1.728320670683783,
"grad_norm": 0.09874672442674637,
"learning_rate": 4.025587868856622e-05,
"loss": 0.07877404093742371,
"step": 1650
},
{
"epoch": 1.738800104794341,
"grad_norm": 0.11149467527866364,
"learning_rate": 3.9702112682812544e-05,
"loss": 0.07241421341896057,
"step": 1660
},
{
"epoch": 1.7492795389048992,
"grad_norm": 0.08748896420001984,
"learning_rate": 3.914966270818766e-05,
"loss": 0.07336459755897522,
"step": 1670
},
{
"epoch": 1.7597589730154573,
"grad_norm": 0.1172696202993393,
"learning_rate": 3.859859936571307e-05,
"loss": 0.07742337584495544,
"step": 1680
},
{
"epoch": 1.770238407126015,
"grad_norm": 0.0719197615981102,
"learning_rate": 3.8048993079203925e-05,
"loss": 0.06242966651916504,
"step": 1690
},
{
"epoch": 1.7807178412365732,
"grad_norm": 0.12380168586969376,
"learning_rate": 3.750091408626907e-05,
"loss": 0.07270430326461792,
"step": 1700
},
{
"epoch": 1.7911972753471312,
"grad_norm": 0.1587221622467041,
"learning_rate": 3.6954432429335015e-05,
"loss": 0.06409866213798524,
"step": 1710
},
{
"epoch": 1.8016767094576893,
"grad_norm": 0.10983912646770477,
"learning_rate": 3.640961794669482e-05,
"loss": 0.06610031127929687,
"step": 1720
},
{
"epoch": 1.8121561435682474,
"grad_norm": 0.11023026704788208,
"learning_rate": 3.586654026358287e-05,
"loss": 0.06866579055786133,
"step": 1730
},
{
"epoch": 1.8226355776788052,
"grad_norm": 0.11857719719409943,
"learning_rate": 3.532526878327719e-05,
"loss": 0.06734356880187989,
"step": 1740
},
{
"epoch": 1.8331150117893635,
"grad_norm": 0.09280339628458023,
"learning_rate": 3.478587267822987e-05,
"loss": 0.06897796392440796,
"step": 1750
},
{
"epoch": 1.8331150117893635,
"eval_loss": 0.06596127897500992,
"eval_runtime": 35.5001,
"eval_samples_per_second": 8.761,
"eval_steps_per_second": 8.761,
"step": 1750
},
{
"epoch": 1.8435944458999214,
"grad_norm": 0.1175367683172226,
"learning_rate": 3.424842088122716e-05,
"loss": 0.08288194537162781,
"step": 1760
},
{
"epoch": 1.8540738800104795,
"grad_norm": 0.10271462798118591,
"learning_rate": 3.371298207658003e-05,
"loss": 0.05643013119697571,
"step": 1770
},
{
"epoch": 1.8645533141210375,
"grad_norm": 0.11965195834636688,
"learning_rate": 3.3179624691346654e-05,
"loss": 0.07403092980384826,
"step": 1780
},
{
"epoch": 1.8750327482315954,
"grad_norm": 0.09981680661439896,
"learning_rate": 3.2648416886587686e-05,
"loss": 0.07118859887123108,
"step": 1790
},
{
"epoch": 1.8855121823421537,
"grad_norm": 0.07787375897169113,
"learning_rate": 3.2119426548655435e-05,
"loss": 0.07219682335853576,
"step": 1800
},
{
"epoch": 1.8959916164527115,
"grad_norm": 0.1303507387638092,
"learning_rate": 3.1592721280518404e-05,
"loss": 0.07636030912399291,
"step": 1810
},
{
"epoch": 1.9064710505632696,
"grad_norm": 0.09162267297506332,
"learning_rate": 3.106836839312175e-05,
"loss": 0.06230143308639526,
"step": 1820
},
{
"epoch": 1.9169504846738277,
"grad_norm": 0.11375878751277924,
"learning_rate": 3.054643489678526e-05,
"loss": 0.060506826639175414,
"step": 1830
},
{
"epoch": 1.9274299187843855,
"grad_norm": 0.1377716213464737,
"learning_rate": 3.0026987492639668e-05,
"loss": 0.08148540854454041,
"step": 1840
},
{
"epoch": 1.9379093528949438,
"grad_norm": 0.10483554750680923,
"learning_rate": 2.951009256410255e-05,
"loss": 0.07040726542472839,
"step": 1850
},
{
"epoch": 1.9483887870055017,
"grad_norm": 0.08736151456832886,
"learning_rate": 2.8995816168394702e-05,
"loss": 0.04931557774543762,
"step": 1860
},
{
"epoch": 1.9588682211160597,
"grad_norm": 0.11461569368839264,
"learning_rate": 2.848422402809828e-05,
"loss": 0.057559752464294435,
"step": 1870
},
{
"epoch": 1.9693476552266178,
"grad_norm": 0.09060918539762497,
"learning_rate": 2.7975381522757803e-05,
"loss": 0.06379705667495728,
"step": 1880
},
{
"epoch": 1.9798270893371757,
"grad_norm": 0.07104971259832382,
"learning_rate": 2.746935368052477e-05,
"loss": 0.05813115239143372,
"step": 1890
},
{
"epoch": 1.990306523447734,
"grad_norm": 0.10802938044071198,
"learning_rate": 2.696620516984733e-05,
"loss": 0.07732833027839661,
"step": 1900
},
{
"epoch": 2.0,
"grad_norm": 0.16884952783584595,
"learning_rate": 2.6466000291206004e-05,
"loss": 0.06166202425956726,
"step": 1910
},
{
"epoch": 2.010479434110558,
"grad_norm": 0.08582179993391037,
"learning_rate": 2.5968802968896228e-05,
"loss": 0.04766199886798859,
"step": 1920
},
{
"epoch": 2.020958868221116,
"grad_norm": 0.1457364708185196,
"learning_rate": 2.5474676742859048e-05,
"loss": 0.03826354146003723,
"step": 1930
},
{
"epoch": 2.031438302331674,
"grad_norm": 0.09275342524051666,
"learning_rate": 2.4983684760561023e-05,
"loss": 0.045059433579444884,
"step": 1940
},
{
"epoch": 2.0419177364422323,
"grad_norm": 0.09085927903652191,
"learning_rate": 2.44958897689242e-05,
"loss": 0.04904903173446655,
"step": 1950
},
{
"epoch": 2.05239717055279,
"grad_norm": 0.11733179539442062,
"learning_rate": 2.401135410630731e-05,
"loss": 0.05008396506309509,
"step": 1960
},
{
"epoch": 2.062876604663348,
"grad_norm": 0.0894237607717514,
"learning_rate": 2.3530139694539095e-05,
"loss": 0.04057626128196716,
"step": 1970
},
{
"epoch": 2.0733560387739063,
"grad_norm": 0.08560927212238312,
"learning_rate": 2.305230803100496e-05,
"loss": 0.04843136668205261,
"step": 1980
},
{
"epoch": 2.083835472884464,
"grad_norm": 0.07991836220026016,
"learning_rate": 2.257792018078793e-05,
"loss": 0.0544127106666565,
"step": 1990
},
{
"epoch": 2.0943149069950224,
"grad_norm": 0.08846250921487808,
"learning_rate": 2.210703676886461e-05,
"loss": 0.0459000825881958,
"step": 2000
},
{
"epoch": 2.0943149069950224,
"eval_loss": 0.060011014342308044,
"eval_runtime": 36.3755,
"eval_samples_per_second": 8.55,
"eval_steps_per_second": 8.55,
"step": 2000
},
{
"epoch": 2.1047943411055803,
"grad_norm": 0.10082945972681046,
"learning_rate": 2.1639717972357678e-05,
"loss": 0.038090622425079344,
"step": 2010
},
{
"epoch": 2.115273775216138,
"grad_norm": 0.05712248757481575,
"learning_rate": 2.1176023512845376e-05,
"loss": 0.04598597884178161,
"step": 2020
},
{
"epoch": 2.1257532093266964,
"grad_norm": 0.11628362536430359,
"learning_rate": 2.0716012648729353e-05,
"loss": 0.04984880685806274,
"step": 2030
},
{
"epoch": 2.1362326434372543,
"grad_norm": 0.10635484755039215,
"learning_rate": 2.025974416766171e-05,
"loss": 0.04293925166130066,
"step": 2040
},
{
"epoch": 2.1467120775478126,
"grad_norm": 0.1017381027340889,
"learning_rate": 1.9807276379032113e-05,
"loss": 0.04305694401264191,
"step": 2050
},
{
"epoch": 2.1571915116583704,
"grad_norm": 0.13550882041454315,
"learning_rate": 1.9358667106516055e-05,
"loss": 0.04478869140148163,
"step": 2060
},
{
"epoch": 2.1676709457689283,
"grad_norm": 0.08526366949081421,
"learning_rate": 1.8913973680685226e-05,
"loss": 0.036646312475204466,
"step": 2070
},
{
"epoch": 2.1781503798794866,
"grad_norm": 0.10932011157274246,
"learning_rate": 1.8473252931680928e-05,
"loss": 0.042200219631195066,
"step": 2080
},
{
"epoch": 2.1886298139900444,
"grad_norm": 0.08768360316753387,
"learning_rate": 1.803656118195136e-05,
"loss": 0.0437488317489624,
"step": 2090
},
{
"epoch": 2.1991092481006027,
"grad_norm": 0.08362651616334915,
"learning_rate": 1.760395423905379e-05,
"loss": 0.04669668078422547,
"step": 2100
},
{
"epoch": 2.2095886822111606,
"grad_norm": 0.08554034680128098,
"learning_rate": 1.7175487388522588e-05,
"loss": 0.034989356994628906,
"step": 2110
},
{
"epoch": 2.220068116321719,
"grad_norm": 0.08215561509132385,
"learning_rate": 1.6751215386803986e-05,
"loss": 0.040298929810523985,
"step": 2120
},
{
"epoch": 2.2305475504322767,
"grad_norm": 0.0840689167380333,
"learning_rate": 1.6331192454258337e-05,
"loss": 0.041704925894737246,
"step": 2130
},
{
"epoch": 2.2410269845428346,
"grad_norm": 0.06530614197254181,
"learning_rate": 1.5915472268231018e-05,
"loss": 0.03651900887489319,
"step": 2140
},
{
"epoch": 2.251506418653393,
"grad_norm": 0.12431822717189789,
"learning_rate": 1.550410795619261e-05,
"loss": 0.04806804955005646,
"step": 2150
},
{
"epoch": 2.2619858527639507,
"grad_norm": 0.09592410176992416,
"learning_rate": 1.509715208894949e-05,
"loss": 0.0454313725233078,
"step": 2160
},
{
"epoch": 2.2724652868745085,
"grad_norm": 0.07589780539274216,
"learning_rate": 1.469465667392536e-05,
"loss": 0.03574602603912354,
"step": 2170
},
{
"epoch": 2.282944720985067,
"grad_norm": 0.09734483063220978,
"learning_rate": 1.4296673148515038e-05,
"loss": 0.04358702301979065,
"step": 2180
},
{
"epoch": 2.2934241550956247,
"grad_norm": 0.0974339172244072,
"learning_rate": 1.3903252373510838e-05,
"loss": 0.04603351950645447,
"step": 2190
},
{
"epoch": 2.303903589206183,
"grad_norm": 0.09025271981954575,
"learning_rate": 1.3514444626602773e-05,
"loss": 0.040065237879753114,
"step": 2200
},
{
"epoch": 2.314383023316741,
"grad_norm": 0.07625086605548859,
"learning_rate": 1.3130299595953338e-05,
"loss": 0.044061675667762756,
"step": 2210
},
{
"epoch": 2.324862457427299,
"grad_norm": 0.07306221127510071,
"learning_rate": 1.2750866373847465e-05,
"loss": 0.03366467654705048,
"step": 2220
},
{
"epoch": 2.335341891537857,
"grad_norm": 0.08357638120651245,
"learning_rate": 1.2376193450418715e-05,
"loss": 0.041424044966697694,
"step": 2230
},
{
"epoch": 2.345821325648415,
"grad_norm": 0.09153921157121658,
"learning_rate": 1.2006328707452459e-05,
"loss": 0.03938372135162353,
"step": 2240
},
{
"epoch": 2.356300759758973,
"grad_norm": 0.09109660983085632,
"learning_rate": 1.1641319412266765e-05,
"loss": 0.04015985131263733,
"step": 2250
},
{
"epoch": 2.356300759758973,
"eval_loss": 0.05486458167433739,
"eval_runtime": 36.8119,
"eval_samples_per_second": 8.448,
"eval_steps_per_second": 8.448,
"step": 2250
},
{
"epoch": 2.366780193869531,
"grad_norm": 0.052502721548080444,
"learning_rate": 1.1281212211671822e-05,
"loss": 0.0270554780960083,
"step": 2260
},
{
"epoch": 2.377259627980089,
"grad_norm": 0.07931812107563019,
"learning_rate": 1.0926053126008584e-05,
"loss": 0.0417300134897232,
"step": 2270
},
{
"epoch": 2.387739062090647,
"grad_norm": 0.08996254205703735,
"learning_rate": 1.0575887543267609e-05,
"loss": 0.037659955024719236,
"step": 2280
},
{
"epoch": 2.398218496201205,
"grad_norm": 0.08800788223743439,
"learning_rate": 1.023076021328867e-05,
"loss": 0.048437944054603575,
"step": 2290
},
{
"epoch": 2.4086979303117633,
"grad_norm": 0.10572271049022675,
"learning_rate": 9.890715242041787e-06,
"loss": 0.04166909456253052,
"step": 2300
},
{
"epoch": 2.419177364422321,
"grad_norm": 0.10573071986436844,
"learning_rate": 9.555796085990781e-06,
"loss": 0.03919607996940613,
"step": 2310
},
{
"epoch": 2.4296567985328794,
"grad_norm": 0.09714583307504654,
"learning_rate": 9.226045546539608e-06,
"loss": 0.03530588150024414,
"step": 2320
},
{
"epoch": 2.4401362326434373,
"grad_norm": 0.09436199069023132,
"learning_rate": 8.901505764562518e-06,
"loss": 0.05111382007598877,
"step": 2330
},
{
"epoch": 2.450615666753995,
"grad_norm": 0.06353961676359177,
"learning_rate": 8.582218215018656e-06,
"loss": 0.03805697858333588,
"step": 2340
},
{
"epoch": 2.4610951008645534,
"grad_norm": 0.08853815495967865,
"learning_rate": 8.268223701651684e-06,
"loss": 0.04815975427627563,
"step": 2350
},
{
"epoch": 2.4715745349751113,
"grad_norm": 0.07472016662359238,
"learning_rate": 7.959562351775196e-06,
"loss": 0.042247459292411804,
"step": 2360
},
{
"epoch": 2.4820539690856696,
"grad_norm": 0.12121549248695374,
"learning_rate": 7.656273611144632e-06,
"loss": 0.040102115273475646,
"step": 2370
},
{
"epoch": 2.4925334031962274,
"grad_norm": 0.08667747676372528,
"learning_rate": 7.358396238916254e-06,
"loss": 0.03656341433525086,
"step": 2380
},
{
"epoch": 2.5030128373067857,
"grad_norm": 0.1162872165441513,
"learning_rate": 7.065968302693882e-06,
"loss": 0.04052766263484955,
"step": 2390
},
{
"epoch": 2.5134922714173435,
"grad_norm": 0.07924140989780426,
"learning_rate": 6.7790271736639595e-06,
"loss": 0.03394221067428589,
"step": 2400
},
{
"epoch": 2.5239717055279014,
"grad_norm": 0.09523408859968185,
"learning_rate": 6.497609521819681e-06,
"loss": 0.04119439423084259,
"step": 2410
},
{
"epoch": 2.5344511396384597,
"grad_norm": 0.12182598561048508,
"learning_rate": 6.221751311274731e-06,
"loss": 0.05154783725738525,
"step": 2420
},
{
"epoch": 2.5449305737490175,
"grad_norm": 0.09359873831272125,
"learning_rate": 5.951487795667149e-06,
"loss": 0.035483264923095705,
"step": 2430
},
{
"epoch": 2.5554100078595754,
"grad_norm": 0.08514095097780228,
"learning_rate": 5.686853513654117e-06,
"loss": 0.03830339312553406,
"step": 2440
},
{
"epoch": 2.5658894419701337,
"grad_norm": 0.10625084489583969,
"learning_rate": 5.4278822844979705e-06,
"loss": 0.034111028909683226,
"step": 2450
},
{
"epoch": 2.5763688760806915,
"grad_norm": 0.1004003956913948,
"learning_rate": 5.174607203744286e-06,
"loss": 0.04465605318546295,
"step": 2460
},
{
"epoch": 2.58684831019125,
"grad_norm": 0.0962519720196724,
"learning_rate": 4.927060638992382e-06,
"loss": 0.041056016087532045,
"step": 2470
},
{
"epoch": 2.5973277443018077,
"grad_norm": 0.06380607187747955,
"learning_rate": 4.685274225758846e-06,
"loss": 0.03880062401294708,
"step": 2480
},
{
"epoch": 2.607807178412366,
"grad_norm": 0.07326535880565643,
"learning_rate": 4.449278863434647e-06,
"loss": 0.03194461762905121,
"step": 2490
},
{
"epoch": 2.618286612522924,
"grad_norm": 0.12218596786260605,
"learning_rate": 4.2191047113362854e-06,
"loss": 0.04258840978145599,
"step": 2500
},
{
"epoch": 2.618286612522924,
"eval_loss": 0.05223666876554489,
"eval_runtime": 37.7234,
"eval_samples_per_second": 8.244,
"eval_steps_per_second": 8.244,
"step": 2500
},
{
"epoch": 2.6287660466334817,
"grad_norm": 0.08594664931297302,
"learning_rate": 3.994781184851598e-06,
"loss": 0.04302787780761719,
"step": 2510
},
{
"epoch": 2.63924548074404,
"grad_norm": 0.08187596499919891,
"learning_rate": 3.776336951680548e-06,
"loss": 0.0341387003660202,
"step": 2520
},
{
"epoch": 2.649724914854598,
"grad_norm": 0.10216796398162842,
"learning_rate": 3.563799928171596e-06,
"loss": 0.04289879500865936,
"step": 2530
},
{
"epoch": 2.6602043489651557,
"grad_norm": 0.11215174198150635,
"learning_rate": 3.3571972757540814e-06,
"loss": 0.04055049121379852,
"step": 2540
},
{
"epoch": 2.670683783075714,
"grad_norm": 0.07941269129514694,
"learning_rate": 3.156555397467176e-06,
"loss": 0.04118689000606537,
"step": 2550
},
{
"epoch": 2.681163217186272,
"grad_norm": 0.09404437988996506,
"learning_rate": 2.9618999345855547e-06,
"loss": 0.03079705536365509,
"step": 2560
},
{
"epoch": 2.69164265129683,
"grad_norm": 0.1109817698597908,
"learning_rate": 2.773255763342647e-06,
"loss": 0.038885954022407535,
"step": 2570
},
{
"epoch": 2.702122085407388,
"grad_norm": 0.09431962668895721,
"learning_rate": 2.590646991751472e-06,
"loss": 0.043543145060539246,
"step": 2580
},
{
"epoch": 2.7126015195179463,
"grad_norm": 0.08184763044118881,
"learning_rate": 2.414096956523776e-06,
"loss": 0.03256987631320953,
"step": 2590
},
{
"epoch": 2.723080953628504,
"grad_norm": 0.08390141278505325,
"learning_rate": 2.2436282200876458e-06,
"loss": 0.03908055424690247,
"step": 2600
},
{
"epoch": 2.733560387739062,
"grad_norm": 0.0762532502412796,
"learning_rate": 2.07926256770416e-06,
"loss": 0.04899201393127441,
"step": 2610
},
{
"epoch": 2.7440398218496203,
"grad_norm": 0.08239631354808807,
"learning_rate": 1.9210210046832768e-06,
"loss": 0.048707082867622375,
"step": 2620
},
{
"epoch": 2.754519255960178,
"grad_norm": 0.09619107842445374,
"learning_rate": 1.7689237536994364e-06,
"loss": 0.0372231125831604,
"step": 2630
},
{
"epoch": 2.764998690070736,
"grad_norm": 0.07099667191505432,
"learning_rate": 1.6229902522072293e-06,
"loss": 0.03421170711517334,
"step": 2640
},
{
"epoch": 2.7754781241812942,
"grad_norm": 0.10154753923416138,
"learning_rate": 1.4832391499572996e-06,
"loss": 0.03656705319881439,
"step": 2650
},
{
"epoch": 2.785957558291852,
"grad_norm": 0.09349387139081955,
"learning_rate": 1.3496883066130173e-06,
"loss": 0.03710306882858276,
"step": 2660
},
{
"epoch": 2.7964369924024104,
"grad_norm": 0.061091430485248566,
"learning_rate": 1.2223547894680443e-06,
"loss": 0.0308389812707901,
"step": 2670
},
{
"epoch": 2.8069164265129682,
"grad_norm": 0.09838075935840607,
"learning_rate": 1.101254871265256e-06,
"loss": 0.03703555166721344,
"step": 2680
},
{
"epoch": 2.8173958606235265,
"grad_norm": 0.10046928375959396,
"learning_rate": 9.864040281170938e-07,
"loss": 0.04500553905963898,
"step": 2690
},
{
"epoch": 2.8278752947340844,
"grad_norm": 0.06770773977041245,
"learning_rate": 8.778169375277978e-07,
"loss": 0.03823737502098083,
"step": 2700
},
{
"epoch": 2.8383547288446422,
"grad_norm": 0.08373535424470901,
"learning_rate": 7.755074765176618e-07,
"loss": 0.03961678743362427,
"step": 2710
},
{
"epoch": 2.8488341629552005,
"grad_norm": 0.07590050995349884,
"learning_rate": 6.794887198496413e-07,
"loss": 0.03221273124217987,
"step": 2720
},
{
"epoch": 2.8593135970657584,
"grad_norm": 0.08507678657770157,
"learning_rate": 5.897729383583906e-07,
"loss": 0.04571912884712219,
"step": 2730
},
{
"epoch": 2.8697930311763162,
"grad_norm": 0.06584763526916504,
"learning_rate": 5.063715973821659e-07,
"loss": 0.03794914484024048,
"step": 2740
},
{
"epoch": 2.8802724652868745,
"grad_norm": 0.07312892377376556,
"learning_rate": 4.292953552975154e-07,
"loss": 0.036365586519241336,
"step": 2750
},
{
"epoch": 2.8802724652868745,
"eval_loss": 0.05090421438217163,
"eval_runtime": 85.293,
"eval_samples_per_second": 3.646,
"eval_steps_per_second": 3.646,
"step": 2750
},
{
"epoch": 2.8907518993974324,
"grad_norm": 0.08459606021642685,
"learning_rate": 3.5855406215725697e-07,
"loss": 0.03068857192993164,
"step": 2760
},
{
"epoch": 2.9012313335079907,
"grad_norm": 0.06866376101970673,
"learning_rate": 2.9415675843163515e-07,
"loss": 0.03265829384326935,
"step": 2770
},
{
"epoch": 2.9117107676185485,
"grad_norm": 0.09082643687725067,
"learning_rate": 2.361116738529956e-07,
"loss": 0.03418546915054321,
"step": 2780
},
{
"epoch": 2.922190201729107,
"grad_norm": 0.10772739350795746,
"learning_rate": 1.8442622636404284e-07,
"loss": 0.03810786008834839,
"step": 2790
},
{
"epoch": 2.9326696358396647,
"grad_norm": 0.08321297913789749,
"learning_rate": 1.391070211698764e-07,
"loss": 0.04068491756916046,
"step": 2800
},
{
"epoch": 2.9431490699502225,
"grad_norm": 0.11239277571439743,
"learning_rate": 1.0015984989385496e-07,
"loss": 0.041029155254364014,
"step": 2810
},
{
"epoch": 2.953628504060781,
"grad_norm": 0.07199843227863312,
"learning_rate": 6.758968983747171e-08,
"loss": 0.037902483344078065,
"step": 2820
},
{
"epoch": 2.9641079381713387,
"grad_norm": 0.08249279856681824,
"learning_rate": 4.140070334422985e-08,
"loss": 0.03996126651763916,
"step": 2830
},
{
"epoch": 2.9745873722818965,
"grad_norm": 0.0852220207452774,
"learning_rate": 2.1596237267751396e-08,
"loss": 0.04228667616844177,
"step": 2840
},
{
"epoch": 2.985066806392455,
"grad_norm": 0.0858582928776741,
"learning_rate": 8.178822544052666e-09,
"loss": 0.03813594281673431,
"step": 2850
},
{
"epoch": 2.995546240503013,
"grad_norm": 0.06642451137304306,
"learning_rate": 1.1501738680919084e-09,
"loss": 0.033472076058387756,
"step": 2860
}
],
"logging_steps": 10,
"max_steps": 2865,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.031737271887514e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}