{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 250, "global_step": 2865, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010479434110558029, "grad_norm": 0.19915591180324554, "learning_rate": 1.0465116279069768e-05, "loss": 1.1350045204162598, "step": 10 }, { "epoch": 0.020958868221116058, "grad_norm": 0.18158815801143646, "learning_rate": 2.2093023255813955e-05, "loss": 1.0580164909362793, "step": 20 }, { "epoch": 0.03143830233167409, "grad_norm": 0.16481591761112213, "learning_rate": 3.372093023255814e-05, "loss": 0.9252842903137207, "step": 30 }, { "epoch": 0.041917736442232116, "grad_norm": 0.15599584579467773, "learning_rate": 4.5348837209302326e-05, "loss": 0.8342072486877441, "step": 40 }, { "epoch": 0.05239717055279015, "grad_norm": 0.1804327368736267, "learning_rate": 5.697674418604652e-05, "loss": 0.7955524921417236, "step": 50 }, { "epoch": 0.06287660466334818, "grad_norm": 0.16934047639369965, "learning_rate": 6.86046511627907e-05, "loss": 0.7358035087585449, "step": 60 }, { "epoch": 0.07335603877390622, "grad_norm": 0.2234930843114853, "learning_rate": 8.023255813953489e-05, "loss": 0.6985861301422119, "step": 70 }, { "epoch": 0.08383547288446423, "grad_norm": 0.16290400922298431, "learning_rate": 9.186046511627907e-05, "loss": 0.599607515335083, "step": 80 }, { "epoch": 0.09431490699502226, "grad_norm": 0.1660464107990265, "learning_rate": 9.999971245570617e-05, "loss": 0.5886398315429687, "step": 90 }, { "epoch": 0.1047943411055803, "grad_norm": 0.16978025436401367, "learning_rate": 9.999460064915317e-05, "loss": 0.5450529098510742, "step": 100 }, { "epoch": 0.11527377521613832, "grad_norm": 0.21447990834712982, "learning_rate": 9.998309972134645e-05, "loss": 0.5072262287139893, "step": 110 }, { "epoch": 0.12575320932669637, "grad_norm": 0.17418669164180756, "learning_rate": 9.996521114206116e-05, "loss": 0.49445347785949706, "step": 120 }, { "epoch": 0.13623264343725439, "grad_norm": 0.22226351499557495, "learning_rate": 9.994093719739023e-05, "loss": 0.47142682075500486, "step": 130 }, { "epoch": 0.14671207754781243, "grad_norm": 0.1745530068874359, "learning_rate": 9.991028098945215e-05, "loss": 0.46663532257080076, "step": 140 }, { "epoch": 0.15719151165837045, "grad_norm": 0.17074695229530334, "learning_rate": 9.987324643599459e-05, "loss": 0.4508847236633301, "step": 150 }, { "epoch": 0.16767094576892846, "grad_norm": 0.13428406417369843, "learning_rate": 9.982983826989367e-05, "loss": 0.40740265846252444, "step": 160 }, { "epoch": 0.1781503798794865, "grad_norm": 0.17766578495502472, "learning_rate": 9.978006203854918e-05, "loss": 0.3998516321182251, "step": 170 }, { "epoch": 0.18862981399004453, "grad_norm": 0.1672629565000534, "learning_rate": 9.972392410317562e-05, "loss": 0.41658673286437986, "step": 180 }, { "epoch": 0.19910924810060257, "grad_norm": 0.1333673745393753, "learning_rate": 9.96614316379892e-05, "loss": 0.37024455070495604, "step": 190 }, { "epoch": 0.2095886822111606, "grad_norm": 0.18037110567092896, "learning_rate": 9.959259262929113e-05, "loss": 0.35086841583251954, "step": 200 }, { "epoch": 0.22006811632171863, "grad_norm": 0.14616410434246063, "learning_rate": 9.951741587444683e-05, "loss": 0.37918968200683595, "step": 210 }, { "epoch": 0.23054755043227665, "grad_norm": 0.14523574709892273, "learning_rate": 9.943591098076184e-05, "loss": 0.32804527282714846, "step": 220 }, { "epoch": 0.2410269845428347, "grad_norm": 0.14667049050331116, "learning_rate": 9.934808836425393e-05, "loss": 0.3480507850646973, "step": 230 }, { "epoch": 0.25150641865339274, "grad_norm": 0.18156558275222778, "learning_rate": 9.925395924832198e-05, "loss": 0.3300448179244995, "step": 240 }, { "epoch": 0.26198585276395076, "grad_norm": 0.13806430995464325, "learning_rate": 9.91535356623117e-05, "loss": 0.3127591609954834, "step": 250 }, { "epoch": 0.26198585276395076, "eval_loss": 0.3132782578468323, "eval_runtime": 94.8848, "eval_samples_per_second": 3.278, "eval_steps_per_second": 3.278, "step": 250 }, { "epoch": 0.27246528687450877, "grad_norm": 0.17205959558486938, "learning_rate": 9.904683043997835e-05, "loss": 0.3306673288345337, "step": 260 }, { "epoch": 0.2829447209850668, "grad_norm": 0.12620031833648682, "learning_rate": 9.893385721784656e-05, "loss": 0.3011106729507446, "step": 270 }, { "epoch": 0.29342415509562486, "grad_norm": 0.11466006934642792, "learning_rate": 9.881463043346768e-05, "loss": 0.2951968669891357, "step": 280 }, { "epoch": 0.3039035892061829, "grad_norm": 0.1671207845211029, "learning_rate": 9.868916532357475e-05, "loss": 0.2910990953445435, "step": 290 }, { "epoch": 0.3143830233167409, "grad_norm": 0.1683349907398224, "learning_rate": 9.855747792213521e-05, "loss": 0.31409192085266113, "step": 300 }, { "epoch": 0.3248624574272989, "grad_norm": 0.12934699654579163, "learning_rate": 9.84195850583019e-05, "loss": 0.27755858898162844, "step": 310 }, { "epoch": 0.33534189153785693, "grad_norm": 0.13784605264663696, "learning_rate": 9.827550435426234e-05, "loss": 0.2809821605682373, "step": 320 }, { "epoch": 0.345821325648415, "grad_norm": 0.18590271472930908, "learning_rate": 9.812525422298664e-05, "loss": 0.28698866367340087, "step": 330 }, { "epoch": 0.356300759758973, "grad_norm": 0.1704522967338562, "learning_rate": 9.796885386587447e-05, "loss": 0.250814414024353, "step": 340 }, { "epoch": 0.36678019386953103, "grad_norm": 0.1316167265176773, "learning_rate": 9.780632327030112e-05, "loss": 0.25458922386169436, "step": 350 }, { "epoch": 0.37725962798008905, "grad_norm": 0.16226200759410858, "learning_rate": 9.763768320706319e-05, "loss": 0.26563262939453125, "step": 360 }, { "epoch": 0.3877390620906471, "grad_norm": 0.1297195851802826, "learning_rate": 9.746295522772424e-05, "loss": 0.2632328748703003, "step": 370 }, { "epoch": 0.39821849620120514, "grad_norm": 0.1286139190196991, "learning_rate": 9.728216166186049e-05, "loss": 0.2624588251113892, "step": 380 }, { "epoch": 0.40869793031176316, "grad_norm": 0.1587965339422226, "learning_rate": 9.709532561420725e-05, "loss": 0.24741590023040771, "step": 390 }, { "epoch": 0.4191773644223212, "grad_norm": 0.11963177472352982, "learning_rate": 9.690247096170615e-05, "loss": 0.22777397632598878, "step": 400 }, { "epoch": 0.42965679853287925, "grad_norm": 0.13638927042484283, "learning_rate": 9.670362235045387e-05, "loss": 0.23324952125549317, "step": 410 }, { "epoch": 0.44013623264343726, "grad_norm": 0.1514088362455368, "learning_rate": 9.649880519255232e-05, "loss": 0.2505915880203247, "step": 420 }, { "epoch": 0.4506156667539953, "grad_norm": 0.10994207113981247, "learning_rate": 9.62880456628612e-05, "loss": 0.2078850269317627, "step": 430 }, { "epoch": 0.4610951008645533, "grad_norm": 0.11983369290828705, "learning_rate": 9.607137069565288e-05, "loss": 0.21452484130859376, "step": 440 }, { "epoch": 0.47157453497511137, "grad_norm": 0.12684305012226105, "learning_rate": 9.58488079811703e-05, "loss": 0.22002685070037842, "step": 450 }, { "epoch": 0.4820539690856694, "grad_norm": 0.16841623187065125, "learning_rate": 9.562038596208828e-05, "loss": 0.21405396461486817, "step": 460 }, { "epoch": 0.4925334031962274, "grad_norm": 0.1498555839061737, "learning_rate": 9.538613382987865e-05, "loss": 0.20534911155700683, "step": 470 }, { "epoch": 0.5030128373067855, "grad_norm": 0.13913628458976746, "learning_rate": 9.514608152107974e-05, "loss": 0.22248730659484864, "step": 480 }, { "epoch": 0.5134922714173434, "grad_norm": 0.14408951997756958, "learning_rate": 9.490025971347047e-05, "loss": 0.214866042137146, "step": 490 }, { "epoch": 0.5239717055279015, "grad_norm": 0.1649770438671112, "learning_rate": 9.464869982215001e-05, "loss": 0.19965900182724, "step": 500 }, { "epoch": 0.5239717055279015, "eval_loss": 0.19267401099205017, "eval_runtime": 95.3374, "eval_samples_per_second": 3.262, "eval_steps_per_second": 3.262, "step": 500 }, { "epoch": 0.5344511396384595, "grad_norm": 0.1305568665266037, "learning_rate": 9.439143399552291e-05, "loss": 0.21112546920776368, "step": 510 }, { "epoch": 0.5449305737490175, "grad_norm": 0.11998175084590912, "learning_rate": 9.412849511119074e-05, "loss": 0.21422922611236572, "step": 520 }, { "epoch": 0.5554100078595756, "grad_norm": 0.15220341086387634, "learning_rate": 9.385991677175046e-05, "loss": 0.20999882221221924, "step": 530 }, { "epoch": 0.5658894419701336, "grad_norm": 0.13170023262500763, "learning_rate": 9.358573330050004e-05, "loss": 0.20208392143249512, "step": 540 }, { "epoch": 0.5763688760806917, "grad_norm": 0.10457764565944672, "learning_rate": 9.330597973705219e-05, "loss": 0.1908803701400757, "step": 550 }, { "epoch": 0.5868483101912497, "grad_norm": 0.12568537890911102, "learning_rate": 9.302069183285637e-05, "loss": 0.19316340684890748, "step": 560 }, { "epoch": 0.5973277443018077, "grad_norm": 0.14824528992176056, "learning_rate": 9.272990604662988e-05, "loss": 0.18987581729888917, "step": 570 }, { "epoch": 0.6078071784123658, "grad_norm": 0.14521734416484833, "learning_rate": 9.243365953969861e-05, "loss": 0.19232832193374633, "step": 580 }, { "epoch": 0.6182866125229237, "grad_norm": 0.1335408091545105, "learning_rate": 9.213199017124793e-05, "loss": 0.1758212924003601, "step": 590 }, { "epoch": 0.6287660466334818, "grad_norm": 0.11143071949481964, "learning_rate": 9.182493649348447e-05, "loss": 0.19117680788040162, "step": 600 }, { "epoch": 0.6392454807440399, "grad_norm": 0.14789296686649323, "learning_rate": 9.151253774670921e-05, "loss": 0.184559965133667, "step": 610 }, { "epoch": 0.6497249148545978, "grad_norm": 0.10541336238384247, "learning_rate": 9.119483385430283e-05, "loss": 0.1720304846763611, "step": 620 }, { "epoch": 0.6602043489651559, "grad_norm": 0.12105975300073624, "learning_rate": 9.087186541762358e-05, "loss": 0.17654836177825928, "step": 630 }, { "epoch": 0.6706837830757139, "grad_norm": 0.13114669919013977, "learning_rate": 9.054367371081858e-05, "loss": 0.1696592688560486, "step": 640 }, { "epoch": 0.6811632171862719, "grad_norm": 0.13745592534542084, "learning_rate": 9.021030067554919e-05, "loss": 0.15404462814331055, "step": 650 }, { "epoch": 0.69164265129683, "grad_norm": 0.15927442908287048, "learning_rate": 8.987178891563094e-05, "loss": 0.17024366855621337, "step": 660 }, { "epoch": 0.702122085407388, "grad_norm": 0.13737429678440094, "learning_rate": 8.952818169158903e-05, "loss": 0.1602048397064209, "step": 670 }, { "epoch": 0.712601519517946, "grad_norm": 0.13941751420497894, "learning_rate": 8.91795229151297e-05, "loss": 0.18057082891464232, "step": 680 }, { "epoch": 0.7230809536285041, "grad_norm": 0.14242954552173615, "learning_rate": 8.882585714352856e-05, "loss": 0.14863334894180297, "step": 690 }, { "epoch": 0.7335603877390621, "grad_norm": 0.15553542971611023, "learning_rate": 8.846722957393626e-05, "loss": 0.15701137781143187, "step": 700 }, { "epoch": 0.7440398218496201, "grad_norm": 0.12901411950588226, "learning_rate": 8.810368603760249e-05, "loss": 0.15571318864822387, "step": 710 }, { "epoch": 0.7545192559601781, "grad_norm": 0.13449430465698242, "learning_rate": 8.773527299401902e-05, "loss": 0.16418551206588744, "step": 720 }, { "epoch": 0.7649986900707362, "grad_norm": 0.10630270838737488, "learning_rate": 8.736203752498218e-05, "loss": 0.16800801753997802, "step": 730 }, { "epoch": 0.7754781241812942, "grad_norm": 0.11299935728311539, "learning_rate": 8.698402732857611e-05, "loss": 0.15700833797454833, "step": 740 }, { "epoch": 0.7859575582918522, "grad_norm": 0.11920930445194244, "learning_rate": 8.660129071307707e-05, "loss": 0.15091001987457275, "step": 750 }, { "epoch": 0.7859575582918522, "eval_loss": 0.1356429010629654, "eval_runtime": 94.0557, "eval_samples_per_second": 3.307, "eval_steps_per_second": 3.307, "step": 750 }, { "epoch": 0.7964369924024103, "grad_norm": 0.13870343565940857, "learning_rate": 8.621387659077986e-05, "loss": 0.1422027826309204, "step": 760 }, { "epoch": 0.8069164265129684, "grad_norm": 0.12753477692604065, "learning_rate": 8.582183447174697e-05, "loss": 0.142450213432312, "step": 770 }, { "epoch": 0.8173958606235263, "grad_norm": 0.11877496540546417, "learning_rate": 8.542521445748141e-05, "loss": 0.15361062288284302, "step": 780 }, { "epoch": 0.8278752947340844, "grad_norm": 0.1200249195098877, "learning_rate": 8.502406723452392e-05, "loss": 0.14647477865219116, "step": 790 }, { "epoch": 0.8383547288446423, "grad_norm": 0.12913794815540314, "learning_rate": 8.461844406797543e-05, "loss": 0.1591552734375, "step": 800 }, { "epoch": 0.8488341629552004, "grad_norm": 0.17270176112651825, "learning_rate": 8.420839679494558e-05, "loss": 0.1495436668395996, "step": 810 }, { "epoch": 0.8593135970657585, "grad_norm": 0.15545596182346344, "learning_rate": 8.379397781792808e-05, "loss": 0.15377395153045653, "step": 820 }, { "epoch": 0.8697930311763165, "grad_norm": 0.12941111624240875, "learning_rate": 8.337524009810395e-05, "loss": 0.14733861684799193, "step": 830 }, { "epoch": 0.8802724652868745, "grad_norm": 0.13152749836444855, "learning_rate": 8.295223714857319e-05, "loss": 0.13980752229690552, "step": 840 }, { "epoch": 0.8907518993974325, "grad_norm": 0.11208872497081757, "learning_rate": 8.252502302751612e-05, "loss": 0.12019969224929809, "step": 850 }, { "epoch": 0.9012313335079906, "grad_norm": 0.11118603497743607, "learning_rate": 8.209365233128482e-05, "loss": 0.13822466135025024, "step": 860 }, { "epoch": 0.9117107676185486, "grad_norm": 0.11705653369426727, "learning_rate": 8.165818018742605e-05, "loss": 0.1439664840698242, "step": 870 }, { "epoch": 0.9221902017291066, "grad_norm": 0.08817730098962784, "learning_rate": 8.121866224763606e-05, "loss": 0.13380355834960939, "step": 880 }, { "epoch": 0.9326696358396647, "grad_norm": 0.1092257872223854, "learning_rate": 8.077515468064851e-05, "loss": 0.12982802391052245, "step": 890 }, { "epoch": 0.9431490699502227, "grad_norm": 0.12680962681770325, "learning_rate": 8.032771416505647e-05, "loss": 0.1489071011543274, "step": 900 }, { "epoch": 0.9536285040607807, "grad_norm": 0.11953219771385193, "learning_rate": 7.987639788206888e-05, "loss": 0.14020267724990845, "step": 910 }, { "epoch": 0.9641079381713388, "grad_norm": 0.1041467934846878, "learning_rate": 7.942126350820318e-05, "loss": 0.1439213275909424, "step": 920 }, { "epoch": 0.9745873722818967, "grad_norm": 0.1277916431427002, "learning_rate": 7.896236920791442e-05, "loss": 0.1468779683113098, "step": 930 }, { "epoch": 0.9850668063924548, "grad_norm": 0.11245205253362656, "learning_rate": 7.849977362616201e-05, "loss": 0.12012372016906739, "step": 940 }, { "epoch": 0.9955462405030129, "grad_norm": 0.12230483442544937, "learning_rate": 7.803353588091522e-05, "loss": 0.1488939881324768, "step": 950 }, { "epoch": 1.005239717055279, "grad_norm": 0.14185865223407745, "learning_rate": 7.7563715555598e-05, "loss": 0.11488113403320313, "step": 960 }, { "epoch": 1.015719151165837, "grad_norm": 0.10545773804187775, "learning_rate": 7.709037269147459e-05, "loss": 0.10712549686431885, "step": 970 }, { "epoch": 1.026198585276395, "grad_norm": 0.10376274585723877, "learning_rate": 7.661356777997631e-05, "loss": 0.11428828239440918, "step": 980 }, { "epoch": 1.0366780193869531, "grad_norm": 0.09950564056634903, "learning_rate": 7.613336175497111e-05, "loss": 0.09823058247566223, "step": 990 }, { "epoch": 1.0471574534975112, "grad_norm": 0.10412753373384476, "learning_rate": 7.564981598497643e-05, "loss": 0.1106558084487915, "step": 1000 }, { "epoch": 1.0471574534975112, "eval_loss": 0.11185819655656815, "eval_runtime": 93.808, "eval_samples_per_second": 3.315, "eval_steps_per_second": 3.315, "step": 1000 }, { "epoch": 1.057636887608069, "grad_norm": 0.10430868715047836, "learning_rate": 7.516299226531645e-05, "loss": 0.11168640851974487, "step": 1010 }, { "epoch": 1.0681163217186271, "grad_norm": 0.09646806865930557, "learning_rate": 7.467295281022501e-05, "loss": 0.10711305141448975, "step": 1020 }, { "epoch": 1.0785957558291852, "grad_norm": 0.13060614466667175, "learning_rate": 7.417976024489474e-05, "loss": 0.10001810789108276, "step": 1030 }, { "epoch": 1.0890751899397433, "grad_norm": 0.10389085114002228, "learning_rate": 7.368347759747393e-05, "loss": 0.11893858909606933, "step": 1040 }, { "epoch": 1.0995546240503014, "grad_norm": 0.11291550099849701, "learning_rate": 7.318416829101164e-05, "loss": 0.1079628586769104, "step": 1050 }, { "epoch": 1.1100340581608594, "grad_norm": 0.10372598469257355, "learning_rate": 7.268189613535255e-05, "loss": 0.10332397222518921, "step": 1060 }, { "epoch": 1.1205134922714173, "grad_norm": 0.12971536815166473, "learning_rate": 7.217672531898225e-05, "loss": 0.10804877281188965, "step": 1070 }, { "epoch": 1.1309929263819753, "grad_norm": 0.10902425646781921, "learning_rate": 7.166872040082431e-05, "loss": 0.09947454929351807, "step": 1080 }, { "epoch": 1.1414723604925334, "grad_norm": 0.09305932372808456, "learning_rate": 7.11579463019897e-05, "loss": 0.09406971335411071, "step": 1090 }, { "epoch": 1.1519517946030915, "grad_norm": 0.11485275626182556, "learning_rate": 7.064446829748034e-05, "loss": 0.09943979978561401, "step": 1100 }, { "epoch": 1.1624312287136496, "grad_norm": 0.09556467831134796, "learning_rate": 7.0128352007847e-05, "loss": 0.10862170457839966, "step": 1110 }, { "epoch": 1.1729106628242074, "grad_norm": 0.11937833577394485, "learning_rate": 6.96096633908034e-05, "loss": 0.10385221242904663, "step": 1120 }, { "epoch": 1.1833900969347655, "grad_norm": 0.11560507863759995, "learning_rate": 6.908846873279691e-05, "loss": 0.09252402186393738, "step": 1130 }, { "epoch": 1.1938695310453236, "grad_norm": 0.11119654029607773, "learning_rate": 6.856483464053758e-05, "loss": 0.09637172818183899, "step": 1140 }, { "epoch": 1.2043489651558816, "grad_norm": 0.11722644418478012, "learning_rate": 6.803882803248585e-05, "loss": 0.09078751802444458, "step": 1150 }, { "epoch": 1.2148283992664397, "grad_norm": 0.10487739741802216, "learning_rate": 6.751051613030082e-05, "loss": 0.10334972143173218, "step": 1160 }, { "epoch": 1.2253078333769976, "grad_norm": 0.10202383995056152, "learning_rate": 6.697996645024937e-05, "loss": 0.08661433458328247, "step": 1170 }, { "epoch": 1.2357872674875556, "grad_norm": 0.11801143735647202, "learning_rate": 6.644724679457804e-05, "loss": 0.0997927188873291, "step": 1180 }, { "epoch": 1.2462667015981137, "grad_norm": 0.10949107259511948, "learning_rate": 6.591242524284802e-05, "loss": 0.0977592945098877, "step": 1190 }, { "epoch": 1.2567461357086718, "grad_norm": 0.10221222043037415, "learning_rate": 6.537557014323487e-05, "loss": 0.0970361053943634, "step": 1200 }, { "epoch": 1.2672255698192298, "grad_norm": 0.10554748773574829, "learning_rate": 6.483675010379393e-05, "loss": 0.09007551074028015, "step": 1210 }, { "epoch": 1.2777050039297877, "grad_norm": 0.11625627428293228, "learning_rate": 6.429603398369242e-05, "loss": 0.08734490275382996, "step": 1220 }, { "epoch": 1.2881844380403458, "grad_norm": 0.10624277591705322, "learning_rate": 6.37534908844095e-05, "loss": 0.09858485460281372, "step": 1230 }, { "epoch": 1.2986638721509038, "grad_norm": 0.10184557735919952, "learning_rate": 6.320919014090534e-05, "loss": 0.09335023164749146, "step": 1240 }, { "epoch": 1.309143306261462, "grad_norm": 0.10787283629179001, "learning_rate": 6.266320131276051e-05, "loss": 0.08665563464164734, "step": 1250 }, { "epoch": 1.309143306261462, "eval_loss": 0.08951585739850998, "eval_runtime": 94.0567, "eval_samples_per_second": 3.307, "eval_steps_per_second": 3.307, "step": 1250 }, { "epoch": 1.31962274037202, "grad_norm": 0.10836981981992722, "learning_rate": 6.211559417528631e-05, "loss": 0.0933380126953125, "step": 1260 }, { "epoch": 1.3301021744825778, "grad_norm": 0.1397171914577484, "learning_rate": 6.156643871060795e-05, "loss": 0.09835371971130372, "step": 1270 }, { "epoch": 1.340581608593136, "grad_norm": 0.11242218315601349, "learning_rate": 6.101580509872097e-05, "loss": 0.09398673176765442, "step": 1280 }, { "epoch": 1.351061042703694, "grad_norm": 0.10235017538070679, "learning_rate": 6.0463763708522536e-05, "loss": 0.10350929498672486, "step": 1290 }, { "epoch": 1.361540476814252, "grad_norm": 0.09327106177806854, "learning_rate": 5.99103850888186e-05, "loss": 0.09580238461494446, "step": 1300 }, { "epoch": 1.3720199109248101, "grad_norm": 0.12995658814907074, "learning_rate": 5.9355739959307976e-05, "loss": 0.08437412977218628, "step": 1310 }, { "epoch": 1.382499345035368, "grad_norm": 0.11962983757257462, "learning_rate": 5.879989920154466e-05, "loss": 0.08409937620162963, "step": 1320 }, { "epoch": 1.392978779145926, "grad_norm": 0.09431737661361694, "learning_rate": 5.824293384987941e-05, "loss": 0.09504773020744324, "step": 1330 }, { "epoch": 1.4034582132564841, "grad_norm": 0.13824374973773956, "learning_rate": 5.768491508238188e-05, "loss": 0.09193333983421326, "step": 1340 }, { "epoch": 1.4139376473670422, "grad_norm": 0.10595858097076416, "learning_rate": 5.712591421174422e-05, "loss": 0.08976472616195678, "step": 1350 }, { "epoch": 1.4244170814776003, "grad_norm": 0.09911809861660004, "learning_rate": 5.6566002676167725e-05, "loss": 0.07597061395645141, "step": 1360 }, { "epoch": 1.4348965155881581, "grad_norm": 0.09723466634750366, "learning_rate": 5.60052520302332e-05, "loss": 0.10513757467269898, "step": 1370 }, { "epoch": 1.4453759496987162, "grad_norm": 0.11331687867641449, "learning_rate": 5.5443733935756615e-05, "loss": 0.09019948840141297, "step": 1380 }, { "epoch": 1.4558553838092743, "grad_norm": 0.13363589346408844, "learning_rate": 5.4881520152630886e-05, "loss": 0.08314153552055359, "step": 1390 }, { "epoch": 1.4663348179198323, "grad_norm": 0.14111892879009247, "learning_rate": 5.4318682529655404e-05, "loss": 0.07892010807991028, "step": 1400 }, { "epoch": 1.4768142520303904, "grad_norm": 0.13948485255241394, "learning_rate": 5.3755292995353913e-05, "loss": 0.0840128481388092, "step": 1410 }, { "epoch": 1.4872936861409483, "grad_norm": 0.12535949051380157, "learning_rate": 5.31914235487823e-05, "loss": 0.07869629859924317, "step": 1420 }, { "epoch": 1.4977731202515066, "grad_norm": 0.10041694343090057, "learning_rate": 5.2627146250327484e-05, "loss": 0.08074848055839538, "step": 1430 }, { "epoch": 1.5082525543620644, "grad_norm": 0.10112891346216202, "learning_rate": 5.2062533212498275e-05, "loss": 0.0860810935497284, "step": 1440 }, { "epoch": 1.5187319884726225, "grad_norm": 0.11297477036714554, "learning_rate": 5.149765659070973e-05, "loss": 0.08794642686843872, "step": 1450 }, { "epoch": 1.5292114225831805, "grad_norm": 0.10511091351509094, "learning_rate": 5.0932588574061945e-05, "loss": 0.07854819297790527, "step": 1460 }, { "epoch": 1.5396908566937384, "grad_norm": 0.09333530068397522, "learning_rate": 5.036740137611453e-05, "loss": 0.08821435570716858, "step": 1470 }, { "epoch": 1.5501702908042967, "grad_norm": 0.11480343341827393, "learning_rate": 4.980216722565804e-05, "loss": 0.08062278628349304, "step": 1480 }, { "epoch": 1.5606497249148545, "grad_norm": 0.08406255394220352, "learning_rate": 4.923695835748338e-05, "loss": 0.0940588355064392, "step": 1490 }, { "epoch": 1.5711291590254126, "grad_norm": 0.12927693128585815, "learning_rate": 4.8671847003150447e-05, "loss": 0.0775177538394928, "step": 1500 }, { "epoch": 1.5711291590254126, "eval_loss": 0.07877222448587418, "eval_runtime": 34.4389, "eval_samples_per_second": 9.03, "eval_steps_per_second": 9.03, "step": 1500 }, { "epoch": 1.5816085931359707, "grad_norm": 0.1255076378583908, "learning_rate": 4.810690538175728e-05, "loss": 0.09362970590591431, "step": 1510 }, { "epoch": 1.5920880272465285, "grad_norm": 0.1326853185892105, "learning_rate": 4.754220569071068e-05, "loss": 0.08364834189414978, "step": 1520 }, { "epoch": 1.6025674613570868, "grad_norm": 0.10229979455471039, "learning_rate": 4.697782009649962e-05, "loss": 0.0725843846797943, "step": 1530 }, { "epoch": 1.6130468954676447, "grad_norm": 0.11407258361577988, "learning_rate": 4.641382072547272e-05, "loss": 0.07566151022911072, "step": 1540 }, { "epoch": 1.6235263295782028, "grad_norm": 0.09398165345191956, "learning_rate": 4.585027965462075e-05, "loss": 0.087736576795578, "step": 1550 }, { "epoch": 1.6340057636887608, "grad_norm": 0.11289424449205399, "learning_rate": 4.528726890236544e-05, "loss": 0.08366051316261292, "step": 1560 }, { "epoch": 1.6444851977993187, "grad_norm": 0.09478718787431717, "learning_rate": 4.4724860419355746e-05, "loss": 0.0885531723499298, "step": 1570 }, { "epoch": 1.654964631909877, "grad_norm": 0.09163404256105423, "learning_rate": 4.416312607927295e-05, "loss": 0.08392030596733094, "step": 1580 }, { "epoch": 1.6654440660204348, "grad_norm": 0.11422222852706909, "learning_rate": 4.360213766964542e-05, "loss": 0.08059985041618348, "step": 1590 }, { "epoch": 1.675923500130993, "grad_norm": 0.08131479471921921, "learning_rate": 4.304196688267438e-05, "loss": 0.07613803148269653, "step": 1600 }, { "epoch": 1.686402934241551, "grad_norm": 0.09615079313516617, "learning_rate": 4.248268530607199e-05, "loss": 0.07764078378677368, "step": 1610 }, { "epoch": 1.696882368352109, "grad_norm": 0.09730526059865952, "learning_rate": 4.192436441391271e-05, "loss": 0.07644452452659607, "step": 1620 }, { "epoch": 1.707361802462667, "grad_norm": 0.09649327397346497, "learning_rate": 4.136707555749907e-05, "loss": 0.07866159081459045, "step": 1630 }, { "epoch": 1.717841236573225, "grad_norm": 0.11804413050413132, "learning_rate": 4.0810889956243415e-05, "loss": 0.06996130347251892, "step": 1640 }, { "epoch": 1.728320670683783, "grad_norm": 0.09874672442674637, "learning_rate": 4.025587868856622e-05, "loss": 0.07877404093742371, "step": 1650 }, { "epoch": 1.738800104794341, "grad_norm": 0.11149467527866364, "learning_rate": 3.9702112682812544e-05, "loss": 0.07241421341896057, "step": 1660 }, { "epoch": 1.7492795389048992, "grad_norm": 0.08748896420001984, "learning_rate": 3.914966270818766e-05, "loss": 0.07336459755897522, "step": 1670 }, { "epoch": 1.7597589730154573, "grad_norm": 0.1172696202993393, "learning_rate": 3.859859936571307e-05, "loss": 0.07742337584495544, "step": 1680 }, { "epoch": 1.770238407126015, "grad_norm": 0.0719197615981102, "learning_rate": 3.8048993079203925e-05, "loss": 0.06242966651916504, "step": 1690 }, { "epoch": 1.7807178412365732, "grad_norm": 0.12380168586969376, "learning_rate": 3.750091408626907e-05, "loss": 0.07270430326461792, "step": 1700 }, { "epoch": 1.7911972753471312, "grad_norm": 0.1587221622467041, "learning_rate": 3.6954432429335015e-05, "loss": 0.06409866213798524, "step": 1710 }, { "epoch": 1.8016767094576893, "grad_norm": 0.10983912646770477, "learning_rate": 3.640961794669482e-05, "loss": 0.06610031127929687, "step": 1720 }, { "epoch": 1.8121561435682474, "grad_norm": 0.11023026704788208, "learning_rate": 3.586654026358287e-05, "loss": 0.06866579055786133, "step": 1730 }, { "epoch": 1.8226355776788052, "grad_norm": 0.11857719719409943, "learning_rate": 3.532526878327719e-05, "loss": 0.06734356880187989, "step": 1740 }, { "epoch": 1.8331150117893635, "grad_norm": 0.09280339628458023, "learning_rate": 3.478587267822987e-05, "loss": 0.06897796392440796, "step": 1750 }, { "epoch": 1.8331150117893635, "eval_loss": 0.06596127897500992, "eval_runtime": 35.5001, "eval_samples_per_second": 8.761, "eval_steps_per_second": 8.761, "step": 1750 }, { "epoch": 1.8435944458999214, "grad_norm": 0.1175367683172226, "learning_rate": 3.424842088122716e-05, "loss": 0.08288194537162781, "step": 1760 }, { "epoch": 1.8540738800104795, "grad_norm": 0.10271462798118591, "learning_rate": 3.371298207658003e-05, "loss": 0.05643013119697571, "step": 1770 }, { "epoch": 1.8645533141210375, "grad_norm": 0.11965195834636688, "learning_rate": 3.3179624691346654e-05, "loss": 0.07403092980384826, "step": 1780 }, { "epoch": 1.8750327482315954, "grad_norm": 0.09981680661439896, "learning_rate": 3.2648416886587686e-05, "loss": 0.07118859887123108, "step": 1790 }, { "epoch": 1.8855121823421537, "grad_norm": 0.07787375897169113, "learning_rate": 3.2119426548655435e-05, "loss": 0.07219682335853576, "step": 1800 }, { "epoch": 1.8959916164527115, "grad_norm": 0.1303507387638092, "learning_rate": 3.1592721280518404e-05, "loss": 0.07636030912399291, "step": 1810 }, { "epoch": 1.9064710505632696, "grad_norm": 0.09162267297506332, "learning_rate": 3.106836839312175e-05, "loss": 0.06230143308639526, "step": 1820 }, { "epoch": 1.9169504846738277, "grad_norm": 0.11375878751277924, "learning_rate": 3.054643489678526e-05, "loss": 0.060506826639175414, "step": 1830 }, { "epoch": 1.9274299187843855, "grad_norm": 0.1377716213464737, "learning_rate": 3.0026987492639668e-05, "loss": 0.08148540854454041, "step": 1840 }, { "epoch": 1.9379093528949438, "grad_norm": 0.10483554750680923, "learning_rate": 2.951009256410255e-05, "loss": 0.07040726542472839, "step": 1850 }, { "epoch": 1.9483887870055017, "grad_norm": 0.08736151456832886, "learning_rate": 2.8995816168394702e-05, "loss": 0.04931557774543762, "step": 1860 }, { "epoch": 1.9588682211160597, "grad_norm": 0.11461569368839264, "learning_rate": 2.848422402809828e-05, "loss": 0.057559752464294435, "step": 1870 }, { "epoch": 1.9693476552266178, "grad_norm": 0.09060918539762497, "learning_rate": 2.7975381522757803e-05, "loss": 0.06379705667495728, "step": 1880 }, { "epoch": 1.9798270893371757, "grad_norm": 0.07104971259832382, "learning_rate": 2.746935368052477e-05, "loss": 0.05813115239143372, "step": 1890 }, { "epoch": 1.990306523447734, "grad_norm": 0.10802938044071198, "learning_rate": 2.696620516984733e-05, "loss": 0.07732833027839661, "step": 1900 }, { "epoch": 2.0, "grad_norm": 0.16884952783584595, "learning_rate": 2.6466000291206004e-05, "loss": 0.06166202425956726, "step": 1910 }, { "epoch": 2.010479434110558, "grad_norm": 0.08582179993391037, "learning_rate": 2.5968802968896228e-05, "loss": 0.04766199886798859, "step": 1920 }, { "epoch": 2.020958868221116, "grad_norm": 0.1457364708185196, "learning_rate": 2.5474676742859048e-05, "loss": 0.03826354146003723, "step": 1930 }, { "epoch": 2.031438302331674, "grad_norm": 0.09275342524051666, "learning_rate": 2.4983684760561023e-05, "loss": 0.045059433579444884, "step": 1940 }, { "epoch": 2.0419177364422323, "grad_norm": 0.09085927903652191, "learning_rate": 2.44958897689242e-05, "loss": 0.04904903173446655, "step": 1950 }, { "epoch": 2.05239717055279, "grad_norm": 0.11733179539442062, "learning_rate": 2.401135410630731e-05, "loss": 0.05008396506309509, "step": 1960 }, { "epoch": 2.062876604663348, "grad_norm": 0.0894237607717514, "learning_rate": 2.3530139694539095e-05, "loss": 0.04057626128196716, "step": 1970 }, { "epoch": 2.0733560387739063, "grad_norm": 0.08560927212238312, "learning_rate": 2.305230803100496e-05, "loss": 0.04843136668205261, "step": 1980 }, { "epoch": 2.083835472884464, "grad_norm": 0.07991836220026016, "learning_rate": 2.257792018078793e-05, "loss": 0.0544127106666565, "step": 1990 }, { "epoch": 2.0943149069950224, "grad_norm": 0.08846250921487808, "learning_rate": 2.210703676886461e-05, "loss": 0.0459000825881958, "step": 2000 }, { "epoch": 2.0943149069950224, "eval_loss": 0.060011014342308044, "eval_runtime": 36.3755, "eval_samples_per_second": 8.55, "eval_steps_per_second": 8.55, "step": 2000 }, { "epoch": 2.1047943411055803, "grad_norm": 0.10082945972681046, "learning_rate": 2.1639717972357678e-05, "loss": 0.038090622425079344, "step": 2010 }, { "epoch": 2.115273775216138, "grad_norm": 0.05712248757481575, "learning_rate": 2.1176023512845376e-05, "loss": 0.04598597884178161, "step": 2020 }, { "epoch": 2.1257532093266964, "grad_norm": 0.11628362536430359, "learning_rate": 2.0716012648729353e-05, "loss": 0.04984880685806274, "step": 2030 }, { "epoch": 2.1362326434372543, "grad_norm": 0.10635484755039215, "learning_rate": 2.025974416766171e-05, "loss": 0.04293925166130066, "step": 2040 }, { "epoch": 2.1467120775478126, "grad_norm": 0.1017381027340889, "learning_rate": 1.9807276379032113e-05, "loss": 0.04305694401264191, "step": 2050 }, { "epoch": 2.1571915116583704, "grad_norm": 0.13550882041454315, "learning_rate": 1.9358667106516055e-05, "loss": 0.04478869140148163, "step": 2060 }, { "epoch": 2.1676709457689283, "grad_norm": 0.08526366949081421, "learning_rate": 1.8913973680685226e-05, "loss": 0.036646312475204466, "step": 2070 }, { "epoch": 2.1781503798794866, "grad_norm": 0.10932011157274246, "learning_rate": 1.8473252931680928e-05, "loss": 0.042200219631195066, "step": 2080 }, { "epoch": 2.1886298139900444, "grad_norm": 0.08768360316753387, "learning_rate": 1.803656118195136e-05, "loss": 0.0437488317489624, "step": 2090 }, { "epoch": 2.1991092481006027, "grad_norm": 0.08362651616334915, "learning_rate": 1.760395423905379e-05, "loss": 0.04669668078422547, "step": 2100 }, { "epoch": 2.2095886822111606, "grad_norm": 0.08554034680128098, "learning_rate": 1.7175487388522588e-05, "loss": 0.034989356994628906, "step": 2110 }, { "epoch": 2.220068116321719, "grad_norm": 0.08215561509132385, "learning_rate": 1.6751215386803986e-05, "loss": 0.040298929810523985, "step": 2120 }, { "epoch": 2.2305475504322767, "grad_norm": 0.0840689167380333, "learning_rate": 1.6331192454258337e-05, "loss": 0.041704925894737246, "step": 2130 }, { "epoch": 2.2410269845428346, "grad_norm": 0.06530614197254181, "learning_rate": 1.5915472268231018e-05, "loss": 0.03651900887489319, "step": 2140 }, { "epoch": 2.251506418653393, "grad_norm": 0.12431822717189789, "learning_rate": 1.550410795619261e-05, "loss": 0.04806804955005646, "step": 2150 }, { "epoch": 2.2619858527639507, "grad_norm": 0.09592410176992416, "learning_rate": 1.509715208894949e-05, "loss": 0.0454313725233078, "step": 2160 }, { "epoch": 2.2724652868745085, "grad_norm": 0.07589780539274216, "learning_rate": 1.469465667392536e-05, "loss": 0.03574602603912354, "step": 2170 }, { "epoch": 2.282944720985067, "grad_norm": 0.09734483063220978, "learning_rate": 1.4296673148515038e-05, "loss": 0.04358702301979065, "step": 2180 }, { "epoch": 2.2934241550956247, "grad_norm": 0.0974339172244072, "learning_rate": 1.3903252373510838e-05, "loss": 0.04603351950645447, "step": 2190 }, { "epoch": 2.303903589206183, "grad_norm": 0.09025271981954575, "learning_rate": 1.3514444626602773e-05, "loss": 0.040065237879753114, "step": 2200 }, { "epoch": 2.314383023316741, "grad_norm": 0.07625086605548859, "learning_rate": 1.3130299595953338e-05, "loss": 0.044061675667762756, "step": 2210 }, { "epoch": 2.324862457427299, "grad_norm": 0.07306221127510071, "learning_rate": 1.2750866373847465e-05, "loss": 0.03366467654705048, "step": 2220 }, { "epoch": 2.335341891537857, "grad_norm": 0.08357638120651245, "learning_rate": 1.2376193450418715e-05, "loss": 0.041424044966697694, "step": 2230 }, { "epoch": 2.345821325648415, "grad_norm": 0.09153921157121658, "learning_rate": 1.2006328707452459e-05, "loss": 0.03938372135162353, "step": 2240 }, { "epoch": 2.356300759758973, "grad_norm": 0.09109660983085632, "learning_rate": 1.1641319412266765e-05, "loss": 0.04015985131263733, "step": 2250 }, { "epoch": 2.356300759758973, "eval_loss": 0.05486458167433739, "eval_runtime": 36.8119, "eval_samples_per_second": 8.448, "eval_steps_per_second": 8.448, "step": 2250 }, { "epoch": 2.366780193869531, "grad_norm": 0.052502721548080444, "learning_rate": 1.1281212211671822e-05, "loss": 0.0270554780960083, "step": 2260 }, { "epoch": 2.377259627980089, "grad_norm": 0.07931812107563019, "learning_rate": 1.0926053126008584e-05, "loss": 0.0417300134897232, "step": 2270 }, { "epoch": 2.387739062090647, "grad_norm": 0.08996254205703735, "learning_rate": 1.0575887543267609e-05, "loss": 0.037659955024719236, "step": 2280 }, { "epoch": 2.398218496201205, "grad_norm": 0.08800788223743439, "learning_rate": 1.023076021328867e-05, "loss": 0.048437944054603575, "step": 2290 }, { "epoch": 2.4086979303117633, "grad_norm": 0.10572271049022675, "learning_rate": 9.890715242041787e-06, "loss": 0.04166909456253052, "step": 2300 }, { "epoch": 2.419177364422321, "grad_norm": 0.10573071986436844, "learning_rate": 9.555796085990781e-06, "loss": 0.03919607996940613, "step": 2310 }, { "epoch": 2.4296567985328794, "grad_norm": 0.09714583307504654, "learning_rate": 9.226045546539608e-06, "loss": 0.03530588150024414, "step": 2320 }, { "epoch": 2.4401362326434373, "grad_norm": 0.09436199069023132, "learning_rate": 8.901505764562518e-06, "loss": 0.05111382007598877, "step": 2330 }, { "epoch": 2.450615666753995, "grad_norm": 0.06353961676359177, "learning_rate": 8.582218215018656e-06, "loss": 0.03805697858333588, "step": 2340 }, { "epoch": 2.4610951008645534, "grad_norm": 0.08853815495967865, "learning_rate": 8.268223701651684e-06, "loss": 0.04815975427627563, "step": 2350 }, { "epoch": 2.4715745349751113, "grad_norm": 0.07472016662359238, "learning_rate": 7.959562351775196e-06, "loss": 0.042247459292411804, "step": 2360 }, { "epoch": 2.4820539690856696, "grad_norm": 0.12121549248695374, "learning_rate": 7.656273611144632e-06, "loss": 0.040102115273475646, "step": 2370 }, { "epoch": 2.4925334031962274, "grad_norm": 0.08667747676372528, "learning_rate": 7.358396238916254e-06, "loss": 0.03656341433525086, "step": 2380 }, { "epoch": 2.5030128373067857, "grad_norm": 0.1162872165441513, "learning_rate": 7.065968302693882e-06, "loss": 0.04052766263484955, "step": 2390 }, { "epoch": 2.5134922714173435, "grad_norm": 0.07924140989780426, "learning_rate": 6.7790271736639595e-06, "loss": 0.03394221067428589, "step": 2400 }, { "epoch": 2.5239717055279014, "grad_norm": 0.09523408859968185, "learning_rate": 6.497609521819681e-06, "loss": 0.04119439423084259, "step": 2410 }, { "epoch": 2.5344511396384597, "grad_norm": 0.12182598561048508, "learning_rate": 6.221751311274731e-06, "loss": 0.05154783725738525, "step": 2420 }, { "epoch": 2.5449305737490175, "grad_norm": 0.09359873831272125, "learning_rate": 5.951487795667149e-06, "loss": 0.035483264923095705, "step": 2430 }, { "epoch": 2.5554100078595754, "grad_norm": 0.08514095097780228, "learning_rate": 5.686853513654117e-06, "loss": 0.03830339312553406, "step": 2440 }, { "epoch": 2.5658894419701337, "grad_norm": 0.10625084489583969, "learning_rate": 5.4278822844979705e-06, "loss": 0.034111028909683226, "step": 2450 }, { "epoch": 2.5763688760806915, "grad_norm": 0.1004003956913948, "learning_rate": 5.174607203744286e-06, "loss": 0.04465605318546295, "step": 2460 }, { "epoch": 2.58684831019125, "grad_norm": 0.0962519720196724, "learning_rate": 4.927060638992382e-06, "loss": 0.041056016087532045, "step": 2470 }, { "epoch": 2.5973277443018077, "grad_norm": 0.06380607187747955, "learning_rate": 4.685274225758846e-06, "loss": 0.03880062401294708, "step": 2480 }, { "epoch": 2.607807178412366, "grad_norm": 0.07326535880565643, "learning_rate": 4.449278863434647e-06, "loss": 0.03194461762905121, "step": 2490 }, { "epoch": 2.618286612522924, "grad_norm": 0.12218596786260605, "learning_rate": 4.2191047113362854e-06, "loss": 0.04258840978145599, "step": 2500 }, { "epoch": 2.618286612522924, "eval_loss": 0.05223666876554489, "eval_runtime": 37.7234, "eval_samples_per_second": 8.244, "eval_steps_per_second": 8.244, "step": 2500 }, { "epoch": 2.6287660466334817, "grad_norm": 0.08594664931297302, "learning_rate": 3.994781184851598e-06, "loss": 0.04302787780761719, "step": 2510 }, { "epoch": 2.63924548074404, "grad_norm": 0.08187596499919891, "learning_rate": 3.776336951680548e-06, "loss": 0.0341387003660202, "step": 2520 }, { "epoch": 2.649724914854598, "grad_norm": 0.10216796398162842, "learning_rate": 3.563799928171596e-06, "loss": 0.04289879500865936, "step": 2530 }, { "epoch": 2.6602043489651557, "grad_norm": 0.11215174198150635, "learning_rate": 3.3571972757540814e-06, "loss": 0.04055049121379852, "step": 2540 }, { "epoch": 2.670683783075714, "grad_norm": 0.07941269129514694, "learning_rate": 3.156555397467176e-06, "loss": 0.04118689000606537, "step": 2550 }, { "epoch": 2.681163217186272, "grad_norm": 0.09404437988996506, "learning_rate": 2.9618999345855547e-06, "loss": 0.03079705536365509, "step": 2560 }, { "epoch": 2.69164265129683, "grad_norm": 0.1109817698597908, "learning_rate": 2.773255763342647e-06, "loss": 0.038885954022407535, "step": 2570 }, { "epoch": 2.702122085407388, "grad_norm": 0.09431962668895721, "learning_rate": 2.590646991751472e-06, "loss": 0.043543145060539246, "step": 2580 }, { "epoch": 2.7126015195179463, "grad_norm": 0.08184763044118881, "learning_rate": 2.414096956523776e-06, "loss": 0.03256987631320953, "step": 2590 }, { "epoch": 2.723080953628504, "grad_norm": 0.08390141278505325, "learning_rate": 2.2436282200876458e-06, "loss": 0.03908055424690247, "step": 2600 }, { "epoch": 2.733560387739062, "grad_norm": 0.0762532502412796, "learning_rate": 2.07926256770416e-06, "loss": 0.04899201393127441, "step": 2610 }, { "epoch": 2.7440398218496203, "grad_norm": 0.08239631354808807, "learning_rate": 1.9210210046832768e-06, "loss": 0.048707082867622375, "step": 2620 }, { "epoch": 2.754519255960178, "grad_norm": 0.09619107842445374, "learning_rate": 1.7689237536994364e-06, "loss": 0.0372231125831604, "step": 2630 }, { "epoch": 2.764998690070736, "grad_norm": 0.07099667191505432, "learning_rate": 1.6229902522072293e-06, "loss": 0.03421170711517334, "step": 2640 }, { "epoch": 2.7754781241812942, "grad_norm": 0.10154753923416138, "learning_rate": 1.4832391499572996e-06, "loss": 0.03656705319881439, "step": 2650 }, { "epoch": 2.785957558291852, "grad_norm": 0.09349387139081955, "learning_rate": 1.3496883066130173e-06, "loss": 0.03710306882858276, "step": 2660 }, { "epoch": 2.7964369924024104, "grad_norm": 0.061091430485248566, "learning_rate": 1.2223547894680443e-06, "loss": 0.0308389812707901, "step": 2670 }, { "epoch": 2.8069164265129682, "grad_norm": 0.09838075935840607, "learning_rate": 1.101254871265256e-06, "loss": 0.03703555166721344, "step": 2680 }, { "epoch": 2.8173958606235265, "grad_norm": 0.10046928375959396, "learning_rate": 9.864040281170938e-07, "loss": 0.04500553905963898, "step": 2690 }, { "epoch": 2.8278752947340844, "grad_norm": 0.06770773977041245, "learning_rate": 8.778169375277978e-07, "loss": 0.03823737502098083, "step": 2700 }, { "epoch": 2.8383547288446422, "grad_norm": 0.08373535424470901, "learning_rate": 7.755074765176618e-07, "loss": 0.03961678743362427, "step": 2710 }, { "epoch": 2.8488341629552005, "grad_norm": 0.07590050995349884, "learning_rate": 6.794887198496413e-07, "loss": 0.03221273124217987, "step": 2720 }, { "epoch": 2.8593135970657584, "grad_norm": 0.08507678657770157, "learning_rate": 5.897729383583906e-07, "loss": 0.04571912884712219, "step": 2730 }, { "epoch": 2.8697930311763162, "grad_norm": 0.06584763526916504, "learning_rate": 5.063715973821659e-07, "loss": 0.03794914484024048, "step": 2740 }, { "epoch": 2.8802724652868745, "grad_norm": 0.07312892377376556, "learning_rate": 4.292953552975154e-07, "loss": 0.036365586519241336, "step": 2750 }, { "epoch": 2.8802724652868745, "eval_loss": 0.05090421438217163, "eval_runtime": 85.293, "eval_samples_per_second": 3.646, "eval_steps_per_second": 3.646, "step": 2750 }, { "epoch": 2.8907518993974324, "grad_norm": 0.08459606021642685, "learning_rate": 3.5855406215725697e-07, "loss": 0.03068857192993164, "step": 2760 }, { "epoch": 2.9012313335079907, "grad_norm": 0.06866376101970673, "learning_rate": 2.9415675843163515e-07, "loss": 0.03265829384326935, "step": 2770 }, { "epoch": 2.9117107676185485, "grad_norm": 0.09082643687725067, "learning_rate": 2.361116738529956e-07, "loss": 0.03418546915054321, "step": 2780 }, { "epoch": 2.922190201729107, "grad_norm": 0.10772739350795746, "learning_rate": 1.8442622636404284e-07, "loss": 0.03810786008834839, "step": 2790 }, { "epoch": 2.9326696358396647, "grad_norm": 0.08321297913789749, "learning_rate": 1.391070211698764e-07, "loss": 0.04068491756916046, "step": 2800 }, { "epoch": 2.9431490699502225, "grad_norm": 0.11239277571439743, "learning_rate": 1.0015984989385496e-07, "loss": 0.041029155254364014, "step": 2810 }, { "epoch": 2.953628504060781, "grad_norm": 0.07199843227863312, "learning_rate": 6.758968983747171e-08, "loss": 0.037902483344078065, "step": 2820 }, { "epoch": 2.9641079381713387, "grad_norm": 0.08249279856681824, "learning_rate": 4.140070334422985e-08, "loss": 0.03996126651763916, "step": 2830 }, { "epoch": 2.9745873722818965, "grad_norm": 0.0852220207452774, "learning_rate": 2.1596237267751396e-08, "loss": 0.04228667616844177, "step": 2840 }, { "epoch": 2.985066806392455, "grad_norm": 0.0858582928776741, "learning_rate": 8.178822544052666e-09, "loss": 0.03813594281673431, "step": 2850 }, { "epoch": 2.995546240503013, "grad_norm": 0.06642451137304306, "learning_rate": 1.1501738680919084e-09, "loss": 0.033472076058387756, "step": 2860 } ], "logging_steps": 10, "max_steps": 2865, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.031737271887514e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }