{ "best_global_step": 6878, "best_metric": 0.8412191271781921, "best_model_checkpoint": "outputs/vlm-age-rating-qwen25vl/checkpoint-6878", "epoch": 2.0, "eval_steps": 500, "global_step": 6878, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002907822041291073, "grad_norm": 4.79532527923584, "learning_rate": 8.695652173913044e-06, "loss": 2.4354520797729493, "step": 10 }, { "epoch": 0.005815644082582146, "grad_norm": 4.398916721343994, "learning_rate": 1.8357487922705315e-05, "loss": 2.348348045349121, "step": 20 }, { "epoch": 0.008723466123873218, "grad_norm": 3.8363561630249023, "learning_rate": 2.8019323671497587e-05, "loss": 2.2402442932128905, "step": 30 }, { "epoch": 0.011631288165164292, "grad_norm": 3.9996328353881836, "learning_rate": 3.7681159420289856e-05, "loss": 2.0806962966918947, "step": 40 }, { "epoch": 0.014539110206455364, "grad_norm": 3.623289108276367, "learning_rate": 4.7342995169082125e-05, "loss": 1.9209724426269532, "step": 50 }, { "epoch": 0.017446932247746436, "grad_norm": 3.6283390522003174, "learning_rate": 5.7004830917874394e-05, "loss": 1.8231483459472657, "step": 60 }, { "epoch": 0.020354754289037512, "grad_norm": 3.6183929443359375, "learning_rate": 6.666666666666667e-05, "loss": 1.7064151763916016, "step": 70 }, { "epoch": 0.023262576330328584, "grad_norm": 4.0795063972473145, "learning_rate": 7.632850241545893e-05, "loss": 1.6121023178100586, "step": 80 }, { "epoch": 0.026170398371619656, "grad_norm": 3.4447927474975586, "learning_rate": 8.599033816425122e-05, "loss": 1.6030376434326172, "step": 90 }, { "epoch": 0.02907822041291073, "grad_norm": 3.6108040809631348, "learning_rate": 9.565217391304348e-05, "loss": 1.5841608047485352, "step": 100 }, { "epoch": 0.0319860424542018, "grad_norm": 3.598888635635376, "learning_rate": 0.00010531400966183576, "loss": 1.4951010704040528, "step": 110 }, { "epoch": 0.03489386449549287, "grad_norm": 3.70912766456604, "learning_rate": 0.00011497584541062802, "loss": 1.482225227355957, "step": 120 }, { "epoch": 0.03780168653678395, "grad_norm": 3.4434823989868164, "learning_rate": 0.0001246376811594203, "loss": 1.5170270919799804, "step": 130 }, { "epoch": 0.040709508578075024, "grad_norm": 3.361293315887451, "learning_rate": 0.00013429951690821257, "loss": 1.4422802925109863, "step": 140 }, { "epoch": 0.043617330619366096, "grad_norm": 3.285003900527954, "learning_rate": 0.00014396135265700482, "loss": 1.4413420677185058, "step": 150 }, { "epoch": 0.04652515266065717, "grad_norm": 3.6356050968170166, "learning_rate": 0.0001536231884057971, "loss": 1.4118947982788086, "step": 160 }, { "epoch": 0.04943297470194824, "grad_norm": 3.8388562202453613, "learning_rate": 0.00016328502415458937, "loss": 1.4001873970031737, "step": 170 }, { "epoch": 0.05234079674323931, "grad_norm": 3.3433616161346436, "learning_rate": 0.00017294685990338165, "loss": 1.3969547271728515, "step": 180 }, { "epoch": 0.055248618784530384, "grad_norm": 3.4669270515441895, "learning_rate": 0.00018260869565217392, "loss": 1.337031650543213, "step": 190 }, { "epoch": 0.05815644082582146, "grad_norm": 3.1025021076202393, "learning_rate": 0.0001922705314009662, "loss": 1.435082244873047, "step": 200 }, { "epoch": 0.061064262867112536, "grad_norm": 3.3911843299865723, "learning_rate": 0.00019994003897466647, "loss": 1.392270278930664, "step": 210 }, { "epoch": 0.0639720849084036, "grad_norm": 3.142547130584717, "learning_rate": 0.0001996402338479988, "loss": 1.243046760559082, "step": 220 }, { "epoch": 0.06687990694969467, "grad_norm": 3.35446834564209, "learning_rate": 0.00019934042872133114, "loss": 1.367037868499756, "step": 230 }, { "epoch": 0.06978772899098575, "grad_norm": 3.458134651184082, "learning_rate": 0.0001990406235946635, "loss": 1.3419992446899414, "step": 240 }, { "epoch": 0.07269555103227683, "grad_norm": 3.352386474609375, "learning_rate": 0.0001987408184679958, "loss": 1.2986759185791015, "step": 250 }, { "epoch": 0.0756033730735679, "grad_norm": 3.1780903339385986, "learning_rate": 0.00019844101334132814, "loss": 1.2988534927368165, "step": 260 }, { "epoch": 0.07851119511485898, "grad_norm": 2.86576247215271, "learning_rate": 0.00019814120821466048, "loss": 1.3682787895202637, "step": 270 }, { "epoch": 0.08141901715615005, "grad_norm": 3.3476641178131104, "learning_rate": 0.0001978414030879928, "loss": 1.3652713775634766, "step": 280 }, { "epoch": 0.08432683919744112, "grad_norm": 3.2246811389923096, "learning_rate": 0.00019754159796132515, "loss": 1.2679337501525878, "step": 290 }, { "epoch": 0.08723466123873219, "grad_norm": 3.134486675262451, "learning_rate": 0.00019724179283465748, "loss": 1.211684513092041, "step": 300 }, { "epoch": 0.09014248328002326, "grad_norm": 3.142294406890869, "learning_rate": 0.00019694198770798982, "loss": 1.3083333015441894, "step": 310 }, { "epoch": 0.09305030532131434, "grad_norm": 3.2110912799835205, "learning_rate": 0.00019664218258132215, "loss": 1.2491901397705079, "step": 320 }, { "epoch": 0.09595812736260541, "grad_norm": 3.0253570079803467, "learning_rate": 0.00019634237745465449, "loss": 1.3311482429504395, "step": 330 }, { "epoch": 0.09886594940389648, "grad_norm": 2.7756271362304688, "learning_rate": 0.00019604257232798682, "loss": 1.2524259567260743, "step": 340 }, { "epoch": 0.10177377144518755, "grad_norm": 3.243698835372925, "learning_rate": 0.00019574276720131915, "loss": 1.293147087097168, "step": 350 }, { "epoch": 0.10468159348647862, "grad_norm": 2.8504862785339355, "learning_rate": 0.0001954429620746515, "loss": 1.2383965492248534, "step": 360 }, { "epoch": 0.1075894155277697, "grad_norm": 3.280348062515259, "learning_rate": 0.00019514315694798382, "loss": 1.2218168258666993, "step": 370 }, { "epoch": 0.11049723756906077, "grad_norm": 3.1593716144561768, "learning_rate": 0.00019484335182131616, "loss": 1.2068678855895996, "step": 380 }, { "epoch": 0.11340505961035184, "grad_norm": 3.1796255111694336, "learning_rate": 0.0001945435466946485, "loss": 1.2717905998229981, "step": 390 }, { "epoch": 0.11631288165164291, "grad_norm": 2.8659069538116455, "learning_rate": 0.00019424374156798083, "loss": 1.2112377166748047, "step": 400 }, { "epoch": 0.119220703692934, "grad_norm": 2.9140565395355225, "learning_rate": 0.00019394393644131313, "loss": 1.2414008140563966, "step": 410 }, { "epoch": 0.12212852573422507, "grad_norm": 3.0246622562408447, "learning_rate": 0.0001936441313146455, "loss": 1.2327194213867188, "step": 420 }, { "epoch": 0.12503634777551614, "grad_norm": 2.7067604064941406, "learning_rate": 0.00019334432618797783, "loss": 1.1890941619873048, "step": 430 }, { "epoch": 0.1279441698168072, "grad_norm": 3.0323450565338135, "learning_rate": 0.00019304452106131016, "loss": 1.2328312873840332, "step": 440 }, { "epoch": 0.1308519918580983, "grad_norm": 3.0487864017486572, "learning_rate": 0.0001927447159346425, "loss": 1.1881980895996094, "step": 450 }, { "epoch": 0.13375981389938935, "grad_norm": 3.0193893909454346, "learning_rate": 0.00019244491080797483, "loss": 1.2217585563659668, "step": 460 }, { "epoch": 0.13666763594068043, "grad_norm": 3.0121376514434814, "learning_rate": 0.00019214510568130717, "loss": 1.1492167472839356, "step": 470 }, { "epoch": 0.1395754579819715, "grad_norm": 2.9854133129119873, "learning_rate": 0.0001918453005546395, "loss": 1.2008344650268554, "step": 480 }, { "epoch": 0.14248328002326258, "grad_norm": 2.982191562652588, "learning_rate": 0.0001915454954279718, "loss": 1.1686691284179687, "step": 490 }, { "epoch": 0.14539110206455366, "grad_norm": 4.596578598022461, "learning_rate": 0.00019124569030130417, "loss": 1.2213269233703614, "step": 500 }, { "epoch": 0.14539110206455366, "eval_loss": 1.1948493719100952, "eval_runtime": 471.9433, "eval_samples_per_second": 7.287, "eval_steps_per_second": 7.287, "step": 500 }, { "epoch": 0.14829892410584472, "grad_norm": 2.8607027530670166, "learning_rate": 0.0001909458851746365, "loss": 1.2157353401184081, "step": 510 }, { "epoch": 0.1512067461471358, "grad_norm": 3.3106703758239746, "learning_rate": 0.0001906460800479688, "loss": 1.2436732292175292, "step": 520 }, { "epoch": 0.15411456818842686, "grad_norm": 2.8542511463165283, "learning_rate": 0.00019034627492130118, "loss": 1.2832223892211914, "step": 530 }, { "epoch": 0.15702239022971795, "grad_norm": 2.961954355239868, "learning_rate": 0.0001900464697946335, "loss": 1.214921474456787, "step": 540 }, { "epoch": 0.159930212271009, "grad_norm": 3.2760446071624756, "learning_rate": 0.00018974666466796582, "loss": 1.2085216522216797, "step": 550 }, { "epoch": 0.1628380343123001, "grad_norm": 3.117536783218384, "learning_rate": 0.00018944685954129818, "loss": 1.1909247398376466, "step": 560 }, { "epoch": 0.16574585635359115, "grad_norm": 3.1558895111083984, "learning_rate": 0.00018914705441463049, "loss": 1.2084429740905762, "step": 570 }, { "epoch": 0.16865367839488224, "grad_norm": 2.6539628505706787, "learning_rate": 0.00018884724928796282, "loss": 1.2048931121826172, "step": 580 }, { "epoch": 0.1715615004361733, "grad_norm": 2.4215893745422363, "learning_rate": 0.00018854744416129518, "loss": 1.1528879165649415, "step": 590 }, { "epoch": 0.17446932247746438, "grad_norm": 3.247122049331665, "learning_rate": 0.0001882476390346275, "loss": 1.1167863845825194, "step": 600 }, { "epoch": 0.17737714451875544, "grad_norm": 2.5841548442840576, "learning_rate": 0.00018794783390795982, "loss": 1.1510201454162599, "step": 610 }, { "epoch": 0.18028496656004653, "grad_norm": 2.8135194778442383, "learning_rate": 0.00018764802878129219, "loss": 1.1705083847045898, "step": 620 }, { "epoch": 0.1831927886013376, "grad_norm": 2.7703614234924316, "learning_rate": 0.0001873482236546245, "loss": 1.1498478889465331, "step": 630 }, { "epoch": 0.18610061064262867, "grad_norm": 2.933802843093872, "learning_rate": 0.00018704841852795685, "loss": 1.1263715744018554, "step": 640 }, { "epoch": 0.18900843268391973, "grad_norm": 2.8813698291778564, "learning_rate": 0.00018674861340128916, "loss": 1.0713846206665039, "step": 650 }, { "epoch": 0.19191625472521082, "grad_norm": 4.469653606414795, "learning_rate": 0.0001864488082746215, "loss": 1.1526763916015625, "step": 660 }, { "epoch": 0.1948240767665019, "grad_norm": 2.608485698699951, "learning_rate": 0.00018614900314795386, "loss": 1.1115928649902345, "step": 670 }, { "epoch": 0.19773189880779296, "grad_norm": 2.5391645431518555, "learning_rate": 0.00018584919802128617, "loss": 1.1768034934997558, "step": 680 }, { "epoch": 0.20063972084908405, "grad_norm": 2.8083078861236572, "learning_rate": 0.0001855493928946185, "loss": 1.1693111419677735, "step": 690 }, { "epoch": 0.2035475428903751, "grad_norm": 2.7247753143310547, "learning_rate": 0.00018524958776795086, "loss": 1.1793179512023926, "step": 700 }, { "epoch": 0.2064553649316662, "grad_norm": 2.5489275455474854, "learning_rate": 0.00018494978264128317, "loss": 1.153579044342041, "step": 710 }, { "epoch": 0.20936318697295725, "grad_norm": 2.848568916320801, "learning_rate": 0.0001846499775146155, "loss": 1.1616141319274902, "step": 720 }, { "epoch": 0.21227100901424834, "grad_norm": 2.6661322116851807, "learning_rate": 0.00018435017238794784, "loss": 1.185004997253418, "step": 730 }, { "epoch": 0.2151788310555394, "grad_norm": 2.6399037837982178, "learning_rate": 0.00018405036726128017, "loss": 1.1467523574829102, "step": 740 }, { "epoch": 0.21808665309683048, "grad_norm": 2.8910253047943115, "learning_rate": 0.0001837505621346125, "loss": 1.1423175811767579, "step": 750 }, { "epoch": 0.22099447513812154, "grad_norm": 2.6750359535217285, "learning_rate": 0.00018345075700794484, "loss": 1.1023540496826172, "step": 760 }, { "epoch": 0.22390229717941262, "grad_norm": 2.7184813022613525, "learning_rate": 0.00018315095188127718, "loss": 1.0655659675598144, "step": 770 }, { "epoch": 0.22681011922070368, "grad_norm": 2.352343797683716, "learning_rate": 0.0001828511467546095, "loss": 1.0686527252197267, "step": 780 }, { "epoch": 0.22971794126199477, "grad_norm": 2.9609851837158203, "learning_rate": 0.00018255134162794185, "loss": 1.0448270797729493, "step": 790 }, { "epoch": 0.23262576330328583, "grad_norm": 2.629925489425659, "learning_rate": 0.00018225153650127418, "loss": 1.2246409416198731, "step": 800 }, { "epoch": 0.2355335853445769, "grad_norm": 2.443338632583618, "learning_rate": 0.00018195173137460651, "loss": 1.0720739364624023, "step": 810 }, { "epoch": 0.238441407385868, "grad_norm": 2.612340211868286, "learning_rate": 0.00018165192624793885, "loss": 1.135690975189209, "step": 820 }, { "epoch": 0.24134922942715906, "grad_norm": 2.6910789012908936, "learning_rate": 0.00018135212112127118, "loss": 1.160903263092041, "step": 830 }, { "epoch": 0.24425705146845014, "grad_norm": 2.729325532913208, "learning_rate": 0.00018105231599460352, "loss": 1.1131114959716797, "step": 840 }, { "epoch": 0.2471648735097412, "grad_norm": 2.6346912384033203, "learning_rate": 0.00018075251086793585, "loss": 1.1584989547729492, "step": 850 }, { "epoch": 0.2500726955510323, "grad_norm": 2.4764983654022217, "learning_rate": 0.0001804527057412682, "loss": 1.1217921257019043, "step": 860 }, { "epoch": 0.2529805175923234, "grad_norm": 2.8104140758514404, "learning_rate": 0.00018015290061460052, "loss": 1.1900800704956054, "step": 870 }, { "epoch": 0.2558883396336144, "grad_norm": 2.6423113346099854, "learning_rate": 0.00017985309548793286, "loss": 1.1654984474182128, "step": 880 }, { "epoch": 0.2587961616749055, "grad_norm": 2.456171751022339, "learning_rate": 0.0001795532903612652, "loss": 1.1631184577941895, "step": 890 }, { "epoch": 0.2617039837161966, "grad_norm": 2.355860471725464, "learning_rate": 0.00017925348523459752, "loss": 1.1408035278320312, "step": 900 }, { "epoch": 0.26461180575748766, "grad_norm": 2.7640347480773926, "learning_rate": 0.00017895368010792986, "loss": 1.095857810974121, "step": 910 }, { "epoch": 0.2675196277987787, "grad_norm": 2.518118143081665, "learning_rate": 0.0001786538749812622, "loss": 1.1492120742797851, "step": 920 }, { "epoch": 0.2704274498400698, "grad_norm": 2.478942632675171, "learning_rate": 0.00017835406985459453, "loss": 1.095020580291748, "step": 930 }, { "epoch": 0.27333527188136086, "grad_norm": 2.5483644008636475, "learning_rate": 0.00017805426472792686, "loss": 1.1031085014343263, "step": 940 }, { "epoch": 0.27624309392265195, "grad_norm": 3.0256600379943848, "learning_rate": 0.00017775445960125917, "loss": 1.1728601455688477, "step": 950 }, { "epoch": 0.279150915963943, "grad_norm": 2.417307138442993, "learning_rate": 0.00017745465447459153, "loss": 1.1077165603637695, "step": 960 }, { "epoch": 0.28205873800523407, "grad_norm": 2.5772206783294678, "learning_rate": 0.00017715484934792387, "loss": 1.1050203323364258, "step": 970 }, { "epoch": 0.28496656004652515, "grad_norm": 2.4329328536987305, "learning_rate": 0.00017685504422125617, "loss": 1.0694819450378419, "step": 980 }, { "epoch": 0.28787438208781624, "grad_norm": 2.2022688388824463, "learning_rate": 0.00017655523909458854, "loss": 1.0653936386108398, "step": 990 }, { "epoch": 0.2907822041291073, "grad_norm": 2.5784685611724854, "learning_rate": 0.00017625543396792087, "loss": 1.1188321113586426, "step": 1000 }, { "epoch": 0.2907822041291073, "eval_loss": 1.088085651397705, "eval_runtime": 466.4025, "eval_samples_per_second": 7.373, "eval_steps_per_second": 7.373, "step": 1000 }, { "epoch": 0.29369002617039836, "grad_norm": 2.1797893047332764, "learning_rate": 0.00017595562884125318, "loss": 1.0641416549682616, "step": 1010 }, { "epoch": 0.29659784821168944, "grad_norm": 2.351658344268799, "learning_rate": 0.00017565582371458554, "loss": 1.0726565361022948, "step": 1020 }, { "epoch": 0.2995056702529805, "grad_norm": 2.5299222469329834, "learning_rate": 0.00017535601858791785, "loss": 1.110872459411621, "step": 1030 }, { "epoch": 0.3024134922942716, "grad_norm": 2.492405652999878, "learning_rate": 0.00017505621346125018, "loss": 1.0856511116027832, "step": 1040 }, { "epoch": 0.30532131433556264, "grad_norm": 2.490410089492798, "learning_rate": 0.00017475640833458254, "loss": 1.053286075592041, "step": 1050 }, { "epoch": 0.30822913637685373, "grad_norm": 2.451176404953003, "learning_rate": 0.00017445660320791485, "loss": 1.1099212646484375, "step": 1060 }, { "epoch": 0.3111369584181448, "grad_norm": 2.2699105739593506, "learning_rate": 0.0001741567980812472, "loss": 0.9613182067871093, "step": 1070 }, { "epoch": 0.3140447804594359, "grad_norm": 2.1652886867523193, "learning_rate": 0.00017385699295457955, "loss": 1.0921488761901856, "step": 1080 }, { "epoch": 0.31695260250072693, "grad_norm": 2.385770797729492, "learning_rate": 0.00017355718782791185, "loss": 1.1041201591491698, "step": 1090 }, { "epoch": 0.319860424542018, "grad_norm": 2.5070347785949707, "learning_rate": 0.00017325738270124421, "loss": 1.0626919746398926, "step": 1100 }, { "epoch": 0.3227682465833091, "grad_norm": 2.5580434799194336, "learning_rate": 0.00017295757757457652, "loss": 1.0355568885803224, "step": 1110 }, { "epoch": 0.3256760686246002, "grad_norm": 2.285900592803955, "learning_rate": 0.00017265777244790886, "loss": 1.094072151184082, "step": 1120 }, { "epoch": 0.3285838906658912, "grad_norm": 2.2862985134124756, "learning_rate": 0.00017235796732124122, "loss": 1.1321526527404786, "step": 1130 }, { "epoch": 0.3314917127071823, "grad_norm": 2.3698503971099854, "learning_rate": 0.00017205816219457353, "loss": 1.1155936241149902, "step": 1140 }, { "epoch": 0.3343995347484734, "grad_norm": 2.2256312370300293, "learning_rate": 0.00017175835706790586, "loss": 1.0627290725708007, "step": 1150 }, { "epoch": 0.3373073567897645, "grad_norm": 2.3896291255950928, "learning_rate": 0.00017145855194123822, "loss": 1.0547872543334962, "step": 1160 }, { "epoch": 0.34021517883105556, "grad_norm": 2.1930463314056396, "learning_rate": 0.00017115874681457053, "loss": 1.0243175506591797, "step": 1170 }, { "epoch": 0.3431230008723466, "grad_norm": 2.0768635272979736, "learning_rate": 0.00017085894168790286, "loss": 1.1152023315429687, "step": 1180 }, { "epoch": 0.3460308229136377, "grad_norm": 2.179349422454834, "learning_rate": 0.0001705591365612352, "loss": 1.1170848846435546, "step": 1190 }, { "epoch": 0.34893864495492877, "grad_norm": 2.2244808673858643, "learning_rate": 0.00017025933143456753, "loss": 0.964411735534668, "step": 1200 }, { "epoch": 0.35184646699621985, "grad_norm": 2.39132022857666, "learning_rate": 0.00016995952630789987, "loss": 1.0051309585571289, "step": 1210 }, { "epoch": 0.3547542890375109, "grad_norm": 2.1408185958862305, "learning_rate": 0.0001696597211812322, "loss": 1.1022598266601562, "step": 1220 }, { "epoch": 0.35766211107880197, "grad_norm": 2.3732504844665527, "learning_rate": 0.00016935991605456454, "loss": 1.0292579650878906, "step": 1230 }, { "epoch": 0.36056993312009306, "grad_norm": 2.5366053581237793, "learning_rate": 0.00016906011092789687, "loss": 1.0912357330322267, "step": 1240 }, { "epoch": 0.36347775516138414, "grad_norm": 2.2400059700012207, "learning_rate": 0.0001687603058012292, "loss": 1.033323383331299, "step": 1250 }, { "epoch": 0.3663855772026752, "grad_norm": 2.2703261375427246, "learning_rate": 0.00016846050067456154, "loss": 1.0109487533569337, "step": 1260 }, { "epoch": 0.36929339924396626, "grad_norm": 2.280935764312744, "learning_rate": 0.00016816069554789387, "loss": 1.0813608169555664, "step": 1270 }, { "epoch": 0.37220122128525734, "grad_norm": 2.168682098388672, "learning_rate": 0.0001678608904212262, "loss": 1.0790273666381835, "step": 1280 }, { "epoch": 0.37510904332654843, "grad_norm": 2.1177940368652344, "learning_rate": 0.00016756108529455854, "loss": 1.031337356567383, "step": 1290 }, { "epoch": 0.37801686536783946, "grad_norm": 2.4730405807495117, "learning_rate": 0.00016726128016789088, "loss": 1.129223918914795, "step": 1300 }, { "epoch": 0.38092468740913055, "grad_norm": 2.095201015472412, "learning_rate": 0.0001669614750412232, "loss": 1.097676658630371, "step": 1310 }, { "epoch": 0.38383250945042163, "grad_norm": 5.1267242431640625, "learning_rate": 0.00016666166991455555, "loss": 1.0067487716674806, "step": 1320 }, { "epoch": 0.3867403314917127, "grad_norm": 2.3142173290252686, "learning_rate": 0.00016636186478788788, "loss": 1.0111748695373535, "step": 1330 }, { "epoch": 0.3896481535330038, "grad_norm": 2.5803937911987305, "learning_rate": 0.00016606205966122022, "loss": 1.0414213180541991, "step": 1340 }, { "epoch": 0.39255597557429484, "grad_norm": 2.057889223098755, "learning_rate": 0.00016576225453455255, "loss": 1.0709516525268554, "step": 1350 }, { "epoch": 0.3954637976155859, "grad_norm": 2.221109628677368, "learning_rate": 0.00016546244940788488, "loss": 0.9829123497009278, "step": 1360 }, { "epoch": 0.398371619656877, "grad_norm": 2.2963709831237793, "learning_rate": 0.00016516264428121722, "loss": 1.0530911445617677, "step": 1370 }, { "epoch": 0.4012794416981681, "grad_norm": 2.225609064102173, "learning_rate": 0.00016486283915454955, "loss": 0.9976913452148437, "step": 1380 }, { "epoch": 0.4041872637394591, "grad_norm": 2.186084032058716, "learning_rate": 0.0001645630340278819, "loss": 0.9746930122375488, "step": 1390 }, { "epoch": 0.4070950857807502, "grad_norm": 2.3998475074768066, "learning_rate": 0.00016426322890121422, "loss": 0.995127010345459, "step": 1400 }, { "epoch": 0.4100029078220413, "grad_norm": 2.2103660106658936, "learning_rate": 0.00016396342377454653, "loss": 0.9968692779541015, "step": 1410 }, { "epoch": 0.4129107298633324, "grad_norm": 2.228457450866699, "learning_rate": 0.0001636636186478789, "loss": 1.0345491409301757, "step": 1420 }, { "epoch": 0.4158185519046234, "grad_norm": 2.2369489669799805, "learning_rate": 0.00016336381352121123, "loss": 1.0916296005249024, "step": 1430 }, { "epoch": 0.4187263739459145, "grad_norm": 2.4540905952453613, "learning_rate": 0.00016306400839454353, "loss": 1.0946255683898927, "step": 1440 }, { "epoch": 0.4216341959872056, "grad_norm": 2.272212505340576, "learning_rate": 0.0001627642032678759, "loss": 1.0748573303222657, "step": 1450 }, { "epoch": 0.42454201802849667, "grad_norm": 2.1080758571624756, "learning_rate": 0.00016246439814120823, "loss": 1.0456165313720702, "step": 1460 }, { "epoch": 0.42744984006978776, "grad_norm": 2.1176912784576416, "learning_rate": 0.00016216459301454056, "loss": 0.9982593536376954, "step": 1470 }, { "epoch": 0.4303576621110788, "grad_norm": 2.0624117851257324, "learning_rate": 0.0001618647878878729, "loss": 1.067337417602539, "step": 1480 }, { "epoch": 0.4332654841523699, "grad_norm": 2.2433207035064697, "learning_rate": 0.0001615649827612052, "loss": 1.1099421501159668, "step": 1490 }, { "epoch": 0.43617330619366096, "grad_norm": 2.0459365844726562, "learning_rate": 0.00016126517763453757, "loss": 1.0837010383605956, "step": 1500 }, { "epoch": 0.43617330619366096, "eval_loss": 1.0330508947372437, "eval_runtime": 467.9014, "eval_samples_per_second": 7.35, "eval_steps_per_second": 7.35, "step": 1500 }, { "epoch": 0.43908112823495205, "grad_norm": 1.9743572473526, "learning_rate": 0.0001609653725078699, "loss": 0.9442779541015625, "step": 1510 }, { "epoch": 0.4419889502762431, "grad_norm": 2.259799003601074, "learning_rate": 0.0001606655673812022, "loss": 1.0716489791870116, "step": 1520 }, { "epoch": 0.44489677231753416, "grad_norm": 2.1605937480926514, "learning_rate": 0.00016036576225453457, "loss": 1.0564517974853516, "step": 1530 }, { "epoch": 0.44780459435882525, "grad_norm": 2.2314250469207764, "learning_rate": 0.0001600659571278669, "loss": 1.0048583030700684, "step": 1540 }, { "epoch": 0.45071241640011633, "grad_norm": 2.099571943283081, "learning_rate": 0.0001597661520011992, "loss": 1.0006441116333007, "step": 1550 }, { "epoch": 0.45362023844140736, "grad_norm": 1.9105018377304077, "learning_rate": 0.00015946634687453157, "loss": 0.9975058555603027, "step": 1560 }, { "epoch": 0.45652806048269845, "grad_norm": 2.057384729385376, "learning_rate": 0.00015916654174786388, "loss": 1.087107276916504, "step": 1570 }, { "epoch": 0.45943588252398954, "grad_norm": 1.9773719310760498, "learning_rate": 0.00015886673662119622, "loss": 1.0378877639770507, "step": 1580 }, { "epoch": 0.4623437045652806, "grad_norm": 2.122605323791504, "learning_rate": 0.00015856693149452858, "loss": 1.0330044746398925, "step": 1590 }, { "epoch": 0.46525152660657165, "grad_norm": 2.409942388534546, "learning_rate": 0.00015826712636786089, "loss": 1.0367450714111328, "step": 1600 }, { "epoch": 0.46815934864786274, "grad_norm": 2.112682342529297, "learning_rate": 0.00015796732124119322, "loss": 1.0601228713989257, "step": 1610 }, { "epoch": 0.4710671706891538, "grad_norm": 2.8438005447387695, "learning_rate": 0.00015766751611452558, "loss": 1.013437271118164, "step": 1620 }, { "epoch": 0.4739749927304449, "grad_norm": 2.1778664588928223, "learning_rate": 0.0001573677109878579, "loss": 0.9827444076538085, "step": 1630 }, { "epoch": 0.476882814771736, "grad_norm": 2.261019468307495, "learning_rate": 0.00015706790586119022, "loss": 1.019342803955078, "step": 1640 }, { "epoch": 0.479790636813027, "grad_norm": 2.01497483253479, "learning_rate": 0.00015676810073452258, "loss": 0.9160719871520996, "step": 1650 }, { "epoch": 0.4826984588543181, "grad_norm": 2.2591161727905273, "learning_rate": 0.0001564682956078549, "loss": 0.9794828414916992, "step": 1660 }, { "epoch": 0.4856062808956092, "grad_norm": 2.303205966949463, "learning_rate": 0.00015616849048118725, "loss": 1.070561408996582, "step": 1670 }, { "epoch": 0.4885141029369003, "grad_norm": 2.075136661529541, "learning_rate": 0.00015586868535451956, "loss": 1.0001187324523926, "step": 1680 }, { "epoch": 0.4914219249781913, "grad_norm": 2.0339841842651367, "learning_rate": 0.0001555688802278519, "loss": 1.042809009552002, "step": 1690 }, { "epoch": 0.4943297470194824, "grad_norm": 2.23016357421875, "learning_rate": 0.00015526907510118426, "loss": 1.0348269462585449, "step": 1700 }, { "epoch": 0.4972375690607735, "grad_norm": 2.620288372039795, "learning_rate": 0.00015496926997451656, "loss": 0.9250534057617188, "step": 1710 }, { "epoch": 0.5001453911020646, "grad_norm": 2.0876739025115967, "learning_rate": 0.0001546694648478489, "loss": 1.0507851600646974, "step": 1720 }, { "epoch": 0.5030532131433556, "grad_norm": 2.0192840099334717, "learning_rate": 0.00015436965972118126, "loss": 0.9346181869506835, "step": 1730 }, { "epoch": 0.5059610351846467, "grad_norm": 2.198378086090088, "learning_rate": 0.00015406985459451357, "loss": 1.008394145965576, "step": 1740 }, { "epoch": 0.5088688572259378, "grad_norm": 2.028977394104004, "learning_rate": 0.0001537700494678459, "loss": 1.0137055397033692, "step": 1750 }, { "epoch": 0.5117766792672288, "grad_norm": 2.091458320617676, "learning_rate": 0.00015347024434117824, "loss": 0.9982250213623047, "step": 1760 }, { "epoch": 0.51468450130852, "grad_norm": 1.993230938911438, "learning_rate": 0.00015317043921451057, "loss": 0.9881864547729492, "step": 1770 }, { "epoch": 0.517592323349811, "grad_norm": 1.9999781847000122, "learning_rate": 0.0001528706340878429, "loss": 0.9195417404174805, "step": 1780 }, { "epoch": 0.520500145391102, "grad_norm": 2.0336172580718994, "learning_rate": 0.00015257082896117524, "loss": 1.040501880645752, "step": 1790 }, { "epoch": 0.5234079674323932, "grad_norm": 2.3573317527770996, "learning_rate": 0.00015227102383450758, "loss": 1.0388559341430663, "step": 1800 }, { "epoch": 0.5263157894736842, "grad_norm": 2.2667224407196045, "learning_rate": 0.0001519712187078399, "loss": 1.0280171394348145, "step": 1810 }, { "epoch": 0.5292236115149753, "grad_norm": 2.0019094944000244, "learning_rate": 0.00015167141358117224, "loss": 1.054083251953125, "step": 1820 }, { "epoch": 0.5321314335562664, "grad_norm": 2.1444737911224365, "learning_rate": 0.00015137160845450458, "loss": 1.0175910949707032, "step": 1830 }, { "epoch": 0.5350392555975574, "grad_norm": 2.2335095405578613, "learning_rate": 0.0001510718033278369, "loss": 1.0406922340393066, "step": 1840 }, { "epoch": 0.5379470776388485, "grad_norm": 1.984129786491394, "learning_rate": 0.00015077199820116925, "loss": 0.9368982315063477, "step": 1850 }, { "epoch": 0.5408548996801396, "grad_norm": 1.9685107469558716, "learning_rate": 0.00015047219307450158, "loss": 0.9616594314575195, "step": 1860 }, { "epoch": 0.5437627217214307, "grad_norm": 1.945898413658142, "learning_rate": 0.00015017238794783392, "loss": 0.8819991111755371, "step": 1870 }, { "epoch": 0.5466705437627217, "grad_norm": 1.999085783958435, "learning_rate": 0.00014987258282116625, "loss": 0.9666972160339355, "step": 1880 }, { "epoch": 0.5495783658040128, "grad_norm": 2.3765146732330322, "learning_rate": 0.00014957277769449859, "loss": 0.959471607208252, "step": 1890 }, { "epoch": 0.5524861878453039, "grad_norm": 2.3136610984802246, "learning_rate": 0.00014927297256783092, "loss": 1.027439785003662, "step": 1900 }, { "epoch": 0.5553940098865949, "grad_norm": 2.017866373062134, "learning_rate": 0.00014897316744116325, "loss": 0.9374051094055176, "step": 1910 }, { "epoch": 0.558301831927886, "grad_norm": 2.01604962348938, "learning_rate": 0.0001486733623144956, "loss": 0.9679468154907227, "step": 1920 }, { "epoch": 0.5612096539691771, "grad_norm": 2.088538646697998, "learning_rate": 0.00014837355718782792, "loss": 1.0370773315429687, "step": 1930 }, { "epoch": 0.5641174760104681, "grad_norm": 2.233410120010376, "learning_rate": 0.00014807375206116026, "loss": 1.0147683143615722, "step": 1940 }, { "epoch": 0.5670252980517593, "grad_norm": 2.323624610900879, "learning_rate": 0.0001477739469344926, "loss": 0.8868412017822266, "step": 1950 }, { "epoch": 0.5699331200930503, "grad_norm": 2.008387327194214, "learning_rate": 0.00014747414180782493, "loss": 1.042736530303955, "step": 1960 }, { "epoch": 0.5728409421343413, "grad_norm": 2.069681406021118, "learning_rate": 0.00014717433668115726, "loss": 1.0332537651062013, "step": 1970 }, { "epoch": 0.5757487641756325, "grad_norm": 2.0033297538757324, "learning_rate": 0.00014687453155448957, "loss": 1.033948040008545, "step": 1980 }, { "epoch": 0.5786565862169235, "grad_norm": 2.320786237716675, "learning_rate": 0.00014657472642782193, "loss": 0.9264739036560059, "step": 1990 }, { "epoch": 0.5815644082582146, "grad_norm": 2.1512808799743652, "learning_rate": 0.00014627492130115427, "loss": 0.9679861068725586, "step": 2000 }, { "epoch": 0.5815644082582146, "eval_loss": 0.9911443591117859, "eval_runtime": 466.7402, "eval_samples_per_second": 7.368, "eval_steps_per_second": 7.368, "step": 2000 }, { "epoch": 0.5844722302995057, "grad_norm": 2.1983020305633545, "learning_rate": 0.00014597511617448657, "loss": 0.9210161209106446, "step": 2010 }, { "epoch": 0.5873800523407967, "grad_norm": 2.4881699085235596, "learning_rate": 0.00014567531104781893, "loss": 0.9766405105590821, "step": 2020 }, { "epoch": 0.5902878743820879, "grad_norm": 2.123229503631592, "learning_rate": 0.00014537550592115127, "loss": 0.9692230224609375, "step": 2030 }, { "epoch": 0.5931956964233789, "grad_norm": 2.089709758758545, "learning_rate": 0.00014507570079448358, "loss": 0.9490997314453125, "step": 2040 }, { "epoch": 0.5961035184646699, "grad_norm": 2.2401466369628906, "learning_rate": 0.00014477589566781594, "loss": 1.010519027709961, "step": 2050 }, { "epoch": 0.599011340505961, "grad_norm": 1.967027187347412, "learning_rate": 0.00014447609054114825, "loss": 0.9748605728149414, "step": 2060 }, { "epoch": 0.6019191625472521, "grad_norm": 2.350550889968872, "learning_rate": 0.0001441762854144806, "loss": 1.0137447357177733, "step": 2070 }, { "epoch": 0.6048269845885432, "grad_norm": 2.527585744857788, "learning_rate": 0.00014387648028781294, "loss": 0.9678420066833496, "step": 2080 }, { "epoch": 0.6077348066298343, "grad_norm": 2.1940855979919434, "learning_rate": 0.00014357667516114525, "loss": 0.9550240516662598, "step": 2090 }, { "epoch": 0.6106426286711253, "grad_norm": 1.9863537549972534, "learning_rate": 0.0001432768700344776, "loss": 0.941413688659668, "step": 2100 }, { "epoch": 0.6135504507124164, "grad_norm": 2.2011659145355225, "learning_rate": 0.00014297706490780994, "loss": 0.947486686706543, "step": 2110 }, { "epoch": 0.6164582727537075, "grad_norm": 2.1515135765075684, "learning_rate": 0.00014267725978114225, "loss": 0.9121341705322266, "step": 2120 }, { "epoch": 0.6193660947949986, "grad_norm": 2.002952814102173, "learning_rate": 0.00014237745465447461, "loss": 0.9708009719848633, "step": 2130 }, { "epoch": 0.6222739168362896, "grad_norm": 1.953120231628418, "learning_rate": 0.00014207764952780692, "loss": 0.9927172660827637, "step": 2140 }, { "epoch": 0.6251817388775807, "grad_norm": 1.9617403745651245, "learning_rate": 0.00014177784440113926, "loss": 0.9382636070251464, "step": 2150 }, { "epoch": 0.6280895609188718, "grad_norm": 2.2497262954711914, "learning_rate": 0.00014147803927447162, "loss": 0.9852985382080078, "step": 2160 }, { "epoch": 0.6309973829601628, "grad_norm": 2.1998257637023926, "learning_rate": 0.00014117823414780392, "loss": 1.0225582122802734, "step": 2170 }, { "epoch": 0.6339052050014539, "grad_norm": 1.931475281715393, "learning_rate": 0.00014087842902113626, "loss": 0.951665210723877, "step": 2180 }, { "epoch": 0.636813027042745, "grad_norm": 2.1101760864257812, "learning_rate": 0.00014057862389446862, "loss": 0.9361721038818359, "step": 2190 }, { "epoch": 0.639720849084036, "grad_norm": 2.0846989154815674, "learning_rate": 0.00014027881876780093, "loss": 0.9642247200012207, "step": 2200 }, { "epoch": 0.6426286711253272, "grad_norm": 2.221968650817871, "learning_rate": 0.00013997901364113326, "loss": 0.9930216789245605, "step": 2210 }, { "epoch": 0.6455364931666182, "grad_norm": 1.9288430213928223, "learning_rate": 0.0001396792085144656, "loss": 0.934235668182373, "step": 2220 }, { "epoch": 0.6484443152079092, "grad_norm": 1.9532197713851929, "learning_rate": 0.00013937940338779793, "loss": 0.9925549507141114, "step": 2230 }, { "epoch": 0.6513521372492004, "grad_norm": 1.9111248254776, "learning_rate": 0.00013907959826113027, "loss": 0.9969470024108886, "step": 2240 }, { "epoch": 0.6542599592904914, "grad_norm": 2.285212278366089, "learning_rate": 0.0001387797931344626, "loss": 0.9836911201477051, "step": 2250 }, { "epoch": 0.6571677813317824, "grad_norm": 2.1522858142852783, "learning_rate": 0.00013847998800779494, "loss": 0.9622378349304199, "step": 2260 }, { "epoch": 0.6600756033730736, "grad_norm": 2.1318211555480957, "learning_rate": 0.0001381801828811273, "loss": 0.9809438705444335, "step": 2270 }, { "epoch": 0.6629834254143646, "grad_norm": 2.0285794734954834, "learning_rate": 0.0001378803777544596, "loss": 1.0077838897705078, "step": 2280 }, { "epoch": 0.6658912474556558, "grad_norm": 1.9738699197769165, "learning_rate": 0.00013758057262779194, "loss": 0.9947647094726563, "step": 2290 }, { "epoch": 0.6687990694969468, "grad_norm": 2.0177745819091797, "learning_rate": 0.00013728076750112427, "loss": 1.000431442260742, "step": 2300 }, { "epoch": 0.6717068915382378, "grad_norm": 2.348642110824585, "learning_rate": 0.0001369809623744566, "loss": 1.0012446403503419, "step": 2310 }, { "epoch": 0.674614713579529, "grad_norm": 2.1052684783935547, "learning_rate": 0.00013668115724778894, "loss": 1.0000034332275392, "step": 2320 }, { "epoch": 0.67752253562082, "grad_norm": 2.2087337970733643, "learning_rate": 0.00013638135212112128, "loss": 0.946139907836914, "step": 2330 }, { "epoch": 0.6804303576621111, "grad_norm": 2.344235420227051, "learning_rate": 0.0001360815469944536, "loss": 0.9389586448669434, "step": 2340 }, { "epoch": 0.6833381797034022, "grad_norm": 1.8961925506591797, "learning_rate": 0.00013578174186778595, "loss": 0.9666190147399902, "step": 2350 }, { "epoch": 0.6862460017446932, "grad_norm": 2.05000901222229, "learning_rate": 0.00013548193674111828, "loss": 0.9825595855712891, "step": 2360 }, { "epoch": 0.6891538237859843, "grad_norm": 2.0229718685150146, "learning_rate": 0.00013518213161445061, "loss": 1.0510747909545899, "step": 2370 }, { "epoch": 0.6920616458272754, "grad_norm": 1.9893312454223633, "learning_rate": 0.00013488232648778295, "loss": 0.954042911529541, "step": 2380 }, { "epoch": 0.6949694678685664, "grad_norm": 1.9798680543899536, "learning_rate": 0.00013458252136111528, "loss": 0.9377657890319824, "step": 2390 }, { "epoch": 0.6978772899098575, "grad_norm": 2.1432321071624756, "learning_rate": 0.00013428271623444762, "loss": 0.9713227272033691, "step": 2400 }, { "epoch": 0.7007851119511486, "grad_norm": 2.111288070678711, "learning_rate": 0.00013398291110777995, "loss": 0.9865769386291504, "step": 2410 }, { "epoch": 0.7036929339924397, "grad_norm": 1.9824166297912598, "learning_rate": 0.0001336831059811123, "loss": 0.9995452880859375, "step": 2420 }, { "epoch": 0.7066007560337307, "grad_norm": 1.9680949449539185, "learning_rate": 0.00013338330085444462, "loss": 0.909880256652832, "step": 2430 }, { "epoch": 0.7095085780750218, "grad_norm": 2.0280842781066895, "learning_rate": 0.00013308349572777693, "loss": 0.8853329658508301, "step": 2440 }, { "epoch": 0.7124164001163129, "grad_norm": 1.9791338443756104, "learning_rate": 0.0001327836906011093, "loss": 1.0020163536071778, "step": 2450 }, { "epoch": 0.7153242221576039, "grad_norm": 2.158463954925537, "learning_rate": 0.00013248388547444163, "loss": 0.915585708618164, "step": 2460 }, { "epoch": 0.7182320441988951, "grad_norm": 2.0322635173797607, "learning_rate": 0.00013218408034777396, "loss": 0.9437061309814453, "step": 2470 }, { "epoch": 0.7211398662401861, "grad_norm": 2.1616227626800537, "learning_rate": 0.0001318842752211063, "loss": 0.9552411079406739, "step": 2480 }, { "epoch": 0.7240476882814771, "grad_norm": 1.9678977727890015, "learning_rate": 0.00013158447009443863, "loss": 0.9155937194824219, "step": 2490 }, { "epoch": 0.7269555103227683, "grad_norm": 1.7541477680206299, "learning_rate": 0.00013128466496777096, "loss": 0.9028853416442871, "step": 2500 }, { "epoch": 0.7269555103227683, "eval_loss": 0.9591814279556274, "eval_runtime": 473.0242, "eval_samples_per_second": 7.27, "eval_steps_per_second": 7.27, "step": 2500 }, { "epoch": 0.7298633323640593, "grad_norm": 2.0649845600128174, "learning_rate": 0.0001309848598411033, "loss": 0.9972336769104004, "step": 2510 }, { "epoch": 0.7327711544053503, "grad_norm": 1.8874973058700562, "learning_rate": 0.0001306850547144356, "loss": 0.9446205139160156, "step": 2520 }, { "epoch": 0.7356789764466415, "grad_norm": 2.4050185680389404, "learning_rate": 0.00013038524958776797, "loss": 1.0329988479614258, "step": 2530 }, { "epoch": 0.7385867984879325, "grad_norm": 2.0193111896514893, "learning_rate": 0.0001300854444611003, "loss": 0.910318660736084, "step": 2540 }, { "epoch": 0.7414946205292237, "grad_norm": 2.376051902770996, "learning_rate": 0.0001297856393344326, "loss": 1.0247613906860351, "step": 2550 }, { "epoch": 0.7444024425705147, "grad_norm": 2.1105611324310303, "learning_rate": 0.00012948583420776497, "loss": 1.016015625, "step": 2560 }, { "epoch": 0.7473102646118057, "grad_norm": 2.1081364154815674, "learning_rate": 0.0001291860290810973, "loss": 0.909546184539795, "step": 2570 }, { "epoch": 0.7502180866530969, "grad_norm": 2.046095132827759, "learning_rate": 0.0001288862239544296, "loss": 0.9204464912414551, "step": 2580 }, { "epoch": 0.7531259086943879, "grad_norm": 1.9615705013275146, "learning_rate": 0.00012858641882776197, "loss": 0.9793522834777832, "step": 2590 }, { "epoch": 0.7560337307356789, "grad_norm": 1.9187848567962646, "learning_rate": 0.00012828661370109428, "loss": 0.9414368629455566, "step": 2600 }, { "epoch": 0.7589415527769701, "grad_norm": 2.0125985145568848, "learning_rate": 0.00012798680857442662, "loss": 1.0450970649719238, "step": 2610 }, { "epoch": 0.7618493748182611, "grad_norm": 2.2543137073516846, "learning_rate": 0.00012768700344775898, "loss": 1.0270273208618164, "step": 2620 }, { "epoch": 0.7647571968595522, "grad_norm": 2.00054669380188, "learning_rate": 0.00012738719832109128, "loss": 1.0244030952453613, "step": 2630 }, { "epoch": 0.7676650189008433, "grad_norm": 1.8571268320083618, "learning_rate": 0.00012708739319442362, "loss": 0.9479835510253907, "step": 2640 }, { "epoch": 0.7705728409421343, "grad_norm": 2.0844404697418213, "learning_rate": 0.00012678758806775598, "loss": 0.8867239952087402, "step": 2650 }, { "epoch": 0.7734806629834254, "grad_norm": 2.023630142211914, "learning_rate": 0.0001264877829410883, "loss": 0.9323092460632324, "step": 2660 }, { "epoch": 0.7763884850247165, "grad_norm": 1.935514211654663, "learning_rate": 0.00012618797781442065, "loss": 0.997464942932129, "step": 2670 }, { "epoch": 0.7792963070660076, "grad_norm": 2.1317570209503174, "learning_rate": 0.00012588817268775296, "loss": 0.9762091636657715, "step": 2680 }, { "epoch": 0.7822041291072986, "grad_norm": 2.094515323638916, "learning_rate": 0.0001255883675610853, "loss": 0.919368839263916, "step": 2690 }, { "epoch": 0.7851119511485897, "grad_norm": 2.0645945072174072, "learning_rate": 0.00012528856243441765, "loss": 0.917721176147461, "step": 2700 }, { "epoch": 0.7880197731898808, "grad_norm": 2.178105592727661, "learning_rate": 0.00012498875730774996, "loss": 0.9504012107849121, "step": 2710 }, { "epoch": 0.7909275952311718, "grad_norm": 2.3576605319976807, "learning_rate": 0.0001246889521810823, "loss": 1.0032535552978517, "step": 2720 }, { "epoch": 0.7938354172724629, "grad_norm": 1.9736145734786987, "learning_rate": 0.00012438914705441466, "loss": 0.9342514991760253, "step": 2730 }, { "epoch": 0.796743239313754, "grad_norm": 2.074565887451172, "learning_rate": 0.00012408934192774696, "loss": 0.951146125793457, "step": 2740 }, { "epoch": 0.799651061355045, "grad_norm": 2.1545727252960205, "learning_rate": 0.0001237895368010793, "loss": 0.9272260665893555, "step": 2750 }, { "epoch": 0.8025588833963362, "grad_norm": 2.0214531421661377, "learning_rate": 0.00012348973167441163, "loss": 0.9915397644042969, "step": 2760 }, { "epoch": 0.8054667054376272, "grad_norm": 2.0707271099090576, "learning_rate": 0.00012318992654774397, "loss": 0.9817545890808106, "step": 2770 }, { "epoch": 0.8083745274789182, "grad_norm": 1.9729819297790527, "learning_rate": 0.0001228901214210763, "loss": 0.9666746139526368, "step": 2780 }, { "epoch": 0.8112823495202094, "grad_norm": 2.0239417552948, "learning_rate": 0.00012259031629440864, "loss": 0.9174615859985351, "step": 2790 }, { "epoch": 0.8141901715615004, "grad_norm": 2.0677332878112793, "learning_rate": 0.00012229051116774097, "loss": 0.9489200592041016, "step": 2800 }, { "epoch": 0.8170979936027916, "grad_norm": 2.0116071701049805, "learning_rate": 0.0001219907060410733, "loss": 0.8491521835327148, "step": 2810 }, { "epoch": 0.8200058156440826, "grad_norm": 2.005683422088623, "learning_rate": 0.00012169090091440565, "loss": 0.8922024726867676, "step": 2820 }, { "epoch": 0.8229136376853736, "grad_norm": 2.201763868331909, "learning_rate": 0.00012139109578773797, "loss": 0.9258706092834472, "step": 2830 }, { "epoch": 0.8258214597266648, "grad_norm": 1.9857138395309448, "learning_rate": 0.0001210912906610703, "loss": 0.9566280364990234, "step": 2840 }, { "epoch": 0.8287292817679558, "grad_norm": 2.107966899871826, "learning_rate": 0.00012079148553440264, "loss": 0.924662971496582, "step": 2850 }, { "epoch": 0.8316371038092468, "grad_norm": 2.1448616981506348, "learning_rate": 0.00012049168040773498, "loss": 0.9880090713500976, "step": 2860 }, { "epoch": 0.834544925850538, "grad_norm": 2.1623342037200928, "learning_rate": 0.0001201918752810673, "loss": 0.9296030044555664, "step": 2870 }, { "epoch": 0.837452747891829, "grad_norm": 2.149792194366455, "learning_rate": 0.00011989207015439965, "loss": 0.8926197052001953, "step": 2880 }, { "epoch": 0.8403605699331201, "grad_norm": 1.7601749897003174, "learning_rate": 0.00011959226502773198, "loss": 0.937494945526123, "step": 2890 }, { "epoch": 0.8432683919744112, "grad_norm": 1.8277206420898438, "learning_rate": 0.00011929245990106433, "loss": 0.9348894119262695, "step": 2900 }, { "epoch": 0.8461762140157022, "grad_norm": 2.417834520339966, "learning_rate": 0.00011899265477439665, "loss": 0.9014430046081543, "step": 2910 }, { "epoch": 0.8490840360569933, "grad_norm": 1.9275659322738647, "learning_rate": 0.00011869284964772897, "loss": 0.904062557220459, "step": 2920 }, { "epoch": 0.8519918580982844, "grad_norm": 1.8992774486541748, "learning_rate": 0.00011839304452106132, "loss": 0.9319318771362305, "step": 2930 }, { "epoch": 0.8548996801395755, "grad_norm": 1.7830348014831543, "learning_rate": 0.00011809323939439365, "loss": 0.9675899505615234, "step": 2940 }, { "epoch": 0.8578075021808665, "grad_norm": 1.9002785682678223, "learning_rate": 0.00011779343426772598, "loss": 0.9884864807128906, "step": 2950 }, { "epoch": 0.8607153242221576, "grad_norm": 2.256084442138672, "learning_rate": 0.00011749362914105832, "loss": 0.9169980049133301, "step": 2960 }, { "epoch": 0.8636231462634487, "grad_norm": 1.977953553199768, "learning_rate": 0.00011719382401439066, "loss": 0.9288657188415528, "step": 2970 }, { "epoch": 0.8665309683047397, "grad_norm": 2.223893404006958, "learning_rate": 0.00011689401888772298, "loss": 0.8921388626098633, "step": 2980 }, { "epoch": 0.8694387903460308, "grad_norm": 1.8232818841934204, "learning_rate": 0.00011659421376105533, "loss": 0.9274946212768554, "step": 2990 }, { "epoch": 0.8723466123873219, "grad_norm": 2.229280471801758, "learning_rate": 0.00011629440863438765, "loss": 0.9193262100219727, "step": 3000 }, { "epoch": 0.8723466123873219, "eval_loss": 0.9303778409957886, "eval_runtime": 469.5057, "eval_samples_per_second": 7.325, "eval_steps_per_second": 7.325, "step": 3000 }, { "epoch": 0.875254434428613, "grad_norm": 1.7871650457382202, "learning_rate": 0.00011599460350771998, "loss": 1.0009597778320312, "step": 3010 }, { "epoch": 0.8781622564699041, "grad_norm": 1.9837925434112549, "learning_rate": 0.00011569479838105233, "loss": 0.9461586952209473, "step": 3020 }, { "epoch": 0.8810700785111951, "grad_norm": 2.1220059394836426, "learning_rate": 0.00011539499325438465, "loss": 0.8776027679443359, "step": 3030 }, { "epoch": 0.8839779005524862, "grad_norm": 2.0386621952056885, "learning_rate": 0.00011509518812771699, "loss": 0.8935223579406738, "step": 3040 }, { "epoch": 0.8868857225937773, "grad_norm": 2.0495152473449707, "learning_rate": 0.00011479538300104933, "loss": 0.9414477348327637, "step": 3050 }, { "epoch": 0.8897935446350683, "grad_norm": 2.1676785945892334, "learning_rate": 0.00011449557787438165, "loss": 0.9668932914733886, "step": 3060 }, { "epoch": 0.8927013666763594, "grad_norm": 1.8518208265304565, "learning_rate": 0.00011419577274771398, "loss": 0.83909912109375, "step": 3070 }, { "epoch": 0.8956091887176505, "grad_norm": 2.6842968463897705, "learning_rate": 0.00011389596762104632, "loss": 1.001734161376953, "step": 3080 }, { "epoch": 0.8985170107589415, "grad_norm": 2.093907594680786, "learning_rate": 0.00011359616249437866, "loss": 0.9194964408874512, "step": 3090 }, { "epoch": 0.9014248328002327, "grad_norm": 1.7096366882324219, "learning_rate": 0.000113296357367711, "loss": 0.8888837814331054, "step": 3100 }, { "epoch": 0.9043326548415237, "grad_norm": 2.0421509742736816, "learning_rate": 0.00011299655224104333, "loss": 0.9483534812927246, "step": 3110 }, { "epoch": 0.9072404768828147, "grad_norm": 2.220430612564087, "learning_rate": 0.00011269674711437566, "loss": 0.855626106262207, "step": 3120 }, { "epoch": 0.9101482989241059, "grad_norm": 2.721151351928711, "learning_rate": 0.00011239694198770801, "loss": 0.9696638107299804, "step": 3130 }, { "epoch": 0.9130561209653969, "grad_norm": 2.0278444290161133, "learning_rate": 0.00011209713686104033, "loss": 0.9626810073852539, "step": 3140 }, { "epoch": 0.915963943006688, "grad_norm": 1.9929981231689453, "learning_rate": 0.00011179733173437265, "loss": 0.9121280670166015, "step": 3150 }, { "epoch": 0.9188717650479791, "grad_norm": 1.8903363943099976, "learning_rate": 0.000111497526607705, "loss": 0.8926810264587403, "step": 3160 }, { "epoch": 0.9217795870892701, "grad_norm": 1.9016224145889282, "learning_rate": 0.00011119772148103733, "loss": 0.9811866760253907, "step": 3170 }, { "epoch": 0.9246874091305612, "grad_norm": 1.7538851499557495, "learning_rate": 0.00011089791635436966, "loss": 0.9020210266113281, "step": 3180 }, { "epoch": 0.9275952311718523, "grad_norm": 1.8952163457870483, "learning_rate": 0.000110598111227702, "loss": 0.9186564445495605, "step": 3190 }, { "epoch": 0.9305030532131433, "grad_norm": 2.0162456035614014, "learning_rate": 0.00011029830610103434, "loss": 0.9249841690063476, "step": 3200 }, { "epoch": 0.9334108752544344, "grad_norm": 1.9944981336593628, "learning_rate": 0.00010999850097436666, "loss": 0.8584202766418457, "step": 3210 }, { "epoch": 0.9363186972957255, "grad_norm": 1.9651786088943481, "learning_rate": 0.000109698695847699, "loss": 0.8806270599365235, "step": 3220 }, { "epoch": 0.9392265193370166, "grad_norm": 1.7801518440246582, "learning_rate": 0.00010939889072103133, "loss": 0.9265996932983398, "step": 3230 }, { "epoch": 0.9421343413783076, "grad_norm": 2.065995454788208, "learning_rate": 0.00010909908559436366, "loss": 0.9499953269958497, "step": 3240 }, { "epoch": 0.9450421634195987, "grad_norm": 1.797984004020691, "learning_rate": 0.00010879928046769601, "loss": 0.8817252159118653, "step": 3250 }, { "epoch": 0.9479499854608898, "grad_norm": 1.7669072151184082, "learning_rate": 0.00010849947534102833, "loss": 0.9481887817382812, "step": 3260 }, { "epoch": 0.9508578075021809, "grad_norm": 2.1424496173858643, "learning_rate": 0.00010819967021436067, "loss": 0.9072481155395508, "step": 3270 }, { "epoch": 0.953765629543472, "grad_norm": 2.006564140319824, "learning_rate": 0.00010789986508769301, "loss": 0.9786685943603516, "step": 3280 }, { "epoch": 0.956673451584763, "grad_norm": 1.8808116912841797, "learning_rate": 0.00010760005996102533, "loss": 0.9131714820861816, "step": 3290 }, { "epoch": 0.959581273626054, "grad_norm": 1.8156124353408813, "learning_rate": 0.00010730025483435768, "loss": 0.9058806419372558, "step": 3300 }, { "epoch": 0.9624890956673452, "grad_norm": 2.1244983673095703, "learning_rate": 0.00010700044970769, "loss": 0.8985400199890137, "step": 3310 }, { "epoch": 0.9653969177086362, "grad_norm": 1.979709506034851, "learning_rate": 0.00010670064458102234, "loss": 0.8887692451477051, "step": 3320 }, { "epoch": 0.9683047397499273, "grad_norm": 1.8038535118103027, "learning_rate": 0.00010640083945435469, "loss": 0.9474887847900391, "step": 3330 }, { "epoch": 0.9712125617912184, "grad_norm": 1.883305549621582, "learning_rate": 0.00010610103432768701, "loss": 0.8624442100524903, "step": 3340 }, { "epoch": 0.9741203838325094, "grad_norm": 1.9134957790374756, "learning_rate": 0.00010580122920101934, "loss": 0.8966679573059082, "step": 3350 }, { "epoch": 0.9770282058738006, "grad_norm": 1.8334492444992065, "learning_rate": 0.00010550142407435169, "loss": 0.8644783020019531, "step": 3360 }, { "epoch": 0.9799360279150916, "grad_norm": 1.8988165855407715, "learning_rate": 0.00010520161894768401, "loss": 0.8418782234191895, "step": 3370 }, { "epoch": 0.9828438499563826, "grad_norm": 2.0219290256500244, "learning_rate": 0.00010490181382101633, "loss": 0.9593500137329102, "step": 3380 }, { "epoch": 0.9857516719976738, "grad_norm": 1.9475761651992798, "learning_rate": 0.00010460200869434868, "loss": 0.9046772956848145, "step": 3390 }, { "epoch": 0.9886594940389648, "grad_norm": 2.156895637512207, "learning_rate": 0.00010430220356768101, "loss": 0.9402565956115723, "step": 3400 }, { "epoch": 0.9915673160802558, "grad_norm": 1.8304574489593506, "learning_rate": 0.00010400239844101334, "loss": 0.9041817665100098, "step": 3410 }, { "epoch": 0.994475138121547, "grad_norm": 2.0752060413360596, "learning_rate": 0.00010370259331434568, "loss": 0.8676543235778809, "step": 3420 }, { "epoch": 0.997382960162838, "grad_norm": 1.845677375793457, "learning_rate": 0.00010340278818767802, "loss": 0.8220011711120605, "step": 3430 }, { "epoch": 1.0002907822041291, "grad_norm": 1.7879046201705933, "learning_rate": 0.00010310298306101034, "loss": 0.9136262893676758, "step": 3440 }, { "epoch": 1.0031986042454202, "grad_norm": 1.9771437644958496, "learning_rate": 0.00010280317793434269, "loss": 0.7697383880615234, "step": 3450 }, { "epoch": 1.0061064262867112, "grad_norm": 2.23533296585083, "learning_rate": 0.00010250337280767501, "loss": 0.8118080139160156, "step": 3460 }, { "epoch": 1.0090142483280022, "grad_norm": 1.9873441457748413, "learning_rate": 0.00010220356768100734, "loss": 0.7812850475311279, "step": 3470 }, { "epoch": 1.0119220703692935, "grad_norm": 1.8505840301513672, "learning_rate": 0.00010190376255433969, "loss": 0.8191473960876465, "step": 3480 }, { "epoch": 1.0148298924105845, "grad_norm": 2.0484778881073, "learning_rate": 0.00010160395742767201, "loss": 0.8035045623779297, "step": 3490 }, { "epoch": 1.0177377144518756, "grad_norm": 1.9655097723007202, "learning_rate": 0.00010130415230100436, "loss": 0.779402208328247, "step": 3500 }, { "epoch": 1.0177377144518756, "eval_loss": 0.9158970713615417, "eval_runtime": 474.9541, "eval_samples_per_second": 7.241, "eval_steps_per_second": 7.241, "step": 3500 }, { "epoch": 1.0206455364931666, "grad_norm": 2.0883100032806396, "learning_rate": 0.0001010043471743367, "loss": 0.8522077560424804, "step": 3510 }, { "epoch": 1.0235533585344576, "grad_norm": 2.098511219024658, "learning_rate": 0.00010070454204766901, "loss": 0.8504983901977539, "step": 3520 }, { "epoch": 1.0264611805757489, "grad_norm": 2.101414442062378, "learning_rate": 0.00010040473692100136, "loss": 0.7838150978088378, "step": 3530 }, { "epoch": 1.02936900261704, "grad_norm": 2.2732677459716797, "learning_rate": 0.00010010493179433368, "loss": 0.7565544605255127, "step": 3540 }, { "epoch": 1.032276824658331, "grad_norm": 1.695768117904663, "learning_rate": 9.980512666766603e-05, "loss": 0.7418520450592041, "step": 3550 }, { "epoch": 1.035184646699622, "grad_norm": 2.0946264266967773, "learning_rate": 9.950532154099835e-05, "loss": 0.820562744140625, "step": 3560 }, { "epoch": 1.038092468740913, "grad_norm": 2.103559732437134, "learning_rate": 9.920551641433069e-05, "loss": 0.769464635848999, "step": 3570 }, { "epoch": 1.041000290782204, "grad_norm": 1.9463759660720825, "learning_rate": 9.890571128766302e-05, "loss": 0.7375322818756104, "step": 3580 }, { "epoch": 1.0439081128234953, "grad_norm": 2.0933845043182373, "learning_rate": 9.860590616099536e-05, "loss": 0.7640562534332276, "step": 3590 }, { "epoch": 1.0468159348647863, "grad_norm": 1.9825164079666138, "learning_rate": 9.830610103432769e-05, "loss": 0.7955950736999512, "step": 3600 }, { "epoch": 1.0497237569060773, "grad_norm": 1.9863218069076538, "learning_rate": 9.800629590766003e-05, "loss": 0.7707761764526367, "step": 3610 }, { "epoch": 1.0526315789473684, "grad_norm": 1.9356553554534912, "learning_rate": 9.770649078099236e-05, "loss": 0.7836559772491455, "step": 3620 }, { "epoch": 1.0555394009886594, "grad_norm": 2.0146119594573975, "learning_rate": 9.74066856543247e-05, "loss": 0.7523280143737793, "step": 3630 }, { "epoch": 1.0584472230299506, "grad_norm": 2.056215763092041, "learning_rate": 9.710688052765703e-05, "loss": 0.8155484199523926, "step": 3640 }, { "epoch": 1.0613550450712417, "grad_norm": 1.8928337097167969, "learning_rate": 9.680707540098936e-05, "loss": 0.8006336212158203, "step": 3650 }, { "epoch": 1.0642628671125327, "grad_norm": 1.931458592414856, "learning_rate": 9.65072702743217e-05, "loss": 0.7413979530334472, "step": 3660 }, { "epoch": 1.0671706891538237, "grad_norm": 2.162804126739502, "learning_rate": 9.620746514765403e-05, "loss": 0.7883041381835938, "step": 3670 }, { "epoch": 1.0700785111951148, "grad_norm": 2.1737864017486572, "learning_rate": 9.590766002098637e-05, "loss": 0.7595384597778321, "step": 3680 }, { "epoch": 1.072986333236406, "grad_norm": 2.1503849029541016, "learning_rate": 9.560785489431869e-05, "loss": 0.8216732025146485, "step": 3690 }, { "epoch": 1.075894155277697, "grad_norm": 1.9446245431900024, "learning_rate": 9.530804976765104e-05, "loss": 0.8016197204589843, "step": 3700 }, { "epoch": 1.078801977318988, "grad_norm": 1.8525793552398682, "learning_rate": 9.500824464098337e-05, "loss": 0.8090981483459473, "step": 3710 }, { "epoch": 1.0817097993602791, "grad_norm": 1.8631248474121094, "learning_rate": 9.470843951431569e-05, "loss": 0.7200882434844971, "step": 3720 }, { "epoch": 1.0846176214015701, "grad_norm": 2.06455135345459, "learning_rate": 9.440863438764803e-05, "loss": 0.7492884635925293, "step": 3730 }, { "epoch": 1.0875254434428614, "grad_norm": 2.1832897663116455, "learning_rate": 9.410882926098037e-05, "loss": 0.8133129119873047, "step": 3740 }, { "epoch": 1.0904332654841524, "grad_norm": 1.9299908876419067, "learning_rate": 9.380902413431271e-05, "loss": 0.7218931674957275, "step": 3750 }, { "epoch": 1.0933410875254435, "grad_norm": 1.8345733880996704, "learning_rate": 9.350921900764503e-05, "loss": 0.8038671493530274, "step": 3760 }, { "epoch": 1.0962489095667345, "grad_norm": 2.200580596923828, "learning_rate": 9.320941388097736e-05, "loss": 0.758363676071167, "step": 3770 }, { "epoch": 1.0991567316080255, "grad_norm": 1.9002389907836914, "learning_rate": 9.290960875430971e-05, "loss": 0.7319043636322021, "step": 3780 }, { "epoch": 1.1020645536493165, "grad_norm": 2.178980827331543, "learning_rate": 9.260980362764203e-05, "loss": 0.8144195556640625, "step": 3790 }, { "epoch": 1.1049723756906078, "grad_norm": 1.777524709701538, "learning_rate": 9.230999850097437e-05, "loss": 0.7428917407989502, "step": 3800 }, { "epoch": 1.1078801977318988, "grad_norm": 2.269186019897461, "learning_rate": 9.20101933743067e-05, "loss": 0.7581167221069336, "step": 3810 }, { "epoch": 1.1107880197731899, "grad_norm": 2.0214948654174805, "learning_rate": 9.171038824763904e-05, "loss": 0.786471939086914, "step": 3820 }, { "epoch": 1.113695841814481, "grad_norm": 1.8476779460906982, "learning_rate": 9.141058312097137e-05, "loss": 0.7075447559356689, "step": 3830 }, { "epoch": 1.1166036638557721, "grad_norm": 1.7384737730026245, "learning_rate": 9.11107779943037e-05, "loss": 0.7290010452270508, "step": 3840 }, { "epoch": 1.1195114858970632, "grad_norm": 2.1066110134124756, "learning_rate": 9.081097286763604e-05, "loss": 0.7164917469024659, "step": 3850 }, { "epoch": 1.1224193079383542, "grad_norm": 2.2718801498413086, "learning_rate": 9.051116774096837e-05, "loss": 0.7832975864410401, "step": 3860 }, { "epoch": 1.1253271299796452, "grad_norm": 2.0165557861328125, "learning_rate": 9.021136261430071e-05, "loss": 0.840385913848877, "step": 3870 }, { "epoch": 1.1282349520209363, "grad_norm": 1.9852454662322998, "learning_rate": 8.991155748763304e-05, "loss": 0.8062166213989258, "step": 3880 }, { "epoch": 1.1311427740622273, "grad_norm": 1.8891923427581787, "learning_rate": 8.961175236096538e-05, "loss": 0.860920524597168, "step": 3890 }, { "epoch": 1.1340505961035185, "grad_norm": 2.2840018272399902, "learning_rate": 8.931194723429771e-05, "loss": 0.8350888252258301, "step": 3900 }, { "epoch": 1.1369584181448096, "grad_norm": 1.9931566715240479, "learning_rate": 8.901214210763005e-05, "loss": 0.7368578910827637, "step": 3910 }, { "epoch": 1.1398662401861006, "grad_norm": 2.0133721828460693, "learning_rate": 8.871233698096237e-05, "loss": 0.8287955284118652, "step": 3920 }, { "epoch": 1.1427740622273916, "grad_norm": 2.1664857864379883, "learning_rate": 8.841253185429472e-05, "loss": 0.7541079044342041, "step": 3930 }, { "epoch": 1.1456818842686827, "grad_norm": 2.1204898357391357, "learning_rate": 8.811272672762705e-05, "loss": 0.7905386447906494, "step": 3940 }, { "epoch": 1.148589706309974, "grad_norm": 1.8248870372772217, "learning_rate": 8.781292160095938e-05, "loss": 0.7721247673034668, "step": 3950 }, { "epoch": 1.151497528351265, "grad_norm": 1.8345229625701904, "learning_rate": 8.75131164742917e-05, "loss": 0.7757219314575196, "step": 3960 }, { "epoch": 1.154405350392556, "grad_norm": 1.6888972520828247, "learning_rate": 8.721331134762405e-05, "loss": 0.7593977451324463, "step": 3970 }, { "epoch": 1.157313172433847, "grad_norm": 1.9662666320800781, "learning_rate": 8.691350622095639e-05, "loss": 0.7545000076293945, "step": 3980 }, { "epoch": 1.160220994475138, "grad_norm": 1.987597107887268, "learning_rate": 8.661370109428871e-05, "loss": 0.7677037239074707, "step": 3990 }, { "epoch": 1.163128816516429, "grad_norm": 2.0462353229522705, "learning_rate": 8.631389596762104e-05, "loss": 0.7688228130340576, "step": 4000 }, { "epoch": 1.163128816516429, "eval_loss": 0.902101457118988, "eval_runtime": 471.9941, "eval_samples_per_second": 7.286, "eval_steps_per_second": 7.286, "step": 4000 }, { "epoch": 1.1660366385577203, "grad_norm": 2.0174319744110107, "learning_rate": 8.601409084095339e-05, "loss": 0.6873229026794434, "step": 4010 }, { "epoch": 1.1689444605990114, "grad_norm": 2.1310856342315674, "learning_rate": 8.571428571428571e-05, "loss": 0.7570897579193115, "step": 4020 }, { "epoch": 1.1718522826403024, "grad_norm": 2.121331214904785, "learning_rate": 8.541448058761805e-05, "loss": 0.8037228584289551, "step": 4030 }, { "epoch": 1.1747601046815934, "grad_norm": 1.866264820098877, "learning_rate": 8.511467546095038e-05, "loss": 0.7070183277130127, "step": 4040 }, { "epoch": 1.1776679267228847, "grad_norm": 2.1696619987487793, "learning_rate": 8.481487033428273e-05, "loss": 0.7907981872558594, "step": 4050 }, { "epoch": 1.1805757487641757, "grad_norm": 1.9860618114471436, "learning_rate": 8.451506520761505e-05, "loss": 0.801030445098877, "step": 4060 }, { "epoch": 1.1834835708054667, "grad_norm": 1.92996084690094, "learning_rate": 8.421526008094739e-05, "loss": 0.7623518943786621, "step": 4070 }, { "epoch": 1.1863913928467578, "grad_norm": 2.483771800994873, "learning_rate": 8.391545495427972e-05, "loss": 0.7155436992645263, "step": 4080 }, { "epoch": 1.1892992148880488, "grad_norm": 2.0799410343170166, "learning_rate": 8.361564982761205e-05, "loss": 0.7853653430938721, "step": 4090 }, { "epoch": 1.1922070369293398, "grad_norm": 1.9911246299743652, "learning_rate": 8.331584470094439e-05, "loss": 0.7899494171142578, "step": 4100 }, { "epoch": 1.195114858970631, "grad_norm": 2.0691795349121094, "learning_rate": 8.301603957427672e-05, "loss": 0.7937345027923584, "step": 4110 }, { "epoch": 1.198022681011922, "grad_norm": 1.9724209308624268, "learning_rate": 8.271623444760906e-05, "loss": 0.7867683887481689, "step": 4120 }, { "epoch": 1.2009305030532131, "grad_norm": 2.0218544006347656, "learning_rate": 8.241642932094139e-05, "loss": 0.7292540550231934, "step": 4130 }, { "epoch": 1.2038383250945042, "grad_norm": 1.8983079195022583, "learning_rate": 8.211662419427373e-05, "loss": 0.7957502841949463, "step": 4140 }, { "epoch": 1.2067461471357952, "grad_norm": 2.0544803142547607, "learning_rate": 8.181681906760606e-05, "loss": 0.8023088455200196, "step": 4150 }, { "epoch": 1.2096539691770865, "grad_norm": 2.2147371768951416, "learning_rate": 8.15170139409384e-05, "loss": 0.7521897315979004, "step": 4160 }, { "epoch": 1.2125617912183775, "grad_norm": 2.114715814590454, "learning_rate": 8.121720881427073e-05, "loss": 0.7361032485961914, "step": 4170 }, { "epoch": 1.2154696132596685, "grad_norm": 2.0806241035461426, "learning_rate": 8.091740368760306e-05, "loss": 0.810362434387207, "step": 4180 }, { "epoch": 1.2183774353009595, "grad_norm": 2.1156203746795654, "learning_rate": 8.061759856093539e-05, "loss": 0.7738988876342774, "step": 4190 }, { "epoch": 1.2212852573422506, "grad_norm": 2.063788652420044, "learning_rate": 8.031779343426773e-05, "loss": 0.7684555053710938, "step": 4200 }, { "epoch": 1.2241930793835416, "grad_norm": 1.8935747146606445, "learning_rate": 8.001798830760007e-05, "loss": 0.7432861328125, "step": 4210 }, { "epoch": 1.2271009014248329, "grad_norm": 1.9250372648239136, "learning_rate": 7.971818318093239e-05, "loss": 0.752553653717041, "step": 4220 }, { "epoch": 1.2300087234661239, "grad_norm": 1.8770952224731445, "learning_rate": 7.941837805426474e-05, "loss": 0.7726155281066894, "step": 4230 }, { "epoch": 1.232916545507415, "grad_norm": 1.8295116424560547, "learning_rate": 7.911857292759707e-05, "loss": 0.7889208793640137, "step": 4240 }, { "epoch": 1.235824367548706, "grad_norm": 1.932578682899475, "learning_rate": 7.88187678009294e-05, "loss": 0.8311320304870605, "step": 4250 }, { "epoch": 1.2387321895899972, "grad_norm": 1.9207606315612793, "learning_rate": 7.851896267426173e-05, "loss": 0.7485177516937256, "step": 4260 }, { "epoch": 1.2416400116312882, "grad_norm": 1.9031519889831543, "learning_rate": 7.821915754759408e-05, "loss": 0.7838140010833741, "step": 4270 }, { "epoch": 1.2445478336725793, "grad_norm": 1.8704465627670288, "learning_rate": 7.791935242092641e-05, "loss": 0.7774901390075684, "step": 4280 }, { "epoch": 1.2474556557138703, "grad_norm": 1.819907784461975, "learning_rate": 7.761954729425873e-05, "loss": 0.750859260559082, "step": 4290 }, { "epoch": 1.2503634777551613, "grad_norm": 1.9019205570220947, "learning_rate": 7.731974216759107e-05, "loss": 0.780170202255249, "step": 4300 }, { "epoch": 1.2532712997964524, "grad_norm": 2.030393362045288, "learning_rate": 7.701993704092341e-05, "loss": 0.8301087379455566, "step": 4310 }, { "epoch": 1.2561791218377436, "grad_norm": 1.842457890510559, "learning_rate": 7.672013191425573e-05, "loss": 0.7692018985748291, "step": 4320 }, { "epoch": 1.2590869438790346, "grad_norm": 1.8982383012771606, "learning_rate": 7.642032678758807e-05, "loss": 0.781231689453125, "step": 4330 }, { "epoch": 1.2619947659203257, "grad_norm": 1.8713147640228271, "learning_rate": 7.61205216609204e-05, "loss": 0.762873363494873, "step": 4340 }, { "epoch": 1.2649025879616167, "grad_norm": 2.0454752445220947, "learning_rate": 7.582071653425275e-05, "loss": 0.7406270980834961, "step": 4350 }, { "epoch": 1.267810410002908, "grad_norm": 2.4180965423583984, "learning_rate": 7.552091140758507e-05, "loss": 0.7520180702209472, "step": 4360 }, { "epoch": 1.270718232044199, "grad_norm": 1.986022710800171, "learning_rate": 7.52211062809174e-05, "loss": 0.8284387588500977, "step": 4370 }, { "epoch": 1.27362605408549, "grad_norm": 1.760955810546875, "learning_rate": 7.492130115424974e-05, "loss": 0.7012954711914062, "step": 4380 }, { "epoch": 1.276533876126781, "grad_norm": 1.8714261054992676, "learning_rate": 7.462149602758208e-05, "loss": 0.7935136795043946, "step": 4390 }, { "epoch": 1.279441698168072, "grad_norm": 1.9436787366867065, "learning_rate": 7.432169090091441e-05, "loss": 0.7934539318084717, "step": 4400 }, { "epoch": 1.282349520209363, "grad_norm": 1.9582066535949707, "learning_rate": 7.402188577424674e-05, "loss": 0.7850748538970947, "step": 4410 }, { "epoch": 1.2852573422506541, "grad_norm": 1.9535675048828125, "learning_rate": 7.372208064757908e-05, "loss": 0.7332793712615967, "step": 4420 }, { "epoch": 1.2881651642919454, "grad_norm": 1.7450060844421387, "learning_rate": 7.342227552091141e-05, "loss": 0.7462188720703125, "step": 4430 }, { "epoch": 1.2910729863332364, "grad_norm": 1.8309375047683716, "learning_rate": 7.312247039424375e-05, "loss": 0.7612934112548828, "step": 4440 }, { "epoch": 1.2939808083745274, "grad_norm": 2.1091737747192383, "learning_rate": 7.282266526757608e-05, "loss": 0.7570091247558594, "step": 4450 }, { "epoch": 1.2968886304158185, "grad_norm": 1.933371901512146, "learning_rate": 7.252286014090842e-05, "loss": 0.7872342586517334, "step": 4460 }, { "epoch": 1.2997964524571097, "grad_norm": 1.857756495475769, "learning_rate": 7.222305501424075e-05, "loss": 0.7867274761199952, "step": 4470 }, { "epoch": 1.3027042744984008, "grad_norm": 1.8640364408493042, "learning_rate": 7.192324988757309e-05, "loss": 0.7677565574645996, "step": 4480 }, { "epoch": 1.3056120965396918, "grad_norm": 1.8762586116790771, "learning_rate": 7.162344476090541e-05, "loss": 0.7633658885955811, "step": 4490 }, { "epoch": 1.3085199185809828, "grad_norm": 2.0363333225250244, "learning_rate": 7.132363963423776e-05, "loss": 0.7173254013061523, "step": 4500 }, { "epoch": 1.3085199185809828, "eval_loss": 0.8845488429069519, "eval_runtime": 473.8247, "eval_samples_per_second": 7.258, "eval_steps_per_second": 7.258, "step": 4500 }, { "epoch": 1.3114277406222739, "grad_norm": 2.0255188941955566, "learning_rate": 7.102383450757009e-05, "loss": 0.7590040683746337, "step": 4510 }, { "epoch": 1.3143355626635649, "grad_norm": 2.052431583404541, "learning_rate": 7.072402938090241e-05, "loss": 0.7589908599853515, "step": 4520 }, { "epoch": 1.3172433847048561, "grad_norm": 1.9324688911437988, "learning_rate": 7.042422425423475e-05, "loss": 0.7061916828155518, "step": 4530 }, { "epoch": 1.3201512067461472, "grad_norm": 1.881585717201233, "learning_rate": 7.012441912756709e-05, "loss": 0.7603434562683106, "step": 4540 }, { "epoch": 1.3230590287874382, "grad_norm": 2.1515471935272217, "learning_rate": 6.982461400089941e-05, "loss": 0.7740952491760253, "step": 4550 }, { "epoch": 1.3259668508287292, "grad_norm": 1.9391210079193115, "learning_rate": 6.952480887423175e-05, "loss": 0.7603271007537842, "step": 4560 }, { "epoch": 1.3288746728700205, "grad_norm": 1.9450794458389282, "learning_rate": 6.922500374756408e-05, "loss": 0.8049670219421386, "step": 4570 }, { "epoch": 1.3317824949113115, "grad_norm": 2.054507255554199, "learning_rate": 6.892519862089643e-05, "loss": 0.7841194152832032, "step": 4580 }, { "epoch": 1.3346903169526025, "grad_norm": 1.9239962100982666, "learning_rate": 6.862539349422875e-05, "loss": 0.7888126850128174, "step": 4590 }, { "epoch": 1.3375981389938936, "grad_norm": 2.1011009216308594, "learning_rate": 6.832558836756109e-05, "loss": 0.7314376354217529, "step": 4600 }, { "epoch": 1.3405059610351846, "grad_norm": 2.109041213989258, "learning_rate": 6.802578324089342e-05, "loss": 0.8058440208435058, "step": 4610 }, { "epoch": 1.3434137830764756, "grad_norm": 1.9356318712234497, "learning_rate": 6.772597811422576e-05, "loss": 0.7847652435302734, "step": 4620 }, { "epoch": 1.3463216051177667, "grad_norm": 2.092618942260742, "learning_rate": 6.742617298755809e-05, "loss": 0.7178999423980713, "step": 4630 }, { "epoch": 1.349229427159058, "grad_norm": 1.8008778095245361, "learning_rate": 6.712636786089042e-05, "loss": 0.7678854465484619, "step": 4640 }, { "epoch": 1.352137249200349, "grad_norm": 2.20794415473938, "learning_rate": 6.682656273422276e-05, "loss": 0.7475598335266114, "step": 4650 }, { "epoch": 1.35504507124164, "grad_norm": 1.785436749458313, "learning_rate": 6.65267576075551e-05, "loss": 0.7328590393066406, "step": 4660 }, { "epoch": 1.357952893282931, "grad_norm": 1.9333852529525757, "learning_rate": 6.622695248088743e-05, "loss": 0.7973165035247802, "step": 4670 }, { "epoch": 1.3608607153242223, "grad_norm": 2.149812698364258, "learning_rate": 6.592714735421976e-05, "loss": 0.7523110389709473, "step": 4680 }, { "epoch": 1.3637685373655133, "grad_norm": 2.0981409549713135, "learning_rate": 6.56273422275521e-05, "loss": 0.7891485691070557, "step": 4690 }, { "epoch": 1.3666763594068043, "grad_norm": 2.0596706867218018, "learning_rate": 6.532753710088443e-05, "loss": 0.7680598735809326, "step": 4700 }, { "epoch": 1.3695841814480954, "grad_norm": 2.0713438987731934, "learning_rate": 6.502773197421677e-05, "loss": 0.7212025165557862, "step": 4710 }, { "epoch": 1.3724920034893864, "grad_norm": 1.933114767074585, "learning_rate": 6.472792684754909e-05, "loss": 0.8231717109680176, "step": 4720 }, { "epoch": 1.3753998255306774, "grad_norm": 1.9748258590698242, "learning_rate": 6.442812172088144e-05, "loss": 0.710457706451416, "step": 4730 }, { "epoch": 1.3783076475719687, "grad_norm": 1.99483323097229, "learning_rate": 6.412831659421377e-05, "loss": 0.7033157348632812, "step": 4740 }, { "epoch": 1.3812154696132597, "grad_norm": 2.1739165782928467, "learning_rate": 6.382851146754609e-05, "loss": 0.7652640819549561, "step": 4750 }, { "epoch": 1.3841232916545507, "grad_norm": 1.9803380966186523, "learning_rate": 6.352870634087842e-05, "loss": 0.7610398769378662, "step": 4760 }, { "epoch": 1.3870311136958418, "grad_norm": 1.8965667486190796, "learning_rate": 6.322890121421077e-05, "loss": 0.8127297401428223, "step": 4770 }, { "epoch": 1.389938935737133, "grad_norm": 2.0516648292541504, "learning_rate": 6.292909608754311e-05, "loss": 0.7877964973449707, "step": 4780 }, { "epoch": 1.392846757778424, "grad_norm": 1.9544559717178345, "learning_rate": 6.262929096087543e-05, "loss": 0.7798377513885498, "step": 4790 }, { "epoch": 1.395754579819715, "grad_norm": 1.7984205484390259, "learning_rate": 6.232948583420776e-05, "loss": 0.7166555404663086, "step": 4800 }, { "epoch": 1.398662401861006, "grad_norm": 1.914433479309082, "learning_rate": 6.202968070754011e-05, "loss": 0.7964422225952148, "step": 4810 }, { "epoch": 1.4015702239022971, "grad_norm": 2.067776679992676, "learning_rate": 6.172987558087243e-05, "loss": 0.7551724910736084, "step": 4820 }, { "epoch": 1.4044780459435882, "grad_norm": 1.9109352827072144, "learning_rate": 6.143007045420477e-05, "loss": 0.7385217666625976, "step": 4830 }, { "epoch": 1.4073858679848792, "grad_norm": 1.9591761827468872, "learning_rate": 6.11302653275371e-05, "loss": 0.721226167678833, "step": 4840 }, { "epoch": 1.4102936900261704, "grad_norm": 2.115058183670044, "learning_rate": 6.0830460200869435e-05, "loss": 0.6777582168579102, "step": 4850 }, { "epoch": 1.4132015120674615, "grad_norm": 2.06552791595459, "learning_rate": 6.053065507420177e-05, "loss": 0.726964282989502, "step": 4860 }, { "epoch": 1.4161093341087525, "grad_norm": 2.1735804080963135, "learning_rate": 6.0230849947534104e-05, "loss": 0.7624166011810303, "step": 4870 }, { "epoch": 1.4190171561500435, "grad_norm": 2.2036550045013428, "learning_rate": 5.9931044820866446e-05, "loss": 0.7692136287689209, "step": 4880 }, { "epoch": 1.4219249781913348, "grad_norm": 1.8746955394744873, "learning_rate": 5.9631239694198773e-05, "loss": 0.7410698890686035, "step": 4890 }, { "epoch": 1.4248328002326258, "grad_norm": 1.9036871194839478, "learning_rate": 5.933143456753111e-05, "loss": 0.7848501205444336, "step": 4900 }, { "epoch": 1.4277406222739168, "grad_norm": 1.8983385562896729, "learning_rate": 5.903162944086344e-05, "loss": 0.7688620090484619, "step": 4910 }, { "epoch": 1.4306484443152079, "grad_norm": 2.1584537029266357, "learning_rate": 5.873182431419577e-05, "loss": 0.7612662315368652, "step": 4920 }, { "epoch": 1.433556266356499, "grad_norm": 2.033599376678467, "learning_rate": 5.843201918752811e-05, "loss": 0.7598844528198242, "step": 4930 }, { "epoch": 1.43646408839779, "grad_norm": 1.9845435619354248, "learning_rate": 5.8132214060860446e-05, "loss": 0.7194857597351074, "step": 4940 }, { "epoch": 1.4393719104390812, "grad_norm": 2.1257405281066895, "learning_rate": 5.7832408934192774e-05, "loss": 0.7137130737304688, "step": 4950 }, { "epoch": 1.4422797324803722, "grad_norm": 1.8075517416000366, "learning_rate": 5.753260380752511e-05, "loss": 0.7500784873962403, "step": 4960 }, { "epoch": 1.4451875545216633, "grad_norm": 2.136146068572998, "learning_rate": 5.723279868085745e-05, "loss": 0.7785260677337646, "step": 4970 }, { "epoch": 1.4480953765629543, "grad_norm": 1.9265216588974, "learning_rate": 5.6932993554189784e-05, "loss": 0.6893830299377441, "step": 4980 }, { "epoch": 1.4510031986042455, "grad_norm": 2.246067762374878, "learning_rate": 5.663318842752211e-05, "loss": 0.7811415672302247, "step": 4990 }, { "epoch": 1.4539110206455366, "grad_norm": 2.068582534790039, "learning_rate": 5.6333383300854446e-05, "loss": 0.7605808258056641, "step": 5000 }, { "epoch": 1.4539110206455366, "eval_loss": 0.8719142079353333, "eval_runtime": 472.3067, "eval_samples_per_second": 7.281, "eval_steps_per_second": 7.281, "step": 5000 }, { "epoch": 1.4568188426868276, "grad_norm": 1.819368600845337, "learning_rate": 5.603357817418679e-05, "loss": 0.6737122535705566, "step": 5010 }, { "epoch": 1.4597266647281186, "grad_norm": 4.519766330718994, "learning_rate": 5.573377304751911e-05, "loss": 0.723225736618042, "step": 5020 }, { "epoch": 1.4626344867694097, "grad_norm": 2.129854202270508, "learning_rate": 5.543396792085145e-05, "loss": 0.744614839553833, "step": 5030 }, { "epoch": 1.4655423088107007, "grad_norm": 1.8777213096618652, "learning_rate": 5.5134162794183784e-05, "loss": 0.7135731220245362, "step": 5040 }, { "epoch": 1.4684501308519917, "grad_norm": 1.9724328517913818, "learning_rate": 5.483435766751611e-05, "loss": 0.754638957977295, "step": 5050 }, { "epoch": 1.471357952893283, "grad_norm": 1.8722237348556519, "learning_rate": 5.4534552540848446e-05, "loss": 0.7721257209777832, "step": 5060 }, { "epoch": 1.474265774934574, "grad_norm": 2.2810893058776855, "learning_rate": 5.423474741418079e-05, "loss": 0.7189100742340088, "step": 5070 }, { "epoch": 1.477173596975865, "grad_norm": 2.171797513961792, "learning_rate": 5.393494228751312e-05, "loss": 0.7469087600708008, "step": 5080 }, { "epoch": 1.4800814190171563, "grad_norm": 1.837860107421875, "learning_rate": 5.363513716084545e-05, "loss": 0.743229341506958, "step": 5090 }, { "epoch": 1.4829892410584473, "grad_norm": 1.96957528591156, "learning_rate": 5.3335332034177784e-05, "loss": 0.7477756977081299, "step": 5100 }, { "epoch": 1.4858970630997383, "grad_norm": 2.0596463680267334, "learning_rate": 5.3035526907510126e-05, "loss": 0.7668298721313477, "step": 5110 }, { "epoch": 1.4888048851410294, "grad_norm": 1.9528030157089233, "learning_rate": 5.2735721780842453e-05, "loss": 0.7141449451446533, "step": 5120 }, { "epoch": 1.4917127071823204, "grad_norm": 1.8499971628189087, "learning_rate": 5.243591665417479e-05, "loss": 0.7491943359375, "step": 5130 }, { "epoch": 1.4946205292236114, "grad_norm": 2.016645908355713, "learning_rate": 5.213611152750712e-05, "loss": 0.7747302055358887, "step": 5140 }, { "epoch": 1.4975283512649025, "grad_norm": 1.8727567195892334, "learning_rate": 5.183630640083945e-05, "loss": 0.7703737735748291, "step": 5150 }, { "epoch": 1.5004361733061935, "grad_norm": 1.9545795917510986, "learning_rate": 5.153650127417179e-05, "loss": 0.7513375282287598, "step": 5160 }, { "epoch": 1.5033439953474848, "grad_norm": 2.101614236831665, "learning_rate": 5.1236696147504126e-05, "loss": 0.7481746196746826, "step": 5170 }, { "epoch": 1.5062518173887758, "grad_norm": 1.8322865962982178, "learning_rate": 5.093689102083646e-05, "loss": 0.6998108386993408, "step": 5180 }, { "epoch": 1.509159639430067, "grad_norm": 2.0042383670806885, "learning_rate": 5.063708589416879e-05, "loss": 0.7009212017059326, "step": 5190 }, { "epoch": 1.512067461471358, "grad_norm": 2.0147664546966553, "learning_rate": 5.033728076750113e-05, "loss": 0.6757394313812256, "step": 5200 }, { "epoch": 1.514975283512649, "grad_norm": 2.185133218765259, "learning_rate": 5.0037475640833464e-05, "loss": 0.8208953857421875, "step": 5210 }, { "epoch": 1.5178831055539401, "grad_norm": 1.9892646074295044, "learning_rate": 4.97376705141658e-05, "loss": 0.7156825065612793, "step": 5220 }, { "epoch": 1.5207909275952312, "grad_norm": 2.1808860301971436, "learning_rate": 4.9437865387498126e-05, "loss": 0.7173275470733642, "step": 5230 }, { "epoch": 1.5236987496365222, "grad_norm": 2.2999000549316406, "learning_rate": 4.913806026083047e-05, "loss": 0.7130680084228516, "step": 5240 }, { "epoch": 1.5266065716778132, "grad_norm": 1.9804385900497437, "learning_rate": 4.8838255134162795e-05, "loss": 0.7353660106658936, "step": 5250 }, { "epoch": 1.5295143937191042, "grad_norm": 2.0175087451934814, "learning_rate": 4.853845000749513e-05, "loss": 0.6823455333709717, "step": 5260 }, { "epoch": 1.5324222157603955, "grad_norm": 1.9505001306533813, "learning_rate": 4.8238644880827464e-05, "loss": 0.7360805034637451, "step": 5270 }, { "epoch": 1.5353300378016865, "grad_norm": 2.100215435028076, "learning_rate": 4.79388397541598e-05, "loss": 0.761193037033081, "step": 5280 }, { "epoch": 1.5382378598429776, "grad_norm": 1.9561179876327515, "learning_rate": 4.763903462749213e-05, "loss": 0.8232732772827148, "step": 5290 }, { "epoch": 1.5411456818842688, "grad_norm": 2.07002592086792, "learning_rate": 4.733922950082447e-05, "loss": 0.7684964656829834, "step": 5300 }, { "epoch": 1.5440535039255598, "grad_norm": 2.175922393798828, "learning_rate": 4.7039424374156795e-05, "loss": 0.743954849243164, "step": 5310 }, { "epoch": 1.5469613259668509, "grad_norm": 1.8928070068359375, "learning_rate": 4.673961924748914e-05, "loss": 0.8028405189514161, "step": 5320 }, { "epoch": 1.549869148008142, "grad_norm": 1.8495947122573853, "learning_rate": 4.6439814120821464e-05, "loss": 0.7481747150421143, "step": 5330 }, { "epoch": 1.552776970049433, "grad_norm": 2.0875566005706787, "learning_rate": 4.6140008994153806e-05, "loss": 0.7654068470001221, "step": 5340 }, { "epoch": 1.555684792090724, "grad_norm": 2.132359027862549, "learning_rate": 4.5840203867486133e-05, "loss": 0.7879802703857421, "step": 5350 }, { "epoch": 1.558592614132015, "grad_norm": 1.9833087921142578, "learning_rate": 4.554039874081847e-05, "loss": 0.7630073547363281, "step": 5360 }, { "epoch": 1.561500436173306, "grad_norm": 2.261916399002075, "learning_rate": 4.52405936141508e-05, "loss": 0.7673455238342285, "step": 5370 }, { "epoch": 1.5644082582145973, "grad_norm": 1.9294761419296265, "learning_rate": 4.494078848748314e-05, "loss": 0.7796091556549072, "step": 5380 }, { "epoch": 1.5673160802558883, "grad_norm": 1.8864835500717163, "learning_rate": 4.464098336081548e-05, "loss": 0.7843995094299316, "step": 5390 }, { "epoch": 1.5702239022971796, "grad_norm": 1.6363269090652466, "learning_rate": 4.4341178234147806e-05, "loss": 0.7239855766296387, "step": 5400 }, { "epoch": 1.5731317243384706, "grad_norm": 1.910172462463379, "learning_rate": 4.404137310748014e-05, "loss": 0.684738302230835, "step": 5410 }, { "epoch": 1.5760395463797616, "grad_norm": 1.936583161354065, "learning_rate": 4.3741567980812475e-05, "loss": 0.7819816112518311, "step": 5420 }, { "epoch": 1.5789473684210527, "grad_norm": 2.261993169784546, "learning_rate": 4.344176285414481e-05, "loss": 0.7665531158447265, "step": 5430 }, { "epoch": 1.5818551904623437, "grad_norm": 2.1516878604888916, "learning_rate": 4.314195772747714e-05, "loss": 0.7250272274017334, "step": 5440 }, { "epoch": 1.5847630125036347, "grad_norm": 1.9359904527664185, "learning_rate": 4.284215260080948e-05, "loss": 0.7476921081542969, "step": 5450 }, { "epoch": 1.5876708345449257, "grad_norm": 1.9610286951065063, "learning_rate": 4.2542347474141806e-05, "loss": 0.747294282913208, "step": 5460 }, { "epoch": 1.5905786565862168, "grad_norm": 1.8940902948379517, "learning_rate": 4.224254234747415e-05, "loss": 0.7050137519836426, "step": 5470 }, { "epoch": 1.593486478627508, "grad_norm": 2.1721606254577637, "learning_rate": 4.1942737220806475e-05, "loss": 0.7259575366973877, "step": 5480 }, { "epoch": 1.596394300668799, "grad_norm": 2.0317630767822266, "learning_rate": 4.164293209413881e-05, "loss": 0.6914929389953614, "step": 5490 }, { "epoch": 1.59930212271009, "grad_norm": 2.04887056350708, "learning_rate": 4.1343126967471144e-05, "loss": 0.7758480548858643, "step": 5500 }, { "epoch": 1.59930212271009, "eval_loss": 0.8598825931549072, "eval_runtime": 474.1318, "eval_samples_per_second": 7.253, "eval_steps_per_second": 7.253, "step": 5500 }, { "epoch": 1.6022099447513813, "grad_norm": 1.7526671886444092, "learning_rate": 4.104332184080348e-05, "loss": 0.6790880680084228, "step": 5510 }, { "epoch": 1.6051177667926724, "grad_norm": 2.0832862854003906, "learning_rate": 4.074351671413581e-05, "loss": 0.7566723346710205, "step": 5520 }, { "epoch": 1.6080255888339634, "grad_norm": 1.8792216777801514, "learning_rate": 4.044371158746815e-05, "loss": 0.6554183959960938, "step": 5530 }, { "epoch": 1.6109334108752544, "grad_norm": 2.0853688716888428, "learning_rate": 4.014390646080048e-05, "loss": 0.7421646118164062, "step": 5540 }, { "epoch": 1.6138412329165455, "grad_norm": 2.5147030353546143, "learning_rate": 3.984410133413282e-05, "loss": 0.7599525451660156, "step": 5550 }, { "epoch": 1.6167490549578365, "grad_norm": 2.236278772354126, "learning_rate": 3.954429620746515e-05, "loss": 0.8087862014770508, "step": 5560 }, { "epoch": 1.6196568769991275, "grad_norm": 1.9631987810134888, "learning_rate": 3.9244491080797486e-05, "loss": 0.7550980091094971, "step": 5570 }, { "epoch": 1.6225646990404188, "grad_norm": 1.8814702033996582, "learning_rate": 3.894468595412982e-05, "loss": 0.7769564628601074, "step": 5580 }, { "epoch": 1.6254725210817098, "grad_norm": 1.9273751974105835, "learning_rate": 3.864488082746215e-05, "loss": 0.7157449722290039, "step": 5590 }, { "epoch": 1.6283803431230008, "grad_norm": 1.9550398588180542, "learning_rate": 3.834507570079449e-05, "loss": 0.793599796295166, "step": 5600 }, { "epoch": 1.631288165164292, "grad_norm": 1.9033679962158203, "learning_rate": 3.804527057412682e-05, "loss": 0.7436941146850586, "step": 5610 }, { "epoch": 1.6341959872055831, "grad_norm": 1.9171406030654907, "learning_rate": 3.774546544745916e-05, "loss": 0.6863879203796387, "step": 5620 }, { "epoch": 1.6371038092468742, "grad_norm": 1.7775182723999023, "learning_rate": 3.7445660320791486e-05, "loss": 0.7105512142181396, "step": 5630 }, { "epoch": 1.6400116312881652, "grad_norm": 2.081458568572998, "learning_rate": 3.714585519412382e-05, "loss": 0.679707384109497, "step": 5640 }, { "epoch": 1.6429194533294562, "grad_norm": 2.0358259677886963, "learning_rate": 3.6846050067456155e-05, "loss": 0.7309046745300293, "step": 5650 }, { "epoch": 1.6458272753707472, "grad_norm": 2.1495471000671387, "learning_rate": 3.654624494078849e-05, "loss": 0.678877878189087, "step": 5660 }, { "epoch": 1.6487350974120383, "grad_norm": 2.19714617729187, "learning_rate": 3.6246439814120824e-05, "loss": 0.724186372756958, "step": 5670 }, { "epoch": 1.6516429194533293, "grad_norm": 1.930059552192688, "learning_rate": 3.594663468745316e-05, "loss": 0.7309486389160156, "step": 5680 }, { "epoch": 1.6545507414946206, "grad_norm": 2.045048236846924, "learning_rate": 3.5646829560785486e-05, "loss": 0.7321044921875, "step": 5690 }, { "epoch": 1.6574585635359116, "grad_norm": 1.8372198343276978, "learning_rate": 3.534702443411783e-05, "loss": 0.690158462524414, "step": 5700 }, { "epoch": 1.6603663855772028, "grad_norm": 2.2258548736572266, "learning_rate": 3.5047219307450155e-05, "loss": 0.6976329803466796, "step": 5710 }, { "epoch": 1.6632742076184939, "grad_norm": 1.861602544784546, "learning_rate": 3.4747414180782496e-05, "loss": 0.7170249462127686, "step": 5720 }, { "epoch": 1.666182029659785, "grad_norm": 1.9709389209747314, "learning_rate": 3.4447609054114824e-05, "loss": 0.7029520511627197, "step": 5730 }, { "epoch": 1.669089851701076, "grad_norm": 1.8129233121871948, "learning_rate": 3.414780392744716e-05, "loss": 0.7850728988647461, "step": 5740 }, { "epoch": 1.671997673742367, "grad_norm": 2.106326103210449, "learning_rate": 3.384799880077949e-05, "loss": 0.8082990646362305, "step": 5750 }, { "epoch": 1.674905495783658, "grad_norm": 1.839580774307251, "learning_rate": 3.354819367411183e-05, "loss": 0.7232970237731934, "step": 5760 }, { "epoch": 1.677813317824949, "grad_norm": 2.027674674987793, "learning_rate": 3.324838854744416e-05, "loss": 0.7808068752288818, "step": 5770 }, { "epoch": 1.68072113986624, "grad_norm": 2.2386841773986816, "learning_rate": 3.29485834207765e-05, "loss": 0.711140775680542, "step": 5780 }, { "epoch": 1.6836289619075313, "grad_norm": 2.136626958847046, "learning_rate": 3.264877829410883e-05, "loss": 0.7360716342926026, "step": 5790 }, { "epoch": 1.6865367839488223, "grad_norm": 2.048578977584839, "learning_rate": 3.2348973167441166e-05, "loss": 0.710478401184082, "step": 5800 }, { "epoch": 1.6894446059901134, "grad_norm": 1.9906225204467773, "learning_rate": 3.20491680407735e-05, "loss": 0.7086396217346191, "step": 5810 }, { "epoch": 1.6923524280314046, "grad_norm": 1.964224934577942, "learning_rate": 3.1749362914105835e-05, "loss": 0.7486394882202149, "step": 5820 }, { "epoch": 1.6952602500726957, "grad_norm": 1.887143850326538, "learning_rate": 3.144955778743817e-05, "loss": 0.6863440990447998, "step": 5830 }, { "epoch": 1.6981680721139867, "grad_norm": 2.00508713722229, "learning_rate": 3.11497526607705e-05, "loss": 0.6853577613830566, "step": 5840 }, { "epoch": 1.7010758941552777, "grad_norm": 2.0328657627105713, "learning_rate": 3.084994753410284e-05, "loss": 0.7252396583557129, "step": 5850 }, { "epoch": 1.7039837161965687, "grad_norm": 2.1302683353424072, "learning_rate": 3.0550142407435166e-05, "loss": 0.7472237586975098, "step": 5860 }, { "epoch": 1.7068915382378598, "grad_norm": 2.106820821762085, "learning_rate": 3.0250337280767504e-05, "loss": 0.723507022857666, "step": 5870 }, { "epoch": 1.7097993602791508, "grad_norm": 2.0230047702789307, "learning_rate": 2.9950532154099835e-05, "loss": 0.6797329425811768, "step": 5880 }, { "epoch": 1.7127071823204418, "grad_norm": 2.121976613998413, "learning_rate": 2.965072702743217e-05, "loss": 0.7394673347473144, "step": 5890 }, { "epoch": 1.715615004361733, "grad_norm": 1.8224197626113892, "learning_rate": 2.9350921900764504e-05, "loss": 0.758949613571167, "step": 5900 }, { "epoch": 1.7185228264030241, "grad_norm": 1.832812786102295, "learning_rate": 2.905111677409684e-05, "loss": 0.724758243560791, "step": 5910 }, { "epoch": 1.7214306484443154, "grad_norm": 2.112184524536133, "learning_rate": 2.8751311647429173e-05, "loss": 0.7687956809997558, "step": 5920 }, { "epoch": 1.7243384704856064, "grad_norm": 2.5346004962921143, "learning_rate": 2.8451506520761507e-05, "loss": 0.721607780456543, "step": 5930 }, { "epoch": 1.7272462925268974, "grad_norm": 1.9310683012008667, "learning_rate": 2.815170139409384e-05, "loss": 0.750163459777832, "step": 5940 }, { "epoch": 1.7301541145681885, "grad_norm": 2.023061752319336, "learning_rate": 2.7851896267426176e-05, "loss": 0.7398929119110107, "step": 5950 }, { "epoch": 1.7330619366094795, "grad_norm": 1.9423258304595947, "learning_rate": 2.7552091140758507e-05, "loss": 0.7022686004638672, "step": 5960 }, { "epoch": 1.7359697586507705, "grad_norm": 1.9257018566131592, "learning_rate": 2.7252286014090845e-05, "loss": 0.760784387588501, "step": 5970 }, { "epoch": 1.7388775806920616, "grad_norm": 1.9650042057037354, "learning_rate": 2.6952480887423176e-05, "loss": 0.7613511562347413, "step": 5980 }, { "epoch": 1.7417854027333526, "grad_norm": 2.08854341506958, "learning_rate": 2.6652675760755508e-05, "loss": 0.7318148612976074, "step": 5990 }, { "epoch": 1.7446932247746438, "grad_norm": 2.0090866088867188, "learning_rate": 2.6352870634087845e-05, "loss": 0.7362863540649414, "step": 6000 }, { "epoch": 1.7446932247746438, "eval_loss": 0.8498985171318054, "eval_runtime": 471.5916, "eval_samples_per_second": 7.292, "eval_steps_per_second": 7.292, "step": 6000 }, { "epoch": 1.7476010468159349, "grad_norm": 1.9278331995010376, "learning_rate": 2.6053065507420177e-05, "loss": 0.6806503295898437, "step": 6010 }, { "epoch": 1.750508868857226, "grad_norm": 2.056682825088501, "learning_rate": 2.5753260380752514e-05, "loss": 0.722940731048584, "step": 6020 }, { "epoch": 1.7534166908985171, "grad_norm": 1.9063704013824463, "learning_rate": 2.5453455254084846e-05, "loss": 0.6798623085021973, "step": 6030 }, { "epoch": 1.7563245129398082, "grad_norm": 1.8881698846817017, "learning_rate": 2.5153650127417177e-05, "loss": 0.7303658485412597, "step": 6040 }, { "epoch": 1.7592323349810992, "grad_norm": 2.0086960792541504, "learning_rate": 2.4853845000749515e-05, "loss": 0.7225898265838623, "step": 6050 }, { "epoch": 1.7621401570223902, "grad_norm": 2.1029181480407715, "learning_rate": 2.4554039874081846e-05, "loss": 0.7572491645812989, "step": 6060 }, { "epoch": 1.7650479790636813, "grad_norm": 1.9289077520370483, "learning_rate": 2.425423474741418e-05, "loss": 0.780693531036377, "step": 6070 }, { "epoch": 1.7679558011049723, "grad_norm": 2.0310537815093994, "learning_rate": 2.3954429620746515e-05, "loss": 0.7033182144165039, "step": 6080 }, { "epoch": 1.7708636231462633, "grad_norm": 2.0626399517059326, "learning_rate": 2.365462449407885e-05, "loss": 0.7750870227813721, "step": 6090 }, { "epoch": 1.7737714451875544, "grad_norm": 1.9674657583236694, "learning_rate": 2.3354819367411184e-05, "loss": 0.7491857051849365, "step": 6100 }, { "epoch": 1.7766792672288456, "grad_norm": 2.028435707092285, "learning_rate": 2.3055014240743518e-05, "loss": 0.7668484687805176, "step": 6110 }, { "epoch": 1.7795870892701366, "grad_norm": 1.8845252990722656, "learning_rate": 2.2755209114075853e-05, "loss": 0.7411411285400391, "step": 6120 }, { "epoch": 1.782494911311428, "grad_norm": 2.013805627822876, "learning_rate": 2.2455403987408187e-05, "loss": 0.7249200344085693, "step": 6130 }, { "epoch": 1.785402733352719, "grad_norm": 2.006671667098999, "learning_rate": 2.2155598860740522e-05, "loss": 0.7829705715179444, "step": 6140 }, { "epoch": 1.78831055539401, "grad_norm": 2.0205607414245605, "learning_rate": 2.1855793734072856e-05, "loss": 0.7822526454925537, "step": 6150 }, { "epoch": 1.791218377435301, "grad_norm": 1.9972530603408813, "learning_rate": 2.1555988607405187e-05, "loss": 0.7459287166595459, "step": 6160 }, { "epoch": 1.794126199476592, "grad_norm": 2.1279802322387695, "learning_rate": 2.1256183480737522e-05, "loss": 0.7397714614868164, "step": 6170 }, { "epoch": 1.797034021517883, "grad_norm": 2.1933655738830566, "learning_rate": 2.0956378354069856e-05, "loss": 0.6955758571624756, "step": 6180 }, { "epoch": 1.799941843559174, "grad_norm": 1.9316877126693726, "learning_rate": 2.065657322740219e-05, "loss": 0.727911376953125, "step": 6190 }, { "epoch": 1.8028496656004651, "grad_norm": 1.8654370307922363, "learning_rate": 2.0356768100734525e-05, "loss": 0.6596095561981201, "step": 6200 }, { "epoch": 1.8057574876417564, "grad_norm": 2.046983003616333, "learning_rate": 2.0056962974066856e-05, "loss": 0.7094531059265137, "step": 6210 }, { "epoch": 1.8086653096830474, "grad_norm": 2.1500179767608643, "learning_rate": 1.975715784739919e-05, "loss": 0.7008066654205323, "step": 6220 }, { "epoch": 1.8115731317243384, "grad_norm": 2.3866703510284424, "learning_rate": 1.9457352720731525e-05, "loss": 0.7228167533874512, "step": 6230 }, { "epoch": 1.8144809537656297, "grad_norm": 2.0172855854034424, "learning_rate": 1.915754759406386e-05, "loss": 0.6767440795898437, "step": 6240 }, { "epoch": 1.8173887758069207, "grad_norm": 1.8794561624526978, "learning_rate": 1.8857742467396194e-05, "loss": 0.7008272647857666, "step": 6250 }, { "epoch": 1.8202965978482117, "grad_norm": 2.0600626468658447, "learning_rate": 1.8557937340728526e-05, "loss": 0.7082679748535157, "step": 6260 }, { "epoch": 1.8232044198895028, "grad_norm": 1.869964599609375, "learning_rate": 1.825813221406086e-05, "loss": 0.7198378562927246, "step": 6270 }, { "epoch": 1.8261122419307938, "grad_norm": 1.9929425716400146, "learning_rate": 1.7958327087393195e-05, "loss": 0.7699549674987793, "step": 6280 }, { "epoch": 1.8290200639720848, "grad_norm": 1.9782027006149292, "learning_rate": 1.765852196072553e-05, "loss": 0.6976008892059327, "step": 6290 }, { "epoch": 1.8319278860133759, "grad_norm": 2.1247029304504395, "learning_rate": 1.7358716834057864e-05, "loss": 0.690690565109253, "step": 6300 }, { "epoch": 1.834835708054667, "grad_norm": 1.8903204202651978, "learning_rate": 1.7058911707390195e-05, "loss": 0.7616622447967529, "step": 6310 }, { "epoch": 1.8377435300959581, "grad_norm": 2.0401735305786133, "learning_rate": 1.675910658072253e-05, "loss": 0.6766151428222656, "step": 6320 }, { "epoch": 1.8406513521372492, "grad_norm": 1.9508399963378906, "learning_rate": 1.6459301454054864e-05, "loss": 0.7218796730041503, "step": 6330 }, { "epoch": 1.8435591741785404, "grad_norm": 2.1602697372436523, "learning_rate": 1.6159496327387198e-05, "loss": 0.753945779800415, "step": 6340 }, { "epoch": 1.8464669962198315, "grad_norm": 2.0386970043182373, "learning_rate": 1.5859691200719533e-05, "loss": 0.7185985088348389, "step": 6350 }, { "epoch": 1.8493748182611225, "grad_norm": 1.8012441396713257, "learning_rate": 1.5559886074051867e-05, "loss": 0.777646541595459, "step": 6360 }, { "epoch": 1.8522826403024135, "grad_norm": 1.8146647214889526, "learning_rate": 1.52600809473842e-05, "loss": 0.7361874580383301, "step": 6370 }, { "epoch": 1.8551904623437045, "grad_norm": 2.180405616760254, "learning_rate": 1.4960275820716534e-05, "loss": 0.6907386779785156, "step": 6380 }, { "epoch": 1.8580982843849956, "grad_norm": 2.1181304454803467, "learning_rate": 1.4660470694048869e-05, "loss": 0.6983653545379639, "step": 6390 }, { "epoch": 1.8610061064262866, "grad_norm": 1.8451800346374512, "learning_rate": 1.4360665567381203e-05, "loss": 0.6979443073272705, "step": 6400 }, { "epoch": 1.8639139284675776, "grad_norm": 2.0219805240631104, "learning_rate": 1.4060860440713536e-05, "loss": 0.6582244396209717, "step": 6410 }, { "epoch": 1.866821750508869, "grad_norm": 1.9723000526428223, "learning_rate": 1.376105531404587e-05, "loss": 0.7287851333618164, "step": 6420 }, { "epoch": 1.86972957255016, "grad_norm": 2.274547815322876, "learning_rate": 1.3461250187378205e-05, "loss": 0.7135519027709961, "step": 6430 }, { "epoch": 1.872637394591451, "grad_norm": 1.9583326578140259, "learning_rate": 1.316144506071054e-05, "loss": 0.6913717746734619, "step": 6440 }, { "epoch": 1.8755452166327422, "grad_norm": 2.0431573390960693, "learning_rate": 1.2861639934042874e-05, "loss": 0.7512425422668457, "step": 6450 }, { "epoch": 1.8784530386740332, "grad_norm": 2.096263885498047, "learning_rate": 1.2561834807375205e-05, "loss": 0.705587911605835, "step": 6460 }, { "epoch": 1.8813608607153243, "grad_norm": 1.940988302230835, "learning_rate": 1.226202968070754e-05, "loss": 0.7097304344177247, "step": 6470 }, { "epoch": 1.8842686827566153, "grad_norm": 1.9227349758148193, "learning_rate": 1.1962224554039874e-05, "loss": 0.7371804237365722, "step": 6480 }, { "epoch": 1.8871765047979063, "grad_norm": 1.936657428741455, "learning_rate": 1.1662419427372209e-05, "loss": 0.763831090927124, "step": 6490 }, { "epoch": 1.8900843268391974, "grad_norm": 2.0900590419769287, "learning_rate": 1.1362614300704542e-05, "loss": 0.7716608047485352, "step": 6500 }, { "epoch": 1.8900843268391974, "eval_loss": 0.8433617949485779, "eval_runtime": 473.6198, "eval_samples_per_second": 7.261, "eval_steps_per_second": 7.261, "step": 6500 }, { "epoch": 1.8929921488804884, "grad_norm": 2.003570318222046, "learning_rate": 1.1062809174036876e-05, "loss": 0.6903162956237793, "step": 6510 }, { "epoch": 1.8958999709217796, "grad_norm": 2.030358076095581, "learning_rate": 1.076300404736921e-05, "loss": 0.7362218856811523, "step": 6520 }, { "epoch": 1.8988077929630707, "grad_norm": 1.8056998252868652, "learning_rate": 1.0463198920701545e-05, "loss": 0.6997631072998047, "step": 6530 }, { "epoch": 1.9017156150043617, "grad_norm": 1.793885350227356, "learning_rate": 1.016339379403388e-05, "loss": 0.731913709640503, "step": 6540 }, { "epoch": 1.904623437045653, "grad_norm": 2.0713064670562744, "learning_rate": 9.863588667366213e-06, "loss": 0.7571732997894287, "step": 6550 }, { "epoch": 1.907531259086944, "grad_norm": 1.9770585298538208, "learning_rate": 9.563783540698547e-06, "loss": 0.7581112384796143, "step": 6560 }, { "epoch": 1.910439081128235, "grad_norm": 2.289889097213745, "learning_rate": 9.26397841403088e-06, "loss": 0.7515597343444824, "step": 6570 }, { "epoch": 1.913346903169526, "grad_norm": 2.222022294998169, "learning_rate": 8.964173287363214e-06, "loss": 0.7455884933471679, "step": 6580 }, { "epoch": 1.916254725210817, "grad_norm": 1.969591736793518, "learning_rate": 8.664368160695547e-06, "loss": 0.6826446533203125, "step": 6590 }, { "epoch": 1.919162547252108, "grad_norm": 2.064741611480713, "learning_rate": 8.364563034027882e-06, "loss": 0.6947153091430665, "step": 6600 }, { "epoch": 1.9220703692933991, "grad_norm": 2.0524702072143555, "learning_rate": 8.064757907360216e-06, "loss": 0.7242071151733398, "step": 6610 }, { "epoch": 1.9249781913346902, "grad_norm": 2.0459067821502686, "learning_rate": 7.76495278069255e-06, "loss": 0.7215473651885986, "step": 6620 }, { "epoch": 1.9278860133759814, "grad_norm": 2.034057140350342, "learning_rate": 7.465147654024884e-06, "loss": 0.7145359992980957, "step": 6630 }, { "epoch": 1.9307938354172725, "grad_norm": 2.039088487625122, "learning_rate": 7.165342527357217e-06, "loss": 0.7058275699615478, "step": 6640 }, { "epoch": 1.9337016574585635, "grad_norm": 2.0845980644226074, "learning_rate": 6.865537400689552e-06, "loss": 0.7172946453094482, "step": 6650 }, { "epoch": 1.9366094794998547, "grad_norm": 2.0260169506073, "learning_rate": 6.565732274021887e-06, "loss": 0.7668924808502198, "step": 6660 }, { "epoch": 1.9395173015411458, "grad_norm": 2.099771738052368, "learning_rate": 6.26592714735422e-06, "loss": 0.7463947772979737, "step": 6670 }, { "epoch": 1.9424251235824368, "grad_norm": 1.9962726831436157, "learning_rate": 5.966122020686554e-06, "loss": 0.698793363571167, "step": 6680 }, { "epoch": 1.9453329456237278, "grad_norm": 2.075279951095581, "learning_rate": 5.666316894018888e-06, "loss": 0.6867193222045899, "step": 6690 }, { "epoch": 1.9482407676650189, "grad_norm": 1.8623920679092407, "learning_rate": 5.366511767351222e-06, "loss": 0.7588799953460693, "step": 6700 }, { "epoch": 1.9511485897063099, "grad_norm": 2.0419015884399414, "learning_rate": 5.066706640683556e-06, "loss": 0.7772952079772949, "step": 6710 }, { "epoch": 1.954056411747601, "grad_norm": 1.9629226922988892, "learning_rate": 4.76690151401589e-06, "loss": 0.7375712394714355, "step": 6720 }, { "epoch": 1.9569642337888922, "grad_norm": 2.0185976028442383, "learning_rate": 4.467096387348224e-06, "loss": 0.759752893447876, "step": 6730 }, { "epoch": 1.9598720558301832, "grad_norm": 2.034822463989258, "learning_rate": 4.167291260680558e-06, "loss": 0.7086891174316406, "step": 6740 }, { "epoch": 1.9627798778714742, "grad_norm": 2.146571159362793, "learning_rate": 3.8674861340128915e-06, "loss": 0.7501019477844239, "step": 6750 }, { "epoch": 1.9656876999127655, "grad_norm": 2.0261270999908447, "learning_rate": 3.5676810073452256e-06, "loss": 0.749469804763794, "step": 6760 }, { "epoch": 1.9685955219540565, "grad_norm": 2.1559603214263916, "learning_rate": 3.2678758806775593e-06, "loss": 0.7909334659576416, "step": 6770 }, { "epoch": 1.9715033439953475, "grad_norm": 1.8334423303604126, "learning_rate": 2.9680707540098938e-06, "loss": 0.7327389717102051, "step": 6780 }, { "epoch": 1.9744111660366386, "grad_norm": 2.1545159816741943, "learning_rate": 2.668265627342228e-06, "loss": 0.7096083641052247, "step": 6790 }, { "epoch": 1.9773189880779296, "grad_norm": 1.9905986785888672, "learning_rate": 2.3684605006745615e-06, "loss": 0.7368163585662841, "step": 6800 }, { "epoch": 1.9802268101192206, "grad_norm": 2.1959192752838135, "learning_rate": 2.0686553740068956e-06, "loss": 0.6501263618469239, "step": 6810 }, { "epoch": 1.9831346321605117, "grad_norm": 1.9602024555206299, "learning_rate": 1.7688502473392297e-06, "loss": 0.7349401950836182, "step": 6820 }, { "epoch": 1.9860424542018027, "grad_norm": 1.9173307418823242, "learning_rate": 1.4690451206715635e-06, "loss": 0.7650456428527832, "step": 6830 }, { "epoch": 1.988950276243094, "grad_norm": 1.9146604537963867, "learning_rate": 1.1692399940038976e-06, "loss": 0.6636839866638183, "step": 6840 }, { "epoch": 1.991858098284385, "grad_norm": 1.845234990119934, "learning_rate": 8.694348673362316e-07, "loss": 0.6827308654785156, "step": 6850 }, { "epoch": 1.9947659203256762, "grad_norm": 1.9973350763320923, "learning_rate": 5.696297406685654e-07, "loss": 0.7055688381195069, "step": 6860 }, { "epoch": 1.9976737423669673, "grad_norm": 1.9779253005981445, "learning_rate": 2.698246140008994e-07, "loss": 0.7651626110076905, "step": 6870 }, { "epoch": 2.0, "eval_loss": 0.8412191271781921, "eval_runtime": 473.3499, "eval_samples_per_second": 7.265, "eval_steps_per_second": 7.265, "step": 6878 } ], "logging_steps": 10, "max_steps": 6878, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8901151514e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }