{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 9796, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002041753866571385, "grad_norm": 8.025947570800781, "learning_rate": 3.877551020408164e-06, "loss": 10.3645, "step": 20 }, { "epoch": 0.00408350773314277, "grad_norm": 3.2897210121154785, "learning_rate": 7.959183673469388e-06, "loss": 9.9217, "step": 40 }, { "epoch": 0.006125261599714154, "grad_norm": 2.2591001987457275, "learning_rate": 1.2040816326530612e-05, "loss": 9.5264, "step": 60 }, { "epoch": 0.00816701546628554, "grad_norm": 2.2981972694396973, "learning_rate": 1.6122448979591836e-05, "loss": 9.1671, "step": 80 }, { "epoch": 0.010208769332856924, "grad_norm": 2.21460223197937, "learning_rate": 2.0204081632653062e-05, "loss": 8.7261, "step": 100 }, { "epoch": 0.012250523199428308, "grad_norm": 1.9112319946289062, "learning_rate": 2.4285714285714288e-05, "loss": 8.2058, "step": 120 }, { "epoch": 0.014292277065999694, "grad_norm": 2.184126377105713, "learning_rate": 2.8367346938775514e-05, "loss": 7.7208, "step": 140 }, { "epoch": 0.01633403093257108, "grad_norm": 2.0506958961486816, "learning_rate": 3.2448979591836736e-05, "loss": 7.2923, "step": 160 }, { "epoch": 0.018375784799142463, "grad_norm": 2.1403377056121826, "learning_rate": 3.653061224489796e-05, "loss": 6.9776, "step": 180 }, { "epoch": 0.020417538665713848, "grad_norm": 1.60039222240448, "learning_rate": 4.061224489795918e-05, "loss": 6.7076, "step": 200 }, { "epoch": 0.022459292532285232, "grad_norm": 1.9279309511184692, "learning_rate": 4.469387755102041e-05, "loss": 6.5291, "step": 220 }, { "epoch": 0.024501046398856616, "grad_norm": 1.9194912910461426, "learning_rate": 4.877551020408164e-05, "loss": 6.4868, "step": 240 }, { "epoch": 0.026542800265428004, "grad_norm": 2.763141632080078, "learning_rate": 5.285714285714286e-05, "loss": 6.2169, "step": 260 }, { "epoch": 0.02858455413199939, "grad_norm": 2.672356605529785, "learning_rate": 5.693877551020409e-05, "loss": 6.091, "step": 280 }, { "epoch": 0.030626307998570773, "grad_norm": 2.878016471862793, "learning_rate": 6.102040816326531e-05, "loss": 5.8563, "step": 300 }, { "epoch": 0.03266806186514216, "grad_norm": 3.3528285026550293, "learning_rate": 6.510204081632654e-05, "loss": 5.7202, "step": 320 }, { "epoch": 0.03470981573171354, "grad_norm": 3.0839903354644775, "learning_rate": 6.918367346938775e-05, "loss": 5.5353, "step": 340 }, { "epoch": 0.036751569598284926, "grad_norm": 3.1389739513397217, "learning_rate": 7.326530612244899e-05, "loss": 5.4565, "step": 360 }, { "epoch": 0.03879332346485631, "grad_norm": 3.9550576210021973, "learning_rate": 7.73469387755102e-05, "loss": 5.329, "step": 380 }, { "epoch": 0.040835077331427695, "grad_norm": 2.833421230316162, "learning_rate": 8.142857142857143e-05, "loss": 5.2686, "step": 400 }, { "epoch": 0.04287683119799908, "grad_norm": 2.902620315551758, "learning_rate": 8.551020408163266e-05, "loss": 5.2006, "step": 420 }, { "epoch": 0.044918585064570464, "grad_norm": 3.094481945037842, "learning_rate": 8.959183673469388e-05, "loss": 5.1094, "step": 440 }, { "epoch": 0.04696033893114185, "grad_norm": 2.9646449089050293, "learning_rate": 9.367346938775511e-05, "loss": 5.0168, "step": 460 }, { "epoch": 0.04900209279771323, "grad_norm": 3.2955989837646484, "learning_rate": 9.775510204081632e-05, "loss": 4.9646, "step": 480 }, { "epoch": 0.05104384666428462, "grad_norm": 3.2423243522644043, "learning_rate": 9.999976921990784e-05, "loss": 4.8964, "step": 500 }, { "epoch": 0.05308560053085601, "grad_norm": 2.9924113750457764, "learning_rate": 9.999760389312795e-05, "loss": 4.7879, "step": 520 }, { "epoch": 0.05512735439742739, "grad_norm": 2.9985177516937256, "learning_rate": 9.999315937293023e-05, "loss": 4.6535, "step": 540 }, { "epoch": 0.05716910826399878, "grad_norm": 2.6791439056396484, "learning_rate": 9.998643586192282e-05, "loss": 4.7047, "step": 560 }, { "epoch": 0.05921086213057016, "grad_norm": 2.489689826965332, "learning_rate": 9.997743366660403e-05, "loss": 4.6351, "step": 580 }, { "epoch": 0.061252615997141546, "grad_norm": 2.541220188140869, "learning_rate": 9.996615319734843e-05, "loss": 4.5739, "step": 600 }, { "epoch": 0.06329436986371292, "grad_norm": 2.589477300643921, "learning_rate": 9.995259496838807e-05, "loss": 4.514, "step": 620 }, { "epoch": 0.06533612373028432, "grad_norm": 2.494658946990967, "learning_rate": 9.99367595977891e-05, "loss": 4.4381, "step": 640 }, { "epoch": 0.06737787759685569, "grad_norm": 2.5053203105926514, "learning_rate": 9.991864780742358e-05, "loss": 4.4361, "step": 660 }, { "epoch": 0.06941963146342708, "grad_norm": 2.3512158393859863, "learning_rate": 9.989826042293652e-05, "loss": 4.3866, "step": 680 }, { "epoch": 0.07146138532999848, "grad_norm": 2.0839807987213135, "learning_rate": 9.987559837370832e-05, "loss": 4.3736, "step": 700 }, { "epoch": 0.07350313919656985, "grad_norm": 2.575279474258423, "learning_rate": 9.985066269281236e-05, "loss": 4.2662, "step": 720 }, { "epoch": 0.07554489306314124, "grad_norm": 2.251189708709717, "learning_rate": 9.98234545169679e-05, "loss": 4.3267, "step": 740 }, { "epoch": 0.07758664692971262, "grad_norm": 2.2394919395446777, "learning_rate": 9.979397508648829e-05, "loss": 4.1751, "step": 760 }, { "epoch": 0.07962840079628401, "grad_norm": 2.4333765506744385, "learning_rate": 9.97622257452244e-05, "loss": 4.1719, "step": 780 }, { "epoch": 0.08167015466285539, "grad_norm": 2.892970085144043, "learning_rate": 9.972820794050339e-05, "loss": 4.1344, "step": 800 }, { "epoch": 0.08371190852942678, "grad_norm": 2.1009159088134766, "learning_rate": 9.969192322306271e-05, "loss": 4.11, "step": 820 }, { "epoch": 0.08575366239599816, "grad_norm": 2.3663530349731445, "learning_rate": 9.965337324697941e-05, "loss": 4.0782, "step": 840 }, { "epoch": 0.08779541626256955, "grad_norm": 2.172346353530884, "learning_rate": 9.961255976959473e-05, "loss": 4.0917, "step": 860 }, { "epoch": 0.08983717012914093, "grad_norm": 2.654052495956421, "learning_rate": 9.956948465143401e-05, "loss": 4.0358, "step": 880 }, { "epoch": 0.09187892399571232, "grad_norm": 2.196387529373169, "learning_rate": 9.952414985612189e-05, "loss": 4.0193, "step": 900 }, { "epoch": 0.0939206778622837, "grad_norm": 2.4239919185638428, "learning_rate": 9.947655745029271e-05, "loss": 3.9508, "step": 920 }, { "epoch": 0.09596243172885509, "grad_norm": 2.322056770324707, "learning_rate": 9.942670960349643e-05, "loss": 3.9665, "step": 940 }, { "epoch": 0.09800418559542647, "grad_norm": 2.1790049076080322, "learning_rate": 9.937460858809963e-05, "loss": 3.8932, "step": 960 }, { "epoch": 0.10004593946199786, "grad_norm": 2.0862293243408203, "learning_rate": 9.932025677918195e-05, "loss": 3.8281, "step": 980 }, { "epoch": 0.10208769332856923, "grad_norm": 2.275364875793457, "learning_rate": 9.926365665442784e-05, "loss": 3.8873, "step": 1000 }, { "epoch": 0.10412944719514063, "grad_norm": 1.9620935916900635, "learning_rate": 9.920481079401356e-05, "loss": 3.8617, "step": 1020 }, { "epoch": 0.10617120106171202, "grad_norm": 2.0827794075012207, "learning_rate": 9.914372188048964e-05, "loss": 3.8451, "step": 1040 }, { "epoch": 0.1082129549282834, "grad_norm": 2.069380283355713, "learning_rate": 9.908039269865852e-05, "loss": 3.831, "step": 1060 }, { "epoch": 0.11025470879485479, "grad_norm": 2.0301127433776855, "learning_rate": 9.901482613544763e-05, "loss": 3.7971, "step": 1080 }, { "epoch": 0.11229646266142616, "grad_norm": 2.137688636779785, "learning_rate": 9.89470251797778e-05, "loss": 3.8138, "step": 1100 }, { "epoch": 0.11433821652799755, "grad_norm": 2.3098602294921875, "learning_rate": 9.887699292242698e-05, "loss": 3.7329, "step": 1120 }, { "epoch": 0.11637997039456893, "grad_norm": 2.110292911529541, "learning_rate": 9.880473255588936e-05, "loss": 3.7514, "step": 1140 }, { "epoch": 0.11842172426114032, "grad_norm": 2.170607805252075, "learning_rate": 9.873024737422982e-05, "loss": 3.7685, "step": 1160 }, { "epoch": 0.1204634781277117, "grad_norm": 2.0908591747283936, "learning_rate": 9.865354077293383e-05, "loss": 3.6734, "step": 1180 }, { "epoch": 0.12250523199428309, "grad_norm": 2.04089617729187, "learning_rate": 9.857461624875254e-05, "loss": 3.6068, "step": 1200 }, { "epoch": 0.12454698586085447, "grad_norm": 2.3476288318634033, "learning_rate": 9.849347739954352e-05, "loss": 3.7122, "step": 1220 }, { "epoch": 0.12658873972742585, "grad_norm": 1.9791185855865479, "learning_rate": 9.841012792410663e-05, "loss": 3.6852, "step": 1240 }, { "epoch": 0.12863049359399725, "grad_norm": 2.2237918376922607, "learning_rate": 9.832457162201544e-05, "loss": 3.5576, "step": 1260 }, { "epoch": 0.13067224746056863, "grad_norm": 1.9589416980743408, "learning_rate": 9.823681239344413e-05, "loss": 3.6479, "step": 1280 }, { "epoch": 0.13271400132714, "grad_norm": 2.128124237060547, "learning_rate": 9.814685423898949e-05, "loss": 3.5455, "step": 1300 }, { "epoch": 0.13475575519371139, "grad_norm": 1.8983638286590576, "learning_rate": 9.805470125948872e-05, "loss": 3.5029, "step": 1320 }, { "epoch": 0.1367975090602828, "grad_norm": 1.9615085124969482, "learning_rate": 9.796035765583243e-05, "loss": 3.5872, "step": 1340 }, { "epoch": 0.13883926292685417, "grad_norm": 2.1444954872131348, "learning_rate": 9.786382772877313e-05, "loss": 3.6173, "step": 1360 }, { "epoch": 0.14088101679342555, "grad_norm": 2.009953022003174, "learning_rate": 9.776511587872919e-05, "loss": 3.5656, "step": 1380 }, { "epoch": 0.14292277065999695, "grad_norm": 2.3367576599121094, "learning_rate": 9.766422660558421e-05, "loss": 3.5255, "step": 1400 }, { "epoch": 0.14496452452656833, "grad_norm": 1.9374524354934692, "learning_rate": 9.756116450848196e-05, "loss": 3.5598, "step": 1420 }, { "epoch": 0.1470062783931397, "grad_norm": 1.8847877979278564, "learning_rate": 9.745593428561664e-05, "loss": 3.4916, "step": 1440 }, { "epoch": 0.14904803225971108, "grad_norm": 2.090186357498169, "learning_rate": 9.734854073401878e-05, "loss": 3.4768, "step": 1460 }, { "epoch": 0.1510897861262825, "grad_norm": 2.259089946746826, "learning_rate": 9.723898874933648e-05, "loss": 3.4402, "step": 1480 }, { "epoch": 0.15313153999285387, "grad_norm": 2.1110124588012695, "learning_rate": 9.712728332561233e-05, "loss": 3.4796, "step": 1500 }, { "epoch": 0.15517329385942524, "grad_norm": 2.06239652633667, "learning_rate": 9.701342955505568e-05, "loss": 3.4343, "step": 1520 }, { "epoch": 0.15721504772599662, "grad_norm": 1.9148401021957397, "learning_rate": 9.689743262781056e-05, "loss": 3.4756, "step": 1540 }, { "epoch": 0.15925680159256803, "grad_norm": 1.9485000371932983, "learning_rate": 9.677929783171902e-05, "loss": 3.4358, "step": 1560 }, { "epoch": 0.1612985554591394, "grad_norm": 1.9415696859359741, "learning_rate": 9.665903055208014e-05, "loss": 3.3909, "step": 1580 }, { "epoch": 0.16334030932571078, "grad_norm": 2.157007932662964, "learning_rate": 9.653663627140447e-05, "loss": 3.4689, "step": 1600 }, { "epoch": 0.16538206319228216, "grad_norm": 2.0099036693573, "learning_rate": 9.64121205691642e-05, "loss": 3.4113, "step": 1620 }, { "epoch": 0.16742381705885356, "grad_norm": 2.059490442276001, "learning_rate": 9.628548912153871e-05, "loss": 3.3896, "step": 1640 }, { "epoch": 0.16946557092542494, "grad_norm": 2.002117395401001, "learning_rate": 9.615674770115588e-05, "loss": 3.3911, "step": 1660 }, { "epoch": 0.17150732479199632, "grad_norm": 1.8783079385757446, "learning_rate": 9.60259021768289e-05, "loss": 3.379, "step": 1680 }, { "epoch": 0.1735490786585677, "grad_norm": 1.8858823776245117, "learning_rate": 9.58929585132888e-05, "loss": 3.3839, "step": 1700 }, { "epoch": 0.1755908325251391, "grad_norm": 2.277445077896118, "learning_rate": 9.575792277091243e-05, "loss": 3.2725, "step": 1720 }, { "epoch": 0.17763258639171048, "grad_norm": 2.2689497470855713, "learning_rate": 9.562080110544631e-05, "loss": 3.3005, "step": 1740 }, { "epoch": 0.17967434025828186, "grad_norm": 2.0327837467193604, "learning_rate": 9.548159976772592e-05, "loss": 3.3085, "step": 1760 }, { "epoch": 0.18171609412485326, "grad_norm": 1.9416719675064087, "learning_rate": 9.534032510339084e-05, "loss": 3.2924, "step": 1780 }, { "epoch": 0.18375784799142464, "grad_norm": 1.9634373188018799, "learning_rate": 9.519698355259538e-05, "loss": 3.3577, "step": 1800 }, { "epoch": 0.18579960185799602, "grad_norm": 2.2403604984283447, "learning_rate": 9.505158164971505e-05, "loss": 3.3095, "step": 1820 }, { "epoch": 0.1878413557245674, "grad_norm": 2.115143299102783, "learning_rate": 9.490412602304872e-05, "loss": 3.3814, "step": 1840 }, { "epoch": 0.1898831095911388, "grad_norm": 2.1246602535247803, "learning_rate": 9.475462339451638e-05, "loss": 3.2305, "step": 1860 }, { "epoch": 0.19192486345771018, "grad_norm": 1.8857777118682861, "learning_rate": 9.460308057935278e-05, "loss": 3.2605, "step": 1880 }, { "epoch": 0.19396661732428155, "grad_norm": 2.1070351600646973, "learning_rate": 9.444950448579669e-05, "loss": 3.2251, "step": 1900 }, { "epoch": 0.19600837119085293, "grad_norm": 2.068774461746216, "learning_rate": 9.429390211477604e-05, "loss": 3.1887, "step": 1920 }, { "epoch": 0.19805012505742434, "grad_norm": 2.0654258728027344, "learning_rate": 9.413628055958878e-05, "loss": 3.318, "step": 1940 }, { "epoch": 0.20009187892399571, "grad_norm": 1.9323906898498535, "learning_rate": 9.397664700557947e-05, "loss": 3.1789, "step": 1960 }, { "epoch": 0.2021336327905671, "grad_norm": 1.9735854864120483, "learning_rate": 9.381500872981172e-05, "loss": 3.1787, "step": 1980 }, { "epoch": 0.20417538665713847, "grad_norm": 2.11936092376709, "learning_rate": 9.365137310073655e-05, "loss": 3.2329, "step": 2000 }, { "epoch": 0.20417538665713847, "eval_accuracy": 0.45679806228967057, "eval_loss": 3.088444471359253, "eval_runtime": 4.3098, "eval_samples_per_second": 23.203, "eval_steps_per_second": 3.016, "step": 2000 }, { "epoch": 0.20621714052370987, "grad_norm": 1.8326746225357056, "learning_rate": 9.348574757785642e-05, "loss": 3.2102, "step": 2020 }, { "epoch": 0.20825889439028125, "grad_norm": 2.0746572017669678, "learning_rate": 9.331813971138514e-05, "loss": 3.2175, "step": 2040 }, { "epoch": 0.21030064825685263, "grad_norm": 1.8772895336151123, "learning_rate": 9.314855714190383e-05, "loss": 3.1503, "step": 2060 }, { "epoch": 0.21234240212342403, "grad_norm": 2.020322799682617, "learning_rate": 9.297700760001247e-05, "loss": 3.1342, "step": 2080 }, { "epoch": 0.2143841559899954, "grad_norm": 1.7885624170303345, "learning_rate": 9.280349890597752e-05, "loss": 3.1144, "step": 2100 }, { "epoch": 0.2164259098565668, "grad_norm": 2.0323827266693115, "learning_rate": 9.262803896937555e-05, "loss": 3.1584, "step": 2120 }, { "epoch": 0.21846766372313817, "grad_norm": 1.8588685989379883, "learning_rate": 9.245063578873248e-05, "loss": 3.1057, "step": 2140 }, { "epoch": 0.22050941758970957, "grad_norm": 2.072542428970337, "learning_rate": 9.22712974511591e-05, "loss": 3.1134, "step": 2160 }, { "epoch": 0.22255117145628095, "grad_norm": 2.014954090118408, "learning_rate": 9.209003213198237e-05, "loss": 3.1189, "step": 2180 }, { "epoch": 0.22459292532285233, "grad_norm": 2.125472068786621, "learning_rate": 9.190684809437271e-05, "loss": 3.1635, "step": 2200 }, { "epoch": 0.2266346791894237, "grad_norm": 1.9532464742660522, "learning_rate": 9.172175368896738e-05, "loss": 3.0646, "step": 2220 }, { "epoch": 0.2286764330559951, "grad_norm": 1.9583076238632202, "learning_rate": 9.153475735348973e-05, "loss": 3.0165, "step": 2240 }, { "epoch": 0.2307181869225665, "grad_norm": 1.9474949836730957, "learning_rate": 9.13458676123646e-05, "loss": 3.0343, "step": 2260 }, { "epoch": 0.23275994078913786, "grad_norm": 2.0990636348724365, "learning_rate": 9.115509307632973e-05, "loss": 3.0198, "step": 2280 }, { "epoch": 0.23480169465570924, "grad_norm": 1.8146193027496338, "learning_rate": 9.096244244204324e-05, "loss": 3.0213, "step": 2300 }, { "epoch": 0.23684344852228065, "grad_norm": 2.1816246509552, "learning_rate": 9.076792449168712e-05, "loss": 3.0261, "step": 2320 }, { "epoch": 0.23888520238885202, "grad_norm": 1.9477081298828125, "learning_rate": 9.057154809256696e-05, "loss": 3.0106, "step": 2340 }, { "epoch": 0.2409269562554234, "grad_norm": 2.0003628730773926, "learning_rate": 9.037332219670768e-05, "loss": 3.017, "step": 2360 }, { "epoch": 0.24296871012199478, "grad_norm": 1.9379606246948242, "learning_rate": 9.017325584044547e-05, "loss": 2.9904, "step": 2380 }, { "epoch": 0.24501046398856619, "grad_norm": 1.9937708377838135, "learning_rate": 8.997135814401584e-05, "loss": 2.9522, "step": 2400 }, { "epoch": 0.24705221785513756, "grad_norm": 1.956358790397644, "learning_rate": 8.976763831113786e-05, "loss": 2.9674, "step": 2420 }, { "epoch": 0.24909397172170894, "grad_norm": 1.9048632383346558, "learning_rate": 8.956210562859462e-05, "loss": 2.92, "step": 2440 }, { "epoch": 0.25113572558828035, "grad_norm": 1.9640939235687256, "learning_rate": 8.935476946580987e-05, "loss": 2.97, "step": 2460 }, { "epoch": 0.2531774794548517, "grad_norm": 2.2429726123809814, "learning_rate": 8.91456392744209e-05, "loss": 3.0108, "step": 2480 }, { "epoch": 0.2552192333214231, "grad_norm": 2.055403709411621, "learning_rate": 8.893472458784769e-05, "loss": 2.916, "step": 2500 }, { "epoch": 0.2572609871879945, "grad_norm": 1.997952938079834, "learning_rate": 8.872203502085828e-05, "loss": 2.9029, "step": 2520 }, { "epoch": 0.25930274105456586, "grad_norm": 2.045036554336548, "learning_rate": 8.850758026913057e-05, "loss": 2.9879, "step": 2540 }, { "epoch": 0.26134449492113726, "grad_norm": 1.8752868175506592, "learning_rate": 8.829137010881019e-05, "loss": 2.8705, "step": 2560 }, { "epoch": 0.26338624878770867, "grad_norm": 2.005154609680176, "learning_rate": 8.807341439606498e-05, "loss": 2.8989, "step": 2580 }, { "epoch": 0.26542800265428, "grad_norm": 1.9689728021621704, "learning_rate": 8.78537230666356e-05, "loss": 2.8287, "step": 2600 }, { "epoch": 0.2674697565208514, "grad_norm": 1.9550490379333496, "learning_rate": 8.763230613538262e-05, "loss": 2.8997, "step": 2620 }, { "epoch": 0.26951151038742277, "grad_norm": 1.8367582559585571, "learning_rate": 8.740917369582997e-05, "loss": 2.811, "step": 2640 }, { "epoch": 0.2715532642539942, "grad_norm": 2.1363840103149414, "learning_rate": 8.718433591970485e-05, "loss": 2.8943, "step": 2660 }, { "epoch": 0.2735950181205656, "grad_norm": 1.7687076330184937, "learning_rate": 8.695780305647405e-05, "loss": 2.7989, "step": 2680 }, { "epoch": 0.27563677198713693, "grad_norm": 1.8358540534973145, "learning_rate": 8.672958543287666e-05, "loss": 2.7533, "step": 2700 }, { "epoch": 0.27767852585370834, "grad_norm": 1.9502025842666626, "learning_rate": 8.649969345245335e-05, "loss": 2.8681, "step": 2720 }, { "epoch": 0.27972027972027974, "grad_norm": 1.9075384140014648, "learning_rate": 8.62681375950721e-05, "loss": 2.7961, "step": 2740 }, { "epoch": 0.2817620335868511, "grad_norm": 1.8680235147476196, "learning_rate": 8.603492841645049e-05, "loss": 2.7387, "step": 2760 }, { "epoch": 0.2838037874534225, "grad_norm": 1.7814631462097168, "learning_rate": 8.580007654767447e-05, "loss": 2.7253, "step": 2780 }, { "epoch": 0.2858455413199939, "grad_norm": 1.8904640674591064, "learning_rate": 8.556359269471379e-05, "loss": 2.7686, "step": 2800 }, { "epoch": 0.28788729518656525, "grad_norm": 1.9197882413864136, "learning_rate": 8.532548763793387e-05, "loss": 2.7089, "step": 2820 }, { "epoch": 0.28992904905313666, "grad_norm": 1.8778643608093262, "learning_rate": 8.508577223160442e-05, "loss": 2.6991, "step": 2840 }, { "epoch": 0.291970802919708, "grad_norm": 1.9179952144622803, "learning_rate": 8.484445740340465e-05, "loss": 2.6281, "step": 2860 }, { "epoch": 0.2940125567862794, "grad_norm": 1.9334096908569336, "learning_rate": 8.460155415392513e-05, "loss": 2.6583, "step": 2880 }, { "epoch": 0.2960543106528508, "grad_norm": 1.783958077430725, "learning_rate": 8.43570735561662e-05, "loss": 2.7973, "step": 2900 }, { "epoch": 0.29809606451942217, "grad_norm": 1.974071741104126, "learning_rate": 8.411102675503338e-05, "loss": 2.704, "step": 2920 }, { "epoch": 0.30013781838599357, "grad_norm": 1.987671971321106, "learning_rate": 8.386342496682911e-05, "loss": 2.7177, "step": 2940 }, { "epoch": 0.302179572252565, "grad_norm": 1.8518048524856567, "learning_rate": 8.361427947874167e-05, "loss": 2.7156, "step": 2960 }, { "epoch": 0.3042213261191363, "grad_norm": 1.8664546012878418, "learning_rate": 8.336360164833043e-05, "loss": 2.7328, "step": 2980 }, { "epoch": 0.30626307998570773, "grad_norm": 1.7086060047149658, "learning_rate": 8.311140290300828e-05, "loss": 2.7043, "step": 3000 }, { "epoch": 0.3083048338522791, "grad_norm": 2.0161726474761963, "learning_rate": 8.285769473952052e-05, "loss": 2.6844, "step": 3020 }, { "epoch": 0.3103465877188505, "grad_norm": 1.9769707918167114, "learning_rate": 8.260248872342098e-05, "loss": 2.7191, "step": 3040 }, { "epoch": 0.3123883415854219, "grad_norm": 1.8690043687820435, "learning_rate": 8.234579648854458e-05, "loss": 2.71, "step": 3060 }, { "epoch": 0.31443009545199324, "grad_norm": 1.7195576429367065, "learning_rate": 8.208762973647715e-05, "loss": 2.6456, "step": 3080 }, { "epoch": 0.31647184931856465, "grad_norm": 1.874410629272461, "learning_rate": 8.182800023602189e-05, "loss": 2.7065, "step": 3100 }, { "epoch": 0.31851360318513605, "grad_norm": 1.7469574213027954, "learning_rate": 8.156691982266298e-05, "loss": 2.6669, "step": 3120 }, { "epoch": 0.3205553570517074, "grad_norm": 1.8419688940048218, "learning_rate": 8.130440039802594e-05, "loss": 2.6364, "step": 3140 }, { "epoch": 0.3225971109182788, "grad_norm": 1.8020317554473877, "learning_rate": 8.104045392933518e-05, "loss": 2.7316, "step": 3160 }, { "epoch": 0.3246388647848502, "grad_norm": 1.9503313302993774, "learning_rate": 8.077509244886838e-05, "loss": 2.6648, "step": 3180 }, { "epoch": 0.32668061865142156, "grad_norm": 1.700040578842163, "learning_rate": 8.050832805340806e-05, "loss": 2.655, "step": 3200 }, { "epoch": 0.32872237251799297, "grad_norm": 1.9254530668258667, "learning_rate": 8.024017290369004e-05, "loss": 2.6833, "step": 3220 }, { "epoch": 0.3307641263845643, "grad_norm": 1.7067421674728394, "learning_rate": 7.997063922384918e-05, "loss": 2.5835, "step": 3240 }, { "epoch": 0.3328058802511357, "grad_norm": 1.8383476734161377, "learning_rate": 7.96997393008621e-05, "loss": 2.6568, "step": 3260 }, { "epoch": 0.3348476341177071, "grad_norm": 1.941698670387268, "learning_rate": 7.942748548398699e-05, "loss": 2.58, "step": 3280 }, { "epoch": 0.3368893879842785, "grad_norm": 1.7115362882614136, "learning_rate": 7.915389018420081e-05, "loss": 2.5998, "step": 3300 }, { "epoch": 0.3389311418508499, "grad_norm": 1.8802528381347656, "learning_rate": 7.887896587363335e-05, "loss": 2.6176, "step": 3320 }, { "epoch": 0.3409728957174213, "grad_norm": 1.8691850900650024, "learning_rate": 7.860272508499876e-05, "loss": 2.5828, "step": 3340 }, { "epoch": 0.34301464958399264, "grad_norm": 1.7672555446624756, "learning_rate": 7.83251804110243e-05, "loss": 2.5475, "step": 3360 }, { "epoch": 0.34505640345056404, "grad_norm": 1.986730694770813, "learning_rate": 7.804634450387616e-05, "loss": 2.5288, "step": 3380 }, { "epoch": 0.3470981573171354, "grad_norm": 1.906487226486206, "learning_rate": 7.776623007458273e-05, "loss": 2.6522, "step": 3400 }, { "epoch": 0.3491399111837068, "grad_norm": 1.862870454788208, "learning_rate": 7.748484989245527e-05, "loss": 2.6063, "step": 3420 }, { "epoch": 0.3511816650502782, "grad_norm": 1.7368792295455933, "learning_rate": 7.720221678450561e-05, "loss": 2.571, "step": 3440 }, { "epoch": 0.35322341891684955, "grad_norm": 1.758864402770996, "learning_rate": 7.691834363486158e-05, "loss": 2.5684, "step": 3460 }, { "epoch": 0.35526517278342096, "grad_norm": 2.005373239517212, "learning_rate": 7.663324338417963e-05, "loss": 2.6349, "step": 3480 }, { "epoch": 0.35730692664999236, "grad_norm": 1.6954779624938965, "learning_rate": 7.634692902905485e-05, "loss": 2.5981, "step": 3500 }, { "epoch": 0.3593486805165637, "grad_norm": 1.843027114868164, "learning_rate": 7.605941362142861e-05, "loss": 2.5891, "step": 3520 }, { "epoch": 0.3613904343831351, "grad_norm": 1.7064664363861084, "learning_rate": 7.577071026799351e-05, "loss": 2.5105, "step": 3540 }, { "epoch": 0.3634321882497065, "grad_norm": 1.7401872873306274, "learning_rate": 7.548083212959588e-05, "loss": 2.6025, "step": 3560 }, { "epoch": 0.3654739421162779, "grad_norm": 1.8302804231643677, "learning_rate": 7.518979242063588e-05, "loss": 2.5219, "step": 3580 }, { "epoch": 0.3675156959828493, "grad_norm": 1.9890072345733643, "learning_rate": 7.489760440846512e-05, "loss": 2.5515, "step": 3600 }, { "epoch": 0.3695574498494206, "grad_norm": 1.660101056098938, "learning_rate": 7.460428141278177e-05, "loss": 2.5421, "step": 3620 }, { "epoch": 0.37159920371599203, "grad_norm": 2.000551223754883, "learning_rate": 7.430983680502344e-05, "loss": 2.5543, "step": 3640 }, { "epoch": 0.37364095758256344, "grad_norm": 1.8944804668426514, "learning_rate": 7.40142840077576e-05, "loss": 2.59, "step": 3660 }, { "epoch": 0.3756827114491348, "grad_norm": 1.8031728267669678, "learning_rate": 7.371763649406973e-05, "loss": 2.4947, "step": 3680 }, { "epoch": 0.3777244653157062, "grad_norm": 1.8673484325408936, "learning_rate": 7.34199077869491e-05, "loss": 2.572, "step": 3700 }, { "epoch": 0.3797662191822776, "grad_norm": 1.6313329935073853, "learning_rate": 7.312111145867227e-05, "loss": 2.5538, "step": 3720 }, { "epoch": 0.38180797304884895, "grad_norm": 1.9217092990875244, "learning_rate": 7.28212611301845e-05, "loss": 2.5914, "step": 3740 }, { "epoch": 0.38384972691542035, "grad_norm": 1.902541160583496, "learning_rate": 7.252037047047877e-05, "loss": 2.5325, "step": 3760 }, { "epoch": 0.38589148078199176, "grad_norm": 1.6973150968551636, "learning_rate": 7.221845319597258e-05, "loss": 2.5184, "step": 3780 }, { "epoch": 0.3879332346485631, "grad_norm": 1.7825627326965332, "learning_rate": 7.191552306988281e-05, "loss": 2.5445, "step": 3800 }, { "epoch": 0.3899749885151345, "grad_norm": 1.768978238105774, "learning_rate": 7.16115939015982e-05, "loss": 2.4173, "step": 3820 }, { "epoch": 0.39201674238170586, "grad_norm": 1.864883542060852, "learning_rate": 7.130667954604994e-05, "loss": 2.5214, "step": 3840 }, { "epoch": 0.39405849624827727, "grad_norm": 1.7492718696594238, "learning_rate": 7.100079390307996e-05, "loss": 2.4684, "step": 3860 }, { "epoch": 0.3961002501148487, "grad_norm": 1.8536518812179565, "learning_rate": 7.069395091680737e-05, "loss": 2.3979, "step": 3880 }, { "epoch": 0.39814200398142, "grad_norm": 1.7240726947784424, "learning_rate": 7.038616457499282e-05, "loss": 2.5017, "step": 3900 }, { "epoch": 0.40018375784799143, "grad_norm": 1.6829756498336792, "learning_rate": 7.007744890840073e-05, "loss": 2.4826, "step": 3920 }, { "epoch": 0.40222551171456283, "grad_norm": 1.8055912256240845, "learning_rate": 6.976781799015989e-05, "loss": 2.509, "step": 3940 }, { "epoch": 0.4042672655811342, "grad_norm": 1.678655982017517, "learning_rate": 6.945728593512165e-05, "loss": 2.4211, "step": 3960 }, { "epoch": 0.4063090194477056, "grad_norm": 1.7577173709869385, "learning_rate": 6.914586689921673e-05, "loss": 2.5082, "step": 3980 }, { "epoch": 0.40835077331427694, "grad_norm": 1.5879868268966675, "learning_rate": 6.883357507880985e-05, "loss": 2.5065, "step": 4000 }, { "epoch": 0.40835077331427694, "eval_accuracy": 0.5642516398330352, "eval_loss": 2.3732497692108154, "eval_runtime": 0.932, "eval_samples_per_second": 107.297, "eval_steps_per_second": 13.949, "step": 4000 }, { "epoch": 0.41039252718084834, "grad_norm": 1.8080238103866577, "learning_rate": 6.852042471005239e-05, "loss": 2.5482, "step": 4020 }, { "epoch": 0.41243428104741975, "grad_norm": 1.8754255771636963, "learning_rate": 6.820643006823369e-05, "loss": 2.4919, "step": 4040 }, { "epoch": 0.4144760349139911, "grad_norm": 1.69048273563385, "learning_rate": 6.789160546713006e-05, "loss": 2.5267, "step": 4060 }, { "epoch": 0.4165177887805625, "grad_norm": 1.8728243112564087, "learning_rate": 6.757596525835248e-05, "loss": 2.4872, "step": 4080 }, { "epoch": 0.4185595426471339, "grad_norm": 1.6728031635284424, "learning_rate": 6.725952383069222e-05, "loss": 2.4833, "step": 4100 }, { "epoch": 0.42060129651370526, "grad_norm": 1.8563932180404663, "learning_rate": 6.694229560946491e-05, "loss": 2.5139, "step": 4120 }, { "epoch": 0.42264305038027666, "grad_norm": 1.774522304534912, "learning_rate": 6.662429505585307e-05, "loss": 2.4729, "step": 4140 }, { "epoch": 0.42468480424684807, "grad_norm": 1.718322992324829, "learning_rate": 6.630553666624674e-05, "loss": 2.4711, "step": 4160 }, { "epoch": 0.4267265581134194, "grad_norm": 1.6811639070510864, "learning_rate": 6.59860349715828e-05, "loss": 2.4434, "step": 4180 }, { "epoch": 0.4287683119799908, "grad_norm": 1.7791935205459595, "learning_rate": 6.566580453668235e-05, "loss": 2.4409, "step": 4200 }, { "epoch": 0.4308100658465622, "grad_norm": 1.984272837638855, "learning_rate": 6.5344859959587e-05, "loss": 2.4813, "step": 4220 }, { "epoch": 0.4328518197131336, "grad_norm": 1.8698103427886963, "learning_rate": 6.50232158708932e-05, "loss": 2.5183, "step": 4240 }, { "epoch": 0.434893573579705, "grad_norm": 2.01855206489563, "learning_rate": 6.470088693308542e-05, "loss": 2.4713, "step": 4260 }, { "epoch": 0.43693532744627633, "grad_norm": 1.6866073608398438, "learning_rate": 6.437788783986766e-05, "loss": 2.4124, "step": 4280 }, { "epoch": 0.43897708131284774, "grad_norm": 1.72965407371521, "learning_rate": 6.405423331549372e-05, "loss": 2.4828, "step": 4300 }, { "epoch": 0.44101883517941914, "grad_norm": 1.7710672616958618, "learning_rate": 6.372993811409586e-05, "loss": 2.4777, "step": 4320 }, { "epoch": 0.4430605890459905, "grad_norm": 1.730212926864624, "learning_rate": 6.340501701901228e-05, "loss": 2.3678, "step": 4340 }, { "epoch": 0.4451023429125619, "grad_norm": 1.834564208984375, "learning_rate": 6.307948484211325e-05, "loss": 2.4935, "step": 4360 }, { "epoch": 0.44714409677913325, "grad_norm": 1.9029295444488525, "learning_rate": 6.275335642312581e-05, "loss": 2.4283, "step": 4380 }, { "epoch": 0.44918585064570465, "grad_norm": 1.7712889909744263, "learning_rate": 6.242664662895733e-05, "loss": 2.4234, "step": 4400 }, { "epoch": 0.45122760451227606, "grad_norm": 1.7402117252349854, "learning_rate": 6.209937035301779e-05, "loss": 2.3958, "step": 4420 }, { "epoch": 0.4532693583788474, "grad_norm": 1.6899279356002808, "learning_rate": 6.177154251454082e-05, "loss": 2.4292, "step": 4440 }, { "epoch": 0.4553111122454188, "grad_norm": 1.6087076663970947, "learning_rate": 6.144317805790361e-05, "loss": 2.3475, "step": 4460 }, { "epoch": 0.4573528661119902, "grad_norm": 1.795076608657837, "learning_rate": 6.111429195194569e-05, "loss": 2.4042, "step": 4480 }, { "epoch": 0.45939461997856157, "grad_norm": 1.7063649892807007, "learning_rate": 6.078489918928648e-05, "loss": 2.4797, "step": 4500 }, { "epoch": 0.461436373845133, "grad_norm": 1.8537471294403076, "learning_rate": 6.0455014785641905e-05, "loss": 2.3991, "step": 4520 }, { "epoch": 0.4634781277117044, "grad_norm": 1.65223228931427, "learning_rate": 6.0124653779139836e-05, "loss": 2.3587, "step": 4540 }, { "epoch": 0.46551988157827573, "grad_norm": 1.5966017246246338, "learning_rate": 5.979383122963463e-05, "loss": 2.3228, "step": 4560 }, { "epoch": 0.46756163544484713, "grad_norm": 1.8159209489822388, "learning_rate": 5.946256221802051e-05, "loss": 2.4072, "step": 4580 }, { "epoch": 0.4696033893114185, "grad_norm": 1.968110203742981, "learning_rate": 5.9130861845544195e-05, "loss": 2.4089, "step": 4600 }, { "epoch": 0.4716451431779899, "grad_norm": 1.7745741605758667, "learning_rate": 5.8798745233116437e-05, "loss": 2.3714, "step": 4620 }, { "epoch": 0.4736868970445613, "grad_norm": 1.6083648204803467, "learning_rate": 5.846622752062268e-05, "loss": 2.4029, "step": 4640 }, { "epoch": 0.47572865091113264, "grad_norm": 1.9344274997711182, "learning_rate": 5.8133323866233e-05, "loss": 2.3868, "step": 4660 }, { "epoch": 0.47777040477770405, "grad_norm": 1.8098623752593994, "learning_rate": 5.780004944571098e-05, "loss": 2.3503, "step": 4680 }, { "epoch": 0.47981215864427545, "grad_norm": 1.8762472867965698, "learning_rate": 5.746641945172202e-05, "loss": 2.3658, "step": 4700 }, { "epoch": 0.4818539125108468, "grad_norm": 1.7307778596878052, "learning_rate": 5.713244909314068e-05, "loss": 2.4414, "step": 4720 }, { "epoch": 0.4838956663774182, "grad_norm": 1.733116626739502, "learning_rate": 5.6798153594357394e-05, "loss": 2.4173, "step": 4740 }, { "epoch": 0.48593742024398956, "grad_norm": 1.7549914121627808, "learning_rate": 5.646354819458448e-05, "loss": 2.4411, "step": 4760 }, { "epoch": 0.48797917411056096, "grad_norm": 1.766013503074646, "learning_rate": 5.612864814716141e-05, "loss": 2.321, "step": 4780 }, { "epoch": 0.49002092797713237, "grad_norm": 1.9648267030715942, "learning_rate": 5.5793468718859454e-05, "loss": 2.3255, "step": 4800 }, { "epoch": 0.4920626818437037, "grad_norm": 1.8520512580871582, "learning_rate": 5.545802518918579e-05, "loss": 2.3627, "step": 4820 }, { "epoch": 0.4941044357102751, "grad_norm": 1.823463797569275, "learning_rate": 5.512233284968691e-05, "loss": 2.3878, "step": 4840 }, { "epoch": 0.49614618957684653, "grad_norm": 1.6332978010177612, "learning_rate": 5.47864070032516e-05, "loss": 2.3588, "step": 4860 }, { "epoch": 0.4981879434434179, "grad_norm": 1.5986392498016357, "learning_rate": 5.445026296341325e-05, "loss": 2.3696, "step": 4880 }, { "epoch": 0.5002296973099892, "grad_norm": 1.8225257396697998, "learning_rate": 5.411391605365186e-05, "loss": 2.3829, "step": 4900 }, { "epoch": 0.5022714511765607, "grad_norm": 1.64592444896698, "learning_rate": 5.3777381606695466e-05, "loss": 2.4061, "step": 4920 }, { "epoch": 0.504313205043132, "grad_norm": 2.1232595443725586, "learning_rate": 5.344067496382119e-05, "loss": 2.3568, "step": 4940 }, { "epoch": 0.5063549589097034, "grad_norm": 1.8577643632888794, "learning_rate": 5.3103811474155866e-05, "loss": 2.3863, "step": 4960 }, { "epoch": 0.5083967127762749, "grad_norm": 1.767614483833313, "learning_rate": 5.276680649397636e-05, "loss": 2.3102, "step": 4980 }, { "epoch": 0.5104384666428462, "grad_norm": 1.760667085647583, "learning_rate": 5.242967538600957e-05, "loss": 2.3777, "step": 5000 }, { "epoch": 0.5124802205094176, "grad_norm": 1.698043704032898, "learning_rate": 5.2092433518731996e-05, "loss": 2.3397, "step": 5020 }, { "epoch": 0.514521974375989, "grad_norm": 1.7603800296783447, "learning_rate": 5.1755096265669286e-05, "loss": 2.4018, "step": 5040 }, { "epoch": 0.5165637282425604, "grad_norm": 1.761828899383545, "learning_rate": 5.141767900469528e-05, "loss": 2.4144, "step": 5060 }, { "epoch": 0.5186054821091317, "grad_norm": 1.5829668045043945, "learning_rate": 5.108019711733113e-05, "loss": 2.3451, "step": 5080 }, { "epoch": 0.5206472359757032, "grad_norm": 1.738272786140442, "learning_rate": 5.074266598804401e-05, "loss": 2.3815, "step": 5100 }, { "epoch": 0.5226889898422745, "grad_norm": 1.7411034107208252, "learning_rate": 5.040510100354583e-05, "loss": 2.3293, "step": 5120 }, { "epoch": 0.5247307437088459, "grad_norm": 1.5550708770751953, "learning_rate": 5.006751755209186e-05, "loss": 2.3924, "step": 5140 }, { "epoch": 0.5267724975754173, "grad_norm": 1.8736240863800049, "learning_rate": 4.97299310227792e-05, "loss": 2.3273, "step": 5160 }, { "epoch": 0.5288142514419887, "grad_norm": 1.9709054231643677, "learning_rate": 4.939235680484522e-05, "loss": 2.3248, "step": 5180 }, { "epoch": 0.53085600530856, "grad_norm": 1.8487497568130493, "learning_rate": 4.905481028696609e-05, "loss": 2.2605, "step": 5200 }, { "epoch": 0.5328977591751315, "grad_norm": 1.979852318763733, "learning_rate": 4.871730685655527e-05, "loss": 2.3515, "step": 5220 }, { "epoch": 0.5349395130417028, "grad_norm": 1.7801086902618408, "learning_rate": 4.837986189906199e-05, "loss": 2.3046, "step": 5240 }, { "epoch": 0.5369812669082742, "grad_norm": 1.7478824853897095, "learning_rate": 4.8042490797269964e-05, "loss": 2.3024, "step": 5260 }, { "epoch": 0.5390230207748455, "grad_norm": 1.589666485786438, "learning_rate": 4.7705208930596055e-05, "loss": 2.3876, "step": 5280 }, { "epoch": 0.541064774641417, "grad_norm": 1.7482966184616089, "learning_rate": 4.736803167438932e-05, "loss": 2.3346, "step": 5300 }, { "epoch": 0.5431065285079884, "grad_norm": 1.674402117729187, "learning_rate": 4.703097439923e-05, "loss": 2.375, "step": 5320 }, { "epoch": 0.5451482823745597, "grad_norm": 1.8770043849945068, "learning_rate": 4.66940524702289e-05, "loss": 2.3162, "step": 5340 }, { "epoch": 0.5471900362411312, "grad_norm": 1.7583917379379272, "learning_rate": 4.635728124632692e-05, "loss": 2.2905, "step": 5360 }, { "epoch": 0.5492317901077025, "grad_norm": 1.8568443059921265, "learning_rate": 4.602067607959485e-05, "loss": 2.2572, "step": 5380 }, { "epoch": 0.5512735439742739, "grad_norm": 1.7253384590148926, "learning_rate": 4.5684252314533685e-05, "loss": 2.2371, "step": 5400 }, { "epoch": 0.5533152978408453, "grad_norm": 1.8005539178848267, "learning_rate": 4.534802528737497e-05, "loss": 2.2843, "step": 5420 }, { "epoch": 0.5553570517074167, "grad_norm": 1.7977638244628906, "learning_rate": 4.501201032538178e-05, "loss": 2.3507, "step": 5440 }, { "epoch": 0.557398805573988, "grad_norm": 1.858929991722107, "learning_rate": 4.4676222746149945e-05, "loss": 2.3676, "step": 5460 }, { "epoch": 0.5594405594405595, "grad_norm": 1.8355470895767212, "learning_rate": 4.434067785690983e-05, "loss": 2.3439, "step": 5480 }, { "epoch": 0.5614823133071308, "grad_norm": 1.8161711692810059, "learning_rate": 4.4005390953828525e-05, "loss": 2.2969, "step": 5500 }, { "epoch": 0.5635240671737022, "grad_norm": 1.8009469509124756, "learning_rate": 4.3670377321312535e-05, "loss": 2.312, "step": 5520 }, { "epoch": 0.5655658210402736, "grad_norm": 1.8935741186141968, "learning_rate": 4.333565223131107e-05, "loss": 2.2994, "step": 5540 }, { "epoch": 0.567607574906845, "grad_norm": 1.784012794494629, "learning_rate": 4.300123094261977e-05, "loss": 2.2646, "step": 5560 }, { "epoch": 0.5696493287734163, "grad_norm": 1.905248999595642, "learning_rate": 4.266712870018521e-05, "loss": 2.3729, "step": 5580 }, { "epoch": 0.5716910826399878, "grad_norm": 1.9181920289993286, "learning_rate": 4.2333360734409916e-05, "loss": 2.2724, "step": 5600 }, { "epoch": 0.5737328365065592, "grad_norm": 1.8163272142410278, "learning_rate": 4.1999942260458036e-05, "loss": 2.3075, "step": 5620 }, { "epoch": 0.5757745903731305, "grad_norm": 1.7135416269302368, "learning_rate": 4.16668884775618e-05, "loss": 2.2931, "step": 5640 }, { "epoch": 0.5778163442397019, "grad_norm": 1.6932008266448975, "learning_rate": 4.133421456832853e-05, "loss": 2.2992, "step": 5660 }, { "epoch": 0.5798580981062733, "grad_norm": 2.0134222507476807, "learning_rate": 4.100193569804871e-05, "loss": 2.308, "step": 5680 }, { "epoch": 0.5818998519728447, "grad_norm": 1.7452419996261597, "learning_rate": 4.067006701400449e-05, "loss": 2.2977, "step": 5700 }, { "epoch": 0.583941605839416, "grad_norm": 1.7522581815719604, "learning_rate": 4.033862364477927e-05, "loss": 2.36, "step": 5720 }, { "epoch": 0.5859833597059875, "grad_norm": 1.9982788562774658, "learning_rate": 4.000762069956805e-05, "loss": 2.2117, "step": 5740 }, { "epoch": 0.5880251135725588, "grad_norm": 1.7190653085708618, "learning_rate": 3.967707326748857e-05, "loss": 2.262, "step": 5760 }, { "epoch": 0.5900668674391302, "grad_norm": 1.9140125513076782, "learning_rate": 3.934699641689361e-05, "loss": 2.2146, "step": 5780 }, { "epoch": 0.5921086213057016, "grad_norm": 1.6861069202423096, "learning_rate": 3.901740519468392e-05, "loss": 2.2131, "step": 5800 }, { "epoch": 0.594150375172273, "grad_norm": 2.075723171234131, "learning_rate": 3.868831462562242e-05, "loss": 2.2359, "step": 5820 }, { "epoch": 0.5961921290388443, "grad_norm": 1.8186756372451782, "learning_rate": 3.835973971164924e-05, "loss": 2.2631, "step": 5840 }, { "epoch": 0.5982338829054158, "grad_norm": 1.8062045574188232, "learning_rate": 3.803169543119775e-05, "loss": 2.2327, "step": 5860 }, { "epoch": 0.6002756367719871, "grad_norm": 1.7123644351959229, "learning_rate": 3.770419673851191e-05, "loss": 2.298, "step": 5880 }, { "epoch": 0.6023173906385585, "grad_norm": 1.7167185544967651, "learning_rate": 3.7377258562964454e-05, "loss": 2.2363, "step": 5900 }, { "epoch": 0.60435914450513, "grad_norm": 1.7670440673828125, "learning_rate": 3.705089580837639e-05, "loss": 2.3135, "step": 5920 }, { "epoch": 0.6064008983717013, "grad_norm": 1.9746836423873901, "learning_rate": 3.6725123352337484e-05, "loss": 2.2978, "step": 5940 }, { "epoch": 0.6084426522382727, "grad_norm": 1.9177978038787842, "learning_rate": 3.639995604552818e-05, "loss": 2.2328, "step": 5960 }, { "epoch": 0.6104844061048441, "grad_norm": 1.7076138257980347, "learning_rate": 3.607540871104254e-05, "loss": 2.303, "step": 5980 }, { "epoch": 0.6125261599714155, "grad_norm": 1.7542641162872314, "learning_rate": 3.575149614371252e-05, "loss": 2.2408, "step": 6000 }, { "epoch": 0.6125261599714155, "eval_accuracy": 0.5903397391901167, "eval_loss": 2.165536880493164, "eval_runtime": 0.9415, "eval_samples_per_second": 106.217, "eval_steps_per_second": 13.808, "step": 6000 }, { "epoch": 0.6145679138379868, "grad_norm": 1.5742172002792358, "learning_rate": 3.5428233109433605e-05, "loss": 2.251, "step": 6020 }, { "epoch": 0.6166096677045582, "grad_norm": 1.807968020439148, "learning_rate": 3.510563434449154e-05, "loss": 2.2861, "step": 6040 }, { "epoch": 0.6186514215711296, "grad_norm": 2.236690044403076, "learning_rate": 3.4783714554890744e-05, "loss": 2.2114, "step": 6060 }, { "epoch": 0.620693175437701, "grad_norm": 1.7649223804473877, "learning_rate": 3.446248841568375e-05, "loss": 2.2497, "step": 6080 }, { "epoch": 0.6227349293042723, "grad_norm": 1.770786166191101, "learning_rate": 3.414197057030235e-05, "loss": 2.2657, "step": 6100 }, { "epoch": 0.6247766831708438, "grad_norm": 1.6890032291412354, "learning_rate": 3.382217562989004e-05, "loss": 2.2358, "step": 6120 }, { "epoch": 0.6268184370374151, "grad_norm": 1.6569480895996094, "learning_rate": 3.350311817263587e-05, "loss": 2.2054, "step": 6140 }, { "epoch": 0.6288601909039865, "grad_norm": 1.9036436080932617, "learning_rate": 3.318481274310998e-05, "loss": 2.2104, "step": 6160 }, { "epoch": 0.6309019447705579, "grad_norm": 1.7449889183044434, "learning_rate": 3.286727385160056e-05, "loss": 2.247, "step": 6180 }, { "epoch": 0.6329436986371293, "grad_norm": 1.7945327758789062, "learning_rate": 3.25505159734523e-05, "loss": 2.2526, "step": 6200 }, { "epoch": 0.6349854525037006, "grad_norm": 1.8365349769592285, "learning_rate": 3.223455354840662e-05, "loss": 2.2807, "step": 6220 }, { "epoch": 0.6370272063702721, "grad_norm": 1.794764518737793, "learning_rate": 3.191940097994334e-05, "loss": 2.2407, "step": 6240 }, { "epoch": 0.6390689602368435, "grad_norm": 1.7875920534133911, "learning_rate": 3.1605072634624125e-05, "loss": 2.2012, "step": 6260 }, { "epoch": 0.6411107141034148, "grad_norm": 1.7401834726333618, "learning_rate": 3.1291582841437584e-05, "loss": 2.2434, "step": 6280 }, { "epoch": 0.6431524679699863, "grad_norm": 1.7125718593597412, "learning_rate": 3.097894589114603e-05, "loss": 2.1703, "step": 6300 }, { "epoch": 0.6451942218365576, "grad_norm": 1.9303698539733887, "learning_rate": 3.066717603563399e-05, "loss": 2.2356, "step": 6320 }, { "epoch": 0.647235975703129, "grad_norm": 1.850679874420166, "learning_rate": 3.0356287487258627e-05, "loss": 2.266, "step": 6340 }, { "epoch": 0.6492777295697004, "grad_norm": 1.888827919960022, "learning_rate": 3.004629441820176e-05, "loss": 2.2256, "step": 6360 }, { "epoch": 0.6513194834362718, "grad_norm": 1.742695689201355, "learning_rate": 2.973721095982386e-05, "loss": 2.1682, "step": 6380 }, { "epoch": 0.6533612373028431, "grad_norm": 1.931808352470398, "learning_rate": 2.942905120201981e-05, "loss": 2.2093, "step": 6400 }, { "epoch": 0.6554029911694145, "grad_norm": 1.8920409679412842, "learning_rate": 2.9121829192576643e-05, "loss": 2.2243, "step": 6420 }, { "epoch": 0.6574447450359859, "grad_norm": 1.8664979934692383, "learning_rate": 2.881555893653314e-05, "loss": 2.2591, "step": 6440 }, { "epoch": 0.6594864989025573, "grad_norm": 1.5541925430297852, "learning_rate": 2.851025439554142e-05, "loss": 2.2014, "step": 6460 }, { "epoch": 0.6615282527691286, "grad_norm": 1.685796856880188, "learning_rate": 2.8205929487230437e-05, "loss": 2.1966, "step": 6480 }, { "epoch": 0.6635700066357001, "grad_norm": 1.7149288654327393, "learning_rate": 2.7902598084571602e-05, "loss": 2.194, "step": 6500 }, { "epoch": 0.6656117605022714, "grad_norm": 1.693506121635437, "learning_rate": 2.7600274015246247e-05, "loss": 2.18, "step": 6520 }, { "epoch": 0.6676535143688428, "grad_norm": 1.8124711513519287, "learning_rate": 2.7298971061015427e-05, "loss": 2.2222, "step": 6540 }, { "epoch": 0.6696952682354143, "grad_norm": 1.5904170274734497, "learning_rate": 2.699870295709156e-05, "loss": 2.194, "step": 6560 }, { "epoch": 0.6717370221019856, "grad_norm": 1.958520531654358, "learning_rate": 2.6699483391512324e-05, "loss": 2.2345, "step": 6580 }, { "epoch": 0.673778775968557, "grad_norm": 1.6518443822860718, "learning_rate": 2.6401326004516703e-05, "loss": 2.2032, "step": 6600 }, { "epoch": 0.6758205298351284, "grad_norm": 1.9994802474975586, "learning_rate": 2.6104244387923082e-05, "loss": 2.2239, "step": 6620 }, { "epoch": 0.6778622837016998, "grad_norm": 1.8239219188690186, "learning_rate": 2.5808252084509782e-05, "loss": 2.2229, "step": 6640 }, { "epoch": 0.6799040375682711, "grad_norm": 2.0363845825195312, "learning_rate": 2.551336258739761e-05, "loss": 2.2835, "step": 6660 }, { "epoch": 0.6819457914348426, "grad_norm": 1.8027400970458984, "learning_rate": 2.5219589339434818e-05, "loss": 2.2516, "step": 6680 }, { "epoch": 0.6839875453014139, "grad_norm": 1.7809832096099854, "learning_rate": 2.4926945732584197e-05, "loss": 2.1784, "step": 6700 }, { "epoch": 0.6860292991679853, "grad_norm": 1.6586663722991943, "learning_rate": 2.4635445107312723e-05, "loss": 2.255, "step": 6720 }, { "epoch": 0.6880710530345567, "grad_norm": 1.7714364528656006, "learning_rate": 2.4345100751983323e-05, "loss": 2.1998, "step": 6740 }, { "epoch": 0.6901128069011281, "grad_norm": 1.5663255453109741, "learning_rate": 2.4055925902249165e-05, "loss": 2.2396, "step": 6760 }, { "epoch": 0.6921545607676994, "grad_norm": 1.7424668073654175, "learning_rate": 2.376793374045026e-05, "loss": 2.2352, "step": 6780 }, { "epoch": 0.6941963146342708, "grad_norm": 1.8523235321044922, "learning_rate": 2.3481137395012513e-05, "loss": 2.2069, "step": 6800 }, { "epoch": 0.6962380685008422, "grad_norm": 1.9503777027130127, "learning_rate": 2.3195549939849326e-05, "loss": 2.23, "step": 6820 }, { "epoch": 0.6982798223674136, "grad_norm": 1.7033708095550537, "learning_rate": 2.2911184393765544e-05, "loss": 2.1973, "step": 6840 }, { "epoch": 0.7003215762339849, "grad_norm": 1.8798083066940308, "learning_rate": 2.2628053719864017e-05, "loss": 2.1606, "step": 6860 }, { "epoch": 0.7023633301005564, "grad_norm": 1.6673403978347778, "learning_rate": 2.234617082495462e-05, "loss": 2.2333, "step": 6880 }, { "epoch": 0.7044050839671278, "grad_norm": 1.7935930490493774, "learning_rate": 2.2065548558965928e-05, "loss": 2.1889, "step": 6900 }, { "epoch": 0.7064468378336991, "grad_norm": 2.050896167755127, "learning_rate": 2.178619971435941e-05, "loss": 2.1998, "step": 6920 }, { "epoch": 0.7084885917002706, "grad_norm": 1.661040186882019, "learning_rate": 2.1508137025546293e-05, "loss": 2.2013, "step": 6940 }, { "epoch": 0.7105303455668419, "grad_norm": 1.8157655000686646, "learning_rate": 2.123137316830703e-05, "loss": 2.2657, "step": 6960 }, { "epoch": 0.7125720994334133, "grad_norm": 1.941728115081787, "learning_rate": 2.095592075921347e-05, "loss": 2.2571, "step": 6980 }, { "epoch": 0.7146138532999847, "grad_norm": 1.7882014513015747, "learning_rate": 2.0681792355053698e-05, "loss": 2.1673, "step": 7000 }, { "epoch": 0.7166556071665561, "grad_norm": 1.8379241228103638, "learning_rate": 2.0409000452259663e-05, "loss": 2.1818, "step": 7020 }, { "epoch": 0.7186973610331274, "grad_norm": 1.6454790830612183, "learning_rate": 2.0137557486337493e-05, "loss": 2.215, "step": 7040 }, { "epoch": 0.7207391148996989, "grad_norm": 1.7670385837554932, "learning_rate": 1.986747583130061e-05, "loss": 2.2, "step": 7060 }, { "epoch": 0.7227808687662702, "grad_norm": 1.974037528038025, "learning_rate": 1.9598767799105637e-05, "loss": 2.1633, "step": 7080 }, { "epoch": 0.7248226226328416, "grad_norm": 1.7388969659805298, "learning_rate": 1.9331445639091128e-05, "loss": 2.1456, "step": 7100 }, { "epoch": 0.726864376499413, "grad_norm": 1.7744972705841064, "learning_rate": 1.9065521537419247e-05, "loss": 2.1907, "step": 7120 }, { "epoch": 0.7289061303659844, "grad_norm": 1.712636947631836, "learning_rate": 1.8801007616520155e-05, "loss": 2.1636, "step": 7140 }, { "epoch": 0.7309478842325557, "grad_norm": 1.7881085872650146, "learning_rate": 1.8537915934539486e-05, "loss": 2.1353, "step": 7160 }, { "epoch": 0.7329896380991272, "grad_norm": 1.9619174003601074, "learning_rate": 1.8276258484788535e-05, "loss": 2.2536, "step": 7180 }, { "epoch": 0.7350313919656986, "grad_norm": 1.9276570081710815, "learning_rate": 1.8016047195197676e-05, "loss": 2.1925, "step": 7200 }, { "epoch": 0.7370731458322699, "grad_norm": 1.85370934009552, "learning_rate": 1.7757293927772546e-05, "loss": 2.1963, "step": 7220 }, { "epoch": 0.7391148996988413, "grad_norm": 1.7941173315048218, "learning_rate": 1.750001047805327e-05, "loss": 2.2004, "step": 7240 }, { "epoch": 0.7411566535654127, "grad_norm": 1.8923784494400024, "learning_rate": 1.7244208574576836e-05, "loss": 2.2202, "step": 7260 }, { "epoch": 0.7431984074319841, "grad_norm": 1.8124483823776245, "learning_rate": 1.698989987834233e-05, "loss": 2.2263, "step": 7280 }, { "epoch": 0.7452401612985554, "grad_norm": 1.7676301002502441, "learning_rate": 1.6737095982279442e-05, "loss": 2.188, "step": 7300 }, { "epoch": 0.7472819151651269, "grad_norm": 1.7053779363632202, "learning_rate": 1.6485808410719973e-05, "loss": 2.161, "step": 7320 }, { "epoch": 0.7493236690316982, "grad_norm": 1.773569941520691, "learning_rate": 1.6236048618872456e-05, "loss": 2.1788, "step": 7340 }, { "epoch": 0.7513654228982696, "grad_norm": 1.8188519477844238, "learning_rate": 1.5987827992300003e-05, "loss": 2.1003, "step": 7360 }, { "epoch": 0.753407176764841, "grad_norm": 1.7184184789657593, "learning_rate": 1.5741157846401206e-05, "loss": 2.123, "step": 7380 }, { "epoch": 0.7554489306314124, "grad_norm": 1.7014732360839844, "learning_rate": 1.549604942589441e-05, "loss": 2.0777, "step": 7400 }, { "epoch": 0.7574906844979837, "grad_norm": 1.9096647500991821, "learning_rate": 1.5252513904305043e-05, "loss": 2.179, "step": 7420 }, { "epoch": 0.7595324383645552, "grad_norm": 1.902512550354004, "learning_rate": 1.5010562383456289e-05, "loss": 2.2498, "step": 7440 }, { "epoch": 0.7615741922311265, "grad_norm": 1.985844373703003, "learning_rate": 1.4770205892962985e-05, "loss": 2.1325, "step": 7460 }, { "epoch": 0.7636159460976979, "grad_norm": 1.8997377157211304, "learning_rate": 1.4531455389728832e-05, "loss": 2.1955, "step": 7480 }, { "epoch": 0.7656576999642694, "grad_norm": 1.8191885948181152, "learning_rate": 1.4294321757446921e-05, "loss": 2.1858, "step": 7500 }, { "epoch": 0.7676994538308407, "grad_norm": 2.116734743118286, "learning_rate": 1.4058815806103542e-05, "loss": 2.2039, "step": 7520 }, { "epoch": 0.769741207697412, "grad_norm": 1.7504349946975708, "learning_rate": 1.3824948271485466e-05, "loss": 2.2465, "step": 7540 }, { "epoch": 0.7717829615639835, "grad_norm": 1.7317745685577393, "learning_rate": 1.3592729814690514e-05, "loss": 2.1843, "step": 7560 }, { "epoch": 0.7738247154305549, "grad_norm": 1.7211949825286865, "learning_rate": 1.3362171021641546e-05, "loss": 2.1185, "step": 7580 }, { "epoch": 0.7758664692971262, "grad_norm": 2.0524799823760986, "learning_rate": 1.3133282402603914e-05, "loss": 2.1692, "step": 7600 }, { "epoch": 0.7779082231636976, "grad_norm": 1.834174394607544, "learning_rate": 1.2906074391706313e-05, "loss": 2.1719, "step": 7620 }, { "epoch": 0.779949977030269, "grad_norm": 1.7868515253067017, "learning_rate": 1.2680557346465177e-05, "loss": 2.1336, "step": 7640 }, { "epoch": 0.7819917308968404, "grad_norm": 1.7851990461349487, "learning_rate": 1.2456741547312439e-05, "loss": 2.1767, "step": 7660 }, { "epoch": 0.7840334847634117, "grad_norm": 1.8546749353408813, "learning_rate": 1.223463719712698e-05, "loss": 2.1274, "step": 7680 }, { "epoch": 0.7860752386299832, "grad_norm": 1.803139328956604, "learning_rate": 1.2014254420769466e-05, "loss": 2.2174, "step": 7700 }, { "epoch": 0.7881169924965545, "grad_norm": 1.827001929283142, "learning_rate": 1.1795603264620803e-05, "loss": 2.0761, "step": 7720 }, { "epoch": 0.7901587463631259, "grad_norm": 1.9686081409454346, "learning_rate": 1.1578693696124193e-05, "loss": 2.1226, "step": 7740 }, { "epoch": 0.7922005002296973, "grad_norm": 1.917121410369873, "learning_rate": 1.1363535603330672e-05, "loss": 2.1332, "step": 7760 }, { "epoch": 0.7942422540962687, "grad_norm": 1.7767516374588013, "learning_rate": 1.1150138794448462e-05, "loss": 2.2161, "step": 7780 }, { "epoch": 0.79628400796284, "grad_norm": 2.000962972640991, "learning_rate": 1.0938512997395795e-05, "loss": 2.1021, "step": 7800 }, { "epoch": 0.7983257618294115, "grad_norm": 1.9574683904647827, "learning_rate": 1.0728667859357455e-05, "loss": 2.215, "step": 7820 }, { "epoch": 0.8003675156959829, "grad_norm": 1.7920386791229248, "learning_rate": 1.0520612946345033e-05, "loss": 2.1434, "step": 7840 }, { "epoch": 0.8024092695625542, "grad_norm": 1.7746427059173584, "learning_rate": 1.0314357742760767e-05, "loss": 2.1842, "step": 7860 }, { "epoch": 0.8044510234291257, "grad_norm": 1.7369884252548218, "learning_rate": 1.0109911650965314e-05, "loss": 2.1778, "step": 7880 }, { "epoch": 0.806492777295697, "grad_norm": 1.9354164600372314, "learning_rate": 9.90728399084903e-06, "loss": 2.1365, "step": 7900 }, { "epoch": 0.8085345311622684, "grad_norm": 1.8886959552764893, "learning_rate": 9.706483999407167e-06, "loss": 2.1632, "step": 7920 }, { "epoch": 0.8105762850288398, "grad_norm": 1.88917076587677, "learning_rate": 9.507520830318744e-06, "loss": 2.1645, "step": 7940 }, { "epoch": 0.8126180388954112, "grad_norm": 1.9582138061523438, "learning_rate": 9.310403553529334e-06, "loss": 2.1434, "step": 7960 }, { "epoch": 0.8146597927619825, "grad_norm": 1.6584385633468628, "learning_rate": 9.115141154837542e-06, "loss": 2.1562, "step": 7980 }, { "epoch": 0.8167015466285539, "grad_norm": 1.8732690811157227, "learning_rate": 8.921742535485423e-06, "loss": 2.1828, "step": 8000 }, { "epoch": 0.8167015466285539, "eval_accuracy": 0.6103223043152372, "eval_loss": 2.054924964904785, "eval_runtime": 0.9473, "eval_samples_per_second": 105.567, "eval_steps_per_second": 13.724, "step": 8000 }, { "epoch": 0.8187433004951253, "grad_norm": 1.5993928909301758, "learning_rate": 8.730216511752682e-06, "loss": 2.1473, "step": 8020 }, { "epoch": 0.8207850543616967, "grad_norm": 2.070812940597534, "learning_rate": 8.54057181455475e-06, "loss": 2.1184, "step": 8040 }, { "epoch": 0.822826808228268, "grad_norm": 1.7426910400390625, "learning_rate": 8.35281708904485e-06, "loss": 2.1629, "step": 8060 }, { "epoch": 0.8248685620948395, "grad_norm": 1.935427188873291, "learning_rate": 8.166960894219839e-06, "loss": 2.1199, "step": 8080 }, { "epoch": 0.8269103159614108, "grad_norm": 1.909968614578247, "learning_rate": 7.983011702530053e-06, "loss": 2.119, "step": 8100 }, { "epoch": 0.8289520698279822, "grad_norm": 1.8139774799346924, "learning_rate": 7.800977899493078e-06, "loss": 2.1731, "step": 8120 }, { "epoch": 0.8309938236945537, "grad_norm": 1.8802262544631958, "learning_rate": 7.620867783311492e-06, "loss": 2.1194, "step": 8140 }, { "epoch": 0.833035577561125, "grad_norm": 1.9049795866012573, "learning_rate": 7.442689564494598e-06, "loss": 2.1618, "step": 8160 }, { "epoch": 0.8350773314276964, "grad_norm": 1.7457830905914307, "learning_rate": 7.266451365484106e-06, "loss": 2.1049, "step": 8180 }, { "epoch": 0.8371190852942678, "grad_norm": 1.7512338161468506, "learning_rate": 7.092161220283883e-06, "loss": 2.1386, "step": 8200 }, { "epoch": 0.8391608391608392, "grad_norm": 1.8232200145721436, "learning_rate": 6.919827074093727e-06, "loss": 2.169, "step": 8220 }, { "epoch": 0.8412025930274105, "grad_norm": 1.8132939338684082, "learning_rate": 6.749456782947122e-06, "loss": 2.0855, "step": 8240 }, { "epoch": 0.843244346893982, "grad_norm": 1.684288740158081, "learning_rate": 6.5810581133532055e-06, "loss": 2.1647, "step": 8260 }, { "epoch": 0.8452861007605533, "grad_norm": 1.8324394226074219, "learning_rate": 6.41463874194263e-06, "loss": 2.1535, "step": 8280 }, { "epoch": 0.8473278546271247, "grad_norm": 1.8422447443008423, "learning_rate": 6.250206255117674e-06, "loss": 2.1447, "step": 8300 }, { "epoch": 0.8493696084936961, "grad_norm": 1.7557770013809204, "learning_rate": 6.087768148706396e-06, "loss": 2.1535, "step": 8320 }, { "epoch": 0.8514113623602675, "grad_norm": 2.018083333969116, "learning_rate": 5.927331827620903e-06, "loss": 2.1919, "step": 8340 }, { "epoch": 0.8534531162268388, "grad_norm": 1.7010014057159424, "learning_rate": 5.768904605519814e-06, "loss": 2.0604, "step": 8360 }, { "epoch": 0.8554948700934102, "grad_norm": 1.727433681488037, "learning_rate": 5.612493704474881e-06, "loss": 2.158, "step": 8380 }, { "epoch": 0.8575366239599816, "grad_norm": 1.9105573892593384, "learning_rate": 5.458106254641715e-06, "loss": 2.1002, "step": 8400 }, { "epoch": 0.859578377826553, "grad_norm": 2.042494297027588, "learning_rate": 5.305749293934764e-06, "loss": 2.0963, "step": 8420 }, { "epoch": 0.8616201316931243, "grad_norm": 1.7217280864715576, "learning_rate": 5.15542976770651e-06, "loss": 2.1425, "step": 8440 }, { "epoch": 0.8636618855596958, "grad_norm": 1.809349775314331, "learning_rate": 5.007154528430841e-06, "loss": 2.1086, "step": 8460 }, { "epoch": 0.8657036394262672, "grad_norm": 2.1494014263153076, "learning_rate": 4.860930335390657e-06, "loss": 2.1839, "step": 8480 }, { "epoch": 0.8677453932928385, "grad_norm": 1.666319489479065, "learning_rate": 4.716763854369771e-06, "loss": 2.0493, "step": 8500 }, { "epoch": 0.86978714715941, "grad_norm": 1.9102964401245117, "learning_rate": 4.574661657349005e-06, "loss": 2.1334, "step": 8520 }, { "epoch": 0.8718289010259813, "grad_norm": 1.6554877758026123, "learning_rate": 4.434630222206643e-06, "loss": 2.1448, "step": 8540 }, { "epoch": 0.8738706548925527, "grad_norm": 1.9615174531936646, "learning_rate": 4.296675932423094e-06, "loss": 2.1221, "step": 8560 }, { "epoch": 0.8759124087591241, "grad_norm": 1.9856176376342773, "learning_rate": 4.160805076789925e-06, "loss": 2.1819, "step": 8580 }, { "epoch": 0.8779541626256955, "grad_norm": 1.932769536972046, "learning_rate": 4.027023849123157e-06, "loss": 2.1868, "step": 8600 }, { "epoch": 0.8799959164922668, "grad_norm": 2.1446657180786133, "learning_rate": 3.895338347980898e-06, "loss": 2.0986, "step": 8620 }, { "epoch": 0.8820376703588383, "grad_norm": 1.873093605041504, "learning_rate": 3.7657545763853885e-06, "loss": 2.1396, "step": 8640 }, { "epoch": 0.8840794242254096, "grad_norm": 1.8940656185150146, "learning_rate": 3.6382784415492975e-06, "loss": 2.1264, "step": 8660 }, { "epoch": 0.886121178091981, "grad_norm": 1.9061484336853027, "learning_rate": 3.5129157546064494e-06, "loss": 2.071, "step": 8680 }, { "epoch": 0.8881629319585524, "grad_norm": 1.9720280170440674, "learning_rate": 3.389672230346946e-06, "loss": 2.1745, "step": 8700 }, { "epoch": 0.8902046858251238, "grad_norm": 2.052313804626465, "learning_rate": 3.2685534869565827e-06, "loss": 2.1501, "step": 8720 }, { "epoch": 0.8922464396916951, "grad_norm": 1.8765654563903809, "learning_rate": 3.149565045760827e-06, "loss": 2.1464, "step": 8740 }, { "epoch": 0.8942881935582665, "grad_norm": 2.0794930458068848, "learning_rate": 3.0327123309730476e-06, "loss": 2.1361, "step": 8760 }, { "epoch": 0.896329947424838, "grad_norm": 2.288625955581665, "learning_rate": 2.9180006694472906e-06, "loss": 2.1158, "step": 8780 }, { "epoch": 0.8983717012914093, "grad_norm": 1.6408655643463135, "learning_rate": 2.805435290435432e-06, "loss": 2.1969, "step": 8800 }, { "epoch": 0.9004134551579807, "grad_norm": 2.131709098815918, "learning_rate": 2.6950213253487957e-06, "loss": 2.1485, "step": 8820 }, { "epoch": 0.9024552090245521, "grad_norm": 1.820241093635559, "learning_rate": 2.5867638075242453e-06, "loss": 2.0993, "step": 8840 }, { "epoch": 0.9044969628911235, "grad_norm": 1.9178062677383423, "learning_rate": 2.4806676719947076e-06, "loss": 2.1869, "step": 8860 }, { "epoch": 0.9065387167576948, "grad_norm": 1.8189531564712524, "learning_rate": 2.376737755264252e-06, "loss": 2.1455, "step": 8880 }, { "epoch": 0.9085804706242663, "grad_norm": 1.8824524879455566, "learning_rate": 2.2749787950875456e-06, "loss": 2.1327, "step": 8900 }, { "epoch": 0.9106222244908376, "grad_norm": 1.9590646028518677, "learning_rate": 2.1753954302539457e-06, "loss": 2.1054, "step": 8920 }, { "epoch": 0.912663978357409, "grad_norm": 1.8179956674575806, "learning_rate": 2.0779922003759834e-06, "loss": 2.1446, "step": 8940 }, { "epoch": 0.9147057322239804, "grad_norm": 1.7947405576705933, "learning_rate": 1.982773545682459e-06, "loss": 2.1537, "step": 8960 }, { "epoch": 0.9167474860905518, "grad_norm": 1.7115275859832764, "learning_rate": 1.8897438068160133e-06, "loss": 2.2059, "step": 8980 }, { "epoch": 0.9187892399571231, "grad_norm": 1.7798532247543335, "learning_rate": 1.7989072246352268e-06, "loss": 2.1772, "step": 9000 }, { "epoch": 0.9208309938236946, "grad_norm": 1.7296078205108643, "learning_rate": 1.7102679400213595e-06, "loss": 2.1178, "step": 9020 }, { "epoch": 0.922872747690266, "grad_norm": 1.825629472732544, "learning_rate": 1.6238299936895296e-06, "loss": 2.1715, "step": 9040 }, { "epoch": 0.9249145015568373, "grad_norm": 2.0452632904052734, "learning_rate": 1.539597326004527e-06, "loss": 2.0765, "step": 9060 }, { "epoch": 0.9269562554234088, "grad_norm": 1.9469128847122192, "learning_rate": 1.4575737768012188e-06, "loss": 2.1487, "step": 9080 }, { "epoch": 0.9289980092899801, "grad_norm": 1.9578220844268799, "learning_rate": 1.377763085209438e-06, "loss": 2.1755, "step": 9100 }, { "epoch": 0.9310397631565515, "grad_norm": 2.009042263031006, "learning_rate": 1.300168889483605e-06, "loss": 2.1552, "step": 9120 }, { "epoch": 0.9330815170231228, "grad_norm": 1.8502835035324097, "learning_rate": 1.2247947268368364e-06, "loss": 2.1131, "step": 9140 }, { "epoch": 0.9351232708896943, "grad_norm": 1.836517333984375, "learning_rate": 1.1516440332797029e-06, "loss": 2.1524, "step": 9160 }, { "epoch": 0.9371650247562656, "grad_norm": 1.7254841327667236, "learning_rate": 1.0807201434635873e-06, "loss": 2.1379, "step": 9180 }, { "epoch": 0.939206778622837, "grad_norm": 2.0271339416503906, "learning_rate": 1.0120262905286893e-06, "loss": 2.1637, "step": 9200 }, { "epoch": 0.9412485324894084, "grad_norm": 1.8865677118301392, "learning_rate": 9.45565605956622e-07, "loss": 2.1046, "step": 9220 }, { "epoch": 0.9432902863559798, "grad_norm": 2.141535758972168, "learning_rate": 8.813411194276755e-07, "loss": 2.1169, "step": 9240 }, { "epoch": 0.9453320402225511, "grad_norm": 1.7960325479507446, "learning_rate": 8.193557586826883e-07, "loss": 2.126, "step": 9260 }, { "epoch": 0.9473737940891226, "grad_norm": 1.9488415718078613, "learning_rate": 7.596123493895991e-07, "loss": 2.183, "step": 9280 }, { "epoch": 0.9494155479556939, "grad_norm": 1.852903962135315, "learning_rate": 7.021136150146268e-07, "loss": 2.1821, "step": 9300 }, { "epoch": 0.9514573018222653, "grad_norm": 1.7173832654953003, "learning_rate": 6.468621766981154e-07, "loss": 2.088, "step": 9320 }, { "epoch": 0.9534990556888367, "grad_norm": 1.7861875295639038, "learning_rate": 5.938605531350616e-07, "loss": 2.1763, "step": 9340 }, { "epoch": 0.9555408095554081, "grad_norm": 1.798170566558838, "learning_rate": 5.431111604602856e-07, "loss": 2.1429, "step": 9360 }, { "epoch": 0.9575825634219794, "grad_norm": 2.071655750274658, "learning_rate": 4.946163121382796e-07, "loss": 2.113, "step": 9380 }, { "epoch": 0.9596243172885509, "grad_norm": 1.9325952529907227, "learning_rate": 4.4837821885777564e-07, "loss": 2.0987, "step": 9400 }, { "epoch": 0.9616660711551223, "grad_norm": 1.6762099266052246, "learning_rate": 4.0439898843093736e-07, "loss": 2.1871, "step": 9420 }, { "epoch": 0.9637078250216936, "grad_norm": 1.692571759223938, "learning_rate": 3.6268062569729256e-07, "loss": 2.1638, "step": 9440 }, { "epoch": 0.9657495788882651, "grad_norm": 1.8023351430892944, "learning_rate": 3.232250324323338e-07, "loss": 2.2017, "step": 9460 }, { "epoch": 0.9677913327548364, "grad_norm": 1.7618221044540405, "learning_rate": 2.860340072608214e-07, "loss": 2.1171, "step": 9480 }, { "epoch": 0.9698330866214078, "grad_norm": 1.7170206308364868, "learning_rate": 2.511092455747932e-07, "loss": 2.1285, "step": 9500 }, { "epoch": 0.9718748404879791, "grad_norm": 1.8265429735183716, "learning_rate": 2.1845233945629873e-07, "loss": 2.1049, "step": 9520 }, { "epoch": 0.9739165943545506, "grad_norm": 1.7483819723129272, "learning_rate": 1.8806477760477948e-07, "loss": 2.1843, "step": 9540 }, { "epoch": 0.9759583482211219, "grad_norm": 1.9647390842437744, "learning_rate": 1.5994794526923983e-07, "loss": 2.0758, "step": 9560 }, { "epoch": 0.9780001020876933, "grad_norm": 2.0246357917785645, "learning_rate": 1.3410312418508098e-07, "loss": 2.1304, "step": 9580 }, { "epoch": 0.9800418559542647, "grad_norm": 1.840067744255066, "learning_rate": 1.105314925156975e-07, "loss": 2.1085, "step": 9600 }, { "epoch": 0.9820836098208361, "grad_norm": 1.9115033149719238, "learning_rate": 8.923412479873161e-08, "loss": 2.1393, "step": 9620 }, { "epoch": 0.9841253636874074, "grad_norm": 1.9047495126724243, "learning_rate": 7.021199189711225e-08, "loss": 2.1429, "step": 9640 }, { "epoch": 0.9861671175539789, "grad_norm": 1.8892066478729248, "learning_rate": 5.346596095480161e-08, "loss": 2.1316, "step": 9660 }, { "epoch": 0.9882088714205502, "grad_norm": 2.1352336406707764, "learning_rate": 3.8996795357254535e-08, "loss": 2.1511, "step": 9680 }, { "epoch": 0.9902506252871216, "grad_norm": 1.6663304567337036, "learning_rate": 2.6805154696613045e-08, "loss": 2.0825, "step": 9700 }, { "epoch": 0.9922923791536931, "grad_norm": 1.8192744255065918, "learning_rate": 1.6891594741663686e-08, "loss": 2.0561, "step": 9720 }, { "epoch": 0.9943341330202644, "grad_norm": 1.7815369367599487, "learning_rate": 9.256567412463391e-09, "loss": 2.1337, "step": 9740 }, { "epoch": 0.9963758868868358, "grad_norm": 1.8240342140197754, "learning_rate": 3.9004207597836965e-09, "loss": 2.1085, "step": 9760 }, { "epoch": 0.9984176407534072, "grad_norm": 1.8592042922973633, "learning_rate": 8.233989492012484e-10, "loss": 2.1937, "step": 9780 }, { "epoch": 1.0, "step": 9796, "total_flos": 8.040692801377455e+17, "train_loss": 2.8003817188734907, "train_runtime": 2355.0829, "train_samples_per_second": 133.092, "train_steps_per_second": 4.16 } ], "logging_steps": 20, "max_steps": 9796, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.040692801377455e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }