{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007616146230007616, "grad_norm": 9.639493086797174, "learning_rate": 8.333333333333333e-07, "loss": 1.2528, "step": 10 }, { "epoch": 0.015232292460015232, "grad_norm": 8.49004769576833, "learning_rate": 2.5e-06, "loss": 0.5082, "step": 20 }, { "epoch": 0.02284843869002285, "grad_norm": 6.555158164896051, "learning_rate": 4.166666666666667e-06, "loss": 0.3399, "step": 30 }, { "epoch": 0.030464584920030464, "grad_norm": 8.564554290087065, "learning_rate": 4.9805144193296965e-06, "loss": 0.3035, "step": 40 }, { "epoch": 0.03808073115003808, "grad_norm": 6.702661496981953, "learning_rate": 4.9454403741231495e-06, "loss": 0.3163, "step": 50 }, { "epoch": 0.0456968773800457, "grad_norm": 11.658563206953636, "learning_rate": 4.906469212782542e-06, "loss": 0.3014, "step": 60 }, { "epoch": 0.053313023610053314, "grad_norm": 9.653812130588781, "learning_rate": 4.867498051441934e-06, "loss": 0.28, "step": 70 }, { "epoch": 0.06092916984006093, "grad_norm": 6.566810094632355, "learning_rate": 4.832424006235386e-06, "loss": 0.2801, "step": 80 }, { "epoch": 0.06854531607006854, "grad_norm": 9.977494581893676, "learning_rate": 4.793452844894778e-06, "loss": 0.299, "step": 90 }, { "epoch": 0.07616146230007616, "grad_norm": 3.7598332928698057, "learning_rate": 4.75448168355417e-06, "loss": 0.2936, "step": 100 }, { "epoch": 0.08377760853008377, "grad_norm": 3.5678745009701713, "learning_rate": 4.715510522213562e-06, "loss": 0.3044, "step": 110 }, { "epoch": 0.0913937547600914, "grad_norm": 13.805243688595516, "learning_rate": 4.676539360872954e-06, "loss": 0.2683, "step": 120 }, { "epoch": 0.09900990099009901, "grad_norm": 5.852174401983805, "learning_rate": 4.6375681995323465e-06, "loss": 0.2444, "step": 130 }, { "epoch": 0.10662604722010663, "grad_norm": 7.855194077383853, "learning_rate": 4.5985970381917386e-06, "loss": 0.3041, "step": 140 }, { "epoch": 0.11424219345011424, "grad_norm": 8.043209218403092, "learning_rate": 4.559625876851131e-06, "loss": 0.2442, "step": 150 }, { "epoch": 0.12185833968012186, "grad_norm": 25.144531941751673, "learning_rate": 4.520654715510523e-06, "loss": 0.272, "step": 160 }, { "epoch": 0.12947448591012947, "grad_norm": 6.509174092287003, "learning_rate": 4.481683554169914e-06, "loss": 0.2623, "step": 170 }, { "epoch": 0.1370906321401371, "grad_norm": 9.500128771255806, "learning_rate": 4.442712392829306e-06, "loss": 0.2665, "step": 180 }, { "epoch": 0.1447067783701447, "grad_norm": 3.607016552182795, "learning_rate": 4.403741231488698e-06, "loss": 0.2802, "step": 190 }, { "epoch": 0.15232292460015232, "grad_norm": 4.42691540524899, "learning_rate": 4.36477007014809e-06, "loss": 0.2363, "step": 200 }, { "epoch": 0.15993907083015993, "grad_norm": 3.3159003473586197, "learning_rate": 4.3257989088074824e-06, "loss": 0.2326, "step": 210 }, { "epoch": 0.16755521706016754, "grad_norm": 4.689165992829481, "learning_rate": 4.286827747466875e-06, "loss": 0.2433, "step": 220 }, { "epoch": 0.17517136329017516, "grad_norm": 9.737741759699885, "learning_rate": 4.2478565861262675e-06, "loss": 0.2467, "step": 230 }, { "epoch": 0.1827875095201828, "grad_norm": 2.836341979209339, "learning_rate": 4.20888542478566e-06, "loss": 0.2391, "step": 240 }, { "epoch": 0.19040365575019041, "grad_norm": 1.5359361442871327, "learning_rate": 4.169914263445051e-06, "loss": 0.2367, "step": 250 }, { "epoch": 0.19801980198019803, "grad_norm": 4.250993517900826, "learning_rate": 4.130943102104443e-06, "loss": 0.2719, "step": 260 }, { "epoch": 0.20563594821020564, "grad_norm": 2.537220060582072, "learning_rate": 4.091971940763835e-06, "loss": 0.2269, "step": 270 }, { "epoch": 0.21325209444021326, "grad_norm": 1.3331572997631018, "learning_rate": 4.053000779423227e-06, "loss": 0.2316, "step": 280 }, { "epoch": 0.22086824067022087, "grad_norm": 3.608424270155421, "learning_rate": 4.014029618082619e-06, "loss": 0.248, "step": 290 }, { "epoch": 0.2284843869002285, "grad_norm": 4.4991481026863225, "learning_rate": 3.975058456742011e-06, "loss": 0.2397, "step": 300 }, { "epoch": 0.2361005331302361, "grad_norm": 3.8860134497329843, "learning_rate": 3.9360872954014035e-06, "loss": 0.2615, "step": 310 }, { "epoch": 0.24371667936024372, "grad_norm": 4.04363997587026, "learning_rate": 3.897116134060796e-06, "loss": 0.2428, "step": 320 }, { "epoch": 0.25133282559025133, "grad_norm": 1.8755213132629207, "learning_rate": 3.858144972720188e-06, "loss": 0.2192, "step": 330 }, { "epoch": 0.25894897182025894, "grad_norm": 6.113234551860946, "learning_rate": 3.81917381137958e-06, "loss": 0.2393, "step": 340 }, { "epoch": 0.26656511805026656, "grad_norm": 9.925025613514833, "learning_rate": 3.7802026500389715e-06, "loss": 0.2229, "step": 350 }, { "epoch": 0.2741812642802742, "grad_norm": 5.214638205967677, "learning_rate": 3.7412314886983636e-06, "loss": 0.2224, "step": 360 }, { "epoch": 0.2817974105102818, "grad_norm": 6.087030860156098, "learning_rate": 3.7022603273577557e-06, "loss": 0.2173, "step": 370 }, { "epoch": 0.2894135567402894, "grad_norm": 6.180290616323521, "learning_rate": 3.6632891660171478e-06, "loss": 0.2144, "step": 380 }, { "epoch": 0.297029702970297, "grad_norm": 2.402073266844856, "learning_rate": 3.62431800467654e-06, "loss": 0.2118, "step": 390 }, { "epoch": 0.30464584920030463, "grad_norm": 6.174997984528584, "learning_rate": 3.5853468433359316e-06, "loss": 0.2615, "step": 400 }, { "epoch": 0.31226199543031224, "grad_norm": 8.879077685914272, "learning_rate": 3.5463756819953237e-06, "loss": 0.2529, "step": 410 }, { "epoch": 0.31987814166031986, "grad_norm": 5.468782572651813, "learning_rate": 3.5074045206547158e-06, "loss": 0.2669, "step": 420 }, { "epoch": 0.3274942878903275, "grad_norm": 7.406738215292078, "learning_rate": 3.468433359314108e-06, "loss": 0.2632, "step": 430 }, { "epoch": 0.3351104341203351, "grad_norm": 3.70629740090698, "learning_rate": 3.4294621979735e-06, "loss": 0.2412, "step": 440 }, { "epoch": 0.3427265803503427, "grad_norm": 3.3764650248375676, "learning_rate": 3.390491036632892e-06, "loss": 0.2461, "step": 450 }, { "epoch": 0.3503427265803503, "grad_norm": 4.286293923557235, "learning_rate": 3.3515198752922838e-06, "loss": 0.2367, "step": 460 }, { "epoch": 0.357958872810358, "grad_norm": 3.2261118617888767, "learning_rate": 3.312548713951676e-06, "loss": 0.2508, "step": 470 }, { "epoch": 0.3655750190403656, "grad_norm": 2.000066076371398, "learning_rate": 3.273577552611068e-06, "loss": 0.2355, "step": 480 }, { "epoch": 0.3731911652703732, "grad_norm": 1.6980552649530136, "learning_rate": 3.23460639127046e-06, "loss": 0.2329, "step": 490 }, { "epoch": 0.38080731150038083, "grad_norm": 3.5865596510799524, "learning_rate": 3.195635229929852e-06, "loss": 0.2463, "step": 500 }, { "epoch": 0.38842345773038844, "grad_norm": 2.0737159974836428, "learning_rate": 3.1566640685892443e-06, "loss": 0.2296, "step": 510 }, { "epoch": 0.39603960396039606, "grad_norm": 12.615894511310687, "learning_rate": 3.117692907248636e-06, "loss": 0.2479, "step": 520 }, { "epoch": 0.4036557501904037, "grad_norm": 2.941177874523108, "learning_rate": 3.078721745908028e-06, "loss": 0.2382, "step": 530 }, { "epoch": 0.4112718964204113, "grad_norm": 3.4235048536712824, "learning_rate": 3.03975058456742e-06, "loss": 0.2655, "step": 540 }, { "epoch": 0.4188880426504189, "grad_norm": 2.280585583419532, "learning_rate": 3.0007794232268123e-06, "loss": 0.226, "step": 550 }, { "epoch": 0.4265041888804265, "grad_norm": 4.192992543465362, "learning_rate": 2.9618082618862044e-06, "loss": 0.237, "step": 560 }, { "epoch": 0.43412033511043413, "grad_norm": 4.376833355588179, "learning_rate": 2.9228371005455965e-06, "loss": 0.2354, "step": 570 }, { "epoch": 0.44173648134044174, "grad_norm": 5.5764858101422465, "learning_rate": 2.883865939204988e-06, "loss": 0.2377, "step": 580 }, { "epoch": 0.44935262757044936, "grad_norm": 11.079112280738615, "learning_rate": 2.8448947778643803e-06, "loss": 0.2302, "step": 590 }, { "epoch": 0.456968773800457, "grad_norm": 4.653154577810809, "learning_rate": 2.8059236165237724e-06, "loss": 0.2271, "step": 600 }, { "epoch": 0.4645849200304646, "grad_norm": 5.627141358500255, "learning_rate": 2.7669524551831645e-06, "loss": 0.214, "step": 610 }, { "epoch": 0.4722010662604722, "grad_norm": 3.0799884402462583, "learning_rate": 2.7279812938425566e-06, "loss": 0.2401, "step": 620 }, { "epoch": 0.4798172124904798, "grad_norm": 2.9094630692232566, "learning_rate": 2.6890101325019487e-06, "loss": 0.2156, "step": 630 }, { "epoch": 0.48743335872048743, "grad_norm": 4.7368426822797955, "learning_rate": 2.6500389711613408e-06, "loss": 0.2398, "step": 640 }, { "epoch": 0.49504950495049505, "grad_norm": 2.694907521185521, "learning_rate": 2.6110678098207325e-06, "loss": 0.2005, "step": 650 }, { "epoch": 0.5026656511805027, "grad_norm": 4.336379900966037, "learning_rate": 2.572096648480125e-06, "loss": 0.1999, "step": 660 }, { "epoch": 0.5102817974105103, "grad_norm": 5.533701246102975, "learning_rate": 2.533125487139517e-06, "loss": 0.2066, "step": 670 }, { "epoch": 0.5178979436405179, "grad_norm": 4.264409295509442, "learning_rate": 2.494154325798909e-06, "loss": 0.2127, "step": 680 }, { "epoch": 0.5255140898705255, "grad_norm": 6.757877305010082, "learning_rate": 2.455183164458301e-06, "loss": 0.2107, "step": 690 }, { "epoch": 0.5331302361005331, "grad_norm": 21.156572431608588, "learning_rate": 2.416212003117693e-06, "loss": 0.2004, "step": 700 }, { "epoch": 0.5407463823305407, "grad_norm": 7.9735056443944154, "learning_rate": 2.377240841777085e-06, "loss": 0.2365, "step": 710 }, { "epoch": 0.5483625285605483, "grad_norm": 5.553215674492296, "learning_rate": 2.338269680436477e-06, "loss": 0.1814, "step": 720 }, { "epoch": 0.555978674790556, "grad_norm": 94.69988106143563, "learning_rate": 2.3031956352299302e-06, "loss": 0.1908, "step": 730 }, { "epoch": 0.5635948210205636, "grad_norm": 1.649907078405919, "learning_rate": 2.264224473889322e-06, "loss": 0.1883, "step": 740 }, { "epoch": 0.5712109672505712, "grad_norm": 2.6805491138942052, "learning_rate": 2.225253312548714e-06, "loss": 0.2365, "step": 750 }, { "epoch": 0.5788271134805788, "grad_norm": 7.23861556333092, "learning_rate": 2.186282151208106e-06, "loss": 0.1761, "step": 760 }, { "epoch": 0.5864432597105864, "grad_norm": 3.2894178397808846, "learning_rate": 2.1473109898674982e-06, "loss": 0.2062, "step": 770 }, { "epoch": 0.594059405940594, "grad_norm": 10.630820097778582, "learning_rate": 2.1083398285268903e-06, "loss": 0.221, "step": 780 }, { "epoch": 0.6016755521706016, "grad_norm": 4.304468571093565, "learning_rate": 2.0693686671862824e-06, "loss": 0.1962, "step": 790 }, { "epoch": 0.6092916984006093, "grad_norm": 25.23668397466602, "learning_rate": 2.030397505845674e-06, "loss": 0.2064, "step": 800 }, { "epoch": 0.6169078446306169, "grad_norm": 7.801217919207177, "learning_rate": 1.9914263445050662e-06, "loss": 0.1903, "step": 810 }, { "epoch": 0.6245239908606245, "grad_norm": 2.751715853008115, "learning_rate": 1.9524551831644583e-06, "loss": 0.201, "step": 820 }, { "epoch": 0.6321401370906321, "grad_norm": 5.690806891610993, "learning_rate": 1.9134840218238504e-06, "loss": 0.1945, "step": 830 }, { "epoch": 0.6397562833206397, "grad_norm": 14.562381559438334, "learning_rate": 1.8745128604832425e-06, "loss": 0.1825, "step": 840 }, { "epoch": 0.6473724295506473, "grad_norm": 3.134405388535424, "learning_rate": 1.8355416991426344e-06, "loss": 0.2219, "step": 850 }, { "epoch": 0.654988575780655, "grad_norm": 4.669745434608825, "learning_rate": 1.7965705378020267e-06, "loss": 0.2153, "step": 860 }, { "epoch": 0.6626047220106626, "grad_norm": 7.086116154905358, "learning_rate": 1.7575993764614188e-06, "loss": 0.2205, "step": 870 }, { "epoch": 0.6702208682406702, "grad_norm": 3.572081402217666, "learning_rate": 1.718628215120811e-06, "loss": 0.1877, "step": 880 }, { "epoch": 0.6778370144706778, "grad_norm": 2.925715103242468, "learning_rate": 1.6796570537802028e-06, "loss": 0.2187, "step": 890 }, { "epoch": 0.6854531607006854, "grad_norm": 9.493856956533788, "learning_rate": 1.640685892439595e-06, "loss": 0.2125, "step": 900 }, { "epoch": 0.693069306930693, "grad_norm": 27.037214056603638, "learning_rate": 1.601714731098987e-06, "loss": 0.2317, "step": 910 }, { "epoch": 0.7006854531607006, "grad_norm": 4.472164841493445, "learning_rate": 1.562743569758379e-06, "loss": 0.2044, "step": 920 }, { "epoch": 0.7083015993907082, "grad_norm": 16.172978825850585, "learning_rate": 1.523772408417771e-06, "loss": 0.1894, "step": 930 }, { "epoch": 0.715917745620716, "grad_norm": 4.881279517002309, "learning_rate": 1.4848012470771631e-06, "loss": 0.2003, "step": 940 }, { "epoch": 0.7235338918507236, "grad_norm": 3.2541774506080903, "learning_rate": 1.445830085736555e-06, "loss": 0.19, "step": 950 }, { "epoch": 0.7311500380807312, "grad_norm": 4.81369927464371, "learning_rate": 1.4068589243959471e-06, "loss": 0.2147, "step": 960 }, { "epoch": 0.7387661843107388, "grad_norm": 4.714739168817072, "learning_rate": 1.3678877630553392e-06, "loss": 0.216, "step": 970 }, { "epoch": 0.7463823305407464, "grad_norm": 17.223534266258675, "learning_rate": 1.3289166017147311e-06, "loss": 0.1985, "step": 980 }, { "epoch": 0.753998476770754, "grad_norm": 8.493294082063118, "learning_rate": 1.2899454403741232e-06, "loss": 0.1977, "step": 990 }, { "epoch": 0.7616146230007617, "grad_norm": 9.836008899951883, "learning_rate": 1.2509742790335153e-06, "loss": 0.2164, "step": 1000 }, { "epoch": 0.7692307692307693, "grad_norm": 6.722203243193697, "learning_rate": 1.2120031176929072e-06, "loss": 0.1702, "step": 1010 }, { "epoch": 0.7768469154607769, "grad_norm": 6.285912185515779, "learning_rate": 1.1730319563522993e-06, "loss": 0.1839, "step": 1020 }, { "epoch": 0.7844630616907845, "grad_norm": 4.843126546344412, "learning_rate": 1.1340607950116914e-06, "loss": 0.1821, "step": 1030 }, { "epoch": 0.7920792079207921, "grad_norm": 4.402783083650455, "learning_rate": 1.0950896336710835e-06, "loss": 0.2193, "step": 1040 }, { "epoch": 0.7996953541507997, "grad_norm": 3.2568451258156808, "learning_rate": 1.0561184723304756e-06, "loss": 0.2043, "step": 1050 }, { "epoch": 0.8073115003808073, "grad_norm": 161.43472406681008, "learning_rate": 1.0171473109898675e-06, "loss": 0.1897, "step": 1060 }, { "epoch": 0.814927646610815, "grad_norm": 5.379668281357475, "learning_rate": 9.781761496492596e-07, "loss": 0.2093, "step": 1070 }, { "epoch": 0.8225437928408226, "grad_norm": 13.223854900370439, "learning_rate": 9.392049883086516e-07, "loss": 0.2536, "step": 1080 }, { "epoch": 0.8301599390708302, "grad_norm": 2.5590636509915097, "learning_rate": 9.002338269680437e-07, "loss": 0.1686, "step": 1090 }, { "epoch": 0.8377760853008378, "grad_norm": 2.4022953941248506, "learning_rate": 8.612626656274357e-07, "loss": 0.1858, "step": 1100 }, { "epoch": 0.8453922315308454, "grad_norm": 5.610490607350316, "learning_rate": 8.222915042868277e-07, "loss": 0.2067, "step": 1110 }, { "epoch": 0.853008377760853, "grad_norm": 2.6949684335315385, "learning_rate": 7.833203429462198e-07, "loss": 0.2081, "step": 1120 }, { "epoch": 0.8606245239908606, "grad_norm": 2.363658845139482, "learning_rate": 7.443491816056118e-07, "loss": 0.2167, "step": 1130 }, { "epoch": 0.8682406702208683, "grad_norm": 54.850194962785416, "learning_rate": 7.05378020265004e-07, "loss": 0.1661, "step": 1140 }, { "epoch": 0.8758568164508759, "grad_norm": 2.4702676793543312, "learning_rate": 6.66406858924396e-07, "loss": 0.1837, "step": 1150 }, { "epoch": 0.8834729626808835, "grad_norm": 1.291212127209575, "learning_rate": 6.27435697583788e-07, "loss": 0.1639, "step": 1160 }, { "epoch": 0.8910891089108911, "grad_norm": 4.989079218052827, "learning_rate": 5.884645362431801e-07, "loss": 0.1941, "step": 1170 }, { "epoch": 0.8987052551408987, "grad_norm": 3.934798653103373, "learning_rate": 5.494933749025721e-07, "loss": 0.2115, "step": 1180 }, { "epoch": 0.9063214013709063, "grad_norm": 8.76267158776298, "learning_rate": 5.105222135619641e-07, "loss": 0.1888, "step": 1190 }, { "epoch": 0.913937547600914, "grad_norm": 37.34043564266434, "learning_rate": 4.715510522213562e-07, "loss": 0.1932, "step": 1200 }, { "epoch": 0.9215536938309216, "grad_norm": 5.936087774353652, "learning_rate": 4.3257989088074824e-07, "loss": 0.201, "step": 1210 }, { "epoch": 0.9291698400609292, "grad_norm": 3.2120948066407005, "learning_rate": 3.9360872954014035e-07, "loss": 0.2004, "step": 1220 }, { "epoch": 0.9367859862909368, "grad_norm": 2.2714590166034525, "learning_rate": 3.546375681995324e-07, "loss": 0.1936, "step": 1230 }, { "epoch": 0.9444021325209444, "grad_norm": 3.1017617351534406, "learning_rate": 3.1566640685892445e-07, "loss": 0.1925, "step": 1240 }, { "epoch": 0.952018278750952, "grad_norm": 3.814780236337704, "learning_rate": 2.7669524551831645e-07, "loss": 0.2221, "step": 1250 }, { "epoch": 0.9596344249809596, "grad_norm": 7.0716534670331, "learning_rate": 2.3772408417770852e-07, "loss": 0.1985, "step": 1260 }, { "epoch": 0.9672505712109672, "grad_norm": 48.227925344783095, "learning_rate": 1.9875292283710057e-07, "loss": 0.1864, "step": 1270 }, { "epoch": 0.9748667174409749, "grad_norm": 4.310515199128544, "learning_rate": 1.597817614964926e-07, "loss": 0.2306, "step": 1280 }, { "epoch": 0.9824828636709825, "grad_norm": 5.071672951133308, "learning_rate": 1.2081060015588465e-07, "loss": 0.1761, "step": 1290 }, { "epoch": 0.9900990099009901, "grad_norm": 3.32925400548507, "learning_rate": 8.18394388152767e-08, "loss": 0.1799, "step": 1300 }, { "epoch": 0.9977151561309977, "grad_norm": 3.1779605362401053, "learning_rate": 4.286827747466875e-08, "loss": 0.1876, "step": 1310 } ], "logging_steps": 10, "max_steps": 1313, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1244660420050944.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }