{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9093078758949882, "eval_steps": 200, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011933174224343675, "grad_norm": 1080.0, "learning_rate": 5.588235294117647e-06, "loss": 39.624, "step": 20 }, { "epoch": 0.02386634844868735, "grad_norm": 230.0, "learning_rate": 1.1470588235294118e-05, "loss": 19.6023, "step": 40 }, { "epoch": 0.03579952267303103, "grad_norm": 1352.0, "learning_rate": 1.735294117647059e-05, "loss": 13.7646, "step": 60 }, { "epoch": 0.0477326968973747, "grad_norm": 332.0, "learning_rate": 1.993300852618758e-05, "loss": 8.7673, "step": 80 }, { "epoch": 0.059665871121718374, "grad_norm": 103.0, "learning_rate": 1.9811205846528625e-05, "loss": 3.2738, "step": 100 }, { "epoch": 0.07159904534606205, "grad_norm": 24.875, "learning_rate": 1.968940316686967e-05, "loss": 1.0399, "step": 120 }, { "epoch": 0.08353221957040573, "grad_norm": 19.75, "learning_rate": 1.956760048721072e-05, "loss": 0.7366, "step": 140 }, { "epoch": 0.0954653937947494, "grad_norm": 15.1875, "learning_rate": 1.9445797807551767e-05, "loss": 0.6329, "step": 160 }, { "epoch": 0.10739856801909307, "grad_norm": 14.4375, "learning_rate": 1.9323995127892817e-05, "loss": 0.5967, "step": 180 }, { "epoch": 0.11933174224343675, "grad_norm": 13.4375, "learning_rate": 1.9202192448233863e-05, "loss": 0.5892, "step": 200 }, { "epoch": 0.11933174224343675, "eval_loss": 0.24632702767848969, "eval_runtime": 29.1771, "eval_samples_per_second": 112.486, "eval_steps_per_second": 7.06, "step": 200 }, { "epoch": 0.13126491646778043, "grad_norm": 18.25, "learning_rate": 1.908038976857491e-05, "loss": 0.5012, "step": 220 }, { "epoch": 0.1431980906921241, "grad_norm": 11.125, "learning_rate": 1.895858708891596e-05, "loss": 0.5417, "step": 240 }, { "epoch": 0.15513126491646778, "grad_norm": 14.625, "learning_rate": 1.8836784409257005e-05, "loss": 0.5213, "step": 260 }, { "epoch": 0.16706443914081145, "grad_norm": 10.375, "learning_rate": 1.871498172959805e-05, "loss": 0.4793, "step": 280 }, { "epoch": 0.17899761336515513, "grad_norm": 12.75, "learning_rate": 1.85931790499391e-05, "loss": 0.5192, "step": 300 }, { "epoch": 0.1909307875894988, "grad_norm": 11.9375, "learning_rate": 1.8471376370280146e-05, "loss": 0.4576, "step": 320 }, { "epoch": 0.20286396181384247, "grad_norm": 9.5625, "learning_rate": 1.8349573690621192e-05, "loss": 0.4647, "step": 340 }, { "epoch": 0.21479713603818615, "grad_norm": 13.25, "learning_rate": 1.8227771010962242e-05, "loss": 0.4639, "step": 360 }, { "epoch": 0.22673031026252982, "grad_norm": 9.8125, "learning_rate": 1.8105968331303288e-05, "loss": 0.4751, "step": 380 }, { "epoch": 0.2386634844868735, "grad_norm": 10.1875, "learning_rate": 1.7984165651644338e-05, "loss": 0.4499, "step": 400 }, { "epoch": 0.2386634844868735, "eval_loss": 0.1926695704460144, "eval_runtime": 25.5499, "eval_samples_per_second": 128.454, "eval_steps_per_second": 8.063, "step": 400 }, { "epoch": 0.25059665871121717, "grad_norm": 11.375, "learning_rate": 1.7862362971985384e-05, "loss": 0.4325, "step": 420 }, { "epoch": 0.26252983293556087, "grad_norm": 12.6875, "learning_rate": 1.7740560292326433e-05, "loss": 0.4248, "step": 440 }, { "epoch": 0.2744630071599045, "grad_norm": 9.375, "learning_rate": 1.761875761266748e-05, "loss": 0.4425, "step": 460 }, { "epoch": 0.2863961813842482, "grad_norm": 9.4375, "learning_rate": 1.749695493300853e-05, "loss": 0.4535, "step": 480 }, { "epoch": 0.29832935560859186, "grad_norm": 11.4375, "learning_rate": 1.7375152253349575e-05, "loss": 0.4012, "step": 500 }, { "epoch": 0.31026252983293556, "grad_norm": 10.5625, "learning_rate": 1.7253349573690625e-05, "loss": 0.4603, "step": 520 }, { "epoch": 0.3221957040572792, "grad_norm": 9.75, "learning_rate": 1.713154689403167e-05, "loss": 0.4396, "step": 540 }, { "epoch": 0.3341288782816229, "grad_norm": 9.1875, "learning_rate": 1.7009744214372717e-05, "loss": 0.4142, "step": 560 }, { "epoch": 0.3460620525059666, "grad_norm": 8.8125, "learning_rate": 1.6887941534713767e-05, "loss": 0.3439, "step": 580 }, { "epoch": 0.35799522673031026, "grad_norm": 11.875, "learning_rate": 1.6766138855054813e-05, "loss": 0.4196, "step": 600 }, { "epoch": 0.35799522673031026, "eval_loss": 0.17961257696151733, "eval_runtime": 25.3721, "eval_samples_per_second": 129.354, "eval_steps_per_second": 8.119, "step": 600 }, { "epoch": 0.36992840095465396, "grad_norm": 9.75, "learning_rate": 1.6644336175395862e-05, "loss": 0.3922, "step": 620 }, { "epoch": 0.3818615751789976, "grad_norm": 25.5, "learning_rate": 1.652253349573691e-05, "loss": 0.4208, "step": 640 }, { "epoch": 0.3937947494033413, "grad_norm": 9.625, "learning_rate": 1.6400730816077954e-05, "loss": 0.4073, "step": 660 }, { "epoch": 0.40572792362768495, "grad_norm": 9.125, "learning_rate": 1.6278928136419004e-05, "loss": 0.3889, "step": 680 }, { "epoch": 0.41766109785202865, "grad_norm": 19.25, "learning_rate": 1.615712545676005e-05, "loss": 0.4102, "step": 700 }, { "epoch": 0.4295942720763723, "grad_norm": 9.8125, "learning_rate": 1.6035322777101096e-05, "loss": 0.4067, "step": 720 }, { "epoch": 0.441527446300716, "grad_norm": 9.25, "learning_rate": 1.5913520097442146e-05, "loss": 0.423, "step": 740 }, { "epoch": 0.45346062052505964, "grad_norm": 11.1875, "learning_rate": 1.5791717417783192e-05, "loss": 0.4148, "step": 760 }, { "epoch": 0.46539379474940334, "grad_norm": 14.4375, "learning_rate": 1.5669914738124238e-05, "loss": 0.4314, "step": 780 }, { "epoch": 0.477326968973747, "grad_norm": 10.0625, "learning_rate": 1.5548112058465288e-05, "loss": 0.4029, "step": 800 }, { "epoch": 0.477326968973747, "eval_loss": 0.16969533264636993, "eval_runtime": 25.3494, "eval_samples_per_second": 129.471, "eval_steps_per_second": 8.126, "step": 800 }, { "epoch": 0.4892601431980907, "grad_norm": 8.25, "learning_rate": 1.5426309378806334e-05, "loss": 0.3708, "step": 820 }, { "epoch": 0.5011933174224343, "grad_norm": 15.3125, "learning_rate": 1.5304506699147383e-05, "loss": 0.3914, "step": 840 }, { "epoch": 0.513126491646778, "grad_norm": 10.6875, "learning_rate": 1.518270401948843e-05, "loss": 0.3986, "step": 860 }, { "epoch": 0.5250596658711217, "grad_norm": 8.875, "learning_rate": 1.5060901339829477e-05, "loss": 0.3718, "step": 880 }, { "epoch": 0.5369928400954654, "grad_norm": 8.8125, "learning_rate": 1.4939098660170525e-05, "loss": 0.3512, "step": 900 }, { "epoch": 0.548926014319809, "grad_norm": 9.125, "learning_rate": 1.4817295980511573e-05, "loss": 0.3835, "step": 920 }, { "epoch": 0.5608591885441527, "grad_norm": 12.0625, "learning_rate": 1.4695493300852619e-05, "loss": 0.3675, "step": 940 }, { "epoch": 0.5727923627684964, "grad_norm": 14.0, "learning_rate": 1.4573690621193669e-05, "loss": 0.3876, "step": 960 }, { "epoch": 0.5847255369928401, "grad_norm": 10.375, "learning_rate": 1.4451887941534715e-05, "loss": 0.374, "step": 980 }, { "epoch": 0.5966587112171837, "grad_norm": 11.3125, "learning_rate": 1.4330085261875764e-05, "loss": 0.3892, "step": 1000 }, { "epoch": 0.5966587112171837, "eval_loss": 0.16406475007534027, "eval_runtime": 25.6207, "eval_samples_per_second": 128.1, "eval_steps_per_second": 8.04, "step": 1000 }, { "epoch": 0.6085918854415274, "grad_norm": 11.5, "learning_rate": 1.420828258221681e-05, "loss": 0.387, "step": 1020 }, { "epoch": 0.6205250596658711, "grad_norm": 8.875, "learning_rate": 1.4086479902557857e-05, "loss": 0.3694, "step": 1040 }, { "epoch": 0.6324582338902148, "grad_norm": 8.125, "learning_rate": 1.3964677222898906e-05, "loss": 0.3566, "step": 1060 }, { "epoch": 0.6443914081145584, "grad_norm": 8.4375, "learning_rate": 1.3842874543239952e-05, "loss": 0.3505, "step": 1080 }, { "epoch": 0.6563245823389021, "grad_norm": 16.5, "learning_rate": 1.3721071863580998e-05, "loss": 0.394, "step": 1100 }, { "epoch": 0.6682577565632458, "grad_norm": 8.8125, "learning_rate": 1.3599269183922048e-05, "loss": 0.339, "step": 1120 }, { "epoch": 0.6801909307875895, "grad_norm": 8.6875, "learning_rate": 1.3477466504263094e-05, "loss": 0.3791, "step": 1140 }, { "epoch": 0.6921241050119332, "grad_norm": 10.875, "learning_rate": 1.3355663824604142e-05, "loss": 0.3818, "step": 1160 }, { "epoch": 0.7040572792362768, "grad_norm": 7.46875, "learning_rate": 1.323386114494519e-05, "loss": 0.3798, "step": 1180 }, { "epoch": 0.7159904534606205, "grad_norm": 11.4375, "learning_rate": 1.3112058465286238e-05, "loss": 0.3636, "step": 1200 }, { "epoch": 0.7159904534606205, "eval_loss": 0.15978974103927612, "eval_runtime": 25.4524, "eval_samples_per_second": 128.946, "eval_steps_per_second": 8.094, "step": 1200 }, { "epoch": 0.7279236276849642, "grad_norm": 9.0625, "learning_rate": 1.2990255785627285e-05, "loss": 0.3563, "step": 1220 }, { "epoch": 0.7398568019093079, "grad_norm": 6.8125, "learning_rate": 1.2868453105968333e-05, "loss": 0.3486, "step": 1240 }, { "epoch": 0.7517899761336515, "grad_norm": 9.4375, "learning_rate": 1.274665042630938e-05, "loss": 0.3697, "step": 1260 }, { "epoch": 0.7637231503579952, "grad_norm": 8.1875, "learning_rate": 1.2624847746650429e-05, "loss": 0.3251, "step": 1280 }, { "epoch": 0.7756563245823389, "grad_norm": 10.375, "learning_rate": 1.2503045066991475e-05, "loss": 0.3583, "step": 1300 }, { "epoch": 0.7875894988066826, "grad_norm": 10.0, "learning_rate": 1.2381242387332521e-05, "loss": 0.3781, "step": 1320 }, { "epoch": 0.7995226730310262, "grad_norm": 7.59375, "learning_rate": 1.225943970767357e-05, "loss": 0.3474, "step": 1340 }, { "epoch": 0.8114558472553699, "grad_norm": 7.90625, "learning_rate": 1.2137637028014617e-05, "loss": 0.352, "step": 1360 }, { "epoch": 0.8233890214797136, "grad_norm": 10.3125, "learning_rate": 1.2015834348355663e-05, "loss": 0.3624, "step": 1380 }, { "epoch": 0.8353221957040573, "grad_norm": 9.5, "learning_rate": 1.1894031668696713e-05, "loss": 0.3687, "step": 1400 }, { "epoch": 0.8353221957040573, "eval_loss": 0.15651482343673706, "eval_runtime": 25.9638, "eval_samples_per_second": 126.407, "eval_steps_per_second": 7.934, "step": 1400 }, { "epoch": 0.847255369928401, "grad_norm": 9.125, "learning_rate": 1.1772228989037759e-05, "loss": 0.3323, "step": 1420 }, { "epoch": 0.8591885441527446, "grad_norm": 9.6875, "learning_rate": 1.1650426309378808e-05, "loss": 0.3234, "step": 1440 }, { "epoch": 0.8711217183770883, "grad_norm": 8.3125, "learning_rate": 1.1528623629719854e-05, "loss": 0.3573, "step": 1460 }, { "epoch": 0.883054892601432, "grad_norm": 8.9375, "learning_rate": 1.1406820950060902e-05, "loss": 0.3521, "step": 1480 }, { "epoch": 0.8949880668257757, "grad_norm": 10.9375, "learning_rate": 1.128501827040195e-05, "loss": 0.3433, "step": 1500 }, { "epoch": 0.9069212410501193, "grad_norm": 8.6875, "learning_rate": 1.1163215590742998e-05, "loss": 0.3472, "step": 1520 }, { "epoch": 0.918854415274463, "grad_norm": 6.65625, "learning_rate": 1.1041412911084044e-05, "loss": 0.3367, "step": 1540 }, { "epoch": 0.9307875894988067, "grad_norm": 6.625, "learning_rate": 1.0919610231425094e-05, "loss": 0.3576, "step": 1560 }, { "epoch": 0.9427207637231504, "grad_norm": 11.1875, "learning_rate": 1.079780755176614e-05, "loss": 0.374, "step": 1580 }, { "epoch": 0.954653937947494, "grad_norm": 11.4375, "learning_rate": 1.0676004872107186e-05, "loss": 0.3234, "step": 1600 }, { "epoch": 0.954653937947494, "eval_loss": 0.1529538482427597, "eval_runtime": 25.8628, "eval_samples_per_second": 126.9, "eval_steps_per_second": 7.965, "step": 1600 }, { "epoch": 0.9665871121718377, "grad_norm": 6.5, "learning_rate": 1.0554202192448235e-05, "loss": 0.3517, "step": 1620 }, { "epoch": 0.9785202863961814, "grad_norm": 10.0625, "learning_rate": 1.0432399512789282e-05, "loss": 0.3432, "step": 1640 }, { "epoch": 0.9904534606205251, "grad_norm": 11.0, "learning_rate": 1.0310596833130331e-05, "loss": 0.3255, "step": 1660 }, { "epoch": 1.0023866348448687, "grad_norm": 7.875, "learning_rate": 1.0188794153471377e-05, "loss": 0.3778, "step": 1680 }, { "epoch": 1.0143198090692125, "grad_norm": 7.90625, "learning_rate": 1.0066991473812423e-05, "loss": 0.3079, "step": 1700 }, { "epoch": 1.026252983293556, "grad_norm": 8.6875, "learning_rate": 9.945188794153471e-06, "loss": 0.2988, "step": 1720 }, { "epoch": 1.0381861575178997, "grad_norm": 9.8125, "learning_rate": 9.823386114494519e-06, "loss": 0.3046, "step": 1740 }, { "epoch": 1.0501193317422435, "grad_norm": 7.5, "learning_rate": 9.701583434835567e-06, "loss": 0.2845, "step": 1760 }, { "epoch": 1.062052505966587, "grad_norm": 6.03125, "learning_rate": 9.579780755176615e-06, "loss": 0.2947, "step": 1780 }, { "epoch": 1.0739856801909309, "grad_norm": 6.625, "learning_rate": 9.457978075517663e-06, "loss": 0.282, "step": 1800 }, { "epoch": 1.0739856801909309, "eval_loss": 0.15305505692958832, "eval_runtime": 25.921, "eval_samples_per_second": 126.615, "eval_steps_per_second": 7.947, "step": 1800 }, { "epoch": 1.0859188544152745, "grad_norm": 8.75, "learning_rate": 9.33617539585871e-06, "loss": 0.3005, "step": 1820 }, { "epoch": 1.097852028639618, "grad_norm": 12.8125, "learning_rate": 9.214372716199758e-06, "loss": 0.2861, "step": 1840 }, { "epoch": 1.1097852028639619, "grad_norm": 8.4375, "learning_rate": 9.092570036540804e-06, "loss": 0.3091, "step": 1860 }, { "epoch": 1.1217183770883055, "grad_norm": 10.5, "learning_rate": 8.970767356881852e-06, "loss": 0.3103, "step": 1880 }, { "epoch": 1.1336515513126493, "grad_norm": 8.4375, "learning_rate": 8.8489646772229e-06, "loss": 0.3093, "step": 1900 }, { "epoch": 1.1455847255369929, "grad_norm": 6.8125, "learning_rate": 8.727161997563948e-06, "loss": 0.262, "step": 1920 }, { "epoch": 1.1575178997613365, "grad_norm": 12.625, "learning_rate": 8.605359317904994e-06, "loss": 0.3012, "step": 1940 }, { "epoch": 1.1694510739856803, "grad_norm": 8.125, "learning_rate": 8.483556638246042e-06, "loss": 0.2993, "step": 1960 }, { "epoch": 1.1813842482100239, "grad_norm": 9.75, "learning_rate": 8.36175395858709e-06, "loss": 0.2843, "step": 1980 }, { "epoch": 1.1933174224343674, "grad_norm": 11.5, "learning_rate": 8.239951278928136e-06, "loss": 0.3082, "step": 2000 }, { "epoch": 1.1933174224343674, "eval_loss": 0.15235207974910736, "eval_runtime": 25.9706, "eval_samples_per_second": 126.374, "eval_steps_per_second": 7.932, "step": 2000 }, { "epoch": 1.2052505966587113, "grad_norm": 9.0625, "learning_rate": 8.118148599269184e-06, "loss": 0.3052, "step": 2020 }, { "epoch": 1.2171837708830548, "grad_norm": 10.625, "learning_rate": 7.996345919610232e-06, "loss": 0.2742, "step": 2040 }, { "epoch": 1.2291169451073987, "grad_norm": 10.875, "learning_rate": 7.87454323995128e-06, "loss": 0.2699, "step": 2060 }, { "epoch": 1.2410501193317423, "grad_norm": 10.0625, "learning_rate": 7.752740560292327e-06, "loss": 0.3029, "step": 2080 }, { "epoch": 1.2529832935560858, "grad_norm": 7.9375, "learning_rate": 7.630937880633375e-06, "loss": 0.2929, "step": 2100 }, { "epoch": 1.2649164677804297, "grad_norm": 12.625, "learning_rate": 7.509135200974422e-06, "loss": 0.2838, "step": 2120 }, { "epoch": 1.2768496420047732, "grad_norm": 8.5, "learning_rate": 7.38733252131547e-06, "loss": 0.2677, "step": 2140 }, { "epoch": 1.288782816229117, "grad_norm": 7.6875, "learning_rate": 7.265529841656517e-06, "loss": 0.291, "step": 2160 }, { "epoch": 1.3007159904534606, "grad_norm": 8.1875, "learning_rate": 7.143727161997565e-06, "loss": 0.297, "step": 2180 }, { "epoch": 1.3126491646778042, "grad_norm": 6.34375, "learning_rate": 7.0219244823386126e-06, "loss": 0.25, "step": 2200 }, { "epoch": 1.3126491646778042, "eval_loss": 0.15154734253883362, "eval_runtime": 25.7648, "eval_samples_per_second": 127.383, "eval_steps_per_second": 7.995, "step": 2200 }, { "epoch": 1.324582338902148, "grad_norm": 7.40625, "learning_rate": 6.900121802679659e-06, "loss": 0.2576, "step": 2220 }, { "epoch": 1.3365155131264916, "grad_norm": 9.9375, "learning_rate": 6.7783191230207066e-06, "loss": 0.2748, "step": 2240 }, { "epoch": 1.3484486873508352, "grad_norm": 7.03125, "learning_rate": 6.656516443361754e-06, "loss": 0.2938, "step": 2260 }, { "epoch": 1.360381861575179, "grad_norm": 7.625, "learning_rate": 6.534713763702802e-06, "loss": 0.2595, "step": 2280 }, { "epoch": 1.3723150357995226, "grad_norm": 9.25, "learning_rate": 6.412911084043849e-06, "loss": 0.2511, "step": 2300 }, { "epoch": 1.3842482100238662, "grad_norm": 9.0, "learning_rate": 6.291108404384897e-06, "loss": 0.3385, "step": 2320 }, { "epoch": 1.39618138424821, "grad_norm": 13.3125, "learning_rate": 6.169305724725945e-06, "loss": 0.2813, "step": 2340 }, { "epoch": 1.4081145584725536, "grad_norm": 7.25, "learning_rate": 6.047503045066993e-06, "loss": 0.274, "step": 2360 }, { "epoch": 1.4200477326968974, "grad_norm": 8.125, "learning_rate": 5.925700365408039e-06, "loss": 0.304, "step": 2380 }, { "epoch": 1.431980906921241, "grad_norm": 6.65625, "learning_rate": 5.803897685749087e-06, "loss": 0.2536, "step": 2400 }, { "epoch": 1.431980906921241, "eval_loss": 0.15176840126514435, "eval_runtime": 25.4429, "eval_samples_per_second": 128.995, "eval_steps_per_second": 8.097, "step": 2400 }, { "epoch": 1.4439140811455848, "grad_norm": 10.5, "learning_rate": 5.6820950060901346e-06, "loss": 0.2907, "step": 2420 }, { "epoch": 1.4558472553699284, "grad_norm": 8.75, "learning_rate": 5.5602923264311815e-06, "loss": 0.272, "step": 2440 }, { "epoch": 1.467780429594272, "grad_norm": 9.0, "learning_rate": 5.438489646772229e-06, "loss": 0.2712, "step": 2460 }, { "epoch": 1.4797136038186158, "grad_norm": 8.375, "learning_rate": 5.316686967113277e-06, "loss": 0.2827, "step": 2480 }, { "epoch": 1.4916467780429594, "grad_norm": 27.5, "learning_rate": 5.194884287454325e-06, "loss": 0.3101, "step": 2500 }, { "epoch": 1.503579952267303, "grad_norm": 10.6875, "learning_rate": 5.073081607795371e-06, "loss": 0.3107, "step": 2520 }, { "epoch": 1.5155131264916468, "grad_norm": 7.65625, "learning_rate": 4.951278928136419e-06, "loss": 0.3069, "step": 2540 }, { "epoch": 1.5274463007159904, "grad_norm": 7.375, "learning_rate": 4.829476248477467e-06, "loss": 0.284, "step": 2560 }, { "epoch": 1.539379474940334, "grad_norm": 10.3125, "learning_rate": 4.707673568818515e-06, "loss": 0.3069, "step": 2580 }, { "epoch": 1.5513126491646778, "grad_norm": 11.1875, "learning_rate": 4.5858708891595625e-06, "loss": 0.2793, "step": 2600 }, { "epoch": 1.5513126491646778, "eval_loss": 0.15092819929122925, "eval_runtime": 25.0071, "eval_samples_per_second": 131.243, "eval_steps_per_second": 8.238, "step": 2600 }, { "epoch": 1.5632458233890216, "grad_norm": 11.75, "learning_rate": 4.4640682095006095e-06, "loss": 0.2798, "step": 2620 }, { "epoch": 1.575178997613365, "grad_norm": 11.0625, "learning_rate": 4.3422655298416565e-06, "loss": 0.3004, "step": 2640 }, { "epoch": 1.5871121718377088, "grad_norm": 8.625, "learning_rate": 4.220462850182704e-06, "loss": 0.247, "step": 2660 }, { "epoch": 1.5990453460620526, "grad_norm": 12.3125, "learning_rate": 4.098660170523751e-06, "loss": 0.3249, "step": 2680 }, { "epoch": 1.6109785202863962, "grad_norm": 7.03125, "learning_rate": 3.976857490864799e-06, "loss": 0.3031, "step": 2700 }, { "epoch": 1.6229116945107398, "grad_norm": 7.5, "learning_rate": 3.855054811205847e-06, "loss": 0.2782, "step": 2720 }, { "epoch": 1.6348448687350836, "grad_norm": 7.0625, "learning_rate": 3.7332521315468944e-06, "loss": 0.281, "step": 2740 }, { "epoch": 1.6467780429594272, "grad_norm": 8.75, "learning_rate": 3.611449451887942e-06, "loss": 0.2716, "step": 2760 }, { "epoch": 1.6587112171837708, "grad_norm": 10.8125, "learning_rate": 3.4896467722289897e-06, "loss": 0.3014, "step": 2780 }, { "epoch": 1.6706443914081146, "grad_norm": 10.1875, "learning_rate": 3.3678440925700367e-06, "loss": 0.2973, "step": 2800 }, { "epoch": 1.6706443914081146, "eval_loss": 0.1503816545009613, "eval_runtime": 25.2633, "eval_samples_per_second": 129.912, "eval_steps_per_second": 8.154, "step": 2800 }, { "epoch": 1.6825775656324582, "grad_norm": 9.75, "learning_rate": 3.2460414129110845e-06, "loss": 0.277, "step": 2820 }, { "epoch": 1.6945107398568018, "grad_norm": 9.0625, "learning_rate": 3.124238733252132e-06, "loss": 0.322, "step": 2840 }, { "epoch": 1.7064439140811456, "grad_norm": 10.3125, "learning_rate": 3.002436053593179e-06, "loss": 0.2705, "step": 2860 }, { "epoch": 1.7183770883054894, "grad_norm": 7.625, "learning_rate": 2.8806333739342268e-06, "loss": 0.2676, "step": 2880 }, { "epoch": 1.7303102625298328, "grad_norm": 9.625, "learning_rate": 2.758830694275274e-06, "loss": 0.2913, "step": 2900 }, { "epoch": 1.7422434367541766, "grad_norm": 9.0625, "learning_rate": 2.637028014616322e-06, "loss": 0.2881, "step": 2920 }, { "epoch": 1.7541766109785204, "grad_norm": 8.5, "learning_rate": 2.515225334957369e-06, "loss": 0.3017, "step": 2940 }, { "epoch": 1.766109785202864, "grad_norm": 8.25, "learning_rate": 2.393422655298417e-06, "loss": 0.2822, "step": 2960 }, { "epoch": 1.7780429594272076, "grad_norm": 7.5625, "learning_rate": 2.2716199756394643e-06, "loss": 0.3068, "step": 2980 }, { "epoch": 1.7899761336515514, "grad_norm": 7.46875, "learning_rate": 2.1498172959805117e-06, "loss": 0.2606, "step": 3000 }, { "epoch": 1.7899761336515514, "eval_loss": 0.150226429104805, "eval_runtime": 24.845, "eval_samples_per_second": 132.099, "eval_steps_per_second": 8.291, "step": 3000 }, { "epoch": 1.801909307875895, "grad_norm": 8.6875, "learning_rate": 2.028014616321559e-06, "loss": 0.3155, "step": 3020 }, { "epoch": 1.8138424821002386, "grad_norm": 8.6875, "learning_rate": 1.9062119366626067e-06, "loss": 0.3068, "step": 3040 }, { "epoch": 1.8257756563245824, "grad_norm": 6.3125, "learning_rate": 1.7844092570036541e-06, "loss": 0.2573, "step": 3060 }, { "epoch": 1.837708830548926, "grad_norm": 8.1875, "learning_rate": 1.6626065773447018e-06, "loss": 0.2697, "step": 3080 }, { "epoch": 1.8496420047732696, "grad_norm": 10.1875, "learning_rate": 1.5408038976857492e-06, "loss": 0.3167, "step": 3100 }, { "epoch": 1.8615751789976134, "grad_norm": 7.78125, "learning_rate": 1.4190012180267968e-06, "loss": 0.3099, "step": 3120 }, { "epoch": 1.8735083532219572, "grad_norm": 11.4375, "learning_rate": 1.2971985383678442e-06, "loss": 0.2828, "step": 3140 }, { "epoch": 1.8854415274463006, "grad_norm": 11.25, "learning_rate": 1.1753958587088916e-06, "loss": 0.2783, "step": 3160 }, { "epoch": 1.8973747016706444, "grad_norm": 11.5, "learning_rate": 1.0535931790499393e-06, "loss": 0.2935, "step": 3180 }, { "epoch": 1.9093078758949882, "grad_norm": 10.4375, "learning_rate": 9.317904993909867e-07, "loss": 0.2925, "step": 3200 }, { "epoch": 1.9093078758949882, "eval_loss": 0.1499669998884201, "eval_runtime": 24.2008, "eval_samples_per_second": 135.615, "eval_steps_per_second": 8.512, "step": 3200 } ], "logging_steps": 20, "max_steps": 3352, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.187726578417664e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }