diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16314 @@ +{ + "best_global_step": 1380, + "best_metric": 0.6770720481872559, + "best_model_checkpoint": "saves/qwen3-4B/Qwen3-4B-SFT-science-2e-5/checkpoint-1380", + "epoch": 3.0, + "eval_steps": 230, + "global_step": 2313, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012976480129764801, + "grad_norm": 8.15907096862793, + "learning_rate": 0.0, + "loss": 1.117659091949463, + "step": 1 + }, + { + "epoch": 0.0025952960259529602, + "grad_norm": 7.67869234085083, + "learning_rate": 1.7241379310344828e-07, + "loss": 1.0263863801956177, + "step": 2 + }, + { + "epoch": 0.0038929440389294406, + "grad_norm": 8.24106502532959, + "learning_rate": 3.4482758620689656e-07, + "loss": 1.1220319271087646, + "step": 3 + }, + { + "epoch": 0.0051905920519059205, + "grad_norm": 8.60258960723877, + "learning_rate": 5.172413793103449e-07, + "loss": 1.1806347370147705, + "step": 4 + }, + { + "epoch": 0.006488240064882401, + "grad_norm": 7.782258033752441, + "learning_rate": 6.896551724137931e-07, + "loss": 1.105953574180603, + "step": 5 + }, + { + "epoch": 0.007785888077858881, + "grad_norm": 7.797566890716553, + "learning_rate": 8.620689655172415e-07, + "loss": 1.0968478918075562, + "step": 6 + }, + { + "epoch": 0.009083536090835361, + "grad_norm": 7.626895427703857, + "learning_rate": 1.0344827586206898e-06, + "loss": 1.0549066066741943, + "step": 7 + }, + { + "epoch": 0.010381184103811841, + "grad_norm": 7.147245407104492, + "learning_rate": 1.2068965517241381e-06, + "loss": 1.0259548425674438, + "step": 8 + }, + { + "epoch": 0.01167883211678832, + "grad_norm": 5.977053165435791, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.954434335231781, + "step": 9 + }, + { + "epoch": 0.012976480129764802, + "grad_norm": 6.206176280975342, + "learning_rate": 1.5517241379310346e-06, + "loss": 1.049869418144226, + "step": 10 + }, + { + "epoch": 0.014274128142741281, + "grad_norm": 5.300525665283203, + "learning_rate": 1.724137931034483e-06, + "loss": 1.0076310634613037, + "step": 11 + }, + { + "epoch": 0.015571776155717762, + "grad_norm": 4.235332489013672, + "learning_rate": 1.896551724137931e-06, + "loss": 0.9547766447067261, + "step": 12 + }, + { + "epoch": 0.01686942416869424, + "grad_norm": 4.258054733276367, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.9808558225631714, + "step": 13 + }, + { + "epoch": 0.018167072181670723, + "grad_norm": 3.9000754356384277, + "learning_rate": 2.241379310344828e-06, + "loss": 0.955378532409668, + "step": 14 + }, + { + "epoch": 0.019464720194647202, + "grad_norm": 2.9283816814422607, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.9264786839485168, + "step": 15 + }, + { + "epoch": 0.020762368207623682, + "grad_norm": 2.1859076023101807, + "learning_rate": 2.5862068965517246e-06, + "loss": 0.8895066380500793, + "step": 16 + }, + { + "epoch": 0.02206001622060016, + "grad_norm": 2.1717398166656494, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.9194827675819397, + "step": 17 + }, + { + "epoch": 0.02335766423357664, + "grad_norm": 1.7686649560928345, + "learning_rate": 2.931034482758621e-06, + "loss": 0.8839207291603088, + "step": 18 + }, + { + "epoch": 0.024655312246553124, + "grad_norm": 1.7060308456420898, + "learning_rate": 3.103448275862069e-06, + "loss": 0.8821989297866821, + "step": 19 + }, + { + "epoch": 0.025952960259529603, + "grad_norm": 1.4888310432434082, + "learning_rate": 3.2758620689655175e-06, + "loss": 0.7937015295028687, + "step": 20 + }, + { + "epoch": 0.027250608272506083, + "grad_norm": 1.5812122821807861, + "learning_rate": 3.448275862068966e-06, + "loss": 0.9222494959831238, + "step": 21 + }, + { + "epoch": 0.028548256285482562, + "grad_norm": 1.5842291116714478, + "learning_rate": 3.620689655172414e-06, + "loss": 0.8129012584686279, + "step": 22 + }, + { + "epoch": 0.02984590429845904, + "grad_norm": 1.5270442962646484, + "learning_rate": 3.793103448275862e-06, + "loss": 0.843705415725708, + "step": 23 + }, + { + "epoch": 0.031143552311435525, + "grad_norm": 1.1963210105895996, + "learning_rate": 3.96551724137931e-06, + "loss": 0.7932494878768921, + "step": 24 + }, + { + "epoch": 0.032441200324412, + "grad_norm": 1.0309710502624512, + "learning_rate": 4.137931034482759e-06, + "loss": 0.7899153828620911, + "step": 25 + }, + { + "epoch": 0.03373884833738848, + "grad_norm": 0.9451068639755249, + "learning_rate": 4.310344827586207e-06, + "loss": 0.8323757648468018, + "step": 26 + }, + { + "epoch": 0.035036496350364967, + "grad_norm": 0.9398018717765808, + "learning_rate": 4.482758620689656e-06, + "loss": 0.8048505187034607, + "step": 27 + }, + { + "epoch": 0.036334144363341446, + "grad_norm": 0.8759371042251587, + "learning_rate": 4.655172413793104e-06, + "loss": 0.8321108222007751, + "step": 28 + }, + { + "epoch": 0.037631792376317925, + "grad_norm": 0.7862148284912109, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.8356962203979492, + "step": 29 + }, + { + "epoch": 0.038929440389294405, + "grad_norm": 0.8221083283424377, + "learning_rate": 5e-06, + "loss": 0.856194794178009, + "step": 30 + }, + { + "epoch": 0.040227088402270884, + "grad_norm": 0.7913339734077454, + "learning_rate": 5.172413793103449e-06, + "loss": 0.782647967338562, + "step": 31 + }, + { + "epoch": 0.041524736415247364, + "grad_norm": 0.7948570847511292, + "learning_rate": 5.344827586206896e-06, + "loss": 0.8002289533615112, + "step": 32 + }, + { + "epoch": 0.04282238442822384, + "grad_norm": 0.8172705769538879, + "learning_rate": 5.517241379310345e-06, + "loss": 0.8037389516830444, + "step": 33 + }, + { + "epoch": 0.04412003244120032, + "grad_norm": 0.7674341797828674, + "learning_rate": 5.689655172413794e-06, + "loss": 0.7561640739440918, + "step": 34 + }, + { + "epoch": 0.0454176804541768, + "grad_norm": 0.7508828043937683, + "learning_rate": 5.862068965517242e-06, + "loss": 0.820884108543396, + "step": 35 + }, + { + "epoch": 0.04671532846715328, + "grad_norm": 0.7388272285461426, + "learning_rate": 6.03448275862069e-06, + "loss": 0.8406673669815063, + "step": 36 + }, + { + "epoch": 0.04801297648012977, + "grad_norm": 0.6549146771430969, + "learning_rate": 6.206896551724138e-06, + "loss": 0.7618731260299683, + "step": 37 + }, + { + "epoch": 0.04931062449310625, + "grad_norm": 0.6996558904647827, + "learning_rate": 6.379310344827587e-06, + "loss": 0.7531220316886902, + "step": 38 + }, + { + "epoch": 0.05060827250608273, + "grad_norm": 0.659206748008728, + "learning_rate": 6.551724137931035e-06, + "loss": 0.8432419896125793, + "step": 39 + }, + { + "epoch": 0.05190592051905921, + "grad_norm": 0.6969435811042786, + "learning_rate": 6.724137931034484e-06, + "loss": 0.8152772784233093, + "step": 40 + }, + { + "epoch": 0.053203568532035686, + "grad_norm": 0.638674795627594, + "learning_rate": 6.896551724137932e-06, + "loss": 0.8012467622756958, + "step": 41 + }, + { + "epoch": 0.054501216545012166, + "grad_norm": 0.6248321533203125, + "learning_rate": 7.0689655172413796e-06, + "loss": 0.7576991319656372, + "step": 42 + }, + { + "epoch": 0.055798864557988645, + "grad_norm": 0.6499493718147278, + "learning_rate": 7.241379310344828e-06, + "loss": 0.7685450911521912, + "step": 43 + }, + { + "epoch": 0.057096512570965124, + "grad_norm": 0.6266531348228455, + "learning_rate": 7.413793103448277e-06, + "loss": 0.7682685852050781, + "step": 44 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 0.6328745484352112, + "learning_rate": 7.586206896551724e-06, + "loss": 0.8221952319145203, + "step": 45 + }, + { + "epoch": 0.05969180859691808, + "grad_norm": 0.6457077860832214, + "learning_rate": 7.758620689655173e-06, + "loss": 0.7616772055625916, + "step": 46 + }, + { + "epoch": 0.06098945660989456, + "grad_norm": 0.6841326951980591, + "learning_rate": 7.93103448275862e-06, + "loss": 0.7185612916946411, + "step": 47 + }, + { + "epoch": 0.06228710462287105, + "grad_norm": 0.653884768486023, + "learning_rate": 8.103448275862069e-06, + "loss": 0.8144221901893616, + "step": 48 + }, + { + "epoch": 0.06358475263584752, + "grad_norm": 0.6235163807868958, + "learning_rate": 8.275862068965518e-06, + "loss": 0.7789400815963745, + "step": 49 + }, + { + "epoch": 0.064882400648824, + "grad_norm": 0.6035148501396179, + "learning_rate": 8.448275862068966e-06, + "loss": 0.7788746356964111, + "step": 50 + }, + { + "epoch": 0.06618004866180048, + "grad_norm": 0.6197084784507751, + "learning_rate": 8.620689655172414e-06, + "loss": 0.7773774266242981, + "step": 51 + }, + { + "epoch": 0.06747769667477696, + "grad_norm": 0.6356611847877502, + "learning_rate": 8.793103448275862e-06, + "loss": 0.8119993209838867, + "step": 52 + }, + { + "epoch": 0.06877534468775345, + "grad_norm": 0.6229863166809082, + "learning_rate": 8.965517241379312e-06, + "loss": 0.8156378269195557, + "step": 53 + }, + { + "epoch": 0.07007299270072993, + "grad_norm": 0.6285703778266907, + "learning_rate": 9.13793103448276e-06, + "loss": 0.7589212656021118, + "step": 54 + }, + { + "epoch": 0.07137064071370641, + "grad_norm": 0.6221722960472107, + "learning_rate": 9.310344827586207e-06, + "loss": 0.7588199377059937, + "step": 55 + }, + { + "epoch": 0.07266828872668289, + "grad_norm": 0.5896920561790466, + "learning_rate": 9.482758620689655e-06, + "loss": 0.7869905233383179, + "step": 56 + }, + { + "epoch": 0.07396593673965937, + "grad_norm": 0.6120532155036926, + "learning_rate": 9.655172413793105e-06, + "loss": 0.7379593849182129, + "step": 57 + }, + { + "epoch": 0.07526358475263585, + "grad_norm": 0.6437456011772156, + "learning_rate": 9.827586206896553e-06, + "loss": 0.8263105154037476, + "step": 58 + }, + { + "epoch": 0.07656123276561233, + "grad_norm": 0.6005666851997375, + "learning_rate": 1e-05, + "loss": 0.8053442239761353, + "step": 59 + }, + { + "epoch": 0.07785888077858881, + "grad_norm": 0.618229866027832, + "learning_rate": 1.0172413793103449e-05, + "loss": 0.7303550243377686, + "step": 60 + }, + { + "epoch": 0.07915652879156529, + "grad_norm": 0.6245790719985962, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.7618341445922852, + "step": 61 + }, + { + "epoch": 0.08045417680454177, + "grad_norm": 0.632989227771759, + "learning_rate": 1.0517241379310346e-05, + "loss": 0.8073338270187378, + "step": 62 + }, + { + "epoch": 0.08175182481751825, + "grad_norm": 0.6083235740661621, + "learning_rate": 1.0689655172413792e-05, + "loss": 0.7776636481285095, + "step": 63 + }, + { + "epoch": 0.08304947283049473, + "grad_norm": 0.6136429309844971, + "learning_rate": 1.0862068965517242e-05, + "loss": 0.8043953776359558, + "step": 64 + }, + { + "epoch": 0.08434712084347121, + "grad_norm": 0.6103477478027344, + "learning_rate": 1.103448275862069e-05, + "loss": 0.7928889989852905, + "step": 65 + }, + { + "epoch": 0.08564476885644769, + "grad_norm": 0.6038222312927246, + "learning_rate": 1.1206896551724138e-05, + "loss": 0.7927621603012085, + "step": 66 + }, + { + "epoch": 0.08694241686942417, + "grad_norm": 0.6238990426063538, + "learning_rate": 1.1379310344827587e-05, + "loss": 0.7877966165542603, + "step": 67 + }, + { + "epoch": 0.08824006488240065, + "grad_norm": 0.5899522304534912, + "learning_rate": 1.1551724137931035e-05, + "loss": 0.721104621887207, + "step": 68 + }, + { + "epoch": 0.08953771289537713, + "grad_norm": 0.6330446004867554, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.8130797147750854, + "step": 69 + }, + { + "epoch": 0.0908353609083536, + "grad_norm": 0.6214055418968201, + "learning_rate": 1.1896551724137933e-05, + "loss": 0.78719162940979, + "step": 70 + }, + { + "epoch": 0.09213300892133008, + "grad_norm": 0.648266077041626, + "learning_rate": 1.206896551724138e-05, + "loss": 0.7923158407211304, + "step": 71 + }, + { + "epoch": 0.09343065693430656, + "grad_norm": 0.6473869681358337, + "learning_rate": 1.2241379310344827e-05, + "loss": 0.8679413795471191, + "step": 72 + }, + { + "epoch": 0.09472830494728304, + "grad_norm": 0.5954247117042542, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.7424967288970947, + "step": 73 + }, + { + "epoch": 0.09602595296025954, + "grad_norm": 0.6318120956420898, + "learning_rate": 1.2586206896551725e-05, + "loss": 0.7612457275390625, + "step": 74 + }, + { + "epoch": 0.09732360097323602, + "grad_norm": 0.6183631420135498, + "learning_rate": 1.2758620689655174e-05, + "loss": 0.7567603588104248, + "step": 75 + }, + { + "epoch": 0.0986212489862125, + "grad_norm": 0.6186433434486389, + "learning_rate": 1.2931034482758622e-05, + "loss": 0.8088338375091553, + "step": 76 + }, + { + "epoch": 0.09991889699918897, + "grad_norm": 0.6034461855888367, + "learning_rate": 1.310344827586207e-05, + "loss": 0.7736937999725342, + "step": 77 + }, + { + "epoch": 0.10121654501216545, + "grad_norm": 0.6197369694709778, + "learning_rate": 1.327586206896552e-05, + "loss": 0.7498612999916077, + "step": 78 + }, + { + "epoch": 0.10251419302514193, + "grad_norm": 0.6505046486854553, + "learning_rate": 1.3448275862068967e-05, + "loss": 0.8144986629486084, + "step": 79 + }, + { + "epoch": 0.10381184103811841, + "grad_norm": 0.6240726113319397, + "learning_rate": 1.3620689655172414e-05, + "loss": 0.7407926321029663, + "step": 80 + }, + { + "epoch": 0.10510948905109489, + "grad_norm": 0.6124047040939331, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.7526525855064392, + "step": 81 + }, + { + "epoch": 0.10640713706407137, + "grad_norm": 0.5982939004898071, + "learning_rate": 1.3965517241379311e-05, + "loss": 0.722671627998352, + "step": 82 + }, + { + "epoch": 0.10770478507704785, + "grad_norm": 0.5908958315849304, + "learning_rate": 1.4137931034482759e-05, + "loss": 0.7402417659759521, + "step": 83 + }, + { + "epoch": 0.10900243309002433, + "grad_norm": 0.6116979718208313, + "learning_rate": 1.4310344827586209e-05, + "loss": 0.7960222959518433, + "step": 84 + }, + { + "epoch": 0.11030008110300081, + "grad_norm": 0.6197500228881836, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.7519891858100891, + "step": 85 + }, + { + "epoch": 0.11159772911597729, + "grad_norm": 2.220649480819702, + "learning_rate": 1.4655172413793105e-05, + "loss": 0.7659766674041748, + "step": 86 + }, + { + "epoch": 0.11289537712895377, + "grad_norm": 5.19334602355957, + "learning_rate": 1.4827586206896554e-05, + "loss": 0.7760565280914307, + "step": 87 + }, + { + "epoch": 0.11419302514193025, + "grad_norm": 0.6664707064628601, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.7354503870010376, + "step": 88 + }, + { + "epoch": 0.11549067315490673, + "grad_norm": 0.6490852236747742, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.7803969979286194, + "step": 89 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 0.6153193116188049, + "learning_rate": 1.5344827586206898e-05, + "loss": 0.7803000807762146, + "step": 90 + }, + { + "epoch": 0.11808596918085969, + "grad_norm": 0.6364138722419739, + "learning_rate": 1.5517241379310346e-05, + "loss": 0.7799690961837769, + "step": 91 + }, + { + "epoch": 0.11938361719383617, + "grad_norm": 0.6558602452278137, + "learning_rate": 1.5689655172413794e-05, + "loss": 0.8238034248352051, + "step": 92 + }, + { + "epoch": 0.12068126520681265, + "grad_norm": 0.629127562046051, + "learning_rate": 1.586206896551724e-05, + "loss": 0.7694847583770752, + "step": 93 + }, + { + "epoch": 0.12197891321978913, + "grad_norm": 0.5806317925453186, + "learning_rate": 1.603448275862069e-05, + "loss": 0.7090768814086914, + "step": 94 + }, + { + "epoch": 0.12327656123276562, + "grad_norm": 0.673556387424469, + "learning_rate": 1.6206896551724137e-05, + "loss": 0.8536560535430908, + "step": 95 + }, + { + "epoch": 0.1245742092457421, + "grad_norm": 0.5968764424324036, + "learning_rate": 1.637931034482759e-05, + "loss": 0.7300469875335693, + "step": 96 + }, + { + "epoch": 0.12587185725871858, + "grad_norm": 0.6305297613143921, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.7591036558151245, + "step": 97 + }, + { + "epoch": 0.12716950527169504, + "grad_norm": 0.606986403465271, + "learning_rate": 1.6724137931034485e-05, + "loss": 0.76216721534729, + "step": 98 + }, + { + "epoch": 0.12846715328467154, + "grad_norm": 0.6063655018806458, + "learning_rate": 1.6896551724137932e-05, + "loss": 0.68424391746521, + "step": 99 + }, + { + "epoch": 0.129764801297648, + "grad_norm": 0.7023365497589111, + "learning_rate": 1.706896551724138e-05, + "loss": 0.8325944542884827, + "step": 100 + }, + { + "epoch": 0.1310624493106245, + "grad_norm": 0.6358933448791504, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.8054566383361816, + "step": 101 + }, + { + "epoch": 0.13236009732360096, + "grad_norm": 0.6431549191474915, + "learning_rate": 1.7413793103448276e-05, + "loss": 0.7429993748664856, + "step": 102 + }, + { + "epoch": 0.13365774533657745, + "grad_norm": 0.6152120232582092, + "learning_rate": 1.7586206896551724e-05, + "loss": 0.7206076383590698, + "step": 103 + }, + { + "epoch": 0.13495539334955392, + "grad_norm": 0.6442373991012573, + "learning_rate": 1.7758620689655175e-05, + "loss": 0.806060791015625, + "step": 104 + }, + { + "epoch": 0.1362530413625304, + "grad_norm": 0.6756954789161682, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.8363012671470642, + "step": 105 + }, + { + "epoch": 0.1375506893755069, + "grad_norm": 0.743787407875061, + "learning_rate": 1.810344827586207e-05, + "loss": 0.8207604885101318, + "step": 106 + }, + { + "epoch": 0.13884833738848337, + "grad_norm": 0.686335563659668, + "learning_rate": 1.827586206896552e-05, + "loss": 0.7393860816955566, + "step": 107 + }, + { + "epoch": 0.14014598540145987, + "grad_norm": 0.6191396713256836, + "learning_rate": 1.8448275862068967e-05, + "loss": 0.7534383535385132, + "step": 108 + }, + { + "epoch": 0.14144363341443633, + "grad_norm": 0.6754934191703796, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.8022092580795288, + "step": 109 + }, + { + "epoch": 0.14274128142741282, + "grad_norm": 0.6399085521697998, + "learning_rate": 1.8793103448275863e-05, + "loss": 0.8507853746414185, + "step": 110 + }, + { + "epoch": 0.1440389294403893, + "grad_norm": 0.6910972595214844, + "learning_rate": 1.896551724137931e-05, + "loss": 0.8276559710502625, + "step": 111 + }, + { + "epoch": 0.14533657745336578, + "grad_norm": 0.5906772613525391, + "learning_rate": 1.913793103448276e-05, + "loss": 0.7183451056480408, + "step": 112 + }, + { + "epoch": 0.14663422546634225, + "grad_norm": 0.6329069137573242, + "learning_rate": 1.931034482758621e-05, + "loss": 0.789232611656189, + "step": 113 + }, + { + "epoch": 0.14793187347931874, + "grad_norm": 0.6226819157600403, + "learning_rate": 1.9482758620689658e-05, + "loss": 0.7747266292572021, + "step": 114 + }, + { + "epoch": 0.1492295214922952, + "grad_norm": 0.65074223279953, + "learning_rate": 1.9655172413793106e-05, + "loss": 0.753608226776123, + "step": 115 + }, + { + "epoch": 0.1505271695052717, + "grad_norm": 0.6118033528327942, + "learning_rate": 1.9827586206896554e-05, + "loss": 0.7803196907043457, + "step": 116 + }, + { + "epoch": 0.15182481751824817, + "grad_norm": 0.6553196907043457, + "learning_rate": 2e-05, + "loss": 0.8216028213500977, + "step": 117 + }, + { + "epoch": 0.15312246553122466, + "grad_norm": 0.678218424320221, + "learning_rate": 1.999998977626552e-05, + "loss": 0.807174801826477, + "step": 118 + }, + { + "epoch": 0.15442011354420113, + "grad_norm": 0.6192781329154968, + "learning_rate": 1.999995910508299e-05, + "loss": 0.7289496660232544, + "step": 119 + }, + { + "epoch": 0.15571776155717762, + "grad_norm": 0.6038413047790527, + "learning_rate": 1.999990798651512e-05, + "loss": 0.7679600119590759, + "step": 120 + }, + { + "epoch": 0.15701540957015409, + "grad_norm": 0.6870720386505127, + "learning_rate": 1.9999836420666438e-05, + "loss": 0.8232643604278564, + "step": 121 + }, + { + "epoch": 0.15831305758313058, + "grad_norm": 0.623460590839386, + "learning_rate": 1.999974440768327e-05, + "loss": 0.7480977177619934, + "step": 122 + }, + { + "epoch": 0.15961070559610704, + "grad_norm": 0.651508629322052, + "learning_rate": 1.9999631947753776e-05, + "loss": 0.7708613276481628, + "step": 123 + }, + { + "epoch": 0.16090835360908354, + "grad_norm": 0.6450805068016052, + "learning_rate": 1.999949904110789e-05, + "loss": 0.8049247860908508, + "step": 124 + }, + { + "epoch": 0.16220600162206, + "grad_norm": 0.6157734990119934, + "learning_rate": 1.999934568801738e-05, + "loss": 0.7631984949111938, + "step": 125 + }, + { + "epoch": 0.1635036496350365, + "grad_norm": 0.6847337484359741, + "learning_rate": 1.999917188879582e-05, + "loss": 0.7424380779266357, + "step": 126 + }, + { + "epoch": 0.164801297648013, + "grad_norm": 0.6398855447769165, + "learning_rate": 1.9998977643798572e-05, + "loss": 0.7688143253326416, + "step": 127 + }, + { + "epoch": 0.16609894566098946, + "grad_norm": 0.6518498063087463, + "learning_rate": 1.999876295342283e-05, + "loss": 0.7191232442855835, + "step": 128 + }, + { + "epoch": 0.16739659367396595, + "grad_norm": 0.6462240219116211, + "learning_rate": 1.9998527818107577e-05, + "loss": 0.7375045418739319, + "step": 129 + }, + { + "epoch": 0.16869424168694241, + "grad_norm": 0.6727373600006104, + "learning_rate": 1.9998272238333606e-05, + "loss": 0.7088533639907837, + "step": 130 + }, + { + "epoch": 0.1699918896999189, + "grad_norm": 0.689372181892395, + "learning_rate": 1.9997996214623515e-05, + "loss": 0.8250190615653992, + "step": 131 + }, + { + "epoch": 0.17128953771289537, + "grad_norm": 0.6236900687217712, + "learning_rate": 1.9997699747541698e-05, + "loss": 0.7653014659881592, + "step": 132 + }, + { + "epoch": 0.17258718572587187, + "grad_norm": 0.617174506187439, + "learning_rate": 1.9997382837694355e-05, + "loss": 0.7043566703796387, + "step": 133 + }, + { + "epoch": 0.17388483373884833, + "grad_norm": 0.6391400694847107, + "learning_rate": 1.999704548572949e-05, + "loss": 0.8009853363037109, + "step": 134 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 0.6218752861022949, + "learning_rate": 1.9996687692336896e-05, + "loss": 0.7598843574523926, + "step": 135 + }, + { + "epoch": 0.1764801297648013, + "grad_norm": 0.5787500143051147, + "learning_rate": 1.9996309458248184e-05, + "loss": 0.7174202799797058, + "step": 136 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.6410360932350159, + "learning_rate": 1.999591078423673e-05, + "loss": 0.763797402381897, + "step": 137 + }, + { + "epoch": 0.17907542579075425, + "grad_norm": 0.970513641834259, + "learning_rate": 1.9995491671117734e-05, + "loss": 0.6977022290229797, + "step": 138 + }, + { + "epoch": 0.18037307380373074, + "grad_norm": 0.6853165030479431, + "learning_rate": 1.999505211974817e-05, + "loss": 0.7822556495666504, + "step": 139 + }, + { + "epoch": 0.1816707218167072, + "grad_norm": 0.6396400332450867, + "learning_rate": 1.999459213102681e-05, + "loss": 0.7862622737884521, + "step": 140 + }, + { + "epoch": 0.1829683698296837, + "grad_norm": 0.6066014766693115, + "learning_rate": 1.9994111705894218e-05, + "loss": 0.8506604433059692, + "step": 141 + }, + { + "epoch": 0.18426601784266017, + "grad_norm": 0.6197599172592163, + "learning_rate": 1.9993610845332734e-05, + "loss": 0.7890738844871521, + "step": 142 + }, + { + "epoch": 0.18556366585563666, + "grad_norm": 0.6512314677238464, + "learning_rate": 1.99930895503665e-05, + "loss": 0.7983291149139404, + "step": 143 + }, + { + "epoch": 0.18686131386861313, + "grad_norm": 0.5899611115455627, + "learning_rate": 1.9992547822061427e-05, + "loss": 0.7357482314109802, + "step": 144 + }, + { + "epoch": 0.18815896188158962, + "grad_norm": 0.6489595770835876, + "learning_rate": 1.9991985661525217e-05, + "loss": 0.875076174736023, + "step": 145 + }, + { + "epoch": 0.18945660989456609, + "grad_norm": 0.6258020997047424, + "learning_rate": 1.999140306990734e-05, + "loss": 0.7252365350723267, + "step": 146 + }, + { + "epoch": 0.19075425790754258, + "grad_norm": 0.6045345067977905, + "learning_rate": 1.999080004839905e-05, + "loss": 0.7721343040466309, + "step": 147 + }, + { + "epoch": 0.19205190592051907, + "grad_norm": 0.6506165862083435, + "learning_rate": 1.999017659823338e-05, + "loss": 0.8302021026611328, + "step": 148 + }, + { + "epoch": 0.19334955393349554, + "grad_norm": 0.6503569483757019, + "learning_rate": 1.9989532720685115e-05, + "loss": 0.825711190700531, + "step": 149 + }, + { + "epoch": 0.19464720194647203, + "grad_norm": 0.5828515887260437, + "learning_rate": 1.998886841707083e-05, + "loss": 0.7742114067077637, + "step": 150 + }, + { + "epoch": 0.1959448499594485, + "grad_norm": 0.5945319533348083, + "learning_rate": 1.9988183688748862e-05, + "loss": 0.8291171789169312, + "step": 151 + }, + { + "epoch": 0.197242497972425, + "grad_norm": 0.6298274993896484, + "learning_rate": 1.9987478537119297e-05, + "loss": 0.8312891721725464, + "step": 152 + }, + { + "epoch": 0.19854014598540146, + "grad_norm": 0.6161749958992004, + "learning_rate": 1.9986752963624002e-05, + "loss": 0.8070319890975952, + "step": 153 + }, + { + "epoch": 0.19983779399837795, + "grad_norm": 0.6540800929069519, + "learning_rate": 1.998600696974658e-05, + "loss": 0.7966468334197998, + "step": 154 + }, + { + "epoch": 0.20113544201135442, + "grad_norm": 0.628194272518158, + "learning_rate": 1.9985240557012406e-05, + "loss": 0.7929773926734924, + "step": 155 + }, + { + "epoch": 0.2024330900243309, + "grad_norm": 0.6037770509719849, + "learning_rate": 1.99844537269886e-05, + "loss": 0.6729363203048706, + "step": 156 + }, + { + "epoch": 0.20373073803730737, + "grad_norm": 0.6952143907546997, + "learning_rate": 1.9983646481284028e-05, + "loss": 0.8734431266784668, + "step": 157 + }, + { + "epoch": 0.20502838605028387, + "grad_norm": 0.6359195113182068, + "learning_rate": 1.9982818821549308e-05, + "loss": 0.7915219664573669, + "step": 158 + }, + { + "epoch": 0.20632603406326033, + "grad_norm": 0.578925609588623, + "learning_rate": 1.9981970749476792e-05, + "loss": 0.7327010631561279, + "step": 159 + }, + { + "epoch": 0.20762368207623683, + "grad_norm": 0.6001781821250916, + "learning_rate": 1.998110226680057e-05, + "loss": 0.7517937421798706, + "step": 160 + }, + { + "epoch": 0.2089213300892133, + "grad_norm": 0.6306588649749756, + "learning_rate": 1.9980213375296468e-05, + "loss": 0.7292003035545349, + "step": 161 + }, + { + "epoch": 0.21021897810218979, + "grad_norm": 0.5737298130989075, + "learning_rate": 1.997930407678205e-05, + "loss": 0.7056928873062134, + "step": 162 + }, + { + "epoch": 0.21151662611516625, + "grad_norm": 0.6045275926589966, + "learning_rate": 1.99783743731166e-05, + "loss": 0.738794207572937, + "step": 163 + }, + { + "epoch": 0.21281427412814274, + "grad_norm": 0.6090785264968872, + "learning_rate": 1.9977424266201126e-05, + "loss": 0.8411350846290588, + "step": 164 + }, + { + "epoch": 0.2141119221411192, + "grad_norm": 0.6489406824111938, + "learning_rate": 1.9976453757978355e-05, + "loss": 0.750893771648407, + "step": 165 + }, + { + "epoch": 0.2154095701540957, + "grad_norm": 0.5950313210487366, + "learning_rate": 1.997546285043273e-05, + "loss": 0.6694055199623108, + "step": 166 + }, + { + "epoch": 0.21670721816707217, + "grad_norm": 0.6618576645851135, + "learning_rate": 1.9974451545590407e-05, + "loss": 0.8072858452796936, + "step": 167 + }, + { + "epoch": 0.21800486618004866, + "grad_norm": 0.587589681148529, + "learning_rate": 1.997341984551925e-05, + "loss": 0.7707666158676147, + "step": 168 + }, + { + "epoch": 0.21930251419302516, + "grad_norm": 0.6130505204200745, + "learning_rate": 1.9972367752328824e-05, + "loss": 0.683761715888977, + "step": 169 + }, + { + "epoch": 0.22060016220600162, + "grad_norm": 0.6129958033561707, + "learning_rate": 1.9971295268170393e-05, + "loss": 0.7264688014984131, + "step": 170 + }, + { + "epoch": 0.22189781021897811, + "grad_norm": 0.6114361882209778, + "learning_rate": 1.9970202395236913e-05, + "loss": 0.7344344854354858, + "step": 171 + }, + { + "epoch": 0.22319545823195458, + "grad_norm": 0.6653074622154236, + "learning_rate": 1.996908913576304e-05, + "loss": 0.7358161211013794, + "step": 172 + }, + { + "epoch": 0.22449310624493107, + "grad_norm": 0.6639219522476196, + "learning_rate": 1.9967955492025094e-05, + "loss": 0.7851651906967163, + "step": 173 + }, + { + "epoch": 0.22579075425790754, + "grad_norm": 0.5558881759643555, + "learning_rate": 1.9966801466341107e-05, + "loss": 0.7109513878822327, + "step": 174 + }, + { + "epoch": 0.22708840227088403, + "grad_norm": 0.6213382482528687, + "learning_rate": 1.9965627061070755e-05, + "loss": 0.702171802520752, + "step": 175 + }, + { + "epoch": 0.2283860502838605, + "grad_norm": 0.6152480840682983, + "learning_rate": 1.996443227861541e-05, + "loss": 0.8059327602386475, + "step": 176 + }, + { + "epoch": 0.229683698296837, + "grad_norm": 1.3707772493362427, + "learning_rate": 1.996321712141809e-05, + "loss": 0.6749221682548523, + "step": 177 + }, + { + "epoch": 0.23098134630981346, + "grad_norm": 0.6016313433647156, + "learning_rate": 1.9961981591963494e-05, + "loss": 0.7931903004646301, + "step": 178 + }, + { + "epoch": 0.23227899432278995, + "grad_norm": 0.6266494393348694, + "learning_rate": 1.9960725692777956e-05, + "loss": 0.7843484878540039, + "step": 179 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.6365560293197632, + "learning_rate": 1.995944942642948e-05, + "loss": 0.769256055355072, + "step": 180 + }, + { + "epoch": 0.2348742903487429, + "grad_norm": 0.5864040851593018, + "learning_rate": 1.9958152795527706e-05, + "loss": 0.7252316474914551, + "step": 181 + }, + { + "epoch": 0.23617193836171937, + "grad_norm": 0.6339318156242371, + "learning_rate": 1.9956835802723916e-05, + "loss": 0.8299843668937683, + "step": 182 + }, + { + "epoch": 0.23746958637469587, + "grad_norm": 0.5974844098091125, + "learning_rate": 1.9955498450711026e-05, + "loss": 0.7282422184944153, + "step": 183 + }, + { + "epoch": 0.23876723438767233, + "grad_norm": 0.5841022729873657, + "learning_rate": 1.9954140742223586e-05, + "loss": 0.7407736778259277, + "step": 184 + }, + { + "epoch": 0.24006488240064883, + "grad_norm": 0.6066944599151611, + "learning_rate": 1.9952762680037758e-05, + "loss": 0.7745926380157471, + "step": 185 + }, + { + "epoch": 0.2413625304136253, + "grad_norm": 0.5798110365867615, + "learning_rate": 1.995136426697134e-05, + "loss": 0.7561591863632202, + "step": 186 + }, + { + "epoch": 0.24266017842660179, + "grad_norm": 0.5705812573432922, + "learning_rate": 1.9949945505883723e-05, + "loss": 0.7066362500190735, + "step": 187 + }, + { + "epoch": 0.24395782643957825, + "grad_norm": 0.6322996020317078, + "learning_rate": 1.994850639967592e-05, + "loss": 0.8032187819480896, + "step": 188 + }, + { + "epoch": 0.24525547445255474, + "grad_norm": 0.613441526889801, + "learning_rate": 1.994704695129054e-05, + "loss": 0.75013267993927, + "step": 189 + }, + { + "epoch": 0.24655312246553124, + "grad_norm": 0.609327495098114, + "learning_rate": 1.9945567163711788e-05, + "loss": 0.7675092220306396, + "step": 190 + }, + { + "epoch": 0.2478507704785077, + "grad_norm": 0.6119315028190613, + "learning_rate": 1.9944067039965445e-05, + "loss": 0.7201006412506104, + "step": 191 + }, + { + "epoch": 0.2491484184914842, + "grad_norm": 0.5587560534477234, + "learning_rate": 1.9942546583118894e-05, + "loss": 0.7847742438316345, + "step": 192 + }, + { + "epoch": 0.25044606650446066, + "grad_norm": 0.5934576988220215, + "learning_rate": 1.994100579628108e-05, + "loss": 0.74636310338974, + "step": 193 + }, + { + "epoch": 0.25174371451743716, + "grad_norm": 0.5709709525108337, + "learning_rate": 1.9939444682602522e-05, + "loss": 0.6807436347007751, + "step": 194 + }, + { + "epoch": 0.25304136253041365, + "grad_norm": 0.6085708737373352, + "learning_rate": 1.9937863245275303e-05, + "loss": 0.7877497673034668, + "step": 195 + }, + { + "epoch": 0.2543390105433901, + "grad_norm": 0.5789342522621155, + "learning_rate": 1.9936261487533066e-05, + "loss": 0.7314412593841553, + "step": 196 + }, + { + "epoch": 0.2556366585563666, + "grad_norm": 0.5808578133583069, + "learning_rate": 1.993463941265099e-05, + "loss": 0.7081149816513062, + "step": 197 + }, + { + "epoch": 0.2569343065693431, + "grad_norm": 0.5988272428512573, + "learning_rate": 1.993299702394582e-05, + "loss": 0.718379020690918, + "step": 198 + }, + { + "epoch": 0.25823195458231957, + "grad_norm": 0.6408476829528809, + "learning_rate": 1.9931334324775817e-05, + "loss": 0.8201683163642883, + "step": 199 + }, + { + "epoch": 0.259529602595296, + "grad_norm": 0.582078218460083, + "learning_rate": 1.9929651318540783e-05, + "loss": 0.7401193380355835, + "step": 200 + }, + { + "epoch": 0.2608272506082725, + "grad_norm": 0.607105553150177, + "learning_rate": 1.9927948008682038e-05, + "loss": 0.74293053150177, + "step": 201 + }, + { + "epoch": 0.262124898621249, + "grad_norm": 0.5975603461265564, + "learning_rate": 1.9926224398682424e-05, + "loss": 0.779903769493103, + "step": 202 + }, + { + "epoch": 0.2634225466342255, + "grad_norm": 0.5534036159515381, + "learning_rate": 1.992448049206628e-05, + "loss": 0.6884838342666626, + "step": 203 + }, + { + "epoch": 0.2647201946472019, + "grad_norm": 0.610633909702301, + "learning_rate": 1.9922716292399458e-05, + "loss": 0.7174521684646606, + "step": 204 + }, + { + "epoch": 0.2660178426601784, + "grad_norm": 0.5961881279945374, + "learning_rate": 1.9920931803289302e-05, + "loss": 0.7740389108657837, + "step": 205 + }, + { + "epoch": 0.2673154906731549, + "grad_norm": 0.5700147747993469, + "learning_rate": 1.9919127028384634e-05, + "loss": 0.7351720333099365, + "step": 206 + }, + { + "epoch": 0.2686131386861314, + "grad_norm": 0.6236000061035156, + "learning_rate": 1.9917301971375767e-05, + "loss": 0.8022093772888184, + "step": 207 + }, + { + "epoch": 0.26991078669910784, + "grad_norm": 0.5870935320854187, + "learning_rate": 1.991545663599448e-05, + "loss": 0.7842336297035217, + "step": 208 + }, + { + "epoch": 0.27120843471208433, + "grad_norm": 0.6193575263023376, + "learning_rate": 1.9913591026014016e-05, + "loss": 0.7481486797332764, + "step": 209 + }, + { + "epoch": 0.2725060827250608, + "grad_norm": 0.6119521260261536, + "learning_rate": 1.9911705145249076e-05, + "loss": 0.7951152324676514, + "step": 210 + }, + { + "epoch": 0.2738037307380373, + "grad_norm": 0.5536502599716187, + "learning_rate": 1.9909798997555806e-05, + "loss": 0.790625810623169, + "step": 211 + }, + { + "epoch": 0.2751013787510138, + "grad_norm": 0.5879918336868286, + "learning_rate": 1.99078725868318e-05, + "loss": 0.7092885971069336, + "step": 212 + }, + { + "epoch": 0.27639902676399025, + "grad_norm": 0.5877639055252075, + "learning_rate": 1.9905925917016077e-05, + "loss": 0.724690318107605, + "step": 213 + }, + { + "epoch": 0.27769667477696675, + "grad_norm": 0.5909678339958191, + "learning_rate": 1.9903958992089087e-05, + "loss": 0.7642319202423096, + "step": 214 + }, + { + "epoch": 0.27899432278994324, + "grad_norm": 0.5952388644218445, + "learning_rate": 1.990197181607269e-05, + "loss": 0.7681585550308228, + "step": 215 + }, + { + "epoch": 0.28029197080291973, + "grad_norm": 0.5698040723800659, + "learning_rate": 1.989996439303016e-05, + "loss": 0.7373849153518677, + "step": 216 + }, + { + "epoch": 0.28158961881589617, + "grad_norm": 0.5865874886512756, + "learning_rate": 1.989793672706617e-05, + "loss": 0.7335535287857056, + "step": 217 + }, + { + "epoch": 0.28288726682887266, + "grad_norm": 0.6045393943786621, + "learning_rate": 1.9895888822326783e-05, + "loss": 0.7242499589920044, + "step": 218 + }, + { + "epoch": 0.28418491484184916, + "grad_norm": 0.6004535555839539, + "learning_rate": 1.9893820682999444e-05, + "loss": 0.7604917287826538, + "step": 219 + }, + { + "epoch": 0.28548256285482565, + "grad_norm": 1.119056224822998, + "learning_rate": 1.9891732313312973e-05, + "loss": 0.772226095199585, + "step": 220 + }, + { + "epoch": 0.2867802108678021, + "grad_norm": 0.5902665853500366, + "learning_rate": 1.9889623717537564e-05, + "loss": 0.7658222317695618, + "step": 221 + }, + { + "epoch": 0.2880778588807786, + "grad_norm": 0.6264858245849609, + "learning_rate": 1.9887494899984757e-05, + "loss": 0.7901877760887146, + "step": 222 + }, + { + "epoch": 0.2893755068937551, + "grad_norm": 0.5469992756843567, + "learning_rate": 1.9885345865007444e-05, + "loss": 0.7618519067764282, + "step": 223 + }, + { + "epoch": 0.29067315490673157, + "grad_norm": 0.5550391674041748, + "learning_rate": 1.9883176616999863e-05, + "loss": 0.788576602935791, + "step": 224 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 0.5628973245620728, + "learning_rate": 1.9880987160397573e-05, + "loss": 0.718231737613678, + "step": 225 + }, + { + "epoch": 0.2932684509326845, + "grad_norm": 0.5723385214805603, + "learning_rate": 1.987877749967746e-05, + "loss": 0.698378324508667, + "step": 226 + }, + { + "epoch": 0.294566098945661, + "grad_norm": 0.5784431099891663, + "learning_rate": 1.987654763935772e-05, + "loss": 0.7598991990089417, + "step": 227 + }, + { + "epoch": 0.2958637469586375, + "grad_norm": 0.5549972653388977, + "learning_rate": 1.9874297583997852e-05, + "loss": 0.7384412288665771, + "step": 228 + }, + { + "epoch": 0.2971613949716139, + "grad_norm": 0.5789146423339844, + "learning_rate": 1.9872027338198652e-05, + "loss": 0.7528890371322632, + "step": 229 + }, + { + "epoch": 0.2984590429845904, + "grad_norm": 0.6021227240562439, + "learning_rate": 1.98697369066022e-05, + "loss": 0.805375337600708, + "step": 230 + }, + { + "epoch": 0.2984590429845904, + "eval_loss": 0.7241292595863342, + "eval_runtime": 73.217, + "eval_samples_per_second": 70.913, + "eval_steps_per_second": 8.864, + "step": 230 + }, + { + "epoch": 0.2997566909975669, + "grad_norm": 0.6029407978057861, + "learning_rate": 1.986742629389184e-05, + "loss": 0.7631509900093079, + "step": 231 + }, + { + "epoch": 0.3010543390105434, + "grad_norm": 0.5768916606903076, + "learning_rate": 1.98650955047922e-05, + "loss": 0.7468521595001221, + "step": 232 + }, + { + "epoch": 0.3023519870235199, + "grad_norm": 0.550506055355072, + "learning_rate": 1.9862744544069146e-05, + "loss": 0.7611327767372131, + "step": 233 + }, + { + "epoch": 0.30364963503649633, + "grad_norm": 0.5796909332275391, + "learning_rate": 1.9860373416529804e-05, + "loss": 0.7168669700622559, + "step": 234 + }, + { + "epoch": 0.30494728304947283, + "grad_norm": 0.8639640808105469, + "learning_rate": 1.9857982127022527e-05, + "loss": 0.7404369115829468, + "step": 235 + }, + { + "epoch": 0.3062449310624493, + "grad_norm": 0.5862186551094055, + "learning_rate": 1.9855570680436896e-05, + "loss": 0.7222490310668945, + "step": 236 + }, + { + "epoch": 0.3075425790754258, + "grad_norm": 0.6011035442352295, + "learning_rate": 1.9853139081703712e-05, + "loss": 0.8068719506263733, + "step": 237 + }, + { + "epoch": 0.30884022708840225, + "grad_norm": 0.5739139318466187, + "learning_rate": 1.9850687335794974e-05, + "loss": 0.7303578853607178, + "step": 238 + }, + { + "epoch": 0.31013787510137875, + "grad_norm": 0.5833807587623596, + "learning_rate": 1.9848215447723888e-05, + "loss": 0.7608842849731445, + "step": 239 + }, + { + "epoch": 0.31143552311435524, + "grad_norm": 0.5929459929466248, + "learning_rate": 1.9845723422544834e-05, + "loss": 0.8103141188621521, + "step": 240 + }, + { + "epoch": 0.31273317112733173, + "grad_norm": 0.5728944540023804, + "learning_rate": 1.9843211265353376e-05, + "loss": 0.7196205854415894, + "step": 241 + }, + { + "epoch": 0.31403081914030817, + "grad_norm": 0.5517752170562744, + "learning_rate": 1.9840678981286237e-05, + "loss": 0.6758772730827332, + "step": 242 + }, + { + "epoch": 0.31532846715328466, + "grad_norm": 0.5443773865699768, + "learning_rate": 1.98381265755213e-05, + "loss": 0.6859534978866577, + "step": 243 + }, + { + "epoch": 0.31662611516626116, + "grad_norm": 0.5687966346740723, + "learning_rate": 1.9835554053277587e-05, + "loss": 0.7471268177032471, + "step": 244 + }, + { + "epoch": 0.31792376317923765, + "grad_norm": 0.5604870319366455, + "learning_rate": 1.9832961419815253e-05, + "loss": 0.6843122839927673, + "step": 245 + }, + { + "epoch": 0.3192214111922141, + "grad_norm": 0.5563496351242065, + "learning_rate": 1.983034868043558e-05, + "loss": 0.7023979425430298, + "step": 246 + }, + { + "epoch": 0.3205190592051906, + "grad_norm": 0.58856201171875, + "learning_rate": 1.9827715840480962e-05, + "loss": 0.826436460018158, + "step": 247 + }, + { + "epoch": 0.3218167072181671, + "grad_norm": 0.5512715578079224, + "learning_rate": 1.9825062905334883e-05, + "loss": 0.702526867389679, + "step": 248 + }, + { + "epoch": 0.32311435523114357, + "grad_norm": 0.541459858417511, + "learning_rate": 1.9822389880421927e-05, + "loss": 0.7273234128952026, + "step": 249 + }, + { + "epoch": 0.32441200324412, + "grad_norm": 0.5705904364585876, + "learning_rate": 1.9819696771207756e-05, + "loss": 0.783245325088501, + "step": 250 + }, + { + "epoch": 0.3257096512570965, + "grad_norm": 0.5666183829307556, + "learning_rate": 1.981698358319909e-05, + "loss": 0.7261844873428345, + "step": 251 + }, + { + "epoch": 0.327007299270073, + "grad_norm": 0.5902214646339417, + "learning_rate": 1.981425032194372e-05, + "loss": 0.7943121194839478, + "step": 252 + }, + { + "epoch": 0.3283049472830495, + "grad_norm": 0.6048629879951477, + "learning_rate": 1.981149699303047e-05, + "loss": 0.7712939381599426, + "step": 253 + }, + { + "epoch": 0.329602595296026, + "grad_norm": 0.5914484858512878, + "learning_rate": 1.9808723602089198e-05, + "loss": 0.7921222448348999, + "step": 254 + }, + { + "epoch": 0.3309002433090024, + "grad_norm": 0.5761268734931946, + "learning_rate": 1.980593015479079e-05, + "loss": 0.7280013561248779, + "step": 255 + }, + { + "epoch": 0.3321978913219789, + "grad_norm": 0.5902722477912903, + "learning_rate": 1.9803116656847136e-05, + "loss": 0.8062602877616882, + "step": 256 + }, + { + "epoch": 0.3334955393349554, + "grad_norm": 0.5620178580284119, + "learning_rate": 1.9800283114011134e-05, + "loss": 0.7278565168380737, + "step": 257 + }, + { + "epoch": 0.3347931873479319, + "grad_norm": 0.5686838626861572, + "learning_rate": 1.9797429532076652e-05, + "loss": 0.7540629506111145, + "step": 258 + }, + { + "epoch": 0.33609083536090834, + "grad_norm": 0.5724810361862183, + "learning_rate": 1.9794555916878548e-05, + "loss": 0.8088860511779785, + "step": 259 + }, + { + "epoch": 0.33738848337388483, + "grad_norm": 0.5640983581542969, + "learning_rate": 1.9791662274292638e-05, + "loss": 0.7638871669769287, + "step": 260 + }, + { + "epoch": 0.3386861313868613, + "grad_norm": 0.5784658193588257, + "learning_rate": 1.978874861023569e-05, + "loss": 0.7313830852508545, + "step": 261 + }, + { + "epoch": 0.3399837793998378, + "grad_norm": 0.5539552569389343, + "learning_rate": 1.9785814930665404e-05, + "loss": 0.7729085683822632, + "step": 262 + }, + { + "epoch": 0.34128142741281425, + "grad_norm": 0.561370849609375, + "learning_rate": 1.9782861241580417e-05, + "loss": 0.6871550679206848, + "step": 263 + }, + { + "epoch": 0.34257907542579075, + "grad_norm": 0.5643728375434875, + "learning_rate": 1.9779887549020273e-05, + "loss": 0.7683601379394531, + "step": 264 + }, + { + "epoch": 0.34387672343876724, + "grad_norm": 0.5431486964225769, + "learning_rate": 1.9776893859065424e-05, + "loss": 0.7228385210037231, + "step": 265 + }, + { + "epoch": 0.34517437145174373, + "grad_norm": 0.5863342881202698, + "learning_rate": 1.9773880177837202e-05, + "loss": 0.7906335592269897, + "step": 266 + }, + { + "epoch": 0.34647201946472017, + "grad_norm": 0.5614317655563354, + "learning_rate": 1.9770846511497833e-05, + "loss": 0.7299401164054871, + "step": 267 + }, + { + "epoch": 0.34776966747769666, + "grad_norm": 0.5694175958633423, + "learning_rate": 1.9767792866250386e-05, + "loss": 0.7474102973937988, + "step": 268 + }, + { + "epoch": 0.34906731549067316, + "grad_norm": 0.5707114934921265, + "learning_rate": 1.97647192483388e-05, + "loss": 0.7324154376983643, + "step": 269 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 0.5364754796028137, + "learning_rate": 1.976162566404784e-05, + "loss": 0.6927608251571655, + "step": 270 + }, + { + "epoch": 0.3516626115166261, + "grad_norm": 0.6064906120300293, + "learning_rate": 1.9758512119703106e-05, + "loss": 0.7652560472488403, + "step": 271 + }, + { + "epoch": 0.3529602595296026, + "grad_norm": 0.5919526815414429, + "learning_rate": 1.9755378621671006e-05, + "loss": 0.7977138757705688, + "step": 272 + }, + { + "epoch": 0.3542579075425791, + "grad_norm": 0.567382276058197, + "learning_rate": 1.9752225176358757e-05, + "loss": 0.7258316278457642, + "step": 273 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.5770947337150574, + "learning_rate": 1.974905179021435e-05, + "loss": 0.7411879301071167, + "step": 274 + }, + { + "epoch": 0.35685320356853206, + "grad_norm": 0.5705130100250244, + "learning_rate": 1.9745858469726555e-05, + "loss": 0.7439219951629639, + "step": 275 + }, + { + "epoch": 0.3581508515815085, + "grad_norm": 0.5373214483261108, + "learning_rate": 1.9742645221424905e-05, + "loss": 0.6836246252059937, + "step": 276 + }, + { + "epoch": 0.359448499594485, + "grad_norm": 0.596576988697052, + "learning_rate": 1.9739412051879686e-05, + "loss": 0.6741154789924622, + "step": 277 + }, + { + "epoch": 0.3607461476074615, + "grad_norm": 0.5719678997993469, + "learning_rate": 1.973615896770191e-05, + "loss": 0.7447401881217957, + "step": 278 + }, + { + "epoch": 0.362043795620438, + "grad_norm": 0.5882077813148499, + "learning_rate": 1.97328859755433e-05, + "loss": 0.7762616872787476, + "step": 279 + }, + { + "epoch": 0.3633414436334144, + "grad_norm": 0.6879026293754578, + "learning_rate": 1.972959308209631e-05, + "loss": 0.7956463098526001, + "step": 280 + }, + { + "epoch": 0.3646390916463909, + "grad_norm": 0.5789086222648621, + "learning_rate": 1.9726280294094067e-05, + "loss": 0.7541590929031372, + "step": 281 + }, + { + "epoch": 0.3659367396593674, + "grad_norm": 0.5802841186523438, + "learning_rate": 1.9722947618310384e-05, + "loss": 0.7047423124313354, + "step": 282 + }, + { + "epoch": 0.3672343876723439, + "grad_norm": 0.5507220029830933, + "learning_rate": 1.9719595061559742e-05, + "loss": 0.6714630722999573, + "step": 283 + }, + { + "epoch": 0.36853203568532034, + "grad_norm": 0.5980960726737976, + "learning_rate": 1.9716222630697266e-05, + "loss": 0.7872920036315918, + "step": 284 + }, + { + "epoch": 0.36982968369829683, + "grad_norm": 0.5855656266212463, + "learning_rate": 1.971283033261873e-05, + "loss": 0.7662516832351685, + "step": 285 + }, + { + "epoch": 0.3711273317112733, + "grad_norm": 0.5851466655731201, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.7596746683120728, + "step": 286 + }, + { + "epoch": 0.3724249797242498, + "grad_norm": 0.5843831300735474, + "learning_rate": 1.9705986162599642e-05, + "loss": 0.7550405263900757, + "step": 287 + }, + { + "epoch": 0.37372262773722625, + "grad_norm": 0.5846932530403137, + "learning_rate": 1.9702534304653685e-05, + "loss": 0.7254443764686584, + "step": 288 + }, + { + "epoch": 0.37502027575020275, + "grad_norm": 0.6054766774177551, + "learning_rate": 1.9699062607480827e-05, + "loss": 0.7600511908531189, + "step": 289 + }, + { + "epoch": 0.37631792376317924, + "grad_norm": 0.5703001618385315, + "learning_rate": 1.969557107817981e-05, + "loss": 0.7401167750358582, + "step": 290 + }, + { + "epoch": 0.37761557177615573, + "grad_norm": 0.5855723023414612, + "learning_rate": 1.9692059723889927e-05, + "loss": 0.7476931214332581, + "step": 291 + }, + { + "epoch": 0.37891321978913217, + "grad_norm": 0.5804258584976196, + "learning_rate": 1.968852855179101e-05, + "loss": 0.7656409740447998, + "step": 292 + }, + { + "epoch": 0.38021086780210867, + "grad_norm": 0.5795084834098816, + "learning_rate": 1.9684977569103415e-05, + "loss": 0.7599056959152222, + "step": 293 + }, + { + "epoch": 0.38150851581508516, + "grad_norm": 0.5684756636619568, + "learning_rate": 1.9681406783087998e-05, + "loss": 0.674816370010376, + "step": 294 + }, + { + "epoch": 0.38280616382806165, + "grad_norm": 0.5463794469833374, + "learning_rate": 1.9677816201046113e-05, + "loss": 0.683580219745636, + "step": 295 + }, + { + "epoch": 0.38410381184103815, + "grad_norm": 0.5722465515136719, + "learning_rate": 1.9674205830319594e-05, + "loss": 0.693361222743988, + "step": 296 + }, + { + "epoch": 0.3854014598540146, + "grad_norm": 0.6253486275672913, + "learning_rate": 1.9670575678290732e-05, + "loss": 0.7917322516441345, + "step": 297 + }, + { + "epoch": 0.3866991078669911, + "grad_norm": 0.5660127401351929, + "learning_rate": 1.9666925752382275e-05, + "loss": 0.7436933517456055, + "step": 298 + }, + { + "epoch": 0.38799675587996757, + "grad_norm": 0.572499692440033, + "learning_rate": 1.9663256060057395e-05, + "loss": 0.6714681386947632, + "step": 299 + }, + { + "epoch": 0.38929440389294406, + "grad_norm": 0.5779220461845398, + "learning_rate": 1.9659566608819677e-05, + "loss": 0.7252252697944641, + "step": 300 + }, + { + "epoch": 0.3905920519059205, + "grad_norm": 0.5990428924560547, + "learning_rate": 1.9655857406213124e-05, + "loss": 0.7827754020690918, + "step": 301 + }, + { + "epoch": 0.391889699918897, + "grad_norm": 0.5721242427825928, + "learning_rate": 1.9652128459822113e-05, + "loss": 0.7102577686309814, + "step": 302 + }, + { + "epoch": 0.3931873479318735, + "grad_norm": 0.5870105028152466, + "learning_rate": 1.9648379777271397e-05, + "loss": 0.683538019657135, + "step": 303 + }, + { + "epoch": 0.39448499594485, + "grad_norm": 0.5920274257659912, + "learning_rate": 1.964461136622608e-05, + "loss": 0.7541404366493225, + "step": 304 + }, + { + "epoch": 0.3957826439578264, + "grad_norm": 0.5439295768737793, + "learning_rate": 1.9640823234391614e-05, + "loss": 0.675430417060852, + "step": 305 + }, + { + "epoch": 0.3970802919708029, + "grad_norm": 0.6126630902290344, + "learning_rate": 1.9637015389513765e-05, + "loss": 0.7898478507995605, + "step": 306 + }, + { + "epoch": 0.3983779399837794, + "grad_norm": 0.5664204359054565, + "learning_rate": 1.963318783937861e-05, + "loss": 0.6964154839515686, + "step": 307 + }, + { + "epoch": 0.3996755879967559, + "grad_norm": 0.5839046239852905, + "learning_rate": 1.962934059181253e-05, + "loss": 0.7421650886535645, + "step": 308 + }, + { + "epoch": 0.40097323600973234, + "grad_norm": 0.6044719815254211, + "learning_rate": 1.962547365468216e-05, + "loss": 0.7794229984283447, + "step": 309 + }, + { + "epoch": 0.40227088402270883, + "grad_norm": 0.5989699363708496, + "learning_rate": 1.962158703589442e-05, + "loss": 0.6963369846343994, + "step": 310 + }, + { + "epoch": 0.4035685320356853, + "grad_norm": 0.5891120433807373, + "learning_rate": 1.9617680743396452e-05, + "loss": 0.7737009525299072, + "step": 311 + }, + { + "epoch": 0.4048661800486618, + "grad_norm": 0.5753238201141357, + "learning_rate": 1.961375478517564e-05, + "loss": 0.6912685632705688, + "step": 312 + }, + { + "epoch": 0.40616382806163825, + "grad_norm": 0.6656221747398376, + "learning_rate": 1.9609809169259573e-05, + "loss": 0.7757899165153503, + "step": 313 + }, + { + "epoch": 0.40746147607461475, + "grad_norm": 0.6444079875946045, + "learning_rate": 1.960584390371604e-05, + "loss": 0.7399554252624512, + "step": 314 + }, + { + "epoch": 0.40875912408759124, + "grad_norm": 0.5455271601676941, + "learning_rate": 1.9601858996653004e-05, + "loss": 0.7261430025100708, + "step": 315 + }, + { + "epoch": 0.41005677210056773, + "grad_norm": 0.5660345554351807, + "learning_rate": 1.9597854456218588e-05, + "loss": 0.7287646532058716, + "step": 316 + }, + { + "epoch": 0.41135442011354423, + "grad_norm": 0.5909862518310547, + "learning_rate": 1.9593830290601067e-05, + "loss": 0.7831040620803833, + "step": 317 + }, + { + "epoch": 0.41265206812652067, + "grad_norm": 0.5852524638175964, + "learning_rate": 1.9589786508028842e-05, + "loss": 0.7229428291320801, + "step": 318 + }, + { + "epoch": 0.41394971613949716, + "grad_norm": 0.5916611552238464, + "learning_rate": 1.9585723116770425e-05, + "loss": 0.7438414692878723, + "step": 319 + }, + { + "epoch": 0.41524736415247365, + "grad_norm": 0.5859969854354858, + "learning_rate": 1.9581640125134415e-05, + "loss": 0.7692857384681702, + "step": 320 + }, + { + "epoch": 0.41654501216545015, + "grad_norm": 0.5748182535171509, + "learning_rate": 1.9577537541469506e-05, + "loss": 0.7208437919616699, + "step": 321 + }, + { + "epoch": 0.4178426601784266, + "grad_norm": 0.5739149451255798, + "learning_rate": 1.957341537416444e-05, + "loss": 0.6877571940422058, + "step": 322 + }, + { + "epoch": 0.4191403081914031, + "grad_norm": 0.6014899611473083, + "learning_rate": 1.9569273631648005e-05, + "loss": 0.7482254505157471, + "step": 323 + }, + { + "epoch": 0.42043795620437957, + "grad_norm": 0.5997340679168701, + "learning_rate": 1.9565112322389017e-05, + "loss": 0.735174298286438, + "step": 324 + }, + { + "epoch": 0.42173560421735606, + "grad_norm": 0.572567343711853, + "learning_rate": 1.95609314548963e-05, + "loss": 0.7159808874130249, + "step": 325 + }, + { + "epoch": 0.4230332522303325, + "grad_norm": 0.5567170977592468, + "learning_rate": 1.955673103771867e-05, + "loss": 0.6460487842559814, + "step": 326 + }, + { + "epoch": 0.424330900243309, + "grad_norm": 0.570945143699646, + "learning_rate": 1.9552511079444914e-05, + "loss": 0.780687689781189, + "step": 327 + }, + { + "epoch": 0.4256285482562855, + "grad_norm": 0.5721143484115601, + "learning_rate": 1.9548271588703783e-05, + "loss": 0.7781848907470703, + "step": 328 + }, + { + "epoch": 0.426926196269262, + "grad_norm": 0.5866307616233826, + "learning_rate": 1.954401257416396e-05, + "loss": 0.6634104251861572, + "step": 329 + }, + { + "epoch": 0.4282238442822384, + "grad_norm": 0.575668215751648, + "learning_rate": 1.9539734044534057e-05, + "loss": 0.7831740379333496, + "step": 330 + }, + { + "epoch": 0.4295214922952149, + "grad_norm": 0.5764342546463013, + "learning_rate": 1.9535436008562576e-05, + "loss": 0.7253679037094116, + "step": 331 + }, + { + "epoch": 0.4308191403081914, + "grad_norm": 0.5597108006477356, + "learning_rate": 1.9531118475037916e-05, + "loss": 0.6709398627281189, + "step": 332 + }, + { + "epoch": 0.4321167883211679, + "grad_norm": 0.595028817653656, + "learning_rate": 1.9526781452788342e-05, + "loss": 0.7365997433662415, + "step": 333 + }, + { + "epoch": 0.43341443633414434, + "grad_norm": 0.5742825865745544, + "learning_rate": 1.9522424950681964e-05, + "loss": 0.7389061450958252, + "step": 334 + }, + { + "epoch": 0.43471208434712083, + "grad_norm": 0.55686354637146, + "learning_rate": 1.951804897762673e-05, + "loss": 0.6932294964790344, + "step": 335 + }, + { + "epoch": 0.4360097323600973, + "grad_norm": 0.6195898652076721, + "learning_rate": 1.951365354257039e-05, + "loss": 0.689919114112854, + "step": 336 + }, + { + "epoch": 0.4373073803730738, + "grad_norm": 0.5357776284217834, + "learning_rate": 1.9509238654500505e-05, + "loss": 0.6890056133270264, + "step": 337 + }, + { + "epoch": 0.4386050283860503, + "grad_norm": 0.563254177570343, + "learning_rate": 1.95048043224444e-05, + "loss": 0.7118027806282043, + "step": 338 + }, + { + "epoch": 0.43990267639902675, + "grad_norm": 0.5649257302284241, + "learning_rate": 1.9500350555469164e-05, + "loss": 0.7314987182617188, + "step": 339 + }, + { + "epoch": 0.44120032441200324, + "grad_norm": 0.5675091743469238, + "learning_rate": 1.9495877362681613e-05, + "loss": 0.6302130222320557, + "step": 340 + }, + { + "epoch": 0.44249797242497974, + "grad_norm": 0.5489922761917114, + "learning_rate": 1.9491384753228308e-05, + "loss": 0.7357535362243652, + "step": 341 + }, + { + "epoch": 0.44379562043795623, + "grad_norm": 0.5530965924263, + "learning_rate": 1.948687273629549e-05, + "loss": 0.6449010372161865, + "step": 342 + }, + { + "epoch": 0.44509326845093267, + "grad_norm": 0.5747541189193726, + "learning_rate": 1.9482341321109096e-05, + "loss": 0.7252374887466431, + "step": 343 + }, + { + "epoch": 0.44639091646390916, + "grad_norm": 0.5609497427940369, + "learning_rate": 1.947779051693472e-05, + "loss": 0.7096484899520874, + "step": 344 + }, + { + "epoch": 0.44768856447688565, + "grad_norm": 0.5988261699676514, + "learning_rate": 1.9473220333077604e-05, + "loss": 0.7986630201339722, + "step": 345 + }, + { + "epoch": 0.44898621248986215, + "grad_norm": 0.6313751935958862, + "learning_rate": 1.946863077888262e-05, + "loss": 0.8356250524520874, + "step": 346 + }, + { + "epoch": 0.4502838605028386, + "grad_norm": 0.565196692943573, + "learning_rate": 1.946402186373424e-05, + "loss": 0.7527079582214355, + "step": 347 + }, + { + "epoch": 0.4515815085158151, + "grad_norm": 0.5944785475730896, + "learning_rate": 1.9459393597056536e-05, + "loss": 0.6996445655822754, + "step": 348 + }, + { + "epoch": 0.45287915652879157, + "grad_norm": 0.5384091734886169, + "learning_rate": 1.9454745988313135e-05, + "loss": 0.7005808353424072, + "step": 349 + }, + { + "epoch": 0.45417680454176806, + "grad_norm": 0.5926419496536255, + "learning_rate": 1.945007904700723e-05, + "loss": 0.7360185384750366, + "step": 350 + }, + { + "epoch": 0.4554744525547445, + "grad_norm": 0.5517107844352722, + "learning_rate": 1.9445392782681523e-05, + "loss": 0.6678152084350586, + "step": 351 + }, + { + "epoch": 0.456772100567721, + "grad_norm": 0.5527735352516174, + "learning_rate": 1.9440687204918245e-05, + "loss": 0.719680666923523, + "step": 352 + }, + { + "epoch": 0.4580697485806975, + "grad_norm": 0.5603200793266296, + "learning_rate": 1.943596232333911e-05, + "loss": 0.7023108005523682, + "step": 353 + }, + { + "epoch": 0.459367396593674, + "grad_norm": 0.5883275866508484, + "learning_rate": 1.9431218147605307e-05, + "loss": 0.7870659232139587, + "step": 354 + }, + { + "epoch": 0.4606650446066504, + "grad_norm": 0.5547419786453247, + "learning_rate": 1.9426454687417474e-05, + "loss": 0.693616509437561, + "step": 355 + }, + { + "epoch": 0.4619626926196269, + "grad_norm": 0.5387628674507141, + "learning_rate": 1.942167195251568e-05, + "loss": 0.6275761127471924, + "step": 356 + }, + { + "epoch": 0.4632603406326034, + "grad_norm": 0.5728762745857239, + "learning_rate": 1.941686995267941e-05, + "loss": 0.7649428844451904, + "step": 357 + }, + { + "epoch": 0.4645579886455799, + "grad_norm": 0.5744031667709351, + "learning_rate": 1.941204869772753e-05, + "loss": 0.746831476688385, + "step": 358 + }, + { + "epoch": 0.4658556366585564, + "grad_norm": 0.5453589558601379, + "learning_rate": 1.9407208197518296e-05, + "loss": 0.7251806259155273, + "step": 359 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 0.5643113851547241, + "learning_rate": 1.94023484619493e-05, + "loss": 0.6882834434509277, + "step": 360 + }, + { + "epoch": 0.4684509326845093, + "grad_norm": 0.5984339714050293, + "learning_rate": 1.9397469500957478e-05, + "loss": 0.7512071132659912, + "step": 361 + }, + { + "epoch": 0.4697485806974858, + "grad_norm": 0.5487557649612427, + "learning_rate": 1.939257132451906e-05, + "loss": 0.7803584337234497, + "step": 362 + }, + { + "epoch": 0.4710462287104623, + "grad_norm": 0.5798037648200989, + "learning_rate": 1.9387653942649586e-05, + "loss": 0.7196419835090637, + "step": 363 + }, + { + "epoch": 0.47234387672343875, + "grad_norm": 0.5554172396659851, + "learning_rate": 1.9382717365403854e-05, + "loss": 0.7393349409103394, + "step": 364 + }, + { + "epoch": 0.47364152473641524, + "grad_norm": 0.546137273311615, + "learning_rate": 1.9377761602875913e-05, + "loss": 0.7212686538696289, + "step": 365 + }, + { + "epoch": 0.47493917274939174, + "grad_norm": 0.5687487125396729, + "learning_rate": 1.937278666519905e-05, + "loss": 0.7769354581832886, + "step": 366 + }, + { + "epoch": 0.47623682076236823, + "grad_norm": 0.5400050282478333, + "learning_rate": 1.9367792562545744e-05, + "loss": 0.721081018447876, + "step": 367 + }, + { + "epoch": 0.47753446877534467, + "grad_norm": 0.5545980930328369, + "learning_rate": 1.9362779305127674e-05, + "loss": 0.6797982454299927, + "step": 368 + }, + { + "epoch": 0.47883211678832116, + "grad_norm": 0.5371907949447632, + "learning_rate": 1.9357746903195686e-05, + "loss": 0.7223237752914429, + "step": 369 + }, + { + "epoch": 0.48012976480129765, + "grad_norm": 0.534491240978241, + "learning_rate": 1.9352695367039764e-05, + "loss": 0.7010591626167297, + "step": 370 + }, + { + "epoch": 0.48142741281427415, + "grad_norm": 0.5431662797927856, + "learning_rate": 1.9347624706989026e-05, + "loss": 0.7298872470855713, + "step": 371 + }, + { + "epoch": 0.4827250608272506, + "grad_norm": 0.5843503475189209, + "learning_rate": 1.9342534933411683e-05, + "loss": 0.7810012698173523, + "step": 372 + }, + { + "epoch": 0.4840227088402271, + "grad_norm": 0.5278732776641846, + "learning_rate": 1.9337426056715036e-05, + "loss": 0.7204632759094238, + "step": 373 + }, + { + "epoch": 0.48532035685320357, + "grad_norm": 0.5900875926017761, + "learning_rate": 1.9332298087345447e-05, + "loss": 0.7081923484802246, + "step": 374 + }, + { + "epoch": 0.48661800486618007, + "grad_norm": 0.5549632906913757, + "learning_rate": 1.932715103578831e-05, + "loss": 0.7588300704956055, + "step": 375 + }, + { + "epoch": 0.4879156528791565, + "grad_norm": 0.5351032018661499, + "learning_rate": 1.9321984912568048e-05, + "loss": 0.6380345821380615, + "step": 376 + }, + { + "epoch": 0.489213300892133, + "grad_norm": 0.5553699135780334, + "learning_rate": 1.9316799728248074e-05, + "loss": 0.7115924954414368, + "step": 377 + }, + { + "epoch": 0.4905109489051095, + "grad_norm": 0.5904532670974731, + "learning_rate": 1.9311595493430776e-05, + "loss": 0.7918650507926941, + "step": 378 + }, + { + "epoch": 0.491808596918086, + "grad_norm": 0.5718861818313599, + "learning_rate": 1.93063722187575e-05, + "loss": 0.7574873566627502, + "step": 379 + }, + { + "epoch": 0.4931062449310625, + "grad_norm": 0.5575288534164429, + "learning_rate": 1.9301129914908516e-05, + "loss": 0.7619529962539673, + "step": 380 + }, + { + "epoch": 0.4944038929440389, + "grad_norm": 0.5972062945365906, + "learning_rate": 1.9295868592603012e-05, + "loss": 0.8739205598831177, + "step": 381 + }, + { + "epoch": 0.4957015409570154, + "grad_norm": 0.5725207328796387, + "learning_rate": 1.929058826259906e-05, + "loss": 0.7461530566215515, + "step": 382 + }, + { + "epoch": 0.4969991889699919, + "grad_norm": 0.7559300065040588, + "learning_rate": 1.9285288935693597e-05, + "loss": 0.7054376602172852, + "step": 383 + }, + { + "epoch": 0.4982968369829684, + "grad_norm": 0.5533690452575684, + "learning_rate": 1.9279970622722403e-05, + "loss": 0.742769718170166, + "step": 384 + }, + { + "epoch": 0.49959448499594483, + "grad_norm": 0.5702188014984131, + "learning_rate": 1.927463333456009e-05, + "loss": 0.7912020683288574, + "step": 385 + }, + { + "epoch": 0.5008921330089213, + "grad_norm": 0.5261266231536865, + "learning_rate": 1.9269277082120053e-05, + "loss": 0.7539711594581604, + "step": 386 + }, + { + "epoch": 0.5021897810218978, + "grad_norm": 0.5590584874153137, + "learning_rate": 1.926390187635448e-05, + "loss": 0.7646081447601318, + "step": 387 + }, + { + "epoch": 0.5034874290348743, + "grad_norm": 0.5796819925308228, + "learning_rate": 1.92585077282543e-05, + "loss": 0.7352266907691956, + "step": 388 + }, + { + "epoch": 0.5047850770478508, + "grad_norm": 0.5712133049964905, + "learning_rate": 1.9253094648849183e-05, + "loss": 0.7203606367111206, + "step": 389 + }, + { + "epoch": 0.5060827250608273, + "grad_norm": 0.597654402256012, + "learning_rate": 1.924766264920751e-05, + "loss": 0.8121019601821899, + "step": 390 + }, + { + "epoch": 0.5073803730738037, + "grad_norm": 0.5626549124717712, + "learning_rate": 1.9242211740436335e-05, + "loss": 0.7297658920288086, + "step": 391 + }, + { + "epoch": 0.5086780210867802, + "grad_norm": 0.6014045476913452, + "learning_rate": 1.9236741933681396e-05, + "loss": 0.7325990200042725, + "step": 392 + }, + { + "epoch": 0.5099756690997567, + "grad_norm": 0.5554893612861633, + "learning_rate": 1.9231253240127062e-05, + "loss": 0.680641770362854, + "step": 393 + }, + { + "epoch": 0.5112733171127332, + "grad_norm": 0.5787703394889832, + "learning_rate": 1.922574567099632e-05, + "loss": 0.7252123355865479, + "step": 394 + }, + { + "epoch": 0.5125709651257097, + "grad_norm": 0.5811824798583984, + "learning_rate": 1.9220219237550757e-05, + "loss": 0.7139418125152588, + "step": 395 + }, + { + "epoch": 0.5138686131386861, + "grad_norm": 0.547007143497467, + "learning_rate": 1.921467395109053e-05, + "loss": 0.6985068917274475, + "step": 396 + }, + { + "epoch": 0.5151662611516626, + "grad_norm": 0.6072813272476196, + "learning_rate": 1.9209109822954345e-05, + "loss": 0.7519763708114624, + "step": 397 + }, + { + "epoch": 0.5164639091646391, + "grad_norm": 0.5965511798858643, + "learning_rate": 1.9203526864519432e-05, + "loss": 0.7568516135215759, + "step": 398 + }, + { + "epoch": 0.5177615571776155, + "grad_norm": 0.5627179741859436, + "learning_rate": 1.919792508720154e-05, + "loss": 0.7021974921226501, + "step": 399 + }, + { + "epoch": 0.519059205190592, + "grad_norm": 0.5491631627082825, + "learning_rate": 1.9192304502454876e-05, + "loss": 0.6992515325546265, + "step": 400 + }, + { + "epoch": 0.5203568532035685, + "grad_norm": 0.5874002575874329, + "learning_rate": 1.918666512177211e-05, + "loss": 0.712739109992981, + "step": 401 + }, + { + "epoch": 0.521654501216545, + "grad_norm": 0.5660138726234436, + "learning_rate": 1.918100695668436e-05, + "loss": 0.6854047775268555, + "step": 402 + }, + { + "epoch": 0.5229521492295215, + "grad_norm": 0.565985381603241, + "learning_rate": 1.917533001876113e-05, + "loss": 0.7300174236297607, + "step": 403 + }, + { + "epoch": 0.524249797242498, + "grad_norm": 0.5489518642425537, + "learning_rate": 1.916963431961033e-05, + "loss": 0.7667282819747925, + "step": 404 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 0.569230318069458, + "learning_rate": 1.916391987087822e-05, + "loss": 0.7247310876846313, + "step": 405 + }, + { + "epoch": 0.526845093268451, + "grad_norm": 0.5969386696815491, + "learning_rate": 1.9158186684249397e-05, + "loss": 0.7719178199768066, + "step": 406 + }, + { + "epoch": 0.5281427412814275, + "grad_norm": 0.5550801157951355, + "learning_rate": 1.9152434771446783e-05, + "loss": 0.6853774785995483, + "step": 407 + }, + { + "epoch": 0.5294403892944038, + "grad_norm": 0.5440778136253357, + "learning_rate": 1.914666414423158e-05, + "loss": 0.681282639503479, + "step": 408 + }, + { + "epoch": 0.5307380373073803, + "grad_norm": 0.5368308424949646, + "learning_rate": 1.914087481440326e-05, + "loss": 0.7318757772445679, + "step": 409 + }, + { + "epoch": 0.5320356853203568, + "grad_norm": 0.6122865676879883, + "learning_rate": 1.9135066793799538e-05, + "loss": 0.6974803805351257, + "step": 410 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.5386953353881836, + "learning_rate": 1.912924009429635e-05, + "loss": 0.7397326827049255, + "step": 411 + }, + { + "epoch": 0.5346309813463098, + "grad_norm": 0.5616509914398193, + "learning_rate": 1.9123394727807816e-05, + "loss": 0.7613886594772339, + "step": 412 + }, + { + "epoch": 0.5359286293592863, + "grad_norm": 0.627604067325592, + "learning_rate": 1.9117530706286232e-05, + "loss": 0.7783684730529785, + "step": 413 + }, + { + "epoch": 0.5372262773722628, + "grad_norm": 0.5613445043563843, + "learning_rate": 1.9111648041722044e-05, + "loss": 0.7296919226646423, + "step": 414 + }, + { + "epoch": 0.5385239253852393, + "grad_norm": 0.5356356501579285, + "learning_rate": 1.91057467461438e-05, + "loss": 0.7119168639183044, + "step": 415 + }, + { + "epoch": 0.5398215733982157, + "grad_norm": 0.5709317326545715, + "learning_rate": 1.9099826831618168e-05, + "loss": 0.6891450881958008, + "step": 416 + }, + { + "epoch": 0.5411192214111922, + "grad_norm": 0.5525058507919312, + "learning_rate": 1.909388831024987e-05, + "loss": 0.7220831513404846, + "step": 417 + }, + { + "epoch": 0.5424168694241687, + "grad_norm": 0.5916740894317627, + "learning_rate": 1.908793119418168e-05, + "loss": 0.7380563020706177, + "step": 418 + }, + { + "epoch": 0.5437145174371452, + "grad_norm": 0.5553448796272278, + "learning_rate": 1.9081955495594388e-05, + "loss": 0.6854832172393799, + "step": 419 + }, + { + "epoch": 0.5450121654501217, + "grad_norm": 0.550918459892273, + "learning_rate": 1.9075961226706784e-05, + "loss": 0.755254864692688, + "step": 420 + }, + { + "epoch": 0.5463098134630981, + "grad_norm": 0.5704249739646912, + "learning_rate": 1.906994839977564e-05, + "loss": 0.762306272983551, + "step": 421 + }, + { + "epoch": 0.5476074614760746, + "grad_norm": 0.5444906949996948, + "learning_rate": 1.9063917027095664e-05, + "loss": 0.7424022555351257, + "step": 422 + }, + { + "epoch": 0.5489051094890511, + "grad_norm": 0.5842110514640808, + "learning_rate": 1.905786712099948e-05, + "loss": 0.7851117849349976, + "step": 423 + }, + { + "epoch": 0.5502027575020276, + "grad_norm": 0.5527293086051941, + "learning_rate": 1.9051798693857617e-05, + "loss": 0.7389935255050659, + "step": 424 + }, + { + "epoch": 0.551500405515004, + "grad_norm": 0.5890975594520569, + "learning_rate": 1.904571175807848e-05, + "loss": 0.7679333686828613, + "step": 425 + }, + { + "epoch": 0.5527980535279805, + "grad_norm": 0.5342135429382324, + "learning_rate": 1.9039606326108297e-05, + "loss": 0.7123668193817139, + "step": 426 + }, + { + "epoch": 0.554095701540957, + "grad_norm": 0.5628570914268494, + "learning_rate": 1.903348241043114e-05, + "loss": 0.7286348342895508, + "step": 427 + }, + { + "epoch": 0.5553933495539335, + "grad_norm": 0.5398725867271423, + "learning_rate": 1.902734002356887e-05, + "loss": 0.7192749977111816, + "step": 428 + }, + { + "epoch": 0.55669099756691, + "grad_norm": 0.5142056941986084, + "learning_rate": 1.9021179178081107e-05, + "loss": 0.6286910772323608, + "step": 429 + }, + { + "epoch": 0.5579886455798865, + "grad_norm": 0.5470032095909119, + "learning_rate": 1.9014999886565226e-05, + "loss": 0.6505739092826843, + "step": 430 + }, + { + "epoch": 0.559286293592863, + "grad_norm": 0.5600834488868713, + "learning_rate": 1.9008802161656308e-05, + "loss": 0.7014046907424927, + "step": 431 + }, + { + "epoch": 0.5605839416058395, + "grad_norm": 0.5533670783042908, + "learning_rate": 1.9002586016027136e-05, + "loss": 0.7095932364463806, + "step": 432 + }, + { + "epoch": 0.5618815896188158, + "grad_norm": 0.5443385243415833, + "learning_rate": 1.8996351462388153e-05, + "loss": 0.7492538094520569, + "step": 433 + }, + { + "epoch": 0.5631792376317923, + "grad_norm": 0.5775622129440308, + "learning_rate": 1.8990098513487447e-05, + "loss": 0.7882871627807617, + "step": 434 + }, + { + "epoch": 0.5644768856447688, + "grad_norm": 0.5645557045936584, + "learning_rate": 1.898382718211071e-05, + "loss": 0.6681729555130005, + "step": 435 + }, + { + "epoch": 0.5657745336577453, + "grad_norm": 0.562117874622345, + "learning_rate": 1.897753748108123e-05, + "loss": 0.7754248380661011, + "step": 436 + }, + { + "epoch": 0.5670721816707218, + "grad_norm": 0.5395199656486511, + "learning_rate": 1.8971229423259855e-05, + "loss": 0.6584359407424927, + "step": 437 + }, + { + "epoch": 0.5683698296836983, + "grad_norm": 0.5511093735694885, + "learning_rate": 1.8964903021544964e-05, + "loss": 0.7121752500534058, + "step": 438 + }, + { + "epoch": 0.5696674776966748, + "grad_norm": 0.5518468022346497, + "learning_rate": 1.895855828887245e-05, + "loss": 0.7533795237541199, + "step": 439 + }, + { + "epoch": 0.5709651257096513, + "grad_norm": 0.541132926940918, + "learning_rate": 1.895219523821568e-05, + "loss": 0.6961894035339355, + "step": 440 + }, + { + "epoch": 0.5722627737226277, + "grad_norm": 0.5566806197166443, + "learning_rate": 1.894581388258549e-05, + "loss": 0.7168055176734924, + "step": 441 + }, + { + "epoch": 0.5735604217356042, + "grad_norm": 0.8438438773155212, + "learning_rate": 1.8939414235030137e-05, + "loss": 0.7322010397911072, + "step": 442 + }, + { + "epoch": 0.5748580697485807, + "grad_norm": 0.5508759617805481, + "learning_rate": 1.893299630863527e-05, + "loss": 0.689163327217102, + "step": 443 + }, + { + "epoch": 0.5761557177615572, + "grad_norm": 0.577190637588501, + "learning_rate": 1.892656011652393e-05, + "loss": 0.7421369552612305, + "step": 444 + }, + { + "epoch": 0.5774533657745337, + "grad_norm": 0.5557067394256592, + "learning_rate": 1.8920105671856507e-05, + "loss": 0.6984370350837708, + "step": 445 + }, + { + "epoch": 0.5787510137875101, + "grad_norm": 0.5880769491195679, + "learning_rate": 1.89136329878307e-05, + "loss": 0.6648968458175659, + "step": 446 + }, + { + "epoch": 0.5800486618004866, + "grad_norm": 0.5225708484649658, + "learning_rate": 1.890714207768151e-05, + "loss": 0.6399903297424316, + "step": 447 + }, + { + "epoch": 0.5813463098134631, + "grad_norm": 2.8270366191864014, + "learning_rate": 1.8900632954681203e-05, + "loss": 0.7426702380180359, + "step": 448 + }, + { + "epoch": 0.5826439578264396, + "grad_norm": 0.5743777751922607, + "learning_rate": 1.8894105632139296e-05, + "loss": 0.7008408308029175, + "step": 449 + }, + { + "epoch": 0.583941605839416, + "grad_norm": 0.5848923325538635, + "learning_rate": 1.8887560123402505e-05, + "loss": 0.7745944261550903, + "step": 450 + }, + { + "epoch": 0.5852392538523925, + "grad_norm": 0.5533474087715149, + "learning_rate": 1.888099644185474e-05, + "loss": 0.7078051567077637, + "step": 451 + }, + { + "epoch": 0.586536901865369, + "grad_norm": 0.5359990000724792, + "learning_rate": 1.887441460091707e-05, + "loss": 0.7025009393692017, + "step": 452 + }, + { + "epoch": 0.5878345498783455, + "grad_norm": 0.5772839784622192, + "learning_rate": 1.886781461404769e-05, + "loss": 0.7109262347221375, + "step": 453 + }, + { + "epoch": 0.589132197891322, + "grad_norm": 0.5491592288017273, + "learning_rate": 1.886119649474191e-05, + "loss": 0.6828133463859558, + "step": 454 + }, + { + "epoch": 0.5904298459042985, + "grad_norm": 0.5495162606239319, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.6600109338760376, + "step": 455 + }, + { + "epoch": 0.591727493917275, + "grad_norm": 0.5773736238479614, + "learning_rate": 1.8847905912987693e-05, + "loss": 0.6746517419815063, + "step": 456 + }, + { + "epoch": 0.5930251419302515, + "grad_norm": 0.5658586621284485, + "learning_rate": 1.8841233477715136e-05, + "loss": 0.6905688047409058, + "step": 457 + }, + { + "epoch": 0.5943227899432278, + "grad_norm": 0.544463574886322, + "learning_rate": 1.8834542964357875e-05, + "loss": 0.7656948566436768, + "step": 458 + }, + { + "epoch": 0.5956204379562043, + "grad_norm": 0.5466704964637756, + "learning_rate": 1.8827834386596306e-05, + "loss": 0.7320756912231445, + "step": 459 + }, + { + "epoch": 0.5969180859691808, + "grad_norm": 0.534042477607727, + "learning_rate": 1.882110775814778e-05, + "loss": 0.6747853755950928, + "step": 460 + }, + { + "epoch": 0.5969180859691808, + "eval_loss": 0.7028419375419617, + "eval_runtime": 72.8032, + "eval_samples_per_second": 71.316, + "eval_steps_per_second": 8.914, + "step": 460 + }, + { + "epoch": 0.5982157339821573, + "grad_norm": 0.5617560148239136, + "learning_rate": 1.881436309276655e-05, + "loss": 0.7175489068031311, + "step": 461 + }, + { + "epoch": 0.5995133819951338, + "grad_norm": 0.538003146648407, + "learning_rate": 1.8807600404243746e-05, + "loss": 0.6772977709770203, + "step": 462 + }, + { + "epoch": 0.6008110300081103, + "grad_norm": 0.5164902210235596, + "learning_rate": 1.8800819706407355e-05, + "loss": 0.7026697397232056, + "step": 463 + }, + { + "epoch": 0.6021086780210868, + "grad_norm": 0.519985556602478, + "learning_rate": 1.879402101312219e-05, + "loss": 0.6459539532661438, + "step": 464 + }, + { + "epoch": 0.6034063260340633, + "grad_norm": 0.5643022060394287, + "learning_rate": 1.8787204338289858e-05, + "loss": 0.7304619550704956, + "step": 465 + }, + { + "epoch": 0.6047039740470398, + "grad_norm": 0.5315333604812622, + "learning_rate": 1.8780369695848733e-05, + "loss": 0.7055330872535706, + "step": 466 + }, + { + "epoch": 0.6060016220600162, + "grad_norm": 0.5695874691009521, + "learning_rate": 1.8773517099773927e-05, + "loss": 0.7567015290260315, + "step": 467 + }, + { + "epoch": 0.6072992700729927, + "grad_norm": 0.5361006259918213, + "learning_rate": 1.8766646564077265e-05, + "loss": 0.7254809141159058, + "step": 468 + }, + { + "epoch": 0.6085969180859692, + "grad_norm": 0.5438353419303894, + "learning_rate": 1.8759758102807253e-05, + "loss": 0.6743266582489014, + "step": 469 + }, + { + "epoch": 0.6098945660989457, + "grad_norm": 0.5824978351593018, + "learning_rate": 1.8752851730049055e-05, + "loss": 0.7623616456985474, + "step": 470 + }, + { + "epoch": 0.6111922141119221, + "grad_norm": 0.546610951423645, + "learning_rate": 1.8745927459924454e-05, + "loss": 0.809882640838623, + "step": 471 + }, + { + "epoch": 0.6124898621248986, + "grad_norm": 0.5459777116775513, + "learning_rate": 1.8738985306591826e-05, + "loss": 0.6817529201507568, + "step": 472 + }, + { + "epoch": 0.6137875101378751, + "grad_norm": 0.5381180644035339, + "learning_rate": 1.8732025284246122e-05, + "loss": 0.7059892416000366, + "step": 473 + }, + { + "epoch": 0.6150851581508516, + "grad_norm": 0.5245769023895264, + "learning_rate": 1.8725047407118823e-05, + "loss": 0.7031271457672119, + "step": 474 + }, + { + "epoch": 0.616382806163828, + "grad_norm": 0.5284971594810486, + "learning_rate": 1.8718051689477923e-05, + "loss": 0.7379744052886963, + "step": 475 + }, + { + "epoch": 0.6176804541768045, + "grad_norm": 0.5659690499305725, + "learning_rate": 1.8711038145627893e-05, + "loss": 0.7798171639442444, + "step": 476 + }, + { + "epoch": 0.618978102189781, + "grad_norm": 0.5460679531097412, + "learning_rate": 1.8704006789909654e-05, + "loss": 0.7433549165725708, + "step": 477 + }, + { + "epoch": 0.6202757502027575, + "grad_norm": 0.5171265602111816, + "learning_rate": 1.8696957636700555e-05, + "loss": 0.7264508008956909, + "step": 478 + }, + { + "epoch": 0.621573398215734, + "grad_norm": 0.5979129672050476, + "learning_rate": 1.868989070041432e-05, + "loss": 0.7511105537414551, + "step": 479 + }, + { + "epoch": 0.6228710462287105, + "grad_norm": 0.5520970225334167, + "learning_rate": 1.8682805995501052e-05, + "loss": 0.6946426630020142, + "step": 480 + }, + { + "epoch": 0.624168694241687, + "grad_norm": 0.5510658025741577, + "learning_rate": 1.8675703536447178e-05, + "loss": 0.7265397310256958, + "step": 481 + }, + { + "epoch": 0.6254663422546635, + "grad_norm": 0.5842864513397217, + "learning_rate": 1.866858333777543e-05, + "loss": 0.7219571471214294, + "step": 482 + }, + { + "epoch": 0.6267639902676398, + "grad_norm": 0.5430331826210022, + "learning_rate": 1.8661445414044813e-05, + "loss": 0.7292179465293884, + "step": 483 + }, + { + "epoch": 0.6280616382806163, + "grad_norm": 0.5456423759460449, + "learning_rate": 1.865428977985057e-05, + "loss": 0.7341865301132202, + "step": 484 + }, + { + "epoch": 0.6293592862935928, + "grad_norm": 0.55687415599823, + "learning_rate": 1.8647116449824165e-05, + "loss": 0.7712036371231079, + "step": 485 + }, + { + "epoch": 0.6306569343065693, + "grad_norm": 0.574967622756958, + "learning_rate": 1.8639925438633243e-05, + "loss": 0.7341934442520142, + "step": 486 + }, + { + "epoch": 0.6319545823195458, + "grad_norm": 0.575878381729126, + "learning_rate": 1.86327167609816e-05, + "loss": 0.6782741546630859, + "step": 487 + }, + { + "epoch": 0.6332522303325223, + "grad_norm": 0.5638167858123779, + "learning_rate": 1.8625490431609154e-05, + "loss": 0.8088809251785278, + "step": 488 + }, + { + "epoch": 0.6345498783454988, + "grad_norm": 0.547574520111084, + "learning_rate": 1.8618246465291925e-05, + "loss": 0.7108902335166931, + "step": 489 + }, + { + "epoch": 0.6358475263584753, + "grad_norm": 0.5785483121871948, + "learning_rate": 1.861098487684199e-05, + "loss": 0.6963984370231628, + "step": 490 + }, + { + "epoch": 0.6371451743714518, + "grad_norm": 0.547226071357727, + "learning_rate": 1.8603705681107456e-05, + "loss": 0.6772190928459167, + "step": 491 + }, + { + "epoch": 0.6384428223844282, + "grad_norm": 0.5494422912597656, + "learning_rate": 1.8596408892972442e-05, + "loss": 0.7243861556053162, + "step": 492 + }, + { + "epoch": 0.6397404703974047, + "grad_norm": 0.5267540216445923, + "learning_rate": 1.858909452735703e-05, + "loss": 0.6649144887924194, + "step": 493 + }, + { + "epoch": 0.6410381184103812, + "grad_norm": 0.5952751636505127, + "learning_rate": 1.858176259921724e-05, + "loss": 0.7574429512023926, + "step": 494 + }, + { + "epoch": 0.6423357664233577, + "grad_norm": 0.5476658344268799, + "learning_rate": 1.857441312354502e-05, + "loss": 0.6968377828598022, + "step": 495 + }, + { + "epoch": 0.6436334144363342, + "grad_norm": 0.5507075786590576, + "learning_rate": 1.856704611536818e-05, + "loss": 0.7353919744491577, + "step": 496 + }, + { + "epoch": 0.6449310624493106, + "grad_norm": 0.5495625734329224, + "learning_rate": 1.8559661589750387e-05, + "loss": 0.7162117958068848, + "step": 497 + }, + { + "epoch": 0.6462287104622871, + "grad_norm": 0.5721608996391296, + "learning_rate": 1.8552259561791133e-05, + "loss": 0.6986855268478394, + "step": 498 + }, + { + "epoch": 0.6475263584752636, + "grad_norm": 0.5700922608375549, + "learning_rate": 1.8544840046625686e-05, + "loss": 0.8195285797119141, + "step": 499 + }, + { + "epoch": 0.64882400648824, + "grad_norm": 0.5746553540229797, + "learning_rate": 1.8537403059425082e-05, + "loss": 0.7492556571960449, + "step": 500 + }, + { + "epoch": 0.6501216545012165, + "grad_norm": 0.5598172545433044, + "learning_rate": 1.852994861539607e-05, + "loss": 0.6921173930168152, + "step": 501 + }, + { + "epoch": 0.651419302514193, + "grad_norm": 0.5589975714683533, + "learning_rate": 1.8522476729781106e-05, + "loss": 0.7157631516456604, + "step": 502 + }, + { + "epoch": 0.6527169505271695, + "grad_norm": 0.5745802521705627, + "learning_rate": 1.8514987417858306e-05, + "loss": 0.7679554224014282, + "step": 503 + }, + { + "epoch": 0.654014598540146, + "grad_norm": 0.581063449382782, + "learning_rate": 1.8507480694941416e-05, + "loss": 0.7761994004249573, + "step": 504 + }, + { + "epoch": 0.6553122465531225, + "grad_norm": 0.5932230353355408, + "learning_rate": 1.849995657637978e-05, + "loss": 0.748866081237793, + "step": 505 + }, + { + "epoch": 0.656609894566099, + "grad_norm": 0.5524072647094727, + "learning_rate": 1.8492415077558325e-05, + "loss": 0.7764031887054443, + "step": 506 + }, + { + "epoch": 0.6579075425790755, + "grad_norm": 0.5266931653022766, + "learning_rate": 1.8484856213897496e-05, + "loss": 0.7512728571891785, + "step": 507 + }, + { + "epoch": 0.659205190592052, + "grad_norm": 0.5363677740097046, + "learning_rate": 1.847728000085327e-05, + "loss": 0.7477032542228699, + "step": 508 + }, + { + "epoch": 0.6605028386050283, + "grad_norm": 0.5348376035690308, + "learning_rate": 1.8469686453917074e-05, + "loss": 0.6908712387084961, + "step": 509 + }, + { + "epoch": 0.6618004866180048, + "grad_norm": 0.5489766597747803, + "learning_rate": 1.846207558861579e-05, + "loss": 0.7576340436935425, + "step": 510 + }, + { + "epoch": 0.6630981346309813, + "grad_norm": 0.5426369309425354, + "learning_rate": 1.845444742051172e-05, + "loss": 0.7107582092285156, + "step": 511 + }, + { + "epoch": 0.6643957826439578, + "grad_norm": 0.5308833718299866, + "learning_rate": 1.8446801965202524e-05, + "loss": 0.6590298414230347, + "step": 512 + }, + { + "epoch": 0.6656934306569343, + "grad_norm": 0.5621533989906311, + "learning_rate": 1.8439139238321235e-05, + "loss": 0.7291080355644226, + "step": 513 + }, + { + "epoch": 0.6669910786699108, + "grad_norm": 0.5651385188102722, + "learning_rate": 1.8431459255536185e-05, + "loss": 0.7855580449104309, + "step": 514 + }, + { + "epoch": 0.6682887266828873, + "grad_norm": 0.5611156225204468, + "learning_rate": 1.8423762032551e-05, + "loss": 0.6918215751647949, + "step": 515 + }, + { + "epoch": 0.6695863746958638, + "grad_norm": 0.5477362275123596, + "learning_rate": 1.841604758510454e-05, + "loss": 0.7025431394577026, + "step": 516 + }, + { + "epoch": 0.6708840227088402, + "grad_norm": 0.5612704753875732, + "learning_rate": 1.840831592897091e-05, + "loss": 0.7540648579597473, + "step": 517 + }, + { + "epoch": 0.6721816707218167, + "grad_norm": 0.5650063753128052, + "learning_rate": 1.8400567079959383e-05, + "loss": 0.7409968376159668, + "step": 518 + }, + { + "epoch": 0.6734793187347932, + "grad_norm": 0.5648168921470642, + "learning_rate": 1.8392801053914396e-05, + "loss": 0.754462718963623, + "step": 519 + }, + { + "epoch": 0.6747769667477697, + "grad_norm": 0.5603179931640625, + "learning_rate": 1.8385017866715507e-05, + "loss": 0.7388665080070496, + "step": 520 + }, + { + "epoch": 0.6760746147607462, + "grad_norm": 0.5628640651702881, + "learning_rate": 1.8377217534277365e-05, + "loss": 0.7781612873077393, + "step": 521 + }, + { + "epoch": 0.6773722627737226, + "grad_norm": 0.593789279460907, + "learning_rate": 1.8369400072549674e-05, + "loss": 0.753161609172821, + "step": 522 + }, + { + "epoch": 0.6786699107866991, + "grad_norm": 0.5755636096000671, + "learning_rate": 1.8361565497517166e-05, + "loss": 0.7570379972457886, + "step": 523 + }, + { + "epoch": 0.6799675587996756, + "grad_norm": 0.5607541799545288, + "learning_rate": 1.835371382519956e-05, + "loss": 0.777469277381897, + "step": 524 + }, + { + "epoch": 0.681265206812652, + "grad_norm": 0.4994042217731476, + "learning_rate": 1.8345845071651543e-05, + "loss": 0.6544281840324402, + "step": 525 + }, + { + "epoch": 0.6825628548256285, + "grad_norm": 0.5685398578643799, + "learning_rate": 1.8337959252962728e-05, + "loss": 0.7024877071380615, + "step": 526 + }, + { + "epoch": 0.683860502838605, + "grad_norm": 0.5343568325042725, + "learning_rate": 1.8330056385257607e-05, + "loss": 0.7003896832466125, + "step": 527 + }, + { + "epoch": 0.6851581508515815, + "grad_norm": 0.5208355188369751, + "learning_rate": 1.8322136484695553e-05, + "loss": 0.6797738075256348, + "step": 528 + }, + { + "epoch": 0.686455798864558, + "grad_norm": 0.5621144771575928, + "learning_rate": 1.8314199567470755e-05, + "loss": 0.6609838008880615, + "step": 529 + }, + { + "epoch": 0.6877534468775345, + "grad_norm": 0.577298104763031, + "learning_rate": 1.83062456498122e-05, + "loss": 0.711292028427124, + "step": 530 + }, + { + "epoch": 0.689051094890511, + "grad_norm": 0.5840193629264832, + "learning_rate": 1.8298274747983638e-05, + "loss": 0.7950271368026733, + "step": 531 + }, + { + "epoch": 0.6903487429034875, + "grad_norm": 0.5348870158195496, + "learning_rate": 1.8290286878283542e-05, + "loss": 0.6982176303863525, + "step": 532 + }, + { + "epoch": 0.691646390916464, + "grad_norm": 0.5467864871025085, + "learning_rate": 1.8282282057045087e-05, + "loss": 0.7555949687957764, + "step": 533 + }, + { + "epoch": 0.6929440389294403, + "grad_norm": 0.5581674575805664, + "learning_rate": 1.827426030063611e-05, + "loss": 0.6723984479904175, + "step": 534 + }, + { + "epoch": 0.6942416869424168, + "grad_norm": 0.5615087151527405, + "learning_rate": 1.8266221625459064e-05, + "loss": 0.7201924324035645, + "step": 535 + }, + { + "epoch": 0.6955393349553933, + "grad_norm": 0.5710893273353577, + "learning_rate": 1.825816604795101e-05, + "loss": 0.7096928358078003, + "step": 536 + }, + { + "epoch": 0.6968369829683698, + "grad_norm": 0.5586241483688354, + "learning_rate": 1.8250093584583567e-05, + "loss": 0.7197962999343872, + "step": 537 + }, + { + "epoch": 0.6981346309813463, + "grad_norm": 0.5536755323410034, + "learning_rate": 1.8242004251862872e-05, + "loss": 0.678354799747467, + "step": 538 + }, + { + "epoch": 0.6994322789943228, + "grad_norm": 0.5744696855545044, + "learning_rate": 1.823389806632957e-05, + "loss": 0.7439010739326477, + "step": 539 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 0.5338960886001587, + "learning_rate": 1.8225775044558757e-05, + "loss": 0.731925904750824, + "step": 540 + }, + { + "epoch": 0.7020275750202758, + "grad_norm": 0.5696558356285095, + "learning_rate": 1.8217635203159957e-05, + "loss": 0.7480655312538147, + "step": 541 + }, + { + "epoch": 0.7033252230332522, + "grad_norm": 0.5994415283203125, + "learning_rate": 1.8209478558777084e-05, + "loss": 0.776438295841217, + "step": 542 + }, + { + "epoch": 0.7046228710462287, + "grad_norm": 0.578956127166748, + "learning_rate": 1.8201305128088412e-05, + "loss": 0.7190870046615601, + "step": 543 + }, + { + "epoch": 0.7059205190592052, + "grad_norm": 0.557142972946167, + "learning_rate": 1.819311492780654e-05, + "loss": 0.7524915933609009, + "step": 544 + }, + { + "epoch": 0.7072181670721817, + "grad_norm": 0.5244631171226501, + "learning_rate": 1.8184907974678348e-05, + "loss": 0.6941534876823425, + "step": 545 + }, + { + "epoch": 0.7085158150851582, + "grad_norm": 0.5301777720451355, + "learning_rate": 1.8176684285484985e-05, + "loss": 0.7010957598686218, + "step": 546 + }, + { + "epoch": 0.7098134630981346, + "grad_norm": 0.5309736728668213, + "learning_rate": 1.816844387704181e-05, + "loss": 0.6693360209465027, + "step": 547 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.5188398361206055, + "learning_rate": 1.8160186766198375e-05, + "loss": 0.7254098057746887, + "step": 548 + }, + { + "epoch": 0.7124087591240876, + "grad_norm": 0.5340986847877502, + "learning_rate": 1.815191296983838e-05, + "loss": 0.7227193713188171, + "step": 549 + }, + { + "epoch": 0.7137064071370641, + "grad_norm": 0.5604742765426636, + "learning_rate": 1.8143622504879647e-05, + "loss": 0.6893896460533142, + "step": 550 + }, + { + "epoch": 0.7150040551500405, + "grad_norm": 0.5265613794326782, + "learning_rate": 1.8135315388274075e-05, + "loss": 0.7178789377212524, + "step": 551 + }, + { + "epoch": 0.716301703163017, + "grad_norm": 0.5819421410560608, + "learning_rate": 1.8126991637007618e-05, + "loss": 0.7809138298034668, + "step": 552 + }, + { + "epoch": 0.7175993511759935, + "grad_norm": 0.5548515915870667, + "learning_rate": 1.8118651268100235e-05, + "loss": 0.7398655414581299, + "step": 553 + }, + { + "epoch": 0.71889699918897, + "grad_norm": 0.5281164050102234, + "learning_rate": 1.811029429860588e-05, + "loss": 0.7255332469940186, + "step": 554 + }, + { + "epoch": 0.7201946472019465, + "grad_norm": 0.51970374584198, + "learning_rate": 1.810192074561243e-05, + "loss": 0.6958039999008179, + "step": 555 + }, + { + "epoch": 0.721492295214923, + "grad_norm": 0.5574509501457214, + "learning_rate": 1.8093530626241684e-05, + "loss": 0.77367103099823, + "step": 556 + }, + { + "epoch": 0.7227899432278995, + "grad_norm": 0.5539534687995911, + "learning_rate": 1.8085123957649315e-05, + "loss": 0.7615116834640503, + "step": 557 + }, + { + "epoch": 0.724087591240876, + "grad_norm": 0.549517035484314, + "learning_rate": 1.8076700757024833e-05, + "loss": 0.777897834777832, + "step": 558 + }, + { + "epoch": 0.7253852392538523, + "grad_norm": 0.5480270981788635, + "learning_rate": 1.8068261041591548e-05, + "loss": 0.7139554619789124, + "step": 559 + }, + { + "epoch": 0.7266828872668288, + "grad_norm": 0.5337988138198853, + "learning_rate": 1.8059804828606545e-05, + "loss": 0.7470839023590088, + "step": 560 + }, + { + "epoch": 0.7279805352798053, + "grad_norm": 0.5055403709411621, + "learning_rate": 1.8051332135360637e-05, + "loss": 0.6575566530227661, + "step": 561 + }, + { + "epoch": 0.7292781832927818, + "grad_norm": 0.5452354550361633, + "learning_rate": 1.8042842979178338e-05, + "loss": 0.7080937623977661, + "step": 562 + }, + { + "epoch": 0.7305758313057583, + "grad_norm": 0.5276215672492981, + "learning_rate": 1.8034337377417826e-05, + "loss": 0.6609282493591309, + "step": 563 + }, + { + "epoch": 0.7318734793187348, + "grad_norm": 0.5823485851287842, + "learning_rate": 1.80258153474709e-05, + "loss": 0.7274823784828186, + "step": 564 + }, + { + "epoch": 0.7331711273317113, + "grad_norm": 0.5385794043540955, + "learning_rate": 1.8017276906762955e-05, + "loss": 0.6209210157394409, + "step": 565 + }, + { + "epoch": 0.7344687753446878, + "grad_norm": 0.6051076054573059, + "learning_rate": 1.8008722072752943e-05, + "loss": 0.7948423624038696, + "step": 566 + }, + { + "epoch": 0.7357664233576642, + "grad_norm": 0.8337801098823547, + "learning_rate": 1.8000150862933335e-05, + "loss": 0.7299556732177734, + "step": 567 + }, + { + "epoch": 0.7370640713706407, + "grad_norm": 0.5429887771606445, + "learning_rate": 1.7991563294830083e-05, + "loss": 0.686081051826477, + "step": 568 + }, + { + "epoch": 0.7383617193836172, + "grad_norm": 0.5419583916664124, + "learning_rate": 1.7982959386002592e-05, + "loss": 0.7415616512298584, + "step": 569 + }, + { + "epoch": 0.7396593673965937, + "grad_norm": 0.5454174280166626, + "learning_rate": 1.7974339154043677e-05, + "loss": 0.7275187969207764, + "step": 570 + }, + { + "epoch": 0.7409570154095702, + "grad_norm": 0.5611673593521118, + "learning_rate": 1.796570261657953e-05, + "loss": 0.7872575521469116, + "step": 571 + }, + { + "epoch": 0.7422546634225466, + "grad_norm": 0.5598644018173218, + "learning_rate": 1.7957049791269684e-05, + "loss": 0.7327409982681274, + "step": 572 + }, + { + "epoch": 0.7435523114355231, + "grad_norm": 0.558341920375824, + "learning_rate": 1.7948380695806983e-05, + "loss": 0.711640477180481, + "step": 573 + }, + { + "epoch": 0.7448499594484996, + "grad_norm": 0.5189648270606995, + "learning_rate": 1.793969534791752e-05, + "loss": 0.6593164801597595, + "step": 574 + }, + { + "epoch": 0.7461476074614761, + "grad_norm": 0.5739206671714783, + "learning_rate": 1.7930993765360644e-05, + "loss": 0.775146484375, + "step": 575 + }, + { + "epoch": 0.7474452554744525, + "grad_norm": 0.5306016802787781, + "learning_rate": 1.792227596592889e-05, + "loss": 0.6946839094161987, + "step": 576 + }, + { + "epoch": 0.748742903487429, + "grad_norm": 0.5487167835235596, + "learning_rate": 1.791354196744794e-05, + "loss": 0.7318082451820374, + "step": 577 + }, + { + "epoch": 0.7500405515004055, + "grad_norm": 0.5554513931274414, + "learning_rate": 1.790479178777662e-05, + "loss": 0.727341890335083, + "step": 578 + }, + { + "epoch": 0.751338199513382, + "grad_norm": 0.5512000918388367, + "learning_rate": 1.7896025444806834e-05, + "loss": 0.7673891186714172, + "step": 579 + }, + { + "epoch": 0.7526358475263585, + "grad_norm": 0.5614628195762634, + "learning_rate": 1.7887242956463528e-05, + "loss": 0.7410103678703308, + "step": 580 + }, + { + "epoch": 0.753933495539335, + "grad_norm": 0.5414284467697144, + "learning_rate": 1.7878444340704666e-05, + "loss": 0.7189674377441406, + "step": 581 + }, + { + "epoch": 0.7552311435523115, + "grad_norm": 0.5145770311355591, + "learning_rate": 1.78696296155212e-05, + "loss": 0.6776304244995117, + "step": 582 + }, + { + "epoch": 0.756528791565288, + "grad_norm": 0.5401176810264587, + "learning_rate": 1.7860798798937e-05, + "loss": 0.6960833072662354, + "step": 583 + }, + { + "epoch": 0.7578264395782643, + "grad_norm": 0.5560998916625977, + "learning_rate": 1.7851951909008864e-05, + "loss": 0.6736742258071899, + "step": 584 + }, + { + "epoch": 0.7591240875912408, + "grad_norm": 0.5505719780921936, + "learning_rate": 1.7843088963826437e-05, + "loss": 0.6757134795188904, + "step": 585 + }, + { + "epoch": 0.7604217356042173, + "grad_norm": 0.5717475414276123, + "learning_rate": 1.783420998151219e-05, + "loss": 0.7612842321395874, + "step": 586 + }, + { + "epoch": 0.7617193836171938, + "grad_norm": 0.5554843544960022, + "learning_rate": 1.782531498022141e-05, + "loss": 0.705300509929657, + "step": 587 + }, + { + "epoch": 0.7630170316301703, + "grad_norm": 0.5320503115653992, + "learning_rate": 1.781640397814211e-05, + "loss": 0.7508092522621155, + "step": 588 + }, + { + "epoch": 0.7643146796431468, + "grad_norm": 0.5554909706115723, + "learning_rate": 1.7807476993495047e-05, + "loss": 0.7732164859771729, + "step": 589 + }, + { + "epoch": 0.7656123276561233, + "grad_norm": 0.5467298030853271, + "learning_rate": 1.779853404453363e-05, + "loss": 0.7246618270874023, + "step": 590 + }, + { + "epoch": 0.7669099756690998, + "grad_norm": 0.5365788340568542, + "learning_rate": 1.7789575149543936e-05, + "loss": 0.6982936263084412, + "step": 591 + }, + { + "epoch": 0.7682076236820763, + "grad_norm": 0.5504671931266785, + "learning_rate": 1.7780600326844638e-05, + "loss": 0.7263147830963135, + "step": 592 + }, + { + "epoch": 0.7695052716950527, + "grad_norm": 0.549707293510437, + "learning_rate": 1.7771609594786968e-05, + "loss": 0.7235106229782104, + "step": 593 + }, + { + "epoch": 0.7708029197080292, + "grad_norm": 0.5401800274848938, + "learning_rate": 1.776260297175471e-05, + "loss": 0.7632750272750854, + "step": 594 + }, + { + "epoch": 0.7721005677210057, + "grad_norm": 0.5245280265808105, + "learning_rate": 1.775358047616412e-05, + "loss": 0.6609013080596924, + "step": 595 + }, + { + "epoch": 0.7733982157339822, + "grad_norm": 0.5566380023956299, + "learning_rate": 1.774454212646392e-05, + "loss": 0.7397713661193848, + "step": 596 + }, + { + "epoch": 0.7746958637469586, + "grad_norm": 0.5788303017616272, + "learning_rate": 1.773548794113525e-05, + "loss": 0.6708486676216125, + "step": 597 + }, + { + "epoch": 0.7759935117599351, + "grad_norm": 0.5494595170021057, + "learning_rate": 1.772641793869162e-05, + "loss": 0.7761523723602295, + "step": 598 + }, + { + "epoch": 0.7772911597729116, + "grad_norm": 0.5339208245277405, + "learning_rate": 1.7717332137678895e-05, + "loss": 0.6619516611099243, + "step": 599 + }, + { + "epoch": 0.7785888077858881, + "grad_norm": 0.5362167358398438, + "learning_rate": 1.770823055667524e-05, + "loss": 0.7144718170166016, + "step": 600 + }, + { + "epoch": 0.7798864557988645, + "grad_norm": 0.5141735076904297, + "learning_rate": 1.7699113214291082e-05, + "loss": 0.6293293237686157, + "step": 601 + }, + { + "epoch": 0.781184103811841, + "grad_norm": 0.5582875609397888, + "learning_rate": 1.768998012916908e-05, + "loss": 0.7720483541488647, + "step": 602 + }, + { + "epoch": 0.7824817518248175, + "grad_norm": 0.5367119312286377, + "learning_rate": 1.7680831319984077e-05, + "loss": 0.705078661441803, + "step": 603 + }, + { + "epoch": 0.783779399837794, + "grad_norm": 0.5382807850837708, + "learning_rate": 1.7671666805443076e-05, + "loss": 0.7088773846626282, + "step": 604 + }, + { + "epoch": 0.7850770478507705, + "grad_norm": 0.5625648498535156, + "learning_rate": 1.766248660428519e-05, + "loss": 0.7392460703849792, + "step": 605 + }, + { + "epoch": 0.786374695863747, + "grad_norm": 0.5586503744125366, + "learning_rate": 1.7653290735281605e-05, + "loss": 0.7484114170074463, + "step": 606 + }, + { + "epoch": 0.7876723438767235, + "grad_norm": 0.5572494864463806, + "learning_rate": 1.7644079217235547e-05, + "loss": 0.7409180402755737, + "step": 607 + }, + { + "epoch": 0.7889699918897, + "grad_norm": 0.5369569659233093, + "learning_rate": 1.763485206898224e-05, + "loss": 0.6471737027168274, + "step": 608 + }, + { + "epoch": 0.7902676399026763, + "grad_norm": 0.5504409074783325, + "learning_rate": 1.762560930938886e-05, + "loss": 0.7778940200805664, + "step": 609 + }, + { + "epoch": 0.7915652879156528, + "grad_norm": 0.5358904600143433, + "learning_rate": 1.7616350957354523e-05, + "loss": 0.694309413433075, + "step": 610 + }, + { + "epoch": 0.7928629359286293, + "grad_norm": 0.5360654592514038, + "learning_rate": 1.7607077031810204e-05, + "loss": 0.6945086717605591, + "step": 611 + }, + { + "epoch": 0.7941605839416058, + "grad_norm": 0.535325825214386, + "learning_rate": 1.759778755171874e-05, + "loss": 0.7578423619270325, + "step": 612 + }, + { + "epoch": 0.7954582319545823, + "grad_norm": 0.5466883182525635, + "learning_rate": 1.758848253607476e-05, + "loss": 0.7157893180847168, + "step": 613 + }, + { + "epoch": 0.7967558799675588, + "grad_norm": 0.5534203052520752, + "learning_rate": 1.7579162003904678e-05, + "loss": 0.7312074303627014, + "step": 614 + }, + { + "epoch": 0.7980535279805353, + "grad_norm": 0.5488491654396057, + "learning_rate": 1.756982597426661e-05, + "loss": 0.7318480014801025, + "step": 615 + }, + { + "epoch": 0.7993511759935118, + "grad_norm": 0.5375532507896423, + "learning_rate": 1.756047446625038e-05, + "loss": 0.7143536806106567, + "step": 616 + }, + { + "epoch": 0.8006488240064883, + "grad_norm": 0.5791228413581848, + "learning_rate": 1.7551107498977458e-05, + "loss": 0.642976701259613, + "step": 617 + }, + { + "epoch": 0.8019464720194647, + "grad_norm": 0.5346726179122925, + "learning_rate": 1.7541725091600918e-05, + "loss": 0.687232255935669, + "step": 618 + }, + { + "epoch": 0.8032441200324412, + "grad_norm": 0.5417895913124084, + "learning_rate": 1.7532327263305405e-05, + "loss": 0.7081488370895386, + "step": 619 + }, + { + "epoch": 0.8045417680454177, + "grad_norm": 0.5509006381034851, + "learning_rate": 1.75229140333071e-05, + "loss": 0.7728561162948608, + "step": 620 + }, + { + "epoch": 0.8058394160583942, + "grad_norm": 0.5634705424308777, + "learning_rate": 1.7513485420853683e-05, + "loss": 0.6951034069061279, + "step": 621 + }, + { + "epoch": 0.8071370640713706, + "grad_norm": 0.5197573900222778, + "learning_rate": 1.750404144522427e-05, + "loss": 0.7106211185455322, + "step": 622 + }, + { + "epoch": 0.8084347120843471, + "grad_norm": 0.5803437232971191, + "learning_rate": 1.7494582125729408e-05, + "loss": 0.7436937689781189, + "step": 623 + }, + { + "epoch": 0.8097323600973236, + "grad_norm": 0.541920006275177, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.6682834029197693, + "step": 624 + }, + { + "epoch": 0.8110300081103001, + "grad_norm": 0.561758279800415, + "learning_rate": 1.7475617532542325e-05, + "loss": 0.6873137950897217, + "step": 625 + }, + { + "epoch": 0.8123276561232765, + "grad_norm": 0.5416638255119324, + "learning_rate": 1.7466112297627894e-05, + "loss": 0.7167541980743408, + "step": 626 + }, + { + "epoch": 0.813625304136253, + "grad_norm": 0.5338025093078613, + "learning_rate": 1.7456591796403525e-05, + "loss": 0.7321476340293884, + "step": 627 + }, + { + "epoch": 0.8149229521492295, + "grad_norm": 0.5378256440162659, + "learning_rate": 1.744705604833622e-05, + "loss": 0.6663627624511719, + "step": 628 + }, + { + "epoch": 0.816220600162206, + "grad_norm": 0.581386387348175, + "learning_rate": 1.7437505072924177e-05, + "loss": 0.755516767501831, + "step": 629 + }, + { + "epoch": 0.8175182481751825, + "grad_norm": 0.581896185874939, + "learning_rate": 1.742793888969673e-05, + "loss": 0.7974879145622253, + "step": 630 + }, + { + "epoch": 0.818815896188159, + "grad_norm": 0.521468460559845, + "learning_rate": 1.741835751821429e-05, + "loss": 0.7400495409965515, + "step": 631 + }, + { + "epoch": 0.8201135442011355, + "grad_norm": 0.5232843160629272, + "learning_rate": 1.7408760978068343e-05, + "loss": 0.6786386966705322, + "step": 632 + }, + { + "epoch": 0.821411192214112, + "grad_norm": 0.5813708901405334, + "learning_rate": 1.739914928888139e-05, + "loss": 0.7453535199165344, + "step": 633 + }, + { + "epoch": 0.8227088402270885, + "grad_norm": 0.5424124002456665, + "learning_rate": 1.7389522470306892e-05, + "loss": 0.7520110607147217, + "step": 634 + }, + { + "epoch": 0.8240064882400648, + "grad_norm": 0.5089052319526672, + "learning_rate": 1.7379880542029263e-05, + "loss": 0.7197295427322388, + "step": 635 + }, + { + "epoch": 0.8253041362530413, + "grad_norm": 0.5367469191551208, + "learning_rate": 1.7370223523763804e-05, + "loss": 0.7498934864997864, + "step": 636 + }, + { + "epoch": 0.8266017842660178, + "grad_norm": 0.5291455388069153, + "learning_rate": 1.7360551435256673e-05, + "loss": 0.7376183867454529, + "step": 637 + }, + { + "epoch": 0.8278994322789943, + "grad_norm": 0.5446896553039551, + "learning_rate": 1.7350864296284846e-05, + "loss": 0.735445499420166, + "step": 638 + }, + { + "epoch": 0.8291970802919708, + "grad_norm": 0.5124339461326599, + "learning_rate": 1.7341162126656063e-05, + "loss": 0.6861530542373657, + "step": 639 + }, + { + "epoch": 0.8304947283049473, + "grad_norm": 0.5077775120735168, + "learning_rate": 1.7331444946208815e-05, + "loss": 0.688785195350647, + "step": 640 + }, + { + "epoch": 0.8317923763179238, + "grad_norm": 0.5058798789978027, + "learning_rate": 1.732171277481227e-05, + "loss": 0.7133075594902039, + "step": 641 + }, + { + "epoch": 0.8330900243309003, + "grad_norm": 0.5404756665229797, + "learning_rate": 1.7311965632366254e-05, + "loss": 0.7240495681762695, + "step": 642 + }, + { + "epoch": 0.8343876723438767, + "grad_norm": 0.5313534736633301, + "learning_rate": 1.7302203538801212e-05, + "loss": 0.71756911277771, + "step": 643 + }, + { + "epoch": 0.8356853203568532, + "grad_norm": 0.5360015630722046, + "learning_rate": 1.729242651407815e-05, + "loss": 0.7652734518051147, + "step": 644 + }, + { + "epoch": 0.8369829683698297, + "grad_norm": 0.540046751499176, + "learning_rate": 1.7282634578188612e-05, + "loss": 0.7294871807098389, + "step": 645 + }, + { + "epoch": 0.8382806163828062, + "grad_norm": 0.5653432607650757, + "learning_rate": 1.7272827751154627e-05, + "loss": 0.7391757965087891, + "step": 646 + }, + { + "epoch": 0.8395782643957826, + "grad_norm": 0.5427312850952148, + "learning_rate": 1.7263006053028674e-05, + "loss": 0.6798534393310547, + "step": 647 + }, + { + "epoch": 0.8408759124087591, + "grad_norm": 0.539861261844635, + "learning_rate": 1.7253169503893637e-05, + "loss": 0.7292792201042175, + "step": 648 + }, + { + "epoch": 0.8421735604217356, + "grad_norm": 0.5300166010856628, + "learning_rate": 1.7243318123862777e-05, + "loss": 0.7026904821395874, + "step": 649 + }, + { + "epoch": 0.8434712084347121, + "grad_norm": 0.5242528319358826, + "learning_rate": 1.7233451933079663e-05, + "loss": 0.6926451921463013, + "step": 650 + }, + { + "epoch": 0.8447688564476885, + "grad_norm": 0.5352111458778381, + "learning_rate": 1.7223570951718166e-05, + "loss": 0.7006164789199829, + "step": 651 + }, + { + "epoch": 0.846066504460665, + "grad_norm": 0.5747525095939636, + "learning_rate": 1.7213675199982388e-05, + "loss": 0.7685414552688599, + "step": 652 + }, + { + "epoch": 0.8473641524736415, + "grad_norm": 0.5309545397758484, + "learning_rate": 1.7203764698106636e-05, + "loss": 0.7312856912612915, + "step": 653 + }, + { + "epoch": 0.848661800486618, + "grad_norm": 0.5124905705451965, + "learning_rate": 1.7193839466355383e-05, + "loss": 0.6484863758087158, + "step": 654 + }, + { + "epoch": 0.8499594484995945, + "grad_norm": 0.5323530435562134, + "learning_rate": 1.7183899525023212e-05, + "loss": 0.694681704044342, + "step": 655 + }, + { + "epoch": 0.851257096512571, + "grad_norm": 0.5242999792098999, + "learning_rate": 1.7173944894434783e-05, + "loss": 0.6672481298446655, + "step": 656 + }, + { + "epoch": 0.8525547445255475, + "grad_norm": 0.5519501566886902, + "learning_rate": 1.7163975594944807e-05, + "loss": 0.7557801604270935, + "step": 657 + }, + { + "epoch": 0.853852392538524, + "grad_norm": 0.5345069169998169, + "learning_rate": 1.715399164693797e-05, + "loss": 0.7127410173416138, + "step": 658 + }, + { + "epoch": 0.8551500405515005, + "grad_norm": 0.5087319016456604, + "learning_rate": 1.7143993070828913e-05, + "loss": 0.6801098585128784, + "step": 659 + }, + { + "epoch": 0.8564476885644768, + "grad_norm": 0.546444833278656, + "learning_rate": 1.713397988706221e-05, + "loss": 0.7135753631591797, + "step": 660 + }, + { + "epoch": 0.8577453365774533, + "grad_norm": 0.5438613891601562, + "learning_rate": 1.7123952116112275e-05, + "loss": 0.7199326753616333, + "step": 661 + }, + { + "epoch": 0.8590429845904298, + "grad_norm": 0.5320620536804199, + "learning_rate": 1.7113909778483364e-05, + "loss": 0.7263282537460327, + "step": 662 + }, + { + "epoch": 0.8603406326034063, + "grad_norm": 0.5496207475662231, + "learning_rate": 1.7103852894709517e-05, + "loss": 0.6767710447311401, + "step": 663 + }, + { + "epoch": 0.8616382806163828, + "grad_norm": 0.5515886545181274, + "learning_rate": 1.7093781485354517e-05, + "loss": 0.666580319404602, + "step": 664 + }, + { + "epoch": 0.8629359286293593, + "grad_norm": 0.5425974130630493, + "learning_rate": 1.7083695571011842e-05, + "loss": 0.7289122343063354, + "step": 665 + }, + { + "epoch": 0.8642335766423358, + "grad_norm": 0.5263716578483582, + "learning_rate": 1.707359517230464e-05, + "loss": 0.6910987496376038, + "step": 666 + }, + { + "epoch": 0.8655312246553123, + "grad_norm": 0.525571346282959, + "learning_rate": 1.7063480309885668e-05, + "loss": 0.6733009815216064, + "step": 667 + }, + { + "epoch": 0.8668288726682887, + "grad_norm": 0.5529440641403198, + "learning_rate": 1.7053351004437258e-05, + "loss": 0.6993213295936584, + "step": 668 + }, + { + "epoch": 0.8681265206812652, + "grad_norm": 0.5263779163360596, + "learning_rate": 1.7043207276671276e-05, + "loss": 0.7125247120857239, + "step": 669 + }, + { + "epoch": 0.8694241686942417, + "grad_norm": 0.5178059935569763, + "learning_rate": 1.7033049147329077e-05, + "loss": 0.7389542460441589, + "step": 670 + }, + { + "epoch": 0.8707218167072182, + "grad_norm": 0.5027527809143066, + "learning_rate": 1.702287663718147e-05, + "loss": 0.6378510594367981, + "step": 671 + }, + { + "epoch": 0.8720194647201946, + "grad_norm": 0.5320873260498047, + "learning_rate": 1.7012689767028656e-05, + "loss": 0.6820501089096069, + "step": 672 + }, + { + "epoch": 0.8733171127331711, + "grad_norm": 0.5544079542160034, + "learning_rate": 1.700248855770021e-05, + "loss": 0.7887839078903198, + "step": 673 + }, + { + "epoch": 0.8746147607461476, + "grad_norm": 0.5328344702720642, + "learning_rate": 1.6992273030055022e-05, + "loss": 0.7038314938545227, + "step": 674 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 0.5509505867958069, + "learning_rate": 1.6982043204981264e-05, + "loss": 0.7049298286437988, + "step": 675 + }, + { + "epoch": 0.8772100567721006, + "grad_norm": 0.5168129205703735, + "learning_rate": 1.6971799103396332e-05, + "loss": 0.6959193348884583, + "step": 676 + }, + { + "epoch": 0.878507704785077, + "grad_norm": 0.5376099944114685, + "learning_rate": 1.696154074624683e-05, + "loss": 0.7292076349258423, + "step": 677 + }, + { + "epoch": 0.8798053527980535, + "grad_norm": 0.5142057538032532, + "learning_rate": 1.6951268154508497e-05, + "loss": 0.7193281650543213, + "step": 678 + }, + { + "epoch": 0.88110300081103, + "grad_norm": 0.5402371287345886, + "learning_rate": 1.6940981349186182e-05, + "loss": 0.748397946357727, + "step": 679 + }, + { + "epoch": 0.8824006488240065, + "grad_norm": 0.5436865091323853, + "learning_rate": 1.69306803513138e-05, + "loss": 0.7238379716873169, + "step": 680 + }, + { + "epoch": 0.883698296836983, + "grad_norm": 0.5323321223258972, + "learning_rate": 1.6920365181954284e-05, + "loss": 0.7368711829185486, + "step": 681 + }, + { + "epoch": 0.8849959448499595, + "grad_norm": 0.5474384427070618, + "learning_rate": 1.6910035862199545e-05, + "loss": 0.7030202746391296, + "step": 682 + }, + { + "epoch": 0.886293592862936, + "grad_norm": 0.5428197979927063, + "learning_rate": 1.6899692413170422e-05, + "loss": 0.713437557220459, + "step": 683 + }, + { + "epoch": 0.8875912408759125, + "grad_norm": 0.5502634048461914, + "learning_rate": 1.688933485601666e-05, + "loss": 0.7090182304382324, + "step": 684 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.5356465578079224, + "learning_rate": 1.6878963211916833e-05, + "loss": 0.7201128005981445, + "step": 685 + }, + { + "epoch": 0.8901865369018653, + "grad_norm": 0.5563944578170776, + "learning_rate": 1.6868577502078336e-05, + "loss": 0.7264722585678101, + "step": 686 + }, + { + "epoch": 0.8914841849148418, + "grad_norm": 0.5522723197937012, + "learning_rate": 1.6858177747737312e-05, + "loss": 0.7600725889205933, + "step": 687 + }, + { + "epoch": 0.8927818329278183, + "grad_norm": 0.49715539813041687, + "learning_rate": 1.684776397015863e-05, + "loss": 0.6456987857818604, + "step": 688 + }, + { + "epoch": 0.8940794809407948, + "grad_norm": 0.5162433981895447, + "learning_rate": 1.6837336190635824e-05, + "loss": 0.6648015379905701, + "step": 689 + }, + { + "epoch": 0.8953771289537713, + "grad_norm": 0.5113485455513, + "learning_rate": 1.682689443049107e-05, + "loss": 0.7002501487731934, + "step": 690 + }, + { + "epoch": 0.8953771289537713, + "eval_loss": 0.6879991292953491, + "eval_runtime": 72.6036, + "eval_samples_per_second": 71.512, + "eval_steps_per_second": 8.939, + "step": 690 + }, + { + "epoch": 0.8966747769667478, + "grad_norm": 0.5129652619361877, + "learning_rate": 1.6816438711075114e-05, + "loss": 0.7118932008743286, + "step": 691 + }, + { + "epoch": 0.8979724249797243, + "grad_norm": 0.5204065442085266, + "learning_rate": 1.680596905376727e-05, + "loss": 0.7194908857345581, + "step": 692 + }, + { + "epoch": 0.8992700729927007, + "grad_norm": 0.5264798402786255, + "learning_rate": 1.6795485479975327e-05, + "loss": 0.6868776082992554, + "step": 693 + }, + { + "epoch": 0.9005677210056772, + "grad_norm": 0.5244487524032593, + "learning_rate": 1.6784988011135546e-05, + "loss": 0.7106890678405762, + "step": 694 + }, + { + "epoch": 0.9018653690186537, + "grad_norm": 0.5397396683692932, + "learning_rate": 1.6774476668712587e-05, + "loss": 0.695647656917572, + "step": 695 + }, + { + "epoch": 0.9031630170316302, + "grad_norm": 0.5147722959518433, + "learning_rate": 1.676395147419949e-05, + "loss": 0.7283300161361694, + "step": 696 + }, + { + "epoch": 0.9044606650446066, + "grad_norm": 0.5326966047286987, + "learning_rate": 1.6753412449117615e-05, + "loss": 0.7349389791488647, + "step": 697 + }, + { + "epoch": 0.9057583130575831, + "grad_norm": 0.522964596748352, + "learning_rate": 1.67428596150166e-05, + "loss": 0.7657152414321899, + "step": 698 + }, + { + "epoch": 0.9070559610705596, + "grad_norm": 0.5306779742240906, + "learning_rate": 1.6732292993474316e-05, + "loss": 0.6991469264030457, + "step": 699 + }, + { + "epoch": 0.9083536090835361, + "grad_norm": 0.517011284828186, + "learning_rate": 1.6721712606096833e-05, + "loss": 0.6861897706985474, + "step": 700 + }, + { + "epoch": 0.9096512570965126, + "grad_norm": 0.5209232568740845, + "learning_rate": 1.6711118474518363e-05, + "loss": 0.6535213589668274, + "step": 701 + }, + { + "epoch": 0.910948905109489, + "grad_norm": 0.538005530834198, + "learning_rate": 1.6700510620401223e-05, + "loss": 0.6827917695045471, + "step": 702 + }, + { + "epoch": 0.9122465531224655, + "grad_norm": 0.5532050132751465, + "learning_rate": 1.6689889065435796e-05, + "loss": 0.7328672409057617, + "step": 703 + }, + { + "epoch": 0.913544201135442, + "grad_norm": 0.5541777014732361, + "learning_rate": 1.667925383134047e-05, + "loss": 0.639081597328186, + "step": 704 + }, + { + "epoch": 0.9148418491484185, + "grad_norm": 0.5441383719444275, + "learning_rate": 1.66686049398616e-05, + "loss": 0.7073994874954224, + "step": 705 + }, + { + "epoch": 0.916139497161395, + "grad_norm": 0.5432547330856323, + "learning_rate": 1.6657942412773484e-05, + "loss": 0.7249147295951843, + "step": 706 + }, + { + "epoch": 0.9174371451743715, + "grad_norm": 0.5718936324119568, + "learning_rate": 1.664726627187829e-05, + "loss": 0.7475080490112305, + "step": 707 + }, + { + "epoch": 0.918734793187348, + "grad_norm": 0.5303789377212524, + "learning_rate": 1.6636576539006015e-05, + "loss": 0.7102556228637695, + "step": 708 + }, + { + "epoch": 0.9200324412003245, + "grad_norm": 0.5120844841003418, + "learning_rate": 1.6625873236014464e-05, + "loss": 0.7160992622375488, + "step": 709 + }, + { + "epoch": 0.9213300892133008, + "grad_norm": 0.5382957458496094, + "learning_rate": 1.6615156384789185e-05, + "loss": 0.6958597898483276, + "step": 710 + }, + { + "epoch": 0.9226277372262773, + "grad_norm": 0.5195145606994629, + "learning_rate": 1.660442600724342e-05, + "loss": 0.6958160400390625, + "step": 711 + }, + { + "epoch": 0.9239253852392538, + "grad_norm": 0.5473058223724365, + "learning_rate": 1.659368212531808e-05, + "loss": 0.7220757007598877, + "step": 712 + }, + { + "epoch": 0.9252230332522303, + "grad_norm": 0.5131781697273254, + "learning_rate": 1.6582924760981683e-05, + "loss": 0.7035195827484131, + "step": 713 + }, + { + "epoch": 0.9265206812652068, + "grad_norm": 0.5314381122589111, + "learning_rate": 1.6572153936230316e-05, + "loss": 0.6506175994873047, + "step": 714 + }, + { + "epoch": 0.9278183292781833, + "grad_norm": 0.565310001373291, + "learning_rate": 1.6561369673087588e-05, + "loss": 0.7714331746101379, + "step": 715 + }, + { + "epoch": 0.9291159772911598, + "grad_norm": 0.530504584312439, + "learning_rate": 1.6550571993604587e-05, + "loss": 0.7331136465072632, + "step": 716 + }, + { + "epoch": 0.9304136253041363, + "grad_norm": 0.5755041837692261, + "learning_rate": 1.6539760919859838e-05, + "loss": 0.7090123891830444, + "step": 717 + }, + { + "epoch": 0.9317112733171128, + "grad_norm": 0.5264776349067688, + "learning_rate": 1.6528936473959253e-05, + "loss": 0.7207454442977905, + "step": 718 + }, + { + "epoch": 0.9330089213300892, + "grad_norm": 0.5459887981414795, + "learning_rate": 1.6518098678036073e-05, + "loss": 0.7477676272392273, + "step": 719 + }, + { + "epoch": 0.9343065693430657, + "grad_norm": 0.5480107069015503, + "learning_rate": 1.650724755425086e-05, + "loss": 0.7585529685020447, + "step": 720 + }, + { + "epoch": 0.9356042173560422, + "grad_norm": 0.5156884789466858, + "learning_rate": 1.6496383124791406e-05, + "loss": 0.684555172920227, + "step": 721 + }, + { + "epoch": 0.9369018653690186, + "grad_norm": 0.5162327289581299, + "learning_rate": 1.6485505411872725e-05, + "loss": 0.7163575887680054, + "step": 722 + }, + { + "epoch": 0.9381995133819951, + "grad_norm": 0.5424114465713501, + "learning_rate": 1.6474614437736986e-05, + "loss": 0.722049355506897, + "step": 723 + }, + { + "epoch": 0.9394971613949716, + "grad_norm": 0.546845555305481, + "learning_rate": 1.6463710224653477e-05, + "loss": 0.7012547850608826, + "step": 724 + }, + { + "epoch": 0.9407948094079481, + "grad_norm": 0.5183011889457703, + "learning_rate": 1.6452792794918545e-05, + "loss": 0.7152835130691528, + "step": 725 + }, + { + "epoch": 0.9420924574209246, + "grad_norm": 0.5085439682006836, + "learning_rate": 1.644186217085558e-05, + "loss": 0.7061685919761658, + "step": 726 + }, + { + "epoch": 0.943390105433901, + "grad_norm": 0.5237677097320557, + "learning_rate": 1.6430918374814937e-05, + "loss": 0.7506479024887085, + "step": 727 + }, + { + "epoch": 0.9446877534468775, + "grad_norm": 0.5498985052108765, + "learning_rate": 1.641996142917391e-05, + "loss": 0.7604420185089111, + "step": 728 + }, + { + "epoch": 0.945985401459854, + "grad_norm": 0.506365180015564, + "learning_rate": 1.640899135633668e-05, + "loss": 0.7282454967498779, + "step": 729 + }, + { + "epoch": 0.9472830494728305, + "grad_norm": 0.5272793769836426, + "learning_rate": 1.6398008178734272e-05, + "loss": 0.7712985277175903, + "step": 730 + }, + { + "epoch": 0.948580697485807, + "grad_norm": 0.49885818362236023, + "learning_rate": 1.6387011918824493e-05, + "loss": 0.6967482566833496, + "step": 731 + }, + { + "epoch": 0.9498783454987835, + "grad_norm": 0.5086526274681091, + "learning_rate": 1.6376002599091925e-05, + "loss": 0.7118892073631287, + "step": 732 + }, + { + "epoch": 0.95117599351176, + "grad_norm": 0.5380651354789734, + "learning_rate": 1.6364980242047835e-05, + "loss": 0.7118611335754395, + "step": 733 + }, + { + "epoch": 0.9524736415247365, + "grad_norm": 0.5358894467353821, + "learning_rate": 1.635394487023015e-05, + "loss": 0.73922199010849, + "step": 734 + }, + { + "epoch": 0.9537712895377128, + "grad_norm": 0.518375813961029, + "learning_rate": 1.634289650620342e-05, + "loss": 0.7491021156311035, + "step": 735 + }, + { + "epoch": 0.9550689375506893, + "grad_norm": 0.5029126405715942, + "learning_rate": 1.633183517255875e-05, + "loss": 0.6724518537521362, + "step": 736 + }, + { + "epoch": 0.9563665855636658, + "grad_norm": 0.5309873819351196, + "learning_rate": 1.632076089191376e-05, + "loss": 0.7152642011642456, + "step": 737 + }, + { + "epoch": 0.9576642335766423, + "grad_norm": 0.5265018343925476, + "learning_rate": 1.630967368691256e-05, + "loss": 0.7223344445228577, + "step": 738 + }, + { + "epoch": 0.9589618815896188, + "grad_norm": 0.5360968112945557, + "learning_rate": 1.6298573580225676e-05, + "loss": 0.6773437261581421, + "step": 739 + }, + { + "epoch": 0.9602595296025953, + "grad_norm": 0.532696545124054, + "learning_rate": 1.6287460594550017e-05, + "loss": 0.6913273930549622, + "step": 740 + }, + { + "epoch": 0.9615571776155718, + "grad_norm": 0.5159463286399841, + "learning_rate": 1.6276334752608823e-05, + "loss": 0.7023458480834961, + "step": 741 + }, + { + "epoch": 0.9628548256285483, + "grad_norm": 0.5166627764701843, + "learning_rate": 1.6265196077151627e-05, + "loss": 0.6580889821052551, + "step": 742 + }, + { + "epoch": 0.9641524736415248, + "grad_norm": 0.5432324409484863, + "learning_rate": 1.62540445909542e-05, + "loss": 0.7707301378250122, + "step": 743 + }, + { + "epoch": 0.9654501216545012, + "grad_norm": 0.5537624955177307, + "learning_rate": 1.624288031681851e-05, + "loss": 0.718231737613678, + "step": 744 + }, + { + "epoch": 0.9667477696674777, + "grad_norm": 0.5601441860198975, + "learning_rate": 1.623170327757267e-05, + "loss": 0.7587568759918213, + "step": 745 + }, + { + "epoch": 0.9680454176804542, + "grad_norm": 0.5228809118270874, + "learning_rate": 1.62205134960709e-05, + "loss": 0.7063294649124146, + "step": 746 + }, + { + "epoch": 0.9693430656934306, + "grad_norm": 0.5264230370521545, + "learning_rate": 1.620931099519347e-05, + "loss": 0.7381964921951294, + "step": 747 + }, + { + "epoch": 0.9706407137064071, + "grad_norm": 0.5306467413902283, + "learning_rate": 1.619809579784665e-05, + "loss": 0.6895403861999512, + "step": 748 + }, + { + "epoch": 0.9719383617193836, + "grad_norm": 0.5162505507469177, + "learning_rate": 1.6186867926962695e-05, + "loss": 0.7042033672332764, + "step": 749 + }, + { + "epoch": 0.9732360097323601, + "grad_norm": 0.51023268699646, + "learning_rate": 1.6175627405499746e-05, + "loss": 0.7028312683105469, + "step": 750 + }, + { + "epoch": 0.9745336577453366, + "grad_norm": 0.5226272344589233, + "learning_rate": 1.6164374256441837e-05, + "loss": 0.7110305428504944, + "step": 751 + }, + { + "epoch": 0.975831305758313, + "grad_norm": 0.5189753174781799, + "learning_rate": 1.6153108502798796e-05, + "loss": 0.7227635979652405, + "step": 752 + }, + { + "epoch": 0.9771289537712895, + "grad_norm": 0.5253064036369324, + "learning_rate": 1.614183016760625e-05, + "loss": 0.708706259727478, + "step": 753 + }, + { + "epoch": 0.978426601784266, + "grad_norm": 0.5069226622581482, + "learning_rate": 1.613053927392553e-05, + "loss": 0.7607108354568481, + "step": 754 + }, + { + "epoch": 0.9797242497972425, + "grad_norm": 0.5430122017860413, + "learning_rate": 1.6119235844843664e-05, + "loss": 0.6882092356681824, + "step": 755 + }, + { + "epoch": 0.981021897810219, + "grad_norm": 0.5484969615936279, + "learning_rate": 1.6107919903473294e-05, + "loss": 0.6984055638313293, + "step": 756 + }, + { + "epoch": 0.9823195458231955, + "grad_norm": 0.5450364351272583, + "learning_rate": 1.6096591472952664e-05, + "loss": 0.7414028644561768, + "step": 757 + }, + { + "epoch": 0.983617193836172, + "grad_norm": 0.5095598101615906, + "learning_rate": 1.6085250576445548e-05, + "loss": 0.6796683073043823, + "step": 758 + }, + { + "epoch": 0.9849148418491485, + "grad_norm": 0.5161803364753723, + "learning_rate": 1.6073897237141203e-05, + "loss": 0.6673390865325928, + "step": 759 + }, + { + "epoch": 0.986212489862125, + "grad_norm": 0.5004435777664185, + "learning_rate": 1.6062531478254333e-05, + "loss": 0.6315610408782959, + "step": 760 + }, + { + "epoch": 0.9875101378751013, + "grad_norm": 0.5166559219360352, + "learning_rate": 1.605115332302505e-05, + "loss": 0.6672409176826477, + "step": 761 + }, + { + "epoch": 0.9888077858880778, + "grad_norm": 0.5332128405570984, + "learning_rate": 1.603976279471879e-05, + "loss": 0.7169513702392578, + "step": 762 + }, + { + "epoch": 0.9901054339010543, + "grad_norm": 0.5556347370147705, + "learning_rate": 1.6028359916626308e-05, + "loss": 0.708602786064148, + "step": 763 + }, + { + "epoch": 0.9914030819140308, + "grad_norm": 0.5154053568840027, + "learning_rate": 1.601694471206359e-05, + "loss": 0.6270056366920471, + "step": 764 + }, + { + "epoch": 0.9927007299270073, + "grad_norm": 0.5185645222663879, + "learning_rate": 1.600551720437186e-05, + "loss": 0.6873992085456848, + "step": 765 + }, + { + "epoch": 0.9939983779399838, + "grad_norm": 0.546991229057312, + "learning_rate": 1.599407741691746e-05, + "loss": 0.7366882562637329, + "step": 766 + }, + { + "epoch": 0.9952960259529603, + "grad_norm": 0.5219473838806152, + "learning_rate": 1.5982625373091877e-05, + "loss": 0.6808854937553406, + "step": 767 + }, + { + "epoch": 0.9965936739659368, + "grad_norm": 0.5348212122917175, + "learning_rate": 1.5971161096311628e-05, + "loss": 0.7217116355895996, + "step": 768 + }, + { + "epoch": 0.9978913219789132, + "grad_norm": 0.5152093172073364, + "learning_rate": 1.5959684610018267e-05, + "loss": 0.6545735597610474, + "step": 769 + }, + { + "epoch": 0.9991889699918897, + "grad_norm": 0.5182209610939026, + "learning_rate": 1.5948195937678297e-05, + "loss": 0.6775786280632019, + "step": 770 + }, + { + "epoch": 1.0, + "grad_norm": 0.634954571723938, + "learning_rate": 1.5936695102783148e-05, + "loss": 0.6640980839729309, + "step": 771 + }, + { + "epoch": 1.0012976480129765, + "grad_norm": 0.7553068399429321, + "learning_rate": 1.5925182128849116e-05, + "loss": 0.6133830547332764, + "step": 772 + }, + { + "epoch": 1.002595296025953, + "grad_norm": 0.6613984704017639, + "learning_rate": 1.591365703941732e-05, + "loss": 0.5815013647079468, + "step": 773 + }, + { + "epoch": 1.0038929440389295, + "grad_norm": 0.592282235622406, + "learning_rate": 1.5902119858053652e-05, + "loss": 0.5898460149765015, + "step": 774 + }, + { + "epoch": 1.005190592051906, + "grad_norm": 0.5373958945274353, + "learning_rate": 1.589057060834872e-05, + "loss": 0.6019303798675537, + "step": 775 + }, + { + "epoch": 1.0064882400648825, + "grad_norm": 0.6260755062103271, + "learning_rate": 1.5879009313917826e-05, + "loss": 0.5970971584320068, + "step": 776 + }, + { + "epoch": 1.007785888077859, + "grad_norm": 0.7529841661453247, + "learning_rate": 1.5867435998400885e-05, + "loss": 0.6816403865814209, + "step": 777 + }, + { + "epoch": 1.0090835360908355, + "grad_norm": 0.7224608659744263, + "learning_rate": 1.5855850685462404e-05, + "loss": 0.6263958215713501, + "step": 778 + }, + { + "epoch": 1.010381184103812, + "grad_norm": 0.6676880121231079, + "learning_rate": 1.584425339879141e-05, + "loss": 0.6304363012313843, + "step": 779 + }, + { + "epoch": 1.0116788321167882, + "grad_norm": 0.5799426436424255, + "learning_rate": 1.5832644162101417e-05, + "loss": 0.59343421459198, + "step": 780 + }, + { + "epoch": 1.0129764801297647, + "grad_norm": 0.570095956325531, + "learning_rate": 1.5821022999130385e-05, + "loss": 0.5410763025283813, + "step": 781 + }, + { + "epoch": 1.0142741281427412, + "grad_norm": 0.5948435068130493, + "learning_rate": 1.580938993364064e-05, + "loss": 0.5649259686470032, + "step": 782 + }, + { + "epoch": 1.0155717761557177, + "grad_norm": 0.6467446684837341, + "learning_rate": 1.579774498941886e-05, + "loss": 0.5860875844955444, + "step": 783 + }, + { + "epoch": 1.0168694241686942, + "grad_norm": 0.5886529088020325, + "learning_rate": 1.578608819027602e-05, + "loss": 0.5772626996040344, + "step": 784 + }, + { + "epoch": 1.0181670721816707, + "grad_norm": 0.5842233896255493, + "learning_rate": 1.5774419560047303e-05, + "loss": 0.6277778148651123, + "step": 785 + }, + { + "epoch": 1.0194647201946472, + "grad_norm": 0.590059220790863, + "learning_rate": 1.5762739122592123e-05, + "loss": 0.6396061182022095, + "step": 786 + }, + { + "epoch": 1.0207623682076237, + "grad_norm": 0.5897361636161804, + "learning_rate": 1.5751046901794008e-05, + "loss": 0.5980340242385864, + "step": 787 + }, + { + "epoch": 1.0220600162206002, + "grad_norm": 0.5984208583831787, + "learning_rate": 1.5739342921560593e-05, + "loss": 0.602581262588501, + "step": 788 + }, + { + "epoch": 1.0233576642335767, + "grad_norm": 0.5837097764015198, + "learning_rate": 1.5727627205823554e-05, + "loss": 0.5742583274841309, + "step": 789 + }, + { + "epoch": 1.0246553122465532, + "grad_norm": 0.5804028511047363, + "learning_rate": 1.571589977853857e-05, + "loss": 0.6103036999702454, + "step": 790 + }, + { + "epoch": 1.0259529602595296, + "grad_norm": 0.5784346461296082, + "learning_rate": 1.5704160663685254e-05, + "loss": 0.5436456203460693, + "step": 791 + }, + { + "epoch": 1.0272506082725061, + "grad_norm": 0.576518714427948, + "learning_rate": 1.5692409885267127e-05, + "loss": 0.6918940544128418, + "step": 792 + }, + { + "epoch": 1.0285482562854826, + "grad_norm": 0.5824302434921265, + "learning_rate": 1.568064746731156e-05, + "loss": 0.6090575456619263, + "step": 793 + }, + { + "epoch": 1.0298459042984591, + "grad_norm": 0.5279770493507385, + "learning_rate": 1.5668873433869718e-05, + "loss": 0.5268336534500122, + "step": 794 + }, + { + "epoch": 1.0311435523114356, + "grad_norm": 0.5494199395179749, + "learning_rate": 1.5657087809016517e-05, + "loss": 0.5766473412513733, + "step": 795 + }, + { + "epoch": 1.0324412003244121, + "grad_norm": 0.5452569723129272, + "learning_rate": 1.564529061685058e-05, + "loss": 0.5949534177780151, + "step": 796 + }, + { + "epoch": 1.0337388483373884, + "grad_norm": 0.5392066240310669, + "learning_rate": 1.5633481881494178e-05, + "loss": 0.5571380853652954, + "step": 797 + }, + { + "epoch": 1.0350364963503649, + "grad_norm": 0.5568217635154724, + "learning_rate": 1.562166162709319e-05, + "loss": 0.5642133951187134, + "step": 798 + }, + { + "epoch": 1.0363341443633414, + "grad_norm": 0.5702704191207886, + "learning_rate": 1.560982987781704e-05, + "loss": 0.6047669649124146, + "step": 799 + }, + { + "epoch": 1.0376317923763179, + "grad_norm": 0.532315731048584, + "learning_rate": 1.5597986657858656e-05, + "loss": 0.5958635807037354, + "step": 800 + }, + { + "epoch": 1.0389294403892944, + "grad_norm": 0.5331001877784729, + "learning_rate": 1.5586131991434434e-05, + "loss": 0.5987897515296936, + "step": 801 + }, + { + "epoch": 1.0402270884022708, + "grad_norm": 0.5481564402580261, + "learning_rate": 1.5574265902784163e-05, + "loss": 0.5622409582138062, + "step": 802 + }, + { + "epoch": 1.0415247364152473, + "grad_norm": 0.5720167756080627, + "learning_rate": 1.556238841617099e-05, + "loss": 0.6064007878303528, + "step": 803 + }, + { + "epoch": 1.0428223844282238, + "grad_norm": 0.5809172987937927, + "learning_rate": 1.555049955588137e-05, + "loss": 0.6170299053192139, + "step": 804 + }, + { + "epoch": 1.0441200324412003, + "grad_norm": 0.5783301591873169, + "learning_rate": 1.5538599346225013e-05, + "loss": 0.568396270275116, + "step": 805 + }, + { + "epoch": 1.0454176804541768, + "grad_norm": 0.5668922662734985, + "learning_rate": 1.552668781153484e-05, + "loss": 0.576393723487854, + "step": 806 + }, + { + "epoch": 1.0467153284671533, + "grad_norm": 0.5634539723396301, + "learning_rate": 1.5514764976166916e-05, + "loss": 0.6574882864952087, + "step": 807 + }, + { + "epoch": 1.0480129764801298, + "grad_norm": 0.5463752150535583, + "learning_rate": 1.5502830864500426e-05, + "loss": 0.5930934548377991, + "step": 808 + }, + { + "epoch": 1.0493106244931063, + "grad_norm": 0.5872495174407959, + "learning_rate": 1.5490885500937606e-05, + "loss": 0.609790563583374, + "step": 809 + }, + { + "epoch": 1.0506082725060828, + "grad_norm": 0.5574213266372681, + "learning_rate": 1.5478928909903705e-05, + "loss": 0.60848468542099, + "step": 810 + }, + { + "epoch": 1.0519059205190593, + "grad_norm": 0.5493984818458557, + "learning_rate": 1.5466961115846927e-05, + "loss": 0.5494011640548706, + "step": 811 + }, + { + "epoch": 1.0532035685320358, + "grad_norm": 0.5724595785140991, + "learning_rate": 1.545498214323837e-05, + "loss": 0.5948253273963928, + "step": 812 + }, + { + "epoch": 1.0545012165450123, + "grad_norm": 0.5360091924667358, + "learning_rate": 1.544299201657202e-05, + "loss": 0.6195284128189087, + "step": 813 + }, + { + "epoch": 1.0557988645579885, + "grad_norm": 0.5609839558601379, + "learning_rate": 1.543099076036463e-05, + "loss": 0.5945447087287903, + "step": 814 + }, + { + "epoch": 1.057096512570965, + "grad_norm": 0.5413586497306824, + "learning_rate": 1.5418978399155748e-05, + "loss": 0.55891352891922, + "step": 815 + }, + { + "epoch": 1.0583941605839415, + "grad_norm": 0.5763382315635681, + "learning_rate": 1.54069549575076e-05, + "loss": 0.5900748372077942, + "step": 816 + }, + { + "epoch": 1.059691808596918, + "grad_norm": 0.5625810623168945, + "learning_rate": 1.539492046000509e-05, + "loss": 0.5834665298461914, + "step": 817 + }, + { + "epoch": 1.0609894566098945, + "grad_norm": 0.5442895889282227, + "learning_rate": 1.5382874931255717e-05, + "loss": 0.6234191656112671, + "step": 818 + }, + { + "epoch": 1.062287104622871, + "grad_norm": 0.5448631048202515, + "learning_rate": 1.5370818395889536e-05, + "loss": 0.5617302060127258, + "step": 819 + }, + { + "epoch": 1.0635847526358475, + "grad_norm": 0.5880674719810486, + "learning_rate": 1.5358750878559113e-05, + "loss": 0.6024942994117737, + "step": 820 + }, + { + "epoch": 1.064882400648824, + "grad_norm": 0.5762202143669128, + "learning_rate": 1.5346672403939465e-05, + "loss": 0.625447154045105, + "step": 821 + }, + { + "epoch": 1.0661800486618005, + "grad_norm": 0.5726525187492371, + "learning_rate": 1.5334582996728017e-05, + "loss": 0.6527541875839233, + "step": 822 + }, + { + "epoch": 1.067477696674777, + "grad_norm": 0.5863476991653442, + "learning_rate": 1.532248268164455e-05, + "loss": 0.6537057161331177, + "step": 823 + }, + { + "epoch": 1.0687753446877535, + "grad_norm": 0.5855088829994202, + "learning_rate": 1.5310371483431138e-05, + "loss": 0.5910706520080566, + "step": 824 + }, + { + "epoch": 1.07007299270073, + "grad_norm": 0.5428813695907593, + "learning_rate": 1.529824942685212e-05, + "loss": 0.6206585168838501, + "step": 825 + }, + { + "epoch": 1.0713706407137065, + "grad_norm": 0.5427327156066895, + "learning_rate": 1.528611653669403e-05, + "loss": 0.6064955592155457, + "step": 826 + }, + { + "epoch": 1.072668288726683, + "grad_norm": 0.5533806085586548, + "learning_rate": 1.5273972837765566e-05, + "loss": 0.6161221861839294, + "step": 827 + }, + { + "epoch": 1.0739659367396595, + "grad_norm": 0.5330477356910706, + "learning_rate": 1.526181835489751e-05, + "loss": 0.584095299243927, + "step": 828 + }, + { + "epoch": 1.075263584752636, + "grad_norm": 0.5572231411933899, + "learning_rate": 1.5249653112942708e-05, + "loss": 0.6146395206451416, + "step": 829 + }, + { + "epoch": 1.0765612327656124, + "grad_norm": 0.5302649140357971, + "learning_rate": 1.5237477136776e-05, + "loss": 0.5835666060447693, + "step": 830 + }, + { + "epoch": 1.0778588807785887, + "grad_norm": 0.524252712726593, + "learning_rate": 1.5225290451294173e-05, + "loss": 0.5483739376068115, + "step": 831 + }, + { + "epoch": 1.0791565287915652, + "grad_norm": 0.5535216331481934, + "learning_rate": 1.521309308141592e-05, + "loss": 0.5715370774269104, + "step": 832 + }, + { + "epoch": 1.0804541768045417, + "grad_norm": 0.5739737749099731, + "learning_rate": 1.5200885052081767e-05, + "loss": 0.6168693900108337, + "step": 833 + }, + { + "epoch": 1.0817518248175182, + "grad_norm": 0.5620468258857727, + "learning_rate": 1.518866638825405e-05, + "loss": 0.6358708143234253, + "step": 834 + }, + { + "epoch": 1.0830494728304947, + "grad_norm": 0.5504558086395264, + "learning_rate": 1.517643711491684e-05, + "loss": 0.5625787973403931, + "step": 835 + }, + { + "epoch": 1.0843471208434712, + "grad_norm": 0.527152955532074, + "learning_rate": 1.516419725707591e-05, + "loss": 0.5917230248451233, + "step": 836 + }, + { + "epoch": 1.0856447688564477, + "grad_norm": 0.5097678899765015, + "learning_rate": 1.5151946839758673e-05, + "loss": 0.5631688237190247, + "step": 837 + }, + { + "epoch": 1.0869424168694242, + "grad_norm": 0.5500524044036865, + "learning_rate": 1.5139685888014123e-05, + "loss": 0.6300808787345886, + "step": 838 + }, + { + "epoch": 1.0882400648824007, + "grad_norm": 0.580634355545044, + "learning_rate": 1.512741442691281e-05, + "loss": 0.6707481145858765, + "step": 839 + }, + { + "epoch": 1.0895377128953772, + "grad_norm": 0.5668573379516602, + "learning_rate": 1.5115132481546763e-05, + "loss": 0.5974687337875366, + "step": 840 + }, + { + "epoch": 1.0908353609083536, + "grad_norm": 0.5720273852348328, + "learning_rate": 1.5102840077029452e-05, + "loss": 0.5461701154708862, + "step": 841 + }, + { + "epoch": 1.0921330089213301, + "grad_norm": 0.5787645578384399, + "learning_rate": 1.509053723849574e-05, + "loss": 0.6476290225982666, + "step": 842 + }, + { + "epoch": 1.0934306569343066, + "grad_norm": 0.5475322604179382, + "learning_rate": 1.5078223991101805e-05, + "loss": 0.5730643272399902, + "step": 843 + }, + { + "epoch": 1.0947283049472831, + "grad_norm": 0.5544430017471313, + "learning_rate": 1.5065900360025128e-05, + "loss": 0.6112351417541504, + "step": 844 + }, + { + "epoch": 1.0960259529602596, + "grad_norm": 0.6194364428520203, + "learning_rate": 1.5053566370464416e-05, + "loss": 0.612515926361084, + "step": 845 + }, + { + "epoch": 1.0973236009732361, + "grad_norm": 0.5542813539505005, + "learning_rate": 1.5041222047639558e-05, + "loss": 0.60612952709198, + "step": 846 + }, + { + "epoch": 1.0986212489862126, + "grad_norm": 0.5259748697280884, + "learning_rate": 1.5028867416791566e-05, + "loss": 0.5666128396987915, + "step": 847 + }, + { + "epoch": 1.0999188969991889, + "grad_norm": 0.5615611672401428, + "learning_rate": 1.5016502503182533e-05, + "loss": 0.5991164445877075, + "step": 848 + }, + { + "epoch": 1.1012165450121654, + "grad_norm": 0.5396665334701538, + "learning_rate": 1.5004127332095579e-05, + "loss": 0.608413815498352, + "step": 849 + }, + { + "epoch": 1.1025141930251419, + "grad_norm": 0.5625605583190918, + "learning_rate": 1.49917419288348e-05, + "loss": 0.6390218138694763, + "step": 850 + }, + { + "epoch": 1.1038118410381184, + "grad_norm": 0.5652357935905457, + "learning_rate": 1.4979346318725203e-05, + "loss": 0.613496720790863, + "step": 851 + }, + { + "epoch": 1.1051094890510949, + "grad_norm": 0.5494624376296997, + "learning_rate": 1.4966940527112679e-05, + "loss": 0.6234304308891296, + "step": 852 + }, + { + "epoch": 1.1064071370640713, + "grad_norm": 0.546302855014801, + "learning_rate": 1.4954524579363932e-05, + "loss": 0.6565023064613342, + "step": 853 + }, + { + "epoch": 1.1077047850770478, + "grad_norm": 0.5649261474609375, + "learning_rate": 1.4942098500866428e-05, + "loss": 0.6422203183174133, + "step": 854 + }, + { + "epoch": 1.1090024330900243, + "grad_norm": 0.5499486923217773, + "learning_rate": 1.4929662317028359e-05, + "loss": 0.6043179035186768, + "step": 855 + }, + { + "epoch": 1.1103000811030008, + "grad_norm": 0.5544485449790955, + "learning_rate": 1.491721605327857e-05, + "loss": 0.5800666213035583, + "step": 856 + }, + { + "epoch": 1.1115977291159773, + "grad_norm": 0.5804775953292847, + "learning_rate": 1.490475973506652e-05, + "loss": 0.6427537798881531, + "step": 857 + }, + { + "epoch": 1.1128953771289538, + "grad_norm": 0.5342238545417786, + "learning_rate": 1.4892293387862221e-05, + "loss": 0.6311315298080444, + "step": 858 + }, + { + "epoch": 1.1141930251419303, + "grad_norm": 0.5803128480911255, + "learning_rate": 1.487981703715621e-05, + "loss": 0.6198186874389648, + "step": 859 + }, + { + "epoch": 1.1154906731549068, + "grad_norm": 0.5532170534133911, + "learning_rate": 1.4867330708459463e-05, + "loss": 0.6145609617233276, + "step": 860 + }, + { + "epoch": 1.1167883211678833, + "grad_norm": 0.5493961572647095, + "learning_rate": 1.4854834427303353e-05, + "loss": 0.6166091561317444, + "step": 861 + }, + { + "epoch": 1.1180859691808598, + "grad_norm": 0.5559639930725098, + "learning_rate": 1.4842328219239618e-05, + "loss": 0.6064823865890503, + "step": 862 + }, + { + "epoch": 1.119383617193836, + "grad_norm": 0.5540943145751953, + "learning_rate": 1.4829812109840291e-05, + "loss": 0.5765544176101685, + "step": 863 + }, + { + "epoch": 1.1206812652068125, + "grad_norm": 0.5384024381637573, + "learning_rate": 1.4817286124697647e-05, + "loss": 0.565604567527771, + "step": 864 + }, + { + "epoch": 1.121978913219789, + "grad_norm": 0.5547834634780884, + "learning_rate": 1.480475028942415e-05, + "loss": 0.6463969349861145, + "step": 865 + }, + { + "epoch": 1.1232765612327655, + "grad_norm": 0.5574260354042053, + "learning_rate": 1.4792204629652414e-05, + "loss": 0.5858181118965149, + "step": 866 + }, + { + "epoch": 1.124574209245742, + "grad_norm": 0.5450447201728821, + "learning_rate": 1.4779649171035138e-05, + "loss": 0.6112916469573975, + "step": 867 + }, + { + "epoch": 1.1258718572587185, + "grad_norm": 0.5452038645744324, + "learning_rate": 1.4767083939245055e-05, + "loss": 0.6333041787147522, + "step": 868 + }, + { + "epoch": 1.127169505271695, + "grad_norm": 0.5453193187713623, + "learning_rate": 1.475450895997489e-05, + "loss": 0.6154720783233643, + "step": 869 + }, + { + "epoch": 1.1284671532846715, + "grad_norm": 0.5503911375999451, + "learning_rate": 1.4741924258937283e-05, + "loss": 0.580187201499939, + "step": 870 + }, + { + "epoch": 1.129764801297648, + "grad_norm": 0.564156174659729, + "learning_rate": 1.472932986186477e-05, + "loss": 0.6397178173065186, + "step": 871 + }, + { + "epoch": 1.1310624493106245, + "grad_norm": 0.5705751180648804, + "learning_rate": 1.47167257945097e-05, + "loss": 0.6369278430938721, + "step": 872 + }, + { + "epoch": 1.132360097323601, + "grad_norm": 0.562324583530426, + "learning_rate": 1.4704112082644207e-05, + "loss": 0.5986394882202148, + "step": 873 + }, + { + "epoch": 1.1336577453365775, + "grad_norm": 0.5652042031288147, + "learning_rate": 1.4691488752060132e-05, + "loss": 0.6185961365699768, + "step": 874 + }, + { + "epoch": 1.134955393349554, + "grad_norm": 0.5481469035148621, + "learning_rate": 1.4678855828568996e-05, + "loss": 0.5570172071456909, + "step": 875 + }, + { + "epoch": 1.1362530413625305, + "grad_norm": 0.5480834245681763, + "learning_rate": 1.4666213338001929e-05, + "loss": 0.5788794755935669, + "step": 876 + }, + { + "epoch": 1.137550689375507, + "grad_norm": 0.5426838994026184, + "learning_rate": 1.4653561306209625e-05, + "loss": 0.5975257158279419, + "step": 877 + }, + { + "epoch": 1.1388483373884835, + "grad_norm": 0.5632731914520264, + "learning_rate": 1.4640899759062285e-05, + "loss": 0.6319808959960938, + "step": 878 + }, + { + "epoch": 1.14014598540146, + "grad_norm": 0.5687447786331177, + "learning_rate": 1.462822872244957e-05, + "loss": 0.6043187379837036, + "step": 879 + }, + { + "epoch": 1.1414436334144362, + "grad_norm": 0.5472837686538696, + "learning_rate": 1.461554822228054e-05, + "loss": 0.607802152633667, + "step": 880 + }, + { + "epoch": 1.142741281427413, + "grad_norm": 0.5329515933990479, + "learning_rate": 1.460285828448361e-05, + "loss": 0.5557148456573486, + "step": 881 + }, + { + "epoch": 1.1440389294403892, + "grad_norm": 0.5272259712219238, + "learning_rate": 1.4590158935006494e-05, + "loss": 0.5320879817008972, + "step": 882 + }, + { + "epoch": 1.1453365774533657, + "grad_norm": 0.5834517478942871, + "learning_rate": 1.4577450199816142e-05, + "loss": 0.6263319253921509, + "step": 883 + }, + { + "epoch": 1.1466342254663422, + "grad_norm": 0.5725152492523193, + "learning_rate": 1.4564732104898702e-05, + "loss": 0.659183919429779, + "step": 884 + }, + { + "epoch": 1.1479318734793187, + "grad_norm": 0.5416671633720398, + "learning_rate": 1.4552004676259462e-05, + "loss": 0.5948503613471985, + "step": 885 + }, + { + "epoch": 1.1492295214922952, + "grad_norm": 0.5543138384819031, + "learning_rate": 1.453926793992279e-05, + "loss": 0.6404953002929688, + "step": 886 + }, + { + "epoch": 1.1505271695052717, + "grad_norm": 0.5595470070838928, + "learning_rate": 1.4526521921932091e-05, + "loss": 0.6393734812736511, + "step": 887 + }, + { + "epoch": 1.1518248175182482, + "grad_norm": 0.5882608294487, + "learning_rate": 1.4513766648349742e-05, + "loss": 0.5654003024101257, + "step": 888 + }, + { + "epoch": 1.1531224655312247, + "grad_norm": 0.5529691576957703, + "learning_rate": 1.4501002145257048e-05, + "loss": 0.6137228012084961, + "step": 889 + }, + { + "epoch": 1.1544201135442012, + "grad_norm": 0.5548762083053589, + "learning_rate": 1.4488228438754191e-05, + "loss": 0.603983998298645, + "step": 890 + }, + { + "epoch": 1.1557177615571776, + "grad_norm": 0.5486696362495422, + "learning_rate": 1.4475445554960166e-05, + "loss": 0.6514973640441895, + "step": 891 + }, + { + "epoch": 1.1570154095701541, + "grad_norm": 0.5455385446548462, + "learning_rate": 1.4462653520012736e-05, + "loss": 0.6550310850143433, + "step": 892 + }, + { + "epoch": 1.1583130575831306, + "grad_norm": 0.5628224015235901, + "learning_rate": 1.4449852360068372e-05, + "loss": 0.6537249088287354, + "step": 893 + }, + { + "epoch": 1.1596107055961071, + "grad_norm": 0.5596909523010254, + "learning_rate": 1.4437042101302212e-05, + "loss": 0.6253930926322937, + "step": 894 + }, + { + "epoch": 1.1609083536090836, + "grad_norm": 0.5298051238059998, + "learning_rate": 1.4424222769907985e-05, + "loss": 0.57865309715271, + "step": 895 + }, + { + "epoch": 1.1622060016220601, + "grad_norm": 0.5473706722259521, + "learning_rate": 1.4411394392097985e-05, + "loss": 0.5876542329788208, + "step": 896 + }, + { + "epoch": 1.1635036496350364, + "grad_norm": 0.5646262168884277, + "learning_rate": 1.4398556994102996e-05, + "loss": 0.6242583990097046, + "step": 897 + }, + { + "epoch": 1.164801297648013, + "grad_norm": 0.5632451176643372, + "learning_rate": 1.4385710602172245e-05, + "loss": 0.6315684914588928, + "step": 898 + }, + { + "epoch": 1.1660989456609894, + "grad_norm": 0.5819709300994873, + "learning_rate": 1.4372855242573356e-05, + "loss": 0.5947535037994385, + "step": 899 + }, + { + "epoch": 1.1673965936739659, + "grad_norm": 0.5634546875953674, + "learning_rate": 1.4359990941592283e-05, + "loss": 0.6281697750091553, + "step": 900 + }, + { + "epoch": 1.1686942416869424, + "grad_norm": 0.5534945130348206, + "learning_rate": 1.4347117725533269e-05, + "loss": 0.567562460899353, + "step": 901 + }, + { + "epoch": 1.1699918896999189, + "grad_norm": 0.5352903604507446, + "learning_rate": 1.4334235620718774e-05, + "loss": 0.5504214763641357, + "step": 902 + }, + { + "epoch": 1.1712895377128953, + "grad_norm": 0.5894420146942139, + "learning_rate": 1.4321344653489453e-05, + "loss": 0.5871877074241638, + "step": 903 + }, + { + "epoch": 1.1725871857258718, + "grad_norm": 0.5826941728591919, + "learning_rate": 1.4308444850204066e-05, + "loss": 0.5854516625404358, + "step": 904 + }, + { + "epoch": 1.1738848337388483, + "grad_norm": 0.5583464503288269, + "learning_rate": 1.4295536237239445e-05, + "loss": 0.6143467426300049, + "step": 905 + }, + { + "epoch": 1.1751824817518248, + "grad_norm": 0.5566253662109375, + "learning_rate": 1.4282618840990438e-05, + "loss": 0.6143018007278442, + "step": 906 + }, + { + "epoch": 1.1764801297648013, + "grad_norm": 0.5643221735954285, + "learning_rate": 1.4269692687869849e-05, + "loss": 0.6445101499557495, + "step": 907 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.583202600479126, + "learning_rate": 1.425675780430839e-05, + "loss": 0.6551916599273682, + "step": 908 + }, + { + "epoch": 1.1790754257907543, + "grad_norm": 0.5802360773086548, + "learning_rate": 1.4243814216754626e-05, + "loss": 0.6176046133041382, + "step": 909 + }, + { + "epoch": 1.1803730738037308, + "grad_norm": 0.5651218295097351, + "learning_rate": 1.4230861951674914e-05, + "loss": 0.6476747393608093, + "step": 910 + }, + { + "epoch": 1.1816707218167073, + "grad_norm": 0.5351070761680603, + "learning_rate": 1.421790103555336e-05, + "loss": 0.5974748134613037, + "step": 911 + }, + { + "epoch": 1.1829683698296838, + "grad_norm": 0.5506876111030579, + "learning_rate": 1.4204931494891759e-05, + "loss": 0.5977579355239868, + "step": 912 + }, + { + "epoch": 1.1842660178426603, + "grad_norm": 0.5496414303779602, + "learning_rate": 1.4191953356209535e-05, + "loss": 0.5993613004684448, + "step": 913 + }, + { + "epoch": 1.1855636658556366, + "grad_norm": 0.5448877215385437, + "learning_rate": 1.4178966646043702e-05, + "loss": 0.5849076509475708, + "step": 914 + }, + { + "epoch": 1.186861313868613, + "grad_norm": 0.5505439043045044, + "learning_rate": 1.4165971390948787e-05, + "loss": 0.6557425856590271, + "step": 915 + }, + { + "epoch": 1.1881589618815895, + "grad_norm": 0.5327088236808777, + "learning_rate": 1.4152967617496805e-05, + "loss": 0.5915898084640503, + "step": 916 + }, + { + "epoch": 1.189456609894566, + "grad_norm": 0.5534889698028564, + "learning_rate": 1.4139955352277176e-05, + "loss": 0.574662983417511, + "step": 917 + }, + { + "epoch": 1.1907542579075425, + "grad_norm": 0.5179355144500732, + "learning_rate": 1.4126934621896692e-05, + "loss": 0.5562629699707031, + "step": 918 + }, + { + "epoch": 1.192051905920519, + "grad_norm": 0.5698444247245789, + "learning_rate": 1.4113905452979455e-05, + "loss": 0.6139298677444458, + "step": 919 + }, + { + "epoch": 1.1933495539334955, + "grad_norm": 0.5280522108078003, + "learning_rate": 1.410086787216681e-05, + "loss": 0.5793087482452393, + "step": 920 + }, + { + "epoch": 1.1933495539334955, + "eval_loss": 0.6891781091690063, + "eval_runtime": 72.4953, + "eval_samples_per_second": 71.618, + "eval_steps_per_second": 8.952, + "step": 920 + }, + { + "epoch": 1.194647201946472, + "grad_norm": 0.518786609172821, + "learning_rate": 1.4087821906117314e-05, + "loss": 0.5602763891220093, + "step": 921 + }, + { + "epoch": 1.1959448499594485, + "grad_norm": 0.5518815517425537, + "learning_rate": 1.4074767581506666e-05, + "loss": 0.6225783824920654, + "step": 922 + }, + { + "epoch": 1.197242497972425, + "grad_norm": 0.5233501195907593, + "learning_rate": 1.4061704925027653e-05, + "loss": 0.5846587419509888, + "step": 923 + }, + { + "epoch": 1.1985401459854015, + "grad_norm": 0.5470210313796997, + "learning_rate": 1.4048633963390105e-05, + "loss": 0.5750600099563599, + "step": 924 + }, + { + "epoch": 1.199837793998378, + "grad_norm": 0.5647477507591248, + "learning_rate": 1.4035554723320828e-05, + "loss": 0.5977157354354858, + "step": 925 + }, + { + "epoch": 1.2011354420113545, + "grad_norm": 0.5179945230484009, + "learning_rate": 1.4022467231563554e-05, + "loss": 0.5806452035903931, + "step": 926 + }, + { + "epoch": 1.202433090024331, + "grad_norm": 0.5535194873809814, + "learning_rate": 1.4009371514878898e-05, + "loss": 0.6628227233886719, + "step": 927 + }, + { + "epoch": 1.2037307380373075, + "grad_norm": 0.6273780465126038, + "learning_rate": 1.399626760004428e-05, + "loss": 0.6142767667770386, + "step": 928 + }, + { + "epoch": 1.205028386050284, + "grad_norm": 0.5373409390449524, + "learning_rate": 1.3983155513853897e-05, + "loss": 0.6562739610671997, + "step": 929 + }, + { + "epoch": 1.2063260340632604, + "grad_norm": 0.5411200523376465, + "learning_rate": 1.3970035283118639e-05, + "loss": 0.5903608202934265, + "step": 930 + }, + { + "epoch": 1.2076236820762367, + "grad_norm": 0.5595235824584961, + "learning_rate": 1.3956906934666056e-05, + "loss": 0.6051539182662964, + "step": 931 + }, + { + "epoch": 1.2089213300892132, + "grad_norm": 0.5300971865653992, + "learning_rate": 1.3943770495340307e-05, + "loss": 0.643832802772522, + "step": 932 + }, + { + "epoch": 1.2102189781021897, + "grad_norm": 0.5413315892219543, + "learning_rate": 1.3930625992002076e-05, + "loss": 0.5942864418029785, + "step": 933 + }, + { + "epoch": 1.2115166261151662, + "grad_norm": 0.558797299861908, + "learning_rate": 1.391747345152855e-05, + "loss": 0.619717001914978, + "step": 934 + }, + { + "epoch": 1.2128142741281427, + "grad_norm": 0.5264928936958313, + "learning_rate": 1.3904312900813345e-05, + "loss": 0.5522656440734863, + "step": 935 + }, + { + "epoch": 1.2141119221411192, + "grad_norm": 0.5257030725479126, + "learning_rate": 1.3891144366766457e-05, + "loss": 0.5786164999008179, + "step": 936 + }, + { + "epoch": 1.2154095701540957, + "grad_norm": 0.577509343624115, + "learning_rate": 1.3877967876314205e-05, + "loss": 0.6315740346908569, + "step": 937 + }, + { + "epoch": 1.2167072181670722, + "grad_norm": 0.5317774415016174, + "learning_rate": 1.3864783456399174e-05, + "loss": 0.5896605253219604, + "step": 938 + }, + { + "epoch": 1.2180048661800487, + "grad_norm": 0.5598568320274353, + "learning_rate": 1.3851591133980167e-05, + "loss": 0.6161408424377441, + "step": 939 + }, + { + "epoch": 1.2193025141930252, + "grad_norm": 0.5387381911277771, + "learning_rate": 1.3838390936032146e-05, + "loss": 0.5705558061599731, + "step": 940 + }, + { + "epoch": 1.2206001622060016, + "grad_norm": 0.5279619693756104, + "learning_rate": 1.3825182889546173e-05, + "loss": 0.5650646686553955, + "step": 941 + }, + { + "epoch": 1.2218978102189781, + "grad_norm": 0.5602632164955139, + "learning_rate": 1.3811967021529362e-05, + "loss": 0.6143766045570374, + "step": 942 + }, + { + "epoch": 1.2231954582319546, + "grad_norm": 0.5425279140472412, + "learning_rate": 1.3798743359004816e-05, + "loss": 0.602745771408081, + "step": 943 + }, + { + "epoch": 1.2244931062449311, + "grad_norm": 0.5385331511497498, + "learning_rate": 1.378551192901158e-05, + "loss": 0.5555763244628906, + "step": 944 + }, + { + "epoch": 1.2257907542579076, + "grad_norm": 0.5338374972343445, + "learning_rate": 1.3772272758604576e-05, + "loss": 0.5934339165687561, + "step": 945 + }, + { + "epoch": 1.2270884022708841, + "grad_norm": 0.5479584336280823, + "learning_rate": 1.375902587485456e-05, + "loss": 0.5891726016998291, + "step": 946 + }, + { + "epoch": 1.2283860502838606, + "grad_norm": 0.5357087254524231, + "learning_rate": 1.3745771304848056e-05, + "loss": 0.5626200437545776, + "step": 947 + }, + { + "epoch": 1.2296836982968369, + "grad_norm": 0.5543829202651978, + "learning_rate": 1.3732509075687302e-05, + "loss": 0.5829602479934692, + "step": 948 + }, + { + "epoch": 1.2309813463098134, + "grad_norm": 0.5650047659873962, + "learning_rate": 1.3719239214490203e-05, + "loss": 0.6154081225395203, + "step": 949 + }, + { + "epoch": 1.2322789943227899, + "grad_norm": 0.5745924711227417, + "learning_rate": 1.3705961748390264e-05, + "loss": 0.5824979543685913, + "step": 950 + }, + { + "epoch": 1.2335766423357664, + "grad_norm": 0.5524203777313232, + "learning_rate": 1.3692676704536547e-05, + "loss": 0.6566962599754333, + "step": 951 + }, + { + "epoch": 1.2348742903487429, + "grad_norm": 0.5592309832572937, + "learning_rate": 1.3679384110093601e-05, + "loss": 0.5955104231834412, + "step": 952 + }, + { + "epoch": 1.2361719383617193, + "grad_norm": 0.5388526916503906, + "learning_rate": 1.3666083992241414e-05, + "loss": 0.6259311437606812, + "step": 953 + }, + { + "epoch": 1.2374695863746958, + "grad_norm": 0.5431481599807739, + "learning_rate": 1.3652776378175366e-05, + "loss": 0.6409016847610474, + "step": 954 + }, + { + "epoch": 1.2387672343876723, + "grad_norm": 0.5381134748458862, + "learning_rate": 1.3639461295106157e-05, + "loss": 0.5895624160766602, + "step": 955 + }, + { + "epoch": 1.2400648824006488, + "grad_norm": 0.5462051630020142, + "learning_rate": 1.3626138770259765e-05, + "loss": 0.5515483617782593, + "step": 956 + }, + { + "epoch": 1.2413625304136253, + "grad_norm": 0.5416935682296753, + "learning_rate": 1.3612808830877377e-05, + "loss": 0.5839380621910095, + "step": 957 + }, + { + "epoch": 1.2426601784266018, + "grad_norm": 0.543431282043457, + "learning_rate": 1.3599471504215347e-05, + "loss": 0.6129022836685181, + "step": 958 + }, + { + "epoch": 1.2439578264395783, + "grad_norm": 0.5546287894248962, + "learning_rate": 1.358612681754513e-05, + "loss": 0.5957478284835815, + "step": 959 + }, + { + "epoch": 1.2452554744525548, + "grad_norm": 0.5636503100395203, + "learning_rate": 1.357277479815324e-05, + "loss": 0.6206330060958862, + "step": 960 + }, + { + "epoch": 1.2465531224655313, + "grad_norm": 0.5537446141242981, + "learning_rate": 1.355941547334117e-05, + "loss": 0.5747988224029541, + "step": 961 + }, + { + "epoch": 1.2478507704785078, + "grad_norm": 0.5459409952163696, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.5868381261825562, + "step": 962 + }, + { + "epoch": 1.2491484184914843, + "grad_norm": 0.5428374409675598, + "learning_rate": 1.3532675016737127e-05, + "loss": 0.6297606825828552, + "step": 963 + }, + { + "epoch": 1.2504460665044608, + "grad_norm": 0.5484406352043152, + "learning_rate": 1.3519293939622622e-05, + "loss": 0.6754599213600159, + "step": 964 + }, + { + "epoch": 1.251743714517437, + "grad_norm": 0.5630886554718018, + "learning_rate": 1.3505905666442757e-05, + "loss": 0.655160129070282, + "step": 965 + }, + { + "epoch": 1.2530413625304138, + "grad_norm": 0.5442233085632324, + "learning_rate": 1.3492510224573165e-05, + "loss": 0.5808818936347961, + "step": 966 + }, + { + "epoch": 1.25433901054339, + "grad_norm": 0.5171942114830017, + "learning_rate": 1.3479107641404134e-05, + "loss": 0.5760788321495056, + "step": 967 + }, + { + "epoch": 1.2556366585563665, + "grad_norm": 0.5334968566894531, + "learning_rate": 1.3465697944340552e-05, + "loss": 0.5447085499763489, + "step": 968 + }, + { + "epoch": 1.256934306569343, + "grad_norm": 0.5165731310844421, + "learning_rate": 1.3452281160801856e-05, + "loss": 0.600307822227478, + "step": 969 + }, + { + "epoch": 1.2582319545823195, + "grad_norm": 0.5485058426856995, + "learning_rate": 1.3438857318221974e-05, + "loss": 0.6196280717849731, + "step": 970 + }, + { + "epoch": 1.259529602595296, + "grad_norm": 0.5499110817909241, + "learning_rate": 1.3425426444049265e-05, + "loss": 0.6000030040740967, + "step": 971 + }, + { + "epoch": 1.2608272506082725, + "grad_norm": 0.5815853476524353, + "learning_rate": 1.3411988565746467e-05, + "loss": 0.6568498611450195, + "step": 972 + }, + { + "epoch": 1.262124898621249, + "grad_norm": 0.5364983081817627, + "learning_rate": 1.3398543710790642e-05, + "loss": 0.6078934073448181, + "step": 973 + }, + { + "epoch": 1.2634225466342255, + "grad_norm": 0.5517644286155701, + "learning_rate": 1.3385091906673115e-05, + "loss": 0.6221879720687866, + "step": 974 + }, + { + "epoch": 1.264720194647202, + "grad_norm": 0.5543562769889832, + "learning_rate": 1.3371633180899417e-05, + "loss": 0.6666390895843506, + "step": 975 + }, + { + "epoch": 1.2660178426601785, + "grad_norm": 0.5409432053565979, + "learning_rate": 1.335816756098924e-05, + "loss": 0.6188746690750122, + "step": 976 + }, + { + "epoch": 1.267315490673155, + "grad_norm": 0.590812087059021, + "learning_rate": 1.3344695074476365e-05, + "loss": 0.6498491764068604, + "step": 977 + }, + { + "epoch": 1.2686131386861315, + "grad_norm": 0.5648714900016785, + "learning_rate": 1.3331215748908622e-05, + "loss": 0.6376237869262695, + "step": 978 + }, + { + "epoch": 1.269910786699108, + "grad_norm": 0.5377125144004822, + "learning_rate": 1.3317729611847818e-05, + "loss": 0.6080333590507507, + "step": 979 + }, + { + "epoch": 1.2712084347120842, + "grad_norm": 0.6160985231399536, + "learning_rate": 1.3304236690869688e-05, + "loss": 0.6452457904815674, + "step": 980 + }, + { + "epoch": 1.272506082725061, + "grad_norm": 0.5675063133239746, + "learning_rate": 1.329073701356384e-05, + "loss": 0.6066033840179443, + "step": 981 + }, + { + "epoch": 1.2738037307380372, + "grad_norm": 0.5339285731315613, + "learning_rate": 1.3277230607533698e-05, + "loss": 0.563126266002655, + "step": 982 + }, + { + "epoch": 1.275101378751014, + "grad_norm": 0.558273434638977, + "learning_rate": 1.3263717500396446e-05, + "loss": 0.6070864796638489, + "step": 983 + }, + { + "epoch": 1.2763990267639902, + "grad_norm": 0.5663204789161682, + "learning_rate": 1.3250197719782966e-05, + "loss": 0.6016590595245361, + "step": 984 + }, + { + "epoch": 1.2776966747769667, + "grad_norm": 0.5561959743499756, + "learning_rate": 1.3236671293337788e-05, + "loss": 0.6111094951629639, + "step": 985 + }, + { + "epoch": 1.2789943227899432, + "grad_norm": 0.5440069437026978, + "learning_rate": 1.3223138248719032e-05, + "loss": 0.6232655644416809, + "step": 986 + }, + { + "epoch": 1.2802919708029197, + "grad_norm": 0.5603107810020447, + "learning_rate": 1.3209598613598344e-05, + "loss": 0.5950015783309937, + "step": 987 + }, + { + "epoch": 1.2815896188158962, + "grad_norm": 0.538038969039917, + "learning_rate": 1.3196052415660856e-05, + "loss": 0.6100248098373413, + "step": 988 + }, + { + "epoch": 1.2828872668288727, + "grad_norm": 0.5667180418968201, + "learning_rate": 1.318249968260511e-05, + "loss": 0.6681912541389465, + "step": 989 + }, + { + "epoch": 1.2841849148418492, + "grad_norm": 0.5527055859565735, + "learning_rate": 1.316894044214302e-05, + "loss": 0.6051948070526123, + "step": 990 + }, + { + "epoch": 1.2854825628548256, + "grad_norm": 0.5413651466369629, + "learning_rate": 1.3155374721999797e-05, + "loss": 0.5882329940795898, + "step": 991 + }, + { + "epoch": 1.2867802108678021, + "grad_norm": 0.5323876738548279, + "learning_rate": 1.3141802549913907e-05, + "loss": 0.6183469295501709, + "step": 992 + }, + { + "epoch": 1.2880778588807786, + "grad_norm": 0.5273195505142212, + "learning_rate": 1.3128223953637003e-05, + "loss": 0.5676054954528809, + "step": 993 + }, + { + "epoch": 1.2893755068937551, + "grad_norm": 0.567756175994873, + "learning_rate": 1.3114638960933883e-05, + "loss": 0.6798044443130493, + "step": 994 + }, + { + "epoch": 1.2906731549067316, + "grad_norm": 0.5517603754997253, + "learning_rate": 1.3101047599582415e-05, + "loss": 0.6340286731719971, + "step": 995 + }, + { + "epoch": 1.2919708029197081, + "grad_norm": 0.5477331280708313, + "learning_rate": 1.3087449897373494e-05, + "loss": 0.6021038889884949, + "step": 996 + }, + { + "epoch": 1.2932684509326844, + "grad_norm": 0.551368772983551, + "learning_rate": 1.307384588211098e-05, + "loss": 0.5940453410148621, + "step": 997 + }, + { + "epoch": 1.294566098945661, + "grad_norm": 0.5456337928771973, + "learning_rate": 1.306023558161164e-05, + "loss": 0.6023222208023071, + "step": 998 + }, + { + "epoch": 1.2958637469586374, + "grad_norm": 0.5676029324531555, + "learning_rate": 1.3046619023705095e-05, + "loss": 0.6922143697738647, + "step": 999 + }, + { + "epoch": 1.2971613949716139, + "grad_norm": 0.5776983499526978, + "learning_rate": 1.3032996236233756e-05, + "loss": 0.6589181423187256, + "step": 1000 + }, + { + "epoch": 1.2984590429845904, + "grad_norm": 0.5594776272773743, + "learning_rate": 1.3019367247052781e-05, + "loss": 0.6284008622169495, + "step": 1001 + }, + { + "epoch": 1.2997566909975669, + "grad_norm": 0.5632730722427368, + "learning_rate": 1.300573208403e-05, + "loss": 0.586546778678894, + "step": 1002 + }, + { + "epoch": 1.3010543390105433, + "grad_norm": 0.5418180823326111, + "learning_rate": 1.2992090775045868e-05, + "loss": 0.5931944847106934, + "step": 1003 + }, + { + "epoch": 1.3023519870235198, + "grad_norm": 0.5260592699050903, + "learning_rate": 1.2978443347993415e-05, + "loss": 0.5439613461494446, + "step": 1004 + }, + { + "epoch": 1.3036496350364963, + "grad_norm": 0.546437680721283, + "learning_rate": 1.296478983077817e-05, + "loss": 0.5946912169456482, + "step": 1005 + }, + { + "epoch": 1.3049472830494728, + "grad_norm": 0.5575598478317261, + "learning_rate": 1.2951130251318125e-05, + "loss": 0.6190862655639648, + "step": 1006 + }, + { + "epoch": 1.3062449310624493, + "grad_norm": 0.5441600680351257, + "learning_rate": 1.2937464637543655e-05, + "loss": 0.613700270652771, + "step": 1007 + }, + { + "epoch": 1.3075425790754258, + "grad_norm": 0.5194239020347595, + "learning_rate": 1.2923793017397488e-05, + "loss": 0.551931619644165, + "step": 1008 + }, + { + "epoch": 1.3088402270884023, + "grad_norm": 0.521641194820404, + "learning_rate": 1.2910115418834624e-05, + "loss": 0.544873833656311, + "step": 1009 + }, + { + "epoch": 1.3101378751013788, + "grad_norm": 0.5697146654129028, + "learning_rate": 1.289643186982229e-05, + "loss": 0.6762262582778931, + "step": 1010 + }, + { + "epoch": 1.3114355231143553, + "grad_norm": 0.5358358025550842, + "learning_rate": 1.2882742398339884e-05, + "loss": 0.5811675190925598, + "step": 1011 + }, + { + "epoch": 1.3127331711273318, + "grad_norm": 0.5812531113624573, + "learning_rate": 1.2869047032378905e-05, + "loss": 0.6202974319458008, + "step": 1012 + }, + { + "epoch": 1.3140308191403083, + "grad_norm": 0.5383328795433044, + "learning_rate": 1.2855345799942915e-05, + "loss": 0.58216392993927, + "step": 1013 + }, + { + "epoch": 1.3153284671532846, + "grad_norm": 0.5470954775810242, + "learning_rate": 1.2841638729047463e-05, + "loss": 0.5842857360839844, + "step": 1014 + }, + { + "epoch": 1.3166261151662613, + "grad_norm": 0.5181686878204346, + "learning_rate": 1.2827925847720041e-05, + "loss": 0.5985524654388428, + "step": 1015 + }, + { + "epoch": 1.3179237631792375, + "grad_norm": 0.5179515480995178, + "learning_rate": 1.2814207184000018e-05, + "loss": 0.5709914565086365, + "step": 1016 + }, + { + "epoch": 1.319221411192214, + "grad_norm": 0.5449542999267578, + "learning_rate": 1.2800482765938594e-05, + "loss": 0.646975576877594, + "step": 1017 + }, + { + "epoch": 1.3205190592051905, + "grad_norm": 0.5302087664604187, + "learning_rate": 1.2786752621598726e-05, + "loss": 0.6145081520080566, + "step": 1018 + }, + { + "epoch": 1.321816707218167, + "grad_norm": 0.5520698428153992, + "learning_rate": 1.2773016779055089e-05, + "loss": 0.5821577906608582, + "step": 1019 + }, + { + "epoch": 1.3231143552311435, + "grad_norm": 0.5411002039909363, + "learning_rate": 1.2759275266393998e-05, + "loss": 0.5899526476860046, + "step": 1020 + }, + { + "epoch": 1.32441200324412, + "grad_norm": 0.5193924307823181, + "learning_rate": 1.2745528111713373e-05, + "loss": 0.5851880311965942, + "step": 1021 + }, + { + "epoch": 1.3257096512570965, + "grad_norm": 0.5581620931625366, + "learning_rate": 1.2731775343122663e-05, + "loss": 0.6368898153305054, + "step": 1022 + }, + { + "epoch": 1.327007299270073, + "grad_norm": 0.5761281847953796, + "learning_rate": 1.2718016988742799e-05, + "loss": 0.6208426356315613, + "step": 1023 + }, + { + "epoch": 1.3283049472830495, + "grad_norm": 0.5429732799530029, + "learning_rate": 1.270425307670614e-05, + "loss": 0.5906336307525635, + "step": 1024 + }, + { + "epoch": 1.329602595296026, + "grad_norm": 0.5482628345489502, + "learning_rate": 1.2690483635156392e-05, + "loss": 0.6205004453659058, + "step": 1025 + }, + { + "epoch": 1.3309002433090025, + "grad_norm": 0.53929603099823, + "learning_rate": 1.2676708692248583e-05, + "loss": 0.5814516544342041, + "step": 1026 + }, + { + "epoch": 1.332197891321979, + "grad_norm": 0.5420404076576233, + "learning_rate": 1.2662928276148985e-05, + "loss": 0.6052178740501404, + "step": 1027 + }, + { + "epoch": 1.3334955393349555, + "grad_norm": 0.5524218678474426, + "learning_rate": 1.264914241503506e-05, + "loss": 0.639128565788269, + "step": 1028 + }, + { + "epoch": 1.334793187347932, + "grad_norm": 0.5308884978294373, + "learning_rate": 1.2635351137095408e-05, + "loss": 0.5758256316184998, + "step": 1029 + }, + { + "epoch": 1.3360908353609084, + "grad_norm": 0.556959867477417, + "learning_rate": 1.2621554470529698e-05, + "loss": 0.6215351223945618, + "step": 1030 + }, + { + "epoch": 1.3373884833738847, + "grad_norm": 0.5299232006072998, + "learning_rate": 1.2607752443548622e-05, + "loss": 0.6064879298210144, + "step": 1031 + }, + { + "epoch": 1.3386861313868614, + "grad_norm": 0.5557371973991394, + "learning_rate": 1.259394508437383e-05, + "loss": 0.62589031457901, + "step": 1032 + }, + { + "epoch": 1.3399837793998377, + "grad_norm": 0.5563995242118835, + "learning_rate": 1.2580132421237883e-05, + "loss": 0.6236660480499268, + "step": 1033 + }, + { + "epoch": 1.3412814274128142, + "grad_norm": 0.5666968822479248, + "learning_rate": 1.2566314482384174e-05, + "loss": 0.6252362728118896, + "step": 1034 + }, + { + "epoch": 1.3425790754257907, + "grad_norm": 0.5652741193771362, + "learning_rate": 1.2552491296066895e-05, + "loss": 0.6189643144607544, + "step": 1035 + }, + { + "epoch": 1.3438767234387672, + "grad_norm": 0.5583733320236206, + "learning_rate": 1.2538662890550959e-05, + "loss": 0.6765375137329102, + "step": 1036 + }, + { + "epoch": 1.3451743714517437, + "grad_norm": 0.5742061138153076, + "learning_rate": 1.252482929411196e-05, + "loss": 0.6477082967758179, + "step": 1037 + }, + { + "epoch": 1.3464720194647202, + "grad_norm": 0.5400403141975403, + "learning_rate": 1.25109905350361e-05, + "loss": 0.5811231136322021, + "step": 1038 + }, + { + "epoch": 1.3477696674776967, + "grad_norm": 0.5390773415565491, + "learning_rate": 1.249714664162014e-05, + "loss": 0.6055101156234741, + "step": 1039 + }, + { + "epoch": 1.3490673154906732, + "grad_norm": 0.5596996545791626, + "learning_rate": 1.2483297642171332e-05, + "loss": 0.6074774265289307, + "step": 1040 + }, + { + "epoch": 1.3503649635036497, + "grad_norm": 0.5600677728652954, + "learning_rate": 1.246944356500738e-05, + "loss": 0.6564399003982544, + "step": 1041 + }, + { + "epoch": 1.3516626115166261, + "grad_norm": 0.5470819473266602, + "learning_rate": 1.2455584438456366e-05, + "loss": 0.6430810689926147, + "step": 1042 + }, + { + "epoch": 1.3529602595296026, + "grad_norm": 0.5539683699607849, + "learning_rate": 1.2441720290856694e-05, + "loss": 0.6132862567901611, + "step": 1043 + }, + { + "epoch": 1.3542579075425791, + "grad_norm": 0.5648192167282104, + "learning_rate": 1.2427851150557036e-05, + "loss": 0.6304311156272888, + "step": 1044 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.5195255279541016, + "learning_rate": 1.241397704591627e-05, + "loss": 0.5641679763793945, + "step": 1045 + }, + { + "epoch": 1.3568532035685321, + "grad_norm": 0.5658749341964722, + "learning_rate": 1.2400098005303436e-05, + "loss": 0.6409952044487, + "step": 1046 + }, + { + "epoch": 1.3581508515815086, + "grad_norm": 0.5088870525360107, + "learning_rate": 1.238621405709766e-05, + "loss": 0.5354233384132385, + "step": 1047 + }, + { + "epoch": 1.3594484995944849, + "grad_norm": 0.5734469890594482, + "learning_rate": 1.2372325229688093e-05, + "loss": 0.6188406944274902, + "step": 1048 + }, + { + "epoch": 1.3607461476074616, + "grad_norm": 0.5380412936210632, + "learning_rate": 1.235843155147388e-05, + "loss": 0.5657402873039246, + "step": 1049 + }, + { + "epoch": 1.3620437956204379, + "grad_norm": 0.5315279960632324, + "learning_rate": 1.2344533050864071e-05, + "loss": 0.5667376518249512, + "step": 1050 + }, + { + "epoch": 1.3633414436334144, + "grad_norm": 0.5081866979598999, + "learning_rate": 1.2330629756277588e-05, + "loss": 0.5432066917419434, + "step": 1051 + }, + { + "epoch": 1.3646390916463909, + "grad_norm": 0.5798763036727905, + "learning_rate": 1.2316721696143141e-05, + "loss": 0.6364309191703796, + "step": 1052 + }, + { + "epoch": 1.3659367396593673, + "grad_norm": 0.5289844870567322, + "learning_rate": 1.23028088988992e-05, + "loss": 0.5321639180183411, + "step": 1053 + }, + { + "epoch": 1.3672343876723438, + "grad_norm": 0.5852347612380981, + "learning_rate": 1.228889139299391e-05, + "loss": 0.6831628084182739, + "step": 1054 + }, + { + "epoch": 1.3685320356853203, + "grad_norm": 0.5265390872955322, + "learning_rate": 1.2274969206885048e-05, + "loss": 0.5725244283676147, + "step": 1055 + }, + { + "epoch": 1.3698296836982968, + "grad_norm": 0.6298306584358215, + "learning_rate": 1.2261042369039966e-05, + "loss": 0.6366633176803589, + "step": 1056 + }, + { + "epoch": 1.3711273317112733, + "grad_norm": 0.521314263343811, + "learning_rate": 1.2247110907935518e-05, + "loss": 0.5725533962249756, + "step": 1057 + }, + { + "epoch": 1.3724249797242498, + "grad_norm": 0.5249886512756348, + "learning_rate": 1.2233174852058015e-05, + "loss": 0.577233076095581, + "step": 1058 + }, + { + "epoch": 1.3737226277372263, + "grad_norm": 0.5558046102523804, + "learning_rate": 1.2219234229903163e-05, + "loss": 0.6044833660125732, + "step": 1059 + }, + { + "epoch": 1.3750202757502028, + "grad_norm": 0.5569727420806885, + "learning_rate": 1.2205289069976012e-05, + "loss": 0.5831769704818726, + "step": 1060 + }, + { + "epoch": 1.3763179237631793, + "grad_norm": 0.5547581911087036, + "learning_rate": 1.2191339400790881e-05, + "loss": 0.5798386335372925, + "step": 1061 + }, + { + "epoch": 1.3776155717761558, + "grad_norm": 0.5544263124465942, + "learning_rate": 1.2177385250871312e-05, + "loss": 0.607170581817627, + "step": 1062 + }, + { + "epoch": 1.378913219789132, + "grad_norm": 0.5475184321403503, + "learning_rate": 1.2163426648750009e-05, + "loss": 0.596827507019043, + "step": 1063 + }, + { + "epoch": 1.3802108678021088, + "grad_norm": 0.551906168460846, + "learning_rate": 1.2149463622968782e-05, + "loss": 0.5992593169212341, + "step": 1064 + }, + { + "epoch": 1.381508515815085, + "grad_norm": 0.5418475270271301, + "learning_rate": 1.2135496202078487e-05, + "loss": 0.5538514852523804, + "step": 1065 + }, + { + "epoch": 1.3828061638280618, + "grad_norm": 0.5357592105865479, + "learning_rate": 1.2121524414638958e-05, + "loss": 0.6014474630355835, + "step": 1066 + }, + { + "epoch": 1.384103811841038, + "grad_norm": 0.5673146843910217, + "learning_rate": 1.2107548289218968e-05, + "loss": 0.5835940837860107, + "step": 1067 + }, + { + "epoch": 1.3854014598540145, + "grad_norm": 0.5655810832977295, + "learning_rate": 1.2093567854396158e-05, + "loss": 0.6108807325363159, + "step": 1068 + }, + { + "epoch": 1.386699107866991, + "grad_norm": 0.5361012816429138, + "learning_rate": 1.2079583138756976e-05, + "loss": 0.6093813180923462, + "step": 1069 + }, + { + "epoch": 1.3879967558799675, + "grad_norm": 0.5419613122940063, + "learning_rate": 1.206559417089663e-05, + "loss": 0.6026707887649536, + "step": 1070 + }, + { + "epoch": 1.389294403892944, + "grad_norm": 0.5429274439811707, + "learning_rate": 1.205160097941901e-05, + "loss": 0.6365257501602173, + "step": 1071 + }, + { + "epoch": 1.3905920519059205, + "grad_norm": 0.5734850764274597, + "learning_rate": 1.2037603592936656e-05, + "loss": 0.6649122834205627, + "step": 1072 + }, + { + "epoch": 1.391889699918897, + "grad_norm": 0.5734902024269104, + "learning_rate": 1.2023602040070679e-05, + "loss": 0.7125487327575684, + "step": 1073 + }, + { + "epoch": 1.3931873479318735, + "grad_norm": 0.5633674263954163, + "learning_rate": 1.2009596349450717e-05, + "loss": 0.6474109292030334, + "step": 1074 + }, + { + "epoch": 1.39448499594485, + "grad_norm": 0.5378244519233704, + "learning_rate": 1.1995586549714855e-05, + "loss": 0.6136443614959717, + "step": 1075 + }, + { + "epoch": 1.3957826439578265, + "grad_norm": 0.558250904083252, + "learning_rate": 1.198157266950959e-05, + "loss": 0.6676377058029175, + "step": 1076 + }, + { + "epoch": 1.397080291970803, + "grad_norm": 0.5315516591072083, + "learning_rate": 1.1967554737489762e-05, + "loss": 0.607810378074646, + "step": 1077 + }, + { + "epoch": 1.3983779399837795, + "grad_norm": 0.5391795039176941, + "learning_rate": 1.1953532782318491e-05, + "loss": 0.5898000597953796, + "step": 1078 + }, + { + "epoch": 1.399675587996756, + "grad_norm": 0.5466244220733643, + "learning_rate": 1.1939506832667129e-05, + "loss": 0.5943995118141174, + "step": 1079 + }, + { + "epoch": 1.4009732360097322, + "grad_norm": 0.5457687973976135, + "learning_rate": 1.1925476917215191e-05, + "loss": 0.6089761257171631, + "step": 1080 + }, + { + "epoch": 1.402270884022709, + "grad_norm": 0.5727429389953613, + "learning_rate": 1.1911443064650301e-05, + "loss": 0.6369843482971191, + "step": 1081 + }, + { + "epoch": 1.4035685320356852, + "grad_norm": 0.5765259861946106, + "learning_rate": 1.189740530366814e-05, + "loss": 0.6176037788391113, + "step": 1082 + }, + { + "epoch": 1.404866180048662, + "grad_norm": 0.5793892741203308, + "learning_rate": 1.1883363662972375e-05, + "loss": 0.6147127747535706, + "step": 1083 + }, + { + "epoch": 1.4061638280616382, + "grad_norm": 0.5127638578414917, + "learning_rate": 1.1869318171274606e-05, + "loss": 0.5739990472793579, + "step": 1084 + }, + { + "epoch": 1.4074614760746147, + "grad_norm": 0.5451372861862183, + "learning_rate": 1.1855268857294308e-05, + "loss": 0.6005086898803711, + "step": 1085 + }, + { + "epoch": 1.4087591240875912, + "grad_norm": 0.5556860566139221, + "learning_rate": 1.1841215749758774e-05, + "loss": 0.6003910303115845, + "step": 1086 + }, + { + "epoch": 1.4100567721005677, + "grad_norm": 0.5883124470710754, + "learning_rate": 1.182715887740305e-05, + "loss": 0.6721568703651428, + "step": 1087 + }, + { + "epoch": 1.4113544201135442, + "grad_norm": 0.5330623388290405, + "learning_rate": 1.1813098268969886e-05, + "loss": 0.617790699005127, + "step": 1088 + }, + { + "epoch": 1.4126520681265207, + "grad_norm": 0.5409324169158936, + "learning_rate": 1.1799033953209664e-05, + "loss": 0.6154944896697998, + "step": 1089 + }, + { + "epoch": 1.4139497161394972, + "grad_norm": 0.5280669927597046, + "learning_rate": 1.178496595888035e-05, + "loss": 0.6064777970314026, + "step": 1090 + }, + { + "epoch": 1.4152473641524737, + "grad_norm": 0.5559468269348145, + "learning_rate": 1.1770894314747433e-05, + "loss": 0.6379706263542175, + "step": 1091 + }, + { + "epoch": 1.4165450121654501, + "grad_norm": 0.5678933262825012, + "learning_rate": 1.1756819049583861e-05, + "loss": 0.5879865288734436, + "step": 1092 + }, + { + "epoch": 1.4178426601784266, + "grad_norm": 0.5317026972770691, + "learning_rate": 1.1742740192169995e-05, + "loss": 0.6252385377883911, + "step": 1093 + }, + { + "epoch": 1.4191403081914031, + "grad_norm": 0.5503518581390381, + "learning_rate": 1.1728657771293529e-05, + "loss": 0.5956102013587952, + "step": 1094 + }, + { + "epoch": 1.4204379562043796, + "grad_norm": 0.5392619967460632, + "learning_rate": 1.171457181574945e-05, + "loss": 0.6110433340072632, + "step": 1095 + }, + { + "epoch": 1.4217356042173561, + "grad_norm": 0.554594099521637, + "learning_rate": 1.1700482354339972e-05, + "loss": 0.6505380272865295, + "step": 1096 + }, + { + "epoch": 1.4230332522303324, + "grad_norm": 0.5639646053314209, + "learning_rate": 1.168638941587448e-05, + "loss": 0.6052155494689941, + "step": 1097 + }, + { + "epoch": 1.424330900243309, + "grad_norm": 0.5569002032279968, + "learning_rate": 1.1672293029169466e-05, + "loss": 0.5856403112411499, + "step": 1098 + }, + { + "epoch": 1.4256285482562854, + "grad_norm": 0.5615402460098267, + "learning_rate": 1.165819322304847e-05, + "loss": 0.6077978610992432, + "step": 1099 + }, + { + "epoch": 1.426926196269262, + "grad_norm": 0.5535939931869507, + "learning_rate": 1.164409002634203e-05, + "loss": 0.6245694160461426, + "step": 1100 + }, + { + "epoch": 1.4282238442822384, + "grad_norm": 0.5362287759780884, + "learning_rate": 1.162998346788761e-05, + "loss": 0.6105297803878784, + "step": 1101 + }, + { + "epoch": 1.4295214922952149, + "grad_norm": 0.5390259027481079, + "learning_rate": 1.1615873576529556e-05, + "loss": 0.6066164970397949, + "step": 1102 + }, + { + "epoch": 1.4308191403081914, + "grad_norm": 0.5315901041030884, + "learning_rate": 1.1601760381119022e-05, + "loss": 0.5768907070159912, + "step": 1103 + }, + { + "epoch": 1.4321167883211678, + "grad_norm": 0.5727961659431458, + "learning_rate": 1.158764391051392e-05, + "loss": 0.6904894113540649, + "step": 1104 + }, + { + "epoch": 1.4334144363341443, + "grad_norm": 0.5435361862182617, + "learning_rate": 1.1573524193578863e-05, + "loss": 0.5838584899902344, + "step": 1105 + }, + { + "epoch": 1.4347120843471208, + "grad_norm": 0.5609909296035767, + "learning_rate": 1.1559401259185095e-05, + "loss": 0.6729065775871277, + "step": 1106 + }, + { + "epoch": 1.4360097323600973, + "grad_norm": 0.5284282565116882, + "learning_rate": 1.1545275136210441e-05, + "loss": 0.5950232744216919, + "step": 1107 + }, + { + "epoch": 1.4373073803730738, + "grad_norm": 0.603245735168457, + "learning_rate": 1.153114585353925e-05, + "loss": 0.6702573299407959, + "step": 1108 + }, + { + "epoch": 1.4386050283860503, + "grad_norm": 0.5415088534355164, + "learning_rate": 1.1517013440062326e-05, + "loss": 0.5716216564178467, + "step": 1109 + }, + { + "epoch": 1.4399026763990268, + "grad_norm": 0.4960046708583832, + "learning_rate": 1.1502877924676881e-05, + "loss": 0.5501525402069092, + "step": 1110 + }, + { + "epoch": 1.4412003244120033, + "grad_norm": 0.5444253087043762, + "learning_rate": 1.1488739336286467e-05, + "loss": 0.6333913207054138, + "step": 1111 + }, + { + "epoch": 1.4424979724249798, + "grad_norm": 0.5255866646766663, + "learning_rate": 1.1474597703800915e-05, + "loss": 0.6024140119552612, + "step": 1112 + }, + { + "epoch": 1.4437956204379563, + "grad_norm": 0.5488544702529907, + "learning_rate": 1.1460453056136285e-05, + "loss": 0.6334477663040161, + "step": 1113 + }, + { + "epoch": 1.4450932684509326, + "grad_norm": 0.5465590953826904, + "learning_rate": 1.14463054222148e-05, + "loss": 0.6596208810806274, + "step": 1114 + }, + { + "epoch": 1.4463909164639093, + "grad_norm": 0.5492766499519348, + "learning_rate": 1.1432154830964796e-05, + "loss": 0.6396174430847168, + "step": 1115 + }, + { + "epoch": 1.4476885644768855, + "grad_norm": 0.5476314425468445, + "learning_rate": 1.1418001311320649e-05, + "loss": 0.6056069135665894, + "step": 1116 + }, + { + "epoch": 1.4489862124898623, + "grad_norm": 0.5088196396827698, + "learning_rate": 1.1403844892222717e-05, + "loss": 0.5474177002906799, + "step": 1117 + }, + { + "epoch": 1.4502838605028385, + "grad_norm": 0.5697342753410339, + "learning_rate": 1.1389685602617302e-05, + "loss": 0.6007769107818604, + "step": 1118 + }, + { + "epoch": 1.451581508515815, + "grad_norm": 0.5281476974487305, + "learning_rate": 1.1375523471456564e-05, + "loss": 0.5913225412368774, + "step": 1119 + }, + { + "epoch": 1.4528791565287915, + "grad_norm": 0.5619297027587891, + "learning_rate": 1.1361358527698481e-05, + "loss": 0.611336350440979, + "step": 1120 + }, + { + "epoch": 1.454176804541768, + "grad_norm": 0.531401515007019, + "learning_rate": 1.134719080030677e-05, + "loss": 0.5786083936691284, + "step": 1121 + }, + { + "epoch": 1.4554744525547445, + "grad_norm": 0.5428561568260193, + "learning_rate": 1.1333020318250854e-05, + "loss": 0.6208731532096863, + "step": 1122 + }, + { + "epoch": 1.456772100567721, + "grad_norm": 0.5384306311607361, + "learning_rate": 1.131884711050578e-05, + "loss": 0.5843198895454407, + "step": 1123 + }, + { + "epoch": 1.4580697485806975, + "grad_norm": 0.5160107016563416, + "learning_rate": 1.1304671206052168e-05, + "loss": 0.5473004579544067, + "step": 1124 + }, + { + "epoch": 1.459367396593674, + "grad_norm": 0.5360195636749268, + "learning_rate": 1.1290492633876164e-05, + "loss": 0.626501202583313, + "step": 1125 + }, + { + "epoch": 1.4606650446066505, + "grad_norm": 0.5251026749610901, + "learning_rate": 1.1276311422969349e-05, + "loss": 0.5944849848747253, + "step": 1126 + }, + { + "epoch": 1.461962692619627, + "grad_norm": 0.564008355140686, + "learning_rate": 1.1262127602328712e-05, + "loss": 0.6147276163101196, + "step": 1127 + }, + { + "epoch": 1.4632603406326035, + "grad_norm": 0.5388748645782471, + "learning_rate": 1.124794120095658e-05, + "loss": 0.5849318504333496, + "step": 1128 + }, + { + "epoch": 1.46455798864558, + "grad_norm": 0.5595386624336243, + "learning_rate": 1.1233752247860549e-05, + "loss": 0.6283015012741089, + "step": 1129 + }, + { + "epoch": 1.4658556366585564, + "grad_norm": 0.5528329014778137, + "learning_rate": 1.1219560772053442e-05, + "loss": 0.6135470867156982, + "step": 1130 + }, + { + "epoch": 1.4671532846715327, + "grad_norm": 0.5480870008468628, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.579879879951477, + "step": 1131 + }, + { + "epoch": 1.4684509326845094, + "grad_norm": 0.6012369990348816, + "learning_rate": 1.1191170368382992e-05, + "loss": 0.67568039894104, + "step": 1132 + }, + { + "epoch": 1.4697485806974857, + "grad_norm": 0.5386692881584167, + "learning_rate": 1.117697149857084e-05, + "loss": 0.6155050992965698, + "step": 1133 + }, + { + "epoch": 1.4710462287104624, + "grad_norm": 0.540510892868042, + "learning_rate": 1.1162770222149873e-05, + "loss": 0.6193840503692627, + "step": 1134 + }, + { + "epoch": 1.4723438767234387, + "grad_norm": 0.5231954455375671, + "learning_rate": 1.1148566568158099e-05, + "loss": 0.5806912183761597, + "step": 1135 + }, + { + "epoch": 1.4736415247364152, + "grad_norm": 0.5371982455253601, + "learning_rate": 1.1134360565638402e-05, + "loss": 0.6294920444488525, + "step": 1136 + }, + { + "epoch": 1.4749391727493917, + "grad_norm": 0.5294065475463867, + "learning_rate": 1.1120152243638457e-05, + "loss": 0.6405944228172302, + "step": 1137 + }, + { + "epoch": 1.4762368207623682, + "grad_norm": 0.5396026372909546, + "learning_rate": 1.1105941631210694e-05, + "loss": 0.622348427772522, + "step": 1138 + }, + { + "epoch": 1.4775344687753447, + "grad_norm": 0.5184268951416016, + "learning_rate": 1.1091728757412212e-05, + "loss": 0.5783290863037109, + "step": 1139 + }, + { + "epoch": 1.4788321167883212, + "grad_norm": 0.5296680331230164, + "learning_rate": 1.107751365130474e-05, + "loss": 0.5765876770019531, + "step": 1140 + }, + { + "epoch": 1.4801297648012977, + "grad_norm": 0.5528906583786011, + "learning_rate": 1.1063296341954577e-05, + "loss": 0.5958802700042725, + "step": 1141 + }, + { + "epoch": 1.4814274128142741, + "grad_norm": 0.549384355545044, + "learning_rate": 1.1049076858432517e-05, + "loss": 0.6524186730384827, + "step": 1142 + }, + { + "epoch": 1.4827250608272506, + "grad_norm": 0.5553792119026184, + "learning_rate": 1.1034855229813812e-05, + "loss": 0.63478684425354, + "step": 1143 + }, + { + "epoch": 1.4840227088402271, + "grad_norm": 0.5639452934265137, + "learning_rate": 1.1020631485178084e-05, + "loss": 0.6482947468757629, + "step": 1144 + }, + { + "epoch": 1.4853203568532036, + "grad_norm": 0.5332263708114624, + "learning_rate": 1.1006405653609295e-05, + "loss": 0.6563082337379456, + "step": 1145 + }, + { + "epoch": 1.4866180048661801, + "grad_norm": 0.5505067110061646, + "learning_rate": 1.0992177764195671e-05, + "loss": 0.6217901706695557, + "step": 1146 + }, + { + "epoch": 1.4879156528791566, + "grad_norm": 0.5751034021377563, + "learning_rate": 1.0977947846029642e-05, + "loss": 0.618269681930542, + "step": 1147 + }, + { + "epoch": 1.4892133008921329, + "grad_norm": 0.5259911417961121, + "learning_rate": 1.0963715928207795e-05, + "loss": 0.5809241533279419, + "step": 1148 + }, + { + "epoch": 1.4905109489051096, + "grad_norm": 0.5405173301696777, + "learning_rate": 1.094948203983079e-05, + "loss": 0.6440936923027039, + "step": 1149 + }, + { + "epoch": 1.4918085969180859, + "grad_norm": 0.5359426736831665, + "learning_rate": 1.0935246210003334e-05, + "loss": 0.5997065305709839, + "step": 1150 + }, + { + "epoch": 1.4918085969180859, + "eval_loss": 0.6832194328308105, + "eval_runtime": 72.4893, + "eval_samples_per_second": 71.624, + "eval_steps_per_second": 8.953, + "step": 1150 + }, + { + "epoch": 1.4931062449310626, + "grad_norm": 0.545395016670227, + "learning_rate": 1.0921008467834094e-05, + "loss": 0.6377010345458984, + "step": 1151 + }, + { + "epoch": 1.4944038929440389, + "grad_norm": 0.553674578666687, + "learning_rate": 1.0906768842435647e-05, + "loss": 0.6331782937049866, + "step": 1152 + }, + { + "epoch": 1.4957015409570154, + "grad_norm": 0.5127398371696472, + "learning_rate": 1.0892527362924426e-05, + "loss": 0.5681911110877991, + "step": 1153 + }, + { + "epoch": 1.4969991889699918, + "grad_norm": 0.5308411717414856, + "learning_rate": 1.0878284058420647e-05, + "loss": 0.6325392127037048, + "step": 1154 + }, + { + "epoch": 1.4982968369829683, + "grad_norm": 0.5330897569656372, + "learning_rate": 1.0864038958048267e-05, + "loss": 0.5603891611099243, + "step": 1155 + }, + { + "epoch": 1.4995944849959448, + "grad_norm": 0.5287606716156006, + "learning_rate": 1.084979209093491e-05, + "loss": 0.5920351147651672, + "step": 1156 + }, + { + "epoch": 1.5008921330089213, + "grad_norm": 0.5484432578086853, + "learning_rate": 1.0835543486211815e-05, + "loss": 0.6529064178466797, + "step": 1157 + }, + { + "epoch": 1.5021897810218978, + "grad_norm": 0.5554434061050415, + "learning_rate": 1.0821293173013769e-05, + "loss": 0.6203141212463379, + "step": 1158 + }, + { + "epoch": 1.5034874290348743, + "grad_norm": 0.4985191226005554, + "learning_rate": 1.0807041180479054e-05, + "loss": 0.5167315006256104, + "step": 1159 + }, + { + "epoch": 1.5047850770478508, + "grad_norm": 0.5687364339828491, + "learning_rate": 1.0792787537749392e-05, + "loss": 0.6727509498596191, + "step": 1160 + }, + { + "epoch": 1.5060827250608273, + "grad_norm": 0.5391871333122253, + "learning_rate": 1.0778532273969877e-05, + "loss": 0.5891563892364502, + "step": 1161 + }, + { + "epoch": 1.5073803730738038, + "grad_norm": 0.5688561201095581, + "learning_rate": 1.0764275418288908e-05, + "loss": 0.6336361169815063, + "step": 1162 + }, + { + "epoch": 1.50867802108678, + "grad_norm": 0.5307201743125916, + "learning_rate": 1.0750016999858151e-05, + "loss": 0.6088765263557434, + "step": 1163 + }, + { + "epoch": 1.5099756690997568, + "grad_norm": 0.5417827367782593, + "learning_rate": 1.0735757047832461e-05, + "loss": 0.6234108209609985, + "step": 1164 + }, + { + "epoch": 1.511273317112733, + "grad_norm": 0.5165390968322754, + "learning_rate": 1.0721495591369832e-05, + "loss": 0.5378797054290771, + "step": 1165 + }, + { + "epoch": 1.5125709651257098, + "grad_norm": 0.5508493781089783, + "learning_rate": 1.0707232659631333e-05, + "loss": 0.6575205326080322, + "step": 1166 + }, + { + "epoch": 1.513868613138686, + "grad_norm": 0.5701325535774231, + "learning_rate": 1.0692968281781046e-05, + "loss": 0.5776763558387756, + "step": 1167 + }, + { + "epoch": 1.5151662611516628, + "grad_norm": 0.5180992484092712, + "learning_rate": 1.0678702486986016e-05, + "loss": 0.5627498626708984, + "step": 1168 + }, + { + "epoch": 1.516463909164639, + "grad_norm": 0.5465271472930908, + "learning_rate": 1.0664435304416185e-05, + "loss": 0.5880453586578369, + "step": 1169 + }, + { + "epoch": 1.5177615571776155, + "grad_norm": 0.5629556775093079, + "learning_rate": 1.065016676324433e-05, + "loss": 0.6594117879867554, + "step": 1170 + }, + { + "epoch": 1.519059205190592, + "grad_norm": 0.5278184413909912, + "learning_rate": 1.0635896892645998e-05, + "loss": 0.5453213453292847, + "step": 1171 + }, + { + "epoch": 1.5203568532035685, + "grad_norm": 0.5409108400344849, + "learning_rate": 1.0621625721799473e-05, + "loss": 0.6020928025245667, + "step": 1172 + }, + { + "epoch": 1.521654501216545, + "grad_norm": 0.5297386050224304, + "learning_rate": 1.0607353279885682e-05, + "loss": 0.581575870513916, + "step": 1173 + }, + { + "epoch": 1.5229521492295215, + "grad_norm": 0.5326167345046997, + "learning_rate": 1.0593079596088155e-05, + "loss": 0.5731886029243469, + "step": 1174 + }, + { + "epoch": 1.524249797242498, + "grad_norm": 0.5496317148208618, + "learning_rate": 1.0578804699592968e-05, + "loss": 0.6127786636352539, + "step": 1175 + }, + { + "epoch": 1.5255474452554745, + "grad_norm": 0.5222692489624023, + "learning_rate": 1.0564528619588668e-05, + "loss": 0.5508180856704712, + "step": 1176 + }, + { + "epoch": 1.526845093268451, + "grad_norm": 0.5078931450843811, + "learning_rate": 1.0550251385266223e-05, + "loss": 0.590618908405304, + "step": 1177 + }, + { + "epoch": 1.5281427412814275, + "grad_norm": 0.545173704624176, + "learning_rate": 1.0535973025818969e-05, + "loss": 0.5988805294036865, + "step": 1178 + }, + { + "epoch": 1.529440389294404, + "grad_norm": 0.5643585920333862, + "learning_rate": 1.0521693570442533e-05, + "loss": 0.6470606327056885, + "step": 1179 + }, + { + "epoch": 1.5307380373073802, + "grad_norm": 0.5382372140884399, + "learning_rate": 1.050741304833479e-05, + "loss": 0.6253216862678528, + "step": 1180 + }, + { + "epoch": 1.532035685320357, + "grad_norm": 0.527792751789093, + "learning_rate": 1.0493131488695789e-05, + "loss": 0.5740289092063904, + "step": 1181 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.5286063551902771, + "learning_rate": 1.0478848920727707e-05, + "loss": 0.5898089408874512, + "step": 1182 + }, + { + "epoch": 1.53463098134631, + "grad_norm": 0.5210081338882446, + "learning_rate": 1.0464565373634784e-05, + "loss": 0.5460256338119507, + "step": 1183 + }, + { + "epoch": 1.5359286293592862, + "grad_norm": 0.542233943939209, + "learning_rate": 1.0450280876623253e-05, + "loss": 0.6149614453315735, + "step": 1184 + }, + { + "epoch": 1.537226277372263, + "grad_norm": 0.5287345051765442, + "learning_rate": 1.0435995458901298e-05, + "loss": 0.5987131595611572, + "step": 1185 + }, + { + "epoch": 1.5385239253852392, + "grad_norm": 0.542398989200592, + "learning_rate": 1.042170914967898e-05, + "loss": 0.5659464001655579, + "step": 1186 + }, + { + "epoch": 1.5398215733982157, + "grad_norm": 0.5581417679786682, + "learning_rate": 1.0407421978168186e-05, + "loss": 0.648675262928009, + "step": 1187 + }, + { + "epoch": 1.5411192214111922, + "grad_norm": 0.542323112487793, + "learning_rate": 1.0393133973582572e-05, + "loss": 0.6466338634490967, + "step": 1188 + }, + { + "epoch": 1.5424168694241687, + "grad_norm": 0.5204232335090637, + "learning_rate": 1.0378845165137483e-05, + "loss": 0.5785092115402222, + "step": 1189 + }, + { + "epoch": 1.5437145174371452, + "grad_norm": 0.5261425375938416, + "learning_rate": 1.0364555582049917e-05, + "loss": 0.6130785346031189, + "step": 1190 + }, + { + "epoch": 1.5450121654501217, + "grad_norm": 0.5651884078979492, + "learning_rate": 1.0350265253538458e-05, + "loss": 0.6042903661727905, + "step": 1191 + }, + { + "epoch": 1.5463098134630981, + "grad_norm": 0.5569320917129517, + "learning_rate": 1.033597420882321e-05, + "loss": 0.6515809297561646, + "step": 1192 + }, + { + "epoch": 1.5476074614760746, + "grad_norm": 0.5539842844009399, + "learning_rate": 1.0321682477125743e-05, + "loss": 0.6051802039146423, + "step": 1193 + }, + { + "epoch": 1.5489051094890511, + "grad_norm": 0.5327019691467285, + "learning_rate": 1.0307390087669026e-05, + "loss": 0.5866248607635498, + "step": 1194 + }, + { + "epoch": 1.5502027575020276, + "grad_norm": 0.5504518151283264, + "learning_rate": 1.0293097069677382e-05, + "loss": 0.6087076663970947, + "step": 1195 + }, + { + "epoch": 1.5515004055150041, + "grad_norm": 0.5322021842002869, + "learning_rate": 1.0278803452376416e-05, + "loss": 0.5527307391166687, + "step": 1196 + }, + { + "epoch": 1.5527980535279804, + "grad_norm": 0.5314878821372986, + "learning_rate": 1.0264509264992954e-05, + "loss": 0.623512327671051, + "step": 1197 + }, + { + "epoch": 1.554095701540957, + "grad_norm": 0.5596524477005005, + "learning_rate": 1.0250214536754996e-05, + "loss": 0.6276538372039795, + "step": 1198 + }, + { + "epoch": 1.5553933495539334, + "grad_norm": 0.5265888571739197, + "learning_rate": 1.0235919296891641e-05, + "loss": 0.5611189603805542, + "step": 1199 + }, + { + "epoch": 1.55669099756691, + "grad_norm": 0.5899763107299805, + "learning_rate": 1.0221623574633035e-05, + "loss": 0.6541014909744263, + "step": 1200 + }, + { + "epoch": 1.5579886455798864, + "grad_norm": 0.545138955116272, + "learning_rate": 1.0207327399210311e-05, + "loss": 0.5935692191123962, + "step": 1201 + }, + { + "epoch": 1.559286293592863, + "grad_norm": 0.5380452871322632, + "learning_rate": 1.0193030799855534e-05, + "loss": 0.5741644501686096, + "step": 1202 + }, + { + "epoch": 1.5605839416058394, + "grad_norm": 0.5540161728858948, + "learning_rate": 1.0178733805801626e-05, + "loss": 0.625443696975708, + "step": 1203 + }, + { + "epoch": 1.5618815896188158, + "grad_norm": 0.5784110426902771, + "learning_rate": 1.0164436446282324e-05, + "loss": 0.6342917680740356, + "step": 1204 + }, + { + "epoch": 1.5631792376317923, + "grad_norm": 0.5346982479095459, + "learning_rate": 1.015013875053211e-05, + "loss": 0.5571820735931396, + "step": 1205 + }, + { + "epoch": 1.5644768856447688, + "grad_norm": 0.5152148008346558, + "learning_rate": 1.013584074778615e-05, + "loss": 0.5197643041610718, + "step": 1206 + }, + { + "epoch": 1.5657745336577453, + "grad_norm": 0.5702791213989258, + "learning_rate": 1.0121542467280245e-05, + "loss": 0.6099081635475159, + "step": 1207 + }, + { + "epoch": 1.5670721816707218, + "grad_norm": 0.5424299836158752, + "learning_rate": 1.0107243938250755e-05, + "loss": 0.5385927557945251, + "step": 1208 + }, + { + "epoch": 1.5683698296836983, + "grad_norm": 0.5413081049919128, + "learning_rate": 1.0092945189934558e-05, + "loss": 0.6308001279830933, + "step": 1209 + }, + { + "epoch": 1.5696674776966748, + "grad_norm": 0.5650938749313354, + "learning_rate": 1.007864625156897e-05, + "loss": 0.656417965888977, + "step": 1210 + }, + { + "epoch": 1.5709651257096513, + "grad_norm": 0.5578048229217529, + "learning_rate": 1.0064347152391703e-05, + "loss": 0.5987565517425537, + "step": 1211 + }, + { + "epoch": 1.5722627737226276, + "grad_norm": 0.5425694584846497, + "learning_rate": 1.0050047921640797e-05, + "loss": 0.5794038772583008, + "step": 1212 + }, + { + "epoch": 1.5735604217356043, + "grad_norm": 0.5536248087882996, + "learning_rate": 1.003574858855456e-05, + "loss": 0.6126576066017151, + "step": 1213 + }, + { + "epoch": 1.5748580697485806, + "grad_norm": 0.5221614837646484, + "learning_rate": 1.0021449182371504e-05, + "loss": 0.5808907747268677, + "step": 1214 + }, + { + "epoch": 1.5761557177615573, + "grad_norm": 0.5314812660217285, + "learning_rate": 1.0007149732330299e-05, + "loss": 0.5740360021591187, + "step": 1215 + }, + { + "epoch": 1.5774533657745335, + "grad_norm": 0.556327223777771, + "learning_rate": 9.992850267669703e-06, + "loss": 0.6449018716812134, + "step": 1216 + }, + { + "epoch": 1.5787510137875103, + "grad_norm": 0.5447148680686951, + "learning_rate": 9.978550817628501e-06, + "loss": 0.5590343475341797, + "step": 1217 + }, + { + "epoch": 1.5800486618004865, + "grad_norm": 0.5570490956306458, + "learning_rate": 9.964251411445444e-06, + "loss": 0.6283855438232422, + "step": 1218 + }, + { + "epoch": 1.5813463098134632, + "grad_norm": 0.5475562214851379, + "learning_rate": 9.949952078359208e-06, + "loss": 0.6058873534202576, + "step": 1219 + }, + { + "epoch": 1.5826439578264395, + "grad_norm": 0.5271614789962769, + "learning_rate": 9.935652847608302e-06, + "loss": 0.6080070734024048, + "step": 1220 + }, + { + "epoch": 1.583941605839416, + "grad_norm": 0.5340768098831177, + "learning_rate": 9.921353748431036e-06, + "loss": 0.5789950489997864, + "step": 1221 + }, + { + "epoch": 1.5852392538523925, + "grad_norm": 0.5284969806671143, + "learning_rate": 9.907054810065446e-06, + "loss": 0.5514812469482422, + "step": 1222 + }, + { + "epoch": 1.586536901865369, + "grad_norm": 0.5400740504264832, + "learning_rate": 9.89275606174925e-06, + "loss": 0.5774392485618591, + "step": 1223 + }, + { + "epoch": 1.5878345498783455, + "grad_norm": 0.5264250040054321, + "learning_rate": 9.878457532719757e-06, + "loss": 0.5731384754180908, + "step": 1224 + }, + { + "epoch": 1.589132197891322, + "grad_norm": 0.5703708529472351, + "learning_rate": 9.864159252213852e-06, + "loss": 0.6473686695098877, + "step": 1225 + }, + { + "epoch": 1.5904298459042985, + "grad_norm": 0.5441808104515076, + "learning_rate": 9.849861249467893e-06, + "loss": 0.6381841897964478, + "step": 1226 + }, + { + "epoch": 1.591727493917275, + "grad_norm": 0.5486851930618286, + "learning_rate": 9.83556355371768e-06, + "loss": 0.613477349281311, + "step": 1227 + }, + { + "epoch": 1.5930251419302515, + "grad_norm": 0.5925759673118591, + "learning_rate": 9.821266194198375e-06, + "loss": 0.5966989994049072, + "step": 1228 + }, + { + "epoch": 1.5943227899432277, + "grad_norm": 0.503745436668396, + "learning_rate": 9.806969200144471e-06, + "loss": 0.5462368726730347, + "step": 1229 + }, + { + "epoch": 1.5956204379562045, + "grad_norm": 0.525786817073822, + "learning_rate": 9.79267260078969e-06, + "loss": 0.5990958213806152, + "step": 1230 + }, + { + "epoch": 1.5969180859691807, + "grad_norm": 0.5402313470840454, + "learning_rate": 9.778376425366967e-06, + "loss": 0.6069964170455933, + "step": 1231 + }, + { + "epoch": 1.5982157339821574, + "grad_norm": 0.566880464553833, + "learning_rate": 9.764080703108362e-06, + "loss": 0.6295340061187744, + "step": 1232 + }, + { + "epoch": 1.5995133819951337, + "grad_norm": 0.5545258522033691, + "learning_rate": 9.749785463245006e-06, + "loss": 0.6260232925415039, + "step": 1233 + }, + { + "epoch": 1.6008110300081104, + "grad_norm": 0.5898419618606567, + "learning_rate": 9.735490735007047e-06, + "loss": 0.6146451830863953, + "step": 1234 + }, + { + "epoch": 1.6021086780210867, + "grad_norm": 0.5249006748199463, + "learning_rate": 9.721196547623585e-06, + "loss": 0.6049670577049255, + "step": 1235 + }, + { + "epoch": 1.6034063260340634, + "grad_norm": 0.5289062857627869, + "learning_rate": 9.706902930322621e-06, + "loss": 0.6006771326065063, + "step": 1236 + }, + { + "epoch": 1.6047039740470397, + "grad_norm": 0.5482916235923767, + "learning_rate": 9.692609912330975e-06, + "loss": 0.621732771396637, + "step": 1237 + }, + { + "epoch": 1.6060016220600162, + "grad_norm": 0.5499362945556641, + "learning_rate": 9.67831752287426e-06, + "loss": 0.6316919922828674, + "step": 1238 + }, + { + "epoch": 1.6072992700729927, + "grad_norm": 0.5119637250900269, + "learning_rate": 9.66402579117679e-06, + "loss": 0.5918980240821838, + "step": 1239 + }, + { + "epoch": 1.6085969180859692, + "grad_norm": 0.5473806262016296, + "learning_rate": 9.649734746461544e-06, + "loss": 0.6354460716247559, + "step": 1240 + }, + { + "epoch": 1.6098945660989457, + "grad_norm": 0.5340628027915955, + "learning_rate": 9.635444417950083e-06, + "loss": 0.5693660378456116, + "step": 1241 + }, + { + "epoch": 1.6111922141119221, + "grad_norm": 0.5385611653327942, + "learning_rate": 9.62115483486252e-06, + "loss": 0.5467959642410278, + "step": 1242 + }, + { + "epoch": 1.6124898621248986, + "grad_norm": 0.5278156399726868, + "learning_rate": 9.606866026417431e-06, + "loss": 0.6024355888366699, + "step": 1243 + }, + { + "epoch": 1.6137875101378751, + "grad_norm": 0.5506213903427124, + "learning_rate": 9.592578021831817e-06, + "loss": 0.6594349145889282, + "step": 1244 + }, + { + "epoch": 1.6150851581508516, + "grad_norm": 0.5613592267036438, + "learning_rate": 9.578290850321023e-06, + "loss": 0.6147022247314453, + "step": 1245 + }, + { + "epoch": 1.616382806163828, + "grad_norm": 0.5302473306655884, + "learning_rate": 9.564004541098709e-06, + "loss": 0.5724552869796753, + "step": 1246 + }, + { + "epoch": 1.6176804541768046, + "grad_norm": 0.5463687777519226, + "learning_rate": 9.549719123376749e-06, + "loss": 0.6859567165374756, + "step": 1247 + }, + { + "epoch": 1.6189781021897809, + "grad_norm": 0.578063428401947, + "learning_rate": 9.535434626365221e-06, + "loss": 0.654534101486206, + "step": 1248 + }, + { + "epoch": 1.6202757502027576, + "grad_norm": 0.5842363238334656, + "learning_rate": 9.521151079272295e-06, + "loss": 0.6818944811820984, + "step": 1249 + }, + { + "epoch": 1.6215733982157339, + "grad_norm": 0.5462816953659058, + "learning_rate": 9.506868511304216e-06, + "loss": 0.5978901386260986, + "step": 1250 + }, + { + "epoch": 1.6228710462287106, + "grad_norm": 0.5496495962142944, + "learning_rate": 9.492586951665214e-06, + "loss": 0.6664569973945618, + "step": 1251 + }, + { + "epoch": 1.6241686942416869, + "grad_norm": 0.541262149810791, + "learning_rate": 9.47830642955747e-06, + "loss": 0.5771492719650269, + "step": 1252 + }, + { + "epoch": 1.6254663422546636, + "grad_norm": 0.5542916655540466, + "learning_rate": 9.464026974181035e-06, + "loss": 0.6377862095832825, + "step": 1253 + }, + { + "epoch": 1.6267639902676398, + "grad_norm": 0.5212349891662598, + "learning_rate": 9.44974861473378e-06, + "loss": 0.5878604650497437, + "step": 1254 + }, + { + "epoch": 1.6280616382806163, + "grad_norm": 0.5611302256584167, + "learning_rate": 9.435471380411335e-06, + "loss": 0.636326789855957, + "step": 1255 + }, + { + "epoch": 1.6293592862935928, + "grad_norm": 0.5258191227912903, + "learning_rate": 9.421195300407035e-06, + "loss": 0.5580926537513733, + "step": 1256 + }, + { + "epoch": 1.6306569343065693, + "grad_norm": 0.5298276543617249, + "learning_rate": 9.406920403911848e-06, + "loss": 0.6048216819763184, + "step": 1257 + }, + { + "epoch": 1.6319545823195458, + "grad_norm": 0.5328834056854248, + "learning_rate": 9.392646720114325e-06, + "loss": 0.6379623413085938, + "step": 1258 + }, + { + "epoch": 1.6332522303325223, + "grad_norm": 0.5315790176391602, + "learning_rate": 9.37837427820053e-06, + "loss": 0.6466155052185059, + "step": 1259 + }, + { + "epoch": 1.6345498783454988, + "grad_norm": 0.5353376269340515, + "learning_rate": 9.364103107354002e-06, + "loss": 0.5879526138305664, + "step": 1260 + }, + { + "epoch": 1.6358475263584753, + "grad_norm": 0.5551068186759949, + "learning_rate": 9.349833236755675e-06, + "loss": 0.5988892316818237, + "step": 1261 + }, + { + "epoch": 1.6371451743714518, + "grad_norm": 0.5331724286079407, + "learning_rate": 9.335564695583816e-06, + "loss": 0.5948902368545532, + "step": 1262 + }, + { + "epoch": 1.638442822384428, + "grad_norm": 0.54310542345047, + "learning_rate": 9.321297513013987e-06, + "loss": 0.6055219769477844, + "step": 1263 + }, + { + "epoch": 1.6397404703974048, + "grad_norm": 0.5368586182594299, + "learning_rate": 9.307031718218956e-06, + "loss": 0.6035459637641907, + "step": 1264 + }, + { + "epoch": 1.641038118410381, + "grad_norm": 0.5460159182548523, + "learning_rate": 9.292767340368672e-06, + "loss": 0.6447773575782776, + "step": 1265 + }, + { + "epoch": 1.6423357664233578, + "grad_norm": 0.5599712133407593, + "learning_rate": 9.278504408630171e-06, + "loss": 0.6332420110702515, + "step": 1266 + }, + { + "epoch": 1.643633414436334, + "grad_norm": 0.5388185977935791, + "learning_rate": 9.264242952167544e-06, + "loss": 0.6116797924041748, + "step": 1267 + }, + { + "epoch": 1.6449310624493108, + "grad_norm": 0.5109002590179443, + "learning_rate": 9.24998300014185e-06, + "loss": 0.628926694393158, + "step": 1268 + }, + { + "epoch": 1.646228710462287, + "grad_norm": 0.5572671890258789, + "learning_rate": 9.235724581711096e-06, + "loss": 0.5795090794563293, + "step": 1269 + }, + { + "epoch": 1.6475263584752637, + "grad_norm": 0.777040421962738, + "learning_rate": 9.221467726030126e-06, + "loss": 0.644891083240509, + "step": 1270 + }, + { + "epoch": 1.64882400648824, + "grad_norm": 0.5158191919326782, + "learning_rate": 9.207212462250611e-06, + "loss": 0.5630925893783569, + "step": 1271 + }, + { + "epoch": 1.6501216545012165, + "grad_norm": 0.5111160278320312, + "learning_rate": 9.192958819520948e-06, + "loss": 0.5322938561439514, + "step": 1272 + }, + { + "epoch": 1.651419302514193, + "grad_norm": 0.5043333768844604, + "learning_rate": 9.178706826986236e-06, + "loss": 0.5961562395095825, + "step": 1273 + }, + { + "epoch": 1.6527169505271695, + "grad_norm": 0.5496838092803955, + "learning_rate": 9.164456513788186e-06, + "loss": 0.6005456447601318, + "step": 1274 + }, + { + "epoch": 1.654014598540146, + "grad_norm": 0.5577642321586609, + "learning_rate": 9.150207909065093e-06, + "loss": 0.6366305351257324, + "step": 1275 + }, + { + "epoch": 1.6553122465531225, + "grad_norm": 0.5257747769355774, + "learning_rate": 9.135961041951735e-06, + "loss": 0.5669390559196472, + "step": 1276 + }, + { + "epoch": 1.656609894566099, + "grad_norm": 0.5349394083023071, + "learning_rate": 9.121715941579358e-06, + "loss": 0.5594930052757263, + "step": 1277 + }, + { + "epoch": 1.6579075425790755, + "grad_norm": 0.5282658338546753, + "learning_rate": 9.107472637075578e-06, + "loss": 0.6159694194793701, + "step": 1278 + }, + { + "epoch": 1.659205190592052, + "grad_norm": 0.5608229637145996, + "learning_rate": 9.093231157564357e-06, + "loss": 0.6022686958312988, + "step": 1279 + }, + { + "epoch": 1.6605028386050282, + "grad_norm": 0.5175761580467224, + "learning_rate": 9.078991532165911e-06, + "loss": 0.5850685834884644, + "step": 1280 + }, + { + "epoch": 1.661800486618005, + "grad_norm": 0.5338742733001709, + "learning_rate": 9.06475378999667e-06, + "loss": 0.5943388938903809, + "step": 1281 + }, + { + "epoch": 1.6630981346309812, + "grad_norm": 0.5751469135284424, + "learning_rate": 9.050517960169211e-06, + "loss": 0.6381434798240662, + "step": 1282 + }, + { + "epoch": 1.664395782643958, + "grad_norm": 0.5597715377807617, + "learning_rate": 9.036284071792212e-06, + "loss": 0.6742138862609863, + "step": 1283 + }, + { + "epoch": 1.6656934306569342, + "grad_norm": 0.5457910895347595, + "learning_rate": 9.022052153970361e-06, + "loss": 0.6068155169487, + "step": 1284 + }, + { + "epoch": 1.666991078669911, + "grad_norm": 0.5507814884185791, + "learning_rate": 9.007822235804334e-06, + "loss": 0.6176409125328064, + "step": 1285 + }, + { + "epoch": 1.6682887266828872, + "grad_norm": 0.5373377203941345, + "learning_rate": 8.993594346390709e-06, + "loss": 0.5884984731674194, + "step": 1286 + }, + { + "epoch": 1.669586374695864, + "grad_norm": 0.523912787437439, + "learning_rate": 8.979368514821917e-06, + "loss": 0.5794025659561157, + "step": 1287 + }, + { + "epoch": 1.6708840227088402, + "grad_norm": 0.5313317179679871, + "learning_rate": 8.965144770186192e-06, + "loss": 0.6304433345794678, + "step": 1288 + }, + { + "epoch": 1.6721816707218167, + "grad_norm": 0.5308225154876709, + "learning_rate": 8.950923141567482e-06, + "loss": 0.5822694301605225, + "step": 1289 + }, + { + "epoch": 1.6734793187347932, + "grad_norm": 0.5657337307929993, + "learning_rate": 8.936703658045426e-06, + "loss": 0.7206499576568604, + "step": 1290 + }, + { + "epoch": 1.6747769667477697, + "grad_norm": 0.5842191576957703, + "learning_rate": 8.92248634869526e-06, + "loss": 0.6483322381973267, + "step": 1291 + }, + { + "epoch": 1.6760746147607462, + "grad_norm": 0.5084115266799927, + "learning_rate": 8.90827124258779e-06, + "loss": 0.60451340675354, + "step": 1292 + }, + { + "epoch": 1.6773722627737226, + "grad_norm": 0.5080921053886414, + "learning_rate": 8.894058368789308e-06, + "loss": 0.5007386803627014, + "step": 1293 + }, + { + "epoch": 1.6786699107866991, + "grad_norm": 0.5186359286308289, + "learning_rate": 8.879847756361544e-06, + "loss": 0.5846607685089111, + "step": 1294 + }, + { + "epoch": 1.6799675587996756, + "grad_norm": 0.5321721434593201, + "learning_rate": 8.8656394343616e-06, + "loss": 0.5854955315589905, + "step": 1295 + }, + { + "epoch": 1.6812652068126521, + "grad_norm": 0.5577939748764038, + "learning_rate": 8.851433431841904e-06, + "loss": 0.6218785643577576, + "step": 1296 + }, + { + "epoch": 1.6825628548256284, + "grad_norm": 0.5574389696121216, + "learning_rate": 8.837229777850129e-06, + "loss": 0.639427661895752, + "step": 1297 + }, + { + "epoch": 1.683860502838605, + "grad_norm": 0.5620577335357666, + "learning_rate": 8.823028501429161e-06, + "loss": 0.6334304809570312, + "step": 1298 + }, + { + "epoch": 1.6851581508515814, + "grad_norm": 0.5603854656219482, + "learning_rate": 8.808829631617009e-06, + "loss": 0.5796216726303101, + "step": 1299 + }, + { + "epoch": 1.686455798864558, + "grad_norm": 0.5886275172233582, + "learning_rate": 8.79463319744677e-06, + "loss": 0.6645929217338562, + "step": 1300 + }, + { + "epoch": 1.6877534468775344, + "grad_norm": 0.5587744116783142, + "learning_rate": 8.78043922794656e-06, + "loss": 0.6387877464294434, + "step": 1301 + }, + { + "epoch": 1.689051094890511, + "grad_norm": 0.5619886517524719, + "learning_rate": 8.766247752139453e-06, + "loss": 0.658257007598877, + "step": 1302 + }, + { + "epoch": 1.6903487429034874, + "grad_norm": 0.5658282041549683, + "learning_rate": 8.752058799043422e-06, + "loss": 0.6349663734436035, + "step": 1303 + }, + { + "epoch": 1.691646390916464, + "grad_norm": 0.5596343874931335, + "learning_rate": 8.737872397671293e-06, + "loss": 0.5926494002342224, + "step": 1304 + }, + { + "epoch": 1.6929440389294403, + "grad_norm": 0.5565075874328613, + "learning_rate": 8.723688577030655e-06, + "loss": 0.6093648672103882, + "step": 1305 + }, + { + "epoch": 1.6942416869424168, + "grad_norm": 0.5608682036399841, + "learning_rate": 8.709507366123841e-06, + "loss": 0.6120996475219727, + "step": 1306 + }, + { + "epoch": 1.6955393349553933, + "grad_norm": 0.5365821719169617, + "learning_rate": 8.695328793947833e-06, + "loss": 0.5509933233261108, + "step": 1307 + }, + { + "epoch": 1.6968369829683698, + "grad_norm": 0.537822961807251, + "learning_rate": 8.681152889494227e-06, + "loss": 0.6313689947128296, + "step": 1308 + }, + { + "epoch": 1.6981346309813463, + "grad_norm": 0.5853676199913025, + "learning_rate": 8.66697968174915e-06, + "loss": 0.6015232801437378, + "step": 1309 + }, + { + "epoch": 1.6994322789943228, + "grad_norm": 0.5395903587341309, + "learning_rate": 8.652809199693236e-06, + "loss": 0.5783022046089172, + "step": 1310 + }, + { + "epoch": 1.7007299270072993, + "grad_norm": 0.5408870577812195, + "learning_rate": 8.638641472301524e-06, + "loss": 0.6224579215049744, + "step": 1311 + }, + { + "epoch": 1.7020275750202758, + "grad_norm": 0.5533918142318726, + "learning_rate": 8.624476528543439e-06, + "loss": 0.6317031383514404, + "step": 1312 + }, + { + "epoch": 1.7033252230332523, + "grad_norm": 0.577556848526001, + "learning_rate": 8.610314397382701e-06, + "loss": 0.6522644758224487, + "step": 1313 + }, + { + "epoch": 1.7046228710462286, + "grad_norm": 0.5453810095787048, + "learning_rate": 8.596155107777288e-06, + "loss": 0.6072216629981995, + "step": 1314 + }, + { + "epoch": 1.7059205190592053, + "grad_norm": 0.5380662679672241, + "learning_rate": 8.581998688679356e-06, + "loss": 0.6069589853286743, + "step": 1315 + }, + { + "epoch": 1.7072181670721815, + "grad_norm": 0.5374992489814758, + "learning_rate": 8.567845169035205e-06, + "loss": 0.6239044070243835, + "step": 1316 + }, + { + "epoch": 1.7085158150851583, + "grad_norm": 0.5366406440734863, + "learning_rate": 8.553694577785201e-06, + "loss": 0.5901238322257996, + "step": 1317 + }, + { + "epoch": 1.7098134630981345, + "grad_norm": 0.5510634779930115, + "learning_rate": 8.539546943863717e-06, + "loss": 0.6066378355026245, + "step": 1318 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.5579630732536316, + "learning_rate": 8.525402296199089e-06, + "loss": 0.6439074873924255, + "step": 1319 + }, + { + "epoch": 1.7124087591240875, + "grad_norm": 0.5268120765686035, + "learning_rate": 8.511260663713537e-06, + "loss": 0.5521663427352905, + "step": 1320 + }, + { + "epoch": 1.7137064071370642, + "grad_norm": 0.5076732635498047, + "learning_rate": 8.497122075323122e-06, + "loss": 0.5523797273635864, + "step": 1321 + }, + { + "epoch": 1.7150040551500405, + "grad_norm": 0.5172733068466187, + "learning_rate": 8.482986559937676e-06, + "loss": 0.6011000275611877, + "step": 1322 + }, + { + "epoch": 1.716301703163017, + "grad_norm": 0.5152168869972229, + "learning_rate": 8.468854146460754e-06, + "loss": 0.5801671743392944, + "step": 1323 + }, + { + "epoch": 1.7175993511759935, + "grad_norm": 0.5168895721435547, + "learning_rate": 8.45472486378956e-06, + "loss": 0.6005280613899231, + "step": 1324 + }, + { + "epoch": 1.71889699918897, + "grad_norm": 0.571263313293457, + "learning_rate": 8.440598740814909e-06, + "loss": 0.6543586850166321, + "step": 1325 + }, + { + "epoch": 1.7201946472019465, + "grad_norm": 0.5240177512168884, + "learning_rate": 8.426475806421139e-06, + "loss": 0.613470196723938, + "step": 1326 + }, + { + "epoch": 1.721492295214923, + "grad_norm": 0.5217388272285461, + "learning_rate": 8.412356089486082e-06, + "loss": 0.5799127817153931, + "step": 1327 + }, + { + "epoch": 1.7227899432278995, + "grad_norm": 0.5473462343215942, + "learning_rate": 8.39823961888098e-06, + "loss": 0.6159072518348694, + "step": 1328 + }, + { + "epoch": 1.724087591240876, + "grad_norm": 0.9222651124000549, + "learning_rate": 8.384126423470447e-06, + "loss": 0.6260055303573608, + "step": 1329 + }, + { + "epoch": 1.7253852392538525, + "grad_norm": 0.5530563592910767, + "learning_rate": 8.37001653211239e-06, + "loss": 0.5505119562149048, + "step": 1330 + }, + { + "epoch": 1.7266828872668287, + "grad_norm": 0.5369389653205872, + "learning_rate": 8.355909973657975e-06, + "loss": 0.6139888763427734, + "step": 1331 + }, + { + "epoch": 1.7279805352798054, + "grad_norm": 0.5347586870193481, + "learning_rate": 8.341806776951532e-06, + "loss": 0.6265066862106323, + "step": 1332 + }, + { + "epoch": 1.7292781832927817, + "grad_norm": 0.545946478843689, + "learning_rate": 8.327706970830537e-06, + "loss": 0.6024926900863647, + "step": 1333 + }, + { + "epoch": 1.7305758313057584, + "grad_norm": 0.5450059771537781, + "learning_rate": 8.313610584125523e-06, + "loss": 0.658405065536499, + "step": 1334 + }, + { + "epoch": 1.7318734793187347, + "grad_norm": 0.5516889691352844, + "learning_rate": 8.299517645660033e-06, + "loss": 0.5770267248153687, + "step": 1335 + }, + { + "epoch": 1.7331711273317114, + "grad_norm": 0.557074785232544, + "learning_rate": 8.285428184250554e-06, + "loss": 0.5421329736709595, + "step": 1336 + }, + { + "epoch": 1.7344687753446877, + "grad_norm": 0.543565571308136, + "learning_rate": 8.271342228706478e-06, + "loss": 0.6527873277664185, + "step": 1337 + }, + { + "epoch": 1.7357664233576642, + "grad_norm": 0.49616673588752747, + "learning_rate": 8.257259807830009e-06, + "loss": 0.5355008840560913, + "step": 1338 + }, + { + "epoch": 1.7370640713706407, + "grad_norm": 0.5389429330825806, + "learning_rate": 8.243180950416142e-06, + "loss": 0.6072633862495422, + "step": 1339 + }, + { + "epoch": 1.7383617193836172, + "grad_norm": 0.542195737361908, + "learning_rate": 8.22910568525257e-06, + "loss": 0.5909712314605713, + "step": 1340 + }, + { + "epoch": 1.7396593673965937, + "grad_norm": 0.5480629205703735, + "learning_rate": 8.215034041119655e-06, + "loss": 0.5966728925704956, + "step": 1341 + }, + { + "epoch": 1.7409570154095702, + "grad_norm": 0.5179266929626465, + "learning_rate": 8.200966046790339e-06, + "loss": 0.608291745185852, + "step": 1342 + }, + { + "epoch": 1.7422546634225466, + "grad_norm": 0.525390625, + "learning_rate": 8.186901731030117e-06, + "loss": 0.6019555330276489, + "step": 1343 + }, + { + "epoch": 1.7435523114355231, + "grad_norm": 0.5716756582260132, + "learning_rate": 8.172841122596951e-06, + "loss": 0.6858773827552795, + "step": 1344 + }, + { + "epoch": 1.7448499594484996, + "grad_norm": 0.53510981798172, + "learning_rate": 8.158784250241226e-06, + "loss": 0.6193398833274841, + "step": 1345 + }, + { + "epoch": 1.7461476074614761, + "grad_norm": 0.509371280670166, + "learning_rate": 8.144731142705693e-06, + "loss": 0.5310204029083252, + "step": 1346 + }, + { + "epoch": 1.7474452554744526, + "grad_norm": 0.520005464553833, + "learning_rate": 8.130681828725394e-06, + "loss": 0.5864765644073486, + "step": 1347 + }, + { + "epoch": 1.748742903487429, + "grad_norm": 0.530784010887146, + "learning_rate": 8.116636337027626e-06, + "loss": 0.5898761749267578, + "step": 1348 + }, + { + "epoch": 1.7500405515004056, + "grad_norm": 0.528357982635498, + "learning_rate": 8.10259469633186e-06, + "loss": 0.611457347869873, + "step": 1349 + }, + { + "epoch": 1.7513381995133819, + "grad_norm": 0.5243317484855652, + "learning_rate": 8.0885569353497e-06, + "loss": 0.5851372480392456, + "step": 1350 + }, + { + "epoch": 1.7526358475263586, + "grad_norm": 0.5656478404998779, + "learning_rate": 8.07452308278481e-06, + "loss": 0.6243469715118408, + "step": 1351 + }, + { + "epoch": 1.7539334955393349, + "grad_norm": 0.5173115134239197, + "learning_rate": 8.060493167332874e-06, + "loss": 0.5658408403396606, + "step": 1352 + }, + { + "epoch": 1.7552311435523116, + "grad_norm": 0.5283849835395813, + "learning_rate": 8.04646721768151e-06, + "loss": 0.6133898496627808, + "step": 1353 + }, + { + "epoch": 1.7565287915652879, + "grad_norm": 0.5533227324485779, + "learning_rate": 8.032445262510241e-06, + "loss": 0.6251792907714844, + "step": 1354 + }, + { + "epoch": 1.7578264395782643, + "grad_norm": 0.5281651020050049, + "learning_rate": 8.018427330490411e-06, + "loss": 0.5514408349990845, + "step": 1355 + }, + { + "epoch": 1.7591240875912408, + "grad_norm": 0.5382410883903503, + "learning_rate": 8.004413450285147e-06, + "loss": 0.6591918468475342, + "step": 1356 + }, + { + "epoch": 1.7604217356042173, + "grad_norm": 0.566716194152832, + "learning_rate": 7.990403650549285e-06, + "loss": 0.6281836628913879, + "step": 1357 + }, + { + "epoch": 1.7617193836171938, + "grad_norm": 0.5423158407211304, + "learning_rate": 7.976397959929324e-06, + "loss": 0.5953754782676697, + "step": 1358 + }, + { + "epoch": 1.7630170316301703, + "grad_norm": 0.5327609181404114, + "learning_rate": 7.962396407063346e-06, + "loss": 0.6248747110366821, + "step": 1359 + }, + { + "epoch": 1.7643146796431468, + "grad_norm": 0.5314010381698608, + "learning_rate": 7.948399020580995e-06, + "loss": 0.5661095380783081, + "step": 1360 + }, + { + "epoch": 1.7656123276561233, + "grad_norm": 0.5650714039802551, + "learning_rate": 7.934405829103376e-06, + "loss": 0.6127238869667053, + "step": 1361 + }, + { + "epoch": 1.7669099756690998, + "grad_norm": 0.546101987361908, + "learning_rate": 7.920416861243028e-06, + "loss": 0.5874890089035034, + "step": 1362 + }, + { + "epoch": 1.7682076236820763, + "grad_norm": 0.5429707169532776, + "learning_rate": 7.906432145603844e-06, + "loss": 0.6140427589416504, + "step": 1363 + }, + { + "epoch": 1.7695052716950528, + "grad_norm": 0.5710042715072632, + "learning_rate": 7.892451710781035e-06, + "loss": 0.612266480922699, + "step": 1364 + }, + { + "epoch": 1.770802919708029, + "grad_norm": 0.55032879114151, + "learning_rate": 7.878475585361045e-06, + "loss": 0.6138355135917664, + "step": 1365 + }, + { + "epoch": 1.7721005677210058, + "grad_norm": 0.5812238454818726, + "learning_rate": 7.864503797921518e-06, + "loss": 0.6380466818809509, + "step": 1366 + }, + { + "epoch": 1.773398215733982, + "grad_norm": 0.5375271439552307, + "learning_rate": 7.850536377031221e-06, + "loss": 0.6307961344718933, + "step": 1367 + }, + { + "epoch": 1.7746958637469588, + "grad_norm": 0.5584734082221985, + "learning_rate": 7.836573351249996e-06, + "loss": 0.6312189698219299, + "step": 1368 + }, + { + "epoch": 1.775993511759935, + "grad_norm": 0.5133419036865234, + "learning_rate": 7.822614749128692e-06, + "loss": 0.5199952125549316, + "step": 1369 + }, + { + "epoch": 1.7772911597729117, + "grad_norm": 0.5400519371032715, + "learning_rate": 7.808660599209124e-06, + "loss": 0.630193829536438, + "step": 1370 + }, + { + "epoch": 1.778588807785888, + "grad_norm": 0.5627943277359009, + "learning_rate": 7.794710930023993e-06, + "loss": 0.6233404874801636, + "step": 1371 + }, + { + "epoch": 1.7798864557988645, + "grad_norm": 0.510907769203186, + "learning_rate": 7.78076577009684e-06, + "loss": 0.5262112021446228, + "step": 1372 + }, + { + "epoch": 1.781184103811841, + "grad_norm": 0.5093023777008057, + "learning_rate": 7.76682514794199e-06, + "loss": 0.5871707201004028, + "step": 1373 + }, + { + "epoch": 1.7824817518248175, + "grad_norm": 0.5214765667915344, + "learning_rate": 7.752889092064484e-06, + "loss": 0.5635697841644287, + "step": 1374 + }, + { + "epoch": 1.783779399837794, + "grad_norm": 0.5440617799758911, + "learning_rate": 7.738957630960037e-06, + "loss": 0.5805234909057617, + "step": 1375 + }, + { + "epoch": 1.7850770478507705, + "grad_norm": 0.5365013480186462, + "learning_rate": 7.725030793114952e-06, + "loss": 0.615504801273346, + "step": 1376 + }, + { + "epoch": 1.786374695863747, + "grad_norm": 0.5464739203453064, + "learning_rate": 7.711108607006094e-06, + "loss": 0.6203770637512207, + "step": 1377 + }, + { + "epoch": 1.7876723438767235, + "grad_norm": 0.5313665866851807, + "learning_rate": 7.697191101100802e-06, + "loss": 0.6234644055366516, + "step": 1378 + }, + { + "epoch": 1.7889699918897, + "grad_norm": 0.5652154684066772, + "learning_rate": 7.683278303856862e-06, + "loss": 0.6404775977134705, + "step": 1379 + }, + { + "epoch": 1.7902676399026762, + "grad_norm": 0.5399373769760132, + "learning_rate": 7.669370243722415e-06, + "loss": 0.6136540770530701, + "step": 1380 + }, + { + "epoch": 1.7902676399026762, + "eval_loss": 0.6770720481872559, + "eval_runtime": 72.4181, + "eval_samples_per_second": 71.695, + "eval_steps_per_second": 8.962, + "step": 1380 + }, + { + "epoch": 1.791565287915653, + "grad_norm": 0.5250906944274902, + "learning_rate": 7.655466949135932e-06, + "loss": 0.6147629022598267, + "step": 1381 + }, + { + "epoch": 1.7928629359286292, + "grad_norm": 0.5089812278747559, + "learning_rate": 7.641568448526122e-06, + "loss": 0.5584423542022705, + "step": 1382 + }, + { + "epoch": 1.794160583941606, + "grad_norm": 0.53523850440979, + "learning_rate": 7.627674770311909e-06, + "loss": 0.5899471640586853, + "step": 1383 + }, + { + "epoch": 1.7954582319545822, + "grad_norm": 0.5330705642700195, + "learning_rate": 7.613785942902343e-06, + "loss": 0.6054921746253967, + "step": 1384 + }, + { + "epoch": 1.796755879967559, + "grad_norm": 0.514224648475647, + "learning_rate": 7.599901994696566e-06, + "loss": 0.57494056224823, + "step": 1385 + }, + { + "epoch": 1.7980535279805352, + "grad_norm": 0.5187469124794006, + "learning_rate": 7.586022954083731e-06, + "loss": 0.5410253405570984, + "step": 1386 + }, + { + "epoch": 1.799351175993512, + "grad_norm": 0.5295100808143616, + "learning_rate": 7.572148849442971e-06, + "loss": 0.5727859139442444, + "step": 1387 + }, + { + "epoch": 1.8006488240064882, + "grad_norm": 0.5229355692863464, + "learning_rate": 7.5582797091433105e-06, + "loss": 0.5822583436965942, + "step": 1388 + }, + { + "epoch": 1.8019464720194647, + "grad_norm": 0.5615860223770142, + "learning_rate": 7.544415561543639e-06, + "loss": 0.6505988836288452, + "step": 1389 + }, + { + "epoch": 1.8032441200324412, + "grad_norm": 0.538707971572876, + "learning_rate": 7.5305564349926215e-06, + "loss": 0.5953875184059143, + "step": 1390 + }, + { + "epoch": 1.8045417680454177, + "grad_norm": 0.5197842717170715, + "learning_rate": 7.516702357828672e-06, + "loss": 0.61934494972229, + "step": 1391 + }, + { + "epoch": 1.8058394160583942, + "grad_norm": 0.49861758947372437, + "learning_rate": 7.502853358379865e-06, + "loss": 0.5522242784500122, + "step": 1392 + }, + { + "epoch": 1.8071370640713706, + "grad_norm": 0.5618783235549927, + "learning_rate": 7.489009464963903e-06, + "loss": 0.6682146787643433, + "step": 1393 + }, + { + "epoch": 1.8084347120843471, + "grad_norm": 0.9511061906814575, + "learning_rate": 7.475170705888042e-06, + "loss": 0.5893583297729492, + "step": 1394 + }, + { + "epoch": 1.8097323600973236, + "grad_norm": 0.6068239808082581, + "learning_rate": 7.461337109449045e-06, + "loss": 0.6168926954269409, + "step": 1395 + }, + { + "epoch": 1.8110300081103001, + "grad_norm": 0.517159640789032, + "learning_rate": 7.447508703933109e-06, + "loss": 0.5870746374130249, + "step": 1396 + }, + { + "epoch": 1.8123276561232764, + "grad_norm": 0.5260257720947266, + "learning_rate": 7.433685517615831e-06, + "loss": 0.6144825220108032, + "step": 1397 + }, + { + "epoch": 1.8136253041362531, + "grad_norm": 0.4919078052043915, + "learning_rate": 7.4198675787621185e-06, + "loss": 0.6141817569732666, + "step": 1398 + }, + { + "epoch": 1.8149229521492294, + "grad_norm": 0.5349772572517395, + "learning_rate": 7.406054915626172e-06, + "loss": 0.5727092027664185, + "step": 1399 + }, + { + "epoch": 1.816220600162206, + "grad_norm": 0.5762760639190674, + "learning_rate": 7.392247556451382e-06, + "loss": 0.647359311580658, + "step": 1400 + }, + { + "epoch": 1.8175182481751824, + "grad_norm": 0.5478885769844055, + "learning_rate": 7.378445529470303e-06, + "loss": 0.6371256113052368, + "step": 1401 + }, + { + "epoch": 1.818815896188159, + "grad_norm": 0.5577658414840698, + "learning_rate": 7.364648862904593e-06, + "loss": 0.6552213430404663, + "step": 1402 + }, + { + "epoch": 1.8201135442011354, + "grad_norm": 0.5350478887557983, + "learning_rate": 7.35085758496494e-06, + "loss": 0.5756250023841858, + "step": 1403 + }, + { + "epoch": 1.821411192214112, + "grad_norm": 0.5247483849525452, + "learning_rate": 7.337071723851018e-06, + "loss": 0.5872269868850708, + "step": 1404 + }, + { + "epoch": 1.8227088402270883, + "grad_norm": 0.5715752840042114, + "learning_rate": 7.323291307751418e-06, + "loss": 0.6395775079727173, + "step": 1405 + }, + { + "epoch": 1.8240064882400648, + "grad_norm": 0.5355315208435059, + "learning_rate": 7.3095163648436115e-06, + "loss": 0.5502926707267761, + "step": 1406 + }, + { + "epoch": 1.8253041362530413, + "grad_norm": 0.5468769073486328, + "learning_rate": 7.295746923293865e-06, + "loss": 0.6266253590583801, + "step": 1407 + }, + { + "epoch": 1.8266017842660178, + "grad_norm": 0.5183525681495667, + "learning_rate": 7.2819830112572035e-06, + "loss": 0.5890312194824219, + "step": 1408 + }, + { + "epoch": 1.8278994322789943, + "grad_norm": 0.5416871905326843, + "learning_rate": 7.268224656877339e-06, + "loss": 0.6163492798805237, + "step": 1409 + }, + { + "epoch": 1.8291970802919708, + "grad_norm": 0.5376898646354675, + "learning_rate": 7.25447188828663e-06, + "loss": 0.6440437436103821, + "step": 1410 + }, + { + "epoch": 1.8304947283049473, + "grad_norm": 0.5264099836349487, + "learning_rate": 7.240724733606002e-06, + "loss": 0.6445986032485962, + "step": 1411 + }, + { + "epoch": 1.8317923763179238, + "grad_norm": 0.5397512912750244, + "learning_rate": 7.2269832209449145e-06, + "loss": 0.5767061710357666, + "step": 1412 + }, + { + "epoch": 1.8330900243309003, + "grad_norm": 0.5331466794013977, + "learning_rate": 7.213247378401274e-06, + "loss": 0.6515385508537292, + "step": 1413 + }, + { + "epoch": 1.8343876723438766, + "grad_norm": 0.5380875468254089, + "learning_rate": 7.199517234061408e-06, + "loss": 0.5956803560256958, + "step": 1414 + }, + { + "epoch": 1.8356853203568533, + "grad_norm": 0.5553707480430603, + "learning_rate": 7.1857928159999814e-06, + "loss": 0.5990528464317322, + "step": 1415 + }, + { + "epoch": 1.8369829683698295, + "grad_norm": 0.5348111391067505, + "learning_rate": 7.172074152279963e-06, + "loss": 0.5816199779510498, + "step": 1416 + }, + { + "epoch": 1.8382806163828063, + "grad_norm": 0.63777756690979, + "learning_rate": 7.1583612709525405e-06, + "loss": 0.6647042036056519, + "step": 1417 + }, + { + "epoch": 1.8395782643957825, + "grad_norm": 0.5394327640533447, + "learning_rate": 7.14465420005709e-06, + "loss": 0.629410982131958, + "step": 1418 + }, + { + "epoch": 1.8408759124087593, + "grad_norm": 0.5467361807823181, + "learning_rate": 7.130952967621096e-06, + "loss": 0.5931155681610107, + "step": 1419 + }, + { + "epoch": 1.8421735604217355, + "grad_norm": 0.5642380714416504, + "learning_rate": 7.11725760166012e-06, + "loss": 0.59910649061203, + "step": 1420 + }, + { + "epoch": 1.8434712084347122, + "grad_norm": 0.5448968410491943, + "learning_rate": 7.103568130177713e-06, + "loss": 0.5758746862411499, + "step": 1421 + }, + { + "epoch": 1.8447688564476885, + "grad_norm": 0.5109772682189941, + "learning_rate": 7.089884581165382e-06, + "loss": 0.5374370217323303, + "step": 1422 + }, + { + "epoch": 1.846066504460665, + "grad_norm": 0.5496018528938293, + "learning_rate": 7.076206982602516e-06, + "loss": 0.6080317497253418, + "step": 1423 + }, + { + "epoch": 1.8473641524736415, + "grad_norm": 0.5525946021080017, + "learning_rate": 7.06253536245635e-06, + "loss": 0.6326315402984619, + "step": 1424 + }, + { + "epoch": 1.848661800486618, + "grad_norm": 0.5555429458618164, + "learning_rate": 7.048869748681879e-06, + "loss": 0.6499879360198975, + "step": 1425 + }, + { + "epoch": 1.8499594484995945, + "grad_norm": 0.5364986062049866, + "learning_rate": 7.035210169221834e-06, + "loss": 0.6402702331542969, + "step": 1426 + }, + { + "epoch": 1.851257096512571, + "grad_norm": 0.5398283004760742, + "learning_rate": 7.021556652006588e-06, + "loss": 0.636422872543335, + "step": 1427 + }, + { + "epoch": 1.8525547445255475, + "grad_norm": 0.5333319306373596, + "learning_rate": 7.007909224954135e-06, + "loss": 0.6210685968399048, + "step": 1428 + }, + { + "epoch": 1.853852392538524, + "grad_norm": 0.5136668086051941, + "learning_rate": 6.994267915970003e-06, + "loss": 0.5984174013137817, + "step": 1429 + }, + { + "epoch": 1.8551500405515005, + "grad_norm": 0.5352861285209656, + "learning_rate": 6.980632752947221e-06, + "loss": 0.6331675052642822, + "step": 1430 + }, + { + "epoch": 1.8564476885644767, + "grad_norm": 0.5386180281639099, + "learning_rate": 6.967003763766247e-06, + "loss": 0.599821925163269, + "step": 1431 + }, + { + "epoch": 1.8577453365774534, + "grad_norm": 0.5548969507217407, + "learning_rate": 6.953380976294907e-06, + "loss": 0.6447435617446899, + "step": 1432 + }, + { + "epoch": 1.8590429845904297, + "grad_norm": 0.5061814188957214, + "learning_rate": 6.9397644183883616e-06, + "loss": 0.6045181751251221, + "step": 1433 + }, + { + "epoch": 1.8603406326034064, + "grad_norm": 0.49961408972740173, + "learning_rate": 6.926154117889022e-06, + "loss": 0.5710508823394775, + "step": 1434 + }, + { + "epoch": 1.8616382806163827, + "grad_norm": 0.5761319398880005, + "learning_rate": 6.91255010262651e-06, + "loss": 0.6047182679176331, + "step": 1435 + }, + { + "epoch": 1.8629359286293594, + "grad_norm": 0.5302688479423523, + "learning_rate": 6.898952400417587e-06, + "loss": 0.5881869792938232, + "step": 1436 + }, + { + "epoch": 1.8642335766423357, + "grad_norm": 0.567452609539032, + "learning_rate": 6.885361039066121e-06, + "loss": 0.6580846905708313, + "step": 1437 + }, + { + "epoch": 1.8655312246553124, + "grad_norm": 0.5567494034767151, + "learning_rate": 6.8717760463629965e-06, + "loss": 0.6213802099227905, + "step": 1438 + }, + { + "epoch": 1.8668288726682887, + "grad_norm": 0.535961925983429, + "learning_rate": 6.858197450086097e-06, + "loss": 0.6174903512001038, + "step": 1439 + }, + { + "epoch": 1.8681265206812652, + "grad_norm": 0.5607694387435913, + "learning_rate": 6.844625278000205e-06, + "loss": 0.658057451248169, + "step": 1440 + }, + { + "epoch": 1.8694241686942417, + "grad_norm": 0.5164813995361328, + "learning_rate": 6.831059557856984e-06, + "loss": 0.6188488602638245, + "step": 1441 + }, + { + "epoch": 1.8707218167072182, + "grad_norm": 0.5046887397766113, + "learning_rate": 6.81750031739489e-06, + "loss": 0.5495269298553467, + "step": 1442 + }, + { + "epoch": 1.8720194647201946, + "grad_norm": 0.5218680500984192, + "learning_rate": 6.803947584339148e-06, + "loss": 0.5858875513076782, + "step": 1443 + }, + { + "epoch": 1.8733171127331711, + "grad_norm": 0.5279871225357056, + "learning_rate": 6.79040138640166e-06, + "loss": 0.5829395055770874, + "step": 1444 + }, + { + "epoch": 1.8746147607461476, + "grad_norm": 0.5364516377449036, + "learning_rate": 6.7768617512809745e-06, + "loss": 0.6135284900665283, + "step": 1445 + }, + { + "epoch": 1.8759124087591241, + "grad_norm": 0.5465746521949768, + "learning_rate": 6.763328706662214e-06, + "loss": 0.5970785617828369, + "step": 1446 + }, + { + "epoch": 1.8772100567721006, + "grad_norm": 0.5328618288040161, + "learning_rate": 6.749802280217037e-06, + "loss": 0.6004316806793213, + "step": 1447 + }, + { + "epoch": 1.878507704785077, + "grad_norm": 0.5282012224197388, + "learning_rate": 6.7362824996035545e-06, + "loss": 0.5903221368789673, + "step": 1448 + }, + { + "epoch": 1.8798053527980536, + "grad_norm": 0.5416566133499146, + "learning_rate": 6.722769392466304e-06, + "loss": 0.624277651309967, + "step": 1449 + }, + { + "epoch": 1.8811030008110299, + "grad_norm": 0.5569058060646057, + "learning_rate": 6.709262986436162e-06, + "loss": 0.6214337348937988, + "step": 1450 + }, + { + "epoch": 1.8824006488240066, + "grad_norm": 0.5567551255226135, + "learning_rate": 6.695763309130318e-06, + "loss": 0.5963641405105591, + "step": 1451 + }, + { + "epoch": 1.8836982968369829, + "grad_norm": 0.5245199203491211, + "learning_rate": 6.682270388152185e-06, + "loss": 0.5722153186798096, + "step": 1452 + }, + { + "epoch": 1.8849959448499596, + "grad_norm": 0.5476487874984741, + "learning_rate": 6.668784251091381e-06, + "loss": 0.573593258857727, + "step": 1453 + }, + { + "epoch": 1.8862935928629359, + "grad_norm": 0.5254029631614685, + "learning_rate": 6.655304925523635e-06, + "loss": 0.5607786774635315, + "step": 1454 + }, + { + "epoch": 1.8875912408759126, + "grad_norm": 0.5431527495384216, + "learning_rate": 6.641832439010765e-06, + "loss": 0.5841714143753052, + "step": 1455 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5374141931533813, + "learning_rate": 6.628366819100586e-06, + "loss": 0.5811495780944824, + "step": 1456 + }, + { + "epoch": 1.8901865369018653, + "grad_norm": 0.5369722247123718, + "learning_rate": 6.614908093326891e-06, + "loss": 0.6311888694763184, + "step": 1457 + }, + { + "epoch": 1.8914841849148418, + "grad_norm": 0.5656461119651794, + "learning_rate": 6.601456289209362e-06, + "loss": 0.6515893936157227, + "step": 1458 + }, + { + "epoch": 1.8927818329278183, + "grad_norm": 0.5076130032539368, + "learning_rate": 6.588011434253534e-06, + "loss": 0.5477322340011597, + "step": 1459 + }, + { + "epoch": 1.8940794809407948, + "grad_norm": 0.5373955965042114, + "learning_rate": 6.574573555950738e-06, + "loss": 0.5668719410896301, + "step": 1460 + }, + { + "epoch": 1.8953771289537713, + "grad_norm": 0.5303026437759399, + "learning_rate": 6.561142681778027e-06, + "loss": 0.5856397747993469, + "step": 1461 + }, + { + "epoch": 1.8966747769667478, + "grad_norm": 0.5287466049194336, + "learning_rate": 6.547718839198145e-06, + "loss": 0.574636697769165, + "step": 1462 + }, + { + "epoch": 1.8979724249797243, + "grad_norm": 0.546556830406189, + "learning_rate": 6.53430205565945e-06, + "loss": 0.6119240522384644, + "step": 1463 + }, + { + "epoch": 1.8992700729927008, + "grad_norm": 0.5332784652709961, + "learning_rate": 6.520892358595869e-06, + "loss": 0.6177451014518738, + "step": 1464 + }, + { + "epoch": 1.900567721005677, + "grad_norm": 0.5086203217506409, + "learning_rate": 6.507489775426834e-06, + "loss": 0.6066810488700867, + "step": 1465 + }, + { + "epoch": 1.9018653690186538, + "grad_norm": 0.5467303991317749, + "learning_rate": 6.494094333557243e-06, + "loss": 0.5971111059188843, + "step": 1466 + }, + { + "epoch": 1.90316301703163, + "grad_norm": 0.5070620179176331, + "learning_rate": 6.4807060603773795e-06, + "loss": 0.6063017845153809, + "step": 1467 + }, + { + "epoch": 1.9044606650446068, + "grad_norm": 0.553736686706543, + "learning_rate": 6.467324983262877e-06, + "loss": 0.579677402973175, + "step": 1468 + }, + { + "epoch": 1.905758313057583, + "grad_norm": 0.5139430165290833, + "learning_rate": 6.453951129574644e-06, + "loss": 0.5715341567993164, + "step": 1469 + }, + { + "epoch": 1.9070559610705597, + "grad_norm": 0.5478905439376831, + "learning_rate": 6.4405845266588356e-06, + "loss": 0.6066344976425171, + "step": 1470 + }, + { + "epoch": 1.908353609083536, + "grad_norm": 0.5382056832313538, + "learning_rate": 6.427225201846763e-06, + "loss": 0.5792092084884644, + "step": 1471 + }, + { + "epoch": 1.9096512570965127, + "grad_norm": 0.5592162013053894, + "learning_rate": 6.413873182454873e-06, + "loss": 0.6224773526191711, + "step": 1472 + }, + { + "epoch": 1.910948905109489, + "grad_norm": 0.5435997843742371, + "learning_rate": 6.4005284957846546e-06, + "loss": 0.5740009546279907, + "step": 1473 + }, + { + "epoch": 1.9122465531224655, + "grad_norm": 0.5480201840400696, + "learning_rate": 6.3871911691226276e-06, + "loss": 0.5897870063781738, + "step": 1474 + }, + { + "epoch": 1.913544201135442, + "grad_norm": 0.5461702942848206, + "learning_rate": 6.373861229740237e-06, + "loss": 0.6223511695861816, + "step": 1475 + }, + { + "epoch": 1.9148418491484185, + "grad_norm": 0.5337714552879333, + "learning_rate": 6.360538704893845e-06, + "loss": 0.5608541369438171, + "step": 1476 + }, + { + "epoch": 1.916139497161395, + "grad_norm": 0.5573077201843262, + "learning_rate": 6.3472236218246366e-06, + "loss": 0.6532754302024841, + "step": 1477 + }, + { + "epoch": 1.9174371451743715, + "grad_norm": 0.5389246940612793, + "learning_rate": 6.333916007758591e-06, + "loss": 0.5982533693313599, + "step": 1478 + }, + { + "epoch": 1.918734793187348, + "grad_norm": 0.5433958768844604, + "learning_rate": 6.320615889906403e-06, + "loss": 0.592591404914856, + "step": 1479 + }, + { + "epoch": 1.9200324412003245, + "grad_norm": 0.5413274765014648, + "learning_rate": 6.307323295463457e-06, + "loss": 0.6429393291473389, + "step": 1480 + }, + { + "epoch": 1.921330089213301, + "grad_norm": 0.5350672602653503, + "learning_rate": 6.294038251609738e-06, + "loss": 0.5930889844894409, + "step": 1481 + }, + { + "epoch": 1.9226277372262772, + "grad_norm": 0.5042331218719482, + "learning_rate": 6.280760785509802e-06, + "loss": 0.5509825944900513, + "step": 1482 + }, + { + "epoch": 1.923925385239254, + "grad_norm": 0.5447627902030945, + "learning_rate": 6.2674909243127e-06, + "loss": 0.6052374839782715, + "step": 1483 + }, + { + "epoch": 1.9252230332522302, + "grad_norm": 0.5395492911338806, + "learning_rate": 6.254228695151949e-06, + "loss": 0.6406330466270447, + "step": 1484 + }, + { + "epoch": 1.926520681265207, + "grad_norm": 0.5140017867088318, + "learning_rate": 6.240974125145443e-06, + "loss": 0.5923643112182617, + "step": 1485 + }, + { + "epoch": 1.9278183292781832, + "grad_norm": 0.5255963802337646, + "learning_rate": 6.227727241395429e-06, + "loss": 0.612221360206604, + "step": 1486 + }, + { + "epoch": 1.92911597729116, + "grad_norm": 0.5396282076835632, + "learning_rate": 6.214488070988424e-06, + "loss": 0.5972959399223328, + "step": 1487 + }, + { + "epoch": 1.9304136253041362, + "grad_norm": 0.5345456004142761, + "learning_rate": 6.201256640995184e-06, + "loss": 0.5695825815200806, + "step": 1488 + }, + { + "epoch": 1.931711273317113, + "grad_norm": 0.5186867713928223, + "learning_rate": 6.188032978470639e-06, + "loss": 0.6117428541183472, + "step": 1489 + }, + { + "epoch": 1.9330089213300892, + "grad_norm": 0.5213980674743652, + "learning_rate": 6.174817110453828e-06, + "loss": 0.584017276763916, + "step": 1490 + }, + { + "epoch": 1.9343065693430657, + "grad_norm": 0.541926920413971, + "learning_rate": 6.161609063967857e-06, + "loss": 0.6257720589637756, + "step": 1491 + }, + { + "epoch": 1.9356042173560422, + "grad_norm": 0.5566191673278809, + "learning_rate": 6.1484088660198325e-06, + "loss": 0.6734557151794434, + "step": 1492 + }, + { + "epoch": 1.9369018653690186, + "grad_norm": 0.5532911419868469, + "learning_rate": 6.135216543600828e-06, + "loss": 0.5978685021400452, + "step": 1493 + }, + { + "epoch": 1.9381995133819951, + "grad_norm": 0.5523790717124939, + "learning_rate": 6.1220321236857974e-06, + "loss": 0.6684085130691528, + "step": 1494 + }, + { + "epoch": 1.9394971613949716, + "grad_norm": 0.5317186713218689, + "learning_rate": 6.108855633233546e-06, + "loss": 0.5903822183609009, + "step": 1495 + }, + { + "epoch": 1.9407948094079481, + "grad_norm": 0.52325439453125, + "learning_rate": 6.0956870991866545e-06, + "loss": 0.5855342149734497, + "step": 1496 + }, + { + "epoch": 1.9420924574209246, + "grad_norm": 0.5201572775840759, + "learning_rate": 6.0825265484714526e-06, + "loss": 0.5801212787628174, + "step": 1497 + }, + { + "epoch": 1.9433901054339011, + "grad_norm": 0.5488981008529663, + "learning_rate": 6.0693740079979235e-06, + "loss": 0.647799015045166, + "step": 1498 + }, + { + "epoch": 1.9446877534468774, + "grad_norm": 0.49936795234680176, + "learning_rate": 6.056229504659696e-06, + "loss": 0.5507512092590332, + "step": 1499 + }, + { + "epoch": 1.945985401459854, + "grad_norm": 0.5403010249137878, + "learning_rate": 6.043093065333945e-06, + "loss": 0.5773292779922485, + "step": 1500 + }, + { + "epoch": 1.9472830494728304, + "grad_norm": 0.532992422580719, + "learning_rate": 6.029964716881367e-06, + "loss": 0.561974048614502, + "step": 1501 + }, + { + "epoch": 1.948580697485807, + "grad_norm": 0.5226876139640808, + "learning_rate": 6.016844486146106e-06, + "loss": 0.6117234230041504, + "step": 1502 + }, + { + "epoch": 1.9498783454987834, + "grad_norm": 0.5627997517585754, + "learning_rate": 6.003732399955722e-06, + "loss": 0.5736496448516846, + "step": 1503 + }, + { + "epoch": 1.95117599351176, + "grad_norm": 0.5260640382766724, + "learning_rate": 5.990628485121106e-06, + "loss": 0.5524093508720398, + "step": 1504 + }, + { + "epoch": 1.9524736415247363, + "grad_norm": 0.5555213689804077, + "learning_rate": 5.97753276843645e-06, + "loss": 0.6590294241905212, + "step": 1505 + }, + { + "epoch": 1.9537712895377128, + "grad_norm": 0.5117315053939819, + "learning_rate": 5.964445276679176e-06, + "loss": 0.5593676567077637, + "step": 1506 + }, + { + "epoch": 1.9550689375506893, + "grad_norm": 0.5474593043327332, + "learning_rate": 5.9513660366099005e-06, + "loss": 0.5995163321495056, + "step": 1507 + }, + { + "epoch": 1.9563665855636658, + "grad_norm": 0.5376996397972107, + "learning_rate": 5.93829507497235e-06, + "loss": 0.5445429086685181, + "step": 1508 + }, + { + "epoch": 1.9576642335766423, + "grad_norm": 0.539804220199585, + "learning_rate": 5.925232418493338e-06, + "loss": 0.6023607850074768, + "step": 1509 + }, + { + "epoch": 1.9589618815896188, + "grad_norm": 0.5308881402015686, + "learning_rate": 5.912178093882688e-06, + "loss": 0.5908794403076172, + "step": 1510 + }, + { + "epoch": 1.9602595296025953, + "grad_norm": 0.5358856320381165, + "learning_rate": 5.8991321278331934e-06, + "loss": 0.5432258248329163, + "step": 1511 + }, + { + "epoch": 1.9615571776155718, + "grad_norm": 0.5521926879882812, + "learning_rate": 5.8860945470205466e-06, + "loss": 0.6700773239135742, + "step": 1512 + }, + { + "epoch": 1.9628548256285483, + "grad_norm": 0.5567953586578369, + "learning_rate": 5.8730653781033085e-06, + "loss": 0.6132399439811707, + "step": 1513 + }, + { + "epoch": 1.9641524736415248, + "grad_norm": 0.5308123826980591, + "learning_rate": 5.860044647722827e-06, + "loss": 0.595048189163208, + "step": 1514 + }, + { + "epoch": 1.9654501216545013, + "grad_norm": 0.5229505896568298, + "learning_rate": 5.847032382503202e-06, + "loss": 0.5752079486846924, + "step": 1515 + }, + { + "epoch": 1.9667477696674776, + "grad_norm": 0.5336843729019165, + "learning_rate": 5.834028609051218e-06, + "loss": 0.6190193891525269, + "step": 1516 + }, + { + "epoch": 1.9680454176804543, + "grad_norm": 0.5378988981246948, + "learning_rate": 5.8210333539563e-06, + "loss": 0.5807895660400391, + "step": 1517 + }, + { + "epoch": 1.9693430656934305, + "grad_norm": 0.5520551800727844, + "learning_rate": 5.808046643790468e-06, + "loss": 0.6308130621910095, + "step": 1518 + }, + { + "epoch": 1.9706407137064073, + "grad_norm": 0.5014427900314331, + "learning_rate": 5.795068505108243e-06, + "loss": 0.584097146987915, + "step": 1519 + }, + { + "epoch": 1.9719383617193835, + "grad_norm": 0.5326021313667297, + "learning_rate": 5.782098964446641e-06, + "loss": 0.5909327268600464, + "step": 1520 + }, + { + "epoch": 1.9732360097323602, + "grad_norm": 0.5124540328979492, + "learning_rate": 5.769138048325087e-06, + "loss": 0.5518309473991394, + "step": 1521 + }, + { + "epoch": 1.9745336577453365, + "grad_norm": 0.5387500524520874, + "learning_rate": 5.756185783245376e-06, + "loss": 0.5835770964622498, + "step": 1522 + }, + { + "epoch": 1.975831305758313, + "grad_norm": 0.568587064743042, + "learning_rate": 5.743242195691612e-06, + "loss": 0.5821942687034607, + "step": 1523 + }, + { + "epoch": 1.9771289537712895, + "grad_norm": 0.5374230742454529, + "learning_rate": 5.730307312130152e-06, + "loss": 0.6571119427680969, + "step": 1524 + }, + { + "epoch": 1.978426601784266, + "grad_norm": 0.5388919115066528, + "learning_rate": 5.717381159009563e-06, + "loss": 0.5895075798034668, + "step": 1525 + }, + { + "epoch": 1.9797242497972425, + "grad_norm": 0.5499215722084045, + "learning_rate": 5.704463762760559e-06, + "loss": 0.61728835105896, + "step": 1526 + }, + { + "epoch": 1.981021897810219, + "grad_norm": 0.5375927686691284, + "learning_rate": 5.691555149795933e-06, + "loss": 0.6732977032661438, + "step": 1527 + }, + { + "epoch": 1.9823195458231955, + "grad_norm": 0.5313878655433655, + "learning_rate": 5.678655346510549e-06, + "loss": 0.61357581615448, + "step": 1528 + }, + { + "epoch": 1.983617193836172, + "grad_norm": 0.5222123265266418, + "learning_rate": 5.6657643792812265e-06, + "loss": 0.5704218745231628, + "step": 1529 + }, + { + "epoch": 1.9849148418491485, + "grad_norm": 0.5498616099357605, + "learning_rate": 5.652882274466736e-06, + "loss": 0.6428430080413818, + "step": 1530 + }, + { + "epoch": 1.986212489862125, + "grad_norm": 0.5288700461387634, + "learning_rate": 5.640009058407719e-06, + "loss": 0.5776660442352295, + "step": 1531 + }, + { + "epoch": 1.9875101378751014, + "grad_norm": 0.5719195008277893, + "learning_rate": 5.627144757426647e-06, + "loss": 0.6659935116767883, + "step": 1532 + }, + { + "epoch": 1.9888077858880777, + "grad_norm": 0.5699102282524109, + "learning_rate": 5.614289397827757e-06, + "loss": 0.649441123008728, + "step": 1533 + }, + { + "epoch": 1.9901054339010544, + "grad_norm": 0.5806236267089844, + "learning_rate": 5.601443005897012e-06, + "loss": 0.6462723016738892, + "step": 1534 + }, + { + "epoch": 1.9914030819140307, + "grad_norm": 0.5485842823982239, + "learning_rate": 5.588605607902017e-06, + "loss": 0.6063494086265564, + "step": 1535 + }, + { + "epoch": 1.9927007299270074, + "grad_norm": 0.5317525863647461, + "learning_rate": 5.57577723009202e-06, + "loss": 0.5641921162605286, + "step": 1536 + }, + { + "epoch": 1.9939983779399837, + "grad_norm": 0.5366416573524475, + "learning_rate": 5.5629578986977894e-06, + "loss": 0.623965322971344, + "step": 1537 + }, + { + "epoch": 1.9952960259529604, + "grad_norm": 0.5662190318107605, + "learning_rate": 5.550147639931631e-06, + "loss": 0.6340383291244507, + "step": 1538 + }, + { + "epoch": 1.9965936739659367, + "grad_norm": 0.5266711711883545, + "learning_rate": 5.537346479987269e-06, + "loss": 0.6086807250976562, + "step": 1539 + }, + { + "epoch": 1.9978913219789132, + "grad_norm": 0.5435559153556824, + "learning_rate": 5.524554445039838e-06, + "loss": 0.640510082244873, + "step": 1540 + }, + { + "epoch": 1.9991889699918897, + "grad_norm": 0.5433489084243774, + "learning_rate": 5.511771561245813e-06, + "loss": 0.5800854563713074, + "step": 1541 + }, + { + "epoch": 2.0, + "grad_norm": 0.6513635516166687, + "learning_rate": 5.498997854742956e-06, + "loss": 0.546117901802063, + "step": 1542 + }, + { + "epoch": 2.0012976480129763, + "grad_norm": 0.7124117016792297, + "learning_rate": 5.4862333516502634e-06, + "loss": 0.5231295824050903, + "step": 1543 + }, + { + "epoch": 2.002595296025953, + "grad_norm": 0.727088451385498, + "learning_rate": 5.473478078067913e-06, + "loss": 0.5810973644256592, + "step": 1544 + }, + { + "epoch": 2.0038929440389293, + "grad_norm": 0.6788406372070312, + "learning_rate": 5.460732060077212e-06, + "loss": 0.47124871611595154, + "step": 1545 + }, + { + "epoch": 2.005190592051906, + "grad_norm": 0.6010527610778809, + "learning_rate": 5.44799532374054e-06, + "loss": 0.5422745943069458, + "step": 1546 + }, + { + "epoch": 2.0064882400648822, + "grad_norm": 0.609658420085907, + "learning_rate": 5.435267895101303e-06, + "loss": 0.48424142599105835, + "step": 1547 + }, + { + "epoch": 2.007785888077859, + "grad_norm": 0.5703460574150085, + "learning_rate": 5.422549800183861e-06, + "loss": 0.5136675834655762, + "step": 1548 + }, + { + "epoch": 2.0090835360908352, + "grad_norm": 0.5782158970832825, + "learning_rate": 5.409841064993512e-06, + "loss": 0.509381890296936, + "step": 1549 + }, + { + "epoch": 2.010381184103812, + "grad_norm": 0.6222527623176575, + "learning_rate": 5.39714171551639e-06, + "loss": 0.4843388795852661, + "step": 1550 + }, + { + "epoch": 2.011678832116788, + "grad_norm": 0.7037692666053772, + "learning_rate": 5.384451777719464e-06, + "loss": 0.5681462287902832, + "step": 1551 + }, + { + "epoch": 2.012976480129765, + "grad_norm": 0.7455988526344299, + "learning_rate": 5.371771277550432e-06, + "loss": 0.551672101020813, + "step": 1552 + }, + { + "epoch": 2.014274128142741, + "grad_norm": 0.7268160581588745, + "learning_rate": 5.359100240937717e-06, + "loss": 0.5382372140884399, + "step": 1553 + }, + { + "epoch": 2.015571776155718, + "grad_norm": 0.6356255412101746, + "learning_rate": 5.3464386937903764e-06, + "loss": 0.5280675888061523, + "step": 1554 + }, + { + "epoch": 2.016869424168694, + "grad_norm": 0.5975467562675476, + "learning_rate": 5.33378666199807e-06, + "loss": 0.47013112902641296, + "step": 1555 + }, + { + "epoch": 2.018167072181671, + "grad_norm": 0.6236818432807922, + "learning_rate": 5.321144171431003e-06, + "loss": 0.4888884425163269, + "step": 1556 + }, + { + "epoch": 2.019464720194647, + "grad_norm": 0.6166471838951111, + "learning_rate": 5.308511247939872e-06, + "loss": 0.5211419463157654, + "step": 1557 + }, + { + "epoch": 2.020762368207624, + "grad_norm": 0.6095893383026123, + "learning_rate": 5.295887917355794e-06, + "loss": 0.5085535049438477, + "step": 1558 + }, + { + "epoch": 2.0220600162206, + "grad_norm": 0.6039384007453918, + "learning_rate": 5.283274205490303e-06, + "loss": 0.4754714369773865, + "step": 1559 + }, + { + "epoch": 2.0233576642335764, + "grad_norm": 0.6331435441970825, + "learning_rate": 5.270670138135234e-06, + "loss": 0.5521947145462036, + "step": 1560 + }, + { + "epoch": 2.024655312246553, + "grad_norm": 0.6151823997497559, + "learning_rate": 5.25807574106272e-06, + "loss": 0.5278744697570801, + "step": 1561 + }, + { + "epoch": 2.0259529602595294, + "grad_norm": 0.5749709606170654, + "learning_rate": 5.245491040025115e-06, + "loss": 0.4914984107017517, + "step": 1562 + }, + { + "epoch": 2.027250608272506, + "grad_norm": 0.5855306386947632, + "learning_rate": 5.232916060754947e-06, + "loss": 0.5195509195327759, + "step": 1563 + }, + { + "epoch": 2.0285482562854824, + "grad_norm": 0.5908445119857788, + "learning_rate": 5.220350828964865e-06, + "loss": 0.48390451073646545, + "step": 1564 + }, + { + "epoch": 2.029845904298459, + "grad_norm": 0.5874761343002319, + "learning_rate": 5.207795370347588e-06, + "loss": 0.5324580669403076, + "step": 1565 + }, + { + "epoch": 2.0311435523114354, + "grad_norm": 0.5893219709396362, + "learning_rate": 5.195249710575853e-06, + "loss": 0.5100334286689758, + "step": 1566 + }, + { + "epoch": 2.032441200324412, + "grad_norm": 0.5876151919364929, + "learning_rate": 5.182713875302361e-06, + "loss": 0.4768049716949463, + "step": 1567 + }, + { + "epoch": 2.0337388483373884, + "grad_norm": 0.6265038251876831, + "learning_rate": 5.1701878901597106e-06, + "loss": 0.5602673292160034, + "step": 1568 + }, + { + "epoch": 2.035036496350365, + "grad_norm": 0.5975306034088135, + "learning_rate": 5.157671780760385e-06, + "loss": 0.5052694082260132, + "step": 1569 + }, + { + "epoch": 2.0363341443633414, + "grad_norm": 0.5611022114753723, + "learning_rate": 5.145165572696652e-06, + "loss": 0.49101999402046204, + "step": 1570 + }, + { + "epoch": 2.037631792376318, + "grad_norm": 0.5829542875289917, + "learning_rate": 5.132669291540544e-06, + "loss": 0.474854052066803, + "step": 1571 + }, + { + "epoch": 2.0389294403892944, + "grad_norm": 0.5918568968772888, + "learning_rate": 5.1201829628437926e-06, + "loss": 0.4853309988975525, + "step": 1572 + }, + { + "epoch": 2.040227088402271, + "grad_norm": 0.5785784125328064, + "learning_rate": 5.107706612137776e-06, + "loss": 0.5171955227851868, + "step": 1573 + }, + { + "epoch": 2.0415247364152473, + "grad_norm": 0.5528171062469482, + "learning_rate": 5.095240264933486e-06, + "loss": 0.47794681787490845, + "step": 1574 + }, + { + "epoch": 2.042822384428224, + "grad_norm": 0.5567626357078552, + "learning_rate": 5.082783946721434e-06, + "loss": 0.4940184950828552, + "step": 1575 + }, + { + "epoch": 2.0441200324412003, + "grad_norm": 0.5630913376808167, + "learning_rate": 5.070337682971642e-06, + "loss": 0.5437344312667847, + "step": 1576 + }, + { + "epoch": 2.0454176804541766, + "grad_norm": 0.5575384497642517, + "learning_rate": 5.057901499133573e-06, + "loss": 0.49236786365509033, + "step": 1577 + }, + { + "epoch": 2.0467153284671533, + "grad_norm": 0.5638654828071594, + "learning_rate": 5.0454754206360705e-06, + "loss": 0.4736412465572357, + "step": 1578 + }, + { + "epoch": 2.0480129764801296, + "grad_norm": 0.5577630996704102, + "learning_rate": 5.033059472887322e-06, + "loss": 0.5147624015808105, + "step": 1579 + }, + { + "epoch": 2.0493106244931063, + "grad_norm": 0.5717137455940247, + "learning_rate": 5.0206536812748004e-06, + "loss": 0.4905228614807129, + "step": 1580 + }, + { + "epoch": 2.0506082725060826, + "grad_norm": 0.5646504759788513, + "learning_rate": 5.008258071165202e-06, + "loss": 0.5036407113075256, + "step": 1581 + }, + { + "epoch": 2.0519059205190593, + "grad_norm": 0.5792942047119141, + "learning_rate": 4.995872667904424e-06, + "loss": 0.5340180993080139, + "step": 1582 + }, + { + "epoch": 2.0532035685320356, + "grad_norm": 0.573951244354248, + "learning_rate": 4.98349749681747e-06, + "loss": 0.4675467610359192, + "step": 1583 + }, + { + "epoch": 2.0545012165450123, + "grad_norm": 0.5502886772155762, + "learning_rate": 4.971132583208438e-06, + "loss": 0.4816184937953949, + "step": 1584 + }, + { + "epoch": 2.0557988645579885, + "grad_norm": 0.5748745203018188, + "learning_rate": 4.958777952360445e-06, + "loss": 0.49751102924346924, + "step": 1585 + }, + { + "epoch": 2.0570965125709653, + "grad_norm": 0.593724250793457, + "learning_rate": 4.946433629535585e-06, + "loss": 0.48918506503105164, + "step": 1586 + }, + { + "epoch": 2.0583941605839415, + "grad_norm": 0.5852590799331665, + "learning_rate": 4.934099639974874e-06, + "loss": 0.5142393708229065, + "step": 1587 + }, + { + "epoch": 2.0596918085969182, + "grad_norm": 0.5500675439834595, + "learning_rate": 4.921776008898198e-06, + "loss": 0.43804582953453064, + "step": 1588 + }, + { + "epoch": 2.0609894566098945, + "grad_norm": 0.572162389755249, + "learning_rate": 4.909462761504264e-06, + "loss": 0.5290922522544861, + "step": 1589 + }, + { + "epoch": 2.0622871046228712, + "grad_norm": 0.5475997924804688, + "learning_rate": 4.897159922970551e-06, + "loss": 0.489504873752594, + "step": 1590 + }, + { + "epoch": 2.0635847526358475, + "grad_norm": 0.5753741264343262, + "learning_rate": 4.884867518453238e-06, + "loss": 0.5394560694694519, + "step": 1591 + }, + { + "epoch": 2.0648824006488242, + "grad_norm": 0.5752173662185669, + "learning_rate": 4.872585573087195e-06, + "loss": 0.5700497627258301, + "step": 1592 + }, + { + "epoch": 2.0661800486618005, + "grad_norm": 0.5844142436981201, + "learning_rate": 4.860314111985881e-06, + "loss": 0.5502715110778809, + "step": 1593 + }, + { + "epoch": 2.0674776966747768, + "grad_norm": 0.5586737990379333, + "learning_rate": 4.848053160241333e-06, + "loss": 0.48312538862228394, + "step": 1594 + }, + { + "epoch": 2.0687753446877535, + "grad_norm": 0.5547072887420654, + "learning_rate": 4.835802742924091e-06, + "loss": 0.4890977442264557, + "step": 1595 + }, + { + "epoch": 2.0700729927007298, + "grad_norm": 0.5696388483047485, + "learning_rate": 4.823562885083161e-06, + "loss": 0.5179868936538696, + "step": 1596 + }, + { + "epoch": 2.0713706407137065, + "grad_norm": 0.5792607069015503, + "learning_rate": 4.811333611745953e-06, + "loss": 0.5098393559455872, + "step": 1597 + }, + { + "epoch": 2.0726682887266827, + "grad_norm": 0.5769554972648621, + "learning_rate": 4.799114947918238e-06, + "loss": 0.4976171553134918, + "step": 1598 + }, + { + "epoch": 2.0739659367396595, + "grad_norm": 0.6067489981651306, + "learning_rate": 4.786906918584083e-06, + "loss": 0.5139312148094177, + "step": 1599 + }, + { + "epoch": 2.0752635847526357, + "grad_norm": 0.5910279750823975, + "learning_rate": 4.774709548705831e-06, + "loss": 0.5157588720321655, + "step": 1600 + }, + { + "epoch": 2.0765612327656124, + "grad_norm": 0.5831329226493835, + "learning_rate": 4.762522863224001e-06, + "loss": 0.5141895413398743, + "step": 1601 + }, + { + "epoch": 2.0778588807785887, + "grad_norm": 0.5735464692115784, + "learning_rate": 4.750346887057292e-06, + "loss": 0.47724485397338867, + "step": 1602 + }, + { + "epoch": 2.0791565287915654, + "grad_norm": 0.5806788206100464, + "learning_rate": 4.738181645102493e-06, + "loss": 0.4755935072898865, + "step": 1603 + }, + { + "epoch": 2.0804541768045417, + "grad_norm": 0.5973532199859619, + "learning_rate": 4.726027162234434e-06, + "loss": 0.5464816093444824, + "step": 1604 + }, + { + "epoch": 2.0817518248175184, + "grad_norm": 0.5893049240112305, + "learning_rate": 4.713883463305972e-06, + "loss": 0.5293697118759155, + "step": 1605 + }, + { + "epoch": 2.0830494728304947, + "grad_norm": 0.5956568717956543, + "learning_rate": 4.701750573147885e-06, + "loss": 0.5268076658248901, + "step": 1606 + }, + { + "epoch": 2.0843471208434714, + "grad_norm": 0.5941202044487, + "learning_rate": 4.689628516568866e-06, + "loss": 0.526781439781189, + "step": 1607 + }, + { + "epoch": 2.0856447688564477, + "grad_norm": 0.5724000334739685, + "learning_rate": 4.677517318355455e-06, + "loss": 0.5051593780517578, + "step": 1608 + }, + { + "epoch": 2.086942416869424, + "grad_norm": 0.5567840933799744, + "learning_rate": 4.6654170032719825e-06, + "loss": 0.48566874861717224, + "step": 1609 + }, + { + "epoch": 2.0882400648824007, + "grad_norm": 0.5653722882270813, + "learning_rate": 4.6533275960605355e-06, + "loss": 0.5071468353271484, + "step": 1610 + }, + { + "epoch": 2.0882400648824007, + "eval_loss": 0.6963403820991516, + "eval_runtime": 72.3826, + "eval_samples_per_second": 71.73, + "eval_steps_per_second": 8.966, + "step": 1610 + }, + { + "epoch": 2.089537712895377, + "grad_norm": 0.5640507340431213, + "learning_rate": 4.641249121440892e-06, + "loss": 0.5107710361480713, + "step": 1611 + }, + { + "epoch": 2.0908353609083536, + "grad_norm": 0.5841313004493713, + "learning_rate": 4.629181604110464e-06, + "loss": 0.5194936990737915, + "step": 1612 + }, + { + "epoch": 2.09213300892133, + "grad_norm": 0.5427317023277283, + "learning_rate": 4.617125068744288e-06, + "loss": 0.44176995754241943, + "step": 1613 + }, + { + "epoch": 2.0934306569343066, + "grad_norm": 0.6006700992584229, + "learning_rate": 4.605079539994911e-06, + "loss": 0.5314173102378845, + "step": 1614 + }, + { + "epoch": 2.094728304947283, + "grad_norm": 0.5708412528038025, + "learning_rate": 4.593045042492404e-06, + "loss": 0.5313728451728821, + "step": 1615 + }, + { + "epoch": 2.0960259529602596, + "grad_norm": 0.5850820541381836, + "learning_rate": 4.581021600844258e-06, + "loss": 0.4967271089553833, + "step": 1616 + }, + { + "epoch": 2.097323600973236, + "grad_norm": 0.5869132280349731, + "learning_rate": 4.569009239635374e-06, + "loss": 0.5268970727920532, + "step": 1617 + }, + { + "epoch": 2.0986212489862126, + "grad_norm": 0.5825201869010925, + "learning_rate": 4.557007983427987e-06, + "loss": 0.5315977334976196, + "step": 1618 + }, + { + "epoch": 2.099918896999189, + "grad_norm": 0.5721443891525269, + "learning_rate": 4.54501785676163e-06, + "loss": 0.4732065498828888, + "step": 1619 + }, + { + "epoch": 2.1012165450121656, + "grad_norm": 0.5872232913970947, + "learning_rate": 4.533038884153077e-06, + "loss": 0.5813014507293701, + "step": 1620 + }, + { + "epoch": 2.102514193025142, + "grad_norm": 0.5751720666885376, + "learning_rate": 4.521071090096298e-06, + "loss": 0.4687768518924713, + "step": 1621 + }, + { + "epoch": 2.1038118410381186, + "grad_norm": 0.5663445591926575, + "learning_rate": 4.509114499062393e-06, + "loss": 0.49182090163230896, + "step": 1622 + }, + { + "epoch": 2.105109489051095, + "grad_norm": 0.5650926828384399, + "learning_rate": 4.4971691354995795e-06, + "loss": 0.5067583322525024, + "step": 1623 + }, + { + "epoch": 2.1064071370640716, + "grad_norm": 0.6090897917747498, + "learning_rate": 4.485235023833087e-06, + "loss": 0.5684949159622192, + "step": 1624 + }, + { + "epoch": 2.107704785077048, + "grad_norm": 0.6066005229949951, + "learning_rate": 4.4733121884651665e-06, + "loss": 0.5100910067558289, + "step": 1625 + }, + { + "epoch": 2.1090024330900246, + "grad_norm": 0.5951321125030518, + "learning_rate": 4.46140065377499e-06, + "loss": 0.4774884283542633, + "step": 1626 + }, + { + "epoch": 2.110300081103001, + "grad_norm": 0.5725848078727722, + "learning_rate": 4.449500444118633e-06, + "loss": 0.5018754005432129, + "step": 1627 + }, + { + "epoch": 2.111597729115977, + "grad_norm": 0.5799410343170166, + "learning_rate": 4.437611583829014e-06, + "loss": 0.49752479791641235, + "step": 1628 + }, + { + "epoch": 2.112895377128954, + "grad_norm": 0.5619634985923767, + "learning_rate": 4.42573409721584e-06, + "loss": 0.4756616950035095, + "step": 1629 + }, + { + "epoch": 2.11419302514193, + "grad_norm": 0.5556355118751526, + "learning_rate": 4.413868008565569e-06, + "loss": 0.4895199239253998, + "step": 1630 + }, + { + "epoch": 2.115490673154907, + "grad_norm": 0.5813250541687012, + "learning_rate": 4.402013342141347e-06, + "loss": 0.45987099409103394, + "step": 1631 + }, + { + "epoch": 2.116788321167883, + "grad_norm": 0.5723846554756165, + "learning_rate": 4.390170122182965e-06, + "loss": 0.4845224916934967, + "step": 1632 + }, + { + "epoch": 2.11808596918086, + "grad_norm": 0.5540896058082581, + "learning_rate": 4.378338372906813e-06, + "loss": 0.4948923587799072, + "step": 1633 + }, + { + "epoch": 2.119383617193836, + "grad_norm": 0.61214679479599, + "learning_rate": 4.3665181185058255e-06, + "loss": 0.5314114093780518, + "step": 1634 + }, + { + "epoch": 2.1206812652068128, + "grad_norm": 0.5635900497436523, + "learning_rate": 4.354709383149421e-06, + "loss": 0.4875974655151367, + "step": 1635 + }, + { + "epoch": 2.121978913219789, + "grad_norm": 0.5833781957626343, + "learning_rate": 4.342912190983487e-06, + "loss": 0.5470179915428162, + "step": 1636 + }, + { + "epoch": 2.1232765612327658, + "grad_norm": 0.5999435782432556, + "learning_rate": 4.331126566130284e-06, + "loss": 0.5479536056518555, + "step": 1637 + }, + { + "epoch": 2.124574209245742, + "grad_norm": 0.589368999004364, + "learning_rate": 4.319352532688444e-06, + "loss": 0.5104061961174011, + "step": 1638 + }, + { + "epoch": 2.1258718572587187, + "grad_norm": 0.5677252411842346, + "learning_rate": 4.3075901147328745e-06, + "loss": 0.5259417295455933, + "step": 1639 + }, + { + "epoch": 2.127169505271695, + "grad_norm": 0.5625855326652527, + "learning_rate": 4.295839336314749e-06, + "loss": 0.49216002225875854, + "step": 1640 + }, + { + "epoch": 2.1284671532846717, + "grad_norm": 0.5749784111976624, + "learning_rate": 4.284100221461432e-06, + "loss": 0.47341352701187134, + "step": 1641 + }, + { + "epoch": 2.129764801297648, + "grad_norm": 0.5952023267745972, + "learning_rate": 4.272372794176446e-06, + "loss": 0.5849668979644775, + "step": 1642 + }, + { + "epoch": 2.1310624493106243, + "grad_norm": 0.6117653250694275, + "learning_rate": 4.260657078439409e-06, + "loss": 0.5250235795974731, + "step": 1643 + }, + { + "epoch": 2.132360097323601, + "grad_norm": 0.5717377662658691, + "learning_rate": 4.248953098205997e-06, + "loss": 0.49503540992736816, + "step": 1644 + }, + { + "epoch": 2.1336577453365773, + "grad_norm": 0.5875842571258545, + "learning_rate": 4.237260877407878e-06, + "loss": 0.5329856872558594, + "step": 1645 + }, + { + "epoch": 2.134955393349554, + "grad_norm": 0.5664336085319519, + "learning_rate": 4.225580439952699e-06, + "loss": 0.5302871465682983, + "step": 1646 + }, + { + "epoch": 2.1362530413625302, + "grad_norm": 0.5786408185958862, + "learning_rate": 4.213911809723987e-06, + "loss": 0.49267759919166565, + "step": 1647 + }, + { + "epoch": 2.137550689375507, + "grad_norm": 0.5607128143310547, + "learning_rate": 4.20225501058114e-06, + "loss": 0.5211464166641235, + "step": 1648 + }, + { + "epoch": 2.1388483373884832, + "grad_norm": 0.5761646628379822, + "learning_rate": 4.190610066359364e-06, + "loss": 0.5178772211074829, + "step": 1649 + }, + { + "epoch": 2.14014598540146, + "grad_norm": 0.5818209648132324, + "learning_rate": 4.1789770008696205e-06, + "loss": 0.5244809985160828, + "step": 1650 + }, + { + "epoch": 2.141443633414436, + "grad_norm": 0.6208338141441345, + "learning_rate": 4.167355837898585e-06, + "loss": 0.5720170736312866, + "step": 1651 + }, + { + "epoch": 2.142741281427413, + "grad_norm": 0.59494549036026, + "learning_rate": 4.155746601208594e-06, + "loss": 0.5233884453773499, + "step": 1652 + }, + { + "epoch": 2.144038929440389, + "grad_norm": 0.5718002915382385, + "learning_rate": 4.144149314537599e-06, + "loss": 0.48552173376083374, + "step": 1653 + }, + { + "epoch": 2.145336577453366, + "grad_norm": 0.5601415634155273, + "learning_rate": 4.1325640015991185e-06, + "loss": 0.4996642768383026, + "step": 1654 + }, + { + "epoch": 2.146634225466342, + "grad_norm": 0.5795076489448547, + "learning_rate": 4.120990686082174e-06, + "loss": 0.5177854895591736, + "step": 1655 + }, + { + "epoch": 2.147931873479319, + "grad_norm": 0.5665140151977539, + "learning_rate": 4.109429391651283e-06, + "loss": 0.46502965688705444, + "step": 1656 + }, + { + "epoch": 2.149229521492295, + "grad_norm": 0.5985783934593201, + "learning_rate": 4.097880141946354e-06, + "loss": 0.4880366325378418, + "step": 1657 + }, + { + "epoch": 2.150527169505272, + "grad_norm": 0.5875007510185242, + "learning_rate": 4.08634296058268e-06, + "loss": 0.4756428599357605, + "step": 1658 + }, + { + "epoch": 2.151824817518248, + "grad_norm": 0.5694658160209656, + "learning_rate": 4.074817871150887e-06, + "loss": 0.5224863886833191, + "step": 1659 + }, + { + "epoch": 2.153122465531225, + "grad_norm": 0.5686694979667664, + "learning_rate": 4.063304897216856e-06, + "loss": 0.4963817000389099, + "step": 1660 + }, + { + "epoch": 2.154420113544201, + "grad_norm": 0.5916073322296143, + "learning_rate": 4.051804062321706e-06, + "loss": 0.5067265629768372, + "step": 1661 + }, + { + "epoch": 2.1557177615571774, + "grad_norm": 0.5737749338150024, + "learning_rate": 4.040315389981736e-06, + "loss": 0.547669529914856, + "step": 1662 + }, + { + "epoch": 2.157015409570154, + "grad_norm": 0.5631166696548462, + "learning_rate": 4.028838903688372e-06, + "loss": 0.5300416946411133, + "step": 1663 + }, + { + "epoch": 2.1583130575831304, + "grad_norm": 0.5811983942985535, + "learning_rate": 4.017374626908125e-06, + "loss": 0.5100100040435791, + "step": 1664 + }, + { + "epoch": 2.159610705596107, + "grad_norm": 0.571027934551239, + "learning_rate": 4.005922583082538e-06, + "loss": 0.5137525200843811, + "step": 1665 + }, + { + "epoch": 2.1609083536090834, + "grad_norm": 0.5910731554031372, + "learning_rate": 3.994482795628142e-06, + "loss": 0.5244160890579224, + "step": 1666 + }, + { + "epoch": 2.16220600162206, + "grad_norm": 0.5894386768341064, + "learning_rate": 3.983055287936411e-06, + "loss": 0.5517876148223877, + "step": 1667 + }, + { + "epoch": 2.1635036496350364, + "grad_norm": 0.5779116153717041, + "learning_rate": 3.971640083373696e-06, + "loss": 0.5097295045852661, + "step": 1668 + }, + { + "epoch": 2.164801297648013, + "grad_norm": 0.5987510085105896, + "learning_rate": 3.960237205281213e-06, + "loss": 0.511284589767456, + "step": 1669 + }, + { + "epoch": 2.1660989456609894, + "grad_norm": 0.5853222608566284, + "learning_rate": 3.948846676974953e-06, + "loss": 0.5473302602767944, + "step": 1670 + }, + { + "epoch": 2.167396593673966, + "grad_norm": 0.5716820359230042, + "learning_rate": 3.937468521745666e-06, + "loss": 0.4697805345058441, + "step": 1671 + }, + { + "epoch": 2.1686942416869424, + "grad_norm": 0.5948668122291565, + "learning_rate": 3.9261027628588e-06, + "loss": 0.5532658100128174, + "step": 1672 + }, + { + "epoch": 2.169991889699919, + "grad_norm": 0.5779493451118469, + "learning_rate": 3.9147494235544544e-06, + "loss": 0.495819091796875, + "step": 1673 + }, + { + "epoch": 2.1712895377128953, + "grad_norm": 0.588945746421814, + "learning_rate": 3.903408527047336e-06, + "loss": 0.50020432472229, + "step": 1674 + }, + { + "epoch": 2.172587185725872, + "grad_norm": 0.5889913439750671, + "learning_rate": 3.892080096526707e-06, + "loss": 0.5079851150512695, + "step": 1675 + }, + { + "epoch": 2.1738848337388483, + "grad_norm": 0.5692569017410278, + "learning_rate": 3.880764155156339e-06, + "loss": 0.47483527660369873, + "step": 1676 + }, + { + "epoch": 2.1751824817518246, + "grad_norm": 0.6015142202377319, + "learning_rate": 3.8694607260744745e-06, + "loss": 0.5588316321372986, + "step": 1677 + }, + { + "epoch": 2.1764801297648013, + "grad_norm": 0.5825367569923401, + "learning_rate": 3.858169832393752e-06, + "loss": 0.5049576759338379, + "step": 1678 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.6517031788825989, + "learning_rate": 3.846891497201206e-06, + "loss": 0.5698549151420593, + "step": 1679 + }, + { + "epoch": 2.1790754257907543, + "grad_norm": 0.5972406268119812, + "learning_rate": 3.835625743558168e-06, + "loss": 0.5489758253097534, + "step": 1680 + }, + { + "epoch": 2.1803730738037306, + "grad_norm": 0.590186595916748, + "learning_rate": 3.824372594500256e-06, + "loss": 0.5560799837112427, + "step": 1681 + }, + { + "epoch": 2.1816707218167073, + "grad_norm": 0.6042253375053406, + "learning_rate": 3.813132073037309e-06, + "loss": 0.5188357830047607, + "step": 1682 + }, + { + "epoch": 2.1829683698296836, + "grad_norm": 0.5862630605697632, + "learning_rate": 3.8019042021533513e-06, + "loss": 0.49817925691604614, + "step": 1683 + }, + { + "epoch": 2.1842660178426603, + "grad_norm": 0.5700656175613403, + "learning_rate": 3.7906890048065358e-06, + "loss": 0.5223833322525024, + "step": 1684 + }, + { + "epoch": 2.1855636658556366, + "grad_norm": 0.5849031805992126, + "learning_rate": 3.779486503929106e-06, + "loss": 0.5123599767684937, + "step": 1685 + }, + { + "epoch": 2.1868613138686133, + "grad_norm": 0.5997171998023987, + "learning_rate": 3.7682967224273317e-06, + "loss": 0.5369530320167542, + "step": 1686 + }, + { + "epoch": 2.1881589618815895, + "grad_norm": 0.5994778275489807, + "learning_rate": 3.757119683181493e-06, + "loss": 0.47989219427108765, + "step": 1687 + }, + { + "epoch": 2.1894566098945663, + "grad_norm": 0.5771443247795105, + "learning_rate": 3.7459554090458018e-06, + "loss": 0.4408413767814636, + "step": 1688 + }, + { + "epoch": 2.1907542579075425, + "grad_norm": 0.5725969672203064, + "learning_rate": 3.7348039228483758e-06, + "loss": 0.46296805143356323, + "step": 1689 + }, + { + "epoch": 2.1920519059205192, + "grad_norm": 0.5743042826652527, + "learning_rate": 3.7236652473911817e-06, + "loss": 0.482837975025177, + "step": 1690 + }, + { + "epoch": 2.1933495539334955, + "grad_norm": 0.5836053490638733, + "learning_rate": 3.7125394054499843e-06, + "loss": 0.5156795978546143, + "step": 1691 + }, + { + "epoch": 2.1946472019464722, + "grad_norm": 0.5889219641685486, + "learning_rate": 3.7014264197743267e-06, + "loss": 0.5081969499588013, + "step": 1692 + }, + { + "epoch": 2.1959448499594485, + "grad_norm": 0.6140073537826538, + "learning_rate": 3.6903263130874423e-06, + "loss": 0.5605005025863647, + "step": 1693 + }, + { + "epoch": 2.197242497972425, + "grad_norm": 0.5697020292282104, + "learning_rate": 3.679239108086241e-06, + "loss": 0.5305500030517578, + "step": 1694 + }, + { + "epoch": 2.1985401459854015, + "grad_norm": 0.5989742875099182, + "learning_rate": 3.668164827441254e-06, + "loss": 0.5370711088180542, + "step": 1695 + }, + { + "epoch": 2.1998377939983778, + "grad_norm": 0.608519971370697, + "learning_rate": 3.657103493796581e-06, + "loss": 0.5120800137519836, + "step": 1696 + }, + { + "epoch": 2.2011354420113545, + "grad_norm": 0.5787931084632874, + "learning_rate": 3.6460551297698486e-06, + "loss": 0.5016961693763733, + "step": 1697 + }, + { + "epoch": 2.2024330900243307, + "grad_norm": 0.5809414982795715, + "learning_rate": 3.6350197579521696e-06, + "loss": 0.5177795886993408, + "step": 1698 + }, + { + "epoch": 2.2037307380373075, + "grad_norm": 0.6027206778526306, + "learning_rate": 3.6239974009080746e-06, + "loss": 0.500653862953186, + "step": 1699 + }, + { + "epoch": 2.2050283860502837, + "grad_norm": 0.5894326567649841, + "learning_rate": 3.6129880811755093e-06, + "loss": 0.5206901431083679, + "step": 1700 + }, + { + "epoch": 2.2063260340632604, + "grad_norm": 0.591676652431488, + "learning_rate": 3.601991821265731e-06, + "loss": 0.49031156301498413, + "step": 1701 + }, + { + "epoch": 2.2076236820762367, + "grad_norm": 0.567371666431427, + "learning_rate": 3.591008643663323e-06, + "loss": 0.49885687232017517, + "step": 1702 + }, + { + "epoch": 2.2089213300892134, + "grad_norm": 0.5756494998931885, + "learning_rate": 3.580038570826093e-06, + "loss": 0.499514639377594, + "step": 1703 + }, + { + "epoch": 2.2102189781021897, + "grad_norm": 0.5830073356628418, + "learning_rate": 3.5690816251850657e-06, + "loss": 0.4895148277282715, + "step": 1704 + }, + { + "epoch": 2.2115166261151664, + "grad_norm": 0.6235371828079224, + "learning_rate": 3.5581378291444223e-06, + "loss": 0.5166549682617188, + "step": 1705 + }, + { + "epoch": 2.2128142741281427, + "grad_norm": 0.5604133605957031, + "learning_rate": 3.5472072050814565e-06, + "loss": 0.4416266083717346, + "step": 1706 + }, + { + "epoch": 2.2141119221411194, + "grad_norm": 0.5687461495399475, + "learning_rate": 3.5362897753465265e-06, + "loss": 0.48436877131462097, + "step": 1707 + }, + { + "epoch": 2.2154095701540957, + "grad_norm": 0.5818923115730286, + "learning_rate": 3.5253855622630174e-06, + "loss": 0.5402669906616211, + "step": 1708 + }, + { + "epoch": 2.2167072181670724, + "grad_norm": 0.6057185530662537, + "learning_rate": 3.514494588127275e-06, + "loss": 0.5666176080703735, + "step": 1709 + }, + { + "epoch": 2.2180048661800487, + "grad_norm": 0.5755799412727356, + "learning_rate": 3.5036168752085977e-06, + "loss": 0.48957937955856323, + "step": 1710 + }, + { + "epoch": 2.219302514193025, + "grad_norm": 0.5948247313499451, + "learning_rate": 3.4927524457491456e-06, + "loss": 0.4885704219341278, + "step": 1711 + }, + { + "epoch": 2.2206001622060016, + "grad_norm": 0.5859489440917969, + "learning_rate": 3.4819013219639295e-06, + "loss": 0.4678208827972412, + "step": 1712 + }, + { + "epoch": 2.221897810218978, + "grad_norm": 0.5540412068367004, + "learning_rate": 3.471063526040752e-06, + "loss": 0.481825053691864, + "step": 1713 + }, + { + "epoch": 2.2231954582319546, + "grad_norm": 0.5437055826187134, + "learning_rate": 3.460239080140163e-06, + "loss": 0.4387455880641937, + "step": 1714 + }, + { + "epoch": 2.224493106244931, + "grad_norm": 0.5966470241546631, + "learning_rate": 3.4494280063954146e-06, + "loss": 0.545790433883667, + "step": 1715 + }, + { + "epoch": 2.2257907542579076, + "grad_norm": 0.5654957294464111, + "learning_rate": 3.4386303269124142e-06, + "loss": 0.4880921244621277, + "step": 1716 + }, + { + "epoch": 2.227088402270884, + "grad_norm": 0.5839219689369202, + "learning_rate": 3.4278460637696865e-06, + "loss": 0.5272015333175659, + "step": 1717 + }, + { + "epoch": 2.2283860502838606, + "grad_norm": 0.5752228498458862, + "learning_rate": 3.4170752390183183e-06, + "loss": 0.5249931812286377, + "step": 1718 + }, + { + "epoch": 2.229683698296837, + "grad_norm": 0.580033540725708, + "learning_rate": 3.4063178746819193e-06, + "loss": 0.4954257309436798, + "step": 1719 + }, + { + "epoch": 2.2309813463098136, + "grad_norm": 0.5703238844871521, + "learning_rate": 3.395573992756579e-06, + "loss": 0.502043604850769, + "step": 1720 + }, + { + "epoch": 2.23227899432279, + "grad_norm": 0.5960628986358643, + "learning_rate": 3.384843615210819e-06, + "loss": 0.5299471616744995, + "step": 1721 + }, + { + "epoch": 2.2335766423357666, + "grad_norm": 0.5959639549255371, + "learning_rate": 3.3741267639855345e-06, + "loss": 0.6064699292182922, + "step": 1722 + }, + { + "epoch": 2.234874290348743, + "grad_norm": 0.5705887079238892, + "learning_rate": 3.3634234609939888e-06, + "loss": 0.49739521741867065, + "step": 1723 + }, + { + "epoch": 2.2361719383617196, + "grad_norm": 0.5743765830993652, + "learning_rate": 3.352733728121712e-06, + "loss": 0.5017514228820801, + "step": 1724 + }, + { + "epoch": 2.237469586374696, + "grad_norm": 0.5511932969093323, + "learning_rate": 3.3420575872265184e-06, + "loss": 0.4473830759525299, + "step": 1725 + }, + { + "epoch": 2.238767234387672, + "grad_norm": 0.5601068139076233, + "learning_rate": 3.3313950601384016e-06, + "loss": 0.4705375134944916, + "step": 1726 + }, + { + "epoch": 2.240064882400649, + "grad_norm": 0.5842630863189697, + "learning_rate": 3.320746168659534e-06, + "loss": 0.5488964319229126, + "step": 1727 + }, + { + "epoch": 2.241362530413625, + "grad_norm": 0.5851315855979919, + "learning_rate": 3.3101109345642056e-06, + "loss": 0.4903653860092163, + "step": 1728 + }, + { + "epoch": 2.242660178426602, + "grad_norm": 0.5913082361221313, + "learning_rate": 3.299489379598777e-06, + "loss": 0.5187092423439026, + "step": 1729 + }, + { + "epoch": 2.243957826439578, + "grad_norm": 0.5963798761367798, + "learning_rate": 3.288881525481639e-06, + "loss": 0.5145666003227234, + "step": 1730 + }, + { + "epoch": 2.245255474452555, + "grad_norm": 0.5765670537948608, + "learning_rate": 3.278287393903172e-06, + "loss": 0.47934818267822266, + "step": 1731 + }, + { + "epoch": 2.246553122465531, + "grad_norm": 0.5776212215423584, + "learning_rate": 3.2677070065256855e-06, + "loss": 0.5102344751358032, + "step": 1732 + }, + { + "epoch": 2.247850770478508, + "grad_norm": 0.5738791823387146, + "learning_rate": 3.257140384983405e-06, + "loss": 0.5097633600234985, + "step": 1733 + }, + { + "epoch": 2.249148418491484, + "grad_norm": 0.5827375650405884, + "learning_rate": 3.2465875508823876e-06, + "loss": 0.49323970079421997, + "step": 1734 + }, + { + "epoch": 2.2504460665044608, + "grad_norm": 0.5527526140213013, + "learning_rate": 3.2360485258005115e-06, + "loss": 0.47956135869026184, + "step": 1735 + }, + { + "epoch": 2.251743714517437, + "grad_norm": 0.581285297870636, + "learning_rate": 3.2255233312874155e-06, + "loss": 0.5309310555458069, + "step": 1736 + }, + { + "epoch": 2.2530413625304138, + "grad_norm": 0.6052958965301514, + "learning_rate": 3.2150119888644594e-06, + "loss": 0.5168576240539551, + "step": 1737 + }, + { + "epoch": 2.25433901054339, + "grad_norm": 0.5458951592445374, + "learning_rate": 3.2045145200246763e-06, + "loss": 0.45663541555404663, + "step": 1738 + }, + { + "epoch": 2.2556366585563667, + "grad_norm": 0.6066997647285461, + "learning_rate": 3.1940309462327334e-06, + "loss": 0.5442982912063599, + "step": 1739 + }, + { + "epoch": 2.256934306569343, + "grad_norm": 0.5723252296447754, + "learning_rate": 3.1835612889248868e-06, + "loss": 0.5069276094436646, + "step": 1740 + }, + { + "epoch": 2.2582319545823197, + "grad_norm": 0.571399986743927, + "learning_rate": 3.1731055695089384e-06, + "loss": 0.46238988637924194, + "step": 1741 + }, + { + "epoch": 2.259529602595296, + "grad_norm": 0.5810062289237976, + "learning_rate": 3.162663809364178e-06, + "loss": 0.5127156972885132, + "step": 1742 + }, + { + "epoch": 2.2608272506082727, + "grad_norm": 0.57572340965271, + "learning_rate": 3.152236029841376e-06, + "loss": 0.4930036664009094, + "step": 1743 + }, + { + "epoch": 2.262124898621249, + "grad_norm": 0.580849826335907, + "learning_rate": 3.1418222522626907e-06, + "loss": 0.5655021071434021, + "step": 1744 + }, + { + "epoch": 2.2634225466342253, + "grad_norm": 0.5487149953842163, + "learning_rate": 3.1314224979216633e-06, + "loss": 0.4654723107814789, + "step": 1745 + }, + { + "epoch": 2.264720194647202, + "grad_norm": 0.5340819954872131, + "learning_rate": 3.1210367880831684e-06, + "loss": 0.4503304362297058, + "step": 1746 + }, + { + "epoch": 2.2660178426601782, + "grad_norm": 0.5930841565132141, + "learning_rate": 3.1106651439833434e-06, + "loss": 0.5008471608161926, + "step": 1747 + }, + { + "epoch": 2.267315490673155, + "grad_norm": 0.6097638010978699, + "learning_rate": 3.1003075868295794e-06, + "loss": 0.5474433898925781, + "step": 1748 + }, + { + "epoch": 2.2686131386861312, + "grad_norm": 0.5703378319740295, + "learning_rate": 3.0899641378004596e-06, + "loss": 0.4988810420036316, + "step": 1749 + }, + { + "epoch": 2.269910786699108, + "grad_norm": 0.5475755333900452, + "learning_rate": 3.079634818045719e-06, + "loss": 0.4420495927333832, + "step": 1750 + }, + { + "epoch": 2.2712084347120842, + "grad_norm": 0.5802868008613586, + "learning_rate": 3.069319648686202e-06, + "loss": 0.4927031397819519, + "step": 1751 + }, + { + "epoch": 2.272506082725061, + "grad_norm": 0.5564054846763611, + "learning_rate": 3.0590186508138186e-06, + "loss": 0.4879905581474304, + "step": 1752 + }, + { + "epoch": 2.273803730738037, + "grad_norm": 0.5730741620063782, + "learning_rate": 3.048731845491504e-06, + "loss": 0.4577972888946533, + "step": 1753 + }, + { + "epoch": 2.275101378751014, + "grad_norm": 0.5826799869537354, + "learning_rate": 3.038459253753172e-06, + "loss": 0.49198514223098755, + "step": 1754 + }, + { + "epoch": 2.27639902676399, + "grad_norm": 0.5650803446769714, + "learning_rate": 3.0282008966036647e-06, + "loss": 0.48484641313552856, + "step": 1755 + }, + { + "epoch": 2.277696674776967, + "grad_norm": 0.579980731010437, + "learning_rate": 3.0179567950187396e-06, + "loss": 0.4821101427078247, + "step": 1756 + }, + { + "epoch": 2.278994322789943, + "grad_norm": 0.562907874584198, + "learning_rate": 3.0077269699449795e-06, + "loss": 0.47341495752334595, + "step": 1757 + }, + { + "epoch": 2.28029197080292, + "grad_norm": 0.584148108959198, + "learning_rate": 2.9975114422997932e-06, + "loss": 0.48562386631965637, + "step": 1758 + }, + { + "epoch": 2.281589618815896, + "grad_norm": 0.5975433588027954, + "learning_rate": 2.9873102329713478e-06, + "loss": 0.5041466951370239, + "step": 1759 + }, + { + "epoch": 2.2828872668288724, + "grad_norm": 0.5545569062232971, + "learning_rate": 2.9771233628185346e-06, + "loss": 0.45113393664360046, + "step": 1760 + }, + { + "epoch": 2.284184914841849, + "grad_norm": 0.5939710140228271, + "learning_rate": 2.9669508526709256e-06, + "loss": 0.550965428352356, + "step": 1761 + }, + { + "epoch": 2.285482562854826, + "grad_norm": 0.6028052568435669, + "learning_rate": 2.9567927233287307e-06, + "loss": 0.5310263633728027, + "step": 1762 + }, + { + "epoch": 2.286780210867802, + "grad_norm": 0.5738025903701782, + "learning_rate": 2.9466489955627452e-06, + "loss": 0.5576157569885254, + "step": 1763 + }, + { + "epoch": 2.2880778588807784, + "grad_norm": 0.5776515007019043, + "learning_rate": 2.936519690114338e-06, + "loss": 0.4818328022956848, + "step": 1764 + }, + { + "epoch": 2.289375506893755, + "grad_norm": 0.5612311363220215, + "learning_rate": 2.9264048276953606e-06, + "loss": 0.4919436573982239, + "step": 1765 + }, + { + "epoch": 2.2906731549067314, + "grad_norm": 0.5739221572875977, + "learning_rate": 2.9163044289881604e-06, + "loss": 0.5123167634010315, + "step": 1766 + }, + { + "epoch": 2.291970802919708, + "grad_norm": 0.5849712491035461, + "learning_rate": 2.906218514645487e-06, + "loss": 0.48645591735839844, + "step": 1767 + }, + { + "epoch": 2.2932684509326844, + "grad_norm": 0.5921924114227295, + "learning_rate": 2.8961471052904855e-06, + "loss": 0.5228952169418335, + "step": 1768 + }, + { + "epoch": 2.294566098945661, + "grad_norm": 0.5667364001274109, + "learning_rate": 2.8860902215166374e-06, + "loss": 0.4713795781135559, + "step": 1769 + }, + { + "epoch": 2.2958637469586374, + "grad_norm": 0.5740687847137451, + "learning_rate": 2.876047883887727e-06, + "loss": 0.5572628974914551, + "step": 1770 + }, + { + "epoch": 2.297161394971614, + "grad_norm": 0.5873590111732483, + "learning_rate": 2.866020112937792e-06, + "loss": 0.5043233036994934, + "step": 1771 + }, + { + "epoch": 2.2984590429845904, + "grad_norm": 0.6047444343566895, + "learning_rate": 2.8560069291710857e-06, + "loss": 0.5389963984489441, + "step": 1772 + }, + { + "epoch": 2.299756690997567, + "grad_norm": 0.5967015624046326, + "learning_rate": 2.8460083530620342e-06, + "loss": 0.5294721126556396, + "step": 1773 + }, + { + "epoch": 2.3010543390105433, + "grad_norm": 0.549340546131134, + "learning_rate": 2.8360244050551943e-06, + "loss": 0.4317038357257843, + "step": 1774 + }, + { + "epoch": 2.30235198702352, + "grad_norm": 0.5504307150840759, + "learning_rate": 2.8260551055652154e-06, + "loss": 0.529647946357727, + "step": 1775 + }, + { + "epoch": 2.3036496350364963, + "grad_norm": 0.603110671043396, + "learning_rate": 2.8161004749767893e-06, + "loss": 0.5209970474243164, + "step": 1776 + }, + { + "epoch": 2.304947283049473, + "grad_norm": 0.6039415001869202, + "learning_rate": 2.8061605336446194e-06, + "loss": 0.5043014287948608, + "step": 1777 + }, + { + "epoch": 2.3062449310624493, + "grad_norm": 0.5883081555366516, + "learning_rate": 2.796235301893362e-06, + "loss": 0.4972041845321655, + "step": 1778 + }, + { + "epoch": 2.3075425790754256, + "grad_norm": 0.5843275785446167, + "learning_rate": 2.7863248000176146e-06, + "loss": 0.4763846695423126, + "step": 1779 + }, + { + "epoch": 2.3088402270884023, + "grad_norm": 0.5958689451217651, + "learning_rate": 2.776429048281837e-06, + "loss": 0.534402072429657, + "step": 1780 + }, + { + "epoch": 2.3101378751013786, + "grad_norm": 0.5908694267272949, + "learning_rate": 2.7665480669203383e-06, + "loss": 0.5190926790237427, + "step": 1781 + }, + { + "epoch": 2.3114355231143553, + "grad_norm": 0.5524806380271912, + "learning_rate": 2.756681876137227e-06, + "loss": 0.4656313359737396, + "step": 1782 + }, + { + "epoch": 2.3127331711273316, + "grad_norm": 0.5877224206924438, + "learning_rate": 2.7468304961063642e-06, + "loss": 0.5328505635261536, + "step": 1783 + }, + { + "epoch": 2.3140308191403083, + "grad_norm": 0.5791632533073425, + "learning_rate": 2.736993946971329e-06, + "loss": 0.49198758602142334, + "step": 1784 + }, + { + "epoch": 2.3153284671532846, + "grad_norm": 0.5888563990592957, + "learning_rate": 2.727172248845378e-06, + "loss": 0.5110273957252502, + "step": 1785 + }, + { + "epoch": 2.3166261151662613, + "grad_norm": 0.5828698873519897, + "learning_rate": 2.717365421811389e-06, + "loss": 0.5017109513282776, + "step": 1786 + }, + { + "epoch": 2.3179237631792375, + "grad_norm": 0.5837040543556213, + "learning_rate": 2.7075734859218526e-06, + "loss": 0.48261111974716187, + "step": 1787 + }, + { + "epoch": 2.3192214111922143, + "grad_norm": 0.5555887222290039, + "learning_rate": 2.6977964611987885e-06, + "loss": 0.47618377208709717, + "step": 1788 + }, + { + "epoch": 2.3205190592051905, + "grad_norm": 0.5828522443771362, + "learning_rate": 2.6880343676337485e-06, + "loss": 0.5134596824645996, + "step": 1789 + }, + { + "epoch": 2.3218167072181672, + "grad_norm": 0.5784159898757935, + "learning_rate": 2.6782872251877347e-06, + "loss": 0.5150825381278992, + "step": 1790 + }, + { + "epoch": 2.3231143552311435, + "grad_norm": 0.5633057951927185, + "learning_rate": 2.6685550537911886e-06, + "loss": 0.5161488056182861, + "step": 1791 + }, + { + "epoch": 2.3244120032441202, + "grad_norm": 0.6642704010009766, + "learning_rate": 2.658837873343938e-06, + "loss": 0.49425986409187317, + "step": 1792 + }, + { + "epoch": 2.3257096512570965, + "grad_norm": 1.5263655185699463, + "learning_rate": 2.6491357037151565e-06, + "loss": 0.5067033767700195, + "step": 1793 + }, + { + "epoch": 2.3270072992700728, + "grad_norm": 0.5753558278083801, + "learning_rate": 2.639448564743328e-06, + "loss": 0.5167245864868164, + "step": 1794 + }, + { + "epoch": 2.3283049472830495, + "grad_norm": 0.576946496963501, + "learning_rate": 2.6297764762362e-06, + "loss": 0.4853561818599701, + "step": 1795 + }, + { + "epoch": 2.329602595296026, + "grad_norm": 0.5866283774375916, + "learning_rate": 2.6201194579707377e-06, + "loss": 0.5048178434371948, + "step": 1796 + }, + { + "epoch": 2.3309002433090025, + "grad_norm": 0.5844078660011292, + "learning_rate": 2.6104775296931118e-06, + "loss": 0.5524246096611023, + "step": 1797 + }, + { + "epoch": 2.3321978913219787, + "grad_norm": 0.5873027443885803, + "learning_rate": 2.6008507111186142e-06, + "loss": 0.4834699034690857, + "step": 1798 + }, + { + "epoch": 2.3334955393349555, + "grad_norm": 0.5751008987426758, + "learning_rate": 2.5912390219316573e-06, + "loss": 0.46085190773010254, + "step": 1799 + }, + { + "epoch": 2.3347931873479317, + "grad_norm": 0.5933749675750732, + "learning_rate": 2.5816424817857122e-06, + "loss": 0.5757045745849609, + "step": 1800 + }, + { + "epoch": 2.3360908353609084, + "grad_norm": 0.5685113668441772, + "learning_rate": 2.572061110303271e-06, + "loss": 0.5482950210571289, + "step": 1801 + }, + { + "epoch": 2.3373884833738847, + "grad_norm": 0.5949112176895142, + "learning_rate": 2.562494927075824e-06, + "loss": 0.45071443915367126, + "step": 1802 + }, + { + "epoch": 2.3386861313868614, + "grad_norm": 0.5924611687660217, + "learning_rate": 2.552943951663782e-06, + "loss": 0.5145446062088013, + "step": 1803 + }, + { + "epoch": 2.3399837793998377, + "grad_norm": 0.6171916127204895, + "learning_rate": 2.543408203596479e-06, + "loss": 0.5408798456192017, + "step": 1804 + }, + { + "epoch": 2.3412814274128144, + "grad_norm": 0.5777391791343689, + "learning_rate": 2.5338877023721055e-06, + "loss": 0.4972618818283081, + "step": 1805 + }, + { + "epoch": 2.3425790754257907, + "grad_norm": 0.5500625371932983, + "learning_rate": 2.5243824674576743e-06, + "loss": 0.47741931676864624, + "step": 1806 + }, + { + "epoch": 2.3438767234387674, + "grad_norm": 0.6426427960395813, + "learning_rate": 2.514892518288988e-06, + "loss": 0.4675457179546356, + "step": 1807 + }, + { + "epoch": 2.3451743714517437, + "grad_norm": 0.5633028149604797, + "learning_rate": 2.5054178742705936e-06, + "loss": 0.4990037679672241, + "step": 1808 + }, + { + "epoch": 2.34647201946472, + "grad_norm": 0.5860106945037842, + "learning_rate": 2.4959585547757294e-06, + "loss": 0.5247271060943604, + "step": 1809 + }, + { + "epoch": 2.3477696674776967, + "grad_norm": 0.6035534143447876, + "learning_rate": 2.486514579146322e-06, + "loss": 0.5100830793380737, + "step": 1810 + }, + { + "epoch": 2.3490673154906734, + "grad_norm": 0.5890262722969055, + "learning_rate": 2.4770859666929027e-06, + "loss": 0.4713430106639862, + "step": 1811 + }, + { + "epoch": 2.3503649635036497, + "grad_norm": 0.5817517638206482, + "learning_rate": 2.4676727366945995e-06, + "loss": 0.5113362073898315, + "step": 1812 + }, + { + "epoch": 2.351662611516626, + "grad_norm": 0.5895565748214722, + "learning_rate": 2.4582749083990875e-06, + "loss": 0.5131444931030273, + "step": 1813 + }, + { + "epoch": 2.3529602595296026, + "grad_norm": 0.6126547455787659, + "learning_rate": 2.448892501022544e-06, + "loss": 0.5126985907554626, + "step": 1814 + }, + { + "epoch": 2.354257907542579, + "grad_norm": 0.6138656139373779, + "learning_rate": 2.4395255337496202e-06, + "loss": 0.5113729238510132, + "step": 1815 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.5864330530166626, + "learning_rate": 2.4301740257333918e-06, + "loss": 0.49038761854171753, + "step": 1816 + }, + { + "epoch": 2.356853203568532, + "grad_norm": 0.5852108597755432, + "learning_rate": 2.4208379960953255e-06, + "loss": 0.5150374174118042, + "step": 1817 + }, + { + "epoch": 2.3581508515815086, + "grad_norm": 0.5658332705497742, + "learning_rate": 2.4115174639252425e-06, + "loss": 0.45495855808258057, + "step": 1818 + }, + { + "epoch": 2.359448499594485, + "grad_norm": 0.6017063856124878, + "learning_rate": 2.4022124482812627e-06, + "loss": 0.505713701248169, + "step": 1819 + }, + { + "epoch": 2.3607461476074616, + "grad_norm": 0.5778226852416992, + "learning_rate": 2.3929229681898005e-06, + "loss": 0.5222234725952148, + "step": 1820 + }, + { + "epoch": 2.362043795620438, + "grad_norm": 0.5651443004608154, + "learning_rate": 2.3836490426454816e-06, + "loss": 0.49572640657424927, + "step": 1821 + }, + { + "epoch": 2.3633414436334146, + "grad_norm": 0.5689359307289124, + "learning_rate": 2.3743906906111415e-06, + "loss": 0.5316051840782166, + "step": 1822 + }, + { + "epoch": 2.364639091646391, + "grad_norm": 0.5702098608016968, + "learning_rate": 2.365147931017764e-06, + "loss": 0.4997398257255554, + "step": 1823 + }, + { + "epoch": 2.3659367396593676, + "grad_norm": 0.5760017037391663, + "learning_rate": 2.355920782764455e-06, + "loss": 0.48562324047088623, + "step": 1824 + }, + { + "epoch": 2.367234387672344, + "grad_norm": 0.5816190242767334, + "learning_rate": 2.3467092647183962e-06, + "loss": 0.4969868063926697, + "step": 1825 + }, + { + "epoch": 2.3685320356853206, + "grad_norm": 0.573274552822113, + "learning_rate": 2.337513395714812e-06, + "loss": 0.5109938383102417, + "step": 1826 + }, + { + "epoch": 2.369829683698297, + "grad_norm": 0.6311878561973572, + "learning_rate": 2.3283331945569256e-06, + "loss": 0.5642886161804199, + "step": 1827 + }, + { + "epoch": 2.371127331711273, + "grad_norm": 0.584414541721344, + "learning_rate": 2.3191686800159272e-06, + "loss": 0.4909813404083252, + "step": 1828 + }, + { + "epoch": 2.37242497972425, + "grad_norm": 0.5963045954704285, + "learning_rate": 2.310019870830923e-06, + "loss": 0.5222618579864502, + "step": 1829 + }, + { + "epoch": 2.373722627737226, + "grad_norm": 0.5990424752235413, + "learning_rate": 2.300886785708919e-06, + "loss": 0.527482271194458, + "step": 1830 + }, + { + "epoch": 2.375020275750203, + "grad_norm": 0.5891411900520325, + "learning_rate": 2.2917694433247626e-06, + "loss": 0.5050874948501587, + "step": 1831 + }, + { + "epoch": 2.376317923763179, + "grad_norm": 0.6118223071098328, + "learning_rate": 2.282667862321104e-06, + "loss": 0.5382136106491089, + "step": 1832 + }, + { + "epoch": 2.377615571776156, + "grad_norm": 0.6039783358573914, + "learning_rate": 2.2735820613083837e-06, + "loss": 0.5693233013153076, + "step": 1833 + }, + { + "epoch": 2.378913219789132, + "grad_norm": 0.5887247323989868, + "learning_rate": 2.264512058864755e-06, + "loss": 0.5109111666679382, + "step": 1834 + }, + { + "epoch": 2.3802108678021088, + "grad_norm": 0.5879799723625183, + "learning_rate": 2.2554578735360823e-06, + "loss": 0.5213186740875244, + "step": 1835 + }, + { + "epoch": 2.381508515815085, + "grad_norm": 0.5826606154441833, + "learning_rate": 2.246419523835882e-06, + "loss": 0.4647579789161682, + "step": 1836 + }, + { + "epoch": 2.3828061638280618, + "grad_norm": 0.5773786306381226, + "learning_rate": 2.2373970282452916e-06, + "loss": 0.4783990681171417, + "step": 1837 + }, + { + "epoch": 2.384103811841038, + "grad_norm": 0.5842030644416809, + "learning_rate": 2.2283904052130313e-06, + "loss": 0.5339592695236206, + "step": 1838 + }, + { + "epoch": 2.3854014598540147, + "grad_norm": 0.569379985332489, + "learning_rate": 2.2193996731553656e-06, + "loss": 0.4958034157752991, + "step": 1839 + }, + { + "epoch": 2.386699107866991, + "grad_norm": 0.6030622124671936, + "learning_rate": 2.2104248504560643e-06, + "loss": 0.4680197834968567, + "step": 1840 + }, + { + "epoch": 2.386699107866991, + "eval_loss": 0.6960097551345825, + "eval_runtime": 72.3931, + "eval_samples_per_second": 71.72, + "eval_steps_per_second": 8.965, + "step": 1840 + }, + { + "epoch": 2.3879967558799677, + "grad_norm": 0.5678315758705139, + "learning_rate": 2.2014659554663732e-06, + "loss": 0.5050360560417175, + "step": 1841 + }, + { + "epoch": 2.389294403892944, + "grad_norm": 0.5803557634353638, + "learning_rate": 2.192523006504956e-06, + "loss": 0.45793968439102173, + "step": 1842 + }, + { + "epoch": 2.3905920519059203, + "grad_norm": 0.5823774933815002, + "learning_rate": 2.183596021857891e-06, + "loss": 0.4527888596057892, + "step": 1843 + }, + { + "epoch": 2.391889699918897, + "grad_norm": 0.5696638226509094, + "learning_rate": 2.1746850197785928e-06, + "loss": 0.48019784688949585, + "step": 1844 + }, + { + "epoch": 2.3931873479318737, + "grad_norm": 0.5827446579933167, + "learning_rate": 2.16579001848781e-06, + "loss": 0.5040067434310913, + "step": 1845 + }, + { + "epoch": 2.39448499594485, + "grad_norm": 0.5871142148971558, + "learning_rate": 2.156911036173568e-06, + "loss": 0.47293055057525635, + "step": 1846 + }, + { + "epoch": 2.3957826439578263, + "grad_norm": 0.558737576007843, + "learning_rate": 2.1480480909911384e-06, + "loss": 0.47470247745513916, + "step": 1847 + }, + { + "epoch": 2.397080291970803, + "grad_norm": 0.5871817469596863, + "learning_rate": 2.139201201062999e-06, + "loss": 0.5189757347106934, + "step": 1848 + }, + { + "epoch": 2.3983779399837792, + "grad_norm": 0.5788654088973999, + "learning_rate": 2.130370384478807e-06, + "loss": 0.49212944507598877, + "step": 1849 + }, + { + "epoch": 2.399675587996756, + "grad_norm": 0.6011954545974731, + "learning_rate": 2.1215556592953357e-06, + "loss": 0.5247466564178467, + "step": 1850 + }, + { + "epoch": 2.4009732360097322, + "grad_norm": 0.5478853583335876, + "learning_rate": 2.11275704353648e-06, + "loss": 0.4548777937889099, + "step": 1851 + }, + { + "epoch": 2.402270884022709, + "grad_norm": 0.5758265852928162, + "learning_rate": 2.10397455519317e-06, + "loss": 0.5072181224822998, + "step": 1852 + }, + { + "epoch": 2.403568532035685, + "grad_norm": 0.5652422308921814, + "learning_rate": 2.095208212223383e-06, + "loss": 0.524145245552063, + "step": 1853 + }, + { + "epoch": 2.404866180048662, + "grad_norm": 0.5495245456695557, + "learning_rate": 2.0864580325520623e-06, + "loss": 0.47712084650993347, + "step": 1854 + }, + { + "epoch": 2.406163828061638, + "grad_norm": 0.5936484932899475, + "learning_rate": 2.077724034071116e-06, + "loss": 0.5134607553482056, + "step": 1855 + }, + { + "epoch": 2.407461476074615, + "grad_norm": 0.5818508863449097, + "learning_rate": 2.069006234639357e-06, + "loss": 0.46304088830947876, + "step": 1856 + }, + { + "epoch": 2.408759124087591, + "grad_norm": 0.6046934723854065, + "learning_rate": 2.060304652082481e-06, + "loss": 0.5234611630439758, + "step": 1857 + }, + { + "epoch": 2.410056772100568, + "grad_norm": 0.6409534215927124, + "learning_rate": 2.051619304193022e-06, + "loss": 0.5672463178634644, + "step": 1858 + }, + { + "epoch": 2.411354420113544, + "grad_norm": 0.5750660300254822, + "learning_rate": 2.0429502087303164e-06, + "loss": 0.4885750710964203, + "step": 1859 + }, + { + "epoch": 2.412652068126521, + "grad_norm": 0.6407312750816345, + "learning_rate": 2.0342973834204715e-06, + "loss": 0.4792509973049164, + "step": 1860 + }, + { + "epoch": 2.413949716139497, + "grad_norm": 0.5465012192726135, + "learning_rate": 2.0256608459563244e-06, + "loss": 0.4969291388988495, + "step": 1861 + }, + { + "epoch": 2.4152473641524734, + "grad_norm": 0.5713889002799988, + "learning_rate": 2.017040613997412e-06, + "loss": 0.48591309785842896, + "step": 1862 + }, + { + "epoch": 2.41654501216545, + "grad_norm": 0.5666239857673645, + "learning_rate": 2.008436705169917e-06, + "loss": 0.44293344020843506, + "step": 1863 + }, + { + "epoch": 2.4178426601784264, + "grad_norm": 0.5586820244789124, + "learning_rate": 1.9998491370666684e-06, + "loss": 0.45493143796920776, + "step": 1864 + }, + { + "epoch": 2.419140308191403, + "grad_norm": 0.5613408088684082, + "learning_rate": 1.991277927247056e-06, + "loss": 0.49673575162887573, + "step": 1865 + }, + { + "epoch": 2.4204379562043794, + "grad_norm": 0.5929522514343262, + "learning_rate": 1.9827230932370467e-06, + "loss": 0.5190791487693787, + "step": 1866 + }, + { + "epoch": 2.421735604217356, + "grad_norm": 0.5624476075172424, + "learning_rate": 1.9741846525291033e-06, + "loss": 0.4601350724697113, + "step": 1867 + }, + { + "epoch": 2.4230332522303324, + "grad_norm": 0.5859534740447998, + "learning_rate": 1.9656626225821774e-06, + "loss": 0.4977201819419861, + "step": 1868 + }, + { + "epoch": 2.424330900243309, + "grad_norm": 0.5921490788459778, + "learning_rate": 1.957157020821664e-06, + "loss": 0.5139193534851074, + "step": 1869 + }, + { + "epoch": 2.4256285482562854, + "grad_norm": 0.5974218845367432, + "learning_rate": 1.9486678646393654e-06, + "loss": 0.5071057081222534, + "step": 1870 + }, + { + "epoch": 2.426926196269262, + "grad_norm": 0.5919764041900635, + "learning_rate": 1.9401951713934574e-06, + "loss": 0.49057209491729736, + "step": 1871 + }, + { + "epoch": 2.4282238442822384, + "grad_norm": 0.5927568674087524, + "learning_rate": 1.931738958408457e-06, + "loss": 0.5092151165008545, + "step": 1872 + }, + { + "epoch": 2.429521492295215, + "grad_norm": 0.5767861604690552, + "learning_rate": 1.9232992429751694e-06, + "loss": 0.4838736355304718, + "step": 1873 + }, + { + "epoch": 2.4308191403081914, + "grad_norm": 0.5671409964561462, + "learning_rate": 1.9148760423506884e-06, + "loss": 0.4564237594604492, + "step": 1874 + }, + { + "epoch": 2.432116788321168, + "grad_norm": 0.5710315108299255, + "learning_rate": 1.9064693737583173e-06, + "loss": 0.5324878096580505, + "step": 1875 + }, + { + "epoch": 2.4334144363341443, + "grad_norm": 0.5930359959602356, + "learning_rate": 1.8980792543875758e-06, + "loss": 0.5325191617012024, + "step": 1876 + }, + { + "epoch": 2.4347120843471206, + "grad_norm": 0.5865573287010193, + "learning_rate": 1.8897057013941256e-06, + "loss": 0.4776073694229126, + "step": 1877 + }, + { + "epoch": 2.4360097323600973, + "grad_norm": 0.5611563920974731, + "learning_rate": 1.8813487318997658e-06, + "loss": 0.5060328841209412, + "step": 1878 + }, + { + "epoch": 2.437307380373074, + "grad_norm": 0.5972756147384644, + "learning_rate": 1.8730083629923857e-06, + "loss": 0.4804626405239105, + "step": 1879 + }, + { + "epoch": 2.4386050283860503, + "grad_norm": 0.5864998698234558, + "learning_rate": 1.8646846117259277e-06, + "loss": 0.49063995480537415, + "step": 1880 + }, + { + "epoch": 2.4399026763990266, + "grad_norm": 0.5928548574447632, + "learning_rate": 1.856377495120355e-06, + "loss": 0.5291346311569214, + "step": 1881 + }, + { + "epoch": 2.4412003244120033, + "grad_norm": 0.5551499724388123, + "learning_rate": 1.8480870301616227e-06, + "loss": 0.5005500912666321, + "step": 1882 + }, + { + "epoch": 2.4424979724249796, + "grad_norm": 0.5950235724449158, + "learning_rate": 1.839813233801626e-06, + "loss": 0.5388972759246826, + "step": 1883 + }, + { + "epoch": 2.4437956204379563, + "grad_norm": 0.5625823736190796, + "learning_rate": 1.8315561229581925e-06, + "loss": 0.49611175060272217, + "step": 1884 + }, + { + "epoch": 2.4450932684509326, + "grad_norm": 0.5934765934944153, + "learning_rate": 1.8233157145150183e-06, + "loss": 0.5419527292251587, + "step": 1885 + }, + { + "epoch": 2.4463909164639093, + "grad_norm": 0.5831634402275085, + "learning_rate": 1.8150920253216542e-06, + "loss": 0.5380743145942688, + "step": 1886 + }, + { + "epoch": 2.4476885644768855, + "grad_norm": 0.5773998498916626, + "learning_rate": 1.8068850721934639e-06, + "loss": 0.5360612869262695, + "step": 1887 + }, + { + "epoch": 2.4489862124898623, + "grad_norm": 0.5667778253555298, + "learning_rate": 1.7986948719115872e-06, + "loss": 0.4837849736213684, + "step": 1888 + }, + { + "epoch": 2.4502838605028385, + "grad_norm": 0.5844002962112427, + "learning_rate": 1.7905214412229177e-06, + "loss": 0.5097035765647888, + "step": 1889 + }, + { + "epoch": 2.4515815085158152, + "grad_norm": 0.571603536605835, + "learning_rate": 1.7823647968400437e-06, + "loss": 0.4986342787742615, + "step": 1890 + }, + { + "epoch": 2.4528791565287915, + "grad_norm": 0.5814788341522217, + "learning_rate": 1.7742249554412426e-06, + "loss": 0.5466139316558838, + "step": 1891 + }, + { + "epoch": 2.4541768045417682, + "grad_norm": 0.602313756942749, + "learning_rate": 1.76610193367043e-06, + "loss": 0.5179327726364136, + "step": 1892 + }, + { + "epoch": 2.4554744525547445, + "grad_norm": 0.5728641748428345, + "learning_rate": 1.757995748137129e-06, + "loss": 0.4758206903934479, + "step": 1893 + }, + { + "epoch": 2.456772100567721, + "grad_norm": 0.5834367871284485, + "learning_rate": 1.7499064154164358e-06, + "loss": 0.48661813139915466, + "step": 1894 + }, + { + "epoch": 2.4580697485806975, + "grad_norm": 0.6014889478683472, + "learning_rate": 1.7418339520489936e-06, + "loss": 0.5374865531921387, + "step": 1895 + }, + { + "epoch": 2.4593673965936738, + "grad_norm": 0.5678799152374268, + "learning_rate": 1.7337783745409363e-06, + "loss": 0.47202199697494507, + "step": 1896 + }, + { + "epoch": 2.4606650446066505, + "grad_norm": 0.5770121216773987, + "learning_rate": 1.7257396993638942e-06, + "loss": 0.4832342565059662, + "step": 1897 + }, + { + "epoch": 2.4619626926196267, + "grad_norm": 0.5571733713150024, + "learning_rate": 1.717717942954914e-06, + "loss": 0.5462654829025269, + "step": 1898 + }, + { + "epoch": 2.4632603406326035, + "grad_norm": 0.5752882361412048, + "learning_rate": 1.7097131217164598e-06, + "loss": 0.5042911171913147, + "step": 1899 + }, + { + "epoch": 2.4645579886455797, + "grad_norm": 0.5651837587356567, + "learning_rate": 1.7017252520163652e-06, + "loss": 0.5055532455444336, + "step": 1900 + }, + { + "epoch": 2.4658556366585564, + "grad_norm": 0.5626855492591858, + "learning_rate": 1.6937543501878018e-06, + "loss": 0.5025293827056885, + "step": 1901 + }, + { + "epoch": 2.4671532846715327, + "grad_norm": 0.5588532090187073, + "learning_rate": 1.6858004325292466e-06, + "loss": 0.5056187510490417, + "step": 1902 + }, + { + "epoch": 2.4684509326845094, + "grad_norm": 0.6047312021255493, + "learning_rate": 1.6778635153044486e-06, + "loss": 0.5340344309806824, + "step": 1903 + }, + { + "epoch": 2.4697485806974857, + "grad_norm": 0.5701199769973755, + "learning_rate": 1.6699436147423942e-06, + "loss": 0.47314453125, + "step": 1904 + }, + { + "epoch": 2.4710462287104624, + "grad_norm": 0.5887412428855896, + "learning_rate": 1.662040747037277e-06, + "loss": 0.5806034207344055, + "step": 1905 + }, + { + "epoch": 2.4723438767234387, + "grad_norm": 0.5856630206108093, + "learning_rate": 1.654154928348455e-06, + "loss": 0.542724609375, + "step": 1906 + }, + { + "epoch": 2.4736415247364154, + "grad_norm": 0.5869402885437012, + "learning_rate": 1.646286174800441e-06, + "loss": 0.5193344354629517, + "step": 1907 + }, + { + "epoch": 2.4749391727493917, + "grad_norm": 0.5962528586387634, + "learning_rate": 1.6384345024828374e-06, + "loss": 0.49579355120658875, + "step": 1908 + }, + { + "epoch": 2.4762368207623684, + "grad_norm": 0.5732969641685486, + "learning_rate": 1.6305999274503282e-06, + "loss": 0.4678477346897125, + "step": 1909 + }, + { + "epoch": 2.4775344687753447, + "grad_norm": 0.5851303339004517, + "learning_rate": 1.6227824657226366e-06, + "loss": 0.4453192949295044, + "step": 1910 + }, + { + "epoch": 2.478832116788321, + "grad_norm": 0.5631725192070007, + "learning_rate": 1.614982133284495e-06, + "loss": 0.47414714097976685, + "step": 1911 + }, + { + "epoch": 2.4801297648012977, + "grad_norm": 0.5917407274246216, + "learning_rate": 1.6071989460856063e-06, + "loss": 0.51967453956604, + "step": 1912 + }, + { + "epoch": 2.4814274128142744, + "grad_norm": 0.5762115716934204, + "learning_rate": 1.5994329200406223e-06, + "loss": 0.47164011001586914, + "step": 1913 + }, + { + "epoch": 2.4827250608272506, + "grad_norm": 0.5615324974060059, + "learning_rate": 1.5916840710290937e-06, + "loss": 0.5057311058044434, + "step": 1914 + }, + { + "epoch": 2.484022708840227, + "grad_norm": 0.5691003203392029, + "learning_rate": 1.5839524148954622e-06, + "loss": 0.46432405710220337, + "step": 1915 + }, + { + "epoch": 2.4853203568532036, + "grad_norm": 0.5725374221801758, + "learning_rate": 1.5762379674490048e-06, + "loss": 0.46116703748703003, + "step": 1916 + }, + { + "epoch": 2.48661800486618, + "grad_norm": 0.6240981221199036, + "learning_rate": 1.5685407444638146e-06, + "loss": 0.5304262638092041, + "step": 1917 + }, + { + "epoch": 2.4879156528791566, + "grad_norm": 0.5866638422012329, + "learning_rate": 1.5608607616787663e-06, + "loss": 0.46918168663978577, + "step": 1918 + }, + { + "epoch": 2.489213300892133, + "grad_norm": 0.6103445291519165, + "learning_rate": 1.553198034797474e-06, + "loss": 0.5785281658172607, + "step": 1919 + }, + { + "epoch": 2.4905109489051096, + "grad_norm": 0.5748964548110962, + "learning_rate": 1.5455525794882841e-06, + "loss": 0.47489288449287415, + "step": 1920 + }, + { + "epoch": 2.491808596918086, + "grad_norm": 0.5849605202674866, + "learning_rate": 1.5379244113842106e-06, + "loss": 0.5081884860992432, + "step": 1921 + }, + { + "epoch": 2.4931062449310626, + "grad_norm": 0.5827904343605042, + "learning_rate": 1.53031354608293e-06, + "loss": 0.5528438091278076, + "step": 1922 + }, + { + "epoch": 2.494403892944039, + "grad_norm": 0.5817930102348328, + "learning_rate": 1.5227199991467335e-06, + "loss": 0.5150377154350281, + "step": 1923 + }, + { + "epoch": 2.4957015409570156, + "grad_norm": 0.5756059288978577, + "learning_rate": 1.5151437861025032e-06, + "loss": 0.4410705268383026, + "step": 1924 + }, + { + "epoch": 2.496999188969992, + "grad_norm": 0.5646528005599976, + "learning_rate": 1.5075849224416783e-06, + "loss": 0.5073448419570923, + "step": 1925 + }, + { + "epoch": 2.4982968369829686, + "grad_norm": 0.5877253413200378, + "learning_rate": 1.5000434236202211e-06, + "loss": 0.5140043497085571, + "step": 1926 + }, + { + "epoch": 2.499594484995945, + "grad_norm": 0.5703092813491821, + "learning_rate": 1.4925193050585873e-06, + "loss": 0.5106258392333984, + "step": 1927 + }, + { + "epoch": 2.5008921330089215, + "grad_norm": 0.5841608643531799, + "learning_rate": 1.4850125821416983e-06, + "loss": 0.49111461639404297, + "step": 1928 + }, + { + "epoch": 2.502189781021898, + "grad_norm": 0.5806940197944641, + "learning_rate": 1.4775232702188947e-06, + "loss": 0.477137953042984, + "step": 1929 + }, + { + "epoch": 2.503487429034874, + "grad_norm": 0.5762841105461121, + "learning_rate": 1.4700513846039332e-06, + "loss": 0.4592735469341278, + "step": 1930 + }, + { + "epoch": 2.504785077047851, + "grad_norm": 0.5808306932449341, + "learning_rate": 1.4625969405749218e-06, + "loss": 0.5200600624084473, + "step": 1931 + }, + { + "epoch": 2.5060827250608275, + "grad_norm": 0.5846347212791443, + "learning_rate": 1.4551599533743155e-06, + "loss": 0.5185432434082031, + "step": 1932 + }, + { + "epoch": 2.507380373073804, + "grad_norm": 0.6160796284675598, + "learning_rate": 1.4477404382088689e-06, + "loss": 0.5391091108322144, + "step": 1933 + }, + { + "epoch": 2.50867802108678, + "grad_norm": 0.5582398176193237, + "learning_rate": 1.4403384102496132e-06, + "loss": 0.4704029858112335, + "step": 1934 + }, + { + "epoch": 2.509975669099757, + "grad_norm": 0.5653654932975769, + "learning_rate": 1.4329538846318225e-06, + "loss": 0.524503231048584, + "step": 1935 + }, + { + "epoch": 2.511273317112733, + "grad_norm": 0.5886475443840027, + "learning_rate": 1.4255868764549852e-06, + "loss": 0.4819219708442688, + "step": 1936 + }, + { + "epoch": 2.5125709651257098, + "grad_norm": 0.5662146806716919, + "learning_rate": 1.4182374007827605e-06, + "loss": 0.5265961289405823, + "step": 1937 + }, + { + "epoch": 2.513868613138686, + "grad_norm": 0.5975550413131714, + "learning_rate": 1.410905472642975e-06, + "loss": 0.5036963224411011, + "step": 1938 + }, + { + "epoch": 2.5151662611516628, + "grad_norm": 0.5727776885032654, + "learning_rate": 1.4035911070275576e-06, + "loss": 0.4989280104637146, + "step": 1939 + }, + { + "epoch": 2.516463909164639, + "grad_norm": 0.6097977161407471, + "learning_rate": 1.3962943188925438e-06, + "loss": 0.535049557685852, + "step": 1940 + }, + { + "epoch": 2.5177615571776153, + "grad_norm": 0.5695138573646545, + "learning_rate": 1.3890151231580117e-06, + "loss": 0.5146960020065308, + "step": 1941 + }, + { + "epoch": 2.519059205190592, + "grad_norm": 0.5890569686889648, + "learning_rate": 1.3817535347080768e-06, + "loss": 0.5350029468536377, + "step": 1942 + }, + { + "epoch": 2.5203568532035687, + "grad_norm": 0.5916978120803833, + "learning_rate": 1.3745095683908482e-06, + "loss": 0.5213718414306641, + "step": 1943 + }, + { + "epoch": 2.521654501216545, + "grad_norm": 0.5767956972122192, + "learning_rate": 1.3672832390184042e-06, + "loss": 0.506149411201477, + "step": 1944 + }, + { + "epoch": 2.5229521492295213, + "grad_norm": 0.5916143655776978, + "learning_rate": 1.3600745613667598e-06, + "loss": 0.5128974318504333, + "step": 1945 + }, + { + "epoch": 2.524249797242498, + "grad_norm": 0.5634325742721558, + "learning_rate": 1.3528835501758365e-06, + "loss": 0.5004685521125793, + "step": 1946 + }, + { + "epoch": 2.5255474452554747, + "grad_norm": 0.5783470869064331, + "learning_rate": 1.345710220149431e-06, + "loss": 0.5014833807945251, + "step": 1947 + }, + { + "epoch": 2.526845093268451, + "grad_norm": 0.5838568210601807, + "learning_rate": 1.3385545859551886e-06, + "loss": 0.540973424911499, + "step": 1948 + }, + { + "epoch": 2.5281427412814272, + "grad_norm": 0.5862357020378113, + "learning_rate": 1.3314166622245717e-06, + "loss": 0.5124210715293884, + "step": 1949 + }, + { + "epoch": 2.529440389294404, + "grad_norm": 0.5789701342582703, + "learning_rate": 1.324296463552821e-06, + "loss": 0.4796435236930847, + "step": 1950 + }, + { + "epoch": 2.5307380373073802, + "grad_norm": 0.5998684167861938, + "learning_rate": 1.3171940044989495e-06, + "loss": 0.5745923519134521, + "step": 1951 + }, + { + "epoch": 2.532035685320357, + "grad_norm": 0.5753020644187927, + "learning_rate": 1.3101092995856802e-06, + "loss": 0.4947076439857483, + "step": 1952 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.5820896029472351, + "learning_rate": 1.3030423632994493e-06, + "loss": 0.4961175322532654, + "step": 1953 + }, + { + "epoch": 2.53463098134631, + "grad_norm": 0.5677821040153503, + "learning_rate": 1.2959932100903472e-06, + "loss": 0.49631717801094055, + "step": 1954 + }, + { + "epoch": 2.535928629359286, + "grad_norm": 0.5767098665237427, + "learning_rate": 1.2889618543721094e-06, + "loss": 0.5189783573150635, + "step": 1955 + }, + { + "epoch": 2.537226277372263, + "grad_norm": 0.5949708819389343, + "learning_rate": 1.2819483105220798e-06, + "loss": 0.5087240934371948, + "step": 1956 + }, + { + "epoch": 2.538523925385239, + "grad_norm": 0.582380473613739, + "learning_rate": 1.274952592881179e-06, + "loss": 0.48820894956588745, + "step": 1957 + }, + { + "epoch": 2.539821573398216, + "grad_norm": 0.578072726726532, + "learning_rate": 1.2679747157538801e-06, + "loss": 0.5089854598045349, + "step": 1958 + }, + { + "epoch": 2.541119221411192, + "grad_norm": 0.5774610042572021, + "learning_rate": 1.2610146934081768e-06, + "loss": 0.49252915382385254, + "step": 1959 + }, + { + "epoch": 2.5424168694241684, + "grad_norm": 0.58970707654953, + "learning_rate": 1.2540725400755472e-06, + "loss": 0.5605252981185913, + "step": 1960 + }, + { + "epoch": 2.543714517437145, + "grad_norm": 0.566736102104187, + "learning_rate": 1.2471482699509463e-06, + "loss": 0.5428552627563477, + "step": 1961 + }, + { + "epoch": 2.545012165450122, + "grad_norm": 0.5720308423042297, + "learning_rate": 1.2402418971927487e-06, + "loss": 0.5265427827835083, + "step": 1962 + }, + { + "epoch": 2.546309813463098, + "grad_norm": 0.5800856351852417, + "learning_rate": 1.2333534359227383e-06, + "loss": 0.5138852596282959, + "step": 1963 + }, + { + "epoch": 2.5476074614760744, + "grad_norm": 0.5780075788497925, + "learning_rate": 1.226482900226077e-06, + "loss": 0.48286569118499756, + "step": 1964 + }, + { + "epoch": 2.548905109489051, + "grad_norm": 0.5666484236717224, + "learning_rate": 1.2196303041512714e-06, + "loss": 0.5184611082077026, + "step": 1965 + }, + { + "epoch": 2.550202757502028, + "grad_norm": 0.5936673879623413, + "learning_rate": 1.2127956617101445e-06, + "loss": 0.5331882238388062, + "step": 1966 + }, + { + "epoch": 2.551500405515004, + "grad_norm": 0.5658625364303589, + "learning_rate": 1.2059789868778116e-06, + "loss": 0.5007424354553223, + "step": 1967 + }, + { + "epoch": 2.5527980535279804, + "grad_norm": 0.5596531629562378, + "learning_rate": 1.1991802935926455e-06, + "loss": 0.4455481767654419, + "step": 1968 + }, + { + "epoch": 2.554095701540957, + "grad_norm": 0.5873602628707886, + "learning_rate": 1.1923995957562585e-06, + "loss": 0.4800918698310852, + "step": 1969 + }, + { + "epoch": 2.5553933495539334, + "grad_norm": 0.5768440961837769, + "learning_rate": 1.1856369072334517e-06, + "loss": 0.5240867733955383, + "step": 1970 + }, + { + "epoch": 2.55669099756691, + "grad_norm": 0.5888426899909973, + "learning_rate": 1.178892241852222e-06, + "loss": 0.4650096893310547, + "step": 1971 + }, + { + "epoch": 2.5579886455798864, + "grad_norm": 0.5748341083526611, + "learning_rate": 1.1721656134036962e-06, + "loss": 0.5009864568710327, + "step": 1972 + }, + { + "epoch": 2.559286293592863, + "grad_norm": 0.5902095437049866, + "learning_rate": 1.165457035642128e-06, + "loss": 0.5109707117080688, + "step": 1973 + }, + { + "epoch": 2.5605839416058394, + "grad_norm": 0.6467815041542053, + "learning_rate": 1.1587665222848643e-06, + "loss": 0.4991541802883148, + "step": 1974 + }, + { + "epoch": 2.5618815896188156, + "grad_norm": 0.5866140127182007, + "learning_rate": 1.1520940870123065e-06, + "loss": 0.48706984519958496, + "step": 1975 + }, + { + "epoch": 2.5631792376317923, + "grad_norm": 0.5842229127883911, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.5219037532806396, + "step": 1976 + }, + { + "epoch": 2.564476885644769, + "grad_norm": 0.5731110572814941, + "learning_rate": 1.1388035052580936e-06, + "loss": 0.5115249156951904, + "step": 1977 + }, + { + "epoch": 2.5657745336577453, + "grad_norm": 0.5784810185432434, + "learning_rate": 1.1321853859523113e-06, + "loss": 0.49307000637054443, + "step": 1978 + }, + { + "epoch": 2.5670721816707216, + "grad_norm": 0.5523423552513123, + "learning_rate": 1.1255853990829323e-06, + "loss": 0.4534381031990051, + "step": 1979 + }, + { + "epoch": 2.5683698296836983, + "grad_norm": 0.576626718044281, + "learning_rate": 1.119003558145262e-06, + "loss": 0.5025165677070618, + "step": 1980 + }, + { + "epoch": 2.569667477696675, + "grad_norm": 0.6068827509880066, + "learning_rate": 1.1124398765974976e-06, + "loss": 0.5154992341995239, + "step": 1981 + }, + { + "epoch": 2.5709651257096513, + "grad_norm": 0.5544149875640869, + "learning_rate": 1.1058943678607082e-06, + "loss": 0.4641039967536926, + "step": 1982 + }, + { + "epoch": 2.5722627737226276, + "grad_norm": 0.591013491153717, + "learning_rate": 1.0993670453187965e-06, + "loss": 0.5354744791984558, + "step": 1983 + }, + { + "epoch": 2.5735604217356043, + "grad_norm": 0.5729239583015442, + "learning_rate": 1.0928579223184943e-06, + "loss": 0.4895523190498352, + "step": 1984 + }, + { + "epoch": 2.5748580697485806, + "grad_norm": 0.5629091858863831, + "learning_rate": 1.0863670121693037e-06, + "loss": 0.4998272955417633, + "step": 1985 + }, + { + "epoch": 2.5761557177615573, + "grad_norm": 0.5692305564880371, + "learning_rate": 1.0798943281434958e-06, + "loss": 0.5316153764724731, + "step": 1986 + }, + { + "epoch": 2.5774533657745335, + "grad_norm": 0.5875282287597656, + "learning_rate": 1.0734398834760695e-06, + "loss": 0.47188982367515564, + "step": 1987 + }, + { + "epoch": 2.5787510137875103, + "grad_norm": 0.613525927066803, + "learning_rate": 1.067003691364733e-06, + "loss": 0.5325276851654053, + "step": 1988 + }, + { + "epoch": 2.5800486618004865, + "grad_norm": 0.5971388816833496, + "learning_rate": 1.060585764969867e-06, + "loss": 0.5428590774536133, + "step": 1989 + }, + { + "epoch": 2.5813463098134632, + "grad_norm": 0.5674665570259094, + "learning_rate": 1.0541861174145097e-06, + "loss": 0.47022098302841187, + "step": 1990 + }, + { + "epoch": 2.5826439578264395, + "grad_norm": 0.5619399547576904, + "learning_rate": 1.047804761784319e-06, + "loss": 0.48155295848846436, + "step": 1991 + }, + { + "epoch": 2.5839416058394162, + "grad_norm": 0.5751737952232361, + "learning_rate": 1.0414417111275533e-06, + "loss": 0.5390469431877136, + "step": 1992 + }, + { + "epoch": 2.5852392538523925, + "grad_norm": 0.5782447457313538, + "learning_rate": 1.0350969784550368e-06, + "loss": 0.5048004984855652, + "step": 1993 + }, + { + "epoch": 2.5865369018653688, + "grad_norm": 0.5656158328056335, + "learning_rate": 1.028770576740148e-06, + "loss": 0.5237029194831848, + "step": 1994 + }, + { + "epoch": 2.5878345498783455, + "grad_norm": 0.568681538105011, + "learning_rate": 1.022462518918772e-06, + "loss": 0.4539422392845154, + "step": 1995 + }, + { + "epoch": 2.589132197891322, + "grad_norm": 0.560100793838501, + "learning_rate": 1.0161728178892928e-06, + "loss": 0.45414865016937256, + "step": 1996 + }, + { + "epoch": 2.5904298459042985, + "grad_norm": 0.5668950080871582, + "learning_rate": 1.0099014865125557e-06, + "loss": 0.4774186611175537, + "step": 1997 + }, + { + "epoch": 2.5917274939172747, + "grad_norm": 0.606434166431427, + "learning_rate": 1.0036485376118477e-06, + "loss": 0.565065324306488, + "step": 1998 + }, + { + "epoch": 2.5930251419302515, + "grad_norm": 0.5841239094734192, + "learning_rate": 9.974139839728658e-07, + "loss": 0.5483173131942749, + "step": 1999 + }, + { + "epoch": 2.5943227899432277, + "grad_norm": 0.591903805732727, + "learning_rate": 9.91197838343696e-07, + "loss": 0.539207398891449, + "step": 2000 + }, + { + "epoch": 2.5956204379562045, + "grad_norm": 0.5807414650917053, + "learning_rate": 9.850001134347765e-07, + "loss": 0.5179691314697266, + "step": 2001 + }, + { + "epoch": 2.5969180859691807, + "grad_norm": 0.5769233107566833, + "learning_rate": 9.788208219188932e-07, + "loss": 0.4748839735984802, + "step": 2002 + }, + { + "epoch": 2.5982157339821574, + "grad_norm": 0.5766239762306213, + "learning_rate": 9.726599764311318e-07, + "loss": 0.48025619983673096, + "step": 2003 + }, + { + "epoch": 2.5995133819951337, + "grad_norm": 0.5754262208938599, + "learning_rate": 9.665175895688594e-07, + "loss": 0.47812211513519287, + "step": 2004 + }, + { + "epoch": 2.6008110300081104, + "grad_norm": 0.5699096918106079, + "learning_rate": 9.603936738917063e-07, + "loss": 0.5337727069854736, + "step": 2005 + }, + { + "epoch": 2.6021086780210867, + "grad_norm": 0.6039567589759827, + "learning_rate": 9.54288241921525e-07, + "loss": 0.5216813087463379, + "step": 2006 + }, + { + "epoch": 2.6034063260340634, + "grad_norm": 0.5594240427017212, + "learning_rate": 9.482013061423833e-07, + "loss": 0.5251287221908569, + "step": 2007 + }, + { + "epoch": 2.6047039740470397, + "grad_norm": 0.5856126546859741, + "learning_rate": 9.421328790005213e-07, + "loss": 0.5040426850318909, + "step": 2008 + }, + { + "epoch": 2.606001622060016, + "grad_norm": 0.5794676542282104, + "learning_rate": 9.360829729043375e-07, + "loss": 0.5068378448486328, + "step": 2009 + }, + { + "epoch": 2.6072992700729927, + "grad_norm": 0.5879704356193542, + "learning_rate": 9.300516002243587e-07, + "loss": 0.5116778016090393, + "step": 2010 + }, + { + "epoch": 2.6085969180859694, + "grad_norm": 0.5978105068206787, + "learning_rate": 9.240387732932155e-07, + "loss": 0.525846004486084, + "step": 2011 + }, + { + "epoch": 2.6098945660989457, + "grad_norm": 0.5788280367851257, + "learning_rate": 9.180445044056164e-07, + "loss": 0.5172775983810425, + "step": 2012 + }, + { + "epoch": 2.611192214111922, + "grad_norm": 0.5901548862457275, + "learning_rate": 9.120688058183269e-07, + "loss": 0.5301088094711304, + "step": 2013 + }, + { + "epoch": 2.6124898621248986, + "grad_norm": 0.5967061519622803, + "learning_rate": 9.061116897501321e-07, + "loss": 0.5318504571914673, + "step": 2014 + }, + { + "epoch": 2.6137875101378754, + "grad_norm": 0.5555222034454346, + "learning_rate": 9.001731683818338e-07, + "loss": 0.5011588335037231, + "step": 2015 + }, + { + "epoch": 2.6150851581508516, + "grad_norm": 0.613298237323761, + "learning_rate": 8.942532538561988e-07, + "loss": 0.5700482130050659, + "step": 2016 + }, + { + "epoch": 2.616382806163828, + "grad_norm": 0.599183201789856, + "learning_rate": 8.883519582779598e-07, + "loss": 0.5524272322654724, + "step": 2017 + }, + { + "epoch": 2.6176804541768046, + "grad_norm": 0.6120027899742126, + "learning_rate": 8.82469293713768e-07, + "loss": 0.47205424308776855, + "step": 2018 + }, + { + "epoch": 2.618978102189781, + "grad_norm": 0.5907730460166931, + "learning_rate": 8.766052721921858e-07, + "loss": 0.507009744644165, + "step": 2019 + }, + { + "epoch": 2.6202757502027576, + "grad_norm": 0.5603318810462952, + "learning_rate": 8.70759905703652e-07, + "loss": 0.48432788252830505, + "step": 2020 + }, + { + "epoch": 2.621573398215734, + "grad_norm": 0.5962936282157898, + "learning_rate": 8.649332062004622e-07, + "loss": 0.4898841381072998, + "step": 2021 + }, + { + "epoch": 2.6228710462287106, + "grad_norm": 0.7598771452903748, + "learning_rate": 8.59125185596742e-07, + "loss": 0.5321274995803833, + "step": 2022 + }, + { + "epoch": 2.624168694241687, + "grad_norm": 0.5821399092674255, + "learning_rate": 8.533358557684246e-07, + "loss": 0.512812614440918, + "step": 2023 + }, + { + "epoch": 2.6254663422546636, + "grad_norm": 0.5900049805641174, + "learning_rate": 8.475652285532199e-07, + "loss": 0.5129188299179077, + "step": 2024 + }, + { + "epoch": 2.62676399026764, + "grad_norm": 0.5779396295547485, + "learning_rate": 8.41813315750607e-07, + "loss": 0.4839695394039154, + "step": 2025 + }, + { + "epoch": 2.6280616382806166, + "grad_norm": 0.581840455532074, + "learning_rate": 8.360801291217835e-07, + "loss": 0.4942781925201416, + "step": 2026 + }, + { + "epoch": 2.629359286293593, + "grad_norm": 0.5503793954849243, + "learning_rate": 8.303656803896731e-07, + "loss": 0.4754694700241089, + "step": 2027 + }, + { + "epoch": 2.630656934306569, + "grad_norm": 0.5595881342887878, + "learning_rate": 8.246699812388714e-07, + "loss": 0.48087698221206665, + "step": 2028 + }, + { + "epoch": 2.631954582319546, + "grad_norm": 0.5697108507156372, + "learning_rate": 8.189930433156424e-07, + "loss": 0.5032870173454285, + "step": 2029 + }, + { + "epoch": 2.6332522303325225, + "grad_norm": 0.5761867761611938, + "learning_rate": 8.133348782278916e-07, + "loss": 0.5013032555580139, + "step": 2030 + }, + { + "epoch": 2.634549878345499, + "grad_norm": 0.6058787703514099, + "learning_rate": 8.07695497545129e-07, + "loss": 0.44857025146484375, + "step": 2031 + }, + { + "epoch": 2.635847526358475, + "grad_norm": 0.5961512327194214, + "learning_rate": 8.020749127984629e-07, + "loss": 0.5228594541549683, + "step": 2032 + }, + { + "epoch": 2.637145174371452, + "grad_norm": 0.5766192078590393, + "learning_rate": 7.964731354805677e-07, + "loss": 0.4745315611362457, + "step": 2033 + }, + { + "epoch": 2.638442822384428, + "grad_norm": 0.5896121859550476, + "learning_rate": 7.908901770456579e-07, + "loss": 0.519614577293396, + "step": 2034 + }, + { + "epoch": 2.639740470397405, + "grad_norm": 0.5732361078262329, + "learning_rate": 7.853260489094727e-07, + "loss": 0.48370620608329773, + "step": 2035 + }, + { + "epoch": 2.641038118410381, + "grad_norm": 0.5929004549980164, + "learning_rate": 7.79780762449246e-07, + "loss": 0.5153477191925049, + "step": 2036 + }, + { + "epoch": 2.6423357664233578, + "grad_norm": 0.587020754814148, + "learning_rate": 7.742543290036797e-07, + "loss": 0.4829615652561188, + "step": 2037 + }, + { + "epoch": 2.643633414436334, + "grad_norm": 0.5629860758781433, + "learning_rate": 7.687467598729403e-07, + "loss": 0.5223960876464844, + "step": 2038 + }, + { + "epoch": 2.6449310624493108, + "grad_norm": 0.5553507208824158, + "learning_rate": 7.63258066318604e-07, + "loss": 0.4827447235584259, + "step": 2039 + }, + { + "epoch": 2.646228710462287, + "grad_norm": 0.5940564274787903, + "learning_rate": 7.577882595636665e-07, + "loss": 0.538356602191925, + "step": 2040 + }, + { + "epoch": 2.6475263584752637, + "grad_norm": 0.5712041258811951, + "learning_rate": 7.523373507924947e-07, + "loss": 0.48258891701698303, + "step": 2041 + }, + { + "epoch": 2.64882400648824, + "grad_norm": 0.5664177536964417, + "learning_rate": 7.469053511508184e-07, + "loss": 0.4672595262527466, + "step": 2042 + }, + { + "epoch": 2.6501216545012163, + "grad_norm": 0.6014147996902466, + "learning_rate": 7.414922717457018e-07, + "loss": 0.5549574494361877, + "step": 2043 + }, + { + "epoch": 2.651419302514193, + "grad_norm": 0.588028073310852, + "learning_rate": 7.360981236455222e-07, + "loss": 0.5366802215576172, + "step": 2044 + }, + { + "epoch": 2.6527169505271697, + "grad_norm": 0.5555592179298401, + "learning_rate": 7.307229178799469e-07, + "loss": 0.49787813425064087, + "step": 2045 + }, + { + "epoch": 2.654014598540146, + "grad_norm": 0.5918477177619934, + "learning_rate": 7.253666654399128e-07, + "loss": 0.5271812081336975, + "step": 2046 + }, + { + "epoch": 2.6553122465531223, + "grad_norm": 0.6544379591941833, + "learning_rate": 7.200293772775968e-07, + "loss": 0.5332372784614563, + "step": 2047 + }, + { + "epoch": 2.656609894566099, + "grad_norm": 0.578555166721344, + "learning_rate": 7.14711064306407e-07, + "loss": 0.496245801448822, + "step": 2048 + }, + { + "epoch": 2.6579075425790757, + "grad_norm": 0.5929746627807617, + "learning_rate": 7.094117374009446e-07, + "loss": 0.5187441110610962, + "step": 2049 + }, + { + "epoch": 2.659205190592052, + "grad_norm": 0.5854722261428833, + "learning_rate": 7.041314073969918e-07, + "loss": 0.4945400655269623, + "step": 2050 + }, + { + "epoch": 2.6605028386050282, + "grad_norm": 0.6011053323745728, + "learning_rate": 6.988700850914876e-07, + "loss": 0.48466387391090393, + "step": 2051 + }, + { + "epoch": 2.661800486618005, + "grad_norm": 0.5774915814399719, + "learning_rate": 6.93627781242504e-07, + "loss": 0.5133316516876221, + "step": 2052 + }, + { + "epoch": 2.663098134630981, + "grad_norm": 0.5776026248931885, + "learning_rate": 6.884045065692257e-07, + "loss": 0.5115536451339722, + "step": 2053 + }, + { + "epoch": 2.664395782643958, + "grad_norm": 0.6011329293251038, + "learning_rate": 6.83200271751927e-07, + "loss": 0.5355618000030518, + "step": 2054 + }, + { + "epoch": 2.665693430656934, + "grad_norm": 0.5973834991455078, + "learning_rate": 6.780150874319524e-07, + "loss": 0.5230112075805664, + "step": 2055 + }, + { + "epoch": 2.666991078669911, + "grad_norm": 0.5917934775352478, + "learning_rate": 6.72848964211692e-07, + "loss": 0.5399461388587952, + "step": 2056 + }, + { + "epoch": 2.668288726682887, + "grad_norm": 0.5736814141273499, + "learning_rate": 6.677019126545548e-07, + "loss": 0.49193501472473145, + "step": 2057 + }, + { + "epoch": 2.669586374695864, + "grad_norm": 0.5814056396484375, + "learning_rate": 6.625739432849643e-07, + "loss": 0.5203338861465454, + "step": 2058 + }, + { + "epoch": 2.67088402270884, + "grad_norm": 0.601714015007019, + "learning_rate": 6.574650665883197e-07, + "loss": 0.5449438095092773, + "step": 2059 + }, + { + "epoch": 2.672181670721817, + "grad_norm": 0.5884926319122314, + "learning_rate": 6.523752930109761e-07, + "loss": 0.5138452053070068, + "step": 2060 + }, + { + "epoch": 2.673479318734793, + "grad_norm": 0.5702131390571594, + "learning_rate": 6.473046329602384e-07, + "loss": 0.4545958638191223, + "step": 2061 + }, + { + "epoch": 2.6747769667477694, + "grad_norm": 0.5839261412620544, + "learning_rate": 6.422530968043173e-07, + "loss": 0.5412476658821106, + "step": 2062 + }, + { + "epoch": 2.676074614760746, + "grad_norm": 0.5880113244056702, + "learning_rate": 6.372206948723292e-07, + "loss": 0.5263261795043945, + "step": 2063 + }, + { + "epoch": 2.677372262773723, + "grad_norm": 0.5763228535652161, + "learning_rate": 6.322074374542608e-07, + "loss": 0.5082492828369141, + "step": 2064 + }, + { + "epoch": 2.678669910786699, + "grad_norm": 0.5878806710243225, + "learning_rate": 6.272133348009546e-07, + "loss": 0.5076773166656494, + "step": 2065 + }, + { + "epoch": 2.6799675587996754, + "grad_norm": 0.5525650978088379, + "learning_rate": 6.222383971240875e-07, + "loss": 0.48154234886169434, + "step": 2066 + }, + { + "epoch": 2.681265206812652, + "grad_norm": 0.6016013622283936, + "learning_rate": 6.17282634596148e-07, + "loss": 0.503459095954895, + "step": 2067 + }, + { + "epoch": 2.6825628548256284, + "grad_norm": 0.6026131510734558, + "learning_rate": 6.123460573504147e-07, + "loss": 0.4821071922779083, + "step": 2068 + }, + { + "epoch": 2.683860502838605, + "grad_norm": 0.5926850438117981, + "learning_rate": 6.074286754809411e-07, + "loss": 0.5161428451538086, + "step": 2069 + }, + { + "epoch": 2.6851581508515814, + "grad_norm": 0.5853096842765808, + "learning_rate": 6.025304990425241e-07, + "loss": 0.5262787342071533, + "step": 2070 + }, + { + "epoch": 2.6851581508515814, + "eval_loss": 0.6954009532928467, + "eval_runtime": 72.3609, + "eval_samples_per_second": 71.751, + "eval_steps_per_second": 8.969, + "step": 2070 + }, + { + "epoch": 2.686455798864558, + "grad_norm": 0.5976012945175171, + "learning_rate": 5.976515380507008e-07, + "loss": 0.5311732888221741, + "step": 2071 + }, + { + "epoch": 2.6877534468775344, + "grad_norm": 0.5981724262237549, + "learning_rate": 5.927918024817059e-07, + "loss": 0.5703781247138977, + "step": 2072 + }, + { + "epoch": 2.689051094890511, + "grad_norm": 0.5645772814750671, + "learning_rate": 5.879513022724714e-07, + "loss": 0.4812767505645752, + "step": 2073 + }, + { + "epoch": 2.6903487429034874, + "grad_norm": 0.5886021852493286, + "learning_rate": 5.831300473205948e-07, + "loss": 0.5149608254432678, + "step": 2074 + }, + { + "epoch": 2.691646390916464, + "grad_norm": 0.5895439982414246, + "learning_rate": 5.783280474843222e-07, + "loss": 0.5148745179176331, + "step": 2075 + }, + { + "epoch": 2.6929440389294403, + "grad_norm": 0.571723461151123, + "learning_rate": 5.735453125825275e-07, + "loss": 0.5035296082496643, + "step": 2076 + }, + { + "epoch": 2.6942416869424166, + "grad_norm": 0.6077845096588135, + "learning_rate": 5.687818523946931e-07, + "loss": 0.5260845422744751, + "step": 2077 + }, + { + "epoch": 2.6955393349553933, + "grad_norm": 0.5872023105621338, + "learning_rate": 5.640376766608902e-07, + "loss": 0.49081629514694214, + "step": 2078 + }, + { + "epoch": 2.69683698296837, + "grad_norm": 0.5637922286987305, + "learning_rate": 5.593127950817579e-07, + "loss": 0.49831029772758484, + "step": 2079 + }, + { + "epoch": 2.6981346309813463, + "grad_norm": 0.588504433631897, + "learning_rate": 5.546072173184791e-07, + "loss": 0.5403261184692383, + "step": 2080 + }, + { + "epoch": 2.6994322789943226, + "grad_norm": 0.5554431080818176, + "learning_rate": 5.499209529927751e-07, + "loss": 0.4801977872848511, + "step": 2081 + }, + { + "epoch": 2.7007299270072993, + "grad_norm": 0.594923198223114, + "learning_rate": 5.452540116868654e-07, + "loss": 0.552370011806488, + "step": 2082 + }, + { + "epoch": 2.702027575020276, + "grad_norm": 0.5900223851203918, + "learning_rate": 5.406064029434666e-07, + "loss": 0.5598849058151245, + "step": 2083 + }, + { + "epoch": 2.7033252230332523, + "grad_norm": 0.5767436027526855, + "learning_rate": 5.359781362657623e-07, + "loss": 0.5048878192901611, + "step": 2084 + }, + { + "epoch": 2.7046228710462286, + "grad_norm": 0.551128089427948, + "learning_rate": 5.313692211173838e-07, + "loss": 0.5155936479568481, + "step": 2085 + }, + { + "epoch": 2.7059205190592053, + "grad_norm": 0.5880531072616577, + "learning_rate": 5.26779666922399e-07, + "loss": 0.5444161295890808, + "step": 2086 + }, + { + "epoch": 2.7072181670721815, + "grad_norm": 0.5545855164527893, + "learning_rate": 5.222094830652835e-07, + "loss": 0.4949781894683838, + "step": 2087 + }, + { + "epoch": 2.7085158150851583, + "grad_norm": 0.5254430174827576, + "learning_rate": 5.176586788909066e-07, + "loss": 0.48143208026885986, + "step": 2088 + }, + { + "epoch": 2.7098134630981345, + "grad_norm": 0.5895472764968872, + "learning_rate": 5.131272637045104e-07, + "loss": 0.5467052459716797, + "step": 2089 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.5603579878807068, + "learning_rate": 5.086152467716932e-07, + "loss": 0.48797622323036194, + "step": 2090 + }, + { + "epoch": 2.7124087591240875, + "grad_norm": 0.5788029432296753, + "learning_rate": 5.041226373183861e-07, + "loss": 0.5119057297706604, + "step": 2091 + }, + { + "epoch": 2.7137064071370642, + "grad_norm": 0.5590220093727112, + "learning_rate": 4.996494445308409e-07, + "loss": 0.46394574642181396, + "step": 2092 + }, + { + "epoch": 2.7150040551500405, + "grad_norm": 0.5895569920539856, + "learning_rate": 4.951956775556e-07, + "loss": 0.4952976703643799, + "step": 2093 + }, + { + "epoch": 2.7163017031630172, + "grad_norm": 0.5719903707504272, + "learning_rate": 4.907613454994964e-07, + "loss": 0.5015777349472046, + "step": 2094 + }, + { + "epoch": 2.7175993511759935, + "grad_norm": 0.5849481821060181, + "learning_rate": 4.863464574296106e-07, + "loss": 0.5244485139846802, + "step": 2095 + }, + { + "epoch": 2.7188969991889698, + "grad_norm": 0.5956225991249084, + "learning_rate": 4.819510223732738e-07, + "loss": 0.5492672324180603, + "step": 2096 + }, + { + "epoch": 2.7201946472019465, + "grad_norm": 0.5836542844772339, + "learning_rate": 4.775750493180386e-07, + "loss": 0.48292914032936096, + "step": 2097 + }, + { + "epoch": 2.721492295214923, + "grad_norm": 0.5966354012489319, + "learning_rate": 4.7321854721166127e-07, + "loss": 0.5208597183227539, + "step": 2098 + }, + { + "epoch": 2.7227899432278995, + "grad_norm": 0.536894679069519, + "learning_rate": 4.6888152496208593e-07, + "loss": 0.4349246621131897, + "step": 2099 + }, + { + "epoch": 2.7240875912408757, + "grad_norm": 0.589508593082428, + "learning_rate": 4.645639914374278e-07, + "loss": 0.5353684425354004, + "step": 2100 + }, + { + "epoch": 2.7253852392538525, + "grad_norm": 0.5571612119674683, + "learning_rate": 4.602659554659461e-07, + "loss": 0.4614424705505371, + "step": 2101 + }, + { + "epoch": 2.7266828872668287, + "grad_norm": 0.6046862602233887, + "learning_rate": 4.559874258360408e-07, + "loss": 0.5189507603645325, + "step": 2102 + }, + { + "epoch": 2.7279805352798054, + "grad_norm": 0.5680896639823914, + "learning_rate": 4.5172841129621726e-07, + "loss": 0.5085829496383667, + "step": 2103 + }, + { + "epoch": 2.7292781832927817, + "grad_norm": 0.5765218138694763, + "learning_rate": 4.474889205550881e-07, + "loss": 0.5140299797058105, + "step": 2104 + }, + { + "epoch": 2.7305758313057584, + "grad_norm": 0.587651252746582, + "learning_rate": 4.4326896228133354e-07, + "loss": 0.4957928955554962, + "step": 2105 + }, + { + "epoch": 2.7318734793187347, + "grad_norm": 0.5494794249534607, + "learning_rate": 4.3906854510370245e-07, + "loss": 0.5062738060951233, + "step": 2106 + }, + { + "epoch": 2.7331711273317114, + "grad_norm": 0.5937455296516418, + "learning_rate": 4.348876776109856e-07, + "loss": 0.5094043016433716, + "step": 2107 + }, + { + "epoch": 2.7344687753446877, + "grad_norm": 0.5641949772834778, + "learning_rate": 4.307263683519969e-07, + "loss": 0.48215553164482117, + "step": 2108 + }, + { + "epoch": 2.7357664233576644, + "grad_norm": 0.5819230079650879, + "learning_rate": 4.2658462583556216e-07, + "loss": 0.5357835292816162, + "step": 2109 + }, + { + "epoch": 2.7370640713706407, + "grad_norm": 0.5532712936401367, + "learning_rate": 4.2246245853049706e-07, + "loss": 0.47937077283859253, + "step": 2110 + }, + { + "epoch": 2.738361719383617, + "grad_norm": 0.6110063195228577, + "learning_rate": 4.1835987486558595e-07, + "loss": 0.4744276702404022, + "step": 2111 + }, + { + "epoch": 2.7396593673965937, + "grad_norm": 0.5573598146438599, + "learning_rate": 4.142768832295807e-07, + "loss": 0.5128625631332397, + "step": 2112 + }, + { + "epoch": 2.7409570154095704, + "grad_norm": 0.5569184422492981, + "learning_rate": 4.102134919711609e-07, + "loss": 0.47407659888267517, + "step": 2113 + }, + { + "epoch": 2.7422546634225466, + "grad_norm": 0.5868476629257202, + "learning_rate": 4.061697093989347e-07, + "loss": 0.5311683416366577, + "step": 2114 + }, + { + "epoch": 2.743552311435523, + "grad_norm": 0.5694899559020996, + "learning_rate": 4.021455437814148e-07, + "loss": 0.4629291892051697, + "step": 2115 + }, + { + "epoch": 2.7448499594484996, + "grad_norm": 0.5624482035636902, + "learning_rate": 3.981410033469979e-07, + "loss": 0.4855622351169586, + "step": 2116 + }, + { + "epoch": 2.7461476074614763, + "grad_norm": 0.576919436454773, + "learning_rate": 3.941560962839619e-07, + "loss": 0.47935816645622253, + "step": 2117 + }, + { + "epoch": 2.7474452554744526, + "grad_norm": 0.5966827869415283, + "learning_rate": 3.9019083074042784e-07, + "loss": 0.4561656415462494, + "step": 2118 + }, + { + "epoch": 2.748742903487429, + "grad_norm": 0.5702851414680481, + "learning_rate": 3.862452148243623e-07, + "loss": 0.4796487092971802, + "step": 2119 + }, + { + "epoch": 2.7500405515004056, + "grad_norm": 0.5755755305290222, + "learning_rate": 3.823192566035494e-07, + "loss": 0.5047421455383301, + "step": 2120 + }, + { + "epoch": 2.751338199513382, + "grad_norm": 0.5769697427749634, + "learning_rate": 3.7841296410558225e-07, + "loss": 0.48532968759536743, + "step": 2121 + }, + { + "epoch": 2.7526358475263586, + "grad_norm": 0.5873609781265259, + "learning_rate": 3.7452634531783935e-07, + "loss": 0.5122209787368774, + "step": 2122 + }, + { + "epoch": 2.753933495539335, + "grad_norm": 0.5939727425575256, + "learning_rate": 3.706594081874737e-07, + "loss": 0.49794304370880127, + "step": 2123 + }, + { + "epoch": 2.7552311435523116, + "grad_norm": 0.5834800601005554, + "learning_rate": 3.6681216062138923e-07, + "loss": 0.5340889096260071, + "step": 2124 + }, + { + "epoch": 2.756528791565288, + "grad_norm": 0.576677680015564, + "learning_rate": 3.6298461048623887e-07, + "loss": 0.5236599445343018, + "step": 2125 + }, + { + "epoch": 2.757826439578264, + "grad_norm": 0.5462478399276733, + "learning_rate": 3.5917676560838775e-07, + "loss": 0.47627806663513184, + "step": 2126 + }, + { + "epoch": 2.759124087591241, + "grad_norm": 0.5982619524002075, + "learning_rate": 3.5538863377392095e-07, + "loss": 0.4933459460735321, + "step": 2127 + }, + { + "epoch": 2.7604217356042176, + "grad_norm": 0.5802999138832092, + "learning_rate": 3.5162022272860475e-07, + "loss": 0.5381085872650146, + "step": 2128 + }, + { + "epoch": 2.761719383617194, + "grad_norm": 0.5820630788803101, + "learning_rate": 3.478715401778876e-07, + "loss": 0.5177547931671143, + "step": 2129 + }, + { + "epoch": 2.76301703163017, + "grad_norm": 0.6046480536460876, + "learning_rate": 3.44142593786877e-07, + "loss": 0.5715194940567017, + "step": 2130 + }, + { + "epoch": 2.764314679643147, + "grad_norm": 0.5816249847412109, + "learning_rate": 3.404333911803237e-07, + "loss": 0.48858851194381714, + "step": 2131 + }, + { + "epoch": 2.7656123276561235, + "grad_norm": 0.5709452629089355, + "learning_rate": 3.367439399426087e-07, + "loss": 0.5259594917297363, + "step": 2132 + }, + { + "epoch": 2.7669099756691, + "grad_norm": 0.5610825419425964, + "learning_rate": 3.330742476177273e-07, + "loss": 0.49785754084587097, + "step": 2133 + }, + { + "epoch": 2.768207623682076, + "grad_norm": 0.5751505494117737, + "learning_rate": 3.2942432170926743e-07, + "loss": 0.45043110847473145, + "step": 2134 + }, + { + "epoch": 2.769505271695053, + "grad_norm": 0.5675750374794006, + "learning_rate": 3.257941696804079e-07, + "loss": 0.5171366930007935, + "step": 2135 + }, + { + "epoch": 2.770802919708029, + "grad_norm": 0.5672844052314758, + "learning_rate": 3.2218379895388896e-07, + "loss": 0.467257022857666, + "step": 2136 + }, + { + "epoch": 2.7721005677210058, + "grad_norm": 0.6082518696784973, + "learning_rate": 3.185932169120043e-07, + "loss": 0.5202172994613647, + "step": 2137 + }, + { + "epoch": 2.773398215733982, + "grad_norm": 0.5631950497627258, + "learning_rate": 3.150224308965866e-07, + "loss": 0.5058823823928833, + "step": 2138 + }, + { + "epoch": 2.7746958637469588, + "grad_norm": 0.6380532383918762, + "learning_rate": 3.114714482089898e-07, + "loss": 0.5831983089447021, + "step": 2139 + }, + { + "epoch": 2.775993511759935, + "grad_norm": 0.5557391047477722, + "learning_rate": 3.079402761100736e-07, + "loss": 0.4567191004753113, + "step": 2140 + }, + { + "epoch": 2.7772911597729117, + "grad_norm": 0.562920868396759, + "learning_rate": 3.0442892182019236e-07, + "loss": 0.4184800386428833, + "step": 2141 + }, + { + "epoch": 2.778588807785888, + "grad_norm": 0.63033127784729, + "learning_rate": 3.00937392519175e-07, + "loss": 0.5374839901924133, + "step": 2142 + }, + { + "epoch": 2.7798864557988647, + "grad_norm": 0.5735025405883789, + "learning_rate": 2.974656953463173e-07, + "loss": 0.4503205716609955, + "step": 2143 + }, + { + "epoch": 2.781184103811841, + "grad_norm": 0.6051810383796692, + "learning_rate": 2.9401383740035983e-07, + "loss": 0.4981985092163086, + "step": 2144 + }, + { + "epoch": 2.7824817518248173, + "grad_norm": 0.6038339734077454, + "learning_rate": 2.905818257394799e-07, + "loss": 0.5327208638191223, + "step": 2145 + }, + { + "epoch": 2.783779399837794, + "grad_norm": 0.5686031579971313, + "learning_rate": 2.871696673812718e-07, + "loss": 0.4990962743759155, + "step": 2146 + }, + { + "epoch": 2.7850770478507707, + "grad_norm": 0.5870386958122253, + "learning_rate": 2.837773693027346e-07, + "loss": 0.5274587869644165, + "step": 2147 + }, + { + "epoch": 2.786374695863747, + "grad_norm": 0.6039890050888062, + "learning_rate": 2.8040493844026185e-07, + "loss": 0.4969175457954407, + "step": 2148 + }, + { + "epoch": 2.7876723438767232, + "grad_norm": 0.5605257749557495, + "learning_rate": 2.7705238168961867e-07, + "loss": 0.466129869222641, + "step": 2149 + }, + { + "epoch": 2.7889699918897, + "grad_norm": 0.5661087036132812, + "learning_rate": 2.7371970590593597e-07, + "loss": 0.5182359218597412, + "step": 2150 + }, + { + "epoch": 2.7902676399026762, + "grad_norm": 0.6032746434211731, + "learning_rate": 2.7040691790369165e-07, + "loss": 0.4847348928451538, + "step": 2151 + }, + { + "epoch": 2.791565287915653, + "grad_norm": 0.5873638987541199, + "learning_rate": 2.671140244567005e-07, + "loss": 0.4982571005821228, + "step": 2152 + }, + { + "epoch": 2.792862935928629, + "grad_norm": 0.5877160429954529, + "learning_rate": 2.6384103229809445e-07, + "loss": 0.47337985038757324, + "step": 2153 + }, + { + "epoch": 2.794160583941606, + "grad_norm": 0.6034269332885742, + "learning_rate": 2.605879481203144e-07, + "loss": 0.5359882116317749, + "step": 2154 + }, + { + "epoch": 2.795458231954582, + "grad_norm": 0.5855337381362915, + "learning_rate": 2.5735477857509406e-07, + "loss": 0.48935824632644653, + "step": 2155 + }, + { + "epoch": 2.796755879967559, + "grad_norm": 0.5761221647262573, + "learning_rate": 2.5414153027344846e-07, + "loss": 0.5092116594314575, + "step": 2156 + }, + { + "epoch": 2.798053527980535, + "grad_norm": 0.5906012654304504, + "learning_rate": 2.5094820978565416e-07, + "loss": 0.4823336601257324, + "step": 2157 + }, + { + "epoch": 2.799351175993512, + "grad_norm": 0.5929545164108276, + "learning_rate": 2.4777482364124695e-07, + "loss": 0.48247990012168884, + "step": 2158 + }, + { + "epoch": 2.800648824006488, + "grad_norm": 0.5614597797393799, + "learning_rate": 2.446213783289941e-07, + "loss": 0.48732107877731323, + "step": 2159 + }, + { + "epoch": 2.8019464720194645, + "grad_norm": 0.6198487281799316, + "learning_rate": 2.4148788029689565e-07, + "loss": 0.544142484664917, + "step": 2160 + }, + { + "epoch": 2.803244120032441, + "grad_norm": 0.5842984318733215, + "learning_rate": 2.3837433595216174e-07, + "loss": 0.5269244313240051, + "step": 2161 + }, + { + "epoch": 2.804541768045418, + "grad_norm": 0.5822996497154236, + "learning_rate": 2.3528075166120323e-07, + "loss": 0.49836334586143494, + "step": 2162 + }, + { + "epoch": 2.805839416058394, + "grad_norm": 0.5670111775398254, + "learning_rate": 2.3220713374961457e-07, + "loss": 0.5108374357223511, + "step": 2163 + }, + { + "epoch": 2.8071370640713704, + "grad_norm": 0.5872285962104797, + "learning_rate": 2.2915348850216955e-07, + "loss": 0.49880123138427734, + "step": 2164 + }, + { + "epoch": 2.808434712084347, + "grad_norm": 0.5544793605804443, + "learning_rate": 2.2611982216279693e-07, + "loss": 0.5181583166122437, + "step": 2165 + }, + { + "epoch": 2.809732360097324, + "grad_norm": 0.5830904245376587, + "learning_rate": 2.2310614093457917e-07, + "loss": 0.48121365904808044, + "step": 2166 + }, + { + "epoch": 2.8110300081103, + "grad_norm": 0.6001294255256653, + "learning_rate": 2.2011245097972812e-07, + "loss": 0.500962495803833, + "step": 2167 + }, + { + "epoch": 2.8123276561232764, + "grad_norm": 0.6160042881965637, + "learning_rate": 2.171387584195861e-07, + "loss": 0.5166311264038086, + "step": 2168 + }, + { + "epoch": 2.813625304136253, + "grad_norm": 0.5664080381393433, + "learning_rate": 2.1418506933459926e-07, + "loss": 0.4849929213523865, + "step": 2169 + }, + { + "epoch": 2.8149229521492294, + "grad_norm": 0.60596764087677, + "learning_rate": 2.1125138976431425e-07, + "loss": 0.5384441018104553, + "step": 2170 + }, + { + "epoch": 2.816220600162206, + "grad_norm": 0.6017642617225647, + "learning_rate": 2.0833772570736376e-07, + "loss": 0.5182196497917175, + "step": 2171 + }, + { + "epoch": 2.8175182481751824, + "grad_norm": 0.567242443561554, + "learning_rate": 2.0544408312145325e-07, + "loss": 0.5023871660232544, + "step": 2172 + }, + { + "epoch": 2.818815896188159, + "grad_norm": 0.5743298530578613, + "learning_rate": 2.025704679233498e-07, + "loss": 0.4737445116043091, + "step": 2173 + }, + { + "epoch": 2.8201135442011354, + "grad_norm": 0.5686278343200684, + "learning_rate": 1.9971688598886874e-07, + "loss": 0.4916064441204071, + "step": 2174 + }, + { + "epoch": 2.821411192214112, + "grad_norm": 0.5849027037620544, + "learning_rate": 1.9688334315286383e-07, + "loss": 0.5161796808242798, + "step": 2175 + }, + { + "epoch": 2.8227088402270883, + "grad_norm": 0.5709643959999084, + "learning_rate": 1.9406984520921156e-07, + "loss": 0.5027370452880859, + "step": 2176 + }, + { + "epoch": 2.824006488240065, + "grad_norm": 0.6077797412872314, + "learning_rate": 1.9127639791080345e-07, + "loss": 0.561673641204834, + "step": 2177 + }, + { + "epoch": 2.8253041362530413, + "grad_norm": 0.5836532711982727, + "learning_rate": 1.885030069695326e-07, + "loss": 0.5252400636672974, + "step": 2178 + }, + { + "epoch": 2.8266017842660176, + "grad_norm": 0.5875435471534729, + "learning_rate": 1.8574967805628174e-07, + "loss": 0.5136289596557617, + "step": 2179 + }, + { + "epoch": 2.8278994322789943, + "grad_norm": 0.5999600291252136, + "learning_rate": 1.8301641680090965e-07, + "loss": 0.5113690495491028, + "step": 2180 + }, + { + "epoch": 2.829197080291971, + "grad_norm": 0.5720099210739136, + "learning_rate": 1.8030322879224792e-07, + "loss": 0.5277208089828491, + "step": 2181 + }, + { + "epoch": 2.8304947283049473, + "grad_norm": 0.5587209463119507, + "learning_rate": 1.7761011957807439e-07, + "loss": 0.5302145481109619, + "step": 2182 + }, + { + "epoch": 2.8317923763179236, + "grad_norm": 0.574344277381897, + "learning_rate": 1.7493709466511965e-07, + "loss": 0.5009472370147705, + "step": 2183 + }, + { + "epoch": 2.8330900243309003, + "grad_norm": 0.5876274704933167, + "learning_rate": 1.7228415951904165e-07, + "loss": 0.49587976932525635, + "step": 2184 + }, + { + "epoch": 2.8343876723438766, + "grad_norm": 0.5799663662910461, + "learning_rate": 1.6965131956442004e-07, + "loss": 0.5200576782226562, + "step": 2185 + }, + { + "epoch": 2.8356853203568533, + "grad_norm": 0.5789362192153931, + "learning_rate": 1.670385801847485e-07, + "loss": 0.4996534585952759, + "step": 2186 + }, + { + "epoch": 2.8369829683698295, + "grad_norm": 0.5791637897491455, + "learning_rate": 1.6444594672241688e-07, + "loss": 0.5251076221466064, + "step": 2187 + }, + { + "epoch": 2.8382806163828063, + "grad_norm": 0.581289529800415, + "learning_rate": 1.6187342447870235e-07, + "loss": 0.47298407554626465, + "step": 2188 + }, + { + "epoch": 2.8395782643957825, + "grad_norm": 0.5624388456344604, + "learning_rate": 1.5932101871376503e-07, + "loss": 0.48804956674575806, + "step": 2189 + }, + { + "epoch": 2.8408759124087593, + "grad_norm": 0.5740110278129578, + "learning_rate": 1.567887346466257e-07, + "loss": 0.4583921730518341, + "step": 2190 + }, + { + "epoch": 2.8421735604217355, + "grad_norm": 0.5799588561058044, + "learning_rate": 1.54276577455168e-07, + "loss": 0.5046111345291138, + "step": 2191 + }, + { + "epoch": 2.8434712084347122, + "grad_norm": 0.5686801671981812, + "learning_rate": 1.517845522761141e-07, + "loss": 0.5424494743347168, + "step": 2192 + }, + { + "epoch": 2.8447688564476885, + "grad_norm": 0.5737746953964233, + "learning_rate": 1.4931266420502687e-07, + "loss": 0.5258438587188721, + "step": 2193 + }, + { + "epoch": 2.846066504460665, + "grad_norm": 0.5844926238059998, + "learning_rate": 1.468609182962899e-07, + "loss": 0.5294222831726074, + "step": 2194 + }, + { + "epoch": 2.8473641524736415, + "grad_norm": 0.6161758899688721, + "learning_rate": 1.4442931956310525e-07, + "loss": 0.48813527822494507, + "step": 2195 + }, + { + "epoch": 2.848661800486618, + "grad_norm": 0.5877721905708313, + "learning_rate": 1.420178729774746e-07, + "loss": 0.5104416608810425, + "step": 2196 + }, + { + "epoch": 2.8499594484995945, + "grad_norm": 0.607412576675415, + "learning_rate": 1.3962658347019819e-07, + "loss": 0.5552476644515991, + "step": 2197 + }, + { + "epoch": 2.8512570965125708, + "grad_norm": 0.5500598549842834, + "learning_rate": 1.372554559308559e-07, + "loss": 0.5361748933792114, + "step": 2198 + }, + { + "epoch": 2.8525547445255475, + "grad_norm": 0.5887991786003113, + "learning_rate": 1.3490449520780492e-07, + "loss": 0.5089778304100037, + "step": 2199 + }, + { + "epoch": 2.853852392538524, + "grad_norm": 0.5767118334770203, + "learning_rate": 1.3257370610816333e-07, + "loss": 0.4646577537059784, + "step": 2200 + }, + { + "epoch": 2.8551500405515005, + "grad_norm": 0.5947672128677368, + "learning_rate": 1.3026309339780442e-07, + "loss": 0.45190826058387756, + "step": 2201 + }, + { + "epoch": 2.8564476885644767, + "grad_norm": 0.576164722442627, + "learning_rate": 1.2797266180134994e-07, + "loss": 0.47920286655426025, + "step": 2202 + }, + { + "epoch": 2.8577453365774534, + "grad_norm": 0.5928218364715576, + "learning_rate": 1.2570241600214805e-07, + "loss": 0.4952476918697357, + "step": 2203 + }, + { + "epoch": 2.8590429845904297, + "grad_norm": 0.5796513557434082, + "learning_rate": 1.2345236064228216e-07, + "loss": 0.4798247218132019, + "step": 2204 + }, + { + "epoch": 2.8603406326034064, + "grad_norm": 0.6173388361930847, + "learning_rate": 1.212225003225409e-07, + "loss": 0.5353522300720215, + "step": 2205 + }, + { + "epoch": 2.8616382806163827, + "grad_norm": 0.582225501537323, + "learning_rate": 1.1901283960242704e-07, + "loss": 0.4966939091682434, + "step": 2206 + }, + { + "epoch": 2.8629359286293594, + "grad_norm": 0.573807954788208, + "learning_rate": 1.168233830001364e-07, + "loss": 0.5133891701698303, + "step": 2207 + }, + { + "epoch": 2.8642335766423357, + "grad_norm": 0.5719092488288879, + "learning_rate": 1.1465413499255452e-07, + "loss": 0.5084906816482544, + "step": 2208 + }, + { + "epoch": 2.8655312246553124, + "grad_norm": 0.563827395439148, + "learning_rate": 1.1250510001524329e-07, + "loss": 0.551742434501648, + "step": 2209 + }, + { + "epoch": 2.8668288726682887, + "grad_norm": 0.5915552973747253, + "learning_rate": 1.103762824624377e-07, + "loss": 0.5108176469802856, + "step": 2210 + }, + { + "epoch": 2.8681265206812654, + "grad_norm": 0.5619785189628601, + "learning_rate": 1.0826768668702691e-07, + "loss": 0.5008025169372559, + "step": 2211 + }, + { + "epoch": 2.8694241686942417, + "grad_norm": 0.5829325914382935, + "learning_rate": 1.0617931700055984e-07, + "loss": 0.5187573432922363, + "step": 2212 + }, + { + "epoch": 2.870721816707218, + "grad_norm": 0.6110272407531738, + "learning_rate": 1.0411117767322065e-07, + "loss": 0.5479835271835327, + "step": 2213 + }, + { + "epoch": 2.8720194647201946, + "grad_norm": 0.5755971074104309, + "learning_rate": 1.0206327293383222e-07, + "loss": 0.5030970573425293, + "step": 2214 + }, + { + "epoch": 2.8733171127331714, + "grad_norm": 0.5851888060569763, + "learning_rate": 1.000356069698416e-07, + "loss": 0.5171909928321838, + "step": 2215 + }, + { + "epoch": 2.8746147607461476, + "grad_norm": 0.558315098285675, + "learning_rate": 9.802818392731117e-08, + "loss": 0.47078371047973633, + "step": 2216 + }, + { + "epoch": 2.875912408759124, + "grad_norm": 0.6229851841926575, + "learning_rate": 9.60410079109153e-08, + "loss": 0.5632795095443726, + "step": 2217 + }, + { + "epoch": 2.8772100567721006, + "grad_norm": 0.5876999497413635, + "learning_rate": 9.407408298392373e-08, + "loss": 0.5133551359176636, + "step": 2218 + }, + { + "epoch": 2.878507704785077, + "grad_norm": 0.5872880220413208, + "learning_rate": 9.212741316820039e-08, + "loss": 0.4713757038116455, + "step": 2219 + }, + { + "epoch": 2.8798053527980536, + "grad_norm": 0.5895143747329712, + "learning_rate": 9.020100244419461e-08, + "loss": 0.5900079607963562, + "step": 2220 + }, + { + "epoch": 2.88110300081103, + "grad_norm": 0.5657681822776794, + "learning_rate": 8.829485475092548e-08, + "loss": 0.5136827230453491, + "step": 2221 + }, + { + "epoch": 2.8824006488240066, + "grad_norm": 0.8106376528739929, + "learning_rate": 8.640897398598525e-08, + "loss": 0.6291136741638184, + "step": 2222 + }, + { + "epoch": 2.883698296836983, + "grad_norm": 0.5875924825668335, + "learning_rate": 8.454336400552154e-08, + "loss": 0.4933609962463379, + "step": 2223 + }, + { + "epoch": 2.8849959448499596, + "grad_norm": 0.5977309346199036, + "learning_rate": 8.269802862423405e-08, + "loss": 0.5197732448577881, + "step": 2224 + }, + { + "epoch": 2.886293592862936, + "grad_norm": 0.5707021951675415, + "learning_rate": 8.087297161536778e-08, + "loss": 0.5037369132041931, + "step": 2225 + }, + { + "epoch": 2.8875912408759126, + "grad_norm": 0.5633382797241211, + "learning_rate": 7.906819671070098e-08, + "loss": 0.4686581492424011, + "step": 2226 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.5665260553359985, + "learning_rate": 7.728370760054283e-08, + "loss": 0.4968178868293762, + "step": 2227 + }, + { + "epoch": 2.890186536901865, + "grad_norm": 0.557956874370575, + "learning_rate": 7.55195079337212e-08, + "loss": 0.4842921793460846, + "step": 2228 + }, + { + "epoch": 2.891484184914842, + "grad_norm": 0.5774162411689758, + "learning_rate": 7.377560131757832e-08, + "loss": 0.48150286078453064, + "step": 2229 + }, + { + "epoch": 2.8927818329278185, + "grad_norm": 0.5605522990226746, + "learning_rate": 7.205199131796182e-08, + "loss": 0.47593769431114197, + "step": 2230 + }, + { + "epoch": 2.894079480940795, + "grad_norm": 0.5713371634483337, + "learning_rate": 7.034868145921802e-08, + "loss": 0.5388371348381042, + "step": 2231 + }, + { + "epoch": 2.895377128953771, + "grad_norm": 0.566564679145813, + "learning_rate": 6.866567522418322e-08, + "loss": 0.5253296494483948, + "step": 2232 + }, + { + "epoch": 2.896674776966748, + "grad_norm": 0.6038841605186462, + "learning_rate": 6.700297605418127e-08, + "loss": 0.4850519895553589, + "step": 2233 + }, + { + "epoch": 2.8979724249797245, + "grad_norm": 0.5850130915641785, + "learning_rate": 6.53605873490093e-08, + "loss": 0.526265025138855, + "step": 2234 + }, + { + "epoch": 2.899270072992701, + "grad_norm": 0.5685164332389832, + "learning_rate": 6.373851246693763e-08, + "loss": 0.49016064405441284, + "step": 2235 + }, + { + "epoch": 2.900567721005677, + "grad_norm": 0.585509717464447, + "learning_rate": 6.21367547246976e-08, + "loss": 0.49361756443977356, + "step": 2236 + }, + { + "epoch": 2.9018653690186538, + "grad_norm": 0.5846717357635498, + "learning_rate": 6.055531739747933e-08, + "loss": 0.5073826313018799, + "step": 2237 + }, + { + "epoch": 2.90316301703163, + "grad_norm": 0.6035211682319641, + "learning_rate": 5.899420371892173e-08, + "loss": 0.4748195707798004, + "step": 2238 + }, + { + "epoch": 2.9044606650446068, + "grad_norm": 0.5725396275520325, + "learning_rate": 5.745341688110806e-08, + "loss": 0.49574536085128784, + "step": 2239 + }, + { + "epoch": 2.905758313057583, + "grad_norm": 0.5700922012329102, + "learning_rate": 5.593296003455595e-08, + "loss": 0.4746463894844055, + "step": 2240 + }, + { + "epoch": 2.9070559610705597, + "grad_norm": 0.5627117156982422, + "learning_rate": 5.4432836288215165e-08, + "loss": 0.512833833694458, + "step": 2241 + }, + { + "epoch": 2.908353609083536, + "grad_norm": 0.5812812447547913, + "learning_rate": 5.2953048709459834e-08, + "loss": 0.48332545161247253, + "step": 2242 + }, + { + "epoch": 2.9096512570965127, + "grad_norm": 0.5835334062576294, + "learning_rate": 5.1493600324080684e-08, + "loss": 0.507304847240448, + "step": 2243 + }, + { + "epoch": 2.910948905109489, + "grad_norm": 0.5789167284965515, + "learning_rate": 5.0054494116279497e-08, + "loss": 0.5132785439491272, + "step": 2244 + }, + { + "epoch": 2.9122465531224657, + "grad_norm": 0.5582759976387024, + "learning_rate": 4.8635733028664644e-08, + "loss": 0.4791605472564697, + "step": 2245 + }, + { + "epoch": 2.913544201135442, + "grad_norm": 0.5968536138534546, + "learning_rate": 4.723731996224446e-08, + "loss": 0.5294557809829712, + "step": 2246 + }, + { + "epoch": 2.9148418491484183, + "grad_norm": 0.5799421072006226, + "learning_rate": 4.585925777641831e-08, + "loss": 0.5392569303512573, + "step": 2247 + }, + { + "epoch": 2.916139497161395, + "grad_norm": 0.5876581072807312, + "learning_rate": 4.450154928897443e-08, + "loss": 0.5044458508491516, + "step": 2248 + }, + { + "epoch": 2.9174371451743717, + "grad_norm": 0.5795705914497375, + "learning_rate": 4.316419727608434e-08, + "loss": 0.518474280834198, + "step": 2249 + }, + { + "epoch": 2.918734793187348, + "grad_norm": 0.5783658027648926, + "learning_rate": 4.1847204472293954e-08, + "loss": 0.5036035180091858, + "step": 2250 + }, + { + "epoch": 2.9200324412003242, + "grad_norm": 0.5799797773361206, + "learning_rate": 4.055057357052139e-08, + "loss": 0.5075333118438721, + "step": 2251 + }, + { + "epoch": 2.921330089213301, + "grad_norm": 0.5816603899002075, + "learning_rate": 3.927430722204473e-08, + "loss": 0.49955567717552185, + "step": 2252 + }, + { + "epoch": 2.9226277372262772, + "grad_norm": 0.5603087544441223, + "learning_rate": 3.801840803651091e-08, + "loss": 0.4799802005290985, + "step": 2253 + }, + { + "epoch": 2.923925385239254, + "grad_norm": 0.5984447598457336, + "learning_rate": 3.678287858191132e-08, + "loss": 0.4863054156303406, + "step": 2254 + }, + { + "epoch": 2.92522303325223, + "grad_norm": 0.5684608817100525, + "learning_rate": 3.5567721384593965e-08, + "loss": 0.5202617645263672, + "step": 2255 + }, + { + "epoch": 2.926520681265207, + "grad_norm": 0.6067941784858704, + "learning_rate": 3.437293892924576e-08, + "loss": 0.5111681818962097, + "step": 2256 + }, + { + "epoch": 2.927818329278183, + "grad_norm": 0.6141681671142578, + "learning_rate": 3.3198533658895804e-08, + "loss": 0.5316765904426575, + "step": 2257 + }, + { + "epoch": 2.92911597729116, + "grad_norm": 0.5799176096916199, + "learning_rate": 3.2044507974905433e-08, + "loss": 0.46131962537765503, + "step": 2258 + }, + { + "epoch": 2.930413625304136, + "grad_norm": 0.5954794883728027, + "learning_rate": 3.091086423696377e-08, + "loss": 0.520176887512207, + "step": 2259 + }, + { + "epoch": 2.931711273317113, + "grad_norm": 0.5652449131011963, + "learning_rate": 2.9797604763087684e-08, + "loss": 0.5085136890411377, + "step": 2260 + }, + { + "epoch": 2.933008921330089, + "grad_norm": 0.5852287411689758, + "learning_rate": 2.8704731829609643e-08, + "loss": 0.5083173513412476, + "step": 2261 + }, + { + "epoch": 2.9343065693430654, + "grad_norm": 0.5846629738807678, + "learning_rate": 2.763224767117767e-08, + "loss": 0.5292702913284302, + "step": 2262 + }, + { + "epoch": 2.935604217356042, + "grad_norm": 0.5861793756484985, + "learning_rate": 2.6580154480750907e-08, + "loss": 0.5053665637969971, + "step": 2263 + }, + { + "epoch": 2.936901865369019, + "grad_norm": 0.5602736473083496, + "learning_rate": 2.554845440959408e-08, + "loss": 0.5189537405967712, + "step": 2264 + }, + { + "epoch": 2.938199513381995, + "grad_norm": 0.5991557240486145, + "learning_rate": 2.4537149567271935e-08, + "loss": 0.5867321491241455, + "step": 2265 + }, + { + "epoch": 2.9394971613949714, + "grad_norm": 0.5465215444564819, + "learning_rate": 2.3546242021648126e-08, + "loss": 0.5084092617034912, + "step": 2266 + }, + { + "epoch": 2.940794809407948, + "grad_norm": 0.6008067727088928, + "learning_rate": 2.2575733798876342e-08, + "loss": 0.5280360579490662, + "step": 2267 + }, + { + "epoch": 2.942092457420925, + "grad_norm": 0.5549503564834595, + "learning_rate": 2.162562688340142e-08, + "loss": 0.4592389762401581, + "step": 2268 + }, + { + "epoch": 2.943390105433901, + "grad_norm": 0.600985586643219, + "learning_rate": 2.0695923217950442e-08, + "loss": 0.5138071179389954, + "step": 2269 + }, + { + "epoch": 2.9446877534468774, + "grad_norm": 0.5776973366737366, + "learning_rate": 1.9786624703532764e-08, + "loss": 0.560516357421875, + "step": 2270 + }, + { + "epoch": 2.945985401459854, + "grad_norm": 0.5803866982460022, + "learning_rate": 1.8897733199434443e-08, + "loss": 0.48770207166671753, + "step": 2271 + }, + { + "epoch": 2.9472830494728304, + "grad_norm": 0.5844945907592773, + "learning_rate": 1.8029250523211582e-08, + "loss": 0.5004736185073853, + "step": 2272 + }, + { + "epoch": 2.948580697485807, + "grad_norm": 0.5826125144958496, + "learning_rate": 1.718117845069367e-08, + "loss": 0.4950000047683716, + "step": 2273 + }, + { + "epoch": 2.9498783454987834, + "grad_norm": 0.5776214003562927, + "learning_rate": 1.635351871597246e-08, + "loss": 0.5560945868492126, + "step": 2274 + }, + { + "epoch": 2.95117599351176, + "grad_norm": 0.565700352191925, + "learning_rate": 1.554627301140199e-08, + "loss": 0.4630610942840576, + "step": 2275 + }, + { + "epoch": 2.9524736415247363, + "grad_norm": 0.5994547605514526, + "learning_rate": 1.4759442987596351e-08, + "loss": 0.5141358375549316, + "step": 2276 + }, + { + "epoch": 2.9537712895377126, + "grad_norm": 0.573093831539154, + "learning_rate": 1.3993030253423023e-08, + "loss": 0.4815256893634796, + "step": 2277 + }, + { + "epoch": 2.9550689375506893, + "grad_norm": 0.5978487730026245, + "learning_rate": 1.3247036376002886e-08, + "loss": 0.5149579048156738, + "step": 2278 + }, + { + "epoch": 2.956366585563666, + "grad_norm": 0.6069895625114441, + "learning_rate": 1.252146288070355e-08, + "loss": 0.5201846361160278, + "step": 2279 + }, + { + "epoch": 2.9576642335766423, + "grad_norm": 0.5879092216491699, + "learning_rate": 1.1816311251140466e-08, + "loss": 0.5039907693862915, + "step": 2280 + }, + { + "epoch": 2.9589618815896186, + "grad_norm": 0.5550662875175476, + "learning_rate": 1.113158292916916e-08, + "loss": 0.5198723077774048, + "step": 2281 + }, + { + "epoch": 2.9602595296025953, + "grad_norm": 0.5664054155349731, + "learning_rate": 1.0467279314886336e-08, + "loss": 0.5281890630722046, + "step": 2282 + }, + { + "epoch": 2.961557177615572, + "grad_norm": 0.5738133788108826, + "learning_rate": 9.82340176662433e-09, + "loss": 0.47895991802215576, + "step": 2283 + }, + { + "epoch": 2.9628548256285483, + "grad_norm": 0.5834701657295227, + "learning_rate": 9.199951600951106e-09, + "loss": 0.49841928482055664, + "step": 2284 + }, + { + "epoch": 2.9641524736415246, + "grad_norm": 0.553411602973938, + "learning_rate": 8.596930092662493e-09, + "loss": 0.5044345855712891, + "step": 2285 + }, + { + "epoch": 2.9654501216545013, + "grad_norm": 0.5765789151191711, + "learning_rate": 8.014338474785499e-09, + "loss": 0.45714667439460754, + "step": 2286 + }, + { + "epoch": 2.9667477696674776, + "grad_norm": 0.5678233504295349, + "learning_rate": 7.45217793857389e-09, + "loss": 0.5142921209335327, + "step": 2287 + }, + { + "epoch": 2.9680454176804543, + "grad_norm": 0.5809730887413025, + "learning_rate": 6.910449633501515e-09, + "loss": 0.5097491145133972, + "step": 2288 + }, + { + "epoch": 2.9693430656934305, + "grad_norm": 0.863067626953125, + "learning_rate": 6.389154667266751e-09, + "loss": 0.49733829498291016, + "step": 2289 + }, + { + "epoch": 2.9706407137064073, + "grad_norm": 0.5724239349365234, + "learning_rate": 5.888294105785841e-09, + "loss": 0.5271996855735779, + "step": 2290 + }, + { + "epoch": 2.9719383617193835, + "grad_norm": 0.5894045829772949, + "learning_rate": 5.407868973191788e-09, + "loss": 0.5507649183273315, + "step": 2291 + }, + { + "epoch": 2.9732360097323602, + "grad_norm": 0.5670002698898315, + "learning_rate": 4.947880251832127e-09, + "loss": 0.5069165229797363, + "step": 2292 + }, + { + "epoch": 2.9745336577453365, + "grad_norm": 0.6079567074775696, + "learning_rate": 4.508328882268931e-09, + "loss": 0.5027692317962646, + "step": 2293 + }, + { + "epoch": 2.9758313057583132, + "grad_norm": 0.5965436697006226, + "learning_rate": 4.089215763271037e-09, + "loss": 0.4549415707588196, + "step": 2294 + }, + { + "epoch": 2.9771289537712895, + "grad_norm": 0.5540100336074829, + "learning_rate": 3.6905417518195985e-09, + "loss": 0.5082988739013672, + "step": 2295 + }, + { + "epoch": 2.9784266017842658, + "grad_norm": 0.5584218502044678, + "learning_rate": 3.312307663103642e-09, + "loss": 0.49896612763404846, + "step": 2296 + }, + { + "epoch": 2.9797242497972425, + "grad_norm": 0.5825123190879822, + "learning_rate": 2.954514270513409e-09, + "loss": 0.5268645286560059, + "step": 2297 + }, + { + "epoch": 2.981021897810219, + "grad_norm": 0.6069872379302979, + "learning_rate": 2.6171623056481245e-09, + "loss": 0.5306706428527832, + "step": 2298 + }, + { + "epoch": 2.9823195458231955, + "grad_norm": 0.619730532169342, + "learning_rate": 2.300252458306007e-09, + "loss": 0.5466433167457581, + "step": 2299 + }, + { + "epoch": 2.9836171938361717, + "grad_norm": 0.575143039226532, + "learning_rate": 2.0037853764887096e-09, + "loss": 0.5247520804405212, + "step": 2300 + }, + { + "epoch": 2.9836171938361717, + "eval_loss": 0.6951664686203003, + "eval_runtime": 72.3726, + "eval_samples_per_second": 71.74, + "eval_steps_per_second": 8.967, + "step": 2300 + }, + { + "epoch": 2.9849148418491485, + "grad_norm": 0.5698785781860352, + "learning_rate": 1.7277616663946562e-09, + "loss": 0.5104506015777588, + "step": 2301 + }, + { + "epoch": 2.986212489862125, + "grad_norm": 0.5820271372795105, + "learning_rate": 1.4721818924223752e-09, + "loss": 0.5188534259796143, + "step": 2302 + }, + { + "epoch": 2.9875101378751014, + "grad_norm": 0.5771408081054688, + "learning_rate": 1.2370465771693874e-09, + "loss": 0.5191137194633484, + "step": 2303 + }, + { + "epoch": 2.9888077858880777, + "grad_norm": 0.555460512638092, + "learning_rate": 1.0223562014277654e-09, + "loss": 0.4951835870742798, + "step": 2304 + }, + { + "epoch": 2.9901054339010544, + "grad_norm": 0.602135956287384, + "learning_rate": 8.281112041841343e-10, + "loss": 0.5143213272094727, + "step": 2305 + }, + { + "epoch": 2.9914030819140307, + "grad_norm": 0.5755578875541687, + "learning_rate": 6.543119826207811e-10, + "loss": 0.5067423582077026, + "step": 2306 + }, + { + "epoch": 2.9927007299270074, + "grad_norm": 0.585641622543335, + "learning_rate": 5.009588921123243e-10, + "loss": 0.49582135677337646, + "step": 2307 + }, + { + "epoch": 2.9939983779399837, + "grad_norm": 0.5883374214172363, + "learning_rate": 3.680522462279346e-10, + "loss": 0.4730003774166107, + "step": 2308 + }, + { + "epoch": 2.9952960259529604, + "grad_norm": 0.585075318813324, + "learning_rate": 2.555923167291141e-10, + "loss": 0.5166332721710205, + "step": 2309 + }, + { + "epoch": 2.9965936739659367, + "grad_norm": 0.5931539535522461, + "learning_rate": 1.635793335652558e-10, + "loss": 0.5443276166915894, + "step": 2310 + }, + { + "epoch": 2.997891321978913, + "grad_norm": 0.6000698804855347, + "learning_rate": 9.20134848814147e-11, + "loss": 0.4828116297721863, + "step": 2311 + }, + { + "epoch": 2.9991889699918897, + "grad_norm": 0.5825672149658203, + "learning_rate": 4.08949170105366e-11, + "loss": 0.48934438824653625, + "step": 2312 + }, + { + "epoch": 3.0, + "grad_norm": 0.8691220283508301, + "learning_rate": 1.022373447900904e-11, + "loss": 0.5870037078857422, + "step": 2313 + }, + { + "epoch": 3.0, + "step": 2313, + "total_flos": 8.852766725217714e+18, + "train_loss": 0.5397342537911073, + "train_runtime": 26894.7398, + "train_samples_per_second": 11.002, + "train_steps_per_second": 0.086 + } + ], + "logging_steps": 1.0, + "max_steps": 2313, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 230, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.852766725217714e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}