Qwen3-4B-SFT-science-2e-5 / trainer_state.json
guangyangnlp's picture
Upload folder using huggingface_hub
a155fe9 verified
Raw
History Blame Contribute Delete
433 kB
{
"best_global_step": 1380,
"best_metric": 0.6770720481872559,
"best_model_checkpoint": "saves/qwen3-4B/Qwen3-4B-SFT-science-2e-5/checkpoint-1380",
"epoch": 3.0,
"eval_steps": 230,
"global_step": 2313,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012976480129764801,
"grad_norm": 8.15907096862793,
"learning_rate": 0.0,
"loss": 1.117659091949463,
"step": 1
},
{
"epoch": 0.0025952960259529602,
"grad_norm": 7.67869234085083,
"learning_rate": 1.7241379310344828e-07,
"loss": 1.0263863801956177,
"step": 2
},
{
"epoch": 0.0038929440389294406,
"grad_norm": 8.24106502532959,
"learning_rate": 3.4482758620689656e-07,
"loss": 1.1220319271087646,
"step": 3
},
{
"epoch": 0.0051905920519059205,
"grad_norm": 8.60258960723877,
"learning_rate": 5.172413793103449e-07,
"loss": 1.1806347370147705,
"step": 4
},
{
"epoch": 0.006488240064882401,
"grad_norm": 7.782258033752441,
"learning_rate": 6.896551724137931e-07,
"loss": 1.105953574180603,
"step": 5
},
{
"epoch": 0.007785888077858881,
"grad_norm": 7.797566890716553,
"learning_rate": 8.620689655172415e-07,
"loss": 1.0968478918075562,
"step": 6
},
{
"epoch": 0.009083536090835361,
"grad_norm": 7.626895427703857,
"learning_rate": 1.0344827586206898e-06,
"loss": 1.0549066066741943,
"step": 7
},
{
"epoch": 0.010381184103811841,
"grad_norm": 7.147245407104492,
"learning_rate": 1.2068965517241381e-06,
"loss": 1.0259548425674438,
"step": 8
},
{
"epoch": 0.01167883211678832,
"grad_norm": 5.977053165435791,
"learning_rate": 1.3793103448275862e-06,
"loss": 0.954434335231781,
"step": 9
},
{
"epoch": 0.012976480129764802,
"grad_norm": 6.206176280975342,
"learning_rate": 1.5517241379310346e-06,
"loss": 1.049869418144226,
"step": 10
},
{
"epoch": 0.014274128142741281,
"grad_norm": 5.300525665283203,
"learning_rate": 1.724137931034483e-06,
"loss": 1.0076310634613037,
"step": 11
},
{
"epoch": 0.015571776155717762,
"grad_norm": 4.235332489013672,
"learning_rate": 1.896551724137931e-06,
"loss": 0.9547766447067261,
"step": 12
},
{
"epoch": 0.01686942416869424,
"grad_norm": 4.258054733276367,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.9808558225631714,
"step": 13
},
{
"epoch": 0.018167072181670723,
"grad_norm": 3.9000754356384277,
"learning_rate": 2.241379310344828e-06,
"loss": 0.955378532409668,
"step": 14
},
{
"epoch": 0.019464720194647202,
"grad_norm": 2.9283816814422607,
"learning_rate": 2.4137931034482762e-06,
"loss": 0.9264786839485168,
"step": 15
},
{
"epoch": 0.020762368207623682,
"grad_norm": 2.1859076023101807,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.8895066380500793,
"step": 16
},
{
"epoch": 0.02206001622060016,
"grad_norm": 2.1717398166656494,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.9194827675819397,
"step": 17
},
{
"epoch": 0.02335766423357664,
"grad_norm": 1.7686649560928345,
"learning_rate": 2.931034482758621e-06,
"loss": 0.8839207291603088,
"step": 18
},
{
"epoch": 0.024655312246553124,
"grad_norm": 1.7060308456420898,
"learning_rate": 3.103448275862069e-06,
"loss": 0.8821989297866821,
"step": 19
},
{
"epoch": 0.025952960259529603,
"grad_norm": 1.4888310432434082,
"learning_rate": 3.2758620689655175e-06,
"loss": 0.7937015295028687,
"step": 20
},
{
"epoch": 0.027250608272506083,
"grad_norm": 1.5812122821807861,
"learning_rate": 3.448275862068966e-06,
"loss": 0.9222494959831238,
"step": 21
},
{
"epoch": 0.028548256285482562,
"grad_norm": 1.5842291116714478,
"learning_rate": 3.620689655172414e-06,
"loss": 0.8129012584686279,
"step": 22
},
{
"epoch": 0.02984590429845904,
"grad_norm": 1.5270442962646484,
"learning_rate": 3.793103448275862e-06,
"loss": 0.843705415725708,
"step": 23
},
{
"epoch": 0.031143552311435525,
"grad_norm": 1.1963210105895996,
"learning_rate": 3.96551724137931e-06,
"loss": 0.7932494878768921,
"step": 24
},
{
"epoch": 0.032441200324412,
"grad_norm": 1.0309710502624512,
"learning_rate": 4.137931034482759e-06,
"loss": 0.7899153828620911,
"step": 25
},
{
"epoch": 0.03373884833738848,
"grad_norm": 0.9451068639755249,
"learning_rate": 4.310344827586207e-06,
"loss": 0.8323757648468018,
"step": 26
},
{
"epoch": 0.035036496350364967,
"grad_norm": 0.9398018717765808,
"learning_rate": 4.482758620689656e-06,
"loss": 0.8048505187034607,
"step": 27
},
{
"epoch": 0.036334144363341446,
"grad_norm": 0.8759371042251587,
"learning_rate": 4.655172413793104e-06,
"loss": 0.8321108222007751,
"step": 28
},
{
"epoch": 0.037631792376317925,
"grad_norm": 0.7862148284912109,
"learning_rate": 4.8275862068965525e-06,
"loss": 0.8356962203979492,
"step": 29
},
{
"epoch": 0.038929440389294405,
"grad_norm": 0.8221083283424377,
"learning_rate": 5e-06,
"loss": 0.856194794178009,
"step": 30
},
{
"epoch": 0.040227088402270884,
"grad_norm": 0.7913339734077454,
"learning_rate": 5.172413793103449e-06,
"loss": 0.782647967338562,
"step": 31
},
{
"epoch": 0.041524736415247364,
"grad_norm": 0.7948570847511292,
"learning_rate": 5.344827586206896e-06,
"loss": 0.8002289533615112,
"step": 32
},
{
"epoch": 0.04282238442822384,
"grad_norm": 0.8172705769538879,
"learning_rate": 5.517241379310345e-06,
"loss": 0.8037389516830444,
"step": 33
},
{
"epoch": 0.04412003244120032,
"grad_norm": 0.7674341797828674,
"learning_rate": 5.689655172413794e-06,
"loss": 0.7561640739440918,
"step": 34
},
{
"epoch": 0.0454176804541768,
"grad_norm": 0.7508828043937683,
"learning_rate": 5.862068965517242e-06,
"loss": 0.820884108543396,
"step": 35
},
{
"epoch": 0.04671532846715328,
"grad_norm": 0.7388272285461426,
"learning_rate": 6.03448275862069e-06,
"loss": 0.8406673669815063,
"step": 36
},
{
"epoch": 0.04801297648012977,
"grad_norm": 0.6549146771430969,
"learning_rate": 6.206896551724138e-06,
"loss": 0.7618731260299683,
"step": 37
},
{
"epoch": 0.04931062449310625,
"grad_norm": 0.6996558904647827,
"learning_rate": 6.379310344827587e-06,
"loss": 0.7531220316886902,
"step": 38
},
{
"epoch": 0.05060827250608273,
"grad_norm": 0.659206748008728,
"learning_rate": 6.551724137931035e-06,
"loss": 0.8432419896125793,
"step": 39
},
{
"epoch": 0.05190592051905921,
"grad_norm": 0.6969435811042786,
"learning_rate": 6.724137931034484e-06,
"loss": 0.8152772784233093,
"step": 40
},
{
"epoch": 0.053203568532035686,
"grad_norm": 0.638674795627594,
"learning_rate": 6.896551724137932e-06,
"loss": 0.8012467622756958,
"step": 41
},
{
"epoch": 0.054501216545012166,
"grad_norm": 0.6248321533203125,
"learning_rate": 7.0689655172413796e-06,
"loss": 0.7576991319656372,
"step": 42
},
{
"epoch": 0.055798864557988645,
"grad_norm": 0.6499493718147278,
"learning_rate": 7.241379310344828e-06,
"loss": 0.7685450911521912,
"step": 43
},
{
"epoch": 0.057096512570965124,
"grad_norm": 0.6266531348228455,
"learning_rate": 7.413793103448277e-06,
"loss": 0.7682685852050781,
"step": 44
},
{
"epoch": 0.058394160583941604,
"grad_norm": 0.6328745484352112,
"learning_rate": 7.586206896551724e-06,
"loss": 0.8221952319145203,
"step": 45
},
{
"epoch": 0.05969180859691808,
"grad_norm": 0.6457077860832214,
"learning_rate": 7.758620689655173e-06,
"loss": 0.7616772055625916,
"step": 46
},
{
"epoch": 0.06098945660989456,
"grad_norm": 0.6841326951980591,
"learning_rate": 7.93103448275862e-06,
"loss": 0.7185612916946411,
"step": 47
},
{
"epoch": 0.06228710462287105,
"grad_norm": 0.653884768486023,
"learning_rate": 8.103448275862069e-06,
"loss": 0.8144221901893616,
"step": 48
},
{
"epoch": 0.06358475263584752,
"grad_norm": 0.6235163807868958,
"learning_rate": 8.275862068965518e-06,
"loss": 0.7789400815963745,
"step": 49
},
{
"epoch": 0.064882400648824,
"grad_norm": 0.6035148501396179,
"learning_rate": 8.448275862068966e-06,
"loss": 0.7788746356964111,
"step": 50
},
{
"epoch": 0.06618004866180048,
"grad_norm": 0.6197084784507751,
"learning_rate": 8.620689655172414e-06,
"loss": 0.7773774266242981,
"step": 51
},
{
"epoch": 0.06747769667477696,
"grad_norm": 0.6356611847877502,
"learning_rate": 8.793103448275862e-06,
"loss": 0.8119993209838867,
"step": 52
},
{
"epoch": 0.06877534468775345,
"grad_norm": 0.6229863166809082,
"learning_rate": 8.965517241379312e-06,
"loss": 0.8156378269195557,
"step": 53
},
{
"epoch": 0.07007299270072993,
"grad_norm": 0.6285703778266907,
"learning_rate": 9.13793103448276e-06,
"loss": 0.7589212656021118,
"step": 54
},
{
"epoch": 0.07137064071370641,
"grad_norm": 0.6221722960472107,
"learning_rate": 9.310344827586207e-06,
"loss": 0.7588199377059937,
"step": 55
},
{
"epoch": 0.07266828872668289,
"grad_norm": 0.5896920561790466,
"learning_rate": 9.482758620689655e-06,
"loss": 0.7869905233383179,
"step": 56
},
{
"epoch": 0.07396593673965937,
"grad_norm": 0.6120532155036926,
"learning_rate": 9.655172413793105e-06,
"loss": 0.7379593849182129,
"step": 57
},
{
"epoch": 0.07526358475263585,
"grad_norm": 0.6437456011772156,
"learning_rate": 9.827586206896553e-06,
"loss": 0.8263105154037476,
"step": 58
},
{
"epoch": 0.07656123276561233,
"grad_norm": 0.6005666851997375,
"learning_rate": 1e-05,
"loss": 0.8053442239761353,
"step": 59
},
{
"epoch": 0.07785888077858881,
"grad_norm": 0.618229866027832,
"learning_rate": 1.0172413793103449e-05,
"loss": 0.7303550243377686,
"step": 60
},
{
"epoch": 0.07915652879156529,
"grad_norm": 0.6245790719985962,
"learning_rate": 1.0344827586206898e-05,
"loss": 0.7618341445922852,
"step": 61
},
{
"epoch": 0.08045417680454177,
"grad_norm": 0.632989227771759,
"learning_rate": 1.0517241379310346e-05,
"loss": 0.8073338270187378,
"step": 62
},
{
"epoch": 0.08175182481751825,
"grad_norm": 0.6083235740661621,
"learning_rate": 1.0689655172413792e-05,
"loss": 0.7776636481285095,
"step": 63
},
{
"epoch": 0.08304947283049473,
"grad_norm": 0.6136429309844971,
"learning_rate": 1.0862068965517242e-05,
"loss": 0.8043953776359558,
"step": 64
},
{
"epoch": 0.08434712084347121,
"grad_norm": 0.6103477478027344,
"learning_rate": 1.103448275862069e-05,
"loss": 0.7928889989852905,
"step": 65
},
{
"epoch": 0.08564476885644769,
"grad_norm": 0.6038222312927246,
"learning_rate": 1.1206896551724138e-05,
"loss": 0.7927621603012085,
"step": 66
},
{
"epoch": 0.08694241686942417,
"grad_norm": 0.6238990426063538,
"learning_rate": 1.1379310344827587e-05,
"loss": 0.7877966165542603,
"step": 67
},
{
"epoch": 0.08824006488240065,
"grad_norm": 0.5899522304534912,
"learning_rate": 1.1551724137931035e-05,
"loss": 0.721104621887207,
"step": 68
},
{
"epoch": 0.08953771289537713,
"grad_norm": 0.6330446004867554,
"learning_rate": 1.1724137931034483e-05,
"loss": 0.8130797147750854,
"step": 69
},
{
"epoch": 0.0908353609083536,
"grad_norm": 0.6214055418968201,
"learning_rate": 1.1896551724137933e-05,
"loss": 0.78719162940979,
"step": 70
},
{
"epoch": 0.09213300892133008,
"grad_norm": 0.648266077041626,
"learning_rate": 1.206896551724138e-05,
"loss": 0.7923158407211304,
"step": 71
},
{
"epoch": 0.09343065693430656,
"grad_norm": 0.6473869681358337,
"learning_rate": 1.2241379310344827e-05,
"loss": 0.8679413795471191,
"step": 72
},
{
"epoch": 0.09472830494728304,
"grad_norm": 0.5954247117042542,
"learning_rate": 1.2413793103448277e-05,
"loss": 0.7424967288970947,
"step": 73
},
{
"epoch": 0.09602595296025954,
"grad_norm": 0.6318120956420898,
"learning_rate": 1.2586206896551725e-05,
"loss": 0.7612457275390625,
"step": 74
},
{
"epoch": 0.09732360097323602,
"grad_norm": 0.6183631420135498,
"learning_rate": 1.2758620689655174e-05,
"loss": 0.7567603588104248,
"step": 75
},
{
"epoch": 0.0986212489862125,
"grad_norm": 0.6186433434486389,
"learning_rate": 1.2931034482758622e-05,
"loss": 0.8088338375091553,
"step": 76
},
{
"epoch": 0.09991889699918897,
"grad_norm": 0.6034461855888367,
"learning_rate": 1.310344827586207e-05,
"loss": 0.7736937999725342,
"step": 77
},
{
"epoch": 0.10121654501216545,
"grad_norm": 0.6197369694709778,
"learning_rate": 1.327586206896552e-05,
"loss": 0.7498612999916077,
"step": 78
},
{
"epoch": 0.10251419302514193,
"grad_norm": 0.6505046486854553,
"learning_rate": 1.3448275862068967e-05,
"loss": 0.8144986629486084,
"step": 79
},
{
"epoch": 0.10381184103811841,
"grad_norm": 0.6240726113319397,
"learning_rate": 1.3620689655172414e-05,
"loss": 0.7407926321029663,
"step": 80
},
{
"epoch": 0.10510948905109489,
"grad_norm": 0.6124047040939331,
"learning_rate": 1.3793103448275863e-05,
"loss": 0.7526525855064392,
"step": 81
},
{
"epoch": 0.10640713706407137,
"grad_norm": 0.5982939004898071,
"learning_rate": 1.3965517241379311e-05,
"loss": 0.722671627998352,
"step": 82
},
{
"epoch": 0.10770478507704785,
"grad_norm": 0.5908958315849304,
"learning_rate": 1.4137931034482759e-05,
"loss": 0.7402417659759521,
"step": 83
},
{
"epoch": 0.10900243309002433,
"grad_norm": 0.6116979718208313,
"learning_rate": 1.4310344827586209e-05,
"loss": 0.7960222959518433,
"step": 84
},
{
"epoch": 0.11030008110300081,
"grad_norm": 0.6197500228881836,
"learning_rate": 1.4482758620689657e-05,
"loss": 0.7519891858100891,
"step": 85
},
{
"epoch": 0.11159772911597729,
"grad_norm": 2.220649480819702,
"learning_rate": 1.4655172413793105e-05,
"loss": 0.7659766674041748,
"step": 86
},
{
"epoch": 0.11289537712895377,
"grad_norm": 5.19334602355957,
"learning_rate": 1.4827586206896554e-05,
"loss": 0.7760565280914307,
"step": 87
},
{
"epoch": 0.11419302514193025,
"grad_norm": 0.6664707064628601,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.7354503870010376,
"step": 88
},
{
"epoch": 0.11549067315490673,
"grad_norm": 0.6490852236747742,
"learning_rate": 1.5172413793103448e-05,
"loss": 0.7803969979286194,
"step": 89
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.6153193116188049,
"learning_rate": 1.5344827586206898e-05,
"loss": 0.7803000807762146,
"step": 90
},
{
"epoch": 0.11808596918085969,
"grad_norm": 0.6364138722419739,
"learning_rate": 1.5517241379310346e-05,
"loss": 0.7799690961837769,
"step": 91
},
{
"epoch": 0.11938361719383617,
"grad_norm": 0.6558602452278137,
"learning_rate": 1.5689655172413794e-05,
"loss": 0.8238034248352051,
"step": 92
},
{
"epoch": 0.12068126520681265,
"grad_norm": 0.629127562046051,
"learning_rate": 1.586206896551724e-05,
"loss": 0.7694847583770752,
"step": 93
},
{
"epoch": 0.12197891321978913,
"grad_norm": 0.5806317925453186,
"learning_rate": 1.603448275862069e-05,
"loss": 0.7090768814086914,
"step": 94
},
{
"epoch": 0.12327656123276562,
"grad_norm": 0.673556387424469,
"learning_rate": 1.6206896551724137e-05,
"loss": 0.8536560535430908,
"step": 95
},
{
"epoch": 0.1245742092457421,
"grad_norm": 0.5968764424324036,
"learning_rate": 1.637931034482759e-05,
"loss": 0.7300469875335693,
"step": 96
},
{
"epoch": 0.12587185725871858,
"grad_norm": 0.6305297613143921,
"learning_rate": 1.6551724137931037e-05,
"loss": 0.7591036558151245,
"step": 97
},
{
"epoch": 0.12716950527169504,
"grad_norm": 0.606986403465271,
"learning_rate": 1.6724137931034485e-05,
"loss": 0.76216721534729,
"step": 98
},
{
"epoch": 0.12846715328467154,
"grad_norm": 0.6063655018806458,
"learning_rate": 1.6896551724137932e-05,
"loss": 0.68424391746521,
"step": 99
},
{
"epoch": 0.129764801297648,
"grad_norm": 0.7023365497589111,
"learning_rate": 1.706896551724138e-05,
"loss": 0.8325944542884827,
"step": 100
},
{
"epoch": 0.1310624493106245,
"grad_norm": 0.6358933448791504,
"learning_rate": 1.7241379310344828e-05,
"loss": 0.8054566383361816,
"step": 101
},
{
"epoch": 0.13236009732360096,
"grad_norm": 0.6431549191474915,
"learning_rate": 1.7413793103448276e-05,
"loss": 0.7429993748664856,
"step": 102
},
{
"epoch": 0.13365774533657745,
"grad_norm": 0.6152120232582092,
"learning_rate": 1.7586206896551724e-05,
"loss": 0.7206076383590698,
"step": 103
},
{
"epoch": 0.13495539334955392,
"grad_norm": 0.6442373991012573,
"learning_rate": 1.7758620689655175e-05,
"loss": 0.806060791015625,
"step": 104
},
{
"epoch": 0.1362530413625304,
"grad_norm": 0.6756954789161682,
"learning_rate": 1.7931034482758623e-05,
"loss": 0.8363012671470642,
"step": 105
},
{
"epoch": 0.1375506893755069,
"grad_norm": 0.743787407875061,
"learning_rate": 1.810344827586207e-05,
"loss": 0.8207604885101318,
"step": 106
},
{
"epoch": 0.13884833738848337,
"grad_norm": 0.686335563659668,
"learning_rate": 1.827586206896552e-05,
"loss": 0.7393860816955566,
"step": 107
},
{
"epoch": 0.14014598540145987,
"grad_norm": 0.6191396713256836,
"learning_rate": 1.8448275862068967e-05,
"loss": 0.7534383535385132,
"step": 108
},
{
"epoch": 0.14144363341443633,
"grad_norm": 0.6754934191703796,
"learning_rate": 1.8620689655172415e-05,
"loss": 0.8022092580795288,
"step": 109
},
{
"epoch": 0.14274128142741282,
"grad_norm": 0.6399085521697998,
"learning_rate": 1.8793103448275863e-05,
"loss": 0.8507853746414185,
"step": 110
},
{
"epoch": 0.1440389294403893,
"grad_norm": 0.6910972595214844,
"learning_rate": 1.896551724137931e-05,
"loss": 0.8276559710502625,
"step": 111
},
{
"epoch": 0.14533657745336578,
"grad_norm": 0.5906772613525391,
"learning_rate": 1.913793103448276e-05,
"loss": 0.7183451056480408,
"step": 112
},
{
"epoch": 0.14663422546634225,
"grad_norm": 0.6329069137573242,
"learning_rate": 1.931034482758621e-05,
"loss": 0.789232611656189,
"step": 113
},
{
"epoch": 0.14793187347931874,
"grad_norm": 0.6226819157600403,
"learning_rate": 1.9482758620689658e-05,
"loss": 0.7747266292572021,
"step": 114
},
{
"epoch": 0.1492295214922952,
"grad_norm": 0.65074223279953,
"learning_rate": 1.9655172413793106e-05,
"loss": 0.753608226776123,
"step": 115
},
{
"epoch": 0.1505271695052717,
"grad_norm": 0.6118033528327942,
"learning_rate": 1.9827586206896554e-05,
"loss": 0.7803196907043457,
"step": 116
},
{
"epoch": 0.15182481751824817,
"grad_norm": 0.6553196907043457,
"learning_rate": 2e-05,
"loss": 0.8216028213500977,
"step": 117
},
{
"epoch": 0.15312246553122466,
"grad_norm": 0.678218424320221,
"learning_rate": 1.999998977626552e-05,
"loss": 0.807174801826477,
"step": 118
},
{
"epoch": 0.15442011354420113,
"grad_norm": 0.6192781329154968,
"learning_rate": 1.999995910508299e-05,
"loss": 0.7289496660232544,
"step": 119
},
{
"epoch": 0.15571776155717762,
"grad_norm": 0.6038413047790527,
"learning_rate": 1.999990798651512e-05,
"loss": 0.7679600119590759,
"step": 120
},
{
"epoch": 0.15701540957015409,
"grad_norm": 0.6870720386505127,
"learning_rate": 1.9999836420666438e-05,
"loss": 0.8232643604278564,
"step": 121
},
{
"epoch": 0.15831305758313058,
"grad_norm": 0.623460590839386,
"learning_rate": 1.999974440768327e-05,
"loss": 0.7480977177619934,
"step": 122
},
{
"epoch": 0.15961070559610704,
"grad_norm": 0.651508629322052,
"learning_rate": 1.9999631947753776e-05,
"loss": 0.7708613276481628,
"step": 123
},
{
"epoch": 0.16090835360908354,
"grad_norm": 0.6450805068016052,
"learning_rate": 1.999949904110789e-05,
"loss": 0.8049247860908508,
"step": 124
},
{
"epoch": 0.16220600162206,
"grad_norm": 0.6157734990119934,
"learning_rate": 1.999934568801738e-05,
"loss": 0.7631984949111938,
"step": 125
},
{
"epoch": 0.1635036496350365,
"grad_norm": 0.6847337484359741,
"learning_rate": 1.999917188879582e-05,
"loss": 0.7424380779266357,
"step": 126
},
{
"epoch": 0.164801297648013,
"grad_norm": 0.6398855447769165,
"learning_rate": 1.9998977643798572e-05,
"loss": 0.7688143253326416,
"step": 127
},
{
"epoch": 0.16609894566098946,
"grad_norm": 0.6518498063087463,
"learning_rate": 1.999876295342283e-05,
"loss": 0.7191232442855835,
"step": 128
},
{
"epoch": 0.16739659367396595,
"grad_norm": 0.6462240219116211,
"learning_rate": 1.9998527818107577e-05,
"loss": 0.7375045418739319,
"step": 129
},
{
"epoch": 0.16869424168694241,
"grad_norm": 0.6727373600006104,
"learning_rate": 1.9998272238333606e-05,
"loss": 0.7088533639907837,
"step": 130
},
{
"epoch": 0.1699918896999189,
"grad_norm": 0.689372181892395,
"learning_rate": 1.9997996214623515e-05,
"loss": 0.8250190615653992,
"step": 131
},
{
"epoch": 0.17128953771289537,
"grad_norm": 0.6236900687217712,
"learning_rate": 1.9997699747541698e-05,
"loss": 0.7653014659881592,
"step": 132
},
{
"epoch": 0.17258718572587187,
"grad_norm": 0.617174506187439,
"learning_rate": 1.9997382837694355e-05,
"loss": 0.7043566703796387,
"step": 133
},
{
"epoch": 0.17388483373884833,
"grad_norm": 0.6391400694847107,
"learning_rate": 1.999704548572949e-05,
"loss": 0.8009853363037109,
"step": 134
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.6218752861022949,
"learning_rate": 1.9996687692336896e-05,
"loss": 0.7598843574523926,
"step": 135
},
{
"epoch": 0.1764801297648013,
"grad_norm": 0.5787500143051147,
"learning_rate": 1.9996309458248184e-05,
"loss": 0.7174202799797058,
"step": 136
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.6410360932350159,
"learning_rate": 1.999591078423673e-05,
"loss": 0.763797402381897,
"step": 137
},
{
"epoch": 0.17907542579075425,
"grad_norm": 0.970513641834259,
"learning_rate": 1.9995491671117734e-05,
"loss": 0.6977022290229797,
"step": 138
},
{
"epoch": 0.18037307380373074,
"grad_norm": 0.6853165030479431,
"learning_rate": 1.999505211974817e-05,
"loss": 0.7822556495666504,
"step": 139
},
{
"epoch": 0.1816707218167072,
"grad_norm": 0.6396400332450867,
"learning_rate": 1.999459213102681e-05,
"loss": 0.7862622737884521,
"step": 140
},
{
"epoch": 0.1829683698296837,
"grad_norm": 0.6066014766693115,
"learning_rate": 1.9994111705894218e-05,
"loss": 0.8506604433059692,
"step": 141
},
{
"epoch": 0.18426601784266017,
"grad_norm": 0.6197599172592163,
"learning_rate": 1.9993610845332734e-05,
"loss": 0.7890738844871521,
"step": 142
},
{
"epoch": 0.18556366585563666,
"grad_norm": 0.6512314677238464,
"learning_rate": 1.99930895503665e-05,
"loss": 0.7983291149139404,
"step": 143
},
{
"epoch": 0.18686131386861313,
"grad_norm": 0.5899611115455627,
"learning_rate": 1.9992547822061427e-05,
"loss": 0.7357482314109802,
"step": 144
},
{
"epoch": 0.18815896188158962,
"grad_norm": 0.6489595770835876,
"learning_rate": 1.9991985661525217e-05,
"loss": 0.875076174736023,
"step": 145
},
{
"epoch": 0.18945660989456609,
"grad_norm": 0.6258020997047424,
"learning_rate": 1.999140306990734e-05,
"loss": 0.7252365350723267,
"step": 146
},
{
"epoch": 0.19075425790754258,
"grad_norm": 0.6045345067977905,
"learning_rate": 1.999080004839905e-05,
"loss": 0.7721343040466309,
"step": 147
},
{
"epoch": 0.19205190592051907,
"grad_norm": 0.6506165862083435,
"learning_rate": 1.999017659823338e-05,
"loss": 0.8302021026611328,
"step": 148
},
{
"epoch": 0.19334955393349554,
"grad_norm": 0.6503569483757019,
"learning_rate": 1.9989532720685115e-05,
"loss": 0.825711190700531,
"step": 149
},
{
"epoch": 0.19464720194647203,
"grad_norm": 0.5828515887260437,
"learning_rate": 1.998886841707083e-05,
"loss": 0.7742114067077637,
"step": 150
},
{
"epoch": 0.1959448499594485,
"grad_norm": 0.5945319533348083,
"learning_rate": 1.9988183688748862e-05,
"loss": 0.8291171789169312,
"step": 151
},
{
"epoch": 0.197242497972425,
"grad_norm": 0.6298274993896484,
"learning_rate": 1.9987478537119297e-05,
"loss": 0.8312891721725464,
"step": 152
},
{
"epoch": 0.19854014598540146,
"grad_norm": 0.6161749958992004,
"learning_rate": 1.9986752963624002e-05,
"loss": 0.8070319890975952,
"step": 153
},
{
"epoch": 0.19983779399837795,
"grad_norm": 0.6540800929069519,
"learning_rate": 1.998600696974658e-05,
"loss": 0.7966468334197998,
"step": 154
},
{
"epoch": 0.20113544201135442,
"grad_norm": 0.628194272518158,
"learning_rate": 1.9985240557012406e-05,
"loss": 0.7929773926734924,
"step": 155
},
{
"epoch": 0.2024330900243309,
"grad_norm": 0.6037770509719849,
"learning_rate": 1.99844537269886e-05,
"loss": 0.6729363203048706,
"step": 156
},
{
"epoch": 0.20373073803730737,
"grad_norm": 0.6952143907546997,
"learning_rate": 1.9983646481284028e-05,
"loss": 0.8734431266784668,
"step": 157
},
{
"epoch": 0.20502838605028387,
"grad_norm": 0.6359195113182068,
"learning_rate": 1.9982818821549308e-05,
"loss": 0.7915219664573669,
"step": 158
},
{
"epoch": 0.20632603406326033,
"grad_norm": 0.578925609588623,
"learning_rate": 1.9981970749476792e-05,
"loss": 0.7327010631561279,
"step": 159
},
{
"epoch": 0.20762368207623683,
"grad_norm": 0.6001781821250916,
"learning_rate": 1.998110226680057e-05,
"loss": 0.7517937421798706,
"step": 160
},
{
"epoch": 0.2089213300892133,
"grad_norm": 0.6306588649749756,
"learning_rate": 1.9980213375296468e-05,
"loss": 0.7292003035545349,
"step": 161
},
{
"epoch": 0.21021897810218979,
"grad_norm": 0.5737298130989075,
"learning_rate": 1.997930407678205e-05,
"loss": 0.7056928873062134,
"step": 162
},
{
"epoch": 0.21151662611516625,
"grad_norm": 0.6045275926589966,
"learning_rate": 1.99783743731166e-05,
"loss": 0.738794207572937,
"step": 163
},
{
"epoch": 0.21281427412814274,
"grad_norm": 0.6090785264968872,
"learning_rate": 1.9977424266201126e-05,
"loss": 0.8411350846290588,
"step": 164
},
{
"epoch": 0.2141119221411192,
"grad_norm": 0.6489406824111938,
"learning_rate": 1.9976453757978355e-05,
"loss": 0.750893771648407,
"step": 165
},
{
"epoch": 0.2154095701540957,
"grad_norm": 0.5950313210487366,
"learning_rate": 1.997546285043273e-05,
"loss": 0.6694055199623108,
"step": 166
},
{
"epoch": 0.21670721816707217,
"grad_norm": 0.6618576645851135,
"learning_rate": 1.9974451545590407e-05,
"loss": 0.8072858452796936,
"step": 167
},
{
"epoch": 0.21800486618004866,
"grad_norm": 0.587589681148529,
"learning_rate": 1.997341984551925e-05,
"loss": 0.7707666158676147,
"step": 168
},
{
"epoch": 0.21930251419302516,
"grad_norm": 0.6130505204200745,
"learning_rate": 1.9972367752328824e-05,
"loss": 0.683761715888977,
"step": 169
},
{
"epoch": 0.22060016220600162,
"grad_norm": 0.6129958033561707,
"learning_rate": 1.9971295268170393e-05,
"loss": 0.7264688014984131,
"step": 170
},
{
"epoch": 0.22189781021897811,
"grad_norm": 0.6114361882209778,
"learning_rate": 1.9970202395236913e-05,
"loss": 0.7344344854354858,
"step": 171
},
{
"epoch": 0.22319545823195458,
"grad_norm": 0.6653074622154236,
"learning_rate": 1.996908913576304e-05,
"loss": 0.7358161211013794,
"step": 172
},
{
"epoch": 0.22449310624493107,
"grad_norm": 0.6639219522476196,
"learning_rate": 1.9967955492025094e-05,
"loss": 0.7851651906967163,
"step": 173
},
{
"epoch": 0.22579075425790754,
"grad_norm": 0.5558881759643555,
"learning_rate": 1.9966801466341107e-05,
"loss": 0.7109513878822327,
"step": 174
},
{
"epoch": 0.22708840227088403,
"grad_norm": 0.6213382482528687,
"learning_rate": 1.9965627061070755e-05,
"loss": 0.702171802520752,
"step": 175
},
{
"epoch": 0.2283860502838605,
"grad_norm": 0.6152480840682983,
"learning_rate": 1.996443227861541e-05,
"loss": 0.8059327602386475,
"step": 176
},
{
"epoch": 0.229683698296837,
"grad_norm": 1.3707772493362427,
"learning_rate": 1.996321712141809e-05,
"loss": 0.6749221682548523,
"step": 177
},
{
"epoch": 0.23098134630981346,
"grad_norm": 0.6016313433647156,
"learning_rate": 1.9961981591963494e-05,
"loss": 0.7931903004646301,
"step": 178
},
{
"epoch": 0.23227899432278995,
"grad_norm": 0.6266494393348694,
"learning_rate": 1.9960725692777956e-05,
"loss": 0.7843484878540039,
"step": 179
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.6365560293197632,
"learning_rate": 1.995944942642948e-05,
"loss": 0.769256055355072,
"step": 180
},
{
"epoch": 0.2348742903487429,
"grad_norm": 0.5864040851593018,
"learning_rate": 1.9958152795527706e-05,
"loss": 0.7252316474914551,
"step": 181
},
{
"epoch": 0.23617193836171937,
"grad_norm": 0.6339318156242371,
"learning_rate": 1.9956835802723916e-05,
"loss": 0.8299843668937683,
"step": 182
},
{
"epoch": 0.23746958637469587,
"grad_norm": 0.5974844098091125,
"learning_rate": 1.9955498450711026e-05,
"loss": 0.7282422184944153,
"step": 183
},
{
"epoch": 0.23876723438767233,
"grad_norm": 0.5841022729873657,
"learning_rate": 1.9954140742223586e-05,
"loss": 0.7407736778259277,
"step": 184
},
{
"epoch": 0.24006488240064883,
"grad_norm": 0.6066944599151611,
"learning_rate": 1.9952762680037758e-05,
"loss": 0.7745926380157471,
"step": 185
},
{
"epoch": 0.2413625304136253,
"grad_norm": 0.5798110365867615,
"learning_rate": 1.995136426697134e-05,
"loss": 0.7561591863632202,
"step": 186
},
{
"epoch": 0.24266017842660179,
"grad_norm": 0.5705812573432922,
"learning_rate": 1.9949945505883723e-05,
"loss": 0.7066362500190735,
"step": 187
},
{
"epoch": 0.24395782643957825,
"grad_norm": 0.6322996020317078,
"learning_rate": 1.994850639967592e-05,
"loss": 0.8032187819480896,
"step": 188
},
{
"epoch": 0.24525547445255474,
"grad_norm": 0.613441526889801,
"learning_rate": 1.994704695129054e-05,
"loss": 0.75013267993927,
"step": 189
},
{
"epoch": 0.24655312246553124,
"grad_norm": 0.609327495098114,
"learning_rate": 1.9945567163711788e-05,
"loss": 0.7675092220306396,
"step": 190
},
{
"epoch": 0.2478507704785077,
"grad_norm": 0.6119315028190613,
"learning_rate": 1.9944067039965445e-05,
"loss": 0.7201006412506104,
"step": 191
},
{
"epoch": 0.2491484184914842,
"grad_norm": 0.5587560534477234,
"learning_rate": 1.9942546583118894e-05,
"loss": 0.7847742438316345,
"step": 192
},
{
"epoch": 0.25044606650446066,
"grad_norm": 0.5934576988220215,
"learning_rate": 1.994100579628108e-05,
"loss": 0.74636310338974,
"step": 193
},
{
"epoch": 0.25174371451743716,
"grad_norm": 0.5709709525108337,
"learning_rate": 1.9939444682602522e-05,
"loss": 0.6807436347007751,
"step": 194
},
{
"epoch": 0.25304136253041365,
"grad_norm": 0.6085708737373352,
"learning_rate": 1.9937863245275303e-05,
"loss": 0.7877497673034668,
"step": 195
},
{
"epoch": 0.2543390105433901,
"grad_norm": 0.5789342522621155,
"learning_rate": 1.9936261487533066e-05,
"loss": 0.7314412593841553,
"step": 196
},
{
"epoch": 0.2556366585563666,
"grad_norm": 0.5808578133583069,
"learning_rate": 1.993463941265099e-05,
"loss": 0.7081149816513062,
"step": 197
},
{
"epoch": 0.2569343065693431,
"grad_norm": 0.5988272428512573,
"learning_rate": 1.993299702394582e-05,
"loss": 0.718379020690918,
"step": 198
},
{
"epoch": 0.25823195458231957,
"grad_norm": 0.6408476829528809,
"learning_rate": 1.9931334324775817e-05,
"loss": 0.8201683163642883,
"step": 199
},
{
"epoch": 0.259529602595296,
"grad_norm": 0.582078218460083,
"learning_rate": 1.9929651318540783e-05,
"loss": 0.7401193380355835,
"step": 200
},
{
"epoch": 0.2608272506082725,
"grad_norm": 0.607105553150177,
"learning_rate": 1.9927948008682038e-05,
"loss": 0.74293053150177,
"step": 201
},
{
"epoch": 0.262124898621249,
"grad_norm": 0.5975603461265564,
"learning_rate": 1.9926224398682424e-05,
"loss": 0.779903769493103,
"step": 202
},
{
"epoch": 0.2634225466342255,
"grad_norm": 0.5534036159515381,
"learning_rate": 1.992448049206628e-05,
"loss": 0.6884838342666626,
"step": 203
},
{
"epoch": 0.2647201946472019,
"grad_norm": 0.610633909702301,
"learning_rate": 1.9922716292399458e-05,
"loss": 0.7174521684646606,
"step": 204
},
{
"epoch": 0.2660178426601784,
"grad_norm": 0.5961881279945374,
"learning_rate": 1.9920931803289302e-05,
"loss": 0.7740389108657837,
"step": 205
},
{
"epoch": 0.2673154906731549,
"grad_norm": 0.5700147747993469,
"learning_rate": 1.9919127028384634e-05,
"loss": 0.7351720333099365,
"step": 206
},
{
"epoch": 0.2686131386861314,
"grad_norm": 0.6236000061035156,
"learning_rate": 1.9917301971375767e-05,
"loss": 0.8022093772888184,
"step": 207
},
{
"epoch": 0.26991078669910784,
"grad_norm": 0.5870935320854187,
"learning_rate": 1.991545663599448e-05,
"loss": 0.7842336297035217,
"step": 208
},
{
"epoch": 0.27120843471208433,
"grad_norm": 0.6193575263023376,
"learning_rate": 1.9913591026014016e-05,
"loss": 0.7481486797332764,
"step": 209
},
{
"epoch": 0.2725060827250608,
"grad_norm": 0.6119521260261536,
"learning_rate": 1.9911705145249076e-05,
"loss": 0.7951152324676514,
"step": 210
},
{
"epoch": 0.2738037307380373,
"grad_norm": 0.5536502599716187,
"learning_rate": 1.9909798997555806e-05,
"loss": 0.790625810623169,
"step": 211
},
{
"epoch": 0.2751013787510138,
"grad_norm": 0.5879918336868286,
"learning_rate": 1.99078725868318e-05,
"loss": 0.7092885971069336,
"step": 212
},
{
"epoch": 0.27639902676399025,
"grad_norm": 0.5877639055252075,
"learning_rate": 1.9905925917016077e-05,
"loss": 0.724690318107605,
"step": 213
},
{
"epoch": 0.27769667477696675,
"grad_norm": 0.5909678339958191,
"learning_rate": 1.9903958992089087e-05,
"loss": 0.7642319202423096,
"step": 214
},
{
"epoch": 0.27899432278994324,
"grad_norm": 0.5952388644218445,
"learning_rate": 1.990197181607269e-05,
"loss": 0.7681585550308228,
"step": 215
},
{
"epoch": 0.28029197080291973,
"grad_norm": 0.5698040723800659,
"learning_rate": 1.989996439303016e-05,
"loss": 0.7373849153518677,
"step": 216
},
{
"epoch": 0.28158961881589617,
"grad_norm": 0.5865874886512756,
"learning_rate": 1.989793672706617e-05,
"loss": 0.7335535287857056,
"step": 217
},
{
"epoch": 0.28288726682887266,
"grad_norm": 0.6045393943786621,
"learning_rate": 1.9895888822326783e-05,
"loss": 0.7242499589920044,
"step": 218
},
{
"epoch": 0.28418491484184916,
"grad_norm": 0.6004535555839539,
"learning_rate": 1.9893820682999444e-05,
"loss": 0.7604917287826538,
"step": 219
},
{
"epoch": 0.28548256285482565,
"grad_norm": 1.119056224822998,
"learning_rate": 1.9891732313312973e-05,
"loss": 0.772226095199585,
"step": 220
},
{
"epoch": 0.2867802108678021,
"grad_norm": 0.5902665853500366,
"learning_rate": 1.9889623717537564e-05,
"loss": 0.7658222317695618,
"step": 221
},
{
"epoch": 0.2880778588807786,
"grad_norm": 0.6264858245849609,
"learning_rate": 1.9887494899984757e-05,
"loss": 0.7901877760887146,
"step": 222
},
{
"epoch": 0.2893755068937551,
"grad_norm": 0.5469992756843567,
"learning_rate": 1.9885345865007444e-05,
"loss": 0.7618519067764282,
"step": 223
},
{
"epoch": 0.29067315490673157,
"grad_norm": 0.5550391674041748,
"learning_rate": 1.9883176616999863e-05,
"loss": 0.788576602935791,
"step": 224
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.5628973245620728,
"learning_rate": 1.9880987160397573e-05,
"loss": 0.718231737613678,
"step": 225
},
{
"epoch": 0.2932684509326845,
"grad_norm": 0.5723385214805603,
"learning_rate": 1.987877749967746e-05,
"loss": 0.698378324508667,
"step": 226
},
{
"epoch": 0.294566098945661,
"grad_norm": 0.5784431099891663,
"learning_rate": 1.987654763935772e-05,
"loss": 0.7598991990089417,
"step": 227
},
{
"epoch": 0.2958637469586375,
"grad_norm": 0.5549972653388977,
"learning_rate": 1.9874297583997852e-05,
"loss": 0.7384412288665771,
"step": 228
},
{
"epoch": 0.2971613949716139,
"grad_norm": 0.5789146423339844,
"learning_rate": 1.9872027338198652e-05,
"loss": 0.7528890371322632,
"step": 229
},
{
"epoch": 0.2984590429845904,
"grad_norm": 0.6021227240562439,
"learning_rate": 1.98697369066022e-05,
"loss": 0.805375337600708,
"step": 230
},
{
"epoch": 0.2984590429845904,
"eval_loss": 0.7241292595863342,
"eval_runtime": 73.217,
"eval_samples_per_second": 70.913,
"eval_steps_per_second": 8.864,
"step": 230
},
{
"epoch": 0.2997566909975669,
"grad_norm": 0.6029407978057861,
"learning_rate": 1.986742629389184e-05,
"loss": 0.7631509900093079,
"step": 231
},
{
"epoch": 0.3010543390105434,
"grad_norm": 0.5768916606903076,
"learning_rate": 1.98650955047922e-05,
"loss": 0.7468521595001221,
"step": 232
},
{
"epoch": 0.3023519870235199,
"grad_norm": 0.550506055355072,
"learning_rate": 1.9862744544069146e-05,
"loss": 0.7611327767372131,
"step": 233
},
{
"epoch": 0.30364963503649633,
"grad_norm": 0.5796909332275391,
"learning_rate": 1.9860373416529804e-05,
"loss": 0.7168669700622559,
"step": 234
},
{
"epoch": 0.30494728304947283,
"grad_norm": 0.8639640808105469,
"learning_rate": 1.9857982127022527e-05,
"loss": 0.7404369115829468,
"step": 235
},
{
"epoch": 0.3062449310624493,
"grad_norm": 0.5862186551094055,
"learning_rate": 1.9855570680436896e-05,
"loss": 0.7222490310668945,
"step": 236
},
{
"epoch": 0.3075425790754258,
"grad_norm": 0.6011035442352295,
"learning_rate": 1.9853139081703712e-05,
"loss": 0.8068719506263733,
"step": 237
},
{
"epoch": 0.30884022708840225,
"grad_norm": 0.5739139318466187,
"learning_rate": 1.9850687335794974e-05,
"loss": 0.7303578853607178,
"step": 238
},
{
"epoch": 0.31013787510137875,
"grad_norm": 0.5833807587623596,
"learning_rate": 1.9848215447723888e-05,
"loss": 0.7608842849731445,
"step": 239
},
{
"epoch": 0.31143552311435524,
"grad_norm": 0.5929459929466248,
"learning_rate": 1.9845723422544834e-05,
"loss": 0.8103141188621521,
"step": 240
},
{
"epoch": 0.31273317112733173,
"grad_norm": 0.5728944540023804,
"learning_rate": 1.9843211265353376e-05,
"loss": 0.7196205854415894,
"step": 241
},
{
"epoch": 0.31403081914030817,
"grad_norm": 0.5517752170562744,
"learning_rate": 1.9840678981286237e-05,
"loss": 0.6758772730827332,
"step": 242
},
{
"epoch": 0.31532846715328466,
"grad_norm": 0.5443773865699768,
"learning_rate": 1.98381265755213e-05,
"loss": 0.6859534978866577,
"step": 243
},
{
"epoch": 0.31662611516626116,
"grad_norm": 0.5687966346740723,
"learning_rate": 1.9835554053277587e-05,
"loss": 0.7471268177032471,
"step": 244
},
{
"epoch": 0.31792376317923765,
"grad_norm": 0.5604870319366455,
"learning_rate": 1.9832961419815253e-05,
"loss": 0.6843122839927673,
"step": 245
},
{
"epoch": 0.3192214111922141,
"grad_norm": 0.5563496351242065,
"learning_rate": 1.983034868043558e-05,
"loss": 0.7023979425430298,
"step": 246
},
{
"epoch": 0.3205190592051906,
"grad_norm": 0.58856201171875,
"learning_rate": 1.9827715840480962e-05,
"loss": 0.826436460018158,
"step": 247
},
{
"epoch": 0.3218167072181671,
"grad_norm": 0.5512715578079224,
"learning_rate": 1.9825062905334883e-05,
"loss": 0.702526867389679,
"step": 248
},
{
"epoch": 0.32311435523114357,
"grad_norm": 0.541459858417511,
"learning_rate": 1.9822389880421927e-05,
"loss": 0.7273234128952026,
"step": 249
},
{
"epoch": 0.32441200324412,
"grad_norm": 0.5705904364585876,
"learning_rate": 1.9819696771207756e-05,
"loss": 0.783245325088501,
"step": 250
},
{
"epoch": 0.3257096512570965,
"grad_norm": 0.5666183829307556,
"learning_rate": 1.981698358319909e-05,
"loss": 0.7261844873428345,
"step": 251
},
{
"epoch": 0.327007299270073,
"grad_norm": 0.5902214646339417,
"learning_rate": 1.981425032194372e-05,
"loss": 0.7943121194839478,
"step": 252
},
{
"epoch": 0.3283049472830495,
"grad_norm": 0.6048629879951477,
"learning_rate": 1.981149699303047e-05,
"loss": 0.7712939381599426,
"step": 253
},
{
"epoch": 0.329602595296026,
"grad_norm": 0.5914484858512878,
"learning_rate": 1.9808723602089198e-05,
"loss": 0.7921222448348999,
"step": 254
},
{
"epoch": 0.3309002433090024,
"grad_norm": 0.5761268734931946,
"learning_rate": 1.980593015479079e-05,
"loss": 0.7280013561248779,
"step": 255
},
{
"epoch": 0.3321978913219789,
"grad_norm": 0.5902722477912903,
"learning_rate": 1.9803116656847136e-05,
"loss": 0.8062602877616882,
"step": 256
},
{
"epoch": 0.3334955393349554,
"grad_norm": 0.5620178580284119,
"learning_rate": 1.9800283114011134e-05,
"loss": 0.7278565168380737,
"step": 257
},
{
"epoch": 0.3347931873479319,
"grad_norm": 0.5686838626861572,
"learning_rate": 1.9797429532076652e-05,
"loss": 0.7540629506111145,
"step": 258
},
{
"epoch": 0.33609083536090834,
"grad_norm": 0.5724810361862183,
"learning_rate": 1.9794555916878548e-05,
"loss": 0.8088860511779785,
"step": 259
},
{
"epoch": 0.33738848337388483,
"grad_norm": 0.5640983581542969,
"learning_rate": 1.9791662274292638e-05,
"loss": 0.7638871669769287,
"step": 260
},
{
"epoch": 0.3386861313868613,
"grad_norm": 0.5784658193588257,
"learning_rate": 1.978874861023569e-05,
"loss": 0.7313830852508545,
"step": 261
},
{
"epoch": 0.3399837793998378,
"grad_norm": 0.5539552569389343,
"learning_rate": 1.9785814930665404e-05,
"loss": 0.7729085683822632,
"step": 262
},
{
"epoch": 0.34128142741281425,
"grad_norm": 0.561370849609375,
"learning_rate": 1.9782861241580417e-05,
"loss": 0.6871550679206848,
"step": 263
},
{
"epoch": 0.34257907542579075,
"grad_norm": 0.5643728375434875,
"learning_rate": 1.9779887549020273e-05,
"loss": 0.7683601379394531,
"step": 264
},
{
"epoch": 0.34387672343876724,
"grad_norm": 0.5431486964225769,
"learning_rate": 1.9776893859065424e-05,
"loss": 0.7228385210037231,
"step": 265
},
{
"epoch": 0.34517437145174373,
"grad_norm": 0.5863342881202698,
"learning_rate": 1.9773880177837202e-05,
"loss": 0.7906335592269897,
"step": 266
},
{
"epoch": 0.34647201946472017,
"grad_norm": 0.5614317655563354,
"learning_rate": 1.9770846511497833e-05,
"loss": 0.7299401164054871,
"step": 267
},
{
"epoch": 0.34776966747769666,
"grad_norm": 0.5694175958633423,
"learning_rate": 1.9767792866250386e-05,
"loss": 0.7474102973937988,
"step": 268
},
{
"epoch": 0.34906731549067316,
"grad_norm": 0.5707114934921265,
"learning_rate": 1.97647192483388e-05,
"loss": 0.7324154376983643,
"step": 269
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.5364754796028137,
"learning_rate": 1.976162566404784e-05,
"loss": 0.6927608251571655,
"step": 270
},
{
"epoch": 0.3516626115166261,
"grad_norm": 0.6064906120300293,
"learning_rate": 1.9758512119703106e-05,
"loss": 0.7652560472488403,
"step": 271
},
{
"epoch": 0.3529602595296026,
"grad_norm": 0.5919526815414429,
"learning_rate": 1.9755378621671006e-05,
"loss": 0.7977138757705688,
"step": 272
},
{
"epoch": 0.3542579075425791,
"grad_norm": 0.567382276058197,
"learning_rate": 1.9752225176358757e-05,
"loss": 0.7258316278457642,
"step": 273
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5770947337150574,
"learning_rate": 1.974905179021435e-05,
"loss": 0.7411879301071167,
"step": 274
},
{
"epoch": 0.35685320356853206,
"grad_norm": 0.5705130100250244,
"learning_rate": 1.9745858469726555e-05,
"loss": 0.7439219951629639,
"step": 275
},
{
"epoch": 0.3581508515815085,
"grad_norm": 0.5373214483261108,
"learning_rate": 1.9742645221424905e-05,
"loss": 0.6836246252059937,
"step": 276
},
{
"epoch": 0.359448499594485,
"grad_norm": 0.596576988697052,
"learning_rate": 1.9739412051879686e-05,
"loss": 0.6741154789924622,
"step": 277
},
{
"epoch": 0.3607461476074615,
"grad_norm": 0.5719678997993469,
"learning_rate": 1.973615896770191e-05,
"loss": 0.7447401881217957,
"step": 278
},
{
"epoch": 0.362043795620438,
"grad_norm": 0.5882077813148499,
"learning_rate": 1.97328859755433e-05,
"loss": 0.7762616872787476,
"step": 279
},
{
"epoch": 0.3633414436334144,
"grad_norm": 0.6879026293754578,
"learning_rate": 1.972959308209631e-05,
"loss": 0.7956463098526001,
"step": 280
},
{
"epoch": 0.3646390916463909,
"grad_norm": 0.5789086222648621,
"learning_rate": 1.9726280294094067e-05,
"loss": 0.7541590929031372,
"step": 281
},
{
"epoch": 0.3659367396593674,
"grad_norm": 0.5802841186523438,
"learning_rate": 1.9722947618310384e-05,
"loss": 0.7047423124313354,
"step": 282
},
{
"epoch": 0.3672343876723439,
"grad_norm": 0.5507220029830933,
"learning_rate": 1.9719595061559742e-05,
"loss": 0.6714630722999573,
"step": 283
},
{
"epoch": 0.36853203568532034,
"grad_norm": 0.5980960726737976,
"learning_rate": 1.9716222630697266e-05,
"loss": 0.7872920036315918,
"step": 284
},
{
"epoch": 0.36982968369829683,
"grad_norm": 0.5855656266212463,
"learning_rate": 1.971283033261873e-05,
"loss": 0.7662516832351685,
"step": 285
},
{
"epoch": 0.3711273317112733,
"grad_norm": 0.5851466655731201,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.7596746683120728,
"step": 286
},
{
"epoch": 0.3724249797242498,
"grad_norm": 0.5843831300735474,
"learning_rate": 1.9705986162599642e-05,
"loss": 0.7550405263900757,
"step": 287
},
{
"epoch": 0.37372262773722625,
"grad_norm": 0.5846932530403137,
"learning_rate": 1.9702534304653685e-05,
"loss": 0.7254443764686584,
"step": 288
},
{
"epoch": 0.37502027575020275,
"grad_norm": 0.6054766774177551,
"learning_rate": 1.9699062607480827e-05,
"loss": 0.7600511908531189,
"step": 289
},
{
"epoch": 0.37631792376317924,
"grad_norm": 0.5703001618385315,
"learning_rate": 1.969557107817981e-05,
"loss": 0.7401167750358582,
"step": 290
},
{
"epoch": 0.37761557177615573,
"grad_norm": 0.5855723023414612,
"learning_rate": 1.9692059723889927e-05,
"loss": 0.7476931214332581,
"step": 291
},
{
"epoch": 0.37891321978913217,
"grad_norm": 0.5804258584976196,
"learning_rate": 1.968852855179101e-05,
"loss": 0.7656409740447998,
"step": 292
},
{
"epoch": 0.38021086780210867,
"grad_norm": 0.5795084834098816,
"learning_rate": 1.9684977569103415e-05,
"loss": 0.7599056959152222,
"step": 293
},
{
"epoch": 0.38150851581508516,
"grad_norm": 0.5684756636619568,
"learning_rate": 1.9681406783087998e-05,
"loss": 0.674816370010376,
"step": 294
},
{
"epoch": 0.38280616382806165,
"grad_norm": 0.5463794469833374,
"learning_rate": 1.9677816201046113e-05,
"loss": 0.683580219745636,
"step": 295
},
{
"epoch": 0.38410381184103815,
"grad_norm": 0.5722465515136719,
"learning_rate": 1.9674205830319594e-05,
"loss": 0.693361222743988,
"step": 296
},
{
"epoch": 0.3854014598540146,
"grad_norm": 0.6253486275672913,
"learning_rate": 1.9670575678290732e-05,
"loss": 0.7917322516441345,
"step": 297
},
{
"epoch": 0.3866991078669911,
"grad_norm": 0.5660127401351929,
"learning_rate": 1.9666925752382275e-05,
"loss": 0.7436933517456055,
"step": 298
},
{
"epoch": 0.38799675587996757,
"grad_norm": 0.572499692440033,
"learning_rate": 1.9663256060057395e-05,
"loss": 0.6714681386947632,
"step": 299
},
{
"epoch": 0.38929440389294406,
"grad_norm": 0.5779220461845398,
"learning_rate": 1.9659566608819677e-05,
"loss": 0.7252252697944641,
"step": 300
},
{
"epoch": 0.3905920519059205,
"grad_norm": 0.5990428924560547,
"learning_rate": 1.9655857406213124e-05,
"loss": 0.7827754020690918,
"step": 301
},
{
"epoch": 0.391889699918897,
"grad_norm": 0.5721242427825928,
"learning_rate": 1.9652128459822113e-05,
"loss": 0.7102577686309814,
"step": 302
},
{
"epoch": 0.3931873479318735,
"grad_norm": 0.5870105028152466,
"learning_rate": 1.9648379777271397e-05,
"loss": 0.683538019657135,
"step": 303
},
{
"epoch": 0.39448499594485,
"grad_norm": 0.5920274257659912,
"learning_rate": 1.964461136622608e-05,
"loss": 0.7541404366493225,
"step": 304
},
{
"epoch": 0.3957826439578264,
"grad_norm": 0.5439295768737793,
"learning_rate": 1.9640823234391614e-05,
"loss": 0.675430417060852,
"step": 305
},
{
"epoch": 0.3970802919708029,
"grad_norm": 0.6126630902290344,
"learning_rate": 1.9637015389513765e-05,
"loss": 0.7898478507995605,
"step": 306
},
{
"epoch": 0.3983779399837794,
"grad_norm": 0.5664204359054565,
"learning_rate": 1.963318783937861e-05,
"loss": 0.6964154839515686,
"step": 307
},
{
"epoch": 0.3996755879967559,
"grad_norm": 0.5839046239852905,
"learning_rate": 1.962934059181253e-05,
"loss": 0.7421650886535645,
"step": 308
},
{
"epoch": 0.40097323600973234,
"grad_norm": 0.6044719815254211,
"learning_rate": 1.962547365468216e-05,
"loss": 0.7794229984283447,
"step": 309
},
{
"epoch": 0.40227088402270883,
"grad_norm": 0.5989699363708496,
"learning_rate": 1.962158703589442e-05,
"loss": 0.6963369846343994,
"step": 310
},
{
"epoch": 0.4035685320356853,
"grad_norm": 0.5891120433807373,
"learning_rate": 1.9617680743396452e-05,
"loss": 0.7737009525299072,
"step": 311
},
{
"epoch": 0.4048661800486618,
"grad_norm": 0.5753238201141357,
"learning_rate": 1.961375478517564e-05,
"loss": 0.6912685632705688,
"step": 312
},
{
"epoch": 0.40616382806163825,
"grad_norm": 0.6656221747398376,
"learning_rate": 1.9609809169259573e-05,
"loss": 0.7757899165153503,
"step": 313
},
{
"epoch": 0.40746147607461475,
"grad_norm": 0.6444079875946045,
"learning_rate": 1.960584390371604e-05,
"loss": 0.7399554252624512,
"step": 314
},
{
"epoch": 0.40875912408759124,
"grad_norm": 0.5455271601676941,
"learning_rate": 1.9601858996653004e-05,
"loss": 0.7261430025100708,
"step": 315
},
{
"epoch": 0.41005677210056773,
"grad_norm": 0.5660345554351807,
"learning_rate": 1.9597854456218588e-05,
"loss": 0.7287646532058716,
"step": 316
},
{
"epoch": 0.41135442011354423,
"grad_norm": 0.5909862518310547,
"learning_rate": 1.9593830290601067e-05,
"loss": 0.7831040620803833,
"step": 317
},
{
"epoch": 0.41265206812652067,
"grad_norm": 0.5852524638175964,
"learning_rate": 1.9589786508028842e-05,
"loss": 0.7229428291320801,
"step": 318
},
{
"epoch": 0.41394971613949716,
"grad_norm": 0.5916611552238464,
"learning_rate": 1.9585723116770425e-05,
"loss": 0.7438414692878723,
"step": 319
},
{
"epoch": 0.41524736415247365,
"grad_norm": 0.5859969854354858,
"learning_rate": 1.9581640125134415e-05,
"loss": 0.7692857384681702,
"step": 320
},
{
"epoch": 0.41654501216545015,
"grad_norm": 0.5748182535171509,
"learning_rate": 1.9577537541469506e-05,
"loss": 0.7208437919616699,
"step": 321
},
{
"epoch": 0.4178426601784266,
"grad_norm": 0.5739149451255798,
"learning_rate": 1.957341537416444e-05,
"loss": 0.6877571940422058,
"step": 322
},
{
"epoch": 0.4191403081914031,
"grad_norm": 0.6014899611473083,
"learning_rate": 1.9569273631648005e-05,
"loss": 0.7482254505157471,
"step": 323
},
{
"epoch": 0.42043795620437957,
"grad_norm": 0.5997340679168701,
"learning_rate": 1.9565112322389017e-05,
"loss": 0.735174298286438,
"step": 324
},
{
"epoch": 0.42173560421735606,
"grad_norm": 0.572567343711853,
"learning_rate": 1.95609314548963e-05,
"loss": 0.7159808874130249,
"step": 325
},
{
"epoch": 0.4230332522303325,
"grad_norm": 0.5567170977592468,
"learning_rate": 1.955673103771867e-05,
"loss": 0.6460487842559814,
"step": 326
},
{
"epoch": 0.424330900243309,
"grad_norm": 0.570945143699646,
"learning_rate": 1.9552511079444914e-05,
"loss": 0.780687689781189,
"step": 327
},
{
"epoch": 0.4256285482562855,
"grad_norm": 0.5721143484115601,
"learning_rate": 1.9548271588703783e-05,
"loss": 0.7781848907470703,
"step": 328
},
{
"epoch": 0.426926196269262,
"grad_norm": 0.5866307616233826,
"learning_rate": 1.954401257416396e-05,
"loss": 0.6634104251861572,
"step": 329
},
{
"epoch": 0.4282238442822384,
"grad_norm": 0.575668215751648,
"learning_rate": 1.9539734044534057e-05,
"loss": 0.7831740379333496,
"step": 330
},
{
"epoch": 0.4295214922952149,
"grad_norm": 0.5764342546463013,
"learning_rate": 1.9535436008562576e-05,
"loss": 0.7253679037094116,
"step": 331
},
{
"epoch": 0.4308191403081914,
"grad_norm": 0.5597108006477356,
"learning_rate": 1.9531118475037916e-05,
"loss": 0.6709398627281189,
"step": 332
},
{
"epoch": 0.4321167883211679,
"grad_norm": 0.595028817653656,
"learning_rate": 1.9526781452788342e-05,
"loss": 0.7365997433662415,
"step": 333
},
{
"epoch": 0.43341443633414434,
"grad_norm": 0.5742825865745544,
"learning_rate": 1.9522424950681964e-05,
"loss": 0.7389061450958252,
"step": 334
},
{
"epoch": 0.43471208434712083,
"grad_norm": 0.55686354637146,
"learning_rate": 1.951804897762673e-05,
"loss": 0.6932294964790344,
"step": 335
},
{
"epoch": 0.4360097323600973,
"grad_norm": 0.6195898652076721,
"learning_rate": 1.951365354257039e-05,
"loss": 0.689919114112854,
"step": 336
},
{
"epoch": 0.4373073803730738,
"grad_norm": 0.5357776284217834,
"learning_rate": 1.9509238654500505e-05,
"loss": 0.6890056133270264,
"step": 337
},
{
"epoch": 0.4386050283860503,
"grad_norm": 0.563254177570343,
"learning_rate": 1.95048043224444e-05,
"loss": 0.7118027806282043,
"step": 338
},
{
"epoch": 0.43990267639902675,
"grad_norm": 0.5649257302284241,
"learning_rate": 1.9500350555469164e-05,
"loss": 0.7314987182617188,
"step": 339
},
{
"epoch": 0.44120032441200324,
"grad_norm": 0.5675091743469238,
"learning_rate": 1.9495877362681613e-05,
"loss": 0.6302130222320557,
"step": 340
},
{
"epoch": 0.44249797242497974,
"grad_norm": 0.5489922761917114,
"learning_rate": 1.9491384753228308e-05,
"loss": 0.7357535362243652,
"step": 341
},
{
"epoch": 0.44379562043795623,
"grad_norm": 0.5530965924263,
"learning_rate": 1.948687273629549e-05,
"loss": 0.6449010372161865,
"step": 342
},
{
"epoch": 0.44509326845093267,
"grad_norm": 0.5747541189193726,
"learning_rate": 1.9482341321109096e-05,
"loss": 0.7252374887466431,
"step": 343
},
{
"epoch": 0.44639091646390916,
"grad_norm": 0.5609497427940369,
"learning_rate": 1.947779051693472e-05,
"loss": 0.7096484899520874,
"step": 344
},
{
"epoch": 0.44768856447688565,
"grad_norm": 0.5988261699676514,
"learning_rate": 1.9473220333077604e-05,
"loss": 0.7986630201339722,
"step": 345
},
{
"epoch": 0.44898621248986215,
"grad_norm": 0.6313751935958862,
"learning_rate": 1.946863077888262e-05,
"loss": 0.8356250524520874,
"step": 346
},
{
"epoch": 0.4502838605028386,
"grad_norm": 0.565196692943573,
"learning_rate": 1.946402186373424e-05,
"loss": 0.7527079582214355,
"step": 347
},
{
"epoch": 0.4515815085158151,
"grad_norm": 0.5944785475730896,
"learning_rate": 1.9459393597056536e-05,
"loss": 0.6996445655822754,
"step": 348
},
{
"epoch": 0.45287915652879157,
"grad_norm": 0.5384091734886169,
"learning_rate": 1.9454745988313135e-05,
"loss": 0.7005808353424072,
"step": 349
},
{
"epoch": 0.45417680454176806,
"grad_norm": 0.5926419496536255,
"learning_rate": 1.945007904700723e-05,
"loss": 0.7360185384750366,
"step": 350
},
{
"epoch": 0.4554744525547445,
"grad_norm": 0.5517107844352722,
"learning_rate": 1.9445392782681523e-05,
"loss": 0.6678152084350586,
"step": 351
},
{
"epoch": 0.456772100567721,
"grad_norm": 0.5527735352516174,
"learning_rate": 1.9440687204918245e-05,
"loss": 0.719680666923523,
"step": 352
},
{
"epoch": 0.4580697485806975,
"grad_norm": 0.5603200793266296,
"learning_rate": 1.943596232333911e-05,
"loss": 0.7023108005523682,
"step": 353
},
{
"epoch": 0.459367396593674,
"grad_norm": 0.5883275866508484,
"learning_rate": 1.9431218147605307e-05,
"loss": 0.7870659232139587,
"step": 354
},
{
"epoch": 0.4606650446066504,
"grad_norm": 0.5547419786453247,
"learning_rate": 1.9426454687417474e-05,
"loss": 0.693616509437561,
"step": 355
},
{
"epoch": 0.4619626926196269,
"grad_norm": 0.5387628674507141,
"learning_rate": 1.942167195251568e-05,
"loss": 0.6275761127471924,
"step": 356
},
{
"epoch": 0.4632603406326034,
"grad_norm": 0.5728762745857239,
"learning_rate": 1.941686995267941e-05,
"loss": 0.7649428844451904,
"step": 357
},
{
"epoch": 0.4645579886455799,
"grad_norm": 0.5744031667709351,
"learning_rate": 1.941204869772753e-05,
"loss": 0.746831476688385,
"step": 358
},
{
"epoch": 0.4658556366585564,
"grad_norm": 0.5453589558601379,
"learning_rate": 1.9407208197518296e-05,
"loss": 0.7251806259155273,
"step": 359
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.5643113851547241,
"learning_rate": 1.94023484619493e-05,
"loss": 0.6882834434509277,
"step": 360
},
{
"epoch": 0.4684509326845093,
"grad_norm": 0.5984339714050293,
"learning_rate": 1.9397469500957478e-05,
"loss": 0.7512071132659912,
"step": 361
},
{
"epoch": 0.4697485806974858,
"grad_norm": 0.5487557649612427,
"learning_rate": 1.939257132451906e-05,
"loss": 0.7803584337234497,
"step": 362
},
{
"epoch": 0.4710462287104623,
"grad_norm": 0.5798037648200989,
"learning_rate": 1.9387653942649586e-05,
"loss": 0.7196419835090637,
"step": 363
},
{
"epoch": 0.47234387672343875,
"grad_norm": 0.5554172396659851,
"learning_rate": 1.9382717365403854e-05,
"loss": 0.7393349409103394,
"step": 364
},
{
"epoch": 0.47364152473641524,
"grad_norm": 0.546137273311615,
"learning_rate": 1.9377761602875913e-05,
"loss": 0.7212686538696289,
"step": 365
},
{
"epoch": 0.47493917274939174,
"grad_norm": 0.5687487125396729,
"learning_rate": 1.937278666519905e-05,
"loss": 0.7769354581832886,
"step": 366
},
{
"epoch": 0.47623682076236823,
"grad_norm": 0.5400050282478333,
"learning_rate": 1.9367792562545744e-05,
"loss": 0.721081018447876,
"step": 367
},
{
"epoch": 0.47753446877534467,
"grad_norm": 0.5545980930328369,
"learning_rate": 1.9362779305127674e-05,
"loss": 0.6797982454299927,
"step": 368
},
{
"epoch": 0.47883211678832116,
"grad_norm": 0.5371907949447632,
"learning_rate": 1.9357746903195686e-05,
"loss": 0.7223237752914429,
"step": 369
},
{
"epoch": 0.48012976480129765,
"grad_norm": 0.534491240978241,
"learning_rate": 1.9352695367039764e-05,
"loss": 0.7010591626167297,
"step": 370
},
{
"epoch": 0.48142741281427415,
"grad_norm": 0.5431662797927856,
"learning_rate": 1.9347624706989026e-05,
"loss": 0.7298872470855713,
"step": 371
},
{
"epoch": 0.4827250608272506,
"grad_norm": 0.5843503475189209,
"learning_rate": 1.9342534933411683e-05,
"loss": 0.7810012698173523,
"step": 372
},
{
"epoch": 0.4840227088402271,
"grad_norm": 0.5278732776641846,
"learning_rate": 1.9337426056715036e-05,
"loss": 0.7204632759094238,
"step": 373
},
{
"epoch": 0.48532035685320357,
"grad_norm": 0.5900875926017761,
"learning_rate": 1.9332298087345447e-05,
"loss": 0.7081923484802246,
"step": 374
},
{
"epoch": 0.48661800486618007,
"grad_norm": 0.5549632906913757,
"learning_rate": 1.932715103578831e-05,
"loss": 0.7588300704956055,
"step": 375
},
{
"epoch": 0.4879156528791565,
"grad_norm": 0.5351032018661499,
"learning_rate": 1.9321984912568048e-05,
"loss": 0.6380345821380615,
"step": 376
},
{
"epoch": 0.489213300892133,
"grad_norm": 0.5553699135780334,
"learning_rate": 1.9316799728248074e-05,
"loss": 0.7115924954414368,
"step": 377
},
{
"epoch": 0.4905109489051095,
"grad_norm": 0.5904532670974731,
"learning_rate": 1.9311595493430776e-05,
"loss": 0.7918650507926941,
"step": 378
},
{
"epoch": 0.491808596918086,
"grad_norm": 0.5718861818313599,
"learning_rate": 1.93063722187575e-05,
"loss": 0.7574873566627502,
"step": 379
},
{
"epoch": 0.4931062449310625,
"grad_norm": 0.5575288534164429,
"learning_rate": 1.9301129914908516e-05,
"loss": 0.7619529962539673,
"step": 380
},
{
"epoch": 0.4944038929440389,
"grad_norm": 0.5972062945365906,
"learning_rate": 1.9295868592603012e-05,
"loss": 0.8739205598831177,
"step": 381
},
{
"epoch": 0.4957015409570154,
"grad_norm": 0.5725207328796387,
"learning_rate": 1.929058826259906e-05,
"loss": 0.7461530566215515,
"step": 382
},
{
"epoch": 0.4969991889699919,
"grad_norm": 0.7559300065040588,
"learning_rate": 1.9285288935693597e-05,
"loss": 0.7054376602172852,
"step": 383
},
{
"epoch": 0.4982968369829684,
"grad_norm": 0.5533690452575684,
"learning_rate": 1.9279970622722403e-05,
"loss": 0.742769718170166,
"step": 384
},
{
"epoch": 0.49959448499594483,
"grad_norm": 0.5702188014984131,
"learning_rate": 1.927463333456009e-05,
"loss": 0.7912020683288574,
"step": 385
},
{
"epoch": 0.5008921330089213,
"grad_norm": 0.5261266231536865,
"learning_rate": 1.9269277082120053e-05,
"loss": 0.7539711594581604,
"step": 386
},
{
"epoch": 0.5021897810218978,
"grad_norm": 0.5590584874153137,
"learning_rate": 1.926390187635448e-05,
"loss": 0.7646081447601318,
"step": 387
},
{
"epoch": 0.5034874290348743,
"grad_norm": 0.5796819925308228,
"learning_rate": 1.92585077282543e-05,
"loss": 0.7352266907691956,
"step": 388
},
{
"epoch": 0.5047850770478508,
"grad_norm": 0.5712133049964905,
"learning_rate": 1.9253094648849183e-05,
"loss": 0.7203606367111206,
"step": 389
},
{
"epoch": 0.5060827250608273,
"grad_norm": 0.597654402256012,
"learning_rate": 1.924766264920751e-05,
"loss": 0.8121019601821899,
"step": 390
},
{
"epoch": 0.5073803730738037,
"grad_norm": 0.5626549124717712,
"learning_rate": 1.9242211740436335e-05,
"loss": 0.7297658920288086,
"step": 391
},
{
"epoch": 0.5086780210867802,
"grad_norm": 0.6014045476913452,
"learning_rate": 1.9236741933681396e-05,
"loss": 0.7325990200042725,
"step": 392
},
{
"epoch": 0.5099756690997567,
"grad_norm": 0.5554893612861633,
"learning_rate": 1.9231253240127062e-05,
"loss": 0.680641770362854,
"step": 393
},
{
"epoch": 0.5112733171127332,
"grad_norm": 0.5787703394889832,
"learning_rate": 1.922574567099632e-05,
"loss": 0.7252123355865479,
"step": 394
},
{
"epoch": 0.5125709651257097,
"grad_norm": 0.5811824798583984,
"learning_rate": 1.9220219237550757e-05,
"loss": 0.7139418125152588,
"step": 395
},
{
"epoch": 0.5138686131386861,
"grad_norm": 0.547007143497467,
"learning_rate": 1.921467395109053e-05,
"loss": 0.6985068917274475,
"step": 396
},
{
"epoch": 0.5151662611516626,
"grad_norm": 0.6072813272476196,
"learning_rate": 1.9209109822954345e-05,
"loss": 0.7519763708114624,
"step": 397
},
{
"epoch": 0.5164639091646391,
"grad_norm": 0.5965511798858643,
"learning_rate": 1.9203526864519432e-05,
"loss": 0.7568516135215759,
"step": 398
},
{
"epoch": 0.5177615571776155,
"grad_norm": 0.5627179741859436,
"learning_rate": 1.919792508720154e-05,
"loss": 0.7021974921226501,
"step": 399
},
{
"epoch": 0.519059205190592,
"grad_norm": 0.5491631627082825,
"learning_rate": 1.9192304502454876e-05,
"loss": 0.6992515325546265,
"step": 400
},
{
"epoch": 0.5203568532035685,
"grad_norm": 0.5874002575874329,
"learning_rate": 1.918666512177211e-05,
"loss": 0.712739109992981,
"step": 401
},
{
"epoch": 0.521654501216545,
"grad_norm": 0.5660138726234436,
"learning_rate": 1.918100695668436e-05,
"loss": 0.6854047775268555,
"step": 402
},
{
"epoch": 0.5229521492295215,
"grad_norm": 0.565985381603241,
"learning_rate": 1.917533001876113e-05,
"loss": 0.7300174236297607,
"step": 403
},
{
"epoch": 0.524249797242498,
"grad_norm": 0.5489518642425537,
"learning_rate": 1.916963431961033e-05,
"loss": 0.7667282819747925,
"step": 404
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.569230318069458,
"learning_rate": 1.916391987087822e-05,
"loss": 0.7247310876846313,
"step": 405
},
{
"epoch": 0.526845093268451,
"grad_norm": 0.5969386696815491,
"learning_rate": 1.9158186684249397e-05,
"loss": 0.7719178199768066,
"step": 406
},
{
"epoch": 0.5281427412814275,
"grad_norm": 0.5550801157951355,
"learning_rate": 1.9152434771446783e-05,
"loss": 0.6853774785995483,
"step": 407
},
{
"epoch": 0.5294403892944038,
"grad_norm": 0.5440778136253357,
"learning_rate": 1.914666414423158e-05,
"loss": 0.681282639503479,
"step": 408
},
{
"epoch": 0.5307380373073803,
"grad_norm": 0.5368308424949646,
"learning_rate": 1.914087481440326e-05,
"loss": 0.7318757772445679,
"step": 409
},
{
"epoch": 0.5320356853203568,
"grad_norm": 0.6122865676879883,
"learning_rate": 1.9135066793799538e-05,
"loss": 0.6974803805351257,
"step": 410
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.5386953353881836,
"learning_rate": 1.912924009429635e-05,
"loss": 0.7397326827049255,
"step": 411
},
{
"epoch": 0.5346309813463098,
"grad_norm": 0.5616509914398193,
"learning_rate": 1.9123394727807816e-05,
"loss": 0.7613886594772339,
"step": 412
},
{
"epoch": 0.5359286293592863,
"grad_norm": 0.627604067325592,
"learning_rate": 1.9117530706286232e-05,
"loss": 0.7783684730529785,
"step": 413
},
{
"epoch": 0.5372262773722628,
"grad_norm": 0.5613445043563843,
"learning_rate": 1.9111648041722044e-05,
"loss": 0.7296919226646423,
"step": 414
},
{
"epoch": 0.5385239253852393,
"grad_norm": 0.5356356501579285,
"learning_rate": 1.91057467461438e-05,
"loss": 0.7119168639183044,
"step": 415
},
{
"epoch": 0.5398215733982157,
"grad_norm": 0.5709317326545715,
"learning_rate": 1.9099826831618168e-05,
"loss": 0.6891450881958008,
"step": 416
},
{
"epoch": 0.5411192214111922,
"grad_norm": 0.5525058507919312,
"learning_rate": 1.909388831024987e-05,
"loss": 0.7220831513404846,
"step": 417
},
{
"epoch": 0.5424168694241687,
"grad_norm": 0.5916740894317627,
"learning_rate": 1.908793119418168e-05,
"loss": 0.7380563020706177,
"step": 418
},
{
"epoch": 0.5437145174371452,
"grad_norm": 0.5553448796272278,
"learning_rate": 1.9081955495594388e-05,
"loss": 0.6854832172393799,
"step": 419
},
{
"epoch": 0.5450121654501217,
"grad_norm": 0.550918459892273,
"learning_rate": 1.9075961226706784e-05,
"loss": 0.755254864692688,
"step": 420
},
{
"epoch": 0.5463098134630981,
"grad_norm": 0.5704249739646912,
"learning_rate": 1.906994839977564e-05,
"loss": 0.762306272983551,
"step": 421
},
{
"epoch": 0.5476074614760746,
"grad_norm": 0.5444906949996948,
"learning_rate": 1.9063917027095664e-05,
"loss": 0.7424022555351257,
"step": 422
},
{
"epoch": 0.5489051094890511,
"grad_norm": 0.5842110514640808,
"learning_rate": 1.905786712099948e-05,
"loss": 0.7851117849349976,
"step": 423
},
{
"epoch": 0.5502027575020276,
"grad_norm": 0.5527293086051941,
"learning_rate": 1.9051798693857617e-05,
"loss": 0.7389935255050659,
"step": 424
},
{
"epoch": 0.551500405515004,
"grad_norm": 0.5890975594520569,
"learning_rate": 1.904571175807848e-05,
"loss": 0.7679333686828613,
"step": 425
},
{
"epoch": 0.5527980535279805,
"grad_norm": 0.5342135429382324,
"learning_rate": 1.9039606326108297e-05,
"loss": 0.7123668193817139,
"step": 426
},
{
"epoch": 0.554095701540957,
"grad_norm": 0.5628570914268494,
"learning_rate": 1.903348241043114e-05,
"loss": 0.7286348342895508,
"step": 427
},
{
"epoch": 0.5553933495539335,
"grad_norm": 0.5398725867271423,
"learning_rate": 1.902734002356887e-05,
"loss": 0.7192749977111816,
"step": 428
},
{
"epoch": 0.55669099756691,
"grad_norm": 0.5142056941986084,
"learning_rate": 1.9021179178081107e-05,
"loss": 0.6286910772323608,
"step": 429
},
{
"epoch": 0.5579886455798865,
"grad_norm": 0.5470032095909119,
"learning_rate": 1.9014999886565226e-05,
"loss": 0.6505739092826843,
"step": 430
},
{
"epoch": 0.559286293592863,
"grad_norm": 0.5600834488868713,
"learning_rate": 1.9008802161656308e-05,
"loss": 0.7014046907424927,
"step": 431
},
{
"epoch": 0.5605839416058395,
"grad_norm": 0.5533670783042908,
"learning_rate": 1.9002586016027136e-05,
"loss": 0.7095932364463806,
"step": 432
},
{
"epoch": 0.5618815896188158,
"grad_norm": 0.5443385243415833,
"learning_rate": 1.8996351462388153e-05,
"loss": 0.7492538094520569,
"step": 433
},
{
"epoch": 0.5631792376317923,
"grad_norm": 0.5775622129440308,
"learning_rate": 1.8990098513487447e-05,
"loss": 0.7882871627807617,
"step": 434
},
{
"epoch": 0.5644768856447688,
"grad_norm": 0.5645557045936584,
"learning_rate": 1.898382718211071e-05,
"loss": 0.6681729555130005,
"step": 435
},
{
"epoch": 0.5657745336577453,
"grad_norm": 0.562117874622345,
"learning_rate": 1.897753748108123e-05,
"loss": 0.7754248380661011,
"step": 436
},
{
"epoch": 0.5670721816707218,
"grad_norm": 0.5395199656486511,
"learning_rate": 1.8971229423259855e-05,
"loss": 0.6584359407424927,
"step": 437
},
{
"epoch": 0.5683698296836983,
"grad_norm": 0.5511093735694885,
"learning_rate": 1.8964903021544964e-05,
"loss": 0.7121752500534058,
"step": 438
},
{
"epoch": 0.5696674776966748,
"grad_norm": 0.5518468022346497,
"learning_rate": 1.895855828887245e-05,
"loss": 0.7533795237541199,
"step": 439
},
{
"epoch": 0.5709651257096513,
"grad_norm": 0.541132926940918,
"learning_rate": 1.895219523821568e-05,
"loss": 0.6961894035339355,
"step": 440
},
{
"epoch": 0.5722627737226277,
"grad_norm": 0.5566806197166443,
"learning_rate": 1.894581388258549e-05,
"loss": 0.7168055176734924,
"step": 441
},
{
"epoch": 0.5735604217356042,
"grad_norm": 0.8438438773155212,
"learning_rate": 1.8939414235030137e-05,
"loss": 0.7322010397911072,
"step": 442
},
{
"epoch": 0.5748580697485807,
"grad_norm": 0.5508759617805481,
"learning_rate": 1.893299630863527e-05,
"loss": 0.689163327217102,
"step": 443
},
{
"epoch": 0.5761557177615572,
"grad_norm": 0.577190637588501,
"learning_rate": 1.892656011652393e-05,
"loss": 0.7421369552612305,
"step": 444
},
{
"epoch": 0.5774533657745337,
"grad_norm": 0.5557067394256592,
"learning_rate": 1.8920105671856507e-05,
"loss": 0.6984370350837708,
"step": 445
},
{
"epoch": 0.5787510137875101,
"grad_norm": 0.5880769491195679,
"learning_rate": 1.89136329878307e-05,
"loss": 0.6648968458175659,
"step": 446
},
{
"epoch": 0.5800486618004866,
"grad_norm": 0.5225708484649658,
"learning_rate": 1.890714207768151e-05,
"loss": 0.6399903297424316,
"step": 447
},
{
"epoch": 0.5813463098134631,
"grad_norm": 2.8270366191864014,
"learning_rate": 1.8900632954681203e-05,
"loss": 0.7426702380180359,
"step": 448
},
{
"epoch": 0.5826439578264396,
"grad_norm": 0.5743777751922607,
"learning_rate": 1.8894105632139296e-05,
"loss": 0.7008408308029175,
"step": 449
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.5848923325538635,
"learning_rate": 1.8887560123402505e-05,
"loss": 0.7745944261550903,
"step": 450
},
{
"epoch": 0.5852392538523925,
"grad_norm": 0.5533474087715149,
"learning_rate": 1.888099644185474e-05,
"loss": 0.7078051567077637,
"step": 451
},
{
"epoch": 0.586536901865369,
"grad_norm": 0.5359990000724792,
"learning_rate": 1.887441460091707e-05,
"loss": 0.7025009393692017,
"step": 452
},
{
"epoch": 0.5878345498783455,
"grad_norm": 0.5772839784622192,
"learning_rate": 1.886781461404769e-05,
"loss": 0.7109262347221375,
"step": 453
},
{
"epoch": 0.589132197891322,
"grad_norm": 0.5491592288017273,
"learning_rate": 1.886119649474191e-05,
"loss": 0.6828133463859558,
"step": 454
},
{
"epoch": 0.5904298459042985,
"grad_norm": 0.5495162606239319,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.6600109338760376,
"step": 455
},
{
"epoch": 0.591727493917275,
"grad_norm": 0.5773736238479614,
"learning_rate": 1.8847905912987693e-05,
"loss": 0.6746517419815063,
"step": 456
},
{
"epoch": 0.5930251419302515,
"grad_norm": 0.5658586621284485,
"learning_rate": 1.8841233477715136e-05,
"loss": 0.6905688047409058,
"step": 457
},
{
"epoch": 0.5943227899432278,
"grad_norm": 0.544463574886322,
"learning_rate": 1.8834542964357875e-05,
"loss": 0.7656948566436768,
"step": 458
},
{
"epoch": 0.5956204379562043,
"grad_norm": 0.5466704964637756,
"learning_rate": 1.8827834386596306e-05,
"loss": 0.7320756912231445,
"step": 459
},
{
"epoch": 0.5969180859691808,
"grad_norm": 0.534042477607727,
"learning_rate": 1.882110775814778e-05,
"loss": 0.6747853755950928,
"step": 460
},
{
"epoch": 0.5969180859691808,
"eval_loss": 0.7028419375419617,
"eval_runtime": 72.8032,
"eval_samples_per_second": 71.316,
"eval_steps_per_second": 8.914,
"step": 460
},
{
"epoch": 0.5982157339821573,
"grad_norm": 0.5617560148239136,
"learning_rate": 1.881436309276655e-05,
"loss": 0.7175489068031311,
"step": 461
},
{
"epoch": 0.5995133819951338,
"grad_norm": 0.538003146648407,
"learning_rate": 1.8807600404243746e-05,
"loss": 0.6772977709770203,
"step": 462
},
{
"epoch": 0.6008110300081103,
"grad_norm": 0.5164902210235596,
"learning_rate": 1.8800819706407355e-05,
"loss": 0.7026697397232056,
"step": 463
},
{
"epoch": 0.6021086780210868,
"grad_norm": 0.519985556602478,
"learning_rate": 1.879402101312219e-05,
"loss": 0.6459539532661438,
"step": 464
},
{
"epoch": 0.6034063260340633,
"grad_norm": 0.5643022060394287,
"learning_rate": 1.8787204338289858e-05,
"loss": 0.7304619550704956,
"step": 465
},
{
"epoch": 0.6047039740470398,
"grad_norm": 0.5315333604812622,
"learning_rate": 1.8780369695848733e-05,
"loss": 0.7055330872535706,
"step": 466
},
{
"epoch": 0.6060016220600162,
"grad_norm": 0.5695874691009521,
"learning_rate": 1.8773517099773927e-05,
"loss": 0.7567015290260315,
"step": 467
},
{
"epoch": 0.6072992700729927,
"grad_norm": 0.5361006259918213,
"learning_rate": 1.8766646564077265e-05,
"loss": 0.7254809141159058,
"step": 468
},
{
"epoch": 0.6085969180859692,
"grad_norm": 0.5438353419303894,
"learning_rate": 1.8759758102807253e-05,
"loss": 0.6743266582489014,
"step": 469
},
{
"epoch": 0.6098945660989457,
"grad_norm": 0.5824978351593018,
"learning_rate": 1.8752851730049055e-05,
"loss": 0.7623616456985474,
"step": 470
},
{
"epoch": 0.6111922141119221,
"grad_norm": 0.546610951423645,
"learning_rate": 1.8745927459924454e-05,
"loss": 0.809882640838623,
"step": 471
},
{
"epoch": 0.6124898621248986,
"grad_norm": 0.5459777116775513,
"learning_rate": 1.8738985306591826e-05,
"loss": 0.6817529201507568,
"step": 472
},
{
"epoch": 0.6137875101378751,
"grad_norm": 0.5381180644035339,
"learning_rate": 1.8732025284246122e-05,
"loss": 0.7059892416000366,
"step": 473
},
{
"epoch": 0.6150851581508516,
"grad_norm": 0.5245769023895264,
"learning_rate": 1.8725047407118823e-05,
"loss": 0.7031271457672119,
"step": 474
},
{
"epoch": 0.616382806163828,
"grad_norm": 0.5284971594810486,
"learning_rate": 1.8718051689477923e-05,
"loss": 0.7379744052886963,
"step": 475
},
{
"epoch": 0.6176804541768045,
"grad_norm": 0.5659690499305725,
"learning_rate": 1.8711038145627893e-05,
"loss": 0.7798171639442444,
"step": 476
},
{
"epoch": 0.618978102189781,
"grad_norm": 0.5460679531097412,
"learning_rate": 1.8704006789909654e-05,
"loss": 0.7433549165725708,
"step": 477
},
{
"epoch": 0.6202757502027575,
"grad_norm": 0.5171265602111816,
"learning_rate": 1.8696957636700555e-05,
"loss": 0.7264508008956909,
"step": 478
},
{
"epoch": 0.621573398215734,
"grad_norm": 0.5979129672050476,
"learning_rate": 1.868989070041432e-05,
"loss": 0.7511105537414551,
"step": 479
},
{
"epoch": 0.6228710462287105,
"grad_norm": 0.5520970225334167,
"learning_rate": 1.8682805995501052e-05,
"loss": 0.6946426630020142,
"step": 480
},
{
"epoch": 0.624168694241687,
"grad_norm": 0.5510658025741577,
"learning_rate": 1.8675703536447178e-05,
"loss": 0.7265397310256958,
"step": 481
},
{
"epoch": 0.6254663422546635,
"grad_norm": 0.5842864513397217,
"learning_rate": 1.866858333777543e-05,
"loss": 0.7219571471214294,
"step": 482
},
{
"epoch": 0.6267639902676398,
"grad_norm": 0.5430331826210022,
"learning_rate": 1.8661445414044813e-05,
"loss": 0.7292179465293884,
"step": 483
},
{
"epoch": 0.6280616382806163,
"grad_norm": 0.5456423759460449,
"learning_rate": 1.865428977985057e-05,
"loss": 0.7341865301132202,
"step": 484
},
{
"epoch": 0.6293592862935928,
"grad_norm": 0.55687415599823,
"learning_rate": 1.8647116449824165e-05,
"loss": 0.7712036371231079,
"step": 485
},
{
"epoch": 0.6306569343065693,
"grad_norm": 0.574967622756958,
"learning_rate": 1.8639925438633243e-05,
"loss": 0.7341934442520142,
"step": 486
},
{
"epoch": 0.6319545823195458,
"grad_norm": 0.575878381729126,
"learning_rate": 1.86327167609816e-05,
"loss": 0.6782741546630859,
"step": 487
},
{
"epoch": 0.6332522303325223,
"grad_norm": 0.5638167858123779,
"learning_rate": 1.8625490431609154e-05,
"loss": 0.8088809251785278,
"step": 488
},
{
"epoch": 0.6345498783454988,
"grad_norm": 0.547574520111084,
"learning_rate": 1.8618246465291925e-05,
"loss": 0.7108902335166931,
"step": 489
},
{
"epoch": 0.6358475263584753,
"grad_norm": 0.5785483121871948,
"learning_rate": 1.861098487684199e-05,
"loss": 0.6963984370231628,
"step": 490
},
{
"epoch": 0.6371451743714518,
"grad_norm": 0.547226071357727,
"learning_rate": 1.8603705681107456e-05,
"loss": 0.6772190928459167,
"step": 491
},
{
"epoch": 0.6384428223844282,
"grad_norm": 0.5494422912597656,
"learning_rate": 1.8596408892972442e-05,
"loss": 0.7243861556053162,
"step": 492
},
{
"epoch": 0.6397404703974047,
"grad_norm": 0.5267540216445923,
"learning_rate": 1.858909452735703e-05,
"loss": 0.6649144887924194,
"step": 493
},
{
"epoch": 0.6410381184103812,
"grad_norm": 0.5952751636505127,
"learning_rate": 1.858176259921724e-05,
"loss": 0.7574429512023926,
"step": 494
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.5476658344268799,
"learning_rate": 1.857441312354502e-05,
"loss": 0.6968377828598022,
"step": 495
},
{
"epoch": 0.6436334144363342,
"grad_norm": 0.5507075786590576,
"learning_rate": 1.856704611536818e-05,
"loss": 0.7353919744491577,
"step": 496
},
{
"epoch": 0.6449310624493106,
"grad_norm": 0.5495625734329224,
"learning_rate": 1.8559661589750387e-05,
"loss": 0.7162117958068848,
"step": 497
},
{
"epoch": 0.6462287104622871,
"grad_norm": 0.5721608996391296,
"learning_rate": 1.8552259561791133e-05,
"loss": 0.6986855268478394,
"step": 498
},
{
"epoch": 0.6475263584752636,
"grad_norm": 0.5700922608375549,
"learning_rate": 1.8544840046625686e-05,
"loss": 0.8195285797119141,
"step": 499
},
{
"epoch": 0.64882400648824,
"grad_norm": 0.5746553540229797,
"learning_rate": 1.8537403059425082e-05,
"loss": 0.7492556571960449,
"step": 500
},
{
"epoch": 0.6501216545012165,
"grad_norm": 0.5598172545433044,
"learning_rate": 1.852994861539607e-05,
"loss": 0.6921173930168152,
"step": 501
},
{
"epoch": 0.651419302514193,
"grad_norm": 0.5589975714683533,
"learning_rate": 1.8522476729781106e-05,
"loss": 0.7157631516456604,
"step": 502
},
{
"epoch": 0.6527169505271695,
"grad_norm": 0.5745802521705627,
"learning_rate": 1.8514987417858306e-05,
"loss": 0.7679554224014282,
"step": 503
},
{
"epoch": 0.654014598540146,
"grad_norm": 0.581063449382782,
"learning_rate": 1.8507480694941416e-05,
"loss": 0.7761994004249573,
"step": 504
},
{
"epoch": 0.6553122465531225,
"grad_norm": 0.5932230353355408,
"learning_rate": 1.849995657637978e-05,
"loss": 0.748866081237793,
"step": 505
},
{
"epoch": 0.656609894566099,
"grad_norm": 0.5524072647094727,
"learning_rate": 1.8492415077558325e-05,
"loss": 0.7764031887054443,
"step": 506
},
{
"epoch": 0.6579075425790755,
"grad_norm": 0.5266931653022766,
"learning_rate": 1.8484856213897496e-05,
"loss": 0.7512728571891785,
"step": 507
},
{
"epoch": 0.659205190592052,
"grad_norm": 0.5363677740097046,
"learning_rate": 1.847728000085327e-05,
"loss": 0.7477032542228699,
"step": 508
},
{
"epoch": 0.6605028386050283,
"grad_norm": 0.5348376035690308,
"learning_rate": 1.8469686453917074e-05,
"loss": 0.6908712387084961,
"step": 509
},
{
"epoch": 0.6618004866180048,
"grad_norm": 0.5489766597747803,
"learning_rate": 1.846207558861579e-05,
"loss": 0.7576340436935425,
"step": 510
},
{
"epoch": 0.6630981346309813,
"grad_norm": 0.5426369309425354,
"learning_rate": 1.845444742051172e-05,
"loss": 0.7107582092285156,
"step": 511
},
{
"epoch": 0.6643957826439578,
"grad_norm": 0.5308833718299866,
"learning_rate": 1.8446801965202524e-05,
"loss": 0.6590298414230347,
"step": 512
},
{
"epoch": 0.6656934306569343,
"grad_norm": 0.5621533989906311,
"learning_rate": 1.8439139238321235e-05,
"loss": 0.7291080355644226,
"step": 513
},
{
"epoch": 0.6669910786699108,
"grad_norm": 0.5651385188102722,
"learning_rate": 1.8431459255536185e-05,
"loss": 0.7855580449104309,
"step": 514
},
{
"epoch": 0.6682887266828873,
"grad_norm": 0.5611156225204468,
"learning_rate": 1.8423762032551e-05,
"loss": 0.6918215751647949,
"step": 515
},
{
"epoch": 0.6695863746958638,
"grad_norm": 0.5477362275123596,
"learning_rate": 1.841604758510454e-05,
"loss": 0.7025431394577026,
"step": 516
},
{
"epoch": 0.6708840227088402,
"grad_norm": 0.5612704753875732,
"learning_rate": 1.840831592897091e-05,
"loss": 0.7540648579597473,
"step": 517
},
{
"epoch": 0.6721816707218167,
"grad_norm": 0.5650063753128052,
"learning_rate": 1.8400567079959383e-05,
"loss": 0.7409968376159668,
"step": 518
},
{
"epoch": 0.6734793187347932,
"grad_norm": 0.5648168921470642,
"learning_rate": 1.8392801053914396e-05,
"loss": 0.754462718963623,
"step": 519
},
{
"epoch": 0.6747769667477697,
"grad_norm": 0.5603179931640625,
"learning_rate": 1.8385017866715507e-05,
"loss": 0.7388665080070496,
"step": 520
},
{
"epoch": 0.6760746147607462,
"grad_norm": 0.5628640651702881,
"learning_rate": 1.8377217534277365e-05,
"loss": 0.7781612873077393,
"step": 521
},
{
"epoch": 0.6773722627737226,
"grad_norm": 0.593789279460907,
"learning_rate": 1.8369400072549674e-05,
"loss": 0.753161609172821,
"step": 522
},
{
"epoch": 0.6786699107866991,
"grad_norm": 0.5755636096000671,
"learning_rate": 1.8361565497517166e-05,
"loss": 0.7570379972457886,
"step": 523
},
{
"epoch": 0.6799675587996756,
"grad_norm": 0.5607541799545288,
"learning_rate": 1.835371382519956e-05,
"loss": 0.777469277381897,
"step": 524
},
{
"epoch": 0.681265206812652,
"grad_norm": 0.4994042217731476,
"learning_rate": 1.8345845071651543e-05,
"loss": 0.6544281840324402,
"step": 525
},
{
"epoch": 0.6825628548256285,
"grad_norm": 0.5685398578643799,
"learning_rate": 1.8337959252962728e-05,
"loss": 0.7024877071380615,
"step": 526
},
{
"epoch": 0.683860502838605,
"grad_norm": 0.5343568325042725,
"learning_rate": 1.8330056385257607e-05,
"loss": 0.7003896832466125,
"step": 527
},
{
"epoch": 0.6851581508515815,
"grad_norm": 0.5208355188369751,
"learning_rate": 1.8322136484695553e-05,
"loss": 0.6797738075256348,
"step": 528
},
{
"epoch": 0.686455798864558,
"grad_norm": 0.5621144771575928,
"learning_rate": 1.8314199567470755e-05,
"loss": 0.6609838008880615,
"step": 529
},
{
"epoch": 0.6877534468775345,
"grad_norm": 0.577298104763031,
"learning_rate": 1.83062456498122e-05,
"loss": 0.711292028427124,
"step": 530
},
{
"epoch": 0.689051094890511,
"grad_norm": 0.5840193629264832,
"learning_rate": 1.8298274747983638e-05,
"loss": 0.7950271368026733,
"step": 531
},
{
"epoch": 0.6903487429034875,
"grad_norm": 0.5348870158195496,
"learning_rate": 1.8290286878283542e-05,
"loss": 0.6982176303863525,
"step": 532
},
{
"epoch": 0.691646390916464,
"grad_norm": 0.5467864871025085,
"learning_rate": 1.8282282057045087e-05,
"loss": 0.7555949687957764,
"step": 533
},
{
"epoch": 0.6929440389294403,
"grad_norm": 0.5581674575805664,
"learning_rate": 1.827426030063611e-05,
"loss": 0.6723984479904175,
"step": 534
},
{
"epoch": 0.6942416869424168,
"grad_norm": 0.5615087151527405,
"learning_rate": 1.8266221625459064e-05,
"loss": 0.7201924324035645,
"step": 535
},
{
"epoch": 0.6955393349553933,
"grad_norm": 0.5710893273353577,
"learning_rate": 1.825816604795101e-05,
"loss": 0.7096928358078003,
"step": 536
},
{
"epoch": 0.6968369829683698,
"grad_norm": 0.5586241483688354,
"learning_rate": 1.8250093584583567e-05,
"loss": 0.7197962999343872,
"step": 537
},
{
"epoch": 0.6981346309813463,
"grad_norm": 0.5536755323410034,
"learning_rate": 1.8242004251862872e-05,
"loss": 0.678354799747467,
"step": 538
},
{
"epoch": 0.6994322789943228,
"grad_norm": 0.5744696855545044,
"learning_rate": 1.823389806632957e-05,
"loss": 0.7439010739326477,
"step": 539
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.5338960886001587,
"learning_rate": 1.8225775044558757e-05,
"loss": 0.731925904750824,
"step": 540
},
{
"epoch": 0.7020275750202758,
"grad_norm": 0.5696558356285095,
"learning_rate": 1.8217635203159957e-05,
"loss": 0.7480655312538147,
"step": 541
},
{
"epoch": 0.7033252230332522,
"grad_norm": 0.5994415283203125,
"learning_rate": 1.8209478558777084e-05,
"loss": 0.776438295841217,
"step": 542
},
{
"epoch": 0.7046228710462287,
"grad_norm": 0.578956127166748,
"learning_rate": 1.8201305128088412e-05,
"loss": 0.7190870046615601,
"step": 543
},
{
"epoch": 0.7059205190592052,
"grad_norm": 0.557142972946167,
"learning_rate": 1.819311492780654e-05,
"loss": 0.7524915933609009,
"step": 544
},
{
"epoch": 0.7072181670721817,
"grad_norm": 0.5244631171226501,
"learning_rate": 1.8184907974678348e-05,
"loss": 0.6941534876823425,
"step": 545
},
{
"epoch": 0.7085158150851582,
"grad_norm": 0.5301777720451355,
"learning_rate": 1.8176684285484985e-05,
"loss": 0.7010957598686218,
"step": 546
},
{
"epoch": 0.7098134630981346,
"grad_norm": 0.5309736728668213,
"learning_rate": 1.816844387704181e-05,
"loss": 0.6693360209465027,
"step": 547
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.5188398361206055,
"learning_rate": 1.8160186766198375e-05,
"loss": 0.7254098057746887,
"step": 548
},
{
"epoch": 0.7124087591240876,
"grad_norm": 0.5340986847877502,
"learning_rate": 1.815191296983838e-05,
"loss": 0.7227193713188171,
"step": 549
},
{
"epoch": 0.7137064071370641,
"grad_norm": 0.5604742765426636,
"learning_rate": 1.8143622504879647e-05,
"loss": 0.6893896460533142,
"step": 550
},
{
"epoch": 0.7150040551500405,
"grad_norm": 0.5265613794326782,
"learning_rate": 1.8135315388274075e-05,
"loss": 0.7178789377212524,
"step": 551
},
{
"epoch": 0.716301703163017,
"grad_norm": 0.5819421410560608,
"learning_rate": 1.8126991637007618e-05,
"loss": 0.7809138298034668,
"step": 552
},
{
"epoch": 0.7175993511759935,
"grad_norm": 0.5548515915870667,
"learning_rate": 1.8118651268100235e-05,
"loss": 0.7398655414581299,
"step": 553
},
{
"epoch": 0.71889699918897,
"grad_norm": 0.5281164050102234,
"learning_rate": 1.811029429860588e-05,
"loss": 0.7255332469940186,
"step": 554
},
{
"epoch": 0.7201946472019465,
"grad_norm": 0.51970374584198,
"learning_rate": 1.810192074561243e-05,
"loss": 0.6958039999008179,
"step": 555
},
{
"epoch": 0.721492295214923,
"grad_norm": 0.5574509501457214,
"learning_rate": 1.8093530626241684e-05,
"loss": 0.77367103099823,
"step": 556
},
{
"epoch": 0.7227899432278995,
"grad_norm": 0.5539534687995911,
"learning_rate": 1.8085123957649315e-05,
"loss": 0.7615116834640503,
"step": 557
},
{
"epoch": 0.724087591240876,
"grad_norm": 0.549517035484314,
"learning_rate": 1.8076700757024833e-05,
"loss": 0.777897834777832,
"step": 558
},
{
"epoch": 0.7253852392538523,
"grad_norm": 0.5480270981788635,
"learning_rate": 1.8068261041591548e-05,
"loss": 0.7139554619789124,
"step": 559
},
{
"epoch": 0.7266828872668288,
"grad_norm": 0.5337988138198853,
"learning_rate": 1.8059804828606545e-05,
"loss": 0.7470839023590088,
"step": 560
},
{
"epoch": 0.7279805352798053,
"grad_norm": 0.5055403709411621,
"learning_rate": 1.8051332135360637e-05,
"loss": 0.6575566530227661,
"step": 561
},
{
"epoch": 0.7292781832927818,
"grad_norm": 0.5452354550361633,
"learning_rate": 1.8042842979178338e-05,
"loss": 0.7080937623977661,
"step": 562
},
{
"epoch": 0.7305758313057583,
"grad_norm": 0.5276215672492981,
"learning_rate": 1.8034337377417826e-05,
"loss": 0.6609282493591309,
"step": 563
},
{
"epoch": 0.7318734793187348,
"grad_norm": 0.5823485851287842,
"learning_rate": 1.80258153474709e-05,
"loss": 0.7274823784828186,
"step": 564
},
{
"epoch": 0.7331711273317113,
"grad_norm": 0.5385794043540955,
"learning_rate": 1.8017276906762955e-05,
"loss": 0.6209210157394409,
"step": 565
},
{
"epoch": 0.7344687753446878,
"grad_norm": 0.6051076054573059,
"learning_rate": 1.8008722072752943e-05,
"loss": 0.7948423624038696,
"step": 566
},
{
"epoch": 0.7357664233576642,
"grad_norm": 0.8337801098823547,
"learning_rate": 1.8000150862933335e-05,
"loss": 0.7299556732177734,
"step": 567
},
{
"epoch": 0.7370640713706407,
"grad_norm": 0.5429887771606445,
"learning_rate": 1.7991563294830083e-05,
"loss": 0.686081051826477,
"step": 568
},
{
"epoch": 0.7383617193836172,
"grad_norm": 0.5419583916664124,
"learning_rate": 1.7982959386002592e-05,
"loss": 0.7415616512298584,
"step": 569
},
{
"epoch": 0.7396593673965937,
"grad_norm": 0.5454174280166626,
"learning_rate": 1.7974339154043677e-05,
"loss": 0.7275187969207764,
"step": 570
},
{
"epoch": 0.7409570154095702,
"grad_norm": 0.5611673593521118,
"learning_rate": 1.796570261657953e-05,
"loss": 0.7872575521469116,
"step": 571
},
{
"epoch": 0.7422546634225466,
"grad_norm": 0.5598644018173218,
"learning_rate": 1.7957049791269684e-05,
"loss": 0.7327409982681274,
"step": 572
},
{
"epoch": 0.7435523114355231,
"grad_norm": 0.558341920375824,
"learning_rate": 1.7948380695806983e-05,
"loss": 0.711640477180481,
"step": 573
},
{
"epoch": 0.7448499594484996,
"grad_norm": 0.5189648270606995,
"learning_rate": 1.793969534791752e-05,
"loss": 0.6593164801597595,
"step": 574
},
{
"epoch": 0.7461476074614761,
"grad_norm": 0.5739206671714783,
"learning_rate": 1.7930993765360644e-05,
"loss": 0.775146484375,
"step": 575
},
{
"epoch": 0.7474452554744525,
"grad_norm": 0.5306016802787781,
"learning_rate": 1.792227596592889e-05,
"loss": 0.6946839094161987,
"step": 576
},
{
"epoch": 0.748742903487429,
"grad_norm": 0.5487167835235596,
"learning_rate": 1.791354196744794e-05,
"loss": 0.7318082451820374,
"step": 577
},
{
"epoch": 0.7500405515004055,
"grad_norm": 0.5554513931274414,
"learning_rate": 1.790479178777662e-05,
"loss": 0.727341890335083,
"step": 578
},
{
"epoch": 0.751338199513382,
"grad_norm": 0.5512000918388367,
"learning_rate": 1.7896025444806834e-05,
"loss": 0.7673891186714172,
"step": 579
},
{
"epoch": 0.7526358475263585,
"grad_norm": 0.5614628195762634,
"learning_rate": 1.7887242956463528e-05,
"loss": 0.7410103678703308,
"step": 580
},
{
"epoch": 0.753933495539335,
"grad_norm": 0.5414284467697144,
"learning_rate": 1.7878444340704666e-05,
"loss": 0.7189674377441406,
"step": 581
},
{
"epoch": 0.7552311435523115,
"grad_norm": 0.5145770311355591,
"learning_rate": 1.78696296155212e-05,
"loss": 0.6776304244995117,
"step": 582
},
{
"epoch": 0.756528791565288,
"grad_norm": 0.5401176810264587,
"learning_rate": 1.7860798798937e-05,
"loss": 0.6960833072662354,
"step": 583
},
{
"epoch": 0.7578264395782643,
"grad_norm": 0.5560998916625977,
"learning_rate": 1.7851951909008864e-05,
"loss": 0.6736742258071899,
"step": 584
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.5505719780921936,
"learning_rate": 1.7843088963826437e-05,
"loss": 0.6757134795188904,
"step": 585
},
{
"epoch": 0.7604217356042173,
"grad_norm": 0.5717475414276123,
"learning_rate": 1.783420998151219e-05,
"loss": 0.7612842321395874,
"step": 586
},
{
"epoch": 0.7617193836171938,
"grad_norm": 0.5554843544960022,
"learning_rate": 1.782531498022141e-05,
"loss": 0.705300509929657,
"step": 587
},
{
"epoch": 0.7630170316301703,
"grad_norm": 0.5320503115653992,
"learning_rate": 1.781640397814211e-05,
"loss": 0.7508092522621155,
"step": 588
},
{
"epoch": 0.7643146796431468,
"grad_norm": 0.5554909706115723,
"learning_rate": 1.7807476993495047e-05,
"loss": 0.7732164859771729,
"step": 589
},
{
"epoch": 0.7656123276561233,
"grad_norm": 0.5467298030853271,
"learning_rate": 1.779853404453363e-05,
"loss": 0.7246618270874023,
"step": 590
},
{
"epoch": 0.7669099756690998,
"grad_norm": 0.5365788340568542,
"learning_rate": 1.7789575149543936e-05,
"loss": 0.6982936263084412,
"step": 591
},
{
"epoch": 0.7682076236820763,
"grad_norm": 0.5504671931266785,
"learning_rate": 1.7780600326844638e-05,
"loss": 0.7263147830963135,
"step": 592
},
{
"epoch": 0.7695052716950527,
"grad_norm": 0.549707293510437,
"learning_rate": 1.7771609594786968e-05,
"loss": 0.7235106229782104,
"step": 593
},
{
"epoch": 0.7708029197080292,
"grad_norm": 0.5401800274848938,
"learning_rate": 1.776260297175471e-05,
"loss": 0.7632750272750854,
"step": 594
},
{
"epoch": 0.7721005677210057,
"grad_norm": 0.5245280265808105,
"learning_rate": 1.775358047616412e-05,
"loss": 0.6609013080596924,
"step": 595
},
{
"epoch": 0.7733982157339822,
"grad_norm": 0.5566380023956299,
"learning_rate": 1.774454212646392e-05,
"loss": 0.7397713661193848,
"step": 596
},
{
"epoch": 0.7746958637469586,
"grad_norm": 0.5788303017616272,
"learning_rate": 1.773548794113525e-05,
"loss": 0.6708486676216125,
"step": 597
},
{
"epoch": 0.7759935117599351,
"grad_norm": 0.5494595170021057,
"learning_rate": 1.772641793869162e-05,
"loss": 0.7761523723602295,
"step": 598
},
{
"epoch": 0.7772911597729116,
"grad_norm": 0.5339208245277405,
"learning_rate": 1.7717332137678895e-05,
"loss": 0.6619516611099243,
"step": 599
},
{
"epoch": 0.7785888077858881,
"grad_norm": 0.5362167358398438,
"learning_rate": 1.770823055667524e-05,
"loss": 0.7144718170166016,
"step": 600
},
{
"epoch": 0.7798864557988645,
"grad_norm": 0.5141735076904297,
"learning_rate": 1.7699113214291082e-05,
"loss": 0.6293293237686157,
"step": 601
},
{
"epoch": 0.781184103811841,
"grad_norm": 0.5582875609397888,
"learning_rate": 1.768998012916908e-05,
"loss": 0.7720483541488647,
"step": 602
},
{
"epoch": 0.7824817518248175,
"grad_norm": 0.5367119312286377,
"learning_rate": 1.7680831319984077e-05,
"loss": 0.705078661441803,
"step": 603
},
{
"epoch": 0.783779399837794,
"grad_norm": 0.5382807850837708,
"learning_rate": 1.7671666805443076e-05,
"loss": 0.7088773846626282,
"step": 604
},
{
"epoch": 0.7850770478507705,
"grad_norm": 0.5625648498535156,
"learning_rate": 1.766248660428519e-05,
"loss": 0.7392460703849792,
"step": 605
},
{
"epoch": 0.786374695863747,
"grad_norm": 0.5586503744125366,
"learning_rate": 1.7653290735281605e-05,
"loss": 0.7484114170074463,
"step": 606
},
{
"epoch": 0.7876723438767235,
"grad_norm": 0.5572494864463806,
"learning_rate": 1.7644079217235547e-05,
"loss": 0.7409180402755737,
"step": 607
},
{
"epoch": 0.7889699918897,
"grad_norm": 0.5369569659233093,
"learning_rate": 1.763485206898224e-05,
"loss": 0.6471737027168274,
"step": 608
},
{
"epoch": 0.7902676399026763,
"grad_norm": 0.5504409074783325,
"learning_rate": 1.762560930938886e-05,
"loss": 0.7778940200805664,
"step": 609
},
{
"epoch": 0.7915652879156528,
"grad_norm": 0.5358904600143433,
"learning_rate": 1.7616350957354523e-05,
"loss": 0.694309413433075,
"step": 610
},
{
"epoch": 0.7928629359286293,
"grad_norm": 0.5360654592514038,
"learning_rate": 1.7607077031810204e-05,
"loss": 0.6945086717605591,
"step": 611
},
{
"epoch": 0.7941605839416058,
"grad_norm": 0.535325825214386,
"learning_rate": 1.759778755171874e-05,
"loss": 0.7578423619270325,
"step": 612
},
{
"epoch": 0.7954582319545823,
"grad_norm": 0.5466883182525635,
"learning_rate": 1.758848253607476e-05,
"loss": 0.7157893180847168,
"step": 613
},
{
"epoch": 0.7967558799675588,
"grad_norm": 0.5534203052520752,
"learning_rate": 1.7579162003904678e-05,
"loss": 0.7312074303627014,
"step": 614
},
{
"epoch": 0.7980535279805353,
"grad_norm": 0.5488491654396057,
"learning_rate": 1.756982597426661e-05,
"loss": 0.7318480014801025,
"step": 615
},
{
"epoch": 0.7993511759935118,
"grad_norm": 0.5375532507896423,
"learning_rate": 1.756047446625038e-05,
"loss": 0.7143536806106567,
"step": 616
},
{
"epoch": 0.8006488240064883,
"grad_norm": 0.5791228413581848,
"learning_rate": 1.7551107498977458e-05,
"loss": 0.642976701259613,
"step": 617
},
{
"epoch": 0.8019464720194647,
"grad_norm": 0.5346726179122925,
"learning_rate": 1.7541725091600918e-05,
"loss": 0.687232255935669,
"step": 618
},
{
"epoch": 0.8032441200324412,
"grad_norm": 0.5417895913124084,
"learning_rate": 1.7532327263305405e-05,
"loss": 0.7081488370895386,
"step": 619
},
{
"epoch": 0.8045417680454177,
"grad_norm": 0.5509006381034851,
"learning_rate": 1.75229140333071e-05,
"loss": 0.7728561162948608,
"step": 620
},
{
"epoch": 0.8058394160583942,
"grad_norm": 0.5634705424308777,
"learning_rate": 1.7513485420853683e-05,
"loss": 0.6951034069061279,
"step": 621
},
{
"epoch": 0.8071370640713706,
"grad_norm": 0.5197573900222778,
"learning_rate": 1.750404144522427e-05,
"loss": 0.7106211185455322,
"step": 622
},
{
"epoch": 0.8084347120843471,
"grad_norm": 0.5803437232971191,
"learning_rate": 1.7494582125729408e-05,
"loss": 0.7436937689781189,
"step": 623
},
{
"epoch": 0.8097323600973236,
"grad_norm": 0.541920006275177,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.6682834029197693,
"step": 624
},
{
"epoch": 0.8110300081103001,
"grad_norm": 0.561758279800415,
"learning_rate": 1.7475617532542325e-05,
"loss": 0.6873137950897217,
"step": 625
},
{
"epoch": 0.8123276561232765,
"grad_norm": 0.5416638255119324,
"learning_rate": 1.7466112297627894e-05,
"loss": 0.7167541980743408,
"step": 626
},
{
"epoch": 0.813625304136253,
"grad_norm": 0.5338025093078613,
"learning_rate": 1.7456591796403525e-05,
"loss": 0.7321476340293884,
"step": 627
},
{
"epoch": 0.8149229521492295,
"grad_norm": 0.5378256440162659,
"learning_rate": 1.744705604833622e-05,
"loss": 0.6663627624511719,
"step": 628
},
{
"epoch": 0.816220600162206,
"grad_norm": 0.581386387348175,
"learning_rate": 1.7437505072924177e-05,
"loss": 0.755516767501831,
"step": 629
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.581896185874939,
"learning_rate": 1.742793888969673e-05,
"loss": 0.7974879145622253,
"step": 630
},
{
"epoch": 0.818815896188159,
"grad_norm": 0.521468460559845,
"learning_rate": 1.741835751821429e-05,
"loss": 0.7400495409965515,
"step": 631
},
{
"epoch": 0.8201135442011355,
"grad_norm": 0.5232843160629272,
"learning_rate": 1.7408760978068343e-05,
"loss": 0.6786386966705322,
"step": 632
},
{
"epoch": 0.821411192214112,
"grad_norm": 0.5813708901405334,
"learning_rate": 1.739914928888139e-05,
"loss": 0.7453535199165344,
"step": 633
},
{
"epoch": 0.8227088402270885,
"grad_norm": 0.5424124002456665,
"learning_rate": 1.7389522470306892e-05,
"loss": 0.7520110607147217,
"step": 634
},
{
"epoch": 0.8240064882400648,
"grad_norm": 0.5089052319526672,
"learning_rate": 1.7379880542029263e-05,
"loss": 0.7197295427322388,
"step": 635
},
{
"epoch": 0.8253041362530413,
"grad_norm": 0.5367469191551208,
"learning_rate": 1.7370223523763804e-05,
"loss": 0.7498934864997864,
"step": 636
},
{
"epoch": 0.8266017842660178,
"grad_norm": 0.5291455388069153,
"learning_rate": 1.7360551435256673e-05,
"loss": 0.7376183867454529,
"step": 637
},
{
"epoch": 0.8278994322789943,
"grad_norm": 0.5446896553039551,
"learning_rate": 1.7350864296284846e-05,
"loss": 0.735445499420166,
"step": 638
},
{
"epoch": 0.8291970802919708,
"grad_norm": 0.5124339461326599,
"learning_rate": 1.7341162126656063e-05,
"loss": 0.6861530542373657,
"step": 639
},
{
"epoch": 0.8304947283049473,
"grad_norm": 0.5077775120735168,
"learning_rate": 1.7331444946208815e-05,
"loss": 0.688785195350647,
"step": 640
},
{
"epoch": 0.8317923763179238,
"grad_norm": 0.5058798789978027,
"learning_rate": 1.732171277481227e-05,
"loss": 0.7133075594902039,
"step": 641
},
{
"epoch": 0.8330900243309003,
"grad_norm": 0.5404756665229797,
"learning_rate": 1.7311965632366254e-05,
"loss": 0.7240495681762695,
"step": 642
},
{
"epoch": 0.8343876723438767,
"grad_norm": 0.5313534736633301,
"learning_rate": 1.7302203538801212e-05,
"loss": 0.71756911277771,
"step": 643
},
{
"epoch": 0.8356853203568532,
"grad_norm": 0.5360015630722046,
"learning_rate": 1.729242651407815e-05,
"loss": 0.7652734518051147,
"step": 644
},
{
"epoch": 0.8369829683698297,
"grad_norm": 0.540046751499176,
"learning_rate": 1.7282634578188612e-05,
"loss": 0.7294871807098389,
"step": 645
},
{
"epoch": 0.8382806163828062,
"grad_norm": 0.5653432607650757,
"learning_rate": 1.7272827751154627e-05,
"loss": 0.7391757965087891,
"step": 646
},
{
"epoch": 0.8395782643957826,
"grad_norm": 0.5427312850952148,
"learning_rate": 1.7263006053028674e-05,
"loss": 0.6798534393310547,
"step": 647
},
{
"epoch": 0.8408759124087591,
"grad_norm": 0.539861261844635,
"learning_rate": 1.7253169503893637e-05,
"loss": 0.7292792201042175,
"step": 648
},
{
"epoch": 0.8421735604217356,
"grad_norm": 0.5300166010856628,
"learning_rate": 1.7243318123862777e-05,
"loss": 0.7026904821395874,
"step": 649
},
{
"epoch": 0.8434712084347121,
"grad_norm": 0.5242528319358826,
"learning_rate": 1.7233451933079663e-05,
"loss": 0.6926451921463013,
"step": 650
},
{
"epoch": 0.8447688564476885,
"grad_norm": 0.5352111458778381,
"learning_rate": 1.7223570951718166e-05,
"loss": 0.7006164789199829,
"step": 651
},
{
"epoch": 0.846066504460665,
"grad_norm": 0.5747525095939636,
"learning_rate": 1.7213675199982388e-05,
"loss": 0.7685414552688599,
"step": 652
},
{
"epoch": 0.8473641524736415,
"grad_norm": 0.5309545397758484,
"learning_rate": 1.7203764698106636e-05,
"loss": 0.7312856912612915,
"step": 653
},
{
"epoch": 0.848661800486618,
"grad_norm": 0.5124905705451965,
"learning_rate": 1.7193839466355383e-05,
"loss": 0.6484863758087158,
"step": 654
},
{
"epoch": 0.8499594484995945,
"grad_norm": 0.5323530435562134,
"learning_rate": 1.7183899525023212e-05,
"loss": 0.694681704044342,
"step": 655
},
{
"epoch": 0.851257096512571,
"grad_norm": 0.5242999792098999,
"learning_rate": 1.7173944894434783e-05,
"loss": 0.6672481298446655,
"step": 656
},
{
"epoch": 0.8525547445255475,
"grad_norm": 0.5519501566886902,
"learning_rate": 1.7163975594944807e-05,
"loss": 0.7557801604270935,
"step": 657
},
{
"epoch": 0.853852392538524,
"grad_norm": 0.5345069169998169,
"learning_rate": 1.715399164693797e-05,
"loss": 0.7127410173416138,
"step": 658
},
{
"epoch": 0.8551500405515005,
"grad_norm": 0.5087319016456604,
"learning_rate": 1.7143993070828913e-05,
"loss": 0.6801098585128784,
"step": 659
},
{
"epoch": 0.8564476885644768,
"grad_norm": 0.546444833278656,
"learning_rate": 1.713397988706221e-05,
"loss": 0.7135753631591797,
"step": 660
},
{
"epoch": 0.8577453365774533,
"grad_norm": 0.5438613891601562,
"learning_rate": 1.7123952116112275e-05,
"loss": 0.7199326753616333,
"step": 661
},
{
"epoch": 0.8590429845904298,
"grad_norm": 0.5320620536804199,
"learning_rate": 1.7113909778483364e-05,
"loss": 0.7263282537460327,
"step": 662
},
{
"epoch": 0.8603406326034063,
"grad_norm": 0.5496207475662231,
"learning_rate": 1.7103852894709517e-05,
"loss": 0.6767710447311401,
"step": 663
},
{
"epoch": 0.8616382806163828,
"grad_norm": 0.5515886545181274,
"learning_rate": 1.7093781485354517e-05,
"loss": 0.666580319404602,
"step": 664
},
{
"epoch": 0.8629359286293593,
"grad_norm": 0.5425974130630493,
"learning_rate": 1.7083695571011842e-05,
"loss": 0.7289122343063354,
"step": 665
},
{
"epoch": 0.8642335766423358,
"grad_norm": 0.5263716578483582,
"learning_rate": 1.707359517230464e-05,
"loss": 0.6910987496376038,
"step": 666
},
{
"epoch": 0.8655312246553123,
"grad_norm": 0.525571346282959,
"learning_rate": 1.7063480309885668e-05,
"loss": 0.6733009815216064,
"step": 667
},
{
"epoch": 0.8668288726682887,
"grad_norm": 0.5529440641403198,
"learning_rate": 1.7053351004437258e-05,
"loss": 0.6993213295936584,
"step": 668
},
{
"epoch": 0.8681265206812652,
"grad_norm": 0.5263779163360596,
"learning_rate": 1.7043207276671276e-05,
"loss": 0.7125247120857239,
"step": 669
},
{
"epoch": 0.8694241686942417,
"grad_norm": 0.5178059935569763,
"learning_rate": 1.7033049147329077e-05,
"loss": 0.7389542460441589,
"step": 670
},
{
"epoch": 0.8707218167072182,
"grad_norm": 0.5027527809143066,
"learning_rate": 1.702287663718147e-05,
"loss": 0.6378510594367981,
"step": 671
},
{
"epoch": 0.8720194647201946,
"grad_norm": 0.5320873260498047,
"learning_rate": 1.7012689767028656e-05,
"loss": 0.6820501089096069,
"step": 672
},
{
"epoch": 0.8733171127331711,
"grad_norm": 0.5544079542160034,
"learning_rate": 1.700248855770021e-05,
"loss": 0.7887839078903198,
"step": 673
},
{
"epoch": 0.8746147607461476,
"grad_norm": 0.5328344702720642,
"learning_rate": 1.6992273030055022e-05,
"loss": 0.7038314938545227,
"step": 674
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.5509505867958069,
"learning_rate": 1.6982043204981264e-05,
"loss": 0.7049298286437988,
"step": 675
},
{
"epoch": 0.8772100567721006,
"grad_norm": 0.5168129205703735,
"learning_rate": 1.6971799103396332e-05,
"loss": 0.6959193348884583,
"step": 676
},
{
"epoch": 0.878507704785077,
"grad_norm": 0.5376099944114685,
"learning_rate": 1.696154074624683e-05,
"loss": 0.7292076349258423,
"step": 677
},
{
"epoch": 0.8798053527980535,
"grad_norm": 0.5142057538032532,
"learning_rate": 1.6951268154508497e-05,
"loss": 0.7193281650543213,
"step": 678
},
{
"epoch": 0.88110300081103,
"grad_norm": 0.5402371287345886,
"learning_rate": 1.6940981349186182e-05,
"loss": 0.748397946357727,
"step": 679
},
{
"epoch": 0.8824006488240065,
"grad_norm": 0.5436865091323853,
"learning_rate": 1.69306803513138e-05,
"loss": 0.7238379716873169,
"step": 680
},
{
"epoch": 0.883698296836983,
"grad_norm": 0.5323321223258972,
"learning_rate": 1.6920365181954284e-05,
"loss": 0.7368711829185486,
"step": 681
},
{
"epoch": 0.8849959448499595,
"grad_norm": 0.5474384427070618,
"learning_rate": 1.6910035862199545e-05,
"loss": 0.7030202746391296,
"step": 682
},
{
"epoch": 0.886293592862936,
"grad_norm": 0.5428197979927063,
"learning_rate": 1.6899692413170422e-05,
"loss": 0.713437557220459,
"step": 683
},
{
"epoch": 0.8875912408759125,
"grad_norm": 0.5502634048461914,
"learning_rate": 1.688933485601666e-05,
"loss": 0.7090182304382324,
"step": 684
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5356465578079224,
"learning_rate": 1.6878963211916833e-05,
"loss": 0.7201128005981445,
"step": 685
},
{
"epoch": 0.8901865369018653,
"grad_norm": 0.5563944578170776,
"learning_rate": 1.6868577502078336e-05,
"loss": 0.7264722585678101,
"step": 686
},
{
"epoch": 0.8914841849148418,
"grad_norm": 0.5522723197937012,
"learning_rate": 1.6858177747737312e-05,
"loss": 0.7600725889205933,
"step": 687
},
{
"epoch": 0.8927818329278183,
"grad_norm": 0.49715539813041687,
"learning_rate": 1.684776397015863e-05,
"loss": 0.6456987857818604,
"step": 688
},
{
"epoch": 0.8940794809407948,
"grad_norm": 0.5162433981895447,
"learning_rate": 1.6837336190635824e-05,
"loss": 0.6648015379905701,
"step": 689
},
{
"epoch": 0.8953771289537713,
"grad_norm": 0.5113485455513,
"learning_rate": 1.682689443049107e-05,
"loss": 0.7002501487731934,
"step": 690
},
{
"epoch": 0.8953771289537713,
"eval_loss": 0.6879991292953491,
"eval_runtime": 72.6036,
"eval_samples_per_second": 71.512,
"eval_steps_per_second": 8.939,
"step": 690
},
{
"epoch": 0.8966747769667478,
"grad_norm": 0.5129652619361877,
"learning_rate": 1.6816438711075114e-05,
"loss": 0.7118932008743286,
"step": 691
},
{
"epoch": 0.8979724249797243,
"grad_norm": 0.5204065442085266,
"learning_rate": 1.680596905376727e-05,
"loss": 0.7194908857345581,
"step": 692
},
{
"epoch": 0.8992700729927007,
"grad_norm": 0.5264798402786255,
"learning_rate": 1.6795485479975327e-05,
"loss": 0.6868776082992554,
"step": 693
},
{
"epoch": 0.9005677210056772,
"grad_norm": 0.5244487524032593,
"learning_rate": 1.6784988011135546e-05,
"loss": 0.7106890678405762,
"step": 694
},
{
"epoch": 0.9018653690186537,
"grad_norm": 0.5397396683692932,
"learning_rate": 1.6774476668712587e-05,
"loss": 0.695647656917572,
"step": 695
},
{
"epoch": 0.9031630170316302,
"grad_norm": 0.5147722959518433,
"learning_rate": 1.676395147419949e-05,
"loss": 0.7283300161361694,
"step": 696
},
{
"epoch": 0.9044606650446066,
"grad_norm": 0.5326966047286987,
"learning_rate": 1.6753412449117615e-05,
"loss": 0.7349389791488647,
"step": 697
},
{
"epoch": 0.9057583130575831,
"grad_norm": 0.522964596748352,
"learning_rate": 1.67428596150166e-05,
"loss": 0.7657152414321899,
"step": 698
},
{
"epoch": 0.9070559610705596,
"grad_norm": 0.5306779742240906,
"learning_rate": 1.6732292993474316e-05,
"loss": 0.6991469264030457,
"step": 699
},
{
"epoch": 0.9083536090835361,
"grad_norm": 0.517011284828186,
"learning_rate": 1.6721712606096833e-05,
"loss": 0.6861897706985474,
"step": 700
},
{
"epoch": 0.9096512570965126,
"grad_norm": 0.5209232568740845,
"learning_rate": 1.6711118474518363e-05,
"loss": 0.6535213589668274,
"step": 701
},
{
"epoch": 0.910948905109489,
"grad_norm": 0.538005530834198,
"learning_rate": 1.6700510620401223e-05,
"loss": 0.6827917695045471,
"step": 702
},
{
"epoch": 0.9122465531224655,
"grad_norm": 0.5532050132751465,
"learning_rate": 1.6689889065435796e-05,
"loss": 0.7328672409057617,
"step": 703
},
{
"epoch": 0.913544201135442,
"grad_norm": 0.5541777014732361,
"learning_rate": 1.667925383134047e-05,
"loss": 0.639081597328186,
"step": 704
},
{
"epoch": 0.9148418491484185,
"grad_norm": 0.5441383719444275,
"learning_rate": 1.66686049398616e-05,
"loss": 0.7073994874954224,
"step": 705
},
{
"epoch": 0.916139497161395,
"grad_norm": 0.5432547330856323,
"learning_rate": 1.6657942412773484e-05,
"loss": 0.7249147295951843,
"step": 706
},
{
"epoch": 0.9174371451743715,
"grad_norm": 0.5718936324119568,
"learning_rate": 1.664726627187829e-05,
"loss": 0.7475080490112305,
"step": 707
},
{
"epoch": 0.918734793187348,
"grad_norm": 0.5303789377212524,
"learning_rate": 1.6636576539006015e-05,
"loss": 0.7102556228637695,
"step": 708
},
{
"epoch": 0.9200324412003245,
"grad_norm": 0.5120844841003418,
"learning_rate": 1.6625873236014464e-05,
"loss": 0.7160992622375488,
"step": 709
},
{
"epoch": 0.9213300892133008,
"grad_norm": 0.5382957458496094,
"learning_rate": 1.6615156384789185e-05,
"loss": 0.6958597898483276,
"step": 710
},
{
"epoch": 0.9226277372262773,
"grad_norm": 0.5195145606994629,
"learning_rate": 1.660442600724342e-05,
"loss": 0.6958160400390625,
"step": 711
},
{
"epoch": 0.9239253852392538,
"grad_norm": 0.5473058223724365,
"learning_rate": 1.659368212531808e-05,
"loss": 0.7220757007598877,
"step": 712
},
{
"epoch": 0.9252230332522303,
"grad_norm": 0.5131781697273254,
"learning_rate": 1.6582924760981683e-05,
"loss": 0.7035195827484131,
"step": 713
},
{
"epoch": 0.9265206812652068,
"grad_norm": 0.5314381122589111,
"learning_rate": 1.6572153936230316e-05,
"loss": 0.6506175994873047,
"step": 714
},
{
"epoch": 0.9278183292781833,
"grad_norm": 0.565310001373291,
"learning_rate": 1.6561369673087588e-05,
"loss": 0.7714331746101379,
"step": 715
},
{
"epoch": 0.9291159772911598,
"grad_norm": 0.530504584312439,
"learning_rate": 1.6550571993604587e-05,
"loss": 0.7331136465072632,
"step": 716
},
{
"epoch": 0.9304136253041363,
"grad_norm": 0.5755041837692261,
"learning_rate": 1.6539760919859838e-05,
"loss": 0.7090123891830444,
"step": 717
},
{
"epoch": 0.9317112733171128,
"grad_norm": 0.5264776349067688,
"learning_rate": 1.6528936473959253e-05,
"loss": 0.7207454442977905,
"step": 718
},
{
"epoch": 0.9330089213300892,
"grad_norm": 0.5459887981414795,
"learning_rate": 1.6518098678036073e-05,
"loss": 0.7477676272392273,
"step": 719
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.5480107069015503,
"learning_rate": 1.650724755425086e-05,
"loss": 0.7585529685020447,
"step": 720
},
{
"epoch": 0.9356042173560422,
"grad_norm": 0.5156884789466858,
"learning_rate": 1.6496383124791406e-05,
"loss": 0.684555172920227,
"step": 721
},
{
"epoch": 0.9369018653690186,
"grad_norm": 0.5162327289581299,
"learning_rate": 1.6485505411872725e-05,
"loss": 0.7163575887680054,
"step": 722
},
{
"epoch": 0.9381995133819951,
"grad_norm": 0.5424114465713501,
"learning_rate": 1.6474614437736986e-05,
"loss": 0.722049355506897,
"step": 723
},
{
"epoch": 0.9394971613949716,
"grad_norm": 0.546845555305481,
"learning_rate": 1.6463710224653477e-05,
"loss": 0.7012547850608826,
"step": 724
},
{
"epoch": 0.9407948094079481,
"grad_norm": 0.5183011889457703,
"learning_rate": 1.6452792794918545e-05,
"loss": 0.7152835130691528,
"step": 725
},
{
"epoch": 0.9420924574209246,
"grad_norm": 0.5085439682006836,
"learning_rate": 1.644186217085558e-05,
"loss": 0.7061685919761658,
"step": 726
},
{
"epoch": 0.943390105433901,
"grad_norm": 0.5237677097320557,
"learning_rate": 1.6430918374814937e-05,
"loss": 0.7506479024887085,
"step": 727
},
{
"epoch": 0.9446877534468775,
"grad_norm": 0.5498985052108765,
"learning_rate": 1.641996142917391e-05,
"loss": 0.7604420185089111,
"step": 728
},
{
"epoch": 0.945985401459854,
"grad_norm": 0.506365180015564,
"learning_rate": 1.640899135633668e-05,
"loss": 0.7282454967498779,
"step": 729
},
{
"epoch": 0.9472830494728305,
"grad_norm": 0.5272793769836426,
"learning_rate": 1.6398008178734272e-05,
"loss": 0.7712985277175903,
"step": 730
},
{
"epoch": 0.948580697485807,
"grad_norm": 0.49885818362236023,
"learning_rate": 1.6387011918824493e-05,
"loss": 0.6967482566833496,
"step": 731
},
{
"epoch": 0.9498783454987835,
"grad_norm": 0.5086526274681091,
"learning_rate": 1.6376002599091925e-05,
"loss": 0.7118892073631287,
"step": 732
},
{
"epoch": 0.95117599351176,
"grad_norm": 0.5380651354789734,
"learning_rate": 1.6364980242047835e-05,
"loss": 0.7118611335754395,
"step": 733
},
{
"epoch": 0.9524736415247365,
"grad_norm": 0.5358894467353821,
"learning_rate": 1.635394487023015e-05,
"loss": 0.73922199010849,
"step": 734
},
{
"epoch": 0.9537712895377128,
"grad_norm": 0.518375813961029,
"learning_rate": 1.634289650620342e-05,
"loss": 0.7491021156311035,
"step": 735
},
{
"epoch": 0.9550689375506893,
"grad_norm": 0.5029126405715942,
"learning_rate": 1.633183517255875e-05,
"loss": 0.6724518537521362,
"step": 736
},
{
"epoch": 0.9563665855636658,
"grad_norm": 0.5309873819351196,
"learning_rate": 1.632076089191376e-05,
"loss": 0.7152642011642456,
"step": 737
},
{
"epoch": 0.9576642335766423,
"grad_norm": 0.5265018343925476,
"learning_rate": 1.630967368691256e-05,
"loss": 0.7223344445228577,
"step": 738
},
{
"epoch": 0.9589618815896188,
"grad_norm": 0.5360968112945557,
"learning_rate": 1.6298573580225676e-05,
"loss": 0.6773437261581421,
"step": 739
},
{
"epoch": 0.9602595296025953,
"grad_norm": 0.532696545124054,
"learning_rate": 1.6287460594550017e-05,
"loss": 0.6913273930549622,
"step": 740
},
{
"epoch": 0.9615571776155718,
"grad_norm": 0.5159463286399841,
"learning_rate": 1.6276334752608823e-05,
"loss": 0.7023458480834961,
"step": 741
},
{
"epoch": 0.9628548256285483,
"grad_norm": 0.5166627764701843,
"learning_rate": 1.6265196077151627e-05,
"loss": 0.6580889821052551,
"step": 742
},
{
"epoch": 0.9641524736415248,
"grad_norm": 0.5432324409484863,
"learning_rate": 1.62540445909542e-05,
"loss": 0.7707301378250122,
"step": 743
},
{
"epoch": 0.9654501216545012,
"grad_norm": 0.5537624955177307,
"learning_rate": 1.624288031681851e-05,
"loss": 0.718231737613678,
"step": 744
},
{
"epoch": 0.9667477696674777,
"grad_norm": 0.5601441860198975,
"learning_rate": 1.623170327757267e-05,
"loss": 0.7587568759918213,
"step": 745
},
{
"epoch": 0.9680454176804542,
"grad_norm": 0.5228809118270874,
"learning_rate": 1.62205134960709e-05,
"loss": 0.7063294649124146,
"step": 746
},
{
"epoch": 0.9693430656934306,
"grad_norm": 0.5264230370521545,
"learning_rate": 1.620931099519347e-05,
"loss": 0.7381964921951294,
"step": 747
},
{
"epoch": 0.9706407137064071,
"grad_norm": 0.5306467413902283,
"learning_rate": 1.619809579784665e-05,
"loss": 0.6895403861999512,
"step": 748
},
{
"epoch": 0.9719383617193836,
"grad_norm": 0.5162505507469177,
"learning_rate": 1.6186867926962695e-05,
"loss": 0.7042033672332764,
"step": 749
},
{
"epoch": 0.9732360097323601,
"grad_norm": 0.51023268699646,
"learning_rate": 1.6175627405499746e-05,
"loss": 0.7028312683105469,
"step": 750
},
{
"epoch": 0.9745336577453366,
"grad_norm": 0.5226272344589233,
"learning_rate": 1.6164374256441837e-05,
"loss": 0.7110305428504944,
"step": 751
},
{
"epoch": 0.975831305758313,
"grad_norm": 0.5189753174781799,
"learning_rate": 1.6153108502798796e-05,
"loss": 0.7227635979652405,
"step": 752
},
{
"epoch": 0.9771289537712895,
"grad_norm": 0.5253064036369324,
"learning_rate": 1.614183016760625e-05,
"loss": 0.708706259727478,
"step": 753
},
{
"epoch": 0.978426601784266,
"grad_norm": 0.5069226622581482,
"learning_rate": 1.613053927392553e-05,
"loss": 0.7607108354568481,
"step": 754
},
{
"epoch": 0.9797242497972425,
"grad_norm": 0.5430122017860413,
"learning_rate": 1.6119235844843664e-05,
"loss": 0.6882092356681824,
"step": 755
},
{
"epoch": 0.981021897810219,
"grad_norm": 0.5484969615936279,
"learning_rate": 1.6107919903473294e-05,
"loss": 0.6984055638313293,
"step": 756
},
{
"epoch": 0.9823195458231955,
"grad_norm": 0.5450364351272583,
"learning_rate": 1.6096591472952664e-05,
"loss": 0.7414028644561768,
"step": 757
},
{
"epoch": 0.983617193836172,
"grad_norm": 0.5095598101615906,
"learning_rate": 1.6085250576445548e-05,
"loss": 0.6796683073043823,
"step": 758
},
{
"epoch": 0.9849148418491485,
"grad_norm": 0.5161803364753723,
"learning_rate": 1.6073897237141203e-05,
"loss": 0.6673390865325928,
"step": 759
},
{
"epoch": 0.986212489862125,
"grad_norm": 0.5004435777664185,
"learning_rate": 1.6062531478254333e-05,
"loss": 0.6315610408782959,
"step": 760
},
{
"epoch": 0.9875101378751013,
"grad_norm": 0.5166559219360352,
"learning_rate": 1.605115332302505e-05,
"loss": 0.6672409176826477,
"step": 761
},
{
"epoch": 0.9888077858880778,
"grad_norm": 0.5332128405570984,
"learning_rate": 1.603976279471879e-05,
"loss": 0.7169513702392578,
"step": 762
},
{
"epoch": 0.9901054339010543,
"grad_norm": 0.5556347370147705,
"learning_rate": 1.6028359916626308e-05,
"loss": 0.708602786064148,
"step": 763
},
{
"epoch": 0.9914030819140308,
"grad_norm": 0.5154053568840027,
"learning_rate": 1.601694471206359e-05,
"loss": 0.6270056366920471,
"step": 764
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.5185645222663879,
"learning_rate": 1.600551720437186e-05,
"loss": 0.6873992085456848,
"step": 765
},
{
"epoch": 0.9939983779399838,
"grad_norm": 0.546991229057312,
"learning_rate": 1.599407741691746e-05,
"loss": 0.7366882562637329,
"step": 766
},
{
"epoch": 0.9952960259529603,
"grad_norm": 0.5219473838806152,
"learning_rate": 1.5982625373091877e-05,
"loss": 0.6808854937553406,
"step": 767
},
{
"epoch": 0.9965936739659368,
"grad_norm": 0.5348212122917175,
"learning_rate": 1.5971161096311628e-05,
"loss": 0.7217116355895996,
"step": 768
},
{
"epoch": 0.9978913219789132,
"grad_norm": 0.5152093172073364,
"learning_rate": 1.5959684610018267e-05,
"loss": 0.6545735597610474,
"step": 769
},
{
"epoch": 0.9991889699918897,
"grad_norm": 0.5182209610939026,
"learning_rate": 1.5948195937678297e-05,
"loss": 0.6775786280632019,
"step": 770
},
{
"epoch": 1.0,
"grad_norm": 0.634954571723938,
"learning_rate": 1.5936695102783148e-05,
"loss": 0.6640980839729309,
"step": 771
},
{
"epoch": 1.0012976480129765,
"grad_norm": 0.7553068399429321,
"learning_rate": 1.5925182128849116e-05,
"loss": 0.6133830547332764,
"step": 772
},
{
"epoch": 1.002595296025953,
"grad_norm": 0.6613984704017639,
"learning_rate": 1.591365703941732e-05,
"loss": 0.5815013647079468,
"step": 773
},
{
"epoch": 1.0038929440389295,
"grad_norm": 0.592282235622406,
"learning_rate": 1.5902119858053652e-05,
"loss": 0.5898460149765015,
"step": 774
},
{
"epoch": 1.005190592051906,
"grad_norm": 0.5373958945274353,
"learning_rate": 1.589057060834872e-05,
"loss": 0.6019303798675537,
"step": 775
},
{
"epoch": 1.0064882400648825,
"grad_norm": 0.6260755062103271,
"learning_rate": 1.5879009313917826e-05,
"loss": 0.5970971584320068,
"step": 776
},
{
"epoch": 1.007785888077859,
"grad_norm": 0.7529841661453247,
"learning_rate": 1.5867435998400885e-05,
"loss": 0.6816403865814209,
"step": 777
},
{
"epoch": 1.0090835360908355,
"grad_norm": 0.7224608659744263,
"learning_rate": 1.5855850685462404e-05,
"loss": 0.6263958215713501,
"step": 778
},
{
"epoch": 1.010381184103812,
"grad_norm": 0.6676880121231079,
"learning_rate": 1.584425339879141e-05,
"loss": 0.6304363012313843,
"step": 779
},
{
"epoch": 1.0116788321167882,
"grad_norm": 0.5799426436424255,
"learning_rate": 1.5832644162101417e-05,
"loss": 0.59343421459198,
"step": 780
},
{
"epoch": 1.0129764801297647,
"grad_norm": 0.570095956325531,
"learning_rate": 1.5821022999130385e-05,
"loss": 0.5410763025283813,
"step": 781
},
{
"epoch": 1.0142741281427412,
"grad_norm": 0.5948435068130493,
"learning_rate": 1.580938993364064e-05,
"loss": 0.5649259686470032,
"step": 782
},
{
"epoch": 1.0155717761557177,
"grad_norm": 0.6467446684837341,
"learning_rate": 1.579774498941886e-05,
"loss": 0.5860875844955444,
"step": 783
},
{
"epoch": 1.0168694241686942,
"grad_norm": 0.5886529088020325,
"learning_rate": 1.578608819027602e-05,
"loss": 0.5772626996040344,
"step": 784
},
{
"epoch": 1.0181670721816707,
"grad_norm": 0.5842233896255493,
"learning_rate": 1.5774419560047303e-05,
"loss": 0.6277778148651123,
"step": 785
},
{
"epoch": 1.0194647201946472,
"grad_norm": 0.590059220790863,
"learning_rate": 1.5762739122592123e-05,
"loss": 0.6396061182022095,
"step": 786
},
{
"epoch": 1.0207623682076237,
"grad_norm": 0.5897361636161804,
"learning_rate": 1.5751046901794008e-05,
"loss": 0.5980340242385864,
"step": 787
},
{
"epoch": 1.0220600162206002,
"grad_norm": 0.5984208583831787,
"learning_rate": 1.5739342921560593e-05,
"loss": 0.602581262588501,
"step": 788
},
{
"epoch": 1.0233576642335767,
"grad_norm": 0.5837097764015198,
"learning_rate": 1.5727627205823554e-05,
"loss": 0.5742583274841309,
"step": 789
},
{
"epoch": 1.0246553122465532,
"grad_norm": 0.5804028511047363,
"learning_rate": 1.571589977853857e-05,
"loss": 0.6103036999702454,
"step": 790
},
{
"epoch": 1.0259529602595296,
"grad_norm": 0.5784346461296082,
"learning_rate": 1.5704160663685254e-05,
"loss": 0.5436456203460693,
"step": 791
},
{
"epoch": 1.0272506082725061,
"grad_norm": 0.576518714427948,
"learning_rate": 1.5692409885267127e-05,
"loss": 0.6918940544128418,
"step": 792
},
{
"epoch": 1.0285482562854826,
"grad_norm": 0.5824302434921265,
"learning_rate": 1.568064746731156e-05,
"loss": 0.6090575456619263,
"step": 793
},
{
"epoch": 1.0298459042984591,
"grad_norm": 0.5279770493507385,
"learning_rate": 1.5668873433869718e-05,
"loss": 0.5268336534500122,
"step": 794
},
{
"epoch": 1.0311435523114356,
"grad_norm": 0.5494199395179749,
"learning_rate": 1.5657087809016517e-05,
"loss": 0.5766473412513733,
"step": 795
},
{
"epoch": 1.0324412003244121,
"grad_norm": 0.5452569723129272,
"learning_rate": 1.564529061685058e-05,
"loss": 0.5949534177780151,
"step": 796
},
{
"epoch": 1.0337388483373884,
"grad_norm": 0.5392066240310669,
"learning_rate": 1.5633481881494178e-05,
"loss": 0.5571380853652954,
"step": 797
},
{
"epoch": 1.0350364963503649,
"grad_norm": 0.5568217635154724,
"learning_rate": 1.562166162709319e-05,
"loss": 0.5642133951187134,
"step": 798
},
{
"epoch": 1.0363341443633414,
"grad_norm": 0.5702704191207886,
"learning_rate": 1.560982987781704e-05,
"loss": 0.6047669649124146,
"step": 799
},
{
"epoch": 1.0376317923763179,
"grad_norm": 0.532315731048584,
"learning_rate": 1.5597986657858656e-05,
"loss": 0.5958635807037354,
"step": 800
},
{
"epoch": 1.0389294403892944,
"grad_norm": 0.5331001877784729,
"learning_rate": 1.5586131991434434e-05,
"loss": 0.5987897515296936,
"step": 801
},
{
"epoch": 1.0402270884022708,
"grad_norm": 0.5481564402580261,
"learning_rate": 1.5574265902784163e-05,
"loss": 0.5622409582138062,
"step": 802
},
{
"epoch": 1.0415247364152473,
"grad_norm": 0.5720167756080627,
"learning_rate": 1.556238841617099e-05,
"loss": 0.6064007878303528,
"step": 803
},
{
"epoch": 1.0428223844282238,
"grad_norm": 0.5809172987937927,
"learning_rate": 1.555049955588137e-05,
"loss": 0.6170299053192139,
"step": 804
},
{
"epoch": 1.0441200324412003,
"grad_norm": 0.5783301591873169,
"learning_rate": 1.5538599346225013e-05,
"loss": 0.568396270275116,
"step": 805
},
{
"epoch": 1.0454176804541768,
"grad_norm": 0.5668922662734985,
"learning_rate": 1.552668781153484e-05,
"loss": 0.576393723487854,
"step": 806
},
{
"epoch": 1.0467153284671533,
"grad_norm": 0.5634539723396301,
"learning_rate": 1.5514764976166916e-05,
"loss": 0.6574882864952087,
"step": 807
},
{
"epoch": 1.0480129764801298,
"grad_norm": 0.5463752150535583,
"learning_rate": 1.5502830864500426e-05,
"loss": 0.5930934548377991,
"step": 808
},
{
"epoch": 1.0493106244931063,
"grad_norm": 0.5872495174407959,
"learning_rate": 1.5490885500937606e-05,
"loss": 0.609790563583374,
"step": 809
},
{
"epoch": 1.0506082725060828,
"grad_norm": 0.5574213266372681,
"learning_rate": 1.5478928909903705e-05,
"loss": 0.60848468542099,
"step": 810
},
{
"epoch": 1.0519059205190593,
"grad_norm": 0.5493984818458557,
"learning_rate": 1.5466961115846927e-05,
"loss": 0.5494011640548706,
"step": 811
},
{
"epoch": 1.0532035685320358,
"grad_norm": 0.5724595785140991,
"learning_rate": 1.545498214323837e-05,
"loss": 0.5948253273963928,
"step": 812
},
{
"epoch": 1.0545012165450123,
"grad_norm": 0.5360091924667358,
"learning_rate": 1.544299201657202e-05,
"loss": 0.6195284128189087,
"step": 813
},
{
"epoch": 1.0557988645579885,
"grad_norm": 0.5609839558601379,
"learning_rate": 1.543099076036463e-05,
"loss": 0.5945447087287903,
"step": 814
},
{
"epoch": 1.057096512570965,
"grad_norm": 0.5413586497306824,
"learning_rate": 1.5418978399155748e-05,
"loss": 0.55891352891922,
"step": 815
},
{
"epoch": 1.0583941605839415,
"grad_norm": 0.5763382315635681,
"learning_rate": 1.54069549575076e-05,
"loss": 0.5900748372077942,
"step": 816
},
{
"epoch": 1.059691808596918,
"grad_norm": 0.5625810623168945,
"learning_rate": 1.539492046000509e-05,
"loss": 0.5834665298461914,
"step": 817
},
{
"epoch": 1.0609894566098945,
"grad_norm": 0.5442895889282227,
"learning_rate": 1.5382874931255717e-05,
"loss": 0.6234191656112671,
"step": 818
},
{
"epoch": 1.062287104622871,
"grad_norm": 0.5448631048202515,
"learning_rate": 1.5370818395889536e-05,
"loss": 0.5617302060127258,
"step": 819
},
{
"epoch": 1.0635847526358475,
"grad_norm": 0.5880674719810486,
"learning_rate": 1.5358750878559113e-05,
"loss": 0.6024942994117737,
"step": 820
},
{
"epoch": 1.064882400648824,
"grad_norm": 0.5762202143669128,
"learning_rate": 1.5346672403939465e-05,
"loss": 0.625447154045105,
"step": 821
},
{
"epoch": 1.0661800486618005,
"grad_norm": 0.5726525187492371,
"learning_rate": 1.5334582996728017e-05,
"loss": 0.6527541875839233,
"step": 822
},
{
"epoch": 1.067477696674777,
"grad_norm": 0.5863476991653442,
"learning_rate": 1.532248268164455e-05,
"loss": 0.6537057161331177,
"step": 823
},
{
"epoch": 1.0687753446877535,
"grad_norm": 0.5855088829994202,
"learning_rate": 1.5310371483431138e-05,
"loss": 0.5910706520080566,
"step": 824
},
{
"epoch": 1.07007299270073,
"grad_norm": 0.5428813695907593,
"learning_rate": 1.529824942685212e-05,
"loss": 0.6206585168838501,
"step": 825
},
{
"epoch": 1.0713706407137065,
"grad_norm": 0.5427327156066895,
"learning_rate": 1.528611653669403e-05,
"loss": 0.6064955592155457,
"step": 826
},
{
"epoch": 1.072668288726683,
"grad_norm": 0.5533806085586548,
"learning_rate": 1.5273972837765566e-05,
"loss": 0.6161221861839294,
"step": 827
},
{
"epoch": 1.0739659367396595,
"grad_norm": 0.5330477356910706,
"learning_rate": 1.526181835489751e-05,
"loss": 0.584095299243927,
"step": 828
},
{
"epoch": 1.075263584752636,
"grad_norm": 0.5572231411933899,
"learning_rate": 1.5249653112942708e-05,
"loss": 0.6146395206451416,
"step": 829
},
{
"epoch": 1.0765612327656124,
"grad_norm": 0.5302649140357971,
"learning_rate": 1.5237477136776e-05,
"loss": 0.5835666060447693,
"step": 830
},
{
"epoch": 1.0778588807785887,
"grad_norm": 0.524252712726593,
"learning_rate": 1.5225290451294173e-05,
"loss": 0.5483739376068115,
"step": 831
},
{
"epoch": 1.0791565287915652,
"grad_norm": 0.5535216331481934,
"learning_rate": 1.521309308141592e-05,
"loss": 0.5715370774269104,
"step": 832
},
{
"epoch": 1.0804541768045417,
"grad_norm": 0.5739737749099731,
"learning_rate": 1.5200885052081767e-05,
"loss": 0.6168693900108337,
"step": 833
},
{
"epoch": 1.0817518248175182,
"grad_norm": 0.5620468258857727,
"learning_rate": 1.518866638825405e-05,
"loss": 0.6358708143234253,
"step": 834
},
{
"epoch": 1.0830494728304947,
"grad_norm": 0.5504558086395264,
"learning_rate": 1.517643711491684e-05,
"loss": 0.5625787973403931,
"step": 835
},
{
"epoch": 1.0843471208434712,
"grad_norm": 0.527152955532074,
"learning_rate": 1.516419725707591e-05,
"loss": 0.5917230248451233,
"step": 836
},
{
"epoch": 1.0856447688564477,
"grad_norm": 0.5097678899765015,
"learning_rate": 1.5151946839758673e-05,
"loss": 0.5631688237190247,
"step": 837
},
{
"epoch": 1.0869424168694242,
"grad_norm": 0.5500524044036865,
"learning_rate": 1.5139685888014123e-05,
"loss": 0.6300808787345886,
"step": 838
},
{
"epoch": 1.0882400648824007,
"grad_norm": 0.580634355545044,
"learning_rate": 1.512741442691281e-05,
"loss": 0.6707481145858765,
"step": 839
},
{
"epoch": 1.0895377128953772,
"grad_norm": 0.5668573379516602,
"learning_rate": 1.5115132481546763e-05,
"loss": 0.5974687337875366,
"step": 840
},
{
"epoch": 1.0908353609083536,
"grad_norm": 0.5720273852348328,
"learning_rate": 1.5102840077029452e-05,
"loss": 0.5461701154708862,
"step": 841
},
{
"epoch": 1.0921330089213301,
"grad_norm": 0.5787645578384399,
"learning_rate": 1.509053723849574e-05,
"loss": 0.6476290225982666,
"step": 842
},
{
"epoch": 1.0934306569343066,
"grad_norm": 0.5475322604179382,
"learning_rate": 1.5078223991101805e-05,
"loss": 0.5730643272399902,
"step": 843
},
{
"epoch": 1.0947283049472831,
"grad_norm": 0.5544430017471313,
"learning_rate": 1.5065900360025128e-05,
"loss": 0.6112351417541504,
"step": 844
},
{
"epoch": 1.0960259529602596,
"grad_norm": 0.6194364428520203,
"learning_rate": 1.5053566370464416e-05,
"loss": 0.612515926361084,
"step": 845
},
{
"epoch": 1.0973236009732361,
"grad_norm": 0.5542813539505005,
"learning_rate": 1.5041222047639558e-05,
"loss": 0.60612952709198,
"step": 846
},
{
"epoch": 1.0986212489862126,
"grad_norm": 0.5259748697280884,
"learning_rate": 1.5028867416791566e-05,
"loss": 0.5666128396987915,
"step": 847
},
{
"epoch": 1.0999188969991889,
"grad_norm": 0.5615611672401428,
"learning_rate": 1.5016502503182533e-05,
"loss": 0.5991164445877075,
"step": 848
},
{
"epoch": 1.1012165450121654,
"grad_norm": 0.5396665334701538,
"learning_rate": 1.5004127332095579e-05,
"loss": 0.608413815498352,
"step": 849
},
{
"epoch": 1.1025141930251419,
"grad_norm": 0.5625605583190918,
"learning_rate": 1.49917419288348e-05,
"loss": 0.6390218138694763,
"step": 850
},
{
"epoch": 1.1038118410381184,
"grad_norm": 0.5652357935905457,
"learning_rate": 1.4979346318725203e-05,
"loss": 0.613496720790863,
"step": 851
},
{
"epoch": 1.1051094890510949,
"grad_norm": 0.5494624376296997,
"learning_rate": 1.4966940527112679e-05,
"loss": 0.6234304308891296,
"step": 852
},
{
"epoch": 1.1064071370640713,
"grad_norm": 0.546302855014801,
"learning_rate": 1.4954524579363932e-05,
"loss": 0.6565023064613342,
"step": 853
},
{
"epoch": 1.1077047850770478,
"grad_norm": 0.5649261474609375,
"learning_rate": 1.4942098500866428e-05,
"loss": 0.6422203183174133,
"step": 854
},
{
"epoch": 1.1090024330900243,
"grad_norm": 0.5499486923217773,
"learning_rate": 1.4929662317028359e-05,
"loss": 0.6043179035186768,
"step": 855
},
{
"epoch": 1.1103000811030008,
"grad_norm": 0.5544485449790955,
"learning_rate": 1.491721605327857e-05,
"loss": 0.5800666213035583,
"step": 856
},
{
"epoch": 1.1115977291159773,
"grad_norm": 0.5804775953292847,
"learning_rate": 1.490475973506652e-05,
"loss": 0.6427537798881531,
"step": 857
},
{
"epoch": 1.1128953771289538,
"grad_norm": 0.5342238545417786,
"learning_rate": 1.4892293387862221e-05,
"loss": 0.6311315298080444,
"step": 858
},
{
"epoch": 1.1141930251419303,
"grad_norm": 0.5803128480911255,
"learning_rate": 1.487981703715621e-05,
"loss": 0.6198186874389648,
"step": 859
},
{
"epoch": 1.1154906731549068,
"grad_norm": 0.5532170534133911,
"learning_rate": 1.4867330708459463e-05,
"loss": 0.6145609617233276,
"step": 860
},
{
"epoch": 1.1167883211678833,
"grad_norm": 0.5493961572647095,
"learning_rate": 1.4854834427303353e-05,
"loss": 0.6166091561317444,
"step": 861
},
{
"epoch": 1.1180859691808598,
"grad_norm": 0.5559639930725098,
"learning_rate": 1.4842328219239618e-05,
"loss": 0.6064823865890503,
"step": 862
},
{
"epoch": 1.119383617193836,
"grad_norm": 0.5540943145751953,
"learning_rate": 1.4829812109840291e-05,
"loss": 0.5765544176101685,
"step": 863
},
{
"epoch": 1.1206812652068125,
"grad_norm": 0.5384024381637573,
"learning_rate": 1.4817286124697647e-05,
"loss": 0.565604567527771,
"step": 864
},
{
"epoch": 1.121978913219789,
"grad_norm": 0.5547834634780884,
"learning_rate": 1.480475028942415e-05,
"loss": 0.6463969349861145,
"step": 865
},
{
"epoch": 1.1232765612327655,
"grad_norm": 0.5574260354042053,
"learning_rate": 1.4792204629652414e-05,
"loss": 0.5858181118965149,
"step": 866
},
{
"epoch": 1.124574209245742,
"grad_norm": 0.5450447201728821,
"learning_rate": 1.4779649171035138e-05,
"loss": 0.6112916469573975,
"step": 867
},
{
"epoch": 1.1258718572587185,
"grad_norm": 0.5452038645744324,
"learning_rate": 1.4767083939245055e-05,
"loss": 0.6333041787147522,
"step": 868
},
{
"epoch": 1.127169505271695,
"grad_norm": 0.5453193187713623,
"learning_rate": 1.475450895997489e-05,
"loss": 0.6154720783233643,
"step": 869
},
{
"epoch": 1.1284671532846715,
"grad_norm": 0.5503911375999451,
"learning_rate": 1.4741924258937283e-05,
"loss": 0.580187201499939,
"step": 870
},
{
"epoch": 1.129764801297648,
"grad_norm": 0.564156174659729,
"learning_rate": 1.472932986186477e-05,
"loss": 0.6397178173065186,
"step": 871
},
{
"epoch": 1.1310624493106245,
"grad_norm": 0.5705751180648804,
"learning_rate": 1.47167257945097e-05,
"loss": 0.6369278430938721,
"step": 872
},
{
"epoch": 1.132360097323601,
"grad_norm": 0.562324583530426,
"learning_rate": 1.4704112082644207e-05,
"loss": 0.5986394882202148,
"step": 873
},
{
"epoch": 1.1336577453365775,
"grad_norm": 0.5652042031288147,
"learning_rate": 1.4691488752060132e-05,
"loss": 0.6185961365699768,
"step": 874
},
{
"epoch": 1.134955393349554,
"grad_norm": 0.5481469035148621,
"learning_rate": 1.4678855828568996e-05,
"loss": 0.5570172071456909,
"step": 875
},
{
"epoch": 1.1362530413625305,
"grad_norm": 0.5480834245681763,
"learning_rate": 1.4666213338001929e-05,
"loss": 0.5788794755935669,
"step": 876
},
{
"epoch": 1.137550689375507,
"grad_norm": 0.5426838994026184,
"learning_rate": 1.4653561306209625e-05,
"loss": 0.5975257158279419,
"step": 877
},
{
"epoch": 1.1388483373884835,
"grad_norm": 0.5632731914520264,
"learning_rate": 1.4640899759062285e-05,
"loss": 0.6319808959960938,
"step": 878
},
{
"epoch": 1.14014598540146,
"grad_norm": 0.5687447786331177,
"learning_rate": 1.462822872244957e-05,
"loss": 0.6043187379837036,
"step": 879
},
{
"epoch": 1.1414436334144362,
"grad_norm": 0.5472837686538696,
"learning_rate": 1.461554822228054e-05,
"loss": 0.607802152633667,
"step": 880
},
{
"epoch": 1.142741281427413,
"grad_norm": 0.5329515933990479,
"learning_rate": 1.460285828448361e-05,
"loss": 0.5557148456573486,
"step": 881
},
{
"epoch": 1.1440389294403892,
"grad_norm": 0.5272259712219238,
"learning_rate": 1.4590158935006494e-05,
"loss": 0.5320879817008972,
"step": 882
},
{
"epoch": 1.1453365774533657,
"grad_norm": 0.5834517478942871,
"learning_rate": 1.4577450199816142e-05,
"loss": 0.6263319253921509,
"step": 883
},
{
"epoch": 1.1466342254663422,
"grad_norm": 0.5725152492523193,
"learning_rate": 1.4564732104898702e-05,
"loss": 0.659183919429779,
"step": 884
},
{
"epoch": 1.1479318734793187,
"grad_norm": 0.5416671633720398,
"learning_rate": 1.4552004676259462e-05,
"loss": 0.5948503613471985,
"step": 885
},
{
"epoch": 1.1492295214922952,
"grad_norm": 0.5543138384819031,
"learning_rate": 1.453926793992279e-05,
"loss": 0.6404953002929688,
"step": 886
},
{
"epoch": 1.1505271695052717,
"grad_norm": 0.5595470070838928,
"learning_rate": 1.4526521921932091e-05,
"loss": 0.6393734812736511,
"step": 887
},
{
"epoch": 1.1518248175182482,
"grad_norm": 0.5882608294487,
"learning_rate": 1.4513766648349742e-05,
"loss": 0.5654003024101257,
"step": 888
},
{
"epoch": 1.1531224655312247,
"grad_norm": 0.5529691576957703,
"learning_rate": 1.4501002145257048e-05,
"loss": 0.6137228012084961,
"step": 889
},
{
"epoch": 1.1544201135442012,
"grad_norm": 0.5548762083053589,
"learning_rate": 1.4488228438754191e-05,
"loss": 0.603983998298645,
"step": 890
},
{
"epoch": 1.1557177615571776,
"grad_norm": 0.5486696362495422,
"learning_rate": 1.4475445554960166e-05,
"loss": 0.6514973640441895,
"step": 891
},
{
"epoch": 1.1570154095701541,
"grad_norm": 0.5455385446548462,
"learning_rate": 1.4462653520012736e-05,
"loss": 0.6550310850143433,
"step": 892
},
{
"epoch": 1.1583130575831306,
"grad_norm": 0.5628224015235901,
"learning_rate": 1.4449852360068372e-05,
"loss": 0.6537249088287354,
"step": 893
},
{
"epoch": 1.1596107055961071,
"grad_norm": 0.5596909523010254,
"learning_rate": 1.4437042101302212e-05,
"loss": 0.6253930926322937,
"step": 894
},
{
"epoch": 1.1609083536090836,
"grad_norm": 0.5298051238059998,
"learning_rate": 1.4424222769907985e-05,
"loss": 0.57865309715271,
"step": 895
},
{
"epoch": 1.1622060016220601,
"grad_norm": 0.5473706722259521,
"learning_rate": 1.4411394392097985e-05,
"loss": 0.5876542329788208,
"step": 896
},
{
"epoch": 1.1635036496350364,
"grad_norm": 0.5646262168884277,
"learning_rate": 1.4398556994102996e-05,
"loss": 0.6242583990097046,
"step": 897
},
{
"epoch": 1.164801297648013,
"grad_norm": 0.5632451176643372,
"learning_rate": 1.4385710602172245e-05,
"loss": 0.6315684914588928,
"step": 898
},
{
"epoch": 1.1660989456609894,
"grad_norm": 0.5819709300994873,
"learning_rate": 1.4372855242573356e-05,
"loss": 0.5947535037994385,
"step": 899
},
{
"epoch": 1.1673965936739659,
"grad_norm": 0.5634546875953674,
"learning_rate": 1.4359990941592283e-05,
"loss": 0.6281697750091553,
"step": 900
},
{
"epoch": 1.1686942416869424,
"grad_norm": 0.5534945130348206,
"learning_rate": 1.4347117725533269e-05,
"loss": 0.567562460899353,
"step": 901
},
{
"epoch": 1.1699918896999189,
"grad_norm": 0.5352903604507446,
"learning_rate": 1.4334235620718774e-05,
"loss": 0.5504214763641357,
"step": 902
},
{
"epoch": 1.1712895377128953,
"grad_norm": 0.5894420146942139,
"learning_rate": 1.4321344653489453e-05,
"loss": 0.5871877074241638,
"step": 903
},
{
"epoch": 1.1725871857258718,
"grad_norm": 0.5826941728591919,
"learning_rate": 1.4308444850204066e-05,
"loss": 0.5854516625404358,
"step": 904
},
{
"epoch": 1.1738848337388483,
"grad_norm": 0.5583464503288269,
"learning_rate": 1.4295536237239445e-05,
"loss": 0.6143467426300049,
"step": 905
},
{
"epoch": 1.1751824817518248,
"grad_norm": 0.5566253662109375,
"learning_rate": 1.4282618840990438e-05,
"loss": 0.6143018007278442,
"step": 906
},
{
"epoch": 1.1764801297648013,
"grad_norm": 0.5643221735954285,
"learning_rate": 1.4269692687869849e-05,
"loss": 0.6445101499557495,
"step": 907
},
{
"epoch": 1.1777777777777778,
"grad_norm": 0.583202600479126,
"learning_rate": 1.425675780430839e-05,
"loss": 0.6551916599273682,
"step": 908
},
{
"epoch": 1.1790754257907543,
"grad_norm": 0.5802360773086548,
"learning_rate": 1.4243814216754626e-05,
"loss": 0.6176046133041382,
"step": 909
},
{
"epoch": 1.1803730738037308,
"grad_norm": 0.5651218295097351,
"learning_rate": 1.4230861951674914e-05,
"loss": 0.6476747393608093,
"step": 910
},
{
"epoch": 1.1816707218167073,
"grad_norm": 0.5351070761680603,
"learning_rate": 1.421790103555336e-05,
"loss": 0.5974748134613037,
"step": 911
},
{
"epoch": 1.1829683698296838,
"grad_norm": 0.5506876111030579,
"learning_rate": 1.4204931494891759e-05,
"loss": 0.5977579355239868,
"step": 912
},
{
"epoch": 1.1842660178426603,
"grad_norm": 0.5496414303779602,
"learning_rate": 1.4191953356209535e-05,
"loss": 0.5993613004684448,
"step": 913
},
{
"epoch": 1.1855636658556366,
"grad_norm": 0.5448877215385437,
"learning_rate": 1.4178966646043702e-05,
"loss": 0.5849076509475708,
"step": 914
},
{
"epoch": 1.186861313868613,
"grad_norm": 0.5505439043045044,
"learning_rate": 1.4165971390948787e-05,
"loss": 0.6557425856590271,
"step": 915
},
{
"epoch": 1.1881589618815895,
"grad_norm": 0.5327088236808777,
"learning_rate": 1.4152967617496805e-05,
"loss": 0.5915898084640503,
"step": 916
},
{
"epoch": 1.189456609894566,
"grad_norm": 0.5534889698028564,
"learning_rate": 1.4139955352277176e-05,
"loss": 0.574662983417511,
"step": 917
},
{
"epoch": 1.1907542579075425,
"grad_norm": 0.5179355144500732,
"learning_rate": 1.4126934621896692e-05,
"loss": 0.5562629699707031,
"step": 918
},
{
"epoch": 1.192051905920519,
"grad_norm": 0.5698444247245789,
"learning_rate": 1.4113905452979455e-05,
"loss": 0.6139298677444458,
"step": 919
},
{
"epoch": 1.1933495539334955,
"grad_norm": 0.5280522108078003,
"learning_rate": 1.410086787216681e-05,
"loss": 0.5793087482452393,
"step": 920
},
{
"epoch": 1.1933495539334955,
"eval_loss": 0.6891781091690063,
"eval_runtime": 72.4953,
"eval_samples_per_second": 71.618,
"eval_steps_per_second": 8.952,
"step": 920
},
{
"epoch": 1.194647201946472,
"grad_norm": 0.518786609172821,
"learning_rate": 1.4087821906117314e-05,
"loss": 0.5602763891220093,
"step": 921
},
{
"epoch": 1.1959448499594485,
"grad_norm": 0.5518815517425537,
"learning_rate": 1.4074767581506666e-05,
"loss": 0.6225783824920654,
"step": 922
},
{
"epoch": 1.197242497972425,
"grad_norm": 0.5233501195907593,
"learning_rate": 1.4061704925027653e-05,
"loss": 0.5846587419509888,
"step": 923
},
{
"epoch": 1.1985401459854015,
"grad_norm": 0.5470210313796997,
"learning_rate": 1.4048633963390105e-05,
"loss": 0.5750600099563599,
"step": 924
},
{
"epoch": 1.199837793998378,
"grad_norm": 0.5647477507591248,
"learning_rate": 1.4035554723320828e-05,
"loss": 0.5977157354354858,
"step": 925
},
{
"epoch": 1.2011354420113545,
"grad_norm": 0.5179945230484009,
"learning_rate": 1.4022467231563554e-05,
"loss": 0.5806452035903931,
"step": 926
},
{
"epoch": 1.202433090024331,
"grad_norm": 0.5535194873809814,
"learning_rate": 1.4009371514878898e-05,
"loss": 0.6628227233886719,
"step": 927
},
{
"epoch": 1.2037307380373075,
"grad_norm": 0.6273780465126038,
"learning_rate": 1.399626760004428e-05,
"loss": 0.6142767667770386,
"step": 928
},
{
"epoch": 1.205028386050284,
"grad_norm": 0.5373409390449524,
"learning_rate": 1.3983155513853897e-05,
"loss": 0.6562739610671997,
"step": 929
},
{
"epoch": 1.2063260340632604,
"grad_norm": 0.5411200523376465,
"learning_rate": 1.3970035283118639e-05,
"loss": 0.5903608202934265,
"step": 930
},
{
"epoch": 1.2076236820762367,
"grad_norm": 0.5595235824584961,
"learning_rate": 1.3956906934666056e-05,
"loss": 0.6051539182662964,
"step": 931
},
{
"epoch": 1.2089213300892132,
"grad_norm": 0.5300971865653992,
"learning_rate": 1.3943770495340307e-05,
"loss": 0.643832802772522,
"step": 932
},
{
"epoch": 1.2102189781021897,
"grad_norm": 0.5413315892219543,
"learning_rate": 1.3930625992002076e-05,
"loss": 0.5942864418029785,
"step": 933
},
{
"epoch": 1.2115166261151662,
"grad_norm": 0.558797299861908,
"learning_rate": 1.391747345152855e-05,
"loss": 0.619717001914978,
"step": 934
},
{
"epoch": 1.2128142741281427,
"grad_norm": 0.5264928936958313,
"learning_rate": 1.3904312900813345e-05,
"loss": 0.5522656440734863,
"step": 935
},
{
"epoch": 1.2141119221411192,
"grad_norm": 0.5257030725479126,
"learning_rate": 1.3891144366766457e-05,
"loss": 0.5786164999008179,
"step": 936
},
{
"epoch": 1.2154095701540957,
"grad_norm": 0.577509343624115,
"learning_rate": 1.3877967876314205e-05,
"loss": 0.6315740346908569,
"step": 937
},
{
"epoch": 1.2167072181670722,
"grad_norm": 0.5317774415016174,
"learning_rate": 1.3864783456399174e-05,
"loss": 0.5896605253219604,
"step": 938
},
{
"epoch": 1.2180048661800487,
"grad_norm": 0.5598568320274353,
"learning_rate": 1.3851591133980167e-05,
"loss": 0.6161408424377441,
"step": 939
},
{
"epoch": 1.2193025141930252,
"grad_norm": 0.5387381911277771,
"learning_rate": 1.3838390936032146e-05,
"loss": 0.5705558061599731,
"step": 940
},
{
"epoch": 1.2206001622060016,
"grad_norm": 0.5279619693756104,
"learning_rate": 1.3825182889546173e-05,
"loss": 0.5650646686553955,
"step": 941
},
{
"epoch": 1.2218978102189781,
"grad_norm": 0.5602632164955139,
"learning_rate": 1.3811967021529362e-05,
"loss": 0.6143766045570374,
"step": 942
},
{
"epoch": 1.2231954582319546,
"grad_norm": 0.5425279140472412,
"learning_rate": 1.3798743359004816e-05,
"loss": 0.602745771408081,
"step": 943
},
{
"epoch": 1.2244931062449311,
"grad_norm": 0.5385331511497498,
"learning_rate": 1.378551192901158e-05,
"loss": 0.5555763244628906,
"step": 944
},
{
"epoch": 1.2257907542579076,
"grad_norm": 0.5338374972343445,
"learning_rate": 1.3772272758604576e-05,
"loss": 0.5934339165687561,
"step": 945
},
{
"epoch": 1.2270884022708841,
"grad_norm": 0.5479584336280823,
"learning_rate": 1.375902587485456e-05,
"loss": 0.5891726016998291,
"step": 946
},
{
"epoch": 1.2283860502838606,
"grad_norm": 0.5357087254524231,
"learning_rate": 1.3745771304848056e-05,
"loss": 0.5626200437545776,
"step": 947
},
{
"epoch": 1.2296836982968369,
"grad_norm": 0.5543829202651978,
"learning_rate": 1.3732509075687302e-05,
"loss": 0.5829602479934692,
"step": 948
},
{
"epoch": 1.2309813463098134,
"grad_norm": 0.5650047659873962,
"learning_rate": 1.3719239214490203e-05,
"loss": 0.6154081225395203,
"step": 949
},
{
"epoch": 1.2322789943227899,
"grad_norm": 0.5745924711227417,
"learning_rate": 1.3705961748390264e-05,
"loss": 0.5824979543685913,
"step": 950
},
{
"epoch": 1.2335766423357664,
"grad_norm": 0.5524203777313232,
"learning_rate": 1.3692676704536547e-05,
"loss": 0.6566962599754333,
"step": 951
},
{
"epoch": 1.2348742903487429,
"grad_norm": 0.5592309832572937,
"learning_rate": 1.3679384110093601e-05,
"loss": 0.5955104231834412,
"step": 952
},
{
"epoch": 1.2361719383617193,
"grad_norm": 0.5388526916503906,
"learning_rate": 1.3666083992241414e-05,
"loss": 0.6259311437606812,
"step": 953
},
{
"epoch": 1.2374695863746958,
"grad_norm": 0.5431481599807739,
"learning_rate": 1.3652776378175366e-05,
"loss": 0.6409016847610474,
"step": 954
},
{
"epoch": 1.2387672343876723,
"grad_norm": 0.5381134748458862,
"learning_rate": 1.3639461295106157e-05,
"loss": 0.5895624160766602,
"step": 955
},
{
"epoch": 1.2400648824006488,
"grad_norm": 0.5462051630020142,
"learning_rate": 1.3626138770259765e-05,
"loss": 0.5515483617782593,
"step": 956
},
{
"epoch": 1.2413625304136253,
"grad_norm": 0.5416935682296753,
"learning_rate": 1.3612808830877377e-05,
"loss": 0.5839380621910095,
"step": 957
},
{
"epoch": 1.2426601784266018,
"grad_norm": 0.543431282043457,
"learning_rate": 1.3599471504215347e-05,
"loss": 0.6129022836685181,
"step": 958
},
{
"epoch": 1.2439578264395783,
"grad_norm": 0.5546287894248962,
"learning_rate": 1.358612681754513e-05,
"loss": 0.5957478284835815,
"step": 959
},
{
"epoch": 1.2452554744525548,
"grad_norm": 0.5636503100395203,
"learning_rate": 1.357277479815324e-05,
"loss": 0.6206330060958862,
"step": 960
},
{
"epoch": 1.2465531224655313,
"grad_norm": 0.5537446141242981,
"learning_rate": 1.355941547334117e-05,
"loss": 0.5747988224029541,
"step": 961
},
{
"epoch": 1.2478507704785078,
"grad_norm": 0.5459409952163696,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.5868381261825562,
"step": 962
},
{
"epoch": 1.2491484184914843,
"grad_norm": 0.5428374409675598,
"learning_rate": 1.3532675016737127e-05,
"loss": 0.6297606825828552,
"step": 963
},
{
"epoch": 1.2504460665044608,
"grad_norm": 0.5484406352043152,
"learning_rate": 1.3519293939622622e-05,
"loss": 0.6754599213600159,
"step": 964
},
{
"epoch": 1.251743714517437,
"grad_norm": 0.5630886554718018,
"learning_rate": 1.3505905666442757e-05,
"loss": 0.655160129070282,
"step": 965
},
{
"epoch": 1.2530413625304138,
"grad_norm": 0.5442233085632324,
"learning_rate": 1.3492510224573165e-05,
"loss": 0.5808818936347961,
"step": 966
},
{
"epoch": 1.25433901054339,
"grad_norm": 0.5171942114830017,
"learning_rate": 1.3479107641404134e-05,
"loss": 0.5760788321495056,
"step": 967
},
{
"epoch": 1.2556366585563665,
"grad_norm": 0.5334968566894531,
"learning_rate": 1.3465697944340552e-05,
"loss": 0.5447085499763489,
"step": 968
},
{
"epoch": 1.256934306569343,
"grad_norm": 0.5165731310844421,
"learning_rate": 1.3452281160801856e-05,
"loss": 0.600307822227478,
"step": 969
},
{
"epoch": 1.2582319545823195,
"grad_norm": 0.5485058426856995,
"learning_rate": 1.3438857318221974e-05,
"loss": 0.6196280717849731,
"step": 970
},
{
"epoch": 1.259529602595296,
"grad_norm": 0.5499110817909241,
"learning_rate": 1.3425426444049265e-05,
"loss": 0.6000030040740967,
"step": 971
},
{
"epoch": 1.2608272506082725,
"grad_norm": 0.5815853476524353,
"learning_rate": 1.3411988565746467e-05,
"loss": 0.6568498611450195,
"step": 972
},
{
"epoch": 1.262124898621249,
"grad_norm": 0.5364983081817627,
"learning_rate": 1.3398543710790642e-05,
"loss": 0.6078934073448181,
"step": 973
},
{
"epoch": 1.2634225466342255,
"grad_norm": 0.5517644286155701,
"learning_rate": 1.3385091906673115e-05,
"loss": 0.6221879720687866,
"step": 974
},
{
"epoch": 1.264720194647202,
"grad_norm": 0.5543562769889832,
"learning_rate": 1.3371633180899417e-05,
"loss": 0.6666390895843506,
"step": 975
},
{
"epoch": 1.2660178426601785,
"grad_norm": 0.5409432053565979,
"learning_rate": 1.335816756098924e-05,
"loss": 0.6188746690750122,
"step": 976
},
{
"epoch": 1.267315490673155,
"grad_norm": 0.590812087059021,
"learning_rate": 1.3344695074476365e-05,
"loss": 0.6498491764068604,
"step": 977
},
{
"epoch": 1.2686131386861315,
"grad_norm": 0.5648714900016785,
"learning_rate": 1.3331215748908622e-05,
"loss": 0.6376237869262695,
"step": 978
},
{
"epoch": 1.269910786699108,
"grad_norm": 0.5377125144004822,
"learning_rate": 1.3317729611847818e-05,
"loss": 0.6080333590507507,
"step": 979
},
{
"epoch": 1.2712084347120842,
"grad_norm": 0.6160985231399536,
"learning_rate": 1.3304236690869688e-05,
"loss": 0.6452457904815674,
"step": 980
},
{
"epoch": 1.272506082725061,
"grad_norm": 0.5675063133239746,
"learning_rate": 1.329073701356384e-05,
"loss": 0.6066033840179443,
"step": 981
},
{
"epoch": 1.2738037307380372,
"grad_norm": 0.5339285731315613,
"learning_rate": 1.3277230607533698e-05,
"loss": 0.563126266002655,
"step": 982
},
{
"epoch": 1.275101378751014,
"grad_norm": 0.558273434638977,
"learning_rate": 1.3263717500396446e-05,
"loss": 0.6070864796638489,
"step": 983
},
{
"epoch": 1.2763990267639902,
"grad_norm": 0.5663204789161682,
"learning_rate": 1.3250197719782966e-05,
"loss": 0.6016590595245361,
"step": 984
},
{
"epoch": 1.2776966747769667,
"grad_norm": 0.5561959743499756,
"learning_rate": 1.3236671293337788e-05,
"loss": 0.6111094951629639,
"step": 985
},
{
"epoch": 1.2789943227899432,
"grad_norm": 0.5440069437026978,
"learning_rate": 1.3223138248719032e-05,
"loss": 0.6232655644416809,
"step": 986
},
{
"epoch": 1.2802919708029197,
"grad_norm": 0.5603107810020447,
"learning_rate": 1.3209598613598344e-05,
"loss": 0.5950015783309937,
"step": 987
},
{
"epoch": 1.2815896188158962,
"grad_norm": 0.538038969039917,
"learning_rate": 1.3196052415660856e-05,
"loss": 0.6100248098373413,
"step": 988
},
{
"epoch": 1.2828872668288727,
"grad_norm": 0.5667180418968201,
"learning_rate": 1.318249968260511e-05,
"loss": 0.6681912541389465,
"step": 989
},
{
"epoch": 1.2841849148418492,
"grad_norm": 0.5527055859565735,
"learning_rate": 1.316894044214302e-05,
"loss": 0.6051948070526123,
"step": 990
},
{
"epoch": 1.2854825628548256,
"grad_norm": 0.5413651466369629,
"learning_rate": 1.3155374721999797e-05,
"loss": 0.5882329940795898,
"step": 991
},
{
"epoch": 1.2867802108678021,
"grad_norm": 0.5323876738548279,
"learning_rate": 1.3141802549913907e-05,
"loss": 0.6183469295501709,
"step": 992
},
{
"epoch": 1.2880778588807786,
"grad_norm": 0.5273195505142212,
"learning_rate": 1.3128223953637003e-05,
"loss": 0.5676054954528809,
"step": 993
},
{
"epoch": 1.2893755068937551,
"grad_norm": 0.567756175994873,
"learning_rate": 1.3114638960933883e-05,
"loss": 0.6798044443130493,
"step": 994
},
{
"epoch": 1.2906731549067316,
"grad_norm": 0.5517603754997253,
"learning_rate": 1.3101047599582415e-05,
"loss": 0.6340286731719971,
"step": 995
},
{
"epoch": 1.2919708029197081,
"grad_norm": 0.5477331280708313,
"learning_rate": 1.3087449897373494e-05,
"loss": 0.6021038889884949,
"step": 996
},
{
"epoch": 1.2932684509326844,
"grad_norm": 0.551368772983551,
"learning_rate": 1.307384588211098e-05,
"loss": 0.5940453410148621,
"step": 997
},
{
"epoch": 1.294566098945661,
"grad_norm": 0.5456337928771973,
"learning_rate": 1.306023558161164e-05,
"loss": 0.6023222208023071,
"step": 998
},
{
"epoch": 1.2958637469586374,
"grad_norm": 0.5676029324531555,
"learning_rate": 1.3046619023705095e-05,
"loss": 0.6922143697738647,
"step": 999
},
{
"epoch": 1.2971613949716139,
"grad_norm": 0.5776983499526978,
"learning_rate": 1.3032996236233756e-05,
"loss": 0.6589181423187256,
"step": 1000
},
{
"epoch": 1.2984590429845904,
"grad_norm": 0.5594776272773743,
"learning_rate": 1.3019367247052781e-05,
"loss": 0.6284008622169495,
"step": 1001
},
{
"epoch": 1.2997566909975669,
"grad_norm": 0.5632730722427368,
"learning_rate": 1.300573208403e-05,
"loss": 0.586546778678894,
"step": 1002
},
{
"epoch": 1.3010543390105433,
"grad_norm": 0.5418180823326111,
"learning_rate": 1.2992090775045868e-05,
"loss": 0.5931944847106934,
"step": 1003
},
{
"epoch": 1.3023519870235198,
"grad_norm": 0.5260592699050903,
"learning_rate": 1.2978443347993415e-05,
"loss": 0.5439613461494446,
"step": 1004
},
{
"epoch": 1.3036496350364963,
"grad_norm": 0.546437680721283,
"learning_rate": 1.296478983077817e-05,
"loss": 0.5946912169456482,
"step": 1005
},
{
"epoch": 1.3049472830494728,
"grad_norm": 0.5575598478317261,
"learning_rate": 1.2951130251318125e-05,
"loss": 0.6190862655639648,
"step": 1006
},
{
"epoch": 1.3062449310624493,
"grad_norm": 0.5441600680351257,
"learning_rate": 1.2937464637543655e-05,
"loss": 0.613700270652771,
"step": 1007
},
{
"epoch": 1.3075425790754258,
"grad_norm": 0.5194239020347595,
"learning_rate": 1.2923793017397488e-05,
"loss": 0.551931619644165,
"step": 1008
},
{
"epoch": 1.3088402270884023,
"grad_norm": 0.521641194820404,
"learning_rate": 1.2910115418834624e-05,
"loss": 0.544873833656311,
"step": 1009
},
{
"epoch": 1.3101378751013788,
"grad_norm": 0.5697146654129028,
"learning_rate": 1.289643186982229e-05,
"loss": 0.6762262582778931,
"step": 1010
},
{
"epoch": 1.3114355231143553,
"grad_norm": 0.5358358025550842,
"learning_rate": 1.2882742398339884e-05,
"loss": 0.5811675190925598,
"step": 1011
},
{
"epoch": 1.3127331711273318,
"grad_norm": 0.5812531113624573,
"learning_rate": 1.2869047032378905e-05,
"loss": 0.6202974319458008,
"step": 1012
},
{
"epoch": 1.3140308191403083,
"grad_norm": 0.5383328795433044,
"learning_rate": 1.2855345799942915e-05,
"loss": 0.58216392993927,
"step": 1013
},
{
"epoch": 1.3153284671532846,
"grad_norm": 0.5470954775810242,
"learning_rate": 1.2841638729047463e-05,
"loss": 0.5842857360839844,
"step": 1014
},
{
"epoch": 1.3166261151662613,
"grad_norm": 0.5181686878204346,
"learning_rate": 1.2827925847720041e-05,
"loss": 0.5985524654388428,
"step": 1015
},
{
"epoch": 1.3179237631792375,
"grad_norm": 0.5179515480995178,
"learning_rate": 1.2814207184000018e-05,
"loss": 0.5709914565086365,
"step": 1016
},
{
"epoch": 1.319221411192214,
"grad_norm": 0.5449542999267578,
"learning_rate": 1.2800482765938594e-05,
"loss": 0.646975576877594,
"step": 1017
},
{
"epoch": 1.3205190592051905,
"grad_norm": 0.5302087664604187,
"learning_rate": 1.2786752621598726e-05,
"loss": 0.6145081520080566,
"step": 1018
},
{
"epoch": 1.321816707218167,
"grad_norm": 0.5520698428153992,
"learning_rate": 1.2773016779055089e-05,
"loss": 0.5821577906608582,
"step": 1019
},
{
"epoch": 1.3231143552311435,
"grad_norm": 0.5411002039909363,
"learning_rate": 1.2759275266393998e-05,
"loss": 0.5899526476860046,
"step": 1020
},
{
"epoch": 1.32441200324412,
"grad_norm": 0.5193924307823181,
"learning_rate": 1.2745528111713373e-05,
"loss": 0.5851880311965942,
"step": 1021
},
{
"epoch": 1.3257096512570965,
"grad_norm": 0.5581620931625366,
"learning_rate": 1.2731775343122663e-05,
"loss": 0.6368898153305054,
"step": 1022
},
{
"epoch": 1.327007299270073,
"grad_norm": 0.5761281847953796,
"learning_rate": 1.2718016988742799e-05,
"loss": 0.6208426356315613,
"step": 1023
},
{
"epoch": 1.3283049472830495,
"grad_norm": 0.5429732799530029,
"learning_rate": 1.270425307670614e-05,
"loss": 0.5906336307525635,
"step": 1024
},
{
"epoch": 1.329602595296026,
"grad_norm": 0.5482628345489502,
"learning_rate": 1.2690483635156392e-05,
"loss": 0.6205004453659058,
"step": 1025
},
{
"epoch": 1.3309002433090025,
"grad_norm": 0.53929603099823,
"learning_rate": 1.2676708692248583e-05,
"loss": 0.5814516544342041,
"step": 1026
},
{
"epoch": 1.332197891321979,
"grad_norm": 0.5420404076576233,
"learning_rate": 1.2662928276148985e-05,
"loss": 0.6052178740501404,
"step": 1027
},
{
"epoch": 1.3334955393349555,
"grad_norm": 0.5524218678474426,
"learning_rate": 1.264914241503506e-05,
"loss": 0.639128565788269,
"step": 1028
},
{
"epoch": 1.334793187347932,
"grad_norm": 0.5308884978294373,
"learning_rate": 1.2635351137095408e-05,
"loss": 0.5758256316184998,
"step": 1029
},
{
"epoch": 1.3360908353609084,
"grad_norm": 0.556959867477417,
"learning_rate": 1.2621554470529698e-05,
"loss": 0.6215351223945618,
"step": 1030
},
{
"epoch": 1.3373884833738847,
"grad_norm": 0.5299232006072998,
"learning_rate": 1.2607752443548622e-05,
"loss": 0.6064879298210144,
"step": 1031
},
{
"epoch": 1.3386861313868614,
"grad_norm": 0.5557371973991394,
"learning_rate": 1.259394508437383e-05,
"loss": 0.62589031457901,
"step": 1032
},
{
"epoch": 1.3399837793998377,
"grad_norm": 0.5563995242118835,
"learning_rate": 1.2580132421237883e-05,
"loss": 0.6236660480499268,
"step": 1033
},
{
"epoch": 1.3412814274128142,
"grad_norm": 0.5666968822479248,
"learning_rate": 1.2566314482384174e-05,
"loss": 0.6252362728118896,
"step": 1034
},
{
"epoch": 1.3425790754257907,
"grad_norm": 0.5652741193771362,
"learning_rate": 1.2552491296066895e-05,
"loss": 0.6189643144607544,
"step": 1035
},
{
"epoch": 1.3438767234387672,
"grad_norm": 0.5583733320236206,
"learning_rate": 1.2538662890550959e-05,
"loss": 0.6765375137329102,
"step": 1036
},
{
"epoch": 1.3451743714517437,
"grad_norm": 0.5742061138153076,
"learning_rate": 1.252482929411196e-05,
"loss": 0.6477082967758179,
"step": 1037
},
{
"epoch": 1.3464720194647202,
"grad_norm": 0.5400403141975403,
"learning_rate": 1.25109905350361e-05,
"loss": 0.5811231136322021,
"step": 1038
},
{
"epoch": 1.3477696674776967,
"grad_norm": 0.5390773415565491,
"learning_rate": 1.249714664162014e-05,
"loss": 0.6055101156234741,
"step": 1039
},
{
"epoch": 1.3490673154906732,
"grad_norm": 0.5596996545791626,
"learning_rate": 1.2483297642171332e-05,
"loss": 0.6074774265289307,
"step": 1040
},
{
"epoch": 1.3503649635036497,
"grad_norm": 0.5600677728652954,
"learning_rate": 1.246944356500738e-05,
"loss": 0.6564399003982544,
"step": 1041
},
{
"epoch": 1.3516626115166261,
"grad_norm": 0.5470819473266602,
"learning_rate": 1.2455584438456366e-05,
"loss": 0.6430810689926147,
"step": 1042
},
{
"epoch": 1.3529602595296026,
"grad_norm": 0.5539683699607849,
"learning_rate": 1.2441720290856694e-05,
"loss": 0.6132862567901611,
"step": 1043
},
{
"epoch": 1.3542579075425791,
"grad_norm": 0.5648192167282104,
"learning_rate": 1.2427851150557036e-05,
"loss": 0.6304311156272888,
"step": 1044
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.5195255279541016,
"learning_rate": 1.241397704591627e-05,
"loss": 0.5641679763793945,
"step": 1045
},
{
"epoch": 1.3568532035685321,
"grad_norm": 0.5658749341964722,
"learning_rate": 1.2400098005303436e-05,
"loss": 0.6409952044487,
"step": 1046
},
{
"epoch": 1.3581508515815086,
"grad_norm": 0.5088870525360107,
"learning_rate": 1.238621405709766e-05,
"loss": 0.5354233384132385,
"step": 1047
},
{
"epoch": 1.3594484995944849,
"grad_norm": 0.5734469890594482,
"learning_rate": 1.2372325229688093e-05,
"loss": 0.6188406944274902,
"step": 1048
},
{
"epoch": 1.3607461476074616,
"grad_norm": 0.5380412936210632,
"learning_rate": 1.235843155147388e-05,
"loss": 0.5657402873039246,
"step": 1049
},
{
"epoch": 1.3620437956204379,
"grad_norm": 0.5315279960632324,
"learning_rate": 1.2344533050864071e-05,
"loss": 0.5667376518249512,
"step": 1050
},
{
"epoch": 1.3633414436334144,
"grad_norm": 0.5081866979598999,
"learning_rate": 1.2330629756277588e-05,
"loss": 0.5432066917419434,
"step": 1051
},
{
"epoch": 1.3646390916463909,
"grad_norm": 0.5798763036727905,
"learning_rate": 1.2316721696143141e-05,
"loss": 0.6364309191703796,
"step": 1052
},
{
"epoch": 1.3659367396593673,
"grad_norm": 0.5289844870567322,
"learning_rate": 1.23028088988992e-05,
"loss": 0.5321639180183411,
"step": 1053
},
{
"epoch": 1.3672343876723438,
"grad_norm": 0.5852347612380981,
"learning_rate": 1.228889139299391e-05,
"loss": 0.6831628084182739,
"step": 1054
},
{
"epoch": 1.3685320356853203,
"grad_norm": 0.5265390872955322,
"learning_rate": 1.2274969206885048e-05,
"loss": 0.5725244283676147,
"step": 1055
},
{
"epoch": 1.3698296836982968,
"grad_norm": 0.6298306584358215,
"learning_rate": 1.2261042369039966e-05,
"loss": 0.6366633176803589,
"step": 1056
},
{
"epoch": 1.3711273317112733,
"grad_norm": 0.521314263343811,
"learning_rate": 1.2247110907935518e-05,
"loss": 0.5725533962249756,
"step": 1057
},
{
"epoch": 1.3724249797242498,
"grad_norm": 0.5249886512756348,
"learning_rate": 1.2233174852058015e-05,
"loss": 0.577233076095581,
"step": 1058
},
{
"epoch": 1.3737226277372263,
"grad_norm": 0.5558046102523804,
"learning_rate": 1.2219234229903163e-05,
"loss": 0.6044833660125732,
"step": 1059
},
{
"epoch": 1.3750202757502028,
"grad_norm": 0.5569727420806885,
"learning_rate": 1.2205289069976012e-05,
"loss": 0.5831769704818726,
"step": 1060
},
{
"epoch": 1.3763179237631793,
"grad_norm": 0.5547581911087036,
"learning_rate": 1.2191339400790881e-05,
"loss": 0.5798386335372925,
"step": 1061
},
{
"epoch": 1.3776155717761558,
"grad_norm": 0.5544263124465942,
"learning_rate": 1.2177385250871312e-05,
"loss": 0.607170581817627,
"step": 1062
},
{
"epoch": 1.378913219789132,
"grad_norm": 0.5475184321403503,
"learning_rate": 1.2163426648750009e-05,
"loss": 0.596827507019043,
"step": 1063
},
{
"epoch": 1.3802108678021088,
"grad_norm": 0.551906168460846,
"learning_rate": 1.2149463622968782e-05,
"loss": 0.5992593169212341,
"step": 1064
},
{
"epoch": 1.381508515815085,
"grad_norm": 0.5418475270271301,
"learning_rate": 1.2135496202078487e-05,
"loss": 0.5538514852523804,
"step": 1065
},
{
"epoch": 1.3828061638280618,
"grad_norm": 0.5357592105865479,
"learning_rate": 1.2121524414638958e-05,
"loss": 0.6014474630355835,
"step": 1066
},
{
"epoch": 1.384103811841038,
"grad_norm": 0.5673146843910217,
"learning_rate": 1.2107548289218968e-05,
"loss": 0.5835940837860107,
"step": 1067
},
{
"epoch": 1.3854014598540145,
"grad_norm": 0.5655810832977295,
"learning_rate": 1.2093567854396158e-05,
"loss": 0.6108807325363159,
"step": 1068
},
{
"epoch": 1.386699107866991,
"grad_norm": 0.5361012816429138,
"learning_rate": 1.2079583138756976e-05,
"loss": 0.6093813180923462,
"step": 1069
},
{
"epoch": 1.3879967558799675,
"grad_norm": 0.5419613122940063,
"learning_rate": 1.206559417089663e-05,
"loss": 0.6026707887649536,
"step": 1070
},
{
"epoch": 1.389294403892944,
"grad_norm": 0.5429274439811707,
"learning_rate": 1.205160097941901e-05,
"loss": 0.6365257501602173,
"step": 1071
},
{
"epoch": 1.3905920519059205,
"grad_norm": 0.5734850764274597,
"learning_rate": 1.2037603592936656e-05,
"loss": 0.6649122834205627,
"step": 1072
},
{
"epoch": 1.391889699918897,
"grad_norm": 0.5734902024269104,
"learning_rate": 1.2023602040070679e-05,
"loss": 0.7125487327575684,
"step": 1073
},
{
"epoch": 1.3931873479318735,
"grad_norm": 0.5633674263954163,
"learning_rate": 1.2009596349450717e-05,
"loss": 0.6474109292030334,
"step": 1074
},
{
"epoch": 1.39448499594485,
"grad_norm": 0.5378244519233704,
"learning_rate": 1.1995586549714855e-05,
"loss": 0.6136443614959717,
"step": 1075
},
{
"epoch": 1.3957826439578265,
"grad_norm": 0.558250904083252,
"learning_rate": 1.198157266950959e-05,
"loss": 0.6676377058029175,
"step": 1076
},
{
"epoch": 1.397080291970803,
"grad_norm": 0.5315516591072083,
"learning_rate": 1.1967554737489762e-05,
"loss": 0.607810378074646,
"step": 1077
},
{
"epoch": 1.3983779399837795,
"grad_norm": 0.5391795039176941,
"learning_rate": 1.1953532782318491e-05,
"loss": 0.5898000597953796,
"step": 1078
},
{
"epoch": 1.399675587996756,
"grad_norm": 0.5466244220733643,
"learning_rate": 1.1939506832667129e-05,
"loss": 0.5943995118141174,
"step": 1079
},
{
"epoch": 1.4009732360097322,
"grad_norm": 0.5457687973976135,
"learning_rate": 1.1925476917215191e-05,
"loss": 0.6089761257171631,
"step": 1080
},
{
"epoch": 1.402270884022709,
"grad_norm": 0.5727429389953613,
"learning_rate": 1.1911443064650301e-05,
"loss": 0.6369843482971191,
"step": 1081
},
{
"epoch": 1.4035685320356852,
"grad_norm": 0.5765259861946106,
"learning_rate": 1.189740530366814e-05,
"loss": 0.6176037788391113,
"step": 1082
},
{
"epoch": 1.404866180048662,
"grad_norm": 0.5793892741203308,
"learning_rate": 1.1883363662972375e-05,
"loss": 0.6147127747535706,
"step": 1083
},
{
"epoch": 1.4061638280616382,
"grad_norm": 0.5127638578414917,
"learning_rate": 1.1869318171274606e-05,
"loss": 0.5739990472793579,
"step": 1084
},
{
"epoch": 1.4074614760746147,
"grad_norm": 0.5451372861862183,
"learning_rate": 1.1855268857294308e-05,
"loss": 0.6005086898803711,
"step": 1085
},
{
"epoch": 1.4087591240875912,
"grad_norm": 0.5556860566139221,
"learning_rate": 1.1841215749758774e-05,
"loss": 0.6003910303115845,
"step": 1086
},
{
"epoch": 1.4100567721005677,
"grad_norm": 0.5883124470710754,
"learning_rate": 1.182715887740305e-05,
"loss": 0.6721568703651428,
"step": 1087
},
{
"epoch": 1.4113544201135442,
"grad_norm": 0.5330623388290405,
"learning_rate": 1.1813098268969886e-05,
"loss": 0.617790699005127,
"step": 1088
},
{
"epoch": 1.4126520681265207,
"grad_norm": 0.5409324169158936,
"learning_rate": 1.1799033953209664e-05,
"loss": 0.6154944896697998,
"step": 1089
},
{
"epoch": 1.4139497161394972,
"grad_norm": 0.5280669927597046,
"learning_rate": 1.178496595888035e-05,
"loss": 0.6064777970314026,
"step": 1090
},
{
"epoch": 1.4152473641524737,
"grad_norm": 0.5559468269348145,
"learning_rate": 1.1770894314747433e-05,
"loss": 0.6379706263542175,
"step": 1091
},
{
"epoch": 1.4165450121654501,
"grad_norm": 0.5678933262825012,
"learning_rate": 1.1756819049583861e-05,
"loss": 0.5879865288734436,
"step": 1092
},
{
"epoch": 1.4178426601784266,
"grad_norm": 0.5317026972770691,
"learning_rate": 1.1742740192169995e-05,
"loss": 0.6252385377883911,
"step": 1093
},
{
"epoch": 1.4191403081914031,
"grad_norm": 0.5503518581390381,
"learning_rate": 1.1728657771293529e-05,
"loss": 0.5956102013587952,
"step": 1094
},
{
"epoch": 1.4204379562043796,
"grad_norm": 0.5392619967460632,
"learning_rate": 1.171457181574945e-05,
"loss": 0.6110433340072632,
"step": 1095
},
{
"epoch": 1.4217356042173561,
"grad_norm": 0.554594099521637,
"learning_rate": 1.1700482354339972e-05,
"loss": 0.6505380272865295,
"step": 1096
},
{
"epoch": 1.4230332522303324,
"grad_norm": 0.5639646053314209,
"learning_rate": 1.168638941587448e-05,
"loss": 0.6052155494689941,
"step": 1097
},
{
"epoch": 1.424330900243309,
"grad_norm": 0.5569002032279968,
"learning_rate": 1.1672293029169466e-05,
"loss": 0.5856403112411499,
"step": 1098
},
{
"epoch": 1.4256285482562854,
"grad_norm": 0.5615402460098267,
"learning_rate": 1.165819322304847e-05,
"loss": 0.6077978610992432,
"step": 1099
},
{
"epoch": 1.426926196269262,
"grad_norm": 0.5535939931869507,
"learning_rate": 1.164409002634203e-05,
"loss": 0.6245694160461426,
"step": 1100
},
{
"epoch": 1.4282238442822384,
"grad_norm": 0.5362287759780884,
"learning_rate": 1.162998346788761e-05,
"loss": 0.6105297803878784,
"step": 1101
},
{
"epoch": 1.4295214922952149,
"grad_norm": 0.5390259027481079,
"learning_rate": 1.1615873576529556e-05,
"loss": 0.6066164970397949,
"step": 1102
},
{
"epoch": 1.4308191403081914,
"grad_norm": 0.5315901041030884,
"learning_rate": 1.1601760381119022e-05,
"loss": 0.5768907070159912,
"step": 1103
},
{
"epoch": 1.4321167883211678,
"grad_norm": 0.5727961659431458,
"learning_rate": 1.158764391051392e-05,
"loss": 0.6904894113540649,
"step": 1104
},
{
"epoch": 1.4334144363341443,
"grad_norm": 0.5435361862182617,
"learning_rate": 1.1573524193578863e-05,
"loss": 0.5838584899902344,
"step": 1105
},
{
"epoch": 1.4347120843471208,
"grad_norm": 0.5609909296035767,
"learning_rate": 1.1559401259185095e-05,
"loss": 0.6729065775871277,
"step": 1106
},
{
"epoch": 1.4360097323600973,
"grad_norm": 0.5284282565116882,
"learning_rate": 1.1545275136210441e-05,
"loss": 0.5950232744216919,
"step": 1107
},
{
"epoch": 1.4373073803730738,
"grad_norm": 0.603245735168457,
"learning_rate": 1.153114585353925e-05,
"loss": 0.6702573299407959,
"step": 1108
},
{
"epoch": 1.4386050283860503,
"grad_norm": 0.5415088534355164,
"learning_rate": 1.1517013440062326e-05,
"loss": 0.5716216564178467,
"step": 1109
},
{
"epoch": 1.4399026763990268,
"grad_norm": 0.4960046708583832,
"learning_rate": 1.1502877924676881e-05,
"loss": 0.5501525402069092,
"step": 1110
},
{
"epoch": 1.4412003244120033,
"grad_norm": 0.5444253087043762,
"learning_rate": 1.1488739336286467e-05,
"loss": 0.6333913207054138,
"step": 1111
},
{
"epoch": 1.4424979724249798,
"grad_norm": 0.5255866646766663,
"learning_rate": 1.1474597703800915e-05,
"loss": 0.6024140119552612,
"step": 1112
},
{
"epoch": 1.4437956204379563,
"grad_norm": 0.5488544702529907,
"learning_rate": 1.1460453056136285e-05,
"loss": 0.6334477663040161,
"step": 1113
},
{
"epoch": 1.4450932684509326,
"grad_norm": 0.5465590953826904,
"learning_rate": 1.14463054222148e-05,
"loss": 0.6596208810806274,
"step": 1114
},
{
"epoch": 1.4463909164639093,
"grad_norm": 0.5492766499519348,
"learning_rate": 1.1432154830964796e-05,
"loss": 0.6396174430847168,
"step": 1115
},
{
"epoch": 1.4476885644768855,
"grad_norm": 0.5476314425468445,
"learning_rate": 1.1418001311320649e-05,
"loss": 0.6056069135665894,
"step": 1116
},
{
"epoch": 1.4489862124898623,
"grad_norm": 0.5088196396827698,
"learning_rate": 1.1403844892222717e-05,
"loss": 0.5474177002906799,
"step": 1117
},
{
"epoch": 1.4502838605028385,
"grad_norm": 0.5697342753410339,
"learning_rate": 1.1389685602617302e-05,
"loss": 0.6007769107818604,
"step": 1118
},
{
"epoch": 1.451581508515815,
"grad_norm": 0.5281476974487305,
"learning_rate": 1.1375523471456564e-05,
"loss": 0.5913225412368774,
"step": 1119
},
{
"epoch": 1.4528791565287915,
"grad_norm": 0.5619297027587891,
"learning_rate": 1.1361358527698481e-05,
"loss": 0.611336350440979,
"step": 1120
},
{
"epoch": 1.454176804541768,
"grad_norm": 0.531401515007019,
"learning_rate": 1.134719080030677e-05,
"loss": 0.5786083936691284,
"step": 1121
},
{
"epoch": 1.4554744525547445,
"grad_norm": 0.5428561568260193,
"learning_rate": 1.1333020318250854e-05,
"loss": 0.6208731532096863,
"step": 1122
},
{
"epoch": 1.456772100567721,
"grad_norm": 0.5384306311607361,
"learning_rate": 1.131884711050578e-05,
"loss": 0.5843198895454407,
"step": 1123
},
{
"epoch": 1.4580697485806975,
"grad_norm": 0.5160107016563416,
"learning_rate": 1.1304671206052168e-05,
"loss": 0.5473004579544067,
"step": 1124
},
{
"epoch": 1.459367396593674,
"grad_norm": 0.5360195636749268,
"learning_rate": 1.1290492633876164e-05,
"loss": 0.626501202583313,
"step": 1125
},
{
"epoch": 1.4606650446066505,
"grad_norm": 0.5251026749610901,
"learning_rate": 1.1276311422969349e-05,
"loss": 0.5944849848747253,
"step": 1126
},
{
"epoch": 1.461962692619627,
"grad_norm": 0.564008355140686,
"learning_rate": 1.1262127602328712e-05,
"loss": 0.6147276163101196,
"step": 1127
},
{
"epoch": 1.4632603406326035,
"grad_norm": 0.5388748645782471,
"learning_rate": 1.124794120095658e-05,
"loss": 0.5849318504333496,
"step": 1128
},
{
"epoch": 1.46455798864558,
"grad_norm": 0.5595386624336243,
"learning_rate": 1.1233752247860549e-05,
"loss": 0.6283015012741089,
"step": 1129
},
{
"epoch": 1.4658556366585564,
"grad_norm": 0.5528329014778137,
"learning_rate": 1.1219560772053442e-05,
"loss": 0.6135470867156982,
"step": 1130
},
{
"epoch": 1.4671532846715327,
"grad_norm": 0.5480870008468628,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.579879879951477,
"step": 1131
},
{
"epoch": 1.4684509326845094,
"grad_norm": 0.6012369990348816,
"learning_rate": 1.1191170368382992e-05,
"loss": 0.67568039894104,
"step": 1132
},
{
"epoch": 1.4697485806974857,
"grad_norm": 0.5386692881584167,
"learning_rate": 1.117697149857084e-05,
"loss": 0.6155050992965698,
"step": 1133
},
{
"epoch": 1.4710462287104624,
"grad_norm": 0.540510892868042,
"learning_rate": 1.1162770222149873e-05,
"loss": 0.6193840503692627,
"step": 1134
},
{
"epoch": 1.4723438767234387,
"grad_norm": 0.5231954455375671,
"learning_rate": 1.1148566568158099e-05,
"loss": 0.5806912183761597,
"step": 1135
},
{
"epoch": 1.4736415247364152,
"grad_norm": 0.5371982455253601,
"learning_rate": 1.1134360565638402e-05,
"loss": 0.6294920444488525,
"step": 1136
},
{
"epoch": 1.4749391727493917,
"grad_norm": 0.5294065475463867,
"learning_rate": 1.1120152243638457e-05,
"loss": 0.6405944228172302,
"step": 1137
},
{
"epoch": 1.4762368207623682,
"grad_norm": 0.5396026372909546,
"learning_rate": 1.1105941631210694e-05,
"loss": 0.622348427772522,
"step": 1138
},
{
"epoch": 1.4775344687753447,
"grad_norm": 0.5184268951416016,
"learning_rate": 1.1091728757412212e-05,
"loss": 0.5783290863037109,
"step": 1139
},
{
"epoch": 1.4788321167883212,
"grad_norm": 0.5296680331230164,
"learning_rate": 1.107751365130474e-05,
"loss": 0.5765876770019531,
"step": 1140
},
{
"epoch": 1.4801297648012977,
"grad_norm": 0.5528906583786011,
"learning_rate": 1.1063296341954577e-05,
"loss": 0.5958802700042725,
"step": 1141
},
{
"epoch": 1.4814274128142741,
"grad_norm": 0.549384355545044,
"learning_rate": 1.1049076858432517e-05,
"loss": 0.6524186730384827,
"step": 1142
},
{
"epoch": 1.4827250608272506,
"grad_norm": 0.5553792119026184,
"learning_rate": 1.1034855229813812e-05,
"loss": 0.63478684425354,
"step": 1143
},
{
"epoch": 1.4840227088402271,
"grad_norm": 0.5639452934265137,
"learning_rate": 1.1020631485178084e-05,
"loss": 0.6482947468757629,
"step": 1144
},
{
"epoch": 1.4853203568532036,
"grad_norm": 0.5332263708114624,
"learning_rate": 1.1006405653609295e-05,
"loss": 0.6563082337379456,
"step": 1145
},
{
"epoch": 1.4866180048661801,
"grad_norm": 0.5505067110061646,
"learning_rate": 1.0992177764195671e-05,
"loss": 0.6217901706695557,
"step": 1146
},
{
"epoch": 1.4879156528791566,
"grad_norm": 0.5751034021377563,
"learning_rate": 1.0977947846029642e-05,
"loss": 0.618269681930542,
"step": 1147
},
{
"epoch": 1.4892133008921329,
"grad_norm": 0.5259911417961121,
"learning_rate": 1.0963715928207795e-05,
"loss": 0.5809241533279419,
"step": 1148
},
{
"epoch": 1.4905109489051096,
"grad_norm": 0.5405173301696777,
"learning_rate": 1.094948203983079e-05,
"loss": 0.6440936923027039,
"step": 1149
},
{
"epoch": 1.4918085969180859,
"grad_norm": 0.5359426736831665,
"learning_rate": 1.0935246210003334e-05,
"loss": 0.5997065305709839,
"step": 1150
},
{
"epoch": 1.4918085969180859,
"eval_loss": 0.6832194328308105,
"eval_runtime": 72.4893,
"eval_samples_per_second": 71.624,
"eval_steps_per_second": 8.953,
"step": 1150
},
{
"epoch": 1.4931062449310626,
"grad_norm": 0.545395016670227,
"learning_rate": 1.0921008467834094e-05,
"loss": 0.6377010345458984,
"step": 1151
},
{
"epoch": 1.4944038929440389,
"grad_norm": 0.553674578666687,
"learning_rate": 1.0906768842435647e-05,
"loss": 0.6331782937049866,
"step": 1152
},
{
"epoch": 1.4957015409570154,
"grad_norm": 0.5127398371696472,
"learning_rate": 1.0892527362924426e-05,
"loss": 0.5681911110877991,
"step": 1153
},
{
"epoch": 1.4969991889699918,
"grad_norm": 0.5308411717414856,
"learning_rate": 1.0878284058420647e-05,
"loss": 0.6325392127037048,
"step": 1154
},
{
"epoch": 1.4982968369829683,
"grad_norm": 0.5330897569656372,
"learning_rate": 1.0864038958048267e-05,
"loss": 0.5603891611099243,
"step": 1155
},
{
"epoch": 1.4995944849959448,
"grad_norm": 0.5287606716156006,
"learning_rate": 1.084979209093491e-05,
"loss": 0.5920351147651672,
"step": 1156
},
{
"epoch": 1.5008921330089213,
"grad_norm": 0.5484432578086853,
"learning_rate": 1.0835543486211815e-05,
"loss": 0.6529064178466797,
"step": 1157
},
{
"epoch": 1.5021897810218978,
"grad_norm": 0.5554434061050415,
"learning_rate": 1.0821293173013769e-05,
"loss": 0.6203141212463379,
"step": 1158
},
{
"epoch": 1.5034874290348743,
"grad_norm": 0.4985191226005554,
"learning_rate": 1.0807041180479054e-05,
"loss": 0.5167315006256104,
"step": 1159
},
{
"epoch": 1.5047850770478508,
"grad_norm": 0.5687364339828491,
"learning_rate": 1.0792787537749392e-05,
"loss": 0.6727509498596191,
"step": 1160
},
{
"epoch": 1.5060827250608273,
"grad_norm": 0.5391871333122253,
"learning_rate": 1.0778532273969877e-05,
"loss": 0.5891563892364502,
"step": 1161
},
{
"epoch": 1.5073803730738038,
"grad_norm": 0.5688561201095581,
"learning_rate": 1.0764275418288908e-05,
"loss": 0.6336361169815063,
"step": 1162
},
{
"epoch": 1.50867802108678,
"grad_norm": 0.5307201743125916,
"learning_rate": 1.0750016999858151e-05,
"loss": 0.6088765263557434,
"step": 1163
},
{
"epoch": 1.5099756690997568,
"grad_norm": 0.5417827367782593,
"learning_rate": 1.0735757047832461e-05,
"loss": 0.6234108209609985,
"step": 1164
},
{
"epoch": 1.511273317112733,
"grad_norm": 0.5165390968322754,
"learning_rate": 1.0721495591369832e-05,
"loss": 0.5378797054290771,
"step": 1165
},
{
"epoch": 1.5125709651257098,
"grad_norm": 0.5508493781089783,
"learning_rate": 1.0707232659631333e-05,
"loss": 0.6575205326080322,
"step": 1166
},
{
"epoch": 1.513868613138686,
"grad_norm": 0.5701325535774231,
"learning_rate": 1.0692968281781046e-05,
"loss": 0.5776763558387756,
"step": 1167
},
{
"epoch": 1.5151662611516628,
"grad_norm": 0.5180992484092712,
"learning_rate": 1.0678702486986016e-05,
"loss": 0.5627498626708984,
"step": 1168
},
{
"epoch": 1.516463909164639,
"grad_norm": 0.5465271472930908,
"learning_rate": 1.0664435304416185e-05,
"loss": 0.5880453586578369,
"step": 1169
},
{
"epoch": 1.5177615571776155,
"grad_norm": 0.5629556775093079,
"learning_rate": 1.065016676324433e-05,
"loss": 0.6594117879867554,
"step": 1170
},
{
"epoch": 1.519059205190592,
"grad_norm": 0.5278184413909912,
"learning_rate": 1.0635896892645998e-05,
"loss": 0.5453213453292847,
"step": 1171
},
{
"epoch": 1.5203568532035685,
"grad_norm": 0.5409108400344849,
"learning_rate": 1.0621625721799473e-05,
"loss": 0.6020928025245667,
"step": 1172
},
{
"epoch": 1.521654501216545,
"grad_norm": 0.5297386050224304,
"learning_rate": 1.0607353279885682e-05,
"loss": 0.581575870513916,
"step": 1173
},
{
"epoch": 1.5229521492295215,
"grad_norm": 0.5326167345046997,
"learning_rate": 1.0593079596088155e-05,
"loss": 0.5731886029243469,
"step": 1174
},
{
"epoch": 1.524249797242498,
"grad_norm": 0.5496317148208618,
"learning_rate": 1.0578804699592968e-05,
"loss": 0.6127786636352539,
"step": 1175
},
{
"epoch": 1.5255474452554745,
"grad_norm": 0.5222692489624023,
"learning_rate": 1.0564528619588668e-05,
"loss": 0.5508180856704712,
"step": 1176
},
{
"epoch": 1.526845093268451,
"grad_norm": 0.5078931450843811,
"learning_rate": 1.0550251385266223e-05,
"loss": 0.590618908405304,
"step": 1177
},
{
"epoch": 1.5281427412814275,
"grad_norm": 0.545173704624176,
"learning_rate": 1.0535973025818969e-05,
"loss": 0.5988805294036865,
"step": 1178
},
{
"epoch": 1.529440389294404,
"grad_norm": 0.5643585920333862,
"learning_rate": 1.0521693570442533e-05,
"loss": 0.6470606327056885,
"step": 1179
},
{
"epoch": 1.5307380373073802,
"grad_norm": 0.5382372140884399,
"learning_rate": 1.050741304833479e-05,
"loss": 0.6253216862678528,
"step": 1180
},
{
"epoch": 1.532035685320357,
"grad_norm": 0.527792751789093,
"learning_rate": 1.0493131488695789e-05,
"loss": 0.5740289092063904,
"step": 1181
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.5286063551902771,
"learning_rate": 1.0478848920727707e-05,
"loss": 0.5898089408874512,
"step": 1182
},
{
"epoch": 1.53463098134631,
"grad_norm": 0.5210081338882446,
"learning_rate": 1.0464565373634784e-05,
"loss": 0.5460256338119507,
"step": 1183
},
{
"epoch": 1.5359286293592862,
"grad_norm": 0.542233943939209,
"learning_rate": 1.0450280876623253e-05,
"loss": 0.6149614453315735,
"step": 1184
},
{
"epoch": 1.537226277372263,
"grad_norm": 0.5287345051765442,
"learning_rate": 1.0435995458901298e-05,
"loss": 0.5987131595611572,
"step": 1185
},
{
"epoch": 1.5385239253852392,
"grad_norm": 0.542398989200592,
"learning_rate": 1.042170914967898e-05,
"loss": 0.5659464001655579,
"step": 1186
},
{
"epoch": 1.5398215733982157,
"grad_norm": 0.5581417679786682,
"learning_rate": 1.0407421978168186e-05,
"loss": 0.648675262928009,
"step": 1187
},
{
"epoch": 1.5411192214111922,
"grad_norm": 0.542323112487793,
"learning_rate": 1.0393133973582572e-05,
"loss": 0.6466338634490967,
"step": 1188
},
{
"epoch": 1.5424168694241687,
"grad_norm": 0.5204232335090637,
"learning_rate": 1.0378845165137483e-05,
"loss": 0.5785092115402222,
"step": 1189
},
{
"epoch": 1.5437145174371452,
"grad_norm": 0.5261425375938416,
"learning_rate": 1.0364555582049917e-05,
"loss": 0.6130785346031189,
"step": 1190
},
{
"epoch": 1.5450121654501217,
"grad_norm": 0.5651884078979492,
"learning_rate": 1.0350265253538458e-05,
"loss": 0.6042903661727905,
"step": 1191
},
{
"epoch": 1.5463098134630981,
"grad_norm": 0.5569320917129517,
"learning_rate": 1.033597420882321e-05,
"loss": 0.6515809297561646,
"step": 1192
},
{
"epoch": 1.5476074614760746,
"grad_norm": 0.5539842844009399,
"learning_rate": 1.0321682477125743e-05,
"loss": 0.6051802039146423,
"step": 1193
},
{
"epoch": 1.5489051094890511,
"grad_norm": 0.5327019691467285,
"learning_rate": 1.0307390087669026e-05,
"loss": 0.5866248607635498,
"step": 1194
},
{
"epoch": 1.5502027575020276,
"grad_norm": 0.5504518151283264,
"learning_rate": 1.0293097069677382e-05,
"loss": 0.6087076663970947,
"step": 1195
},
{
"epoch": 1.5515004055150041,
"grad_norm": 0.5322021842002869,
"learning_rate": 1.0278803452376416e-05,
"loss": 0.5527307391166687,
"step": 1196
},
{
"epoch": 1.5527980535279804,
"grad_norm": 0.5314878821372986,
"learning_rate": 1.0264509264992954e-05,
"loss": 0.623512327671051,
"step": 1197
},
{
"epoch": 1.554095701540957,
"grad_norm": 0.5596524477005005,
"learning_rate": 1.0250214536754996e-05,
"loss": 0.6276538372039795,
"step": 1198
},
{
"epoch": 1.5553933495539334,
"grad_norm": 0.5265888571739197,
"learning_rate": 1.0235919296891641e-05,
"loss": 0.5611189603805542,
"step": 1199
},
{
"epoch": 1.55669099756691,
"grad_norm": 0.5899763107299805,
"learning_rate": 1.0221623574633035e-05,
"loss": 0.6541014909744263,
"step": 1200
},
{
"epoch": 1.5579886455798864,
"grad_norm": 0.545138955116272,
"learning_rate": 1.0207327399210311e-05,
"loss": 0.5935692191123962,
"step": 1201
},
{
"epoch": 1.559286293592863,
"grad_norm": 0.5380452871322632,
"learning_rate": 1.0193030799855534e-05,
"loss": 0.5741644501686096,
"step": 1202
},
{
"epoch": 1.5605839416058394,
"grad_norm": 0.5540161728858948,
"learning_rate": 1.0178733805801626e-05,
"loss": 0.625443696975708,
"step": 1203
},
{
"epoch": 1.5618815896188158,
"grad_norm": 0.5784110426902771,
"learning_rate": 1.0164436446282324e-05,
"loss": 0.6342917680740356,
"step": 1204
},
{
"epoch": 1.5631792376317923,
"grad_norm": 0.5346982479095459,
"learning_rate": 1.015013875053211e-05,
"loss": 0.5571820735931396,
"step": 1205
},
{
"epoch": 1.5644768856447688,
"grad_norm": 0.5152148008346558,
"learning_rate": 1.013584074778615e-05,
"loss": 0.5197643041610718,
"step": 1206
},
{
"epoch": 1.5657745336577453,
"grad_norm": 0.5702791213989258,
"learning_rate": 1.0121542467280245e-05,
"loss": 0.6099081635475159,
"step": 1207
},
{
"epoch": 1.5670721816707218,
"grad_norm": 0.5424299836158752,
"learning_rate": 1.0107243938250755e-05,
"loss": 0.5385927557945251,
"step": 1208
},
{
"epoch": 1.5683698296836983,
"grad_norm": 0.5413081049919128,
"learning_rate": 1.0092945189934558e-05,
"loss": 0.6308001279830933,
"step": 1209
},
{
"epoch": 1.5696674776966748,
"grad_norm": 0.5650938749313354,
"learning_rate": 1.007864625156897e-05,
"loss": 0.656417965888977,
"step": 1210
},
{
"epoch": 1.5709651257096513,
"grad_norm": 0.5578048229217529,
"learning_rate": 1.0064347152391703e-05,
"loss": 0.5987565517425537,
"step": 1211
},
{
"epoch": 1.5722627737226276,
"grad_norm": 0.5425694584846497,
"learning_rate": 1.0050047921640797e-05,
"loss": 0.5794038772583008,
"step": 1212
},
{
"epoch": 1.5735604217356043,
"grad_norm": 0.5536248087882996,
"learning_rate": 1.003574858855456e-05,
"loss": 0.6126576066017151,
"step": 1213
},
{
"epoch": 1.5748580697485806,
"grad_norm": 0.5221614837646484,
"learning_rate": 1.0021449182371504e-05,
"loss": 0.5808907747268677,
"step": 1214
},
{
"epoch": 1.5761557177615573,
"grad_norm": 0.5314812660217285,
"learning_rate": 1.0007149732330299e-05,
"loss": 0.5740360021591187,
"step": 1215
},
{
"epoch": 1.5774533657745335,
"grad_norm": 0.556327223777771,
"learning_rate": 9.992850267669703e-06,
"loss": 0.6449018716812134,
"step": 1216
},
{
"epoch": 1.5787510137875103,
"grad_norm": 0.5447148680686951,
"learning_rate": 9.978550817628501e-06,
"loss": 0.5590343475341797,
"step": 1217
},
{
"epoch": 1.5800486618004865,
"grad_norm": 0.5570490956306458,
"learning_rate": 9.964251411445444e-06,
"loss": 0.6283855438232422,
"step": 1218
},
{
"epoch": 1.5813463098134632,
"grad_norm": 0.5475562214851379,
"learning_rate": 9.949952078359208e-06,
"loss": 0.6058873534202576,
"step": 1219
},
{
"epoch": 1.5826439578264395,
"grad_norm": 0.5271614789962769,
"learning_rate": 9.935652847608302e-06,
"loss": 0.6080070734024048,
"step": 1220
},
{
"epoch": 1.583941605839416,
"grad_norm": 0.5340768098831177,
"learning_rate": 9.921353748431036e-06,
"loss": 0.5789950489997864,
"step": 1221
},
{
"epoch": 1.5852392538523925,
"grad_norm": 0.5284969806671143,
"learning_rate": 9.907054810065446e-06,
"loss": 0.5514812469482422,
"step": 1222
},
{
"epoch": 1.586536901865369,
"grad_norm": 0.5400740504264832,
"learning_rate": 9.89275606174925e-06,
"loss": 0.5774392485618591,
"step": 1223
},
{
"epoch": 1.5878345498783455,
"grad_norm": 0.5264250040054321,
"learning_rate": 9.878457532719757e-06,
"loss": 0.5731384754180908,
"step": 1224
},
{
"epoch": 1.589132197891322,
"grad_norm": 0.5703708529472351,
"learning_rate": 9.864159252213852e-06,
"loss": 0.6473686695098877,
"step": 1225
},
{
"epoch": 1.5904298459042985,
"grad_norm": 0.5441808104515076,
"learning_rate": 9.849861249467893e-06,
"loss": 0.6381841897964478,
"step": 1226
},
{
"epoch": 1.591727493917275,
"grad_norm": 0.5486851930618286,
"learning_rate": 9.83556355371768e-06,
"loss": 0.613477349281311,
"step": 1227
},
{
"epoch": 1.5930251419302515,
"grad_norm": 0.5925759673118591,
"learning_rate": 9.821266194198375e-06,
"loss": 0.5966989994049072,
"step": 1228
},
{
"epoch": 1.5943227899432277,
"grad_norm": 0.503745436668396,
"learning_rate": 9.806969200144471e-06,
"loss": 0.5462368726730347,
"step": 1229
},
{
"epoch": 1.5956204379562045,
"grad_norm": 0.525786817073822,
"learning_rate": 9.79267260078969e-06,
"loss": 0.5990958213806152,
"step": 1230
},
{
"epoch": 1.5969180859691807,
"grad_norm": 0.5402313470840454,
"learning_rate": 9.778376425366967e-06,
"loss": 0.6069964170455933,
"step": 1231
},
{
"epoch": 1.5982157339821574,
"grad_norm": 0.566880464553833,
"learning_rate": 9.764080703108362e-06,
"loss": 0.6295340061187744,
"step": 1232
},
{
"epoch": 1.5995133819951337,
"grad_norm": 0.5545258522033691,
"learning_rate": 9.749785463245006e-06,
"loss": 0.6260232925415039,
"step": 1233
},
{
"epoch": 1.6008110300081104,
"grad_norm": 0.5898419618606567,
"learning_rate": 9.735490735007047e-06,
"loss": 0.6146451830863953,
"step": 1234
},
{
"epoch": 1.6021086780210867,
"grad_norm": 0.5249006748199463,
"learning_rate": 9.721196547623585e-06,
"loss": 0.6049670577049255,
"step": 1235
},
{
"epoch": 1.6034063260340634,
"grad_norm": 0.5289062857627869,
"learning_rate": 9.706902930322621e-06,
"loss": 0.6006771326065063,
"step": 1236
},
{
"epoch": 1.6047039740470397,
"grad_norm": 0.5482916235923767,
"learning_rate": 9.692609912330975e-06,
"loss": 0.621732771396637,
"step": 1237
},
{
"epoch": 1.6060016220600162,
"grad_norm": 0.5499362945556641,
"learning_rate": 9.67831752287426e-06,
"loss": 0.6316919922828674,
"step": 1238
},
{
"epoch": 1.6072992700729927,
"grad_norm": 0.5119637250900269,
"learning_rate": 9.66402579117679e-06,
"loss": 0.5918980240821838,
"step": 1239
},
{
"epoch": 1.6085969180859692,
"grad_norm": 0.5473806262016296,
"learning_rate": 9.649734746461544e-06,
"loss": 0.6354460716247559,
"step": 1240
},
{
"epoch": 1.6098945660989457,
"grad_norm": 0.5340628027915955,
"learning_rate": 9.635444417950083e-06,
"loss": 0.5693660378456116,
"step": 1241
},
{
"epoch": 1.6111922141119221,
"grad_norm": 0.5385611653327942,
"learning_rate": 9.62115483486252e-06,
"loss": 0.5467959642410278,
"step": 1242
},
{
"epoch": 1.6124898621248986,
"grad_norm": 0.5278156399726868,
"learning_rate": 9.606866026417431e-06,
"loss": 0.6024355888366699,
"step": 1243
},
{
"epoch": 1.6137875101378751,
"grad_norm": 0.5506213903427124,
"learning_rate": 9.592578021831817e-06,
"loss": 0.6594349145889282,
"step": 1244
},
{
"epoch": 1.6150851581508516,
"grad_norm": 0.5613592267036438,
"learning_rate": 9.578290850321023e-06,
"loss": 0.6147022247314453,
"step": 1245
},
{
"epoch": 1.616382806163828,
"grad_norm": 0.5302473306655884,
"learning_rate": 9.564004541098709e-06,
"loss": 0.5724552869796753,
"step": 1246
},
{
"epoch": 1.6176804541768046,
"grad_norm": 0.5463687777519226,
"learning_rate": 9.549719123376749e-06,
"loss": 0.6859567165374756,
"step": 1247
},
{
"epoch": 1.6189781021897809,
"grad_norm": 0.578063428401947,
"learning_rate": 9.535434626365221e-06,
"loss": 0.654534101486206,
"step": 1248
},
{
"epoch": 1.6202757502027576,
"grad_norm": 0.5842363238334656,
"learning_rate": 9.521151079272295e-06,
"loss": 0.6818944811820984,
"step": 1249
},
{
"epoch": 1.6215733982157339,
"grad_norm": 0.5462816953659058,
"learning_rate": 9.506868511304216e-06,
"loss": 0.5978901386260986,
"step": 1250
},
{
"epoch": 1.6228710462287106,
"grad_norm": 0.5496495962142944,
"learning_rate": 9.492586951665214e-06,
"loss": 0.6664569973945618,
"step": 1251
},
{
"epoch": 1.6241686942416869,
"grad_norm": 0.541262149810791,
"learning_rate": 9.47830642955747e-06,
"loss": 0.5771492719650269,
"step": 1252
},
{
"epoch": 1.6254663422546636,
"grad_norm": 0.5542916655540466,
"learning_rate": 9.464026974181035e-06,
"loss": 0.6377862095832825,
"step": 1253
},
{
"epoch": 1.6267639902676398,
"grad_norm": 0.5212349891662598,
"learning_rate": 9.44974861473378e-06,
"loss": 0.5878604650497437,
"step": 1254
},
{
"epoch": 1.6280616382806163,
"grad_norm": 0.5611302256584167,
"learning_rate": 9.435471380411335e-06,
"loss": 0.636326789855957,
"step": 1255
},
{
"epoch": 1.6293592862935928,
"grad_norm": 0.5258191227912903,
"learning_rate": 9.421195300407035e-06,
"loss": 0.5580926537513733,
"step": 1256
},
{
"epoch": 1.6306569343065693,
"grad_norm": 0.5298276543617249,
"learning_rate": 9.406920403911848e-06,
"loss": 0.6048216819763184,
"step": 1257
},
{
"epoch": 1.6319545823195458,
"grad_norm": 0.5328834056854248,
"learning_rate": 9.392646720114325e-06,
"loss": 0.6379623413085938,
"step": 1258
},
{
"epoch": 1.6332522303325223,
"grad_norm": 0.5315790176391602,
"learning_rate": 9.37837427820053e-06,
"loss": 0.6466155052185059,
"step": 1259
},
{
"epoch": 1.6345498783454988,
"grad_norm": 0.5353376269340515,
"learning_rate": 9.364103107354002e-06,
"loss": 0.5879526138305664,
"step": 1260
},
{
"epoch": 1.6358475263584753,
"grad_norm": 0.5551068186759949,
"learning_rate": 9.349833236755675e-06,
"loss": 0.5988892316818237,
"step": 1261
},
{
"epoch": 1.6371451743714518,
"grad_norm": 0.5331724286079407,
"learning_rate": 9.335564695583816e-06,
"loss": 0.5948902368545532,
"step": 1262
},
{
"epoch": 1.638442822384428,
"grad_norm": 0.54310542345047,
"learning_rate": 9.321297513013987e-06,
"loss": 0.6055219769477844,
"step": 1263
},
{
"epoch": 1.6397404703974048,
"grad_norm": 0.5368586182594299,
"learning_rate": 9.307031718218956e-06,
"loss": 0.6035459637641907,
"step": 1264
},
{
"epoch": 1.641038118410381,
"grad_norm": 0.5460159182548523,
"learning_rate": 9.292767340368672e-06,
"loss": 0.6447773575782776,
"step": 1265
},
{
"epoch": 1.6423357664233578,
"grad_norm": 0.5599712133407593,
"learning_rate": 9.278504408630171e-06,
"loss": 0.6332420110702515,
"step": 1266
},
{
"epoch": 1.643633414436334,
"grad_norm": 0.5388185977935791,
"learning_rate": 9.264242952167544e-06,
"loss": 0.6116797924041748,
"step": 1267
},
{
"epoch": 1.6449310624493108,
"grad_norm": 0.5109002590179443,
"learning_rate": 9.24998300014185e-06,
"loss": 0.628926694393158,
"step": 1268
},
{
"epoch": 1.646228710462287,
"grad_norm": 0.5572671890258789,
"learning_rate": 9.235724581711096e-06,
"loss": 0.5795090794563293,
"step": 1269
},
{
"epoch": 1.6475263584752637,
"grad_norm": 0.777040421962738,
"learning_rate": 9.221467726030126e-06,
"loss": 0.644891083240509,
"step": 1270
},
{
"epoch": 1.64882400648824,
"grad_norm": 0.5158191919326782,
"learning_rate": 9.207212462250611e-06,
"loss": 0.5630925893783569,
"step": 1271
},
{
"epoch": 1.6501216545012165,
"grad_norm": 0.5111160278320312,
"learning_rate": 9.192958819520948e-06,
"loss": 0.5322938561439514,
"step": 1272
},
{
"epoch": 1.651419302514193,
"grad_norm": 0.5043333768844604,
"learning_rate": 9.178706826986236e-06,
"loss": 0.5961562395095825,
"step": 1273
},
{
"epoch": 1.6527169505271695,
"grad_norm": 0.5496838092803955,
"learning_rate": 9.164456513788186e-06,
"loss": 0.6005456447601318,
"step": 1274
},
{
"epoch": 1.654014598540146,
"grad_norm": 0.5577642321586609,
"learning_rate": 9.150207909065093e-06,
"loss": 0.6366305351257324,
"step": 1275
},
{
"epoch": 1.6553122465531225,
"grad_norm": 0.5257747769355774,
"learning_rate": 9.135961041951735e-06,
"loss": 0.5669390559196472,
"step": 1276
},
{
"epoch": 1.656609894566099,
"grad_norm": 0.5349394083023071,
"learning_rate": 9.121715941579358e-06,
"loss": 0.5594930052757263,
"step": 1277
},
{
"epoch": 1.6579075425790755,
"grad_norm": 0.5282658338546753,
"learning_rate": 9.107472637075578e-06,
"loss": 0.6159694194793701,
"step": 1278
},
{
"epoch": 1.659205190592052,
"grad_norm": 0.5608229637145996,
"learning_rate": 9.093231157564357e-06,
"loss": 0.6022686958312988,
"step": 1279
},
{
"epoch": 1.6605028386050282,
"grad_norm": 0.5175761580467224,
"learning_rate": 9.078991532165911e-06,
"loss": 0.5850685834884644,
"step": 1280
},
{
"epoch": 1.661800486618005,
"grad_norm": 0.5338742733001709,
"learning_rate": 9.06475378999667e-06,
"loss": 0.5943388938903809,
"step": 1281
},
{
"epoch": 1.6630981346309812,
"grad_norm": 0.5751469135284424,
"learning_rate": 9.050517960169211e-06,
"loss": 0.6381434798240662,
"step": 1282
},
{
"epoch": 1.664395782643958,
"grad_norm": 0.5597715377807617,
"learning_rate": 9.036284071792212e-06,
"loss": 0.6742138862609863,
"step": 1283
},
{
"epoch": 1.6656934306569342,
"grad_norm": 0.5457910895347595,
"learning_rate": 9.022052153970361e-06,
"loss": 0.6068155169487,
"step": 1284
},
{
"epoch": 1.666991078669911,
"grad_norm": 0.5507814884185791,
"learning_rate": 9.007822235804334e-06,
"loss": 0.6176409125328064,
"step": 1285
},
{
"epoch": 1.6682887266828872,
"grad_norm": 0.5373377203941345,
"learning_rate": 8.993594346390709e-06,
"loss": 0.5884984731674194,
"step": 1286
},
{
"epoch": 1.669586374695864,
"grad_norm": 0.523912787437439,
"learning_rate": 8.979368514821917e-06,
"loss": 0.5794025659561157,
"step": 1287
},
{
"epoch": 1.6708840227088402,
"grad_norm": 0.5313317179679871,
"learning_rate": 8.965144770186192e-06,
"loss": 0.6304433345794678,
"step": 1288
},
{
"epoch": 1.6721816707218167,
"grad_norm": 0.5308225154876709,
"learning_rate": 8.950923141567482e-06,
"loss": 0.5822694301605225,
"step": 1289
},
{
"epoch": 1.6734793187347932,
"grad_norm": 0.5657337307929993,
"learning_rate": 8.936703658045426e-06,
"loss": 0.7206499576568604,
"step": 1290
},
{
"epoch": 1.6747769667477697,
"grad_norm": 0.5842191576957703,
"learning_rate": 8.92248634869526e-06,
"loss": 0.6483322381973267,
"step": 1291
},
{
"epoch": 1.6760746147607462,
"grad_norm": 0.5084115266799927,
"learning_rate": 8.90827124258779e-06,
"loss": 0.60451340675354,
"step": 1292
},
{
"epoch": 1.6773722627737226,
"grad_norm": 0.5080921053886414,
"learning_rate": 8.894058368789308e-06,
"loss": 0.5007386803627014,
"step": 1293
},
{
"epoch": 1.6786699107866991,
"grad_norm": 0.5186359286308289,
"learning_rate": 8.879847756361544e-06,
"loss": 0.5846607685089111,
"step": 1294
},
{
"epoch": 1.6799675587996756,
"grad_norm": 0.5321721434593201,
"learning_rate": 8.8656394343616e-06,
"loss": 0.5854955315589905,
"step": 1295
},
{
"epoch": 1.6812652068126521,
"grad_norm": 0.5577939748764038,
"learning_rate": 8.851433431841904e-06,
"loss": 0.6218785643577576,
"step": 1296
},
{
"epoch": 1.6825628548256284,
"grad_norm": 0.5574389696121216,
"learning_rate": 8.837229777850129e-06,
"loss": 0.639427661895752,
"step": 1297
},
{
"epoch": 1.683860502838605,
"grad_norm": 0.5620577335357666,
"learning_rate": 8.823028501429161e-06,
"loss": 0.6334304809570312,
"step": 1298
},
{
"epoch": 1.6851581508515814,
"grad_norm": 0.5603854656219482,
"learning_rate": 8.808829631617009e-06,
"loss": 0.5796216726303101,
"step": 1299
},
{
"epoch": 1.686455798864558,
"grad_norm": 0.5886275172233582,
"learning_rate": 8.79463319744677e-06,
"loss": 0.6645929217338562,
"step": 1300
},
{
"epoch": 1.6877534468775344,
"grad_norm": 0.5587744116783142,
"learning_rate": 8.78043922794656e-06,
"loss": 0.6387877464294434,
"step": 1301
},
{
"epoch": 1.689051094890511,
"grad_norm": 0.5619886517524719,
"learning_rate": 8.766247752139453e-06,
"loss": 0.658257007598877,
"step": 1302
},
{
"epoch": 1.6903487429034874,
"grad_norm": 0.5658282041549683,
"learning_rate": 8.752058799043422e-06,
"loss": 0.6349663734436035,
"step": 1303
},
{
"epoch": 1.691646390916464,
"grad_norm": 0.5596343874931335,
"learning_rate": 8.737872397671293e-06,
"loss": 0.5926494002342224,
"step": 1304
},
{
"epoch": 1.6929440389294403,
"grad_norm": 0.5565075874328613,
"learning_rate": 8.723688577030655e-06,
"loss": 0.6093648672103882,
"step": 1305
},
{
"epoch": 1.6942416869424168,
"grad_norm": 0.5608682036399841,
"learning_rate": 8.709507366123841e-06,
"loss": 0.6120996475219727,
"step": 1306
},
{
"epoch": 1.6955393349553933,
"grad_norm": 0.5365821719169617,
"learning_rate": 8.695328793947833e-06,
"loss": 0.5509933233261108,
"step": 1307
},
{
"epoch": 1.6968369829683698,
"grad_norm": 0.537822961807251,
"learning_rate": 8.681152889494227e-06,
"loss": 0.6313689947128296,
"step": 1308
},
{
"epoch": 1.6981346309813463,
"grad_norm": 0.5853676199913025,
"learning_rate": 8.66697968174915e-06,
"loss": 0.6015232801437378,
"step": 1309
},
{
"epoch": 1.6994322789943228,
"grad_norm": 0.5395903587341309,
"learning_rate": 8.652809199693236e-06,
"loss": 0.5783022046089172,
"step": 1310
},
{
"epoch": 1.7007299270072993,
"grad_norm": 0.5408870577812195,
"learning_rate": 8.638641472301524e-06,
"loss": 0.6224579215049744,
"step": 1311
},
{
"epoch": 1.7020275750202758,
"grad_norm": 0.5533918142318726,
"learning_rate": 8.624476528543439e-06,
"loss": 0.6317031383514404,
"step": 1312
},
{
"epoch": 1.7033252230332523,
"grad_norm": 0.577556848526001,
"learning_rate": 8.610314397382701e-06,
"loss": 0.6522644758224487,
"step": 1313
},
{
"epoch": 1.7046228710462286,
"grad_norm": 0.5453810095787048,
"learning_rate": 8.596155107777288e-06,
"loss": 0.6072216629981995,
"step": 1314
},
{
"epoch": 1.7059205190592053,
"grad_norm": 0.5380662679672241,
"learning_rate": 8.581998688679356e-06,
"loss": 0.6069589853286743,
"step": 1315
},
{
"epoch": 1.7072181670721815,
"grad_norm": 0.5374992489814758,
"learning_rate": 8.567845169035205e-06,
"loss": 0.6239044070243835,
"step": 1316
},
{
"epoch": 1.7085158150851583,
"grad_norm": 0.5366406440734863,
"learning_rate": 8.553694577785201e-06,
"loss": 0.5901238322257996,
"step": 1317
},
{
"epoch": 1.7098134630981345,
"grad_norm": 0.5510634779930115,
"learning_rate": 8.539546943863717e-06,
"loss": 0.6066378355026245,
"step": 1318
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.5579630732536316,
"learning_rate": 8.525402296199089e-06,
"loss": 0.6439074873924255,
"step": 1319
},
{
"epoch": 1.7124087591240875,
"grad_norm": 0.5268120765686035,
"learning_rate": 8.511260663713537e-06,
"loss": 0.5521663427352905,
"step": 1320
},
{
"epoch": 1.7137064071370642,
"grad_norm": 0.5076732635498047,
"learning_rate": 8.497122075323122e-06,
"loss": 0.5523797273635864,
"step": 1321
},
{
"epoch": 1.7150040551500405,
"grad_norm": 0.5172733068466187,
"learning_rate": 8.482986559937676e-06,
"loss": 0.6011000275611877,
"step": 1322
},
{
"epoch": 1.716301703163017,
"grad_norm": 0.5152168869972229,
"learning_rate": 8.468854146460754e-06,
"loss": 0.5801671743392944,
"step": 1323
},
{
"epoch": 1.7175993511759935,
"grad_norm": 0.5168895721435547,
"learning_rate": 8.45472486378956e-06,
"loss": 0.6005280613899231,
"step": 1324
},
{
"epoch": 1.71889699918897,
"grad_norm": 0.571263313293457,
"learning_rate": 8.440598740814909e-06,
"loss": 0.6543586850166321,
"step": 1325
},
{
"epoch": 1.7201946472019465,
"grad_norm": 0.5240177512168884,
"learning_rate": 8.426475806421139e-06,
"loss": 0.613470196723938,
"step": 1326
},
{
"epoch": 1.721492295214923,
"grad_norm": 0.5217388272285461,
"learning_rate": 8.412356089486082e-06,
"loss": 0.5799127817153931,
"step": 1327
},
{
"epoch": 1.7227899432278995,
"grad_norm": 0.5473462343215942,
"learning_rate": 8.39823961888098e-06,
"loss": 0.6159072518348694,
"step": 1328
},
{
"epoch": 1.724087591240876,
"grad_norm": 0.9222651124000549,
"learning_rate": 8.384126423470447e-06,
"loss": 0.6260055303573608,
"step": 1329
},
{
"epoch": 1.7253852392538525,
"grad_norm": 0.5530563592910767,
"learning_rate": 8.37001653211239e-06,
"loss": 0.5505119562149048,
"step": 1330
},
{
"epoch": 1.7266828872668287,
"grad_norm": 0.5369389653205872,
"learning_rate": 8.355909973657975e-06,
"loss": 0.6139888763427734,
"step": 1331
},
{
"epoch": 1.7279805352798054,
"grad_norm": 0.5347586870193481,
"learning_rate": 8.341806776951532e-06,
"loss": 0.6265066862106323,
"step": 1332
},
{
"epoch": 1.7292781832927817,
"grad_norm": 0.545946478843689,
"learning_rate": 8.327706970830537e-06,
"loss": 0.6024926900863647,
"step": 1333
},
{
"epoch": 1.7305758313057584,
"grad_norm": 0.5450059771537781,
"learning_rate": 8.313610584125523e-06,
"loss": 0.658405065536499,
"step": 1334
},
{
"epoch": 1.7318734793187347,
"grad_norm": 0.5516889691352844,
"learning_rate": 8.299517645660033e-06,
"loss": 0.5770267248153687,
"step": 1335
},
{
"epoch": 1.7331711273317114,
"grad_norm": 0.557074785232544,
"learning_rate": 8.285428184250554e-06,
"loss": 0.5421329736709595,
"step": 1336
},
{
"epoch": 1.7344687753446877,
"grad_norm": 0.543565571308136,
"learning_rate": 8.271342228706478e-06,
"loss": 0.6527873277664185,
"step": 1337
},
{
"epoch": 1.7357664233576642,
"grad_norm": 0.49616673588752747,
"learning_rate": 8.257259807830009e-06,
"loss": 0.5355008840560913,
"step": 1338
},
{
"epoch": 1.7370640713706407,
"grad_norm": 0.5389429330825806,
"learning_rate": 8.243180950416142e-06,
"loss": 0.6072633862495422,
"step": 1339
},
{
"epoch": 1.7383617193836172,
"grad_norm": 0.542195737361908,
"learning_rate": 8.22910568525257e-06,
"loss": 0.5909712314605713,
"step": 1340
},
{
"epoch": 1.7396593673965937,
"grad_norm": 0.5480629205703735,
"learning_rate": 8.215034041119655e-06,
"loss": 0.5966728925704956,
"step": 1341
},
{
"epoch": 1.7409570154095702,
"grad_norm": 0.5179266929626465,
"learning_rate": 8.200966046790339e-06,
"loss": 0.608291745185852,
"step": 1342
},
{
"epoch": 1.7422546634225466,
"grad_norm": 0.525390625,
"learning_rate": 8.186901731030117e-06,
"loss": 0.6019555330276489,
"step": 1343
},
{
"epoch": 1.7435523114355231,
"grad_norm": 0.5716756582260132,
"learning_rate": 8.172841122596951e-06,
"loss": 0.6858773827552795,
"step": 1344
},
{
"epoch": 1.7448499594484996,
"grad_norm": 0.53510981798172,
"learning_rate": 8.158784250241226e-06,
"loss": 0.6193398833274841,
"step": 1345
},
{
"epoch": 1.7461476074614761,
"grad_norm": 0.509371280670166,
"learning_rate": 8.144731142705693e-06,
"loss": 0.5310204029083252,
"step": 1346
},
{
"epoch": 1.7474452554744526,
"grad_norm": 0.520005464553833,
"learning_rate": 8.130681828725394e-06,
"loss": 0.5864765644073486,
"step": 1347
},
{
"epoch": 1.748742903487429,
"grad_norm": 0.530784010887146,
"learning_rate": 8.116636337027626e-06,
"loss": 0.5898761749267578,
"step": 1348
},
{
"epoch": 1.7500405515004056,
"grad_norm": 0.528357982635498,
"learning_rate": 8.10259469633186e-06,
"loss": 0.611457347869873,
"step": 1349
},
{
"epoch": 1.7513381995133819,
"grad_norm": 0.5243317484855652,
"learning_rate": 8.0885569353497e-06,
"loss": 0.5851372480392456,
"step": 1350
},
{
"epoch": 1.7526358475263586,
"grad_norm": 0.5656478404998779,
"learning_rate": 8.07452308278481e-06,
"loss": 0.6243469715118408,
"step": 1351
},
{
"epoch": 1.7539334955393349,
"grad_norm": 0.5173115134239197,
"learning_rate": 8.060493167332874e-06,
"loss": 0.5658408403396606,
"step": 1352
},
{
"epoch": 1.7552311435523116,
"grad_norm": 0.5283849835395813,
"learning_rate": 8.04646721768151e-06,
"loss": 0.6133898496627808,
"step": 1353
},
{
"epoch": 1.7565287915652879,
"grad_norm": 0.5533227324485779,
"learning_rate": 8.032445262510241e-06,
"loss": 0.6251792907714844,
"step": 1354
},
{
"epoch": 1.7578264395782643,
"grad_norm": 0.5281651020050049,
"learning_rate": 8.018427330490411e-06,
"loss": 0.5514408349990845,
"step": 1355
},
{
"epoch": 1.7591240875912408,
"grad_norm": 0.5382410883903503,
"learning_rate": 8.004413450285147e-06,
"loss": 0.6591918468475342,
"step": 1356
},
{
"epoch": 1.7604217356042173,
"grad_norm": 0.566716194152832,
"learning_rate": 7.990403650549285e-06,
"loss": 0.6281836628913879,
"step": 1357
},
{
"epoch": 1.7617193836171938,
"grad_norm": 0.5423158407211304,
"learning_rate": 7.976397959929324e-06,
"loss": 0.5953754782676697,
"step": 1358
},
{
"epoch": 1.7630170316301703,
"grad_norm": 0.5327609181404114,
"learning_rate": 7.962396407063346e-06,
"loss": 0.6248747110366821,
"step": 1359
},
{
"epoch": 1.7643146796431468,
"grad_norm": 0.5314010381698608,
"learning_rate": 7.948399020580995e-06,
"loss": 0.5661095380783081,
"step": 1360
},
{
"epoch": 1.7656123276561233,
"grad_norm": 0.5650714039802551,
"learning_rate": 7.934405829103376e-06,
"loss": 0.6127238869667053,
"step": 1361
},
{
"epoch": 1.7669099756690998,
"grad_norm": 0.546101987361908,
"learning_rate": 7.920416861243028e-06,
"loss": 0.5874890089035034,
"step": 1362
},
{
"epoch": 1.7682076236820763,
"grad_norm": 0.5429707169532776,
"learning_rate": 7.906432145603844e-06,
"loss": 0.6140427589416504,
"step": 1363
},
{
"epoch": 1.7695052716950528,
"grad_norm": 0.5710042715072632,
"learning_rate": 7.892451710781035e-06,
"loss": 0.612266480922699,
"step": 1364
},
{
"epoch": 1.770802919708029,
"grad_norm": 0.55032879114151,
"learning_rate": 7.878475585361045e-06,
"loss": 0.6138355135917664,
"step": 1365
},
{
"epoch": 1.7721005677210058,
"grad_norm": 0.5812238454818726,
"learning_rate": 7.864503797921518e-06,
"loss": 0.6380466818809509,
"step": 1366
},
{
"epoch": 1.773398215733982,
"grad_norm": 0.5375271439552307,
"learning_rate": 7.850536377031221e-06,
"loss": 0.6307961344718933,
"step": 1367
},
{
"epoch": 1.7746958637469588,
"grad_norm": 0.5584734082221985,
"learning_rate": 7.836573351249996e-06,
"loss": 0.6312189698219299,
"step": 1368
},
{
"epoch": 1.775993511759935,
"grad_norm": 0.5133419036865234,
"learning_rate": 7.822614749128692e-06,
"loss": 0.5199952125549316,
"step": 1369
},
{
"epoch": 1.7772911597729117,
"grad_norm": 0.5400519371032715,
"learning_rate": 7.808660599209124e-06,
"loss": 0.630193829536438,
"step": 1370
},
{
"epoch": 1.778588807785888,
"grad_norm": 0.5627943277359009,
"learning_rate": 7.794710930023993e-06,
"loss": 0.6233404874801636,
"step": 1371
},
{
"epoch": 1.7798864557988645,
"grad_norm": 0.510907769203186,
"learning_rate": 7.78076577009684e-06,
"loss": 0.5262112021446228,
"step": 1372
},
{
"epoch": 1.781184103811841,
"grad_norm": 0.5093023777008057,
"learning_rate": 7.76682514794199e-06,
"loss": 0.5871707201004028,
"step": 1373
},
{
"epoch": 1.7824817518248175,
"grad_norm": 0.5214765667915344,
"learning_rate": 7.752889092064484e-06,
"loss": 0.5635697841644287,
"step": 1374
},
{
"epoch": 1.783779399837794,
"grad_norm": 0.5440617799758911,
"learning_rate": 7.738957630960037e-06,
"loss": 0.5805234909057617,
"step": 1375
},
{
"epoch": 1.7850770478507705,
"grad_norm": 0.5365013480186462,
"learning_rate": 7.725030793114952e-06,
"loss": 0.615504801273346,
"step": 1376
},
{
"epoch": 1.786374695863747,
"grad_norm": 0.5464739203453064,
"learning_rate": 7.711108607006094e-06,
"loss": 0.6203770637512207,
"step": 1377
},
{
"epoch": 1.7876723438767235,
"grad_norm": 0.5313665866851807,
"learning_rate": 7.697191101100802e-06,
"loss": 0.6234644055366516,
"step": 1378
},
{
"epoch": 1.7889699918897,
"grad_norm": 0.5652154684066772,
"learning_rate": 7.683278303856862e-06,
"loss": 0.6404775977134705,
"step": 1379
},
{
"epoch": 1.7902676399026762,
"grad_norm": 0.5399373769760132,
"learning_rate": 7.669370243722415e-06,
"loss": 0.6136540770530701,
"step": 1380
},
{
"epoch": 1.7902676399026762,
"eval_loss": 0.6770720481872559,
"eval_runtime": 72.4181,
"eval_samples_per_second": 71.695,
"eval_steps_per_second": 8.962,
"step": 1380
},
{
"epoch": 1.791565287915653,
"grad_norm": 0.5250906944274902,
"learning_rate": 7.655466949135932e-06,
"loss": 0.6147629022598267,
"step": 1381
},
{
"epoch": 1.7928629359286292,
"grad_norm": 0.5089812278747559,
"learning_rate": 7.641568448526122e-06,
"loss": 0.5584423542022705,
"step": 1382
},
{
"epoch": 1.794160583941606,
"grad_norm": 0.53523850440979,
"learning_rate": 7.627674770311909e-06,
"loss": 0.5899471640586853,
"step": 1383
},
{
"epoch": 1.7954582319545822,
"grad_norm": 0.5330705642700195,
"learning_rate": 7.613785942902343e-06,
"loss": 0.6054921746253967,
"step": 1384
},
{
"epoch": 1.796755879967559,
"grad_norm": 0.514224648475647,
"learning_rate": 7.599901994696566e-06,
"loss": 0.57494056224823,
"step": 1385
},
{
"epoch": 1.7980535279805352,
"grad_norm": 0.5187469124794006,
"learning_rate": 7.586022954083731e-06,
"loss": 0.5410253405570984,
"step": 1386
},
{
"epoch": 1.799351175993512,
"grad_norm": 0.5295100808143616,
"learning_rate": 7.572148849442971e-06,
"loss": 0.5727859139442444,
"step": 1387
},
{
"epoch": 1.8006488240064882,
"grad_norm": 0.5229355692863464,
"learning_rate": 7.5582797091433105e-06,
"loss": 0.5822583436965942,
"step": 1388
},
{
"epoch": 1.8019464720194647,
"grad_norm": 0.5615860223770142,
"learning_rate": 7.544415561543639e-06,
"loss": 0.6505988836288452,
"step": 1389
},
{
"epoch": 1.8032441200324412,
"grad_norm": 0.538707971572876,
"learning_rate": 7.5305564349926215e-06,
"loss": 0.5953875184059143,
"step": 1390
},
{
"epoch": 1.8045417680454177,
"grad_norm": 0.5197842717170715,
"learning_rate": 7.516702357828672e-06,
"loss": 0.61934494972229,
"step": 1391
},
{
"epoch": 1.8058394160583942,
"grad_norm": 0.49861758947372437,
"learning_rate": 7.502853358379865e-06,
"loss": 0.5522242784500122,
"step": 1392
},
{
"epoch": 1.8071370640713706,
"grad_norm": 0.5618783235549927,
"learning_rate": 7.489009464963903e-06,
"loss": 0.6682146787643433,
"step": 1393
},
{
"epoch": 1.8084347120843471,
"grad_norm": 0.9511061906814575,
"learning_rate": 7.475170705888042e-06,
"loss": 0.5893583297729492,
"step": 1394
},
{
"epoch": 1.8097323600973236,
"grad_norm": 0.6068239808082581,
"learning_rate": 7.461337109449045e-06,
"loss": 0.6168926954269409,
"step": 1395
},
{
"epoch": 1.8110300081103001,
"grad_norm": 0.517159640789032,
"learning_rate": 7.447508703933109e-06,
"loss": 0.5870746374130249,
"step": 1396
},
{
"epoch": 1.8123276561232764,
"grad_norm": 0.5260257720947266,
"learning_rate": 7.433685517615831e-06,
"loss": 0.6144825220108032,
"step": 1397
},
{
"epoch": 1.8136253041362531,
"grad_norm": 0.4919078052043915,
"learning_rate": 7.4198675787621185e-06,
"loss": 0.6141817569732666,
"step": 1398
},
{
"epoch": 1.8149229521492294,
"grad_norm": 0.5349772572517395,
"learning_rate": 7.406054915626172e-06,
"loss": 0.5727092027664185,
"step": 1399
},
{
"epoch": 1.816220600162206,
"grad_norm": 0.5762760639190674,
"learning_rate": 7.392247556451382e-06,
"loss": 0.647359311580658,
"step": 1400
},
{
"epoch": 1.8175182481751824,
"grad_norm": 0.5478885769844055,
"learning_rate": 7.378445529470303e-06,
"loss": 0.6371256113052368,
"step": 1401
},
{
"epoch": 1.818815896188159,
"grad_norm": 0.5577658414840698,
"learning_rate": 7.364648862904593e-06,
"loss": 0.6552213430404663,
"step": 1402
},
{
"epoch": 1.8201135442011354,
"grad_norm": 0.5350478887557983,
"learning_rate": 7.35085758496494e-06,
"loss": 0.5756250023841858,
"step": 1403
},
{
"epoch": 1.821411192214112,
"grad_norm": 0.5247483849525452,
"learning_rate": 7.337071723851018e-06,
"loss": 0.5872269868850708,
"step": 1404
},
{
"epoch": 1.8227088402270883,
"grad_norm": 0.5715752840042114,
"learning_rate": 7.323291307751418e-06,
"loss": 0.6395775079727173,
"step": 1405
},
{
"epoch": 1.8240064882400648,
"grad_norm": 0.5355315208435059,
"learning_rate": 7.3095163648436115e-06,
"loss": 0.5502926707267761,
"step": 1406
},
{
"epoch": 1.8253041362530413,
"grad_norm": 0.5468769073486328,
"learning_rate": 7.295746923293865e-06,
"loss": 0.6266253590583801,
"step": 1407
},
{
"epoch": 1.8266017842660178,
"grad_norm": 0.5183525681495667,
"learning_rate": 7.2819830112572035e-06,
"loss": 0.5890312194824219,
"step": 1408
},
{
"epoch": 1.8278994322789943,
"grad_norm": 0.5416871905326843,
"learning_rate": 7.268224656877339e-06,
"loss": 0.6163492798805237,
"step": 1409
},
{
"epoch": 1.8291970802919708,
"grad_norm": 0.5376898646354675,
"learning_rate": 7.25447188828663e-06,
"loss": 0.6440437436103821,
"step": 1410
},
{
"epoch": 1.8304947283049473,
"grad_norm": 0.5264099836349487,
"learning_rate": 7.240724733606002e-06,
"loss": 0.6445986032485962,
"step": 1411
},
{
"epoch": 1.8317923763179238,
"grad_norm": 0.5397512912750244,
"learning_rate": 7.2269832209449145e-06,
"loss": 0.5767061710357666,
"step": 1412
},
{
"epoch": 1.8330900243309003,
"grad_norm": 0.5331466794013977,
"learning_rate": 7.213247378401274e-06,
"loss": 0.6515385508537292,
"step": 1413
},
{
"epoch": 1.8343876723438766,
"grad_norm": 0.5380875468254089,
"learning_rate": 7.199517234061408e-06,
"loss": 0.5956803560256958,
"step": 1414
},
{
"epoch": 1.8356853203568533,
"grad_norm": 0.5553707480430603,
"learning_rate": 7.1857928159999814e-06,
"loss": 0.5990528464317322,
"step": 1415
},
{
"epoch": 1.8369829683698295,
"grad_norm": 0.5348111391067505,
"learning_rate": 7.172074152279963e-06,
"loss": 0.5816199779510498,
"step": 1416
},
{
"epoch": 1.8382806163828063,
"grad_norm": 0.63777756690979,
"learning_rate": 7.1583612709525405e-06,
"loss": 0.6647042036056519,
"step": 1417
},
{
"epoch": 1.8395782643957825,
"grad_norm": 0.5394327640533447,
"learning_rate": 7.14465420005709e-06,
"loss": 0.629410982131958,
"step": 1418
},
{
"epoch": 1.8408759124087593,
"grad_norm": 0.5467361807823181,
"learning_rate": 7.130952967621096e-06,
"loss": 0.5931155681610107,
"step": 1419
},
{
"epoch": 1.8421735604217355,
"grad_norm": 0.5642380714416504,
"learning_rate": 7.11725760166012e-06,
"loss": 0.59910649061203,
"step": 1420
},
{
"epoch": 1.8434712084347122,
"grad_norm": 0.5448968410491943,
"learning_rate": 7.103568130177713e-06,
"loss": 0.5758746862411499,
"step": 1421
},
{
"epoch": 1.8447688564476885,
"grad_norm": 0.5109772682189941,
"learning_rate": 7.089884581165382e-06,
"loss": 0.5374370217323303,
"step": 1422
},
{
"epoch": 1.846066504460665,
"grad_norm": 0.5496018528938293,
"learning_rate": 7.076206982602516e-06,
"loss": 0.6080317497253418,
"step": 1423
},
{
"epoch": 1.8473641524736415,
"grad_norm": 0.5525946021080017,
"learning_rate": 7.06253536245635e-06,
"loss": 0.6326315402984619,
"step": 1424
},
{
"epoch": 1.848661800486618,
"grad_norm": 0.5555429458618164,
"learning_rate": 7.048869748681879e-06,
"loss": 0.6499879360198975,
"step": 1425
},
{
"epoch": 1.8499594484995945,
"grad_norm": 0.5364986062049866,
"learning_rate": 7.035210169221834e-06,
"loss": 0.6402702331542969,
"step": 1426
},
{
"epoch": 1.851257096512571,
"grad_norm": 0.5398283004760742,
"learning_rate": 7.021556652006588e-06,
"loss": 0.636422872543335,
"step": 1427
},
{
"epoch": 1.8525547445255475,
"grad_norm": 0.5333319306373596,
"learning_rate": 7.007909224954135e-06,
"loss": 0.6210685968399048,
"step": 1428
},
{
"epoch": 1.853852392538524,
"grad_norm": 0.5136668086051941,
"learning_rate": 6.994267915970003e-06,
"loss": 0.5984174013137817,
"step": 1429
},
{
"epoch": 1.8551500405515005,
"grad_norm": 0.5352861285209656,
"learning_rate": 6.980632752947221e-06,
"loss": 0.6331675052642822,
"step": 1430
},
{
"epoch": 1.8564476885644767,
"grad_norm": 0.5386180281639099,
"learning_rate": 6.967003763766247e-06,
"loss": 0.599821925163269,
"step": 1431
},
{
"epoch": 1.8577453365774534,
"grad_norm": 0.5548969507217407,
"learning_rate": 6.953380976294907e-06,
"loss": 0.6447435617446899,
"step": 1432
},
{
"epoch": 1.8590429845904297,
"grad_norm": 0.5061814188957214,
"learning_rate": 6.9397644183883616e-06,
"loss": 0.6045181751251221,
"step": 1433
},
{
"epoch": 1.8603406326034064,
"grad_norm": 0.49961408972740173,
"learning_rate": 6.926154117889022e-06,
"loss": 0.5710508823394775,
"step": 1434
},
{
"epoch": 1.8616382806163827,
"grad_norm": 0.5761319398880005,
"learning_rate": 6.91255010262651e-06,
"loss": 0.6047182679176331,
"step": 1435
},
{
"epoch": 1.8629359286293594,
"grad_norm": 0.5302688479423523,
"learning_rate": 6.898952400417587e-06,
"loss": 0.5881869792938232,
"step": 1436
},
{
"epoch": 1.8642335766423357,
"grad_norm": 0.567452609539032,
"learning_rate": 6.885361039066121e-06,
"loss": 0.6580846905708313,
"step": 1437
},
{
"epoch": 1.8655312246553124,
"grad_norm": 0.5567494034767151,
"learning_rate": 6.8717760463629965e-06,
"loss": 0.6213802099227905,
"step": 1438
},
{
"epoch": 1.8668288726682887,
"grad_norm": 0.535961925983429,
"learning_rate": 6.858197450086097e-06,
"loss": 0.6174903512001038,
"step": 1439
},
{
"epoch": 1.8681265206812652,
"grad_norm": 0.5607694387435913,
"learning_rate": 6.844625278000205e-06,
"loss": 0.658057451248169,
"step": 1440
},
{
"epoch": 1.8694241686942417,
"grad_norm": 0.5164813995361328,
"learning_rate": 6.831059557856984e-06,
"loss": 0.6188488602638245,
"step": 1441
},
{
"epoch": 1.8707218167072182,
"grad_norm": 0.5046887397766113,
"learning_rate": 6.81750031739489e-06,
"loss": 0.5495269298553467,
"step": 1442
},
{
"epoch": 1.8720194647201946,
"grad_norm": 0.5218680500984192,
"learning_rate": 6.803947584339148e-06,
"loss": 0.5858875513076782,
"step": 1443
},
{
"epoch": 1.8733171127331711,
"grad_norm": 0.5279871225357056,
"learning_rate": 6.79040138640166e-06,
"loss": 0.5829395055770874,
"step": 1444
},
{
"epoch": 1.8746147607461476,
"grad_norm": 0.5364516377449036,
"learning_rate": 6.7768617512809745e-06,
"loss": 0.6135284900665283,
"step": 1445
},
{
"epoch": 1.8759124087591241,
"grad_norm": 0.5465746521949768,
"learning_rate": 6.763328706662214e-06,
"loss": 0.5970785617828369,
"step": 1446
},
{
"epoch": 1.8772100567721006,
"grad_norm": 0.5328618288040161,
"learning_rate": 6.749802280217037e-06,
"loss": 0.6004316806793213,
"step": 1447
},
{
"epoch": 1.878507704785077,
"grad_norm": 0.5282012224197388,
"learning_rate": 6.7362824996035545e-06,
"loss": 0.5903221368789673,
"step": 1448
},
{
"epoch": 1.8798053527980536,
"grad_norm": 0.5416566133499146,
"learning_rate": 6.722769392466304e-06,
"loss": 0.624277651309967,
"step": 1449
},
{
"epoch": 1.8811030008110299,
"grad_norm": 0.5569058060646057,
"learning_rate": 6.709262986436162e-06,
"loss": 0.6214337348937988,
"step": 1450
},
{
"epoch": 1.8824006488240066,
"grad_norm": 0.5567551255226135,
"learning_rate": 6.695763309130318e-06,
"loss": 0.5963641405105591,
"step": 1451
},
{
"epoch": 1.8836982968369829,
"grad_norm": 0.5245199203491211,
"learning_rate": 6.682270388152185e-06,
"loss": 0.5722153186798096,
"step": 1452
},
{
"epoch": 1.8849959448499596,
"grad_norm": 0.5476487874984741,
"learning_rate": 6.668784251091381e-06,
"loss": 0.573593258857727,
"step": 1453
},
{
"epoch": 1.8862935928629359,
"grad_norm": 0.5254029631614685,
"learning_rate": 6.655304925523635e-06,
"loss": 0.5607786774635315,
"step": 1454
},
{
"epoch": 1.8875912408759126,
"grad_norm": 0.5431527495384216,
"learning_rate": 6.641832439010765e-06,
"loss": 0.5841714143753052,
"step": 1455
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.5374141931533813,
"learning_rate": 6.628366819100586e-06,
"loss": 0.5811495780944824,
"step": 1456
},
{
"epoch": 1.8901865369018653,
"grad_norm": 0.5369722247123718,
"learning_rate": 6.614908093326891e-06,
"loss": 0.6311888694763184,
"step": 1457
},
{
"epoch": 1.8914841849148418,
"grad_norm": 0.5656461119651794,
"learning_rate": 6.601456289209362e-06,
"loss": 0.6515893936157227,
"step": 1458
},
{
"epoch": 1.8927818329278183,
"grad_norm": 0.5076130032539368,
"learning_rate": 6.588011434253534e-06,
"loss": 0.5477322340011597,
"step": 1459
},
{
"epoch": 1.8940794809407948,
"grad_norm": 0.5373955965042114,
"learning_rate": 6.574573555950738e-06,
"loss": 0.5668719410896301,
"step": 1460
},
{
"epoch": 1.8953771289537713,
"grad_norm": 0.5303026437759399,
"learning_rate": 6.561142681778027e-06,
"loss": 0.5856397747993469,
"step": 1461
},
{
"epoch": 1.8966747769667478,
"grad_norm": 0.5287466049194336,
"learning_rate": 6.547718839198145e-06,
"loss": 0.574636697769165,
"step": 1462
},
{
"epoch": 1.8979724249797243,
"grad_norm": 0.546556830406189,
"learning_rate": 6.53430205565945e-06,
"loss": 0.6119240522384644,
"step": 1463
},
{
"epoch": 1.8992700729927008,
"grad_norm": 0.5332784652709961,
"learning_rate": 6.520892358595869e-06,
"loss": 0.6177451014518738,
"step": 1464
},
{
"epoch": 1.900567721005677,
"grad_norm": 0.5086203217506409,
"learning_rate": 6.507489775426834e-06,
"loss": 0.6066810488700867,
"step": 1465
},
{
"epoch": 1.9018653690186538,
"grad_norm": 0.5467303991317749,
"learning_rate": 6.494094333557243e-06,
"loss": 0.5971111059188843,
"step": 1466
},
{
"epoch": 1.90316301703163,
"grad_norm": 0.5070620179176331,
"learning_rate": 6.4807060603773795e-06,
"loss": 0.6063017845153809,
"step": 1467
},
{
"epoch": 1.9044606650446068,
"grad_norm": 0.553736686706543,
"learning_rate": 6.467324983262877e-06,
"loss": 0.579677402973175,
"step": 1468
},
{
"epoch": 1.905758313057583,
"grad_norm": 0.5139430165290833,
"learning_rate": 6.453951129574644e-06,
"loss": 0.5715341567993164,
"step": 1469
},
{
"epoch": 1.9070559610705597,
"grad_norm": 0.5478905439376831,
"learning_rate": 6.4405845266588356e-06,
"loss": 0.6066344976425171,
"step": 1470
},
{
"epoch": 1.908353609083536,
"grad_norm": 0.5382056832313538,
"learning_rate": 6.427225201846763e-06,
"loss": 0.5792092084884644,
"step": 1471
},
{
"epoch": 1.9096512570965127,
"grad_norm": 0.5592162013053894,
"learning_rate": 6.413873182454873e-06,
"loss": 0.6224773526191711,
"step": 1472
},
{
"epoch": 1.910948905109489,
"grad_norm": 0.5435997843742371,
"learning_rate": 6.4005284957846546e-06,
"loss": 0.5740009546279907,
"step": 1473
},
{
"epoch": 1.9122465531224655,
"grad_norm": 0.5480201840400696,
"learning_rate": 6.3871911691226276e-06,
"loss": 0.5897870063781738,
"step": 1474
},
{
"epoch": 1.913544201135442,
"grad_norm": 0.5461702942848206,
"learning_rate": 6.373861229740237e-06,
"loss": 0.6223511695861816,
"step": 1475
},
{
"epoch": 1.9148418491484185,
"grad_norm": 0.5337714552879333,
"learning_rate": 6.360538704893845e-06,
"loss": 0.5608541369438171,
"step": 1476
},
{
"epoch": 1.916139497161395,
"grad_norm": 0.5573077201843262,
"learning_rate": 6.3472236218246366e-06,
"loss": 0.6532754302024841,
"step": 1477
},
{
"epoch": 1.9174371451743715,
"grad_norm": 0.5389246940612793,
"learning_rate": 6.333916007758591e-06,
"loss": 0.5982533693313599,
"step": 1478
},
{
"epoch": 1.918734793187348,
"grad_norm": 0.5433958768844604,
"learning_rate": 6.320615889906403e-06,
"loss": 0.592591404914856,
"step": 1479
},
{
"epoch": 1.9200324412003245,
"grad_norm": 0.5413274765014648,
"learning_rate": 6.307323295463457e-06,
"loss": 0.6429393291473389,
"step": 1480
},
{
"epoch": 1.921330089213301,
"grad_norm": 0.5350672602653503,
"learning_rate": 6.294038251609738e-06,
"loss": 0.5930889844894409,
"step": 1481
},
{
"epoch": 1.9226277372262772,
"grad_norm": 0.5042331218719482,
"learning_rate": 6.280760785509802e-06,
"loss": 0.5509825944900513,
"step": 1482
},
{
"epoch": 1.923925385239254,
"grad_norm": 0.5447627902030945,
"learning_rate": 6.2674909243127e-06,
"loss": 0.6052374839782715,
"step": 1483
},
{
"epoch": 1.9252230332522302,
"grad_norm": 0.5395492911338806,
"learning_rate": 6.254228695151949e-06,
"loss": 0.6406330466270447,
"step": 1484
},
{
"epoch": 1.926520681265207,
"grad_norm": 0.5140017867088318,
"learning_rate": 6.240974125145443e-06,
"loss": 0.5923643112182617,
"step": 1485
},
{
"epoch": 1.9278183292781832,
"grad_norm": 0.5255963802337646,
"learning_rate": 6.227727241395429e-06,
"loss": 0.612221360206604,
"step": 1486
},
{
"epoch": 1.92911597729116,
"grad_norm": 0.5396282076835632,
"learning_rate": 6.214488070988424e-06,
"loss": 0.5972959399223328,
"step": 1487
},
{
"epoch": 1.9304136253041362,
"grad_norm": 0.5345456004142761,
"learning_rate": 6.201256640995184e-06,
"loss": 0.5695825815200806,
"step": 1488
},
{
"epoch": 1.931711273317113,
"grad_norm": 0.5186867713928223,
"learning_rate": 6.188032978470639e-06,
"loss": 0.6117428541183472,
"step": 1489
},
{
"epoch": 1.9330089213300892,
"grad_norm": 0.5213980674743652,
"learning_rate": 6.174817110453828e-06,
"loss": 0.584017276763916,
"step": 1490
},
{
"epoch": 1.9343065693430657,
"grad_norm": 0.541926920413971,
"learning_rate": 6.161609063967857e-06,
"loss": 0.6257720589637756,
"step": 1491
},
{
"epoch": 1.9356042173560422,
"grad_norm": 0.5566191673278809,
"learning_rate": 6.1484088660198325e-06,
"loss": 0.6734557151794434,
"step": 1492
},
{
"epoch": 1.9369018653690186,
"grad_norm": 0.5532911419868469,
"learning_rate": 6.135216543600828e-06,
"loss": 0.5978685021400452,
"step": 1493
},
{
"epoch": 1.9381995133819951,
"grad_norm": 0.5523790717124939,
"learning_rate": 6.1220321236857974e-06,
"loss": 0.6684085130691528,
"step": 1494
},
{
"epoch": 1.9394971613949716,
"grad_norm": 0.5317186713218689,
"learning_rate": 6.108855633233546e-06,
"loss": 0.5903822183609009,
"step": 1495
},
{
"epoch": 1.9407948094079481,
"grad_norm": 0.52325439453125,
"learning_rate": 6.0956870991866545e-06,
"loss": 0.5855342149734497,
"step": 1496
},
{
"epoch": 1.9420924574209246,
"grad_norm": 0.5201572775840759,
"learning_rate": 6.0825265484714526e-06,
"loss": 0.5801212787628174,
"step": 1497
},
{
"epoch": 1.9433901054339011,
"grad_norm": 0.5488981008529663,
"learning_rate": 6.0693740079979235e-06,
"loss": 0.647799015045166,
"step": 1498
},
{
"epoch": 1.9446877534468774,
"grad_norm": 0.49936795234680176,
"learning_rate": 6.056229504659696e-06,
"loss": 0.5507512092590332,
"step": 1499
},
{
"epoch": 1.945985401459854,
"grad_norm": 0.5403010249137878,
"learning_rate": 6.043093065333945e-06,
"loss": 0.5773292779922485,
"step": 1500
},
{
"epoch": 1.9472830494728304,
"grad_norm": 0.532992422580719,
"learning_rate": 6.029964716881367e-06,
"loss": 0.561974048614502,
"step": 1501
},
{
"epoch": 1.948580697485807,
"grad_norm": 0.5226876139640808,
"learning_rate": 6.016844486146106e-06,
"loss": 0.6117234230041504,
"step": 1502
},
{
"epoch": 1.9498783454987834,
"grad_norm": 0.5627997517585754,
"learning_rate": 6.003732399955722e-06,
"loss": 0.5736496448516846,
"step": 1503
},
{
"epoch": 1.95117599351176,
"grad_norm": 0.5260640382766724,
"learning_rate": 5.990628485121106e-06,
"loss": 0.5524093508720398,
"step": 1504
},
{
"epoch": 1.9524736415247363,
"grad_norm": 0.5555213689804077,
"learning_rate": 5.97753276843645e-06,
"loss": 0.6590294241905212,
"step": 1505
},
{
"epoch": 1.9537712895377128,
"grad_norm": 0.5117315053939819,
"learning_rate": 5.964445276679176e-06,
"loss": 0.5593676567077637,
"step": 1506
},
{
"epoch": 1.9550689375506893,
"grad_norm": 0.5474593043327332,
"learning_rate": 5.9513660366099005e-06,
"loss": 0.5995163321495056,
"step": 1507
},
{
"epoch": 1.9563665855636658,
"grad_norm": 0.5376996397972107,
"learning_rate": 5.93829507497235e-06,
"loss": 0.5445429086685181,
"step": 1508
},
{
"epoch": 1.9576642335766423,
"grad_norm": 0.539804220199585,
"learning_rate": 5.925232418493338e-06,
"loss": 0.6023607850074768,
"step": 1509
},
{
"epoch": 1.9589618815896188,
"grad_norm": 0.5308881402015686,
"learning_rate": 5.912178093882688e-06,
"loss": 0.5908794403076172,
"step": 1510
},
{
"epoch": 1.9602595296025953,
"grad_norm": 0.5358856320381165,
"learning_rate": 5.8991321278331934e-06,
"loss": 0.5432258248329163,
"step": 1511
},
{
"epoch": 1.9615571776155718,
"grad_norm": 0.5521926879882812,
"learning_rate": 5.8860945470205466e-06,
"loss": 0.6700773239135742,
"step": 1512
},
{
"epoch": 1.9628548256285483,
"grad_norm": 0.5567953586578369,
"learning_rate": 5.8730653781033085e-06,
"loss": 0.6132399439811707,
"step": 1513
},
{
"epoch": 1.9641524736415248,
"grad_norm": 0.5308123826980591,
"learning_rate": 5.860044647722827e-06,
"loss": 0.595048189163208,
"step": 1514
},
{
"epoch": 1.9654501216545013,
"grad_norm": 0.5229505896568298,
"learning_rate": 5.847032382503202e-06,
"loss": 0.5752079486846924,
"step": 1515
},
{
"epoch": 1.9667477696674776,
"grad_norm": 0.5336843729019165,
"learning_rate": 5.834028609051218e-06,
"loss": 0.6190193891525269,
"step": 1516
},
{
"epoch": 1.9680454176804543,
"grad_norm": 0.5378988981246948,
"learning_rate": 5.8210333539563e-06,
"loss": 0.5807895660400391,
"step": 1517
},
{
"epoch": 1.9693430656934305,
"grad_norm": 0.5520551800727844,
"learning_rate": 5.808046643790468e-06,
"loss": 0.6308130621910095,
"step": 1518
},
{
"epoch": 1.9706407137064073,
"grad_norm": 0.5014427900314331,
"learning_rate": 5.795068505108243e-06,
"loss": 0.584097146987915,
"step": 1519
},
{
"epoch": 1.9719383617193835,
"grad_norm": 0.5326021313667297,
"learning_rate": 5.782098964446641e-06,
"loss": 0.5909327268600464,
"step": 1520
},
{
"epoch": 1.9732360097323602,
"grad_norm": 0.5124540328979492,
"learning_rate": 5.769138048325087e-06,
"loss": 0.5518309473991394,
"step": 1521
},
{
"epoch": 1.9745336577453365,
"grad_norm": 0.5387500524520874,
"learning_rate": 5.756185783245376e-06,
"loss": 0.5835770964622498,
"step": 1522
},
{
"epoch": 1.975831305758313,
"grad_norm": 0.568587064743042,
"learning_rate": 5.743242195691612e-06,
"loss": 0.5821942687034607,
"step": 1523
},
{
"epoch": 1.9771289537712895,
"grad_norm": 0.5374230742454529,
"learning_rate": 5.730307312130152e-06,
"loss": 0.6571119427680969,
"step": 1524
},
{
"epoch": 1.978426601784266,
"grad_norm": 0.5388919115066528,
"learning_rate": 5.717381159009563e-06,
"loss": 0.5895075798034668,
"step": 1525
},
{
"epoch": 1.9797242497972425,
"grad_norm": 0.5499215722084045,
"learning_rate": 5.704463762760559e-06,
"loss": 0.61728835105896,
"step": 1526
},
{
"epoch": 1.981021897810219,
"grad_norm": 0.5375927686691284,
"learning_rate": 5.691555149795933e-06,
"loss": 0.6732977032661438,
"step": 1527
},
{
"epoch": 1.9823195458231955,
"grad_norm": 0.5313878655433655,
"learning_rate": 5.678655346510549e-06,
"loss": 0.61357581615448,
"step": 1528
},
{
"epoch": 1.983617193836172,
"grad_norm": 0.5222123265266418,
"learning_rate": 5.6657643792812265e-06,
"loss": 0.5704218745231628,
"step": 1529
},
{
"epoch": 1.9849148418491485,
"grad_norm": 0.5498616099357605,
"learning_rate": 5.652882274466736e-06,
"loss": 0.6428430080413818,
"step": 1530
},
{
"epoch": 1.986212489862125,
"grad_norm": 0.5288700461387634,
"learning_rate": 5.640009058407719e-06,
"loss": 0.5776660442352295,
"step": 1531
},
{
"epoch": 1.9875101378751014,
"grad_norm": 0.5719195008277893,
"learning_rate": 5.627144757426647e-06,
"loss": 0.6659935116767883,
"step": 1532
},
{
"epoch": 1.9888077858880777,
"grad_norm": 0.5699102282524109,
"learning_rate": 5.614289397827757e-06,
"loss": 0.649441123008728,
"step": 1533
},
{
"epoch": 1.9901054339010544,
"grad_norm": 0.5806236267089844,
"learning_rate": 5.601443005897012e-06,
"loss": 0.6462723016738892,
"step": 1534
},
{
"epoch": 1.9914030819140307,
"grad_norm": 0.5485842823982239,
"learning_rate": 5.588605607902017e-06,
"loss": 0.6063494086265564,
"step": 1535
},
{
"epoch": 1.9927007299270074,
"grad_norm": 0.5317525863647461,
"learning_rate": 5.57577723009202e-06,
"loss": 0.5641921162605286,
"step": 1536
},
{
"epoch": 1.9939983779399837,
"grad_norm": 0.5366416573524475,
"learning_rate": 5.5629578986977894e-06,
"loss": 0.623965322971344,
"step": 1537
},
{
"epoch": 1.9952960259529604,
"grad_norm": 0.5662190318107605,
"learning_rate": 5.550147639931631e-06,
"loss": 0.6340383291244507,
"step": 1538
},
{
"epoch": 1.9965936739659367,
"grad_norm": 0.5266711711883545,
"learning_rate": 5.537346479987269e-06,
"loss": 0.6086807250976562,
"step": 1539
},
{
"epoch": 1.9978913219789132,
"grad_norm": 0.5435559153556824,
"learning_rate": 5.524554445039838e-06,
"loss": 0.640510082244873,
"step": 1540
},
{
"epoch": 1.9991889699918897,
"grad_norm": 0.5433489084243774,
"learning_rate": 5.511771561245813e-06,
"loss": 0.5800854563713074,
"step": 1541
},
{
"epoch": 2.0,
"grad_norm": 0.6513635516166687,
"learning_rate": 5.498997854742956e-06,
"loss": 0.546117901802063,
"step": 1542
},
{
"epoch": 2.0012976480129763,
"grad_norm": 0.7124117016792297,
"learning_rate": 5.4862333516502634e-06,
"loss": 0.5231295824050903,
"step": 1543
},
{
"epoch": 2.002595296025953,
"grad_norm": 0.727088451385498,
"learning_rate": 5.473478078067913e-06,
"loss": 0.5810973644256592,
"step": 1544
},
{
"epoch": 2.0038929440389293,
"grad_norm": 0.6788406372070312,
"learning_rate": 5.460732060077212e-06,
"loss": 0.47124871611595154,
"step": 1545
},
{
"epoch": 2.005190592051906,
"grad_norm": 0.6010527610778809,
"learning_rate": 5.44799532374054e-06,
"loss": 0.5422745943069458,
"step": 1546
},
{
"epoch": 2.0064882400648822,
"grad_norm": 0.609658420085907,
"learning_rate": 5.435267895101303e-06,
"loss": 0.48424142599105835,
"step": 1547
},
{
"epoch": 2.007785888077859,
"grad_norm": 0.5703460574150085,
"learning_rate": 5.422549800183861e-06,
"loss": 0.5136675834655762,
"step": 1548
},
{
"epoch": 2.0090835360908352,
"grad_norm": 0.5782158970832825,
"learning_rate": 5.409841064993512e-06,
"loss": 0.509381890296936,
"step": 1549
},
{
"epoch": 2.010381184103812,
"grad_norm": 0.6222527623176575,
"learning_rate": 5.39714171551639e-06,
"loss": 0.4843388795852661,
"step": 1550
},
{
"epoch": 2.011678832116788,
"grad_norm": 0.7037692666053772,
"learning_rate": 5.384451777719464e-06,
"loss": 0.5681462287902832,
"step": 1551
},
{
"epoch": 2.012976480129765,
"grad_norm": 0.7455988526344299,
"learning_rate": 5.371771277550432e-06,
"loss": 0.551672101020813,
"step": 1552
},
{
"epoch": 2.014274128142741,
"grad_norm": 0.7268160581588745,
"learning_rate": 5.359100240937717e-06,
"loss": 0.5382372140884399,
"step": 1553
},
{
"epoch": 2.015571776155718,
"grad_norm": 0.6356255412101746,
"learning_rate": 5.3464386937903764e-06,
"loss": 0.5280675888061523,
"step": 1554
},
{
"epoch": 2.016869424168694,
"grad_norm": 0.5975467562675476,
"learning_rate": 5.33378666199807e-06,
"loss": 0.47013112902641296,
"step": 1555
},
{
"epoch": 2.018167072181671,
"grad_norm": 0.6236818432807922,
"learning_rate": 5.321144171431003e-06,
"loss": 0.4888884425163269,
"step": 1556
},
{
"epoch": 2.019464720194647,
"grad_norm": 0.6166471838951111,
"learning_rate": 5.308511247939872e-06,
"loss": 0.5211419463157654,
"step": 1557
},
{
"epoch": 2.020762368207624,
"grad_norm": 0.6095893383026123,
"learning_rate": 5.295887917355794e-06,
"loss": 0.5085535049438477,
"step": 1558
},
{
"epoch": 2.0220600162206,
"grad_norm": 0.6039384007453918,
"learning_rate": 5.283274205490303e-06,
"loss": 0.4754714369773865,
"step": 1559
},
{
"epoch": 2.0233576642335764,
"grad_norm": 0.6331435441970825,
"learning_rate": 5.270670138135234e-06,
"loss": 0.5521947145462036,
"step": 1560
},
{
"epoch": 2.024655312246553,
"grad_norm": 0.6151823997497559,
"learning_rate": 5.25807574106272e-06,
"loss": 0.5278744697570801,
"step": 1561
},
{
"epoch": 2.0259529602595294,
"grad_norm": 0.5749709606170654,
"learning_rate": 5.245491040025115e-06,
"loss": 0.4914984107017517,
"step": 1562
},
{
"epoch": 2.027250608272506,
"grad_norm": 0.5855306386947632,
"learning_rate": 5.232916060754947e-06,
"loss": 0.5195509195327759,
"step": 1563
},
{
"epoch": 2.0285482562854824,
"grad_norm": 0.5908445119857788,
"learning_rate": 5.220350828964865e-06,
"loss": 0.48390451073646545,
"step": 1564
},
{
"epoch": 2.029845904298459,
"grad_norm": 0.5874761343002319,
"learning_rate": 5.207795370347588e-06,
"loss": 0.5324580669403076,
"step": 1565
},
{
"epoch": 2.0311435523114354,
"grad_norm": 0.5893219709396362,
"learning_rate": 5.195249710575853e-06,
"loss": 0.5100334286689758,
"step": 1566
},
{
"epoch": 2.032441200324412,
"grad_norm": 0.5876151919364929,
"learning_rate": 5.182713875302361e-06,
"loss": 0.4768049716949463,
"step": 1567
},
{
"epoch": 2.0337388483373884,
"grad_norm": 0.6265038251876831,
"learning_rate": 5.1701878901597106e-06,
"loss": 0.5602673292160034,
"step": 1568
},
{
"epoch": 2.035036496350365,
"grad_norm": 0.5975306034088135,
"learning_rate": 5.157671780760385e-06,
"loss": 0.5052694082260132,
"step": 1569
},
{
"epoch": 2.0363341443633414,
"grad_norm": 0.5611022114753723,
"learning_rate": 5.145165572696652e-06,
"loss": 0.49101999402046204,
"step": 1570
},
{
"epoch": 2.037631792376318,
"grad_norm": 0.5829542875289917,
"learning_rate": 5.132669291540544e-06,
"loss": 0.474854052066803,
"step": 1571
},
{
"epoch": 2.0389294403892944,
"grad_norm": 0.5918568968772888,
"learning_rate": 5.1201829628437926e-06,
"loss": 0.4853309988975525,
"step": 1572
},
{
"epoch": 2.040227088402271,
"grad_norm": 0.5785784125328064,
"learning_rate": 5.107706612137776e-06,
"loss": 0.5171955227851868,
"step": 1573
},
{
"epoch": 2.0415247364152473,
"grad_norm": 0.5528171062469482,
"learning_rate": 5.095240264933486e-06,
"loss": 0.47794681787490845,
"step": 1574
},
{
"epoch": 2.042822384428224,
"grad_norm": 0.5567626357078552,
"learning_rate": 5.082783946721434e-06,
"loss": 0.4940184950828552,
"step": 1575
},
{
"epoch": 2.0441200324412003,
"grad_norm": 0.5630913376808167,
"learning_rate": 5.070337682971642e-06,
"loss": 0.5437344312667847,
"step": 1576
},
{
"epoch": 2.0454176804541766,
"grad_norm": 0.5575384497642517,
"learning_rate": 5.057901499133573e-06,
"loss": 0.49236786365509033,
"step": 1577
},
{
"epoch": 2.0467153284671533,
"grad_norm": 0.5638654828071594,
"learning_rate": 5.0454754206360705e-06,
"loss": 0.4736412465572357,
"step": 1578
},
{
"epoch": 2.0480129764801296,
"grad_norm": 0.5577630996704102,
"learning_rate": 5.033059472887322e-06,
"loss": 0.5147624015808105,
"step": 1579
},
{
"epoch": 2.0493106244931063,
"grad_norm": 0.5717137455940247,
"learning_rate": 5.0206536812748004e-06,
"loss": 0.4905228614807129,
"step": 1580
},
{
"epoch": 2.0506082725060826,
"grad_norm": 0.5646504759788513,
"learning_rate": 5.008258071165202e-06,
"loss": 0.5036407113075256,
"step": 1581
},
{
"epoch": 2.0519059205190593,
"grad_norm": 0.5792942047119141,
"learning_rate": 4.995872667904424e-06,
"loss": 0.5340180993080139,
"step": 1582
},
{
"epoch": 2.0532035685320356,
"grad_norm": 0.573951244354248,
"learning_rate": 4.98349749681747e-06,
"loss": 0.4675467610359192,
"step": 1583
},
{
"epoch": 2.0545012165450123,
"grad_norm": 0.5502886772155762,
"learning_rate": 4.971132583208438e-06,
"loss": 0.4816184937953949,
"step": 1584
},
{
"epoch": 2.0557988645579885,
"grad_norm": 0.5748745203018188,
"learning_rate": 4.958777952360445e-06,
"loss": 0.49751102924346924,
"step": 1585
},
{
"epoch": 2.0570965125709653,
"grad_norm": 0.593724250793457,
"learning_rate": 4.946433629535585e-06,
"loss": 0.48918506503105164,
"step": 1586
},
{
"epoch": 2.0583941605839415,
"grad_norm": 0.5852590799331665,
"learning_rate": 4.934099639974874e-06,
"loss": 0.5142393708229065,
"step": 1587
},
{
"epoch": 2.0596918085969182,
"grad_norm": 0.5500675439834595,
"learning_rate": 4.921776008898198e-06,
"loss": 0.43804582953453064,
"step": 1588
},
{
"epoch": 2.0609894566098945,
"grad_norm": 0.572162389755249,
"learning_rate": 4.909462761504264e-06,
"loss": 0.5290922522544861,
"step": 1589
},
{
"epoch": 2.0622871046228712,
"grad_norm": 0.5475997924804688,
"learning_rate": 4.897159922970551e-06,
"loss": 0.489504873752594,
"step": 1590
},
{
"epoch": 2.0635847526358475,
"grad_norm": 0.5753741264343262,
"learning_rate": 4.884867518453238e-06,
"loss": 0.5394560694694519,
"step": 1591
},
{
"epoch": 2.0648824006488242,
"grad_norm": 0.5752173662185669,
"learning_rate": 4.872585573087195e-06,
"loss": 0.5700497627258301,
"step": 1592
},
{
"epoch": 2.0661800486618005,
"grad_norm": 0.5844142436981201,
"learning_rate": 4.860314111985881e-06,
"loss": 0.5502715110778809,
"step": 1593
},
{
"epoch": 2.0674776966747768,
"grad_norm": 0.5586737990379333,
"learning_rate": 4.848053160241333e-06,
"loss": 0.48312538862228394,
"step": 1594
},
{
"epoch": 2.0687753446877535,
"grad_norm": 0.5547072887420654,
"learning_rate": 4.835802742924091e-06,
"loss": 0.4890977442264557,
"step": 1595
},
{
"epoch": 2.0700729927007298,
"grad_norm": 0.5696388483047485,
"learning_rate": 4.823562885083161e-06,
"loss": 0.5179868936538696,
"step": 1596
},
{
"epoch": 2.0713706407137065,
"grad_norm": 0.5792607069015503,
"learning_rate": 4.811333611745953e-06,
"loss": 0.5098393559455872,
"step": 1597
},
{
"epoch": 2.0726682887266827,
"grad_norm": 0.5769554972648621,
"learning_rate": 4.799114947918238e-06,
"loss": 0.4976171553134918,
"step": 1598
},
{
"epoch": 2.0739659367396595,
"grad_norm": 0.6067489981651306,
"learning_rate": 4.786906918584083e-06,
"loss": 0.5139312148094177,
"step": 1599
},
{
"epoch": 2.0752635847526357,
"grad_norm": 0.5910279750823975,
"learning_rate": 4.774709548705831e-06,
"loss": 0.5157588720321655,
"step": 1600
},
{
"epoch": 2.0765612327656124,
"grad_norm": 0.5831329226493835,
"learning_rate": 4.762522863224001e-06,
"loss": 0.5141895413398743,
"step": 1601
},
{
"epoch": 2.0778588807785887,
"grad_norm": 0.5735464692115784,
"learning_rate": 4.750346887057292e-06,
"loss": 0.47724485397338867,
"step": 1602
},
{
"epoch": 2.0791565287915654,
"grad_norm": 0.5806788206100464,
"learning_rate": 4.738181645102493e-06,
"loss": 0.4755935072898865,
"step": 1603
},
{
"epoch": 2.0804541768045417,
"grad_norm": 0.5973532199859619,
"learning_rate": 4.726027162234434e-06,
"loss": 0.5464816093444824,
"step": 1604
},
{
"epoch": 2.0817518248175184,
"grad_norm": 0.5893049240112305,
"learning_rate": 4.713883463305972e-06,
"loss": 0.5293697118759155,
"step": 1605
},
{
"epoch": 2.0830494728304947,
"grad_norm": 0.5956568717956543,
"learning_rate": 4.701750573147885e-06,
"loss": 0.5268076658248901,
"step": 1606
},
{
"epoch": 2.0843471208434714,
"grad_norm": 0.5941202044487,
"learning_rate": 4.689628516568866e-06,
"loss": 0.526781439781189,
"step": 1607
},
{
"epoch": 2.0856447688564477,
"grad_norm": 0.5724000334739685,
"learning_rate": 4.677517318355455e-06,
"loss": 0.5051593780517578,
"step": 1608
},
{
"epoch": 2.086942416869424,
"grad_norm": 0.5567840933799744,
"learning_rate": 4.6654170032719825e-06,
"loss": 0.48566874861717224,
"step": 1609
},
{
"epoch": 2.0882400648824007,
"grad_norm": 0.5653722882270813,
"learning_rate": 4.6533275960605355e-06,
"loss": 0.5071468353271484,
"step": 1610
},
{
"epoch": 2.0882400648824007,
"eval_loss": 0.6963403820991516,
"eval_runtime": 72.3826,
"eval_samples_per_second": 71.73,
"eval_steps_per_second": 8.966,
"step": 1610
},
{
"epoch": 2.089537712895377,
"grad_norm": 0.5640507340431213,
"learning_rate": 4.641249121440892e-06,
"loss": 0.5107710361480713,
"step": 1611
},
{
"epoch": 2.0908353609083536,
"grad_norm": 0.5841313004493713,
"learning_rate": 4.629181604110464e-06,
"loss": 0.5194936990737915,
"step": 1612
},
{
"epoch": 2.09213300892133,
"grad_norm": 0.5427317023277283,
"learning_rate": 4.617125068744288e-06,
"loss": 0.44176995754241943,
"step": 1613
},
{
"epoch": 2.0934306569343066,
"grad_norm": 0.6006700992584229,
"learning_rate": 4.605079539994911e-06,
"loss": 0.5314173102378845,
"step": 1614
},
{
"epoch": 2.094728304947283,
"grad_norm": 0.5708412528038025,
"learning_rate": 4.593045042492404e-06,
"loss": 0.5313728451728821,
"step": 1615
},
{
"epoch": 2.0960259529602596,
"grad_norm": 0.5850820541381836,
"learning_rate": 4.581021600844258e-06,
"loss": 0.4967271089553833,
"step": 1616
},
{
"epoch": 2.097323600973236,
"grad_norm": 0.5869132280349731,
"learning_rate": 4.569009239635374e-06,
"loss": 0.5268970727920532,
"step": 1617
},
{
"epoch": 2.0986212489862126,
"grad_norm": 0.5825201869010925,
"learning_rate": 4.557007983427987e-06,
"loss": 0.5315977334976196,
"step": 1618
},
{
"epoch": 2.099918896999189,
"grad_norm": 0.5721443891525269,
"learning_rate": 4.54501785676163e-06,
"loss": 0.4732065498828888,
"step": 1619
},
{
"epoch": 2.1012165450121656,
"grad_norm": 0.5872232913970947,
"learning_rate": 4.533038884153077e-06,
"loss": 0.5813014507293701,
"step": 1620
},
{
"epoch": 2.102514193025142,
"grad_norm": 0.5751720666885376,
"learning_rate": 4.521071090096298e-06,
"loss": 0.4687768518924713,
"step": 1621
},
{
"epoch": 2.1038118410381186,
"grad_norm": 0.5663445591926575,
"learning_rate": 4.509114499062393e-06,
"loss": 0.49182090163230896,
"step": 1622
},
{
"epoch": 2.105109489051095,
"grad_norm": 0.5650926828384399,
"learning_rate": 4.4971691354995795e-06,
"loss": 0.5067583322525024,
"step": 1623
},
{
"epoch": 2.1064071370640716,
"grad_norm": 0.6090897917747498,
"learning_rate": 4.485235023833087e-06,
"loss": 0.5684949159622192,
"step": 1624
},
{
"epoch": 2.107704785077048,
"grad_norm": 0.6066005229949951,
"learning_rate": 4.4733121884651665e-06,
"loss": 0.5100910067558289,
"step": 1625
},
{
"epoch": 2.1090024330900246,
"grad_norm": 0.5951321125030518,
"learning_rate": 4.46140065377499e-06,
"loss": 0.4774884283542633,
"step": 1626
},
{
"epoch": 2.110300081103001,
"grad_norm": 0.5725848078727722,
"learning_rate": 4.449500444118633e-06,
"loss": 0.5018754005432129,
"step": 1627
},
{
"epoch": 2.111597729115977,
"grad_norm": 0.5799410343170166,
"learning_rate": 4.437611583829014e-06,
"loss": 0.49752479791641235,
"step": 1628
},
{
"epoch": 2.112895377128954,
"grad_norm": 0.5619634985923767,
"learning_rate": 4.42573409721584e-06,
"loss": 0.4756616950035095,
"step": 1629
},
{
"epoch": 2.11419302514193,
"grad_norm": 0.5556355118751526,
"learning_rate": 4.413868008565569e-06,
"loss": 0.4895199239253998,
"step": 1630
},
{
"epoch": 2.115490673154907,
"grad_norm": 0.5813250541687012,
"learning_rate": 4.402013342141347e-06,
"loss": 0.45987099409103394,
"step": 1631
},
{
"epoch": 2.116788321167883,
"grad_norm": 0.5723846554756165,
"learning_rate": 4.390170122182965e-06,
"loss": 0.4845224916934967,
"step": 1632
},
{
"epoch": 2.11808596918086,
"grad_norm": 0.5540896058082581,
"learning_rate": 4.378338372906813e-06,
"loss": 0.4948923587799072,
"step": 1633
},
{
"epoch": 2.119383617193836,
"grad_norm": 0.61214679479599,
"learning_rate": 4.3665181185058255e-06,
"loss": 0.5314114093780518,
"step": 1634
},
{
"epoch": 2.1206812652068128,
"grad_norm": 0.5635900497436523,
"learning_rate": 4.354709383149421e-06,
"loss": 0.4875974655151367,
"step": 1635
},
{
"epoch": 2.121978913219789,
"grad_norm": 0.5833781957626343,
"learning_rate": 4.342912190983487e-06,
"loss": 0.5470179915428162,
"step": 1636
},
{
"epoch": 2.1232765612327658,
"grad_norm": 0.5999435782432556,
"learning_rate": 4.331126566130284e-06,
"loss": 0.5479536056518555,
"step": 1637
},
{
"epoch": 2.124574209245742,
"grad_norm": 0.589368999004364,
"learning_rate": 4.319352532688444e-06,
"loss": 0.5104061961174011,
"step": 1638
},
{
"epoch": 2.1258718572587187,
"grad_norm": 0.5677252411842346,
"learning_rate": 4.3075901147328745e-06,
"loss": 0.5259417295455933,
"step": 1639
},
{
"epoch": 2.127169505271695,
"grad_norm": 0.5625855326652527,
"learning_rate": 4.295839336314749e-06,
"loss": 0.49216002225875854,
"step": 1640
},
{
"epoch": 2.1284671532846717,
"grad_norm": 0.5749784111976624,
"learning_rate": 4.284100221461432e-06,
"loss": 0.47341352701187134,
"step": 1641
},
{
"epoch": 2.129764801297648,
"grad_norm": 0.5952023267745972,
"learning_rate": 4.272372794176446e-06,
"loss": 0.5849668979644775,
"step": 1642
},
{
"epoch": 2.1310624493106243,
"grad_norm": 0.6117653250694275,
"learning_rate": 4.260657078439409e-06,
"loss": 0.5250235795974731,
"step": 1643
},
{
"epoch": 2.132360097323601,
"grad_norm": 0.5717377662658691,
"learning_rate": 4.248953098205997e-06,
"loss": 0.49503540992736816,
"step": 1644
},
{
"epoch": 2.1336577453365773,
"grad_norm": 0.5875842571258545,
"learning_rate": 4.237260877407878e-06,
"loss": 0.5329856872558594,
"step": 1645
},
{
"epoch": 2.134955393349554,
"grad_norm": 0.5664336085319519,
"learning_rate": 4.225580439952699e-06,
"loss": 0.5302871465682983,
"step": 1646
},
{
"epoch": 2.1362530413625302,
"grad_norm": 0.5786408185958862,
"learning_rate": 4.213911809723987e-06,
"loss": 0.49267759919166565,
"step": 1647
},
{
"epoch": 2.137550689375507,
"grad_norm": 0.5607128143310547,
"learning_rate": 4.20225501058114e-06,
"loss": 0.5211464166641235,
"step": 1648
},
{
"epoch": 2.1388483373884832,
"grad_norm": 0.5761646628379822,
"learning_rate": 4.190610066359364e-06,
"loss": 0.5178772211074829,
"step": 1649
},
{
"epoch": 2.14014598540146,
"grad_norm": 0.5818209648132324,
"learning_rate": 4.1789770008696205e-06,
"loss": 0.5244809985160828,
"step": 1650
},
{
"epoch": 2.141443633414436,
"grad_norm": 0.6208338141441345,
"learning_rate": 4.167355837898585e-06,
"loss": 0.5720170736312866,
"step": 1651
},
{
"epoch": 2.142741281427413,
"grad_norm": 0.59494549036026,
"learning_rate": 4.155746601208594e-06,
"loss": 0.5233884453773499,
"step": 1652
},
{
"epoch": 2.144038929440389,
"grad_norm": 0.5718002915382385,
"learning_rate": 4.144149314537599e-06,
"loss": 0.48552173376083374,
"step": 1653
},
{
"epoch": 2.145336577453366,
"grad_norm": 0.5601415634155273,
"learning_rate": 4.1325640015991185e-06,
"loss": 0.4996642768383026,
"step": 1654
},
{
"epoch": 2.146634225466342,
"grad_norm": 0.5795076489448547,
"learning_rate": 4.120990686082174e-06,
"loss": 0.5177854895591736,
"step": 1655
},
{
"epoch": 2.147931873479319,
"grad_norm": 0.5665140151977539,
"learning_rate": 4.109429391651283e-06,
"loss": 0.46502965688705444,
"step": 1656
},
{
"epoch": 2.149229521492295,
"grad_norm": 0.5985783934593201,
"learning_rate": 4.097880141946354e-06,
"loss": 0.4880366325378418,
"step": 1657
},
{
"epoch": 2.150527169505272,
"grad_norm": 0.5875007510185242,
"learning_rate": 4.08634296058268e-06,
"loss": 0.4756428599357605,
"step": 1658
},
{
"epoch": 2.151824817518248,
"grad_norm": 0.5694658160209656,
"learning_rate": 4.074817871150887e-06,
"loss": 0.5224863886833191,
"step": 1659
},
{
"epoch": 2.153122465531225,
"grad_norm": 0.5686694979667664,
"learning_rate": 4.063304897216856e-06,
"loss": 0.4963817000389099,
"step": 1660
},
{
"epoch": 2.154420113544201,
"grad_norm": 0.5916073322296143,
"learning_rate": 4.051804062321706e-06,
"loss": 0.5067265629768372,
"step": 1661
},
{
"epoch": 2.1557177615571774,
"grad_norm": 0.5737749338150024,
"learning_rate": 4.040315389981736e-06,
"loss": 0.547669529914856,
"step": 1662
},
{
"epoch": 2.157015409570154,
"grad_norm": 0.5631166696548462,
"learning_rate": 4.028838903688372e-06,
"loss": 0.5300416946411133,
"step": 1663
},
{
"epoch": 2.1583130575831304,
"grad_norm": 0.5811983942985535,
"learning_rate": 4.017374626908125e-06,
"loss": 0.5100100040435791,
"step": 1664
},
{
"epoch": 2.159610705596107,
"grad_norm": 0.571027934551239,
"learning_rate": 4.005922583082538e-06,
"loss": 0.5137525200843811,
"step": 1665
},
{
"epoch": 2.1609083536090834,
"grad_norm": 0.5910731554031372,
"learning_rate": 3.994482795628142e-06,
"loss": 0.5244160890579224,
"step": 1666
},
{
"epoch": 2.16220600162206,
"grad_norm": 0.5894386768341064,
"learning_rate": 3.983055287936411e-06,
"loss": 0.5517876148223877,
"step": 1667
},
{
"epoch": 2.1635036496350364,
"grad_norm": 0.5779116153717041,
"learning_rate": 3.971640083373696e-06,
"loss": 0.5097295045852661,
"step": 1668
},
{
"epoch": 2.164801297648013,
"grad_norm": 0.5987510085105896,
"learning_rate": 3.960237205281213e-06,
"loss": 0.511284589767456,
"step": 1669
},
{
"epoch": 2.1660989456609894,
"grad_norm": 0.5853222608566284,
"learning_rate": 3.948846676974953e-06,
"loss": 0.5473302602767944,
"step": 1670
},
{
"epoch": 2.167396593673966,
"grad_norm": 0.5716820359230042,
"learning_rate": 3.937468521745666e-06,
"loss": 0.4697805345058441,
"step": 1671
},
{
"epoch": 2.1686942416869424,
"grad_norm": 0.5948668122291565,
"learning_rate": 3.9261027628588e-06,
"loss": 0.5532658100128174,
"step": 1672
},
{
"epoch": 2.169991889699919,
"grad_norm": 0.5779493451118469,
"learning_rate": 3.9147494235544544e-06,
"loss": 0.495819091796875,
"step": 1673
},
{
"epoch": 2.1712895377128953,
"grad_norm": 0.588945746421814,
"learning_rate": 3.903408527047336e-06,
"loss": 0.50020432472229,
"step": 1674
},
{
"epoch": 2.172587185725872,
"grad_norm": 0.5889913439750671,
"learning_rate": 3.892080096526707e-06,
"loss": 0.5079851150512695,
"step": 1675
},
{
"epoch": 2.1738848337388483,
"grad_norm": 0.5692569017410278,
"learning_rate": 3.880764155156339e-06,
"loss": 0.47483527660369873,
"step": 1676
},
{
"epoch": 2.1751824817518246,
"grad_norm": 0.6015142202377319,
"learning_rate": 3.8694607260744745e-06,
"loss": 0.5588316321372986,
"step": 1677
},
{
"epoch": 2.1764801297648013,
"grad_norm": 0.5825367569923401,
"learning_rate": 3.858169832393752e-06,
"loss": 0.5049576759338379,
"step": 1678
},
{
"epoch": 2.1777777777777776,
"grad_norm": 0.6517031788825989,
"learning_rate": 3.846891497201206e-06,
"loss": 0.5698549151420593,
"step": 1679
},
{
"epoch": 2.1790754257907543,
"grad_norm": 0.5972406268119812,
"learning_rate": 3.835625743558168e-06,
"loss": 0.5489758253097534,
"step": 1680
},
{
"epoch": 2.1803730738037306,
"grad_norm": 0.590186595916748,
"learning_rate": 3.824372594500256e-06,
"loss": 0.5560799837112427,
"step": 1681
},
{
"epoch": 2.1816707218167073,
"grad_norm": 0.6042253375053406,
"learning_rate": 3.813132073037309e-06,
"loss": 0.5188357830047607,
"step": 1682
},
{
"epoch": 2.1829683698296836,
"grad_norm": 0.5862630605697632,
"learning_rate": 3.8019042021533513e-06,
"loss": 0.49817925691604614,
"step": 1683
},
{
"epoch": 2.1842660178426603,
"grad_norm": 0.5700656175613403,
"learning_rate": 3.7906890048065358e-06,
"loss": 0.5223833322525024,
"step": 1684
},
{
"epoch": 2.1855636658556366,
"grad_norm": 0.5849031805992126,
"learning_rate": 3.779486503929106e-06,
"loss": 0.5123599767684937,
"step": 1685
},
{
"epoch": 2.1868613138686133,
"grad_norm": 0.5997171998023987,
"learning_rate": 3.7682967224273317e-06,
"loss": 0.5369530320167542,
"step": 1686
},
{
"epoch": 2.1881589618815895,
"grad_norm": 0.5994778275489807,
"learning_rate": 3.757119683181493e-06,
"loss": 0.47989219427108765,
"step": 1687
},
{
"epoch": 2.1894566098945663,
"grad_norm": 0.5771443247795105,
"learning_rate": 3.7459554090458018e-06,
"loss": 0.4408413767814636,
"step": 1688
},
{
"epoch": 2.1907542579075425,
"grad_norm": 0.5725969672203064,
"learning_rate": 3.7348039228483758e-06,
"loss": 0.46296805143356323,
"step": 1689
},
{
"epoch": 2.1920519059205192,
"grad_norm": 0.5743042826652527,
"learning_rate": 3.7236652473911817e-06,
"loss": 0.482837975025177,
"step": 1690
},
{
"epoch": 2.1933495539334955,
"grad_norm": 0.5836053490638733,
"learning_rate": 3.7125394054499843e-06,
"loss": 0.5156795978546143,
"step": 1691
},
{
"epoch": 2.1946472019464722,
"grad_norm": 0.5889219641685486,
"learning_rate": 3.7014264197743267e-06,
"loss": 0.5081969499588013,
"step": 1692
},
{
"epoch": 2.1959448499594485,
"grad_norm": 0.6140073537826538,
"learning_rate": 3.6903263130874423e-06,
"loss": 0.5605005025863647,
"step": 1693
},
{
"epoch": 2.197242497972425,
"grad_norm": 0.5697020292282104,
"learning_rate": 3.679239108086241e-06,
"loss": 0.5305500030517578,
"step": 1694
},
{
"epoch": 2.1985401459854015,
"grad_norm": 0.5989742875099182,
"learning_rate": 3.668164827441254e-06,
"loss": 0.5370711088180542,
"step": 1695
},
{
"epoch": 2.1998377939983778,
"grad_norm": 0.608519971370697,
"learning_rate": 3.657103493796581e-06,
"loss": 0.5120800137519836,
"step": 1696
},
{
"epoch": 2.2011354420113545,
"grad_norm": 0.5787931084632874,
"learning_rate": 3.6460551297698486e-06,
"loss": 0.5016961693763733,
"step": 1697
},
{
"epoch": 2.2024330900243307,
"grad_norm": 0.5809414982795715,
"learning_rate": 3.6350197579521696e-06,
"loss": 0.5177795886993408,
"step": 1698
},
{
"epoch": 2.2037307380373075,
"grad_norm": 0.6027206778526306,
"learning_rate": 3.6239974009080746e-06,
"loss": 0.500653862953186,
"step": 1699
},
{
"epoch": 2.2050283860502837,
"grad_norm": 0.5894326567649841,
"learning_rate": 3.6129880811755093e-06,
"loss": 0.5206901431083679,
"step": 1700
},
{
"epoch": 2.2063260340632604,
"grad_norm": 0.591676652431488,
"learning_rate": 3.601991821265731e-06,
"loss": 0.49031156301498413,
"step": 1701
},
{
"epoch": 2.2076236820762367,
"grad_norm": 0.567371666431427,
"learning_rate": 3.591008643663323e-06,
"loss": 0.49885687232017517,
"step": 1702
},
{
"epoch": 2.2089213300892134,
"grad_norm": 0.5756494998931885,
"learning_rate": 3.580038570826093e-06,
"loss": 0.499514639377594,
"step": 1703
},
{
"epoch": 2.2102189781021897,
"grad_norm": 0.5830073356628418,
"learning_rate": 3.5690816251850657e-06,
"loss": 0.4895148277282715,
"step": 1704
},
{
"epoch": 2.2115166261151664,
"grad_norm": 0.6235371828079224,
"learning_rate": 3.5581378291444223e-06,
"loss": 0.5166549682617188,
"step": 1705
},
{
"epoch": 2.2128142741281427,
"grad_norm": 0.5604133605957031,
"learning_rate": 3.5472072050814565e-06,
"loss": 0.4416266083717346,
"step": 1706
},
{
"epoch": 2.2141119221411194,
"grad_norm": 0.5687461495399475,
"learning_rate": 3.5362897753465265e-06,
"loss": 0.48436877131462097,
"step": 1707
},
{
"epoch": 2.2154095701540957,
"grad_norm": 0.5818923115730286,
"learning_rate": 3.5253855622630174e-06,
"loss": 0.5402669906616211,
"step": 1708
},
{
"epoch": 2.2167072181670724,
"grad_norm": 0.6057185530662537,
"learning_rate": 3.514494588127275e-06,
"loss": 0.5666176080703735,
"step": 1709
},
{
"epoch": 2.2180048661800487,
"grad_norm": 0.5755799412727356,
"learning_rate": 3.5036168752085977e-06,
"loss": 0.48957937955856323,
"step": 1710
},
{
"epoch": 2.219302514193025,
"grad_norm": 0.5948247313499451,
"learning_rate": 3.4927524457491456e-06,
"loss": 0.4885704219341278,
"step": 1711
},
{
"epoch": 2.2206001622060016,
"grad_norm": 0.5859489440917969,
"learning_rate": 3.4819013219639295e-06,
"loss": 0.4678208827972412,
"step": 1712
},
{
"epoch": 2.221897810218978,
"grad_norm": 0.5540412068367004,
"learning_rate": 3.471063526040752e-06,
"loss": 0.481825053691864,
"step": 1713
},
{
"epoch": 2.2231954582319546,
"grad_norm": 0.5437055826187134,
"learning_rate": 3.460239080140163e-06,
"loss": 0.4387455880641937,
"step": 1714
},
{
"epoch": 2.224493106244931,
"grad_norm": 0.5966470241546631,
"learning_rate": 3.4494280063954146e-06,
"loss": 0.545790433883667,
"step": 1715
},
{
"epoch": 2.2257907542579076,
"grad_norm": 0.5654957294464111,
"learning_rate": 3.4386303269124142e-06,
"loss": 0.4880921244621277,
"step": 1716
},
{
"epoch": 2.227088402270884,
"grad_norm": 0.5839219689369202,
"learning_rate": 3.4278460637696865e-06,
"loss": 0.5272015333175659,
"step": 1717
},
{
"epoch": 2.2283860502838606,
"grad_norm": 0.5752228498458862,
"learning_rate": 3.4170752390183183e-06,
"loss": 0.5249931812286377,
"step": 1718
},
{
"epoch": 2.229683698296837,
"grad_norm": 0.580033540725708,
"learning_rate": 3.4063178746819193e-06,
"loss": 0.4954257309436798,
"step": 1719
},
{
"epoch": 2.2309813463098136,
"grad_norm": 0.5703238844871521,
"learning_rate": 3.395573992756579e-06,
"loss": 0.502043604850769,
"step": 1720
},
{
"epoch": 2.23227899432279,
"grad_norm": 0.5960628986358643,
"learning_rate": 3.384843615210819e-06,
"loss": 0.5299471616744995,
"step": 1721
},
{
"epoch": 2.2335766423357666,
"grad_norm": 0.5959639549255371,
"learning_rate": 3.3741267639855345e-06,
"loss": 0.6064699292182922,
"step": 1722
},
{
"epoch": 2.234874290348743,
"grad_norm": 0.5705887079238892,
"learning_rate": 3.3634234609939888e-06,
"loss": 0.49739521741867065,
"step": 1723
},
{
"epoch": 2.2361719383617196,
"grad_norm": 0.5743765830993652,
"learning_rate": 3.352733728121712e-06,
"loss": 0.5017514228820801,
"step": 1724
},
{
"epoch": 2.237469586374696,
"grad_norm": 0.5511932969093323,
"learning_rate": 3.3420575872265184e-06,
"loss": 0.4473830759525299,
"step": 1725
},
{
"epoch": 2.238767234387672,
"grad_norm": 0.5601068139076233,
"learning_rate": 3.3313950601384016e-06,
"loss": 0.4705375134944916,
"step": 1726
},
{
"epoch": 2.240064882400649,
"grad_norm": 0.5842630863189697,
"learning_rate": 3.320746168659534e-06,
"loss": 0.5488964319229126,
"step": 1727
},
{
"epoch": 2.241362530413625,
"grad_norm": 0.5851315855979919,
"learning_rate": 3.3101109345642056e-06,
"loss": 0.4903653860092163,
"step": 1728
},
{
"epoch": 2.242660178426602,
"grad_norm": 0.5913082361221313,
"learning_rate": 3.299489379598777e-06,
"loss": 0.5187092423439026,
"step": 1729
},
{
"epoch": 2.243957826439578,
"grad_norm": 0.5963798761367798,
"learning_rate": 3.288881525481639e-06,
"loss": 0.5145666003227234,
"step": 1730
},
{
"epoch": 2.245255474452555,
"grad_norm": 0.5765670537948608,
"learning_rate": 3.278287393903172e-06,
"loss": 0.47934818267822266,
"step": 1731
},
{
"epoch": 2.246553122465531,
"grad_norm": 0.5776212215423584,
"learning_rate": 3.2677070065256855e-06,
"loss": 0.5102344751358032,
"step": 1732
},
{
"epoch": 2.247850770478508,
"grad_norm": 0.5738791823387146,
"learning_rate": 3.257140384983405e-06,
"loss": 0.5097633600234985,
"step": 1733
},
{
"epoch": 2.249148418491484,
"grad_norm": 0.5827375650405884,
"learning_rate": 3.2465875508823876e-06,
"loss": 0.49323970079421997,
"step": 1734
},
{
"epoch": 2.2504460665044608,
"grad_norm": 0.5527526140213013,
"learning_rate": 3.2360485258005115e-06,
"loss": 0.47956135869026184,
"step": 1735
},
{
"epoch": 2.251743714517437,
"grad_norm": 0.581285297870636,
"learning_rate": 3.2255233312874155e-06,
"loss": 0.5309310555458069,
"step": 1736
},
{
"epoch": 2.2530413625304138,
"grad_norm": 0.6052958965301514,
"learning_rate": 3.2150119888644594e-06,
"loss": 0.5168576240539551,
"step": 1737
},
{
"epoch": 2.25433901054339,
"grad_norm": 0.5458951592445374,
"learning_rate": 3.2045145200246763e-06,
"loss": 0.45663541555404663,
"step": 1738
},
{
"epoch": 2.2556366585563667,
"grad_norm": 0.6066997647285461,
"learning_rate": 3.1940309462327334e-06,
"loss": 0.5442982912063599,
"step": 1739
},
{
"epoch": 2.256934306569343,
"grad_norm": 0.5723252296447754,
"learning_rate": 3.1835612889248868e-06,
"loss": 0.5069276094436646,
"step": 1740
},
{
"epoch": 2.2582319545823197,
"grad_norm": 0.571399986743927,
"learning_rate": 3.1731055695089384e-06,
"loss": 0.46238988637924194,
"step": 1741
},
{
"epoch": 2.259529602595296,
"grad_norm": 0.5810062289237976,
"learning_rate": 3.162663809364178e-06,
"loss": 0.5127156972885132,
"step": 1742
},
{
"epoch": 2.2608272506082727,
"grad_norm": 0.57572340965271,
"learning_rate": 3.152236029841376e-06,
"loss": 0.4930036664009094,
"step": 1743
},
{
"epoch": 2.262124898621249,
"grad_norm": 0.580849826335907,
"learning_rate": 3.1418222522626907e-06,
"loss": 0.5655021071434021,
"step": 1744
},
{
"epoch": 2.2634225466342253,
"grad_norm": 0.5487149953842163,
"learning_rate": 3.1314224979216633e-06,
"loss": 0.4654723107814789,
"step": 1745
},
{
"epoch": 2.264720194647202,
"grad_norm": 0.5340819954872131,
"learning_rate": 3.1210367880831684e-06,
"loss": 0.4503304362297058,
"step": 1746
},
{
"epoch": 2.2660178426601782,
"grad_norm": 0.5930841565132141,
"learning_rate": 3.1106651439833434e-06,
"loss": 0.5008471608161926,
"step": 1747
},
{
"epoch": 2.267315490673155,
"grad_norm": 0.6097638010978699,
"learning_rate": 3.1003075868295794e-06,
"loss": 0.5474433898925781,
"step": 1748
},
{
"epoch": 2.2686131386861312,
"grad_norm": 0.5703378319740295,
"learning_rate": 3.0899641378004596e-06,
"loss": 0.4988810420036316,
"step": 1749
},
{
"epoch": 2.269910786699108,
"grad_norm": 0.5475755333900452,
"learning_rate": 3.079634818045719e-06,
"loss": 0.4420495927333832,
"step": 1750
},
{
"epoch": 2.2712084347120842,
"grad_norm": 0.5802868008613586,
"learning_rate": 3.069319648686202e-06,
"loss": 0.4927031397819519,
"step": 1751
},
{
"epoch": 2.272506082725061,
"grad_norm": 0.5564054846763611,
"learning_rate": 3.0590186508138186e-06,
"loss": 0.4879905581474304,
"step": 1752
},
{
"epoch": 2.273803730738037,
"grad_norm": 0.5730741620063782,
"learning_rate": 3.048731845491504e-06,
"loss": 0.4577972888946533,
"step": 1753
},
{
"epoch": 2.275101378751014,
"grad_norm": 0.5826799869537354,
"learning_rate": 3.038459253753172e-06,
"loss": 0.49198514223098755,
"step": 1754
},
{
"epoch": 2.27639902676399,
"grad_norm": 0.5650803446769714,
"learning_rate": 3.0282008966036647e-06,
"loss": 0.48484641313552856,
"step": 1755
},
{
"epoch": 2.277696674776967,
"grad_norm": 0.579980731010437,
"learning_rate": 3.0179567950187396e-06,
"loss": 0.4821101427078247,
"step": 1756
},
{
"epoch": 2.278994322789943,
"grad_norm": 0.562907874584198,
"learning_rate": 3.0077269699449795e-06,
"loss": 0.47341495752334595,
"step": 1757
},
{
"epoch": 2.28029197080292,
"grad_norm": 0.584148108959198,
"learning_rate": 2.9975114422997932e-06,
"loss": 0.48562386631965637,
"step": 1758
},
{
"epoch": 2.281589618815896,
"grad_norm": 0.5975433588027954,
"learning_rate": 2.9873102329713478e-06,
"loss": 0.5041466951370239,
"step": 1759
},
{
"epoch": 2.2828872668288724,
"grad_norm": 0.5545569062232971,
"learning_rate": 2.9771233628185346e-06,
"loss": 0.45113393664360046,
"step": 1760
},
{
"epoch": 2.284184914841849,
"grad_norm": 0.5939710140228271,
"learning_rate": 2.9669508526709256e-06,
"loss": 0.550965428352356,
"step": 1761
},
{
"epoch": 2.285482562854826,
"grad_norm": 0.6028052568435669,
"learning_rate": 2.9567927233287307e-06,
"loss": 0.5310263633728027,
"step": 1762
},
{
"epoch": 2.286780210867802,
"grad_norm": 0.5738025903701782,
"learning_rate": 2.9466489955627452e-06,
"loss": 0.5576157569885254,
"step": 1763
},
{
"epoch": 2.2880778588807784,
"grad_norm": 0.5776515007019043,
"learning_rate": 2.936519690114338e-06,
"loss": 0.4818328022956848,
"step": 1764
},
{
"epoch": 2.289375506893755,
"grad_norm": 0.5612311363220215,
"learning_rate": 2.9264048276953606e-06,
"loss": 0.4919436573982239,
"step": 1765
},
{
"epoch": 2.2906731549067314,
"grad_norm": 0.5739221572875977,
"learning_rate": 2.9163044289881604e-06,
"loss": 0.5123167634010315,
"step": 1766
},
{
"epoch": 2.291970802919708,
"grad_norm": 0.5849712491035461,
"learning_rate": 2.906218514645487e-06,
"loss": 0.48645591735839844,
"step": 1767
},
{
"epoch": 2.2932684509326844,
"grad_norm": 0.5921924114227295,
"learning_rate": 2.8961471052904855e-06,
"loss": 0.5228952169418335,
"step": 1768
},
{
"epoch": 2.294566098945661,
"grad_norm": 0.5667364001274109,
"learning_rate": 2.8860902215166374e-06,
"loss": 0.4713795781135559,
"step": 1769
},
{
"epoch": 2.2958637469586374,
"grad_norm": 0.5740687847137451,
"learning_rate": 2.876047883887727e-06,
"loss": 0.5572628974914551,
"step": 1770
},
{
"epoch": 2.297161394971614,
"grad_norm": 0.5873590111732483,
"learning_rate": 2.866020112937792e-06,
"loss": 0.5043233036994934,
"step": 1771
},
{
"epoch": 2.2984590429845904,
"grad_norm": 0.6047444343566895,
"learning_rate": 2.8560069291710857e-06,
"loss": 0.5389963984489441,
"step": 1772
},
{
"epoch": 2.299756690997567,
"grad_norm": 0.5967015624046326,
"learning_rate": 2.8460083530620342e-06,
"loss": 0.5294721126556396,
"step": 1773
},
{
"epoch": 2.3010543390105433,
"grad_norm": 0.549340546131134,
"learning_rate": 2.8360244050551943e-06,
"loss": 0.4317038357257843,
"step": 1774
},
{
"epoch": 2.30235198702352,
"grad_norm": 0.5504307150840759,
"learning_rate": 2.8260551055652154e-06,
"loss": 0.529647946357727,
"step": 1775
},
{
"epoch": 2.3036496350364963,
"grad_norm": 0.603110671043396,
"learning_rate": 2.8161004749767893e-06,
"loss": 0.5209970474243164,
"step": 1776
},
{
"epoch": 2.304947283049473,
"grad_norm": 0.6039415001869202,
"learning_rate": 2.8061605336446194e-06,
"loss": 0.5043014287948608,
"step": 1777
},
{
"epoch": 2.3062449310624493,
"grad_norm": 0.5883081555366516,
"learning_rate": 2.796235301893362e-06,
"loss": 0.4972041845321655,
"step": 1778
},
{
"epoch": 2.3075425790754256,
"grad_norm": 0.5843275785446167,
"learning_rate": 2.7863248000176146e-06,
"loss": 0.4763846695423126,
"step": 1779
},
{
"epoch": 2.3088402270884023,
"grad_norm": 0.5958689451217651,
"learning_rate": 2.776429048281837e-06,
"loss": 0.534402072429657,
"step": 1780
},
{
"epoch": 2.3101378751013786,
"grad_norm": 0.5908694267272949,
"learning_rate": 2.7665480669203383e-06,
"loss": 0.5190926790237427,
"step": 1781
},
{
"epoch": 2.3114355231143553,
"grad_norm": 0.5524806380271912,
"learning_rate": 2.756681876137227e-06,
"loss": 0.4656313359737396,
"step": 1782
},
{
"epoch": 2.3127331711273316,
"grad_norm": 0.5877224206924438,
"learning_rate": 2.7468304961063642e-06,
"loss": 0.5328505635261536,
"step": 1783
},
{
"epoch": 2.3140308191403083,
"grad_norm": 0.5791632533073425,
"learning_rate": 2.736993946971329e-06,
"loss": 0.49198758602142334,
"step": 1784
},
{
"epoch": 2.3153284671532846,
"grad_norm": 0.5888563990592957,
"learning_rate": 2.727172248845378e-06,
"loss": 0.5110273957252502,
"step": 1785
},
{
"epoch": 2.3166261151662613,
"grad_norm": 0.5828698873519897,
"learning_rate": 2.717365421811389e-06,
"loss": 0.5017109513282776,
"step": 1786
},
{
"epoch": 2.3179237631792375,
"grad_norm": 0.5837040543556213,
"learning_rate": 2.7075734859218526e-06,
"loss": 0.48261111974716187,
"step": 1787
},
{
"epoch": 2.3192214111922143,
"grad_norm": 0.5555887222290039,
"learning_rate": 2.6977964611987885e-06,
"loss": 0.47618377208709717,
"step": 1788
},
{
"epoch": 2.3205190592051905,
"grad_norm": 0.5828522443771362,
"learning_rate": 2.6880343676337485e-06,
"loss": 0.5134596824645996,
"step": 1789
},
{
"epoch": 2.3218167072181672,
"grad_norm": 0.5784159898757935,
"learning_rate": 2.6782872251877347e-06,
"loss": 0.5150825381278992,
"step": 1790
},
{
"epoch": 2.3231143552311435,
"grad_norm": 0.5633057951927185,
"learning_rate": 2.6685550537911886e-06,
"loss": 0.5161488056182861,
"step": 1791
},
{
"epoch": 2.3244120032441202,
"grad_norm": 0.6642704010009766,
"learning_rate": 2.658837873343938e-06,
"loss": 0.49425986409187317,
"step": 1792
},
{
"epoch": 2.3257096512570965,
"grad_norm": 1.5263655185699463,
"learning_rate": 2.6491357037151565e-06,
"loss": 0.5067033767700195,
"step": 1793
},
{
"epoch": 2.3270072992700728,
"grad_norm": 0.5753558278083801,
"learning_rate": 2.639448564743328e-06,
"loss": 0.5167245864868164,
"step": 1794
},
{
"epoch": 2.3283049472830495,
"grad_norm": 0.576946496963501,
"learning_rate": 2.6297764762362e-06,
"loss": 0.4853561818599701,
"step": 1795
},
{
"epoch": 2.329602595296026,
"grad_norm": 0.5866283774375916,
"learning_rate": 2.6201194579707377e-06,
"loss": 0.5048178434371948,
"step": 1796
},
{
"epoch": 2.3309002433090025,
"grad_norm": 0.5844078660011292,
"learning_rate": 2.6104775296931118e-06,
"loss": 0.5524246096611023,
"step": 1797
},
{
"epoch": 2.3321978913219787,
"grad_norm": 0.5873027443885803,
"learning_rate": 2.6008507111186142e-06,
"loss": 0.4834699034690857,
"step": 1798
},
{
"epoch": 2.3334955393349555,
"grad_norm": 0.5751008987426758,
"learning_rate": 2.5912390219316573e-06,
"loss": 0.46085190773010254,
"step": 1799
},
{
"epoch": 2.3347931873479317,
"grad_norm": 0.5933749675750732,
"learning_rate": 2.5816424817857122e-06,
"loss": 0.5757045745849609,
"step": 1800
},
{
"epoch": 2.3360908353609084,
"grad_norm": 0.5685113668441772,
"learning_rate": 2.572061110303271e-06,
"loss": 0.5482950210571289,
"step": 1801
},
{
"epoch": 2.3373884833738847,
"grad_norm": 0.5949112176895142,
"learning_rate": 2.562494927075824e-06,
"loss": 0.45071443915367126,
"step": 1802
},
{
"epoch": 2.3386861313868614,
"grad_norm": 0.5924611687660217,
"learning_rate": 2.552943951663782e-06,
"loss": 0.5145446062088013,
"step": 1803
},
{
"epoch": 2.3399837793998377,
"grad_norm": 0.6171916127204895,
"learning_rate": 2.543408203596479e-06,
"loss": 0.5408798456192017,
"step": 1804
},
{
"epoch": 2.3412814274128144,
"grad_norm": 0.5777391791343689,
"learning_rate": 2.5338877023721055e-06,
"loss": 0.4972618818283081,
"step": 1805
},
{
"epoch": 2.3425790754257907,
"grad_norm": 0.5500625371932983,
"learning_rate": 2.5243824674576743e-06,
"loss": 0.47741931676864624,
"step": 1806
},
{
"epoch": 2.3438767234387674,
"grad_norm": 0.6426427960395813,
"learning_rate": 2.514892518288988e-06,
"loss": 0.4675457179546356,
"step": 1807
},
{
"epoch": 2.3451743714517437,
"grad_norm": 0.5633028149604797,
"learning_rate": 2.5054178742705936e-06,
"loss": 0.4990037679672241,
"step": 1808
},
{
"epoch": 2.34647201946472,
"grad_norm": 0.5860106945037842,
"learning_rate": 2.4959585547757294e-06,
"loss": 0.5247271060943604,
"step": 1809
},
{
"epoch": 2.3477696674776967,
"grad_norm": 0.6035534143447876,
"learning_rate": 2.486514579146322e-06,
"loss": 0.5100830793380737,
"step": 1810
},
{
"epoch": 2.3490673154906734,
"grad_norm": 0.5890262722969055,
"learning_rate": 2.4770859666929027e-06,
"loss": 0.4713430106639862,
"step": 1811
},
{
"epoch": 2.3503649635036497,
"grad_norm": 0.5817517638206482,
"learning_rate": 2.4676727366945995e-06,
"loss": 0.5113362073898315,
"step": 1812
},
{
"epoch": 2.351662611516626,
"grad_norm": 0.5895565748214722,
"learning_rate": 2.4582749083990875e-06,
"loss": 0.5131444931030273,
"step": 1813
},
{
"epoch": 2.3529602595296026,
"grad_norm": 0.6126547455787659,
"learning_rate": 2.448892501022544e-06,
"loss": 0.5126985907554626,
"step": 1814
},
{
"epoch": 2.354257907542579,
"grad_norm": 0.6138656139373779,
"learning_rate": 2.4395255337496202e-06,
"loss": 0.5113729238510132,
"step": 1815
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.5864330530166626,
"learning_rate": 2.4301740257333918e-06,
"loss": 0.49038761854171753,
"step": 1816
},
{
"epoch": 2.356853203568532,
"grad_norm": 0.5852108597755432,
"learning_rate": 2.4208379960953255e-06,
"loss": 0.5150374174118042,
"step": 1817
},
{
"epoch": 2.3581508515815086,
"grad_norm": 0.5658332705497742,
"learning_rate": 2.4115174639252425e-06,
"loss": 0.45495855808258057,
"step": 1818
},
{
"epoch": 2.359448499594485,
"grad_norm": 0.6017063856124878,
"learning_rate": 2.4022124482812627e-06,
"loss": 0.505713701248169,
"step": 1819
},
{
"epoch": 2.3607461476074616,
"grad_norm": 0.5778226852416992,
"learning_rate": 2.3929229681898005e-06,
"loss": 0.5222234725952148,
"step": 1820
},
{
"epoch": 2.362043795620438,
"grad_norm": 0.5651443004608154,
"learning_rate": 2.3836490426454816e-06,
"loss": 0.49572640657424927,
"step": 1821
},
{
"epoch": 2.3633414436334146,
"grad_norm": 0.5689359307289124,
"learning_rate": 2.3743906906111415e-06,
"loss": 0.5316051840782166,
"step": 1822
},
{
"epoch": 2.364639091646391,
"grad_norm": 0.5702098608016968,
"learning_rate": 2.365147931017764e-06,
"loss": 0.4997398257255554,
"step": 1823
},
{
"epoch": 2.3659367396593676,
"grad_norm": 0.5760017037391663,
"learning_rate": 2.355920782764455e-06,
"loss": 0.48562324047088623,
"step": 1824
},
{
"epoch": 2.367234387672344,
"grad_norm": 0.5816190242767334,
"learning_rate": 2.3467092647183962e-06,
"loss": 0.4969868063926697,
"step": 1825
},
{
"epoch": 2.3685320356853206,
"grad_norm": 0.573274552822113,
"learning_rate": 2.337513395714812e-06,
"loss": 0.5109938383102417,
"step": 1826
},
{
"epoch": 2.369829683698297,
"grad_norm": 0.6311878561973572,
"learning_rate": 2.3283331945569256e-06,
"loss": 0.5642886161804199,
"step": 1827
},
{
"epoch": 2.371127331711273,
"grad_norm": 0.584414541721344,
"learning_rate": 2.3191686800159272e-06,
"loss": 0.4909813404083252,
"step": 1828
},
{
"epoch": 2.37242497972425,
"grad_norm": 0.5963045954704285,
"learning_rate": 2.310019870830923e-06,
"loss": 0.5222618579864502,
"step": 1829
},
{
"epoch": 2.373722627737226,
"grad_norm": 0.5990424752235413,
"learning_rate": 2.300886785708919e-06,
"loss": 0.527482271194458,
"step": 1830
},
{
"epoch": 2.375020275750203,
"grad_norm": 0.5891411900520325,
"learning_rate": 2.2917694433247626e-06,
"loss": 0.5050874948501587,
"step": 1831
},
{
"epoch": 2.376317923763179,
"grad_norm": 0.6118223071098328,
"learning_rate": 2.282667862321104e-06,
"loss": 0.5382136106491089,
"step": 1832
},
{
"epoch": 2.377615571776156,
"grad_norm": 0.6039783358573914,
"learning_rate": 2.2735820613083837e-06,
"loss": 0.5693233013153076,
"step": 1833
},
{
"epoch": 2.378913219789132,
"grad_norm": 0.5887247323989868,
"learning_rate": 2.264512058864755e-06,
"loss": 0.5109111666679382,
"step": 1834
},
{
"epoch": 2.3802108678021088,
"grad_norm": 0.5879799723625183,
"learning_rate": 2.2554578735360823e-06,
"loss": 0.5213186740875244,
"step": 1835
},
{
"epoch": 2.381508515815085,
"grad_norm": 0.5826606154441833,
"learning_rate": 2.246419523835882e-06,
"loss": 0.4647579789161682,
"step": 1836
},
{
"epoch": 2.3828061638280618,
"grad_norm": 0.5773786306381226,
"learning_rate": 2.2373970282452916e-06,
"loss": 0.4783990681171417,
"step": 1837
},
{
"epoch": 2.384103811841038,
"grad_norm": 0.5842030644416809,
"learning_rate": 2.2283904052130313e-06,
"loss": 0.5339592695236206,
"step": 1838
},
{
"epoch": 2.3854014598540147,
"grad_norm": 0.569379985332489,
"learning_rate": 2.2193996731553656e-06,
"loss": 0.4958034157752991,
"step": 1839
},
{
"epoch": 2.386699107866991,
"grad_norm": 0.6030622124671936,
"learning_rate": 2.2104248504560643e-06,
"loss": 0.4680197834968567,
"step": 1840
},
{
"epoch": 2.386699107866991,
"eval_loss": 0.6960097551345825,
"eval_runtime": 72.3931,
"eval_samples_per_second": 71.72,
"eval_steps_per_second": 8.965,
"step": 1840
},
{
"epoch": 2.3879967558799677,
"grad_norm": 0.5678315758705139,
"learning_rate": 2.2014659554663732e-06,
"loss": 0.5050360560417175,
"step": 1841
},
{
"epoch": 2.389294403892944,
"grad_norm": 0.5803557634353638,
"learning_rate": 2.192523006504956e-06,
"loss": 0.45793968439102173,
"step": 1842
},
{
"epoch": 2.3905920519059203,
"grad_norm": 0.5823774933815002,
"learning_rate": 2.183596021857891e-06,
"loss": 0.4527888596057892,
"step": 1843
},
{
"epoch": 2.391889699918897,
"grad_norm": 0.5696638226509094,
"learning_rate": 2.1746850197785928e-06,
"loss": 0.48019784688949585,
"step": 1844
},
{
"epoch": 2.3931873479318737,
"grad_norm": 0.5827446579933167,
"learning_rate": 2.16579001848781e-06,
"loss": 0.5040067434310913,
"step": 1845
},
{
"epoch": 2.39448499594485,
"grad_norm": 0.5871142148971558,
"learning_rate": 2.156911036173568e-06,
"loss": 0.47293055057525635,
"step": 1846
},
{
"epoch": 2.3957826439578263,
"grad_norm": 0.558737576007843,
"learning_rate": 2.1480480909911384e-06,
"loss": 0.47470247745513916,
"step": 1847
},
{
"epoch": 2.397080291970803,
"grad_norm": 0.5871817469596863,
"learning_rate": 2.139201201062999e-06,
"loss": 0.5189757347106934,
"step": 1848
},
{
"epoch": 2.3983779399837792,
"grad_norm": 0.5788654088973999,
"learning_rate": 2.130370384478807e-06,
"loss": 0.49212944507598877,
"step": 1849
},
{
"epoch": 2.399675587996756,
"grad_norm": 0.6011954545974731,
"learning_rate": 2.1215556592953357e-06,
"loss": 0.5247466564178467,
"step": 1850
},
{
"epoch": 2.4009732360097322,
"grad_norm": 0.5478853583335876,
"learning_rate": 2.11275704353648e-06,
"loss": 0.4548777937889099,
"step": 1851
},
{
"epoch": 2.402270884022709,
"grad_norm": 0.5758265852928162,
"learning_rate": 2.10397455519317e-06,
"loss": 0.5072181224822998,
"step": 1852
},
{
"epoch": 2.403568532035685,
"grad_norm": 0.5652422308921814,
"learning_rate": 2.095208212223383e-06,
"loss": 0.524145245552063,
"step": 1853
},
{
"epoch": 2.404866180048662,
"grad_norm": 0.5495245456695557,
"learning_rate": 2.0864580325520623e-06,
"loss": 0.47712084650993347,
"step": 1854
},
{
"epoch": 2.406163828061638,
"grad_norm": 0.5936484932899475,
"learning_rate": 2.077724034071116e-06,
"loss": 0.5134607553482056,
"step": 1855
},
{
"epoch": 2.407461476074615,
"grad_norm": 0.5818508863449097,
"learning_rate": 2.069006234639357e-06,
"loss": 0.46304088830947876,
"step": 1856
},
{
"epoch": 2.408759124087591,
"grad_norm": 0.6046934723854065,
"learning_rate": 2.060304652082481e-06,
"loss": 0.5234611630439758,
"step": 1857
},
{
"epoch": 2.410056772100568,
"grad_norm": 0.6409534215927124,
"learning_rate": 2.051619304193022e-06,
"loss": 0.5672463178634644,
"step": 1858
},
{
"epoch": 2.411354420113544,
"grad_norm": 0.5750660300254822,
"learning_rate": 2.0429502087303164e-06,
"loss": 0.4885750710964203,
"step": 1859
},
{
"epoch": 2.412652068126521,
"grad_norm": 0.6407312750816345,
"learning_rate": 2.0342973834204715e-06,
"loss": 0.4792509973049164,
"step": 1860
},
{
"epoch": 2.413949716139497,
"grad_norm": 0.5465012192726135,
"learning_rate": 2.0256608459563244e-06,
"loss": 0.4969291388988495,
"step": 1861
},
{
"epoch": 2.4152473641524734,
"grad_norm": 0.5713889002799988,
"learning_rate": 2.017040613997412e-06,
"loss": 0.48591309785842896,
"step": 1862
},
{
"epoch": 2.41654501216545,
"grad_norm": 0.5666239857673645,
"learning_rate": 2.008436705169917e-06,
"loss": 0.44293344020843506,
"step": 1863
},
{
"epoch": 2.4178426601784264,
"grad_norm": 0.5586820244789124,
"learning_rate": 1.9998491370666684e-06,
"loss": 0.45493143796920776,
"step": 1864
},
{
"epoch": 2.419140308191403,
"grad_norm": 0.5613408088684082,
"learning_rate": 1.991277927247056e-06,
"loss": 0.49673575162887573,
"step": 1865
},
{
"epoch": 2.4204379562043794,
"grad_norm": 0.5929522514343262,
"learning_rate": 1.9827230932370467e-06,
"loss": 0.5190791487693787,
"step": 1866
},
{
"epoch": 2.421735604217356,
"grad_norm": 0.5624476075172424,
"learning_rate": 1.9741846525291033e-06,
"loss": 0.4601350724697113,
"step": 1867
},
{
"epoch": 2.4230332522303324,
"grad_norm": 0.5859534740447998,
"learning_rate": 1.9656626225821774e-06,
"loss": 0.4977201819419861,
"step": 1868
},
{
"epoch": 2.424330900243309,
"grad_norm": 0.5921490788459778,
"learning_rate": 1.957157020821664e-06,
"loss": 0.5139193534851074,
"step": 1869
},
{
"epoch": 2.4256285482562854,
"grad_norm": 0.5974218845367432,
"learning_rate": 1.9486678646393654e-06,
"loss": 0.5071057081222534,
"step": 1870
},
{
"epoch": 2.426926196269262,
"grad_norm": 0.5919764041900635,
"learning_rate": 1.9401951713934574e-06,
"loss": 0.49057209491729736,
"step": 1871
},
{
"epoch": 2.4282238442822384,
"grad_norm": 0.5927568674087524,
"learning_rate": 1.931738958408457e-06,
"loss": 0.5092151165008545,
"step": 1872
},
{
"epoch": 2.429521492295215,
"grad_norm": 0.5767861604690552,
"learning_rate": 1.9232992429751694e-06,
"loss": 0.4838736355304718,
"step": 1873
},
{
"epoch": 2.4308191403081914,
"grad_norm": 0.5671409964561462,
"learning_rate": 1.9148760423506884e-06,
"loss": 0.4564237594604492,
"step": 1874
},
{
"epoch": 2.432116788321168,
"grad_norm": 0.5710315108299255,
"learning_rate": 1.9064693737583173e-06,
"loss": 0.5324878096580505,
"step": 1875
},
{
"epoch": 2.4334144363341443,
"grad_norm": 0.5930359959602356,
"learning_rate": 1.8980792543875758e-06,
"loss": 0.5325191617012024,
"step": 1876
},
{
"epoch": 2.4347120843471206,
"grad_norm": 0.5865573287010193,
"learning_rate": 1.8897057013941256e-06,
"loss": 0.4776073694229126,
"step": 1877
},
{
"epoch": 2.4360097323600973,
"grad_norm": 0.5611563920974731,
"learning_rate": 1.8813487318997658e-06,
"loss": 0.5060328841209412,
"step": 1878
},
{
"epoch": 2.437307380373074,
"grad_norm": 0.5972756147384644,
"learning_rate": 1.8730083629923857e-06,
"loss": 0.4804626405239105,
"step": 1879
},
{
"epoch": 2.4386050283860503,
"grad_norm": 0.5864998698234558,
"learning_rate": 1.8646846117259277e-06,
"loss": 0.49063995480537415,
"step": 1880
},
{
"epoch": 2.4399026763990266,
"grad_norm": 0.5928548574447632,
"learning_rate": 1.856377495120355e-06,
"loss": 0.5291346311569214,
"step": 1881
},
{
"epoch": 2.4412003244120033,
"grad_norm": 0.5551499724388123,
"learning_rate": 1.8480870301616227e-06,
"loss": 0.5005500912666321,
"step": 1882
},
{
"epoch": 2.4424979724249796,
"grad_norm": 0.5950235724449158,
"learning_rate": 1.839813233801626e-06,
"loss": 0.5388972759246826,
"step": 1883
},
{
"epoch": 2.4437956204379563,
"grad_norm": 0.5625823736190796,
"learning_rate": 1.8315561229581925e-06,
"loss": 0.49611175060272217,
"step": 1884
},
{
"epoch": 2.4450932684509326,
"grad_norm": 0.5934765934944153,
"learning_rate": 1.8233157145150183e-06,
"loss": 0.5419527292251587,
"step": 1885
},
{
"epoch": 2.4463909164639093,
"grad_norm": 0.5831634402275085,
"learning_rate": 1.8150920253216542e-06,
"loss": 0.5380743145942688,
"step": 1886
},
{
"epoch": 2.4476885644768855,
"grad_norm": 0.5773998498916626,
"learning_rate": 1.8068850721934639e-06,
"loss": 0.5360612869262695,
"step": 1887
},
{
"epoch": 2.4489862124898623,
"grad_norm": 0.5667778253555298,
"learning_rate": 1.7986948719115872e-06,
"loss": 0.4837849736213684,
"step": 1888
},
{
"epoch": 2.4502838605028385,
"grad_norm": 0.5844002962112427,
"learning_rate": 1.7905214412229177e-06,
"loss": 0.5097035765647888,
"step": 1889
},
{
"epoch": 2.4515815085158152,
"grad_norm": 0.571603536605835,
"learning_rate": 1.7823647968400437e-06,
"loss": 0.4986342787742615,
"step": 1890
},
{
"epoch": 2.4528791565287915,
"grad_norm": 0.5814788341522217,
"learning_rate": 1.7742249554412426e-06,
"loss": 0.5466139316558838,
"step": 1891
},
{
"epoch": 2.4541768045417682,
"grad_norm": 0.602313756942749,
"learning_rate": 1.76610193367043e-06,
"loss": 0.5179327726364136,
"step": 1892
},
{
"epoch": 2.4554744525547445,
"grad_norm": 0.5728641748428345,
"learning_rate": 1.757995748137129e-06,
"loss": 0.4758206903934479,
"step": 1893
},
{
"epoch": 2.456772100567721,
"grad_norm": 0.5834367871284485,
"learning_rate": 1.7499064154164358e-06,
"loss": 0.48661813139915466,
"step": 1894
},
{
"epoch": 2.4580697485806975,
"grad_norm": 0.6014889478683472,
"learning_rate": 1.7418339520489936e-06,
"loss": 0.5374865531921387,
"step": 1895
},
{
"epoch": 2.4593673965936738,
"grad_norm": 0.5678799152374268,
"learning_rate": 1.7337783745409363e-06,
"loss": 0.47202199697494507,
"step": 1896
},
{
"epoch": 2.4606650446066505,
"grad_norm": 0.5770121216773987,
"learning_rate": 1.7257396993638942e-06,
"loss": 0.4832342565059662,
"step": 1897
},
{
"epoch": 2.4619626926196267,
"grad_norm": 0.5571733713150024,
"learning_rate": 1.717717942954914e-06,
"loss": 0.5462654829025269,
"step": 1898
},
{
"epoch": 2.4632603406326035,
"grad_norm": 0.5752882361412048,
"learning_rate": 1.7097131217164598e-06,
"loss": 0.5042911171913147,
"step": 1899
},
{
"epoch": 2.4645579886455797,
"grad_norm": 0.5651837587356567,
"learning_rate": 1.7017252520163652e-06,
"loss": 0.5055532455444336,
"step": 1900
},
{
"epoch": 2.4658556366585564,
"grad_norm": 0.5626855492591858,
"learning_rate": 1.6937543501878018e-06,
"loss": 0.5025293827056885,
"step": 1901
},
{
"epoch": 2.4671532846715327,
"grad_norm": 0.5588532090187073,
"learning_rate": 1.6858004325292466e-06,
"loss": 0.5056187510490417,
"step": 1902
},
{
"epoch": 2.4684509326845094,
"grad_norm": 0.6047312021255493,
"learning_rate": 1.6778635153044486e-06,
"loss": 0.5340344309806824,
"step": 1903
},
{
"epoch": 2.4697485806974857,
"grad_norm": 0.5701199769973755,
"learning_rate": 1.6699436147423942e-06,
"loss": 0.47314453125,
"step": 1904
},
{
"epoch": 2.4710462287104624,
"grad_norm": 0.5887412428855896,
"learning_rate": 1.662040747037277e-06,
"loss": 0.5806034207344055,
"step": 1905
},
{
"epoch": 2.4723438767234387,
"grad_norm": 0.5856630206108093,
"learning_rate": 1.654154928348455e-06,
"loss": 0.542724609375,
"step": 1906
},
{
"epoch": 2.4736415247364154,
"grad_norm": 0.5869402885437012,
"learning_rate": 1.646286174800441e-06,
"loss": 0.5193344354629517,
"step": 1907
},
{
"epoch": 2.4749391727493917,
"grad_norm": 0.5962528586387634,
"learning_rate": 1.6384345024828374e-06,
"loss": 0.49579355120658875,
"step": 1908
},
{
"epoch": 2.4762368207623684,
"grad_norm": 0.5732969641685486,
"learning_rate": 1.6305999274503282e-06,
"loss": 0.4678477346897125,
"step": 1909
},
{
"epoch": 2.4775344687753447,
"grad_norm": 0.5851303339004517,
"learning_rate": 1.6227824657226366e-06,
"loss": 0.4453192949295044,
"step": 1910
},
{
"epoch": 2.478832116788321,
"grad_norm": 0.5631725192070007,
"learning_rate": 1.614982133284495e-06,
"loss": 0.47414714097976685,
"step": 1911
},
{
"epoch": 2.4801297648012977,
"grad_norm": 0.5917407274246216,
"learning_rate": 1.6071989460856063e-06,
"loss": 0.51967453956604,
"step": 1912
},
{
"epoch": 2.4814274128142744,
"grad_norm": 0.5762115716934204,
"learning_rate": 1.5994329200406223e-06,
"loss": 0.47164011001586914,
"step": 1913
},
{
"epoch": 2.4827250608272506,
"grad_norm": 0.5615324974060059,
"learning_rate": 1.5916840710290937e-06,
"loss": 0.5057311058044434,
"step": 1914
},
{
"epoch": 2.484022708840227,
"grad_norm": 0.5691003203392029,
"learning_rate": 1.5839524148954622e-06,
"loss": 0.46432405710220337,
"step": 1915
},
{
"epoch": 2.4853203568532036,
"grad_norm": 0.5725374221801758,
"learning_rate": 1.5762379674490048e-06,
"loss": 0.46116703748703003,
"step": 1916
},
{
"epoch": 2.48661800486618,
"grad_norm": 0.6240981221199036,
"learning_rate": 1.5685407444638146e-06,
"loss": 0.5304262638092041,
"step": 1917
},
{
"epoch": 2.4879156528791566,
"grad_norm": 0.5866638422012329,
"learning_rate": 1.5608607616787663e-06,
"loss": 0.46918168663978577,
"step": 1918
},
{
"epoch": 2.489213300892133,
"grad_norm": 0.6103445291519165,
"learning_rate": 1.553198034797474e-06,
"loss": 0.5785281658172607,
"step": 1919
},
{
"epoch": 2.4905109489051096,
"grad_norm": 0.5748964548110962,
"learning_rate": 1.5455525794882841e-06,
"loss": 0.47489288449287415,
"step": 1920
},
{
"epoch": 2.491808596918086,
"grad_norm": 0.5849605202674866,
"learning_rate": 1.5379244113842106e-06,
"loss": 0.5081884860992432,
"step": 1921
},
{
"epoch": 2.4931062449310626,
"grad_norm": 0.5827904343605042,
"learning_rate": 1.53031354608293e-06,
"loss": 0.5528438091278076,
"step": 1922
},
{
"epoch": 2.494403892944039,
"grad_norm": 0.5817930102348328,
"learning_rate": 1.5227199991467335e-06,
"loss": 0.5150377154350281,
"step": 1923
},
{
"epoch": 2.4957015409570156,
"grad_norm": 0.5756059288978577,
"learning_rate": 1.5151437861025032e-06,
"loss": 0.4410705268383026,
"step": 1924
},
{
"epoch": 2.496999188969992,
"grad_norm": 0.5646528005599976,
"learning_rate": 1.5075849224416783e-06,
"loss": 0.5073448419570923,
"step": 1925
},
{
"epoch": 2.4982968369829686,
"grad_norm": 0.5877253413200378,
"learning_rate": 1.5000434236202211e-06,
"loss": 0.5140043497085571,
"step": 1926
},
{
"epoch": 2.499594484995945,
"grad_norm": 0.5703092813491821,
"learning_rate": 1.4925193050585873e-06,
"loss": 0.5106258392333984,
"step": 1927
},
{
"epoch": 2.5008921330089215,
"grad_norm": 0.5841608643531799,
"learning_rate": 1.4850125821416983e-06,
"loss": 0.49111461639404297,
"step": 1928
},
{
"epoch": 2.502189781021898,
"grad_norm": 0.5806940197944641,
"learning_rate": 1.4775232702188947e-06,
"loss": 0.477137953042984,
"step": 1929
},
{
"epoch": 2.503487429034874,
"grad_norm": 0.5762841105461121,
"learning_rate": 1.4700513846039332e-06,
"loss": 0.4592735469341278,
"step": 1930
},
{
"epoch": 2.504785077047851,
"grad_norm": 0.5808306932449341,
"learning_rate": 1.4625969405749218e-06,
"loss": 0.5200600624084473,
"step": 1931
},
{
"epoch": 2.5060827250608275,
"grad_norm": 0.5846347212791443,
"learning_rate": 1.4551599533743155e-06,
"loss": 0.5185432434082031,
"step": 1932
},
{
"epoch": 2.507380373073804,
"grad_norm": 0.6160796284675598,
"learning_rate": 1.4477404382088689e-06,
"loss": 0.5391091108322144,
"step": 1933
},
{
"epoch": 2.50867802108678,
"grad_norm": 0.5582398176193237,
"learning_rate": 1.4403384102496132e-06,
"loss": 0.4704029858112335,
"step": 1934
},
{
"epoch": 2.509975669099757,
"grad_norm": 0.5653654932975769,
"learning_rate": 1.4329538846318225e-06,
"loss": 0.524503231048584,
"step": 1935
},
{
"epoch": 2.511273317112733,
"grad_norm": 0.5886475443840027,
"learning_rate": 1.4255868764549852e-06,
"loss": 0.4819219708442688,
"step": 1936
},
{
"epoch": 2.5125709651257098,
"grad_norm": 0.5662146806716919,
"learning_rate": 1.4182374007827605e-06,
"loss": 0.5265961289405823,
"step": 1937
},
{
"epoch": 2.513868613138686,
"grad_norm": 0.5975550413131714,
"learning_rate": 1.410905472642975e-06,
"loss": 0.5036963224411011,
"step": 1938
},
{
"epoch": 2.5151662611516628,
"grad_norm": 0.5727776885032654,
"learning_rate": 1.4035911070275576e-06,
"loss": 0.4989280104637146,
"step": 1939
},
{
"epoch": 2.516463909164639,
"grad_norm": 0.6097977161407471,
"learning_rate": 1.3962943188925438e-06,
"loss": 0.535049557685852,
"step": 1940
},
{
"epoch": 2.5177615571776153,
"grad_norm": 0.5695138573646545,
"learning_rate": 1.3890151231580117e-06,
"loss": 0.5146960020065308,
"step": 1941
},
{
"epoch": 2.519059205190592,
"grad_norm": 0.5890569686889648,
"learning_rate": 1.3817535347080768e-06,
"loss": 0.5350029468536377,
"step": 1942
},
{
"epoch": 2.5203568532035687,
"grad_norm": 0.5916978120803833,
"learning_rate": 1.3745095683908482e-06,
"loss": 0.5213718414306641,
"step": 1943
},
{
"epoch": 2.521654501216545,
"grad_norm": 0.5767956972122192,
"learning_rate": 1.3672832390184042e-06,
"loss": 0.506149411201477,
"step": 1944
},
{
"epoch": 2.5229521492295213,
"grad_norm": 0.5916143655776978,
"learning_rate": 1.3600745613667598e-06,
"loss": 0.5128974318504333,
"step": 1945
},
{
"epoch": 2.524249797242498,
"grad_norm": 0.5634325742721558,
"learning_rate": 1.3528835501758365e-06,
"loss": 0.5004685521125793,
"step": 1946
},
{
"epoch": 2.5255474452554747,
"grad_norm": 0.5783470869064331,
"learning_rate": 1.345710220149431e-06,
"loss": 0.5014833807945251,
"step": 1947
},
{
"epoch": 2.526845093268451,
"grad_norm": 0.5838568210601807,
"learning_rate": 1.3385545859551886e-06,
"loss": 0.540973424911499,
"step": 1948
},
{
"epoch": 2.5281427412814272,
"grad_norm": 0.5862357020378113,
"learning_rate": 1.3314166622245717e-06,
"loss": 0.5124210715293884,
"step": 1949
},
{
"epoch": 2.529440389294404,
"grad_norm": 0.5789701342582703,
"learning_rate": 1.324296463552821e-06,
"loss": 0.4796435236930847,
"step": 1950
},
{
"epoch": 2.5307380373073802,
"grad_norm": 0.5998684167861938,
"learning_rate": 1.3171940044989495e-06,
"loss": 0.5745923519134521,
"step": 1951
},
{
"epoch": 2.532035685320357,
"grad_norm": 0.5753020644187927,
"learning_rate": 1.3101092995856802e-06,
"loss": 0.4947076439857483,
"step": 1952
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.5820896029472351,
"learning_rate": 1.3030423632994493e-06,
"loss": 0.4961175322532654,
"step": 1953
},
{
"epoch": 2.53463098134631,
"grad_norm": 0.5677821040153503,
"learning_rate": 1.2959932100903472e-06,
"loss": 0.49631717801094055,
"step": 1954
},
{
"epoch": 2.535928629359286,
"grad_norm": 0.5767098665237427,
"learning_rate": 1.2889618543721094e-06,
"loss": 0.5189783573150635,
"step": 1955
},
{
"epoch": 2.537226277372263,
"grad_norm": 0.5949708819389343,
"learning_rate": 1.2819483105220798e-06,
"loss": 0.5087240934371948,
"step": 1956
},
{
"epoch": 2.538523925385239,
"grad_norm": 0.582380473613739,
"learning_rate": 1.274952592881179e-06,
"loss": 0.48820894956588745,
"step": 1957
},
{
"epoch": 2.539821573398216,
"grad_norm": 0.578072726726532,
"learning_rate": 1.2679747157538801e-06,
"loss": 0.5089854598045349,
"step": 1958
},
{
"epoch": 2.541119221411192,
"grad_norm": 0.5774610042572021,
"learning_rate": 1.2610146934081768e-06,
"loss": 0.49252915382385254,
"step": 1959
},
{
"epoch": 2.5424168694241684,
"grad_norm": 0.58970707654953,
"learning_rate": 1.2540725400755472e-06,
"loss": 0.5605252981185913,
"step": 1960
},
{
"epoch": 2.543714517437145,
"grad_norm": 0.566736102104187,
"learning_rate": 1.2471482699509463e-06,
"loss": 0.5428552627563477,
"step": 1961
},
{
"epoch": 2.545012165450122,
"grad_norm": 0.5720308423042297,
"learning_rate": 1.2402418971927487e-06,
"loss": 0.5265427827835083,
"step": 1962
},
{
"epoch": 2.546309813463098,
"grad_norm": 0.5800856351852417,
"learning_rate": 1.2333534359227383e-06,
"loss": 0.5138852596282959,
"step": 1963
},
{
"epoch": 2.5476074614760744,
"grad_norm": 0.5780075788497925,
"learning_rate": 1.226482900226077e-06,
"loss": 0.48286569118499756,
"step": 1964
},
{
"epoch": 2.548905109489051,
"grad_norm": 0.5666484236717224,
"learning_rate": 1.2196303041512714e-06,
"loss": 0.5184611082077026,
"step": 1965
},
{
"epoch": 2.550202757502028,
"grad_norm": 0.5936673879623413,
"learning_rate": 1.2127956617101445e-06,
"loss": 0.5331882238388062,
"step": 1966
},
{
"epoch": 2.551500405515004,
"grad_norm": 0.5658625364303589,
"learning_rate": 1.2059789868778116e-06,
"loss": 0.5007424354553223,
"step": 1967
},
{
"epoch": 2.5527980535279804,
"grad_norm": 0.5596531629562378,
"learning_rate": 1.1991802935926455e-06,
"loss": 0.4455481767654419,
"step": 1968
},
{
"epoch": 2.554095701540957,
"grad_norm": 0.5873602628707886,
"learning_rate": 1.1923995957562585e-06,
"loss": 0.4800918698310852,
"step": 1969
},
{
"epoch": 2.5553933495539334,
"grad_norm": 0.5768440961837769,
"learning_rate": 1.1856369072334517e-06,
"loss": 0.5240867733955383,
"step": 1970
},
{
"epoch": 2.55669099756691,
"grad_norm": 0.5888426899909973,
"learning_rate": 1.178892241852222e-06,
"loss": 0.4650096893310547,
"step": 1971
},
{
"epoch": 2.5579886455798864,
"grad_norm": 0.5748341083526611,
"learning_rate": 1.1721656134036962e-06,
"loss": 0.5009864568710327,
"step": 1972
},
{
"epoch": 2.559286293592863,
"grad_norm": 0.5902095437049866,
"learning_rate": 1.165457035642128e-06,
"loss": 0.5109707117080688,
"step": 1973
},
{
"epoch": 2.5605839416058394,
"grad_norm": 0.6467815041542053,
"learning_rate": 1.1587665222848643e-06,
"loss": 0.4991541802883148,
"step": 1974
},
{
"epoch": 2.5618815896188156,
"grad_norm": 0.5866140127182007,
"learning_rate": 1.1520940870123065e-06,
"loss": 0.48706984519958496,
"step": 1975
},
{
"epoch": 2.5631792376317923,
"grad_norm": 0.5842229127883911,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.5219037532806396,
"step": 1976
},
{
"epoch": 2.564476885644769,
"grad_norm": 0.5731110572814941,
"learning_rate": 1.1388035052580936e-06,
"loss": 0.5115249156951904,
"step": 1977
},
{
"epoch": 2.5657745336577453,
"grad_norm": 0.5784810185432434,
"learning_rate": 1.1321853859523113e-06,
"loss": 0.49307000637054443,
"step": 1978
},
{
"epoch": 2.5670721816707216,
"grad_norm": 0.5523423552513123,
"learning_rate": 1.1255853990829323e-06,
"loss": 0.4534381031990051,
"step": 1979
},
{
"epoch": 2.5683698296836983,
"grad_norm": 0.576626718044281,
"learning_rate": 1.119003558145262e-06,
"loss": 0.5025165677070618,
"step": 1980
},
{
"epoch": 2.569667477696675,
"grad_norm": 0.6068827509880066,
"learning_rate": 1.1124398765974976e-06,
"loss": 0.5154992341995239,
"step": 1981
},
{
"epoch": 2.5709651257096513,
"grad_norm": 0.5544149875640869,
"learning_rate": 1.1058943678607082e-06,
"loss": 0.4641039967536926,
"step": 1982
},
{
"epoch": 2.5722627737226276,
"grad_norm": 0.591013491153717,
"learning_rate": 1.0993670453187965e-06,
"loss": 0.5354744791984558,
"step": 1983
},
{
"epoch": 2.5735604217356043,
"grad_norm": 0.5729239583015442,
"learning_rate": 1.0928579223184943e-06,
"loss": 0.4895523190498352,
"step": 1984
},
{
"epoch": 2.5748580697485806,
"grad_norm": 0.5629091858863831,
"learning_rate": 1.0863670121693037e-06,
"loss": 0.4998272955417633,
"step": 1985
},
{
"epoch": 2.5761557177615573,
"grad_norm": 0.5692305564880371,
"learning_rate": 1.0798943281434958e-06,
"loss": 0.5316153764724731,
"step": 1986
},
{
"epoch": 2.5774533657745335,
"grad_norm": 0.5875282287597656,
"learning_rate": 1.0734398834760695e-06,
"loss": 0.47188982367515564,
"step": 1987
},
{
"epoch": 2.5787510137875103,
"grad_norm": 0.613525927066803,
"learning_rate": 1.067003691364733e-06,
"loss": 0.5325276851654053,
"step": 1988
},
{
"epoch": 2.5800486618004865,
"grad_norm": 0.5971388816833496,
"learning_rate": 1.060585764969867e-06,
"loss": 0.5428590774536133,
"step": 1989
},
{
"epoch": 2.5813463098134632,
"grad_norm": 0.5674665570259094,
"learning_rate": 1.0541861174145097e-06,
"loss": 0.47022098302841187,
"step": 1990
},
{
"epoch": 2.5826439578264395,
"grad_norm": 0.5619399547576904,
"learning_rate": 1.047804761784319e-06,
"loss": 0.48155295848846436,
"step": 1991
},
{
"epoch": 2.5839416058394162,
"grad_norm": 0.5751737952232361,
"learning_rate": 1.0414417111275533e-06,
"loss": 0.5390469431877136,
"step": 1992
},
{
"epoch": 2.5852392538523925,
"grad_norm": 0.5782447457313538,
"learning_rate": 1.0350969784550368e-06,
"loss": 0.5048004984855652,
"step": 1993
},
{
"epoch": 2.5865369018653688,
"grad_norm": 0.5656158328056335,
"learning_rate": 1.028770576740148e-06,
"loss": 0.5237029194831848,
"step": 1994
},
{
"epoch": 2.5878345498783455,
"grad_norm": 0.568681538105011,
"learning_rate": 1.022462518918772e-06,
"loss": 0.4539422392845154,
"step": 1995
},
{
"epoch": 2.589132197891322,
"grad_norm": 0.560100793838501,
"learning_rate": 1.0161728178892928e-06,
"loss": 0.45414865016937256,
"step": 1996
},
{
"epoch": 2.5904298459042985,
"grad_norm": 0.5668950080871582,
"learning_rate": 1.0099014865125557e-06,
"loss": 0.4774186611175537,
"step": 1997
},
{
"epoch": 2.5917274939172747,
"grad_norm": 0.606434166431427,
"learning_rate": 1.0036485376118477e-06,
"loss": 0.565065324306488,
"step": 1998
},
{
"epoch": 2.5930251419302515,
"grad_norm": 0.5841239094734192,
"learning_rate": 9.974139839728658e-07,
"loss": 0.5483173131942749,
"step": 1999
},
{
"epoch": 2.5943227899432277,
"grad_norm": 0.591903805732727,
"learning_rate": 9.91197838343696e-07,
"loss": 0.539207398891449,
"step": 2000
},
{
"epoch": 2.5956204379562045,
"grad_norm": 0.5807414650917053,
"learning_rate": 9.850001134347765e-07,
"loss": 0.5179691314697266,
"step": 2001
},
{
"epoch": 2.5969180859691807,
"grad_norm": 0.5769233107566833,
"learning_rate": 9.788208219188932e-07,
"loss": 0.4748839735984802,
"step": 2002
},
{
"epoch": 2.5982157339821574,
"grad_norm": 0.5766239762306213,
"learning_rate": 9.726599764311318e-07,
"loss": 0.48025619983673096,
"step": 2003
},
{
"epoch": 2.5995133819951337,
"grad_norm": 0.5754262208938599,
"learning_rate": 9.665175895688594e-07,
"loss": 0.47812211513519287,
"step": 2004
},
{
"epoch": 2.6008110300081104,
"grad_norm": 0.5699096918106079,
"learning_rate": 9.603936738917063e-07,
"loss": 0.5337727069854736,
"step": 2005
},
{
"epoch": 2.6021086780210867,
"grad_norm": 0.6039567589759827,
"learning_rate": 9.54288241921525e-07,
"loss": 0.5216813087463379,
"step": 2006
},
{
"epoch": 2.6034063260340634,
"grad_norm": 0.5594240427017212,
"learning_rate": 9.482013061423833e-07,
"loss": 0.5251287221908569,
"step": 2007
},
{
"epoch": 2.6047039740470397,
"grad_norm": 0.5856126546859741,
"learning_rate": 9.421328790005213e-07,
"loss": 0.5040426850318909,
"step": 2008
},
{
"epoch": 2.606001622060016,
"grad_norm": 0.5794676542282104,
"learning_rate": 9.360829729043375e-07,
"loss": 0.5068378448486328,
"step": 2009
},
{
"epoch": 2.6072992700729927,
"grad_norm": 0.5879704356193542,
"learning_rate": 9.300516002243587e-07,
"loss": 0.5116778016090393,
"step": 2010
},
{
"epoch": 2.6085969180859694,
"grad_norm": 0.5978105068206787,
"learning_rate": 9.240387732932155e-07,
"loss": 0.525846004486084,
"step": 2011
},
{
"epoch": 2.6098945660989457,
"grad_norm": 0.5788280367851257,
"learning_rate": 9.180445044056164e-07,
"loss": 0.5172775983810425,
"step": 2012
},
{
"epoch": 2.611192214111922,
"grad_norm": 0.5901548862457275,
"learning_rate": 9.120688058183269e-07,
"loss": 0.5301088094711304,
"step": 2013
},
{
"epoch": 2.6124898621248986,
"grad_norm": 0.5967061519622803,
"learning_rate": 9.061116897501321e-07,
"loss": 0.5318504571914673,
"step": 2014
},
{
"epoch": 2.6137875101378754,
"grad_norm": 0.5555222034454346,
"learning_rate": 9.001731683818338e-07,
"loss": 0.5011588335037231,
"step": 2015
},
{
"epoch": 2.6150851581508516,
"grad_norm": 0.613298237323761,
"learning_rate": 8.942532538561988e-07,
"loss": 0.5700482130050659,
"step": 2016
},
{
"epoch": 2.616382806163828,
"grad_norm": 0.599183201789856,
"learning_rate": 8.883519582779598e-07,
"loss": 0.5524272322654724,
"step": 2017
},
{
"epoch": 2.6176804541768046,
"grad_norm": 0.6120027899742126,
"learning_rate": 8.82469293713768e-07,
"loss": 0.47205424308776855,
"step": 2018
},
{
"epoch": 2.618978102189781,
"grad_norm": 0.5907730460166931,
"learning_rate": 8.766052721921858e-07,
"loss": 0.507009744644165,
"step": 2019
},
{
"epoch": 2.6202757502027576,
"grad_norm": 0.5603318810462952,
"learning_rate": 8.70759905703652e-07,
"loss": 0.48432788252830505,
"step": 2020
},
{
"epoch": 2.621573398215734,
"grad_norm": 0.5962936282157898,
"learning_rate": 8.649332062004622e-07,
"loss": 0.4898841381072998,
"step": 2021
},
{
"epoch": 2.6228710462287106,
"grad_norm": 0.7598771452903748,
"learning_rate": 8.59125185596742e-07,
"loss": 0.5321274995803833,
"step": 2022
},
{
"epoch": 2.624168694241687,
"grad_norm": 0.5821399092674255,
"learning_rate": 8.533358557684246e-07,
"loss": 0.512812614440918,
"step": 2023
},
{
"epoch": 2.6254663422546636,
"grad_norm": 0.5900049805641174,
"learning_rate": 8.475652285532199e-07,
"loss": 0.5129188299179077,
"step": 2024
},
{
"epoch": 2.62676399026764,
"grad_norm": 0.5779396295547485,
"learning_rate": 8.41813315750607e-07,
"loss": 0.4839695394039154,
"step": 2025
},
{
"epoch": 2.6280616382806166,
"grad_norm": 0.581840455532074,
"learning_rate": 8.360801291217835e-07,
"loss": 0.4942781925201416,
"step": 2026
},
{
"epoch": 2.629359286293593,
"grad_norm": 0.5503793954849243,
"learning_rate": 8.303656803896731e-07,
"loss": 0.4754694700241089,
"step": 2027
},
{
"epoch": 2.630656934306569,
"grad_norm": 0.5595881342887878,
"learning_rate": 8.246699812388714e-07,
"loss": 0.48087698221206665,
"step": 2028
},
{
"epoch": 2.631954582319546,
"grad_norm": 0.5697108507156372,
"learning_rate": 8.189930433156424e-07,
"loss": 0.5032870173454285,
"step": 2029
},
{
"epoch": 2.6332522303325225,
"grad_norm": 0.5761867761611938,
"learning_rate": 8.133348782278916e-07,
"loss": 0.5013032555580139,
"step": 2030
},
{
"epoch": 2.634549878345499,
"grad_norm": 0.6058787703514099,
"learning_rate": 8.07695497545129e-07,
"loss": 0.44857025146484375,
"step": 2031
},
{
"epoch": 2.635847526358475,
"grad_norm": 0.5961512327194214,
"learning_rate": 8.020749127984629e-07,
"loss": 0.5228594541549683,
"step": 2032
},
{
"epoch": 2.637145174371452,
"grad_norm": 0.5766192078590393,
"learning_rate": 7.964731354805677e-07,
"loss": 0.4745315611362457,
"step": 2033
},
{
"epoch": 2.638442822384428,
"grad_norm": 0.5896121859550476,
"learning_rate": 7.908901770456579e-07,
"loss": 0.519614577293396,
"step": 2034
},
{
"epoch": 2.639740470397405,
"grad_norm": 0.5732361078262329,
"learning_rate": 7.853260489094727e-07,
"loss": 0.48370620608329773,
"step": 2035
},
{
"epoch": 2.641038118410381,
"grad_norm": 0.5929004549980164,
"learning_rate": 7.79780762449246e-07,
"loss": 0.5153477191925049,
"step": 2036
},
{
"epoch": 2.6423357664233578,
"grad_norm": 0.587020754814148,
"learning_rate": 7.742543290036797e-07,
"loss": 0.4829615652561188,
"step": 2037
},
{
"epoch": 2.643633414436334,
"grad_norm": 0.5629860758781433,
"learning_rate": 7.687467598729403e-07,
"loss": 0.5223960876464844,
"step": 2038
},
{
"epoch": 2.6449310624493108,
"grad_norm": 0.5553507208824158,
"learning_rate": 7.63258066318604e-07,
"loss": 0.4827447235584259,
"step": 2039
},
{
"epoch": 2.646228710462287,
"grad_norm": 0.5940564274787903,
"learning_rate": 7.577882595636665e-07,
"loss": 0.538356602191925,
"step": 2040
},
{
"epoch": 2.6475263584752637,
"grad_norm": 0.5712041258811951,
"learning_rate": 7.523373507924947e-07,
"loss": 0.48258891701698303,
"step": 2041
},
{
"epoch": 2.64882400648824,
"grad_norm": 0.5664177536964417,
"learning_rate": 7.469053511508184e-07,
"loss": 0.4672595262527466,
"step": 2042
},
{
"epoch": 2.6501216545012163,
"grad_norm": 0.6014147996902466,
"learning_rate": 7.414922717457018e-07,
"loss": 0.5549574494361877,
"step": 2043
},
{
"epoch": 2.651419302514193,
"grad_norm": 0.588028073310852,
"learning_rate": 7.360981236455222e-07,
"loss": 0.5366802215576172,
"step": 2044
},
{
"epoch": 2.6527169505271697,
"grad_norm": 0.5555592179298401,
"learning_rate": 7.307229178799469e-07,
"loss": 0.49787813425064087,
"step": 2045
},
{
"epoch": 2.654014598540146,
"grad_norm": 0.5918477177619934,
"learning_rate": 7.253666654399128e-07,
"loss": 0.5271812081336975,
"step": 2046
},
{
"epoch": 2.6553122465531223,
"grad_norm": 0.6544379591941833,
"learning_rate": 7.200293772775968e-07,
"loss": 0.5332372784614563,
"step": 2047
},
{
"epoch": 2.656609894566099,
"grad_norm": 0.578555166721344,
"learning_rate": 7.14711064306407e-07,
"loss": 0.496245801448822,
"step": 2048
},
{
"epoch": 2.6579075425790757,
"grad_norm": 0.5929746627807617,
"learning_rate": 7.094117374009446e-07,
"loss": 0.5187441110610962,
"step": 2049
},
{
"epoch": 2.659205190592052,
"grad_norm": 0.5854722261428833,
"learning_rate": 7.041314073969918e-07,
"loss": 0.4945400655269623,
"step": 2050
},
{
"epoch": 2.6605028386050282,
"grad_norm": 0.6011053323745728,
"learning_rate": 6.988700850914876e-07,
"loss": 0.48466387391090393,
"step": 2051
},
{
"epoch": 2.661800486618005,
"grad_norm": 0.5774915814399719,
"learning_rate": 6.93627781242504e-07,
"loss": 0.5133316516876221,
"step": 2052
},
{
"epoch": 2.663098134630981,
"grad_norm": 0.5776026248931885,
"learning_rate": 6.884045065692257e-07,
"loss": 0.5115536451339722,
"step": 2053
},
{
"epoch": 2.664395782643958,
"grad_norm": 0.6011329293251038,
"learning_rate": 6.83200271751927e-07,
"loss": 0.5355618000030518,
"step": 2054
},
{
"epoch": 2.665693430656934,
"grad_norm": 0.5973834991455078,
"learning_rate": 6.780150874319524e-07,
"loss": 0.5230112075805664,
"step": 2055
},
{
"epoch": 2.666991078669911,
"grad_norm": 0.5917934775352478,
"learning_rate": 6.72848964211692e-07,
"loss": 0.5399461388587952,
"step": 2056
},
{
"epoch": 2.668288726682887,
"grad_norm": 0.5736814141273499,
"learning_rate": 6.677019126545548e-07,
"loss": 0.49193501472473145,
"step": 2057
},
{
"epoch": 2.669586374695864,
"grad_norm": 0.5814056396484375,
"learning_rate": 6.625739432849643e-07,
"loss": 0.5203338861465454,
"step": 2058
},
{
"epoch": 2.67088402270884,
"grad_norm": 0.601714015007019,
"learning_rate": 6.574650665883197e-07,
"loss": 0.5449438095092773,
"step": 2059
},
{
"epoch": 2.672181670721817,
"grad_norm": 0.5884926319122314,
"learning_rate": 6.523752930109761e-07,
"loss": 0.5138452053070068,
"step": 2060
},
{
"epoch": 2.673479318734793,
"grad_norm": 0.5702131390571594,
"learning_rate": 6.473046329602384e-07,
"loss": 0.4545958638191223,
"step": 2061
},
{
"epoch": 2.6747769667477694,
"grad_norm": 0.5839261412620544,
"learning_rate": 6.422530968043173e-07,
"loss": 0.5412476658821106,
"step": 2062
},
{
"epoch": 2.676074614760746,
"grad_norm": 0.5880113244056702,
"learning_rate": 6.372206948723292e-07,
"loss": 0.5263261795043945,
"step": 2063
},
{
"epoch": 2.677372262773723,
"grad_norm": 0.5763228535652161,
"learning_rate": 6.322074374542608e-07,
"loss": 0.5082492828369141,
"step": 2064
},
{
"epoch": 2.678669910786699,
"grad_norm": 0.5878806710243225,
"learning_rate": 6.272133348009546e-07,
"loss": 0.5076773166656494,
"step": 2065
},
{
"epoch": 2.6799675587996754,
"grad_norm": 0.5525650978088379,
"learning_rate": 6.222383971240875e-07,
"loss": 0.48154234886169434,
"step": 2066
},
{
"epoch": 2.681265206812652,
"grad_norm": 0.6016013622283936,
"learning_rate": 6.17282634596148e-07,
"loss": 0.503459095954895,
"step": 2067
},
{
"epoch": 2.6825628548256284,
"grad_norm": 0.6026131510734558,
"learning_rate": 6.123460573504147e-07,
"loss": 0.4821071922779083,
"step": 2068
},
{
"epoch": 2.683860502838605,
"grad_norm": 0.5926850438117981,
"learning_rate": 6.074286754809411e-07,
"loss": 0.5161428451538086,
"step": 2069
},
{
"epoch": 2.6851581508515814,
"grad_norm": 0.5853096842765808,
"learning_rate": 6.025304990425241e-07,
"loss": 0.5262787342071533,
"step": 2070
},
{
"epoch": 2.6851581508515814,
"eval_loss": 0.6954009532928467,
"eval_runtime": 72.3609,
"eval_samples_per_second": 71.751,
"eval_steps_per_second": 8.969,
"step": 2070
},
{
"epoch": 2.686455798864558,
"grad_norm": 0.5976012945175171,
"learning_rate": 5.976515380507008e-07,
"loss": 0.5311732888221741,
"step": 2071
},
{
"epoch": 2.6877534468775344,
"grad_norm": 0.5981724262237549,
"learning_rate": 5.927918024817059e-07,
"loss": 0.5703781247138977,
"step": 2072
},
{
"epoch": 2.689051094890511,
"grad_norm": 0.5645772814750671,
"learning_rate": 5.879513022724714e-07,
"loss": 0.4812767505645752,
"step": 2073
},
{
"epoch": 2.6903487429034874,
"grad_norm": 0.5886021852493286,
"learning_rate": 5.831300473205948e-07,
"loss": 0.5149608254432678,
"step": 2074
},
{
"epoch": 2.691646390916464,
"grad_norm": 0.5895439982414246,
"learning_rate": 5.783280474843222e-07,
"loss": 0.5148745179176331,
"step": 2075
},
{
"epoch": 2.6929440389294403,
"grad_norm": 0.571723461151123,
"learning_rate": 5.735453125825275e-07,
"loss": 0.5035296082496643,
"step": 2076
},
{
"epoch": 2.6942416869424166,
"grad_norm": 0.6077845096588135,
"learning_rate": 5.687818523946931e-07,
"loss": 0.5260845422744751,
"step": 2077
},
{
"epoch": 2.6955393349553933,
"grad_norm": 0.5872023105621338,
"learning_rate": 5.640376766608902e-07,
"loss": 0.49081629514694214,
"step": 2078
},
{
"epoch": 2.69683698296837,
"grad_norm": 0.5637922286987305,
"learning_rate": 5.593127950817579e-07,
"loss": 0.49831029772758484,
"step": 2079
},
{
"epoch": 2.6981346309813463,
"grad_norm": 0.588504433631897,
"learning_rate": 5.546072173184791e-07,
"loss": 0.5403261184692383,
"step": 2080
},
{
"epoch": 2.6994322789943226,
"grad_norm": 0.5554431080818176,
"learning_rate": 5.499209529927751e-07,
"loss": 0.4801977872848511,
"step": 2081
},
{
"epoch": 2.7007299270072993,
"grad_norm": 0.594923198223114,
"learning_rate": 5.452540116868654e-07,
"loss": 0.552370011806488,
"step": 2082
},
{
"epoch": 2.702027575020276,
"grad_norm": 0.5900223851203918,
"learning_rate": 5.406064029434666e-07,
"loss": 0.5598849058151245,
"step": 2083
},
{
"epoch": 2.7033252230332523,
"grad_norm": 0.5767436027526855,
"learning_rate": 5.359781362657623e-07,
"loss": 0.5048878192901611,
"step": 2084
},
{
"epoch": 2.7046228710462286,
"grad_norm": 0.551128089427948,
"learning_rate": 5.313692211173838e-07,
"loss": 0.5155936479568481,
"step": 2085
},
{
"epoch": 2.7059205190592053,
"grad_norm": 0.5880531072616577,
"learning_rate": 5.26779666922399e-07,
"loss": 0.5444161295890808,
"step": 2086
},
{
"epoch": 2.7072181670721815,
"grad_norm": 0.5545855164527893,
"learning_rate": 5.222094830652835e-07,
"loss": 0.4949781894683838,
"step": 2087
},
{
"epoch": 2.7085158150851583,
"grad_norm": 0.5254430174827576,
"learning_rate": 5.176586788909066e-07,
"loss": 0.48143208026885986,
"step": 2088
},
{
"epoch": 2.7098134630981345,
"grad_norm": 0.5895472764968872,
"learning_rate": 5.131272637045104e-07,
"loss": 0.5467052459716797,
"step": 2089
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.5603579878807068,
"learning_rate": 5.086152467716932e-07,
"loss": 0.48797622323036194,
"step": 2090
},
{
"epoch": 2.7124087591240875,
"grad_norm": 0.5788029432296753,
"learning_rate": 5.041226373183861e-07,
"loss": 0.5119057297706604,
"step": 2091
},
{
"epoch": 2.7137064071370642,
"grad_norm": 0.5590220093727112,
"learning_rate": 4.996494445308409e-07,
"loss": 0.46394574642181396,
"step": 2092
},
{
"epoch": 2.7150040551500405,
"grad_norm": 0.5895569920539856,
"learning_rate": 4.951956775556e-07,
"loss": 0.4952976703643799,
"step": 2093
},
{
"epoch": 2.7163017031630172,
"grad_norm": 0.5719903707504272,
"learning_rate": 4.907613454994964e-07,
"loss": 0.5015777349472046,
"step": 2094
},
{
"epoch": 2.7175993511759935,
"grad_norm": 0.5849481821060181,
"learning_rate": 4.863464574296106e-07,
"loss": 0.5244485139846802,
"step": 2095
},
{
"epoch": 2.7188969991889698,
"grad_norm": 0.5956225991249084,
"learning_rate": 4.819510223732738e-07,
"loss": 0.5492672324180603,
"step": 2096
},
{
"epoch": 2.7201946472019465,
"grad_norm": 0.5836542844772339,
"learning_rate": 4.775750493180386e-07,
"loss": 0.48292914032936096,
"step": 2097
},
{
"epoch": 2.721492295214923,
"grad_norm": 0.5966354012489319,
"learning_rate": 4.7321854721166127e-07,
"loss": 0.5208597183227539,
"step": 2098
},
{
"epoch": 2.7227899432278995,
"grad_norm": 0.536894679069519,
"learning_rate": 4.6888152496208593e-07,
"loss": 0.4349246621131897,
"step": 2099
},
{
"epoch": 2.7240875912408757,
"grad_norm": 0.589508593082428,
"learning_rate": 4.645639914374278e-07,
"loss": 0.5353684425354004,
"step": 2100
},
{
"epoch": 2.7253852392538525,
"grad_norm": 0.5571612119674683,
"learning_rate": 4.602659554659461e-07,
"loss": 0.4614424705505371,
"step": 2101
},
{
"epoch": 2.7266828872668287,
"grad_norm": 0.6046862602233887,
"learning_rate": 4.559874258360408e-07,
"loss": 0.5189507603645325,
"step": 2102
},
{
"epoch": 2.7279805352798054,
"grad_norm": 0.5680896639823914,
"learning_rate": 4.5172841129621726e-07,
"loss": 0.5085829496383667,
"step": 2103
},
{
"epoch": 2.7292781832927817,
"grad_norm": 0.5765218138694763,
"learning_rate": 4.474889205550881e-07,
"loss": 0.5140299797058105,
"step": 2104
},
{
"epoch": 2.7305758313057584,
"grad_norm": 0.587651252746582,
"learning_rate": 4.4326896228133354e-07,
"loss": 0.4957928955554962,
"step": 2105
},
{
"epoch": 2.7318734793187347,
"grad_norm": 0.5494794249534607,
"learning_rate": 4.3906854510370245e-07,
"loss": 0.5062738060951233,
"step": 2106
},
{
"epoch": 2.7331711273317114,
"grad_norm": 0.5937455296516418,
"learning_rate": 4.348876776109856e-07,
"loss": 0.5094043016433716,
"step": 2107
},
{
"epoch": 2.7344687753446877,
"grad_norm": 0.5641949772834778,
"learning_rate": 4.307263683519969e-07,
"loss": 0.48215553164482117,
"step": 2108
},
{
"epoch": 2.7357664233576644,
"grad_norm": 0.5819230079650879,
"learning_rate": 4.2658462583556216e-07,
"loss": 0.5357835292816162,
"step": 2109
},
{
"epoch": 2.7370640713706407,
"grad_norm": 0.5532712936401367,
"learning_rate": 4.2246245853049706e-07,
"loss": 0.47937077283859253,
"step": 2110
},
{
"epoch": 2.738361719383617,
"grad_norm": 0.6110063195228577,
"learning_rate": 4.1835987486558595e-07,
"loss": 0.4744276702404022,
"step": 2111
},
{
"epoch": 2.7396593673965937,
"grad_norm": 0.5573598146438599,
"learning_rate": 4.142768832295807e-07,
"loss": 0.5128625631332397,
"step": 2112
},
{
"epoch": 2.7409570154095704,
"grad_norm": 0.5569184422492981,
"learning_rate": 4.102134919711609e-07,
"loss": 0.47407659888267517,
"step": 2113
},
{
"epoch": 2.7422546634225466,
"grad_norm": 0.5868476629257202,
"learning_rate": 4.061697093989347e-07,
"loss": 0.5311683416366577,
"step": 2114
},
{
"epoch": 2.743552311435523,
"grad_norm": 0.5694899559020996,
"learning_rate": 4.021455437814148e-07,
"loss": 0.4629291892051697,
"step": 2115
},
{
"epoch": 2.7448499594484996,
"grad_norm": 0.5624482035636902,
"learning_rate": 3.981410033469979e-07,
"loss": 0.4855622351169586,
"step": 2116
},
{
"epoch": 2.7461476074614763,
"grad_norm": 0.576919436454773,
"learning_rate": 3.941560962839619e-07,
"loss": 0.47935816645622253,
"step": 2117
},
{
"epoch": 2.7474452554744526,
"grad_norm": 0.5966827869415283,
"learning_rate": 3.9019083074042784e-07,
"loss": 0.4561656415462494,
"step": 2118
},
{
"epoch": 2.748742903487429,
"grad_norm": 0.5702851414680481,
"learning_rate": 3.862452148243623e-07,
"loss": 0.4796487092971802,
"step": 2119
},
{
"epoch": 2.7500405515004056,
"grad_norm": 0.5755755305290222,
"learning_rate": 3.823192566035494e-07,
"loss": 0.5047421455383301,
"step": 2120
},
{
"epoch": 2.751338199513382,
"grad_norm": 0.5769697427749634,
"learning_rate": 3.7841296410558225e-07,
"loss": 0.48532968759536743,
"step": 2121
},
{
"epoch": 2.7526358475263586,
"grad_norm": 0.5873609781265259,
"learning_rate": 3.7452634531783935e-07,
"loss": 0.5122209787368774,
"step": 2122
},
{
"epoch": 2.753933495539335,
"grad_norm": 0.5939727425575256,
"learning_rate": 3.706594081874737e-07,
"loss": 0.49794304370880127,
"step": 2123
},
{
"epoch": 2.7552311435523116,
"grad_norm": 0.5834800601005554,
"learning_rate": 3.6681216062138923e-07,
"loss": 0.5340889096260071,
"step": 2124
},
{
"epoch": 2.756528791565288,
"grad_norm": 0.576677680015564,
"learning_rate": 3.6298461048623887e-07,
"loss": 0.5236599445343018,
"step": 2125
},
{
"epoch": 2.757826439578264,
"grad_norm": 0.5462478399276733,
"learning_rate": 3.5917676560838775e-07,
"loss": 0.47627806663513184,
"step": 2126
},
{
"epoch": 2.759124087591241,
"grad_norm": 0.5982619524002075,
"learning_rate": 3.5538863377392095e-07,
"loss": 0.4933459460735321,
"step": 2127
},
{
"epoch": 2.7604217356042176,
"grad_norm": 0.5802999138832092,
"learning_rate": 3.5162022272860475e-07,
"loss": 0.5381085872650146,
"step": 2128
},
{
"epoch": 2.761719383617194,
"grad_norm": 0.5820630788803101,
"learning_rate": 3.478715401778876e-07,
"loss": 0.5177547931671143,
"step": 2129
},
{
"epoch": 2.76301703163017,
"grad_norm": 0.6046480536460876,
"learning_rate": 3.44142593786877e-07,
"loss": 0.5715194940567017,
"step": 2130
},
{
"epoch": 2.764314679643147,
"grad_norm": 0.5816249847412109,
"learning_rate": 3.404333911803237e-07,
"loss": 0.48858851194381714,
"step": 2131
},
{
"epoch": 2.7656123276561235,
"grad_norm": 0.5709452629089355,
"learning_rate": 3.367439399426087e-07,
"loss": 0.5259594917297363,
"step": 2132
},
{
"epoch": 2.7669099756691,
"grad_norm": 0.5610825419425964,
"learning_rate": 3.330742476177273e-07,
"loss": 0.49785754084587097,
"step": 2133
},
{
"epoch": 2.768207623682076,
"grad_norm": 0.5751505494117737,
"learning_rate": 3.2942432170926743e-07,
"loss": 0.45043110847473145,
"step": 2134
},
{
"epoch": 2.769505271695053,
"grad_norm": 0.5675750374794006,
"learning_rate": 3.257941696804079e-07,
"loss": 0.5171366930007935,
"step": 2135
},
{
"epoch": 2.770802919708029,
"grad_norm": 0.5672844052314758,
"learning_rate": 3.2218379895388896e-07,
"loss": 0.467257022857666,
"step": 2136
},
{
"epoch": 2.7721005677210058,
"grad_norm": 0.6082518696784973,
"learning_rate": 3.185932169120043e-07,
"loss": 0.5202172994613647,
"step": 2137
},
{
"epoch": 2.773398215733982,
"grad_norm": 0.5631950497627258,
"learning_rate": 3.150224308965866e-07,
"loss": 0.5058823823928833,
"step": 2138
},
{
"epoch": 2.7746958637469588,
"grad_norm": 0.6380532383918762,
"learning_rate": 3.114714482089898e-07,
"loss": 0.5831983089447021,
"step": 2139
},
{
"epoch": 2.775993511759935,
"grad_norm": 0.5557391047477722,
"learning_rate": 3.079402761100736e-07,
"loss": 0.4567191004753113,
"step": 2140
},
{
"epoch": 2.7772911597729117,
"grad_norm": 0.562920868396759,
"learning_rate": 3.0442892182019236e-07,
"loss": 0.4184800386428833,
"step": 2141
},
{
"epoch": 2.778588807785888,
"grad_norm": 0.63033127784729,
"learning_rate": 3.00937392519175e-07,
"loss": 0.5374839901924133,
"step": 2142
},
{
"epoch": 2.7798864557988647,
"grad_norm": 0.5735025405883789,
"learning_rate": 2.974656953463173e-07,
"loss": 0.4503205716609955,
"step": 2143
},
{
"epoch": 2.781184103811841,
"grad_norm": 0.6051810383796692,
"learning_rate": 2.9401383740035983e-07,
"loss": 0.4981985092163086,
"step": 2144
},
{
"epoch": 2.7824817518248173,
"grad_norm": 0.6038339734077454,
"learning_rate": 2.905818257394799e-07,
"loss": 0.5327208638191223,
"step": 2145
},
{
"epoch": 2.783779399837794,
"grad_norm": 0.5686031579971313,
"learning_rate": 2.871696673812718e-07,
"loss": 0.4990962743759155,
"step": 2146
},
{
"epoch": 2.7850770478507707,
"grad_norm": 0.5870386958122253,
"learning_rate": 2.837773693027346e-07,
"loss": 0.5274587869644165,
"step": 2147
},
{
"epoch": 2.786374695863747,
"grad_norm": 0.6039890050888062,
"learning_rate": 2.8040493844026185e-07,
"loss": 0.4969175457954407,
"step": 2148
},
{
"epoch": 2.7876723438767232,
"grad_norm": 0.5605257749557495,
"learning_rate": 2.7705238168961867e-07,
"loss": 0.466129869222641,
"step": 2149
},
{
"epoch": 2.7889699918897,
"grad_norm": 0.5661087036132812,
"learning_rate": 2.7371970590593597e-07,
"loss": 0.5182359218597412,
"step": 2150
},
{
"epoch": 2.7902676399026762,
"grad_norm": 0.6032746434211731,
"learning_rate": 2.7040691790369165e-07,
"loss": 0.4847348928451538,
"step": 2151
},
{
"epoch": 2.791565287915653,
"grad_norm": 0.5873638987541199,
"learning_rate": 2.671140244567005e-07,
"loss": 0.4982571005821228,
"step": 2152
},
{
"epoch": 2.792862935928629,
"grad_norm": 0.5877160429954529,
"learning_rate": 2.6384103229809445e-07,
"loss": 0.47337985038757324,
"step": 2153
},
{
"epoch": 2.794160583941606,
"grad_norm": 0.6034269332885742,
"learning_rate": 2.605879481203144e-07,
"loss": 0.5359882116317749,
"step": 2154
},
{
"epoch": 2.795458231954582,
"grad_norm": 0.5855337381362915,
"learning_rate": 2.5735477857509406e-07,
"loss": 0.48935824632644653,
"step": 2155
},
{
"epoch": 2.796755879967559,
"grad_norm": 0.5761221647262573,
"learning_rate": 2.5414153027344846e-07,
"loss": 0.5092116594314575,
"step": 2156
},
{
"epoch": 2.798053527980535,
"grad_norm": 0.5906012654304504,
"learning_rate": 2.5094820978565416e-07,
"loss": 0.4823336601257324,
"step": 2157
},
{
"epoch": 2.799351175993512,
"grad_norm": 0.5929545164108276,
"learning_rate": 2.4777482364124695e-07,
"loss": 0.48247990012168884,
"step": 2158
},
{
"epoch": 2.800648824006488,
"grad_norm": 0.5614597797393799,
"learning_rate": 2.446213783289941e-07,
"loss": 0.48732107877731323,
"step": 2159
},
{
"epoch": 2.8019464720194645,
"grad_norm": 0.6198487281799316,
"learning_rate": 2.4148788029689565e-07,
"loss": 0.544142484664917,
"step": 2160
},
{
"epoch": 2.803244120032441,
"grad_norm": 0.5842984318733215,
"learning_rate": 2.3837433595216174e-07,
"loss": 0.5269244313240051,
"step": 2161
},
{
"epoch": 2.804541768045418,
"grad_norm": 0.5822996497154236,
"learning_rate": 2.3528075166120323e-07,
"loss": 0.49836334586143494,
"step": 2162
},
{
"epoch": 2.805839416058394,
"grad_norm": 0.5670111775398254,
"learning_rate": 2.3220713374961457e-07,
"loss": 0.5108374357223511,
"step": 2163
},
{
"epoch": 2.8071370640713704,
"grad_norm": 0.5872285962104797,
"learning_rate": 2.2915348850216955e-07,
"loss": 0.49880123138427734,
"step": 2164
},
{
"epoch": 2.808434712084347,
"grad_norm": 0.5544793605804443,
"learning_rate": 2.2611982216279693e-07,
"loss": 0.5181583166122437,
"step": 2165
},
{
"epoch": 2.809732360097324,
"grad_norm": 0.5830904245376587,
"learning_rate": 2.2310614093457917e-07,
"loss": 0.48121365904808044,
"step": 2166
},
{
"epoch": 2.8110300081103,
"grad_norm": 0.6001294255256653,
"learning_rate": 2.2011245097972812e-07,
"loss": 0.500962495803833,
"step": 2167
},
{
"epoch": 2.8123276561232764,
"grad_norm": 0.6160042881965637,
"learning_rate": 2.171387584195861e-07,
"loss": 0.5166311264038086,
"step": 2168
},
{
"epoch": 2.813625304136253,
"grad_norm": 0.5664080381393433,
"learning_rate": 2.1418506933459926e-07,
"loss": 0.4849929213523865,
"step": 2169
},
{
"epoch": 2.8149229521492294,
"grad_norm": 0.60596764087677,
"learning_rate": 2.1125138976431425e-07,
"loss": 0.5384441018104553,
"step": 2170
},
{
"epoch": 2.816220600162206,
"grad_norm": 0.6017642617225647,
"learning_rate": 2.0833772570736376e-07,
"loss": 0.5182196497917175,
"step": 2171
},
{
"epoch": 2.8175182481751824,
"grad_norm": 0.567242443561554,
"learning_rate": 2.0544408312145325e-07,
"loss": 0.5023871660232544,
"step": 2172
},
{
"epoch": 2.818815896188159,
"grad_norm": 0.5743298530578613,
"learning_rate": 2.025704679233498e-07,
"loss": 0.4737445116043091,
"step": 2173
},
{
"epoch": 2.8201135442011354,
"grad_norm": 0.5686278343200684,
"learning_rate": 1.9971688598886874e-07,
"loss": 0.4916064441204071,
"step": 2174
},
{
"epoch": 2.821411192214112,
"grad_norm": 0.5849027037620544,
"learning_rate": 1.9688334315286383e-07,
"loss": 0.5161796808242798,
"step": 2175
},
{
"epoch": 2.8227088402270883,
"grad_norm": 0.5709643959999084,
"learning_rate": 1.9406984520921156e-07,
"loss": 0.5027370452880859,
"step": 2176
},
{
"epoch": 2.824006488240065,
"grad_norm": 0.6077797412872314,
"learning_rate": 1.9127639791080345e-07,
"loss": 0.561673641204834,
"step": 2177
},
{
"epoch": 2.8253041362530413,
"grad_norm": 0.5836532711982727,
"learning_rate": 1.885030069695326e-07,
"loss": 0.5252400636672974,
"step": 2178
},
{
"epoch": 2.8266017842660176,
"grad_norm": 0.5875435471534729,
"learning_rate": 1.8574967805628174e-07,
"loss": 0.5136289596557617,
"step": 2179
},
{
"epoch": 2.8278994322789943,
"grad_norm": 0.5999600291252136,
"learning_rate": 1.8301641680090965e-07,
"loss": 0.5113690495491028,
"step": 2180
},
{
"epoch": 2.829197080291971,
"grad_norm": 0.5720099210739136,
"learning_rate": 1.8030322879224792e-07,
"loss": 0.5277208089828491,
"step": 2181
},
{
"epoch": 2.8304947283049473,
"grad_norm": 0.5587209463119507,
"learning_rate": 1.7761011957807439e-07,
"loss": 0.5302145481109619,
"step": 2182
},
{
"epoch": 2.8317923763179236,
"grad_norm": 0.574344277381897,
"learning_rate": 1.7493709466511965e-07,
"loss": 0.5009472370147705,
"step": 2183
},
{
"epoch": 2.8330900243309003,
"grad_norm": 0.5876274704933167,
"learning_rate": 1.7228415951904165e-07,
"loss": 0.49587976932525635,
"step": 2184
},
{
"epoch": 2.8343876723438766,
"grad_norm": 0.5799663662910461,
"learning_rate": 1.6965131956442004e-07,
"loss": 0.5200576782226562,
"step": 2185
},
{
"epoch": 2.8356853203568533,
"grad_norm": 0.5789362192153931,
"learning_rate": 1.670385801847485e-07,
"loss": 0.4996534585952759,
"step": 2186
},
{
"epoch": 2.8369829683698295,
"grad_norm": 0.5791637897491455,
"learning_rate": 1.6444594672241688e-07,
"loss": 0.5251076221466064,
"step": 2187
},
{
"epoch": 2.8382806163828063,
"grad_norm": 0.581289529800415,
"learning_rate": 1.6187342447870235e-07,
"loss": 0.47298407554626465,
"step": 2188
},
{
"epoch": 2.8395782643957825,
"grad_norm": 0.5624388456344604,
"learning_rate": 1.5932101871376503e-07,
"loss": 0.48804956674575806,
"step": 2189
},
{
"epoch": 2.8408759124087593,
"grad_norm": 0.5740110278129578,
"learning_rate": 1.567887346466257e-07,
"loss": 0.4583921730518341,
"step": 2190
},
{
"epoch": 2.8421735604217355,
"grad_norm": 0.5799588561058044,
"learning_rate": 1.54276577455168e-07,
"loss": 0.5046111345291138,
"step": 2191
},
{
"epoch": 2.8434712084347122,
"grad_norm": 0.5686801671981812,
"learning_rate": 1.517845522761141e-07,
"loss": 0.5424494743347168,
"step": 2192
},
{
"epoch": 2.8447688564476885,
"grad_norm": 0.5737746953964233,
"learning_rate": 1.4931266420502687e-07,
"loss": 0.5258438587188721,
"step": 2193
},
{
"epoch": 2.846066504460665,
"grad_norm": 0.5844926238059998,
"learning_rate": 1.468609182962899e-07,
"loss": 0.5294222831726074,
"step": 2194
},
{
"epoch": 2.8473641524736415,
"grad_norm": 0.6161758899688721,
"learning_rate": 1.4442931956310525e-07,
"loss": 0.48813527822494507,
"step": 2195
},
{
"epoch": 2.848661800486618,
"grad_norm": 0.5877721905708313,
"learning_rate": 1.420178729774746e-07,
"loss": 0.5104416608810425,
"step": 2196
},
{
"epoch": 2.8499594484995945,
"grad_norm": 0.607412576675415,
"learning_rate": 1.3962658347019819e-07,
"loss": 0.5552476644515991,
"step": 2197
},
{
"epoch": 2.8512570965125708,
"grad_norm": 0.5500598549842834,
"learning_rate": 1.372554559308559e-07,
"loss": 0.5361748933792114,
"step": 2198
},
{
"epoch": 2.8525547445255475,
"grad_norm": 0.5887991786003113,
"learning_rate": 1.3490449520780492e-07,
"loss": 0.5089778304100037,
"step": 2199
},
{
"epoch": 2.853852392538524,
"grad_norm": 0.5767118334770203,
"learning_rate": 1.3257370610816333e-07,
"loss": 0.4646577537059784,
"step": 2200
},
{
"epoch": 2.8551500405515005,
"grad_norm": 0.5947672128677368,
"learning_rate": 1.3026309339780442e-07,
"loss": 0.45190826058387756,
"step": 2201
},
{
"epoch": 2.8564476885644767,
"grad_norm": 0.576164722442627,
"learning_rate": 1.2797266180134994e-07,
"loss": 0.47920286655426025,
"step": 2202
},
{
"epoch": 2.8577453365774534,
"grad_norm": 0.5928218364715576,
"learning_rate": 1.2570241600214805e-07,
"loss": 0.4952476918697357,
"step": 2203
},
{
"epoch": 2.8590429845904297,
"grad_norm": 0.5796513557434082,
"learning_rate": 1.2345236064228216e-07,
"loss": 0.4798247218132019,
"step": 2204
},
{
"epoch": 2.8603406326034064,
"grad_norm": 0.6173388361930847,
"learning_rate": 1.212225003225409e-07,
"loss": 0.5353522300720215,
"step": 2205
},
{
"epoch": 2.8616382806163827,
"grad_norm": 0.582225501537323,
"learning_rate": 1.1901283960242704e-07,
"loss": 0.4966939091682434,
"step": 2206
},
{
"epoch": 2.8629359286293594,
"grad_norm": 0.573807954788208,
"learning_rate": 1.168233830001364e-07,
"loss": 0.5133891701698303,
"step": 2207
},
{
"epoch": 2.8642335766423357,
"grad_norm": 0.5719092488288879,
"learning_rate": 1.1465413499255452e-07,
"loss": 0.5084906816482544,
"step": 2208
},
{
"epoch": 2.8655312246553124,
"grad_norm": 0.563827395439148,
"learning_rate": 1.1250510001524329e-07,
"loss": 0.551742434501648,
"step": 2209
},
{
"epoch": 2.8668288726682887,
"grad_norm": 0.5915552973747253,
"learning_rate": 1.103762824624377e-07,
"loss": 0.5108176469802856,
"step": 2210
},
{
"epoch": 2.8681265206812654,
"grad_norm": 0.5619785189628601,
"learning_rate": 1.0826768668702691e-07,
"loss": 0.5008025169372559,
"step": 2211
},
{
"epoch": 2.8694241686942417,
"grad_norm": 0.5829325914382935,
"learning_rate": 1.0617931700055984e-07,
"loss": 0.5187573432922363,
"step": 2212
},
{
"epoch": 2.870721816707218,
"grad_norm": 0.6110272407531738,
"learning_rate": 1.0411117767322065e-07,
"loss": 0.5479835271835327,
"step": 2213
},
{
"epoch": 2.8720194647201946,
"grad_norm": 0.5755971074104309,
"learning_rate": 1.0206327293383222e-07,
"loss": 0.5030970573425293,
"step": 2214
},
{
"epoch": 2.8733171127331714,
"grad_norm": 0.5851888060569763,
"learning_rate": 1.000356069698416e-07,
"loss": 0.5171909928321838,
"step": 2215
},
{
"epoch": 2.8746147607461476,
"grad_norm": 0.558315098285675,
"learning_rate": 9.802818392731117e-08,
"loss": 0.47078371047973633,
"step": 2216
},
{
"epoch": 2.875912408759124,
"grad_norm": 0.6229851841926575,
"learning_rate": 9.60410079109153e-08,
"loss": 0.5632795095443726,
"step": 2217
},
{
"epoch": 2.8772100567721006,
"grad_norm": 0.5876999497413635,
"learning_rate": 9.407408298392373e-08,
"loss": 0.5133551359176636,
"step": 2218
},
{
"epoch": 2.878507704785077,
"grad_norm": 0.5872880220413208,
"learning_rate": 9.212741316820039e-08,
"loss": 0.4713757038116455,
"step": 2219
},
{
"epoch": 2.8798053527980536,
"grad_norm": 0.5895143747329712,
"learning_rate": 9.020100244419461e-08,
"loss": 0.5900079607963562,
"step": 2220
},
{
"epoch": 2.88110300081103,
"grad_norm": 0.5657681822776794,
"learning_rate": 8.829485475092548e-08,
"loss": 0.5136827230453491,
"step": 2221
},
{
"epoch": 2.8824006488240066,
"grad_norm": 0.8106376528739929,
"learning_rate": 8.640897398598525e-08,
"loss": 0.6291136741638184,
"step": 2222
},
{
"epoch": 2.883698296836983,
"grad_norm": 0.5875924825668335,
"learning_rate": 8.454336400552154e-08,
"loss": 0.4933609962463379,
"step": 2223
},
{
"epoch": 2.8849959448499596,
"grad_norm": 0.5977309346199036,
"learning_rate": 8.269802862423405e-08,
"loss": 0.5197732448577881,
"step": 2224
},
{
"epoch": 2.886293592862936,
"grad_norm": 0.5707021951675415,
"learning_rate": 8.087297161536778e-08,
"loss": 0.5037369132041931,
"step": 2225
},
{
"epoch": 2.8875912408759126,
"grad_norm": 0.5633382797241211,
"learning_rate": 7.906819671070098e-08,
"loss": 0.4686581492424011,
"step": 2226
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.5665260553359985,
"learning_rate": 7.728370760054283e-08,
"loss": 0.4968178868293762,
"step": 2227
},
{
"epoch": 2.890186536901865,
"grad_norm": 0.557956874370575,
"learning_rate": 7.55195079337212e-08,
"loss": 0.4842921793460846,
"step": 2228
},
{
"epoch": 2.891484184914842,
"grad_norm": 0.5774162411689758,
"learning_rate": 7.377560131757832e-08,
"loss": 0.48150286078453064,
"step": 2229
},
{
"epoch": 2.8927818329278185,
"grad_norm": 0.5605522990226746,
"learning_rate": 7.205199131796182e-08,
"loss": 0.47593769431114197,
"step": 2230
},
{
"epoch": 2.894079480940795,
"grad_norm": 0.5713371634483337,
"learning_rate": 7.034868145921802e-08,
"loss": 0.5388371348381042,
"step": 2231
},
{
"epoch": 2.895377128953771,
"grad_norm": 0.566564679145813,
"learning_rate": 6.866567522418322e-08,
"loss": 0.5253296494483948,
"step": 2232
},
{
"epoch": 2.896674776966748,
"grad_norm": 0.6038841605186462,
"learning_rate": 6.700297605418127e-08,
"loss": 0.4850519895553589,
"step": 2233
},
{
"epoch": 2.8979724249797245,
"grad_norm": 0.5850130915641785,
"learning_rate": 6.53605873490093e-08,
"loss": 0.526265025138855,
"step": 2234
},
{
"epoch": 2.899270072992701,
"grad_norm": 0.5685164332389832,
"learning_rate": 6.373851246693763e-08,
"loss": 0.49016064405441284,
"step": 2235
},
{
"epoch": 2.900567721005677,
"grad_norm": 0.585509717464447,
"learning_rate": 6.21367547246976e-08,
"loss": 0.49361756443977356,
"step": 2236
},
{
"epoch": 2.9018653690186538,
"grad_norm": 0.5846717357635498,
"learning_rate": 6.055531739747933e-08,
"loss": 0.5073826313018799,
"step": 2237
},
{
"epoch": 2.90316301703163,
"grad_norm": 0.6035211682319641,
"learning_rate": 5.899420371892173e-08,
"loss": 0.4748195707798004,
"step": 2238
},
{
"epoch": 2.9044606650446068,
"grad_norm": 0.5725396275520325,
"learning_rate": 5.745341688110806e-08,
"loss": 0.49574536085128784,
"step": 2239
},
{
"epoch": 2.905758313057583,
"grad_norm": 0.5700922012329102,
"learning_rate": 5.593296003455595e-08,
"loss": 0.4746463894844055,
"step": 2240
},
{
"epoch": 2.9070559610705597,
"grad_norm": 0.5627117156982422,
"learning_rate": 5.4432836288215165e-08,
"loss": 0.512833833694458,
"step": 2241
},
{
"epoch": 2.908353609083536,
"grad_norm": 0.5812812447547913,
"learning_rate": 5.2953048709459834e-08,
"loss": 0.48332545161247253,
"step": 2242
},
{
"epoch": 2.9096512570965127,
"grad_norm": 0.5835334062576294,
"learning_rate": 5.1493600324080684e-08,
"loss": 0.507304847240448,
"step": 2243
},
{
"epoch": 2.910948905109489,
"grad_norm": 0.5789167284965515,
"learning_rate": 5.0054494116279497e-08,
"loss": 0.5132785439491272,
"step": 2244
},
{
"epoch": 2.9122465531224657,
"grad_norm": 0.5582759976387024,
"learning_rate": 4.8635733028664644e-08,
"loss": 0.4791605472564697,
"step": 2245
},
{
"epoch": 2.913544201135442,
"grad_norm": 0.5968536138534546,
"learning_rate": 4.723731996224446e-08,
"loss": 0.5294557809829712,
"step": 2246
},
{
"epoch": 2.9148418491484183,
"grad_norm": 0.5799421072006226,
"learning_rate": 4.585925777641831e-08,
"loss": 0.5392569303512573,
"step": 2247
},
{
"epoch": 2.916139497161395,
"grad_norm": 0.5876581072807312,
"learning_rate": 4.450154928897443e-08,
"loss": 0.5044458508491516,
"step": 2248
},
{
"epoch": 2.9174371451743717,
"grad_norm": 0.5795705914497375,
"learning_rate": 4.316419727608434e-08,
"loss": 0.518474280834198,
"step": 2249
},
{
"epoch": 2.918734793187348,
"grad_norm": 0.5783658027648926,
"learning_rate": 4.1847204472293954e-08,
"loss": 0.5036035180091858,
"step": 2250
},
{
"epoch": 2.9200324412003242,
"grad_norm": 0.5799797773361206,
"learning_rate": 4.055057357052139e-08,
"loss": 0.5075333118438721,
"step": 2251
},
{
"epoch": 2.921330089213301,
"grad_norm": 0.5816603899002075,
"learning_rate": 3.927430722204473e-08,
"loss": 0.49955567717552185,
"step": 2252
},
{
"epoch": 2.9226277372262772,
"grad_norm": 0.5603087544441223,
"learning_rate": 3.801840803651091e-08,
"loss": 0.4799802005290985,
"step": 2253
},
{
"epoch": 2.923925385239254,
"grad_norm": 0.5984447598457336,
"learning_rate": 3.678287858191132e-08,
"loss": 0.4863054156303406,
"step": 2254
},
{
"epoch": 2.92522303325223,
"grad_norm": 0.5684608817100525,
"learning_rate": 3.5567721384593965e-08,
"loss": 0.5202617645263672,
"step": 2255
},
{
"epoch": 2.926520681265207,
"grad_norm": 0.6067941784858704,
"learning_rate": 3.437293892924576e-08,
"loss": 0.5111681818962097,
"step": 2256
},
{
"epoch": 2.927818329278183,
"grad_norm": 0.6141681671142578,
"learning_rate": 3.3198533658895804e-08,
"loss": 0.5316765904426575,
"step": 2257
},
{
"epoch": 2.92911597729116,
"grad_norm": 0.5799176096916199,
"learning_rate": 3.2044507974905433e-08,
"loss": 0.46131962537765503,
"step": 2258
},
{
"epoch": 2.930413625304136,
"grad_norm": 0.5954794883728027,
"learning_rate": 3.091086423696377e-08,
"loss": 0.520176887512207,
"step": 2259
},
{
"epoch": 2.931711273317113,
"grad_norm": 0.5652449131011963,
"learning_rate": 2.9797604763087684e-08,
"loss": 0.5085136890411377,
"step": 2260
},
{
"epoch": 2.933008921330089,
"grad_norm": 0.5852287411689758,
"learning_rate": 2.8704731829609643e-08,
"loss": 0.5083173513412476,
"step": 2261
},
{
"epoch": 2.9343065693430654,
"grad_norm": 0.5846629738807678,
"learning_rate": 2.763224767117767e-08,
"loss": 0.5292702913284302,
"step": 2262
},
{
"epoch": 2.935604217356042,
"grad_norm": 0.5861793756484985,
"learning_rate": 2.6580154480750907e-08,
"loss": 0.5053665637969971,
"step": 2263
},
{
"epoch": 2.936901865369019,
"grad_norm": 0.5602736473083496,
"learning_rate": 2.554845440959408e-08,
"loss": 0.5189537405967712,
"step": 2264
},
{
"epoch": 2.938199513381995,
"grad_norm": 0.5991557240486145,
"learning_rate": 2.4537149567271935e-08,
"loss": 0.5867321491241455,
"step": 2265
},
{
"epoch": 2.9394971613949714,
"grad_norm": 0.5465215444564819,
"learning_rate": 2.3546242021648126e-08,
"loss": 0.5084092617034912,
"step": 2266
},
{
"epoch": 2.940794809407948,
"grad_norm": 0.6008067727088928,
"learning_rate": 2.2575733798876342e-08,
"loss": 0.5280360579490662,
"step": 2267
},
{
"epoch": 2.942092457420925,
"grad_norm": 0.5549503564834595,
"learning_rate": 2.162562688340142e-08,
"loss": 0.4592389762401581,
"step": 2268
},
{
"epoch": 2.943390105433901,
"grad_norm": 0.600985586643219,
"learning_rate": 2.0695923217950442e-08,
"loss": 0.5138071179389954,
"step": 2269
},
{
"epoch": 2.9446877534468774,
"grad_norm": 0.5776973366737366,
"learning_rate": 1.9786624703532764e-08,
"loss": 0.560516357421875,
"step": 2270
},
{
"epoch": 2.945985401459854,
"grad_norm": 0.5803866982460022,
"learning_rate": 1.8897733199434443e-08,
"loss": 0.48770207166671753,
"step": 2271
},
{
"epoch": 2.9472830494728304,
"grad_norm": 0.5844945907592773,
"learning_rate": 1.8029250523211582e-08,
"loss": 0.5004736185073853,
"step": 2272
},
{
"epoch": 2.948580697485807,
"grad_norm": 0.5826125144958496,
"learning_rate": 1.718117845069367e-08,
"loss": 0.4950000047683716,
"step": 2273
},
{
"epoch": 2.9498783454987834,
"grad_norm": 0.5776214003562927,
"learning_rate": 1.635351871597246e-08,
"loss": 0.5560945868492126,
"step": 2274
},
{
"epoch": 2.95117599351176,
"grad_norm": 0.565700352191925,
"learning_rate": 1.554627301140199e-08,
"loss": 0.4630610942840576,
"step": 2275
},
{
"epoch": 2.9524736415247363,
"grad_norm": 0.5994547605514526,
"learning_rate": 1.4759442987596351e-08,
"loss": 0.5141358375549316,
"step": 2276
},
{
"epoch": 2.9537712895377126,
"grad_norm": 0.573093831539154,
"learning_rate": 1.3993030253423023e-08,
"loss": 0.4815256893634796,
"step": 2277
},
{
"epoch": 2.9550689375506893,
"grad_norm": 0.5978487730026245,
"learning_rate": 1.3247036376002886e-08,
"loss": 0.5149579048156738,
"step": 2278
},
{
"epoch": 2.956366585563666,
"grad_norm": 0.6069895625114441,
"learning_rate": 1.252146288070355e-08,
"loss": 0.5201846361160278,
"step": 2279
},
{
"epoch": 2.9576642335766423,
"grad_norm": 0.5879092216491699,
"learning_rate": 1.1816311251140466e-08,
"loss": 0.5039907693862915,
"step": 2280
},
{
"epoch": 2.9589618815896186,
"grad_norm": 0.5550662875175476,
"learning_rate": 1.113158292916916e-08,
"loss": 0.5198723077774048,
"step": 2281
},
{
"epoch": 2.9602595296025953,
"grad_norm": 0.5664054155349731,
"learning_rate": 1.0467279314886336e-08,
"loss": 0.5281890630722046,
"step": 2282
},
{
"epoch": 2.961557177615572,
"grad_norm": 0.5738133788108826,
"learning_rate": 9.82340176662433e-09,
"loss": 0.47895991802215576,
"step": 2283
},
{
"epoch": 2.9628548256285483,
"grad_norm": 0.5834701657295227,
"learning_rate": 9.199951600951106e-09,
"loss": 0.49841928482055664,
"step": 2284
},
{
"epoch": 2.9641524736415246,
"grad_norm": 0.553411602973938,
"learning_rate": 8.596930092662493e-09,
"loss": 0.5044345855712891,
"step": 2285
},
{
"epoch": 2.9654501216545013,
"grad_norm": 0.5765789151191711,
"learning_rate": 8.014338474785499e-09,
"loss": 0.45714667439460754,
"step": 2286
},
{
"epoch": 2.9667477696674776,
"grad_norm": 0.5678233504295349,
"learning_rate": 7.45217793857389e-09,
"loss": 0.5142921209335327,
"step": 2287
},
{
"epoch": 2.9680454176804543,
"grad_norm": 0.5809730887413025,
"learning_rate": 6.910449633501515e-09,
"loss": 0.5097491145133972,
"step": 2288
},
{
"epoch": 2.9693430656934305,
"grad_norm": 0.863067626953125,
"learning_rate": 6.389154667266751e-09,
"loss": 0.49733829498291016,
"step": 2289
},
{
"epoch": 2.9706407137064073,
"grad_norm": 0.5724239349365234,
"learning_rate": 5.888294105785841e-09,
"loss": 0.5271996855735779,
"step": 2290
},
{
"epoch": 2.9719383617193835,
"grad_norm": 0.5894045829772949,
"learning_rate": 5.407868973191788e-09,
"loss": 0.5507649183273315,
"step": 2291
},
{
"epoch": 2.9732360097323602,
"grad_norm": 0.5670002698898315,
"learning_rate": 4.947880251832127e-09,
"loss": 0.5069165229797363,
"step": 2292
},
{
"epoch": 2.9745336577453365,
"grad_norm": 0.6079567074775696,
"learning_rate": 4.508328882268931e-09,
"loss": 0.5027692317962646,
"step": 2293
},
{
"epoch": 2.9758313057583132,
"grad_norm": 0.5965436697006226,
"learning_rate": 4.089215763271037e-09,
"loss": 0.4549415707588196,
"step": 2294
},
{
"epoch": 2.9771289537712895,
"grad_norm": 0.5540100336074829,
"learning_rate": 3.6905417518195985e-09,
"loss": 0.5082988739013672,
"step": 2295
},
{
"epoch": 2.9784266017842658,
"grad_norm": 0.5584218502044678,
"learning_rate": 3.312307663103642e-09,
"loss": 0.49896612763404846,
"step": 2296
},
{
"epoch": 2.9797242497972425,
"grad_norm": 0.5825123190879822,
"learning_rate": 2.954514270513409e-09,
"loss": 0.5268645286560059,
"step": 2297
},
{
"epoch": 2.981021897810219,
"grad_norm": 0.6069872379302979,
"learning_rate": 2.6171623056481245e-09,
"loss": 0.5306706428527832,
"step": 2298
},
{
"epoch": 2.9823195458231955,
"grad_norm": 0.619730532169342,
"learning_rate": 2.300252458306007e-09,
"loss": 0.5466433167457581,
"step": 2299
},
{
"epoch": 2.9836171938361717,
"grad_norm": 0.575143039226532,
"learning_rate": 2.0037853764887096e-09,
"loss": 0.5247520804405212,
"step": 2300
},
{
"epoch": 2.9836171938361717,
"eval_loss": 0.6951664686203003,
"eval_runtime": 72.3726,
"eval_samples_per_second": 71.74,
"eval_steps_per_second": 8.967,
"step": 2300
},
{
"epoch": 2.9849148418491485,
"grad_norm": 0.5698785781860352,
"learning_rate": 1.7277616663946562e-09,
"loss": 0.5104506015777588,
"step": 2301
},
{
"epoch": 2.986212489862125,
"grad_norm": 0.5820271372795105,
"learning_rate": 1.4721818924223752e-09,
"loss": 0.5188534259796143,
"step": 2302
},
{
"epoch": 2.9875101378751014,
"grad_norm": 0.5771408081054688,
"learning_rate": 1.2370465771693874e-09,
"loss": 0.5191137194633484,
"step": 2303
},
{
"epoch": 2.9888077858880777,
"grad_norm": 0.555460512638092,
"learning_rate": 1.0223562014277654e-09,
"loss": 0.4951835870742798,
"step": 2304
},
{
"epoch": 2.9901054339010544,
"grad_norm": 0.602135956287384,
"learning_rate": 8.281112041841343e-10,
"loss": 0.5143213272094727,
"step": 2305
},
{
"epoch": 2.9914030819140307,
"grad_norm": 0.5755578875541687,
"learning_rate": 6.543119826207811e-10,
"loss": 0.5067423582077026,
"step": 2306
},
{
"epoch": 2.9927007299270074,
"grad_norm": 0.585641622543335,
"learning_rate": 5.009588921123243e-10,
"loss": 0.49582135677337646,
"step": 2307
},
{
"epoch": 2.9939983779399837,
"grad_norm": 0.5883374214172363,
"learning_rate": 3.680522462279346e-10,
"loss": 0.4730003774166107,
"step": 2308
},
{
"epoch": 2.9952960259529604,
"grad_norm": 0.585075318813324,
"learning_rate": 2.555923167291141e-10,
"loss": 0.5166332721710205,
"step": 2309
},
{
"epoch": 2.9965936739659367,
"grad_norm": 0.5931539535522461,
"learning_rate": 1.635793335652558e-10,
"loss": 0.5443276166915894,
"step": 2310
},
{
"epoch": 2.997891321978913,
"grad_norm": 0.6000698804855347,
"learning_rate": 9.20134848814147e-11,
"loss": 0.4828116297721863,
"step": 2311
},
{
"epoch": 2.9991889699918897,
"grad_norm": 0.5825672149658203,
"learning_rate": 4.08949170105366e-11,
"loss": 0.48934438824653625,
"step": 2312
},
{
"epoch": 3.0,
"grad_norm": 0.8691220283508301,
"learning_rate": 1.022373447900904e-11,
"loss": 0.5870037078857422,
"step": 2313
},
{
"epoch": 3.0,
"step": 2313,
"total_flos": 8.852766725217714e+18,
"train_loss": 0.5397342537911073,
"train_runtime": 26894.7398,
"train_samples_per_second": 11.002,
"train_steps_per_second": 0.086
}
],
"logging_steps": 1.0,
"max_steps": 2313,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 230,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.852766725217714e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}