sparcol-large-k512-no-cls / trainer_state.json
charsiu's picture
Upload checkpoints from checkpoint-12652
93f7525 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 20000,
"global_step": 12652,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003161555485298767,
"grad_norm": 128.30313110351562,
"learning_rate": 5.00526870389884e-07,
"loss": 16.6252,
"sparse_loss": 16.6252,
"step": 20
},
{
"epoch": 0.006323110970597534,
"grad_norm": 90.16519165039062,
"learning_rate": 1.0273972602739725e-06,
"loss": 14.3114,
"sparse_loss": 14.3114,
"step": 40
},
{
"epoch": 0.0094846664558963,
"grad_norm": 159.9249267578125,
"learning_rate": 1.554267650158061e-06,
"loss": 14.3623,
"sparse_loss": 14.3623,
"step": 60
},
{
"epoch": 0.012646221941195067,
"grad_norm": 81.25911712646484,
"learning_rate": 2.08113804004215e-06,
"loss": 11.6206,
"sparse_loss": 11.6206,
"step": 80
},
{
"epoch": 0.015807777426493835,
"grad_norm": 152.113525390625,
"learning_rate": 2.6080084299262384e-06,
"loss": 9.1902,
"sparse_loss": 9.1902,
"step": 100
},
{
"epoch": 0.0189693329117926,
"grad_norm": 68.47811889648438,
"learning_rate": 3.1348788198103265e-06,
"loss": 6.6879,
"sparse_loss": 6.6879,
"step": 120
},
{
"epoch": 0.022130888397091368,
"grad_norm": 61.585304260253906,
"learning_rate": 3.661749209694415e-06,
"loss": 4.3414,
"sparse_loss": 4.3414,
"step": 140
},
{
"epoch": 0.025292443882390134,
"grad_norm": 32.29441452026367,
"learning_rate": 4.188619599578504e-06,
"loss": 2.8611,
"sparse_loss": 2.8611,
"step": 160
},
{
"epoch": 0.028453999367688904,
"grad_norm": 23.945159912109375,
"learning_rate": 4.715489989462593e-06,
"loss": 2.3256,
"sparse_loss": 2.3256,
"step": 180
},
{
"epoch": 0.03161555485298767,
"grad_norm": 28.487728118896484,
"learning_rate": 5.242360379346681e-06,
"loss": 2.1056,
"sparse_loss": 2.1056,
"step": 200
},
{
"epoch": 0.034777110338286434,
"grad_norm": 21.144609451293945,
"learning_rate": 5.76923076923077e-06,
"loss": 2.2798,
"sparse_loss": 2.2798,
"step": 220
},
{
"epoch": 0.0379386658235852,
"grad_norm": 57.145225524902344,
"learning_rate": 6.296101159114858e-06,
"loss": 2.2509,
"sparse_loss": 2.2509,
"step": 240
},
{
"epoch": 0.04110022130888397,
"grad_norm": 35.35124588012695,
"learning_rate": 6.822971548998947e-06,
"loss": 2.0234,
"sparse_loss": 2.0234,
"step": 260
},
{
"epoch": 0.044261776794182736,
"grad_norm": 27.13028907775879,
"learning_rate": 7.349841938883036e-06,
"loss": 1.9,
"sparse_loss": 1.9,
"step": 280
},
{
"epoch": 0.047423332279481506,
"grad_norm": 17.84204864501953,
"learning_rate": 7.876712328767124e-06,
"loss": 2.0252,
"sparse_loss": 2.0252,
"step": 300
},
{
"epoch": 0.05058488776478027,
"grad_norm": 99.34933471679688,
"learning_rate": 8.403582718651212e-06,
"loss": 1.8767,
"sparse_loss": 1.8767,
"step": 320
},
{
"epoch": 0.05374644325007904,
"grad_norm": 20.827627182006836,
"learning_rate": 8.930453108535302e-06,
"loss": 1.7484,
"sparse_loss": 1.7484,
"step": 340
},
{
"epoch": 0.05690799873537781,
"grad_norm": 22.89020538330078,
"learning_rate": 9.457323498419388e-06,
"loss": 1.7255,
"sparse_loss": 1.7255,
"step": 360
},
{
"epoch": 0.06006955422067657,
"grad_norm": 18.195846557617188,
"learning_rate": 9.984193888303478e-06,
"loss": 1.6325,
"sparse_loss": 1.6325,
"step": 380
},
{
"epoch": 0.06323110970597534,
"grad_norm": 12.281404495239258,
"learning_rate": 1.0511064278187566e-05,
"loss": 1.9005,
"sparse_loss": 1.9005,
"step": 400
},
{
"epoch": 0.06639266519127411,
"grad_norm": 13.496838569641113,
"learning_rate": 1.1037934668071655e-05,
"loss": 1.6418,
"sparse_loss": 1.6418,
"step": 420
},
{
"epoch": 0.06955422067657287,
"grad_norm": 13.643138885498047,
"learning_rate": 1.1564805057955744e-05,
"loss": 1.6877,
"sparse_loss": 1.6877,
"step": 440
},
{
"epoch": 0.07271577616187164,
"grad_norm": 9.594386100769043,
"learning_rate": 1.209167544783983e-05,
"loss": 1.698,
"sparse_loss": 1.698,
"step": 460
},
{
"epoch": 0.0758773316471704,
"grad_norm": 234.09896850585938,
"learning_rate": 1.2618545837723922e-05,
"loss": 1.6121,
"sparse_loss": 1.6121,
"step": 480
},
{
"epoch": 0.07903888713246918,
"grad_norm": 11.300992965698242,
"learning_rate": 1.3145416227608009e-05,
"loss": 1.618,
"sparse_loss": 1.618,
"step": 500
},
{
"epoch": 0.08220044261776795,
"grad_norm": 12.089790344238281,
"learning_rate": 1.3672286617492097e-05,
"loss": 1.5691,
"sparse_loss": 1.5691,
"step": 520
},
{
"epoch": 0.0853619981030667,
"grad_norm": 6.890669345855713,
"learning_rate": 1.4199157007376185e-05,
"loss": 1.7044,
"sparse_loss": 1.7044,
"step": 540
},
{
"epoch": 0.08852355358836547,
"grad_norm": 7.337052345275879,
"learning_rate": 1.4726027397260275e-05,
"loss": 1.5826,
"sparse_loss": 1.5826,
"step": 560
},
{
"epoch": 0.09168510907366424,
"grad_norm": 11.253660202026367,
"learning_rate": 1.5252897787144363e-05,
"loss": 1.4962,
"sparse_loss": 1.4962,
"step": 580
},
{
"epoch": 0.09484666455896301,
"grad_norm": 11.72204875946045,
"learning_rate": 1.577976817702845e-05,
"loss": 1.5067,
"sparse_loss": 1.5067,
"step": 600
},
{
"epoch": 0.09800822004426178,
"grad_norm": 17.81527328491211,
"learning_rate": 1.630663856691254e-05,
"loss": 1.3541,
"sparse_loss": 1.3541,
"step": 620
},
{
"epoch": 0.10116977552956054,
"grad_norm": 7.416006088256836,
"learning_rate": 1.683350895679663e-05,
"loss": 1.4784,
"sparse_loss": 1.4784,
"step": 640
},
{
"epoch": 0.10433133101485931,
"grad_norm": 11.03427791595459,
"learning_rate": 1.7360379346680716e-05,
"loss": 1.4868,
"sparse_loss": 1.4868,
"step": 660
},
{
"epoch": 0.10749288650015808,
"grad_norm": 6.934051036834717,
"learning_rate": 1.7887249736564805e-05,
"loss": 1.5381,
"sparse_loss": 1.5381,
"step": 680
},
{
"epoch": 0.11065444198545685,
"grad_norm": 9.158458709716797,
"learning_rate": 1.8414120126448895e-05,
"loss": 1.4103,
"sparse_loss": 1.4103,
"step": 700
},
{
"epoch": 0.11381599747075562,
"grad_norm": 15.296072959899902,
"learning_rate": 1.894099051633298e-05,
"loss": 1.4227,
"sparse_loss": 1.4227,
"step": 720
},
{
"epoch": 0.11697755295605437,
"grad_norm": 10.86883544921875,
"learning_rate": 1.946786090621707e-05,
"loss": 1.5968,
"sparse_loss": 1.5968,
"step": 740
},
{
"epoch": 0.12013910844135314,
"grad_norm": 13.303630828857422,
"learning_rate": 1.999473129610116e-05,
"loss": 1.3709,
"sparse_loss": 1.3709,
"step": 760
},
{
"epoch": 0.12330066392665191,
"grad_norm": 7.9238362312316895,
"learning_rate": 2.0521601685985248e-05,
"loss": 1.3811,
"sparse_loss": 1.3811,
"step": 780
},
{
"epoch": 0.12646221941195068,
"grad_norm": 17.758581161499023,
"learning_rate": 2.1048472075869338e-05,
"loss": 1.5005,
"sparse_loss": 1.5005,
"step": 800
},
{
"epoch": 0.12962377489724944,
"grad_norm": 3.8343210220336914,
"learning_rate": 2.1575342465753427e-05,
"loss": 1.6216,
"sparse_loss": 1.6216,
"step": 820
},
{
"epoch": 0.13278533038254822,
"grad_norm": 3.8403618335723877,
"learning_rate": 2.2102212855637514e-05,
"loss": 1.37,
"sparse_loss": 1.37,
"step": 840
},
{
"epoch": 0.13594688586784698,
"grad_norm": 5.895383358001709,
"learning_rate": 2.2629083245521604e-05,
"loss": 1.4373,
"sparse_loss": 1.4373,
"step": 860
},
{
"epoch": 0.13910844135314573,
"grad_norm": 8.331881523132324,
"learning_rate": 2.315595363540569e-05,
"loss": 1.4005,
"sparse_loss": 1.4005,
"step": 880
},
{
"epoch": 0.14226999683844452,
"grad_norm": 12.499529838562012,
"learning_rate": 2.368282402528978e-05,
"loss": 1.402,
"sparse_loss": 1.402,
"step": 900
},
{
"epoch": 0.14543155232374327,
"grad_norm": 8.521995544433594,
"learning_rate": 2.420969441517387e-05,
"loss": 1.5812,
"sparse_loss": 1.5812,
"step": 920
},
{
"epoch": 0.14859310780904206,
"grad_norm": 6.057605743408203,
"learning_rate": 2.4736564805057956e-05,
"loss": 1.3752,
"sparse_loss": 1.3752,
"step": 940
},
{
"epoch": 0.1517546632943408,
"grad_norm": 26.1428165435791,
"learning_rate": 2.5263435194942046e-05,
"loss": 1.4496,
"sparse_loss": 1.4496,
"step": 960
},
{
"epoch": 0.15491621877963957,
"grad_norm": 11.980278015136719,
"learning_rate": 2.5790305584826136e-05,
"loss": 1.4868,
"sparse_loss": 1.4868,
"step": 980
},
{
"epoch": 0.15807777426493835,
"grad_norm": 5.677769184112549,
"learning_rate": 2.6317175974710222e-05,
"loss": 1.2911,
"sparse_loss": 1.2911,
"step": 1000
},
{
"epoch": 0.1612393297502371,
"grad_norm": 7.899789333343506,
"learning_rate": 2.6844046364594312e-05,
"loss": 1.2455,
"sparse_loss": 1.2455,
"step": 1020
},
{
"epoch": 0.1644008852355359,
"grad_norm": 9.248682975769043,
"learning_rate": 2.73709167544784e-05,
"loss": 1.4636,
"sparse_loss": 1.4636,
"step": 1040
},
{
"epoch": 0.16756244072083465,
"grad_norm": 25.693893432617188,
"learning_rate": 2.7897787144362485e-05,
"loss": 1.431,
"sparse_loss": 1.431,
"step": 1060
},
{
"epoch": 0.1707239962061334,
"grad_norm": 5.5594940185546875,
"learning_rate": 2.842465753424658e-05,
"loss": 1.4264,
"sparse_loss": 1.4264,
"step": 1080
},
{
"epoch": 0.1738855516914322,
"grad_norm": 9.96285629272461,
"learning_rate": 2.8951527924130668e-05,
"loss": 1.379,
"sparse_loss": 1.379,
"step": 1100
},
{
"epoch": 0.17704710717673094,
"grad_norm": 7.128167629241943,
"learning_rate": 2.9478398314014755e-05,
"loss": 1.482,
"sparse_loss": 1.482,
"step": 1120
},
{
"epoch": 0.18020866266202973,
"grad_norm": 4.2342658042907715,
"learning_rate": 3.000526870389884e-05,
"loss": 1.3317,
"sparse_loss": 1.3317,
"step": 1140
},
{
"epoch": 0.18337021814732848,
"grad_norm": 5.94893217086792,
"learning_rate": 3.053213909378293e-05,
"loss": 1.3711,
"sparse_loss": 1.3711,
"step": 1160
},
{
"epoch": 0.18653177363262724,
"grad_norm": 9.314913749694824,
"learning_rate": 3.105900948366702e-05,
"loss": 1.3961,
"sparse_loss": 1.3961,
"step": 1180
},
{
"epoch": 0.18969332911792602,
"grad_norm": 12.692520141601562,
"learning_rate": 3.1585879873551104e-05,
"loss": 1.2723,
"sparse_loss": 1.2723,
"step": 1200
},
{
"epoch": 0.19285488460322478,
"grad_norm": 4.648574352264404,
"learning_rate": 3.21127502634352e-05,
"loss": 1.4275,
"sparse_loss": 1.4275,
"step": 1220
},
{
"epoch": 0.19601644008852356,
"grad_norm": 10.362495422363281,
"learning_rate": 3.2639620653319283e-05,
"loss": 1.3174,
"sparse_loss": 1.3174,
"step": 1240
},
{
"epoch": 0.19917799557382232,
"grad_norm": 226.89773559570312,
"learning_rate": 3.316649104320337e-05,
"loss": 1.4134,
"sparse_loss": 1.4134,
"step": 1260
},
{
"epoch": 0.20233955105912108,
"grad_norm": 3.8117411136627197,
"learning_rate": 3.369336143308746e-05,
"loss": 1.3341,
"sparse_loss": 1.3341,
"step": 1280
},
{
"epoch": 0.20550110654441986,
"grad_norm": 4.760183334350586,
"learning_rate": 3.4220231822971546e-05,
"loss": 1.2803,
"sparse_loss": 1.2803,
"step": 1300
},
{
"epoch": 0.20866266202971862,
"grad_norm": 4.356404781341553,
"learning_rate": 3.4747102212855636e-05,
"loss": 1.3362,
"sparse_loss": 1.3362,
"step": 1320
},
{
"epoch": 0.2118242175150174,
"grad_norm": 13.460466384887695,
"learning_rate": 3.527397260273973e-05,
"loss": 1.285,
"sparse_loss": 1.285,
"step": 1340
},
{
"epoch": 0.21498577300031615,
"grad_norm": 6.3744940757751465,
"learning_rate": 3.5800842992623816e-05,
"loss": 1.3639,
"sparse_loss": 1.3639,
"step": 1360
},
{
"epoch": 0.2181473284856149,
"grad_norm": 9.473126411437988,
"learning_rate": 3.6327713382507905e-05,
"loss": 1.2435,
"sparse_loss": 1.2435,
"step": 1380
},
{
"epoch": 0.2213088839709137,
"grad_norm": 3.342799425125122,
"learning_rate": 3.6854583772391995e-05,
"loss": 1.4165,
"sparse_loss": 1.4165,
"step": 1400
},
{
"epoch": 0.22447043945621245,
"grad_norm": 22.65892791748047,
"learning_rate": 3.738145416227608e-05,
"loss": 1.3688,
"sparse_loss": 1.3688,
"step": 1420
},
{
"epoch": 0.22763199494151123,
"grad_norm": 5.911210536956787,
"learning_rate": 3.790832455216017e-05,
"loss": 1.3473,
"sparse_loss": 1.3473,
"step": 1440
},
{
"epoch": 0.23079355042681,
"grad_norm": 4.169831275939941,
"learning_rate": 3.843519494204426e-05,
"loss": 1.2015,
"sparse_loss": 1.2015,
"step": 1460
},
{
"epoch": 0.23395510591210875,
"grad_norm": 3.30840802192688,
"learning_rate": 3.896206533192835e-05,
"loss": 1.1465,
"sparse_loss": 1.1465,
"step": 1480
},
{
"epoch": 0.23711666139740753,
"grad_norm": 84.1337661743164,
"learning_rate": 3.948893572181244e-05,
"loss": 1.1855,
"sparse_loss": 1.1855,
"step": 1500
},
{
"epoch": 0.24027821688270629,
"grad_norm": 7.522827625274658,
"learning_rate": 4.001580611169653e-05,
"loss": 1.2851,
"sparse_loss": 1.2851,
"step": 1520
},
{
"epoch": 0.24343977236800507,
"grad_norm": 197.22821044921875,
"learning_rate": 4.054267650158061e-05,
"loss": 1.5259,
"sparse_loss": 1.5259,
"step": 1540
},
{
"epoch": 0.24660132785330383,
"grad_norm": 7.006889820098877,
"learning_rate": 4.10695468914647e-05,
"loss": 1.3605,
"sparse_loss": 1.3605,
"step": 1560
},
{
"epoch": 0.24976288333860258,
"grad_norm": 11.081674575805664,
"learning_rate": 4.159641728134879e-05,
"loss": 1.3445,
"sparse_loss": 1.3445,
"step": 1580
},
{
"epoch": 0.25292443882390137,
"grad_norm": 5.594365119934082,
"learning_rate": 4.212328767123288e-05,
"loss": 1.2714,
"sparse_loss": 1.2714,
"step": 1600
},
{
"epoch": 0.25608599430920015,
"grad_norm": 4.106484413146973,
"learning_rate": 4.265015806111697e-05,
"loss": 1.3731,
"sparse_loss": 1.3731,
"step": 1620
},
{
"epoch": 0.2592475497944989,
"grad_norm": 4.2914533615112305,
"learning_rate": 4.317702845100105e-05,
"loss": 1.2776,
"sparse_loss": 1.2776,
"step": 1640
},
{
"epoch": 0.26240910527979766,
"grad_norm": 4.921117782592773,
"learning_rate": 4.370389884088514e-05,
"loss": 1.4268,
"sparse_loss": 1.4268,
"step": 1660
},
{
"epoch": 0.26557066076509644,
"grad_norm": 4.515608787536621,
"learning_rate": 4.423076923076923e-05,
"loss": 1.3297,
"sparse_loss": 1.3297,
"step": 1680
},
{
"epoch": 0.2687322162503952,
"grad_norm": 4.874992847442627,
"learning_rate": 4.4757639620653316e-05,
"loss": 1.3508,
"sparse_loss": 1.3508,
"step": 1700
},
{
"epoch": 0.27189377173569396,
"grad_norm": 9.342555046081543,
"learning_rate": 4.528451001053741e-05,
"loss": 1.2828,
"sparse_loss": 1.2828,
"step": 1720
},
{
"epoch": 0.27505532722099274,
"grad_norm": 22.3294734954834,
"learning_rate": 4.58113804004215e-05,
"loss": 1.1539,
"sparse_loss": 1.1539,
"step": 1740
},
{
"epoch": 0.27821688270629147,
"grad_norm": 4.5409321784973145,
"learning_rate": 4.6338250790305585e-05,
"loss": 1.2459,
"sparse_loss": 1.2459,
"step": 1760
},
{
"epoch": 0.28137843819159025,
"grad_norm": 10.592023849487305,
"learning_rate": 4.6865121180189675e-05,
"loss": 1.241,
"sparse_loss": 1.241,
"step": 1780
},
{
"epoch": 0.28453999367688904,
"grad_norm": 92.1581802368164,
"learning_rate": 4.7391991570073765e-05,
"loss": 1.2905,
"sparse_loss": 1.2905,
"step": 1800
},
{
"epoch": 0.2877015491621878,
"grad_norm": 8.384659767150879,
"learning_rate": 4.791886195995785e-05,
"loss": 1.2808,
"sparse_loss": 1.2808,
"step": 1820
},
{
"epoch": 0.29086310464748655,
"grad_norm": 3.508902072906494,
"learning_rate": 4.8445732349841945e-05,
"loss": 1.2958,
"sparse_loss": 1.2958,
"step": 1840
},
{
"epoch": 0.29402466013278533,
"grad_norm": 2.9955217838287354,
"learning_rate": 4.8972602739726034e-05,
"loss": 1.0703,
"sparse_loss": 1.0703,
"step": 1860
},
{
"epoch": 0.2971862156180841,
"grad_norm": 3.8358285427093506,
"learning_rate": 4.949947312961012e-05,
"loss": 1.3482,
"sparse_loss": 1.3482,
"step": 1880
},
{
"epoch": 0.30034777110338284,
"grad_norm": 4.9970479011535645,
"learning_rate": 4.999999893323271e-05,
"loss": 1.2378,
"sparse_loss": 1.2378,
"step": 1900
},
{
"epoch": 0.3035093265886816,
"grad_norm": 5.74011754989624,
"learning_rate": 4.999952955709672e-05,
"loss": 1.2849,
"sparse_loss": 1.2849,
"step": 1920
},
{
"epoch": 0.3066708820739804,
"grad_norm": 4.790186882019043,
"learning_rate": 4.999820678560873e-05,
"loss": 1.2481,
"sparse_loss": 1.2481,
"step": 1940
},
{
"epoch": 0.30983243755927914,
"grad_norm": 4.901861667633057,
"learning_rate": 4.999603066392346e-05,
"loss": 1.2009,
"sparse_loss": 1.2009,
"step": 1960
},
{
"epoch": 0.3129939930445779,
"grad_norm": 8.495441436767578,
"learning_rate": 4.999300126632601e-05,
"loss": 1.2779,
"sparse_loss": 1.2779,
"step": 1980
},
{
"epoch": 0.3161555485298767,
"grad_norm": 3.7238235473632812,
"learning_rate": 4.998911869622926e-05,
"loss": 1.0882,
"sparse_loss": 1.0882,
"step": 2000
},
{
"epoch": 0.3193171040151755,
"grad_norm": 7.2108354568481445,
"learning_rate": 4.998438308617042e-05,
"loss": 1.3365,
"sparse_loss": 1.3365,
"step": 2020
},
{
"epoch": 0.3224786595004742,
"grad_norm": 3.252174139022827,
"learning_rate": 4.997879459780641e-05,
"loss": 1.3396,
"sparse_loss": 1.3396,
"step": 2040
},
{
"epoch": 0.325640214985773,
"grad_norm": 4.206798076629639,
"learning_rate": 4.997235342190843e-05,
"loss": 1.3244,
"sparse_loss": 1.3244,
"step": 2060
},
{
"epoch": 0.3288017704710718,
"grad_norm": 3.757575511932373,
"learning_rate": 4.996505977835541e-05,
"loss": 1.1792,
"sparse_loss": 1.1792,
"step": 2080
},
{
"epoch": 0.3319633259563705,
"grad_norm": 5.497296333312988,
"learning_rate": 4.995691391612649e-05,
"loss": 1.2505,
"sparse_loss": 1.2505,
"step": 2100
},
{
"epoch": 0.3351248814416693,
"grad_norm": 5.120765686035156,
"learning_rate": 4.994791611329253e-05,
"loss": 1.2359,
"sparse_loss": 1.2359,
"step": 2120
},
{
"epoch": 0.3382864369269681,
"grad_norm": 15.797529220581055,
"learning_rate": 4.9938066677006644e-05,
"loss": 1.2179,
"sparse_loss": 1.2179,
"step": 2140
},
{
"epoch": 0.3414479924122668,
"grad_norm": 7.707025051116943,
"learning_rate": 4.9927365943493686e-05,
"loss": 1.2451,
"sparse_loss": 1.2451,
"step": 2160
},
{
"epoch": 0.3446095478975656,
"grad_norm": 4.341976642608643,
"learning_rate": 4.991581427803879e-05,
"loss": 1.0311,
"sparse_loss": 1.0311,
"step": 2180
},
{
"epoch": 0.3477711033828644,
"grad_norm": 4.378412246704102,
"learning_rate": 4.990341207497485e-05,
"loss": 1.12,
"sparse_loss": 1.12,
"step": 2200
},
{
"epoch": 0.35093265886816316,
"grad_norm": 7.830859661102295,
"learning_rate": 4.989015975766916e-05,
"loss": 1.0732,
"sparse_loss": 1.0732,
"step": 2220
},
{
"epoch": 0.3540942143534619,
"grad_norm": 4.191309452056885,
"learning_rate": 4.987605777850886e-05,
"loss": 1.1785,
"sparse_loss": 1.1785,
"step": 2240
},
{
"epoch": 0.3572557698387607,
"grad_norm": 21.474308013916016,
"learning_rate": 4.986110661888555e-05,
"loss": 1.1085,
"sparse_loss": 1.1085,
"step": 2260
},
{
"epoch": 0.36041732532405946,
"grad_norm": 5.993708610534668,
"learning_rate": 4.9845306789178833e-05,
"loss": 1.2287,
"sparse_loss": 1.2287,
"step": 2280
},
{
"epoch": 0.3635788808093582,
"grad_norm": 3.8776209354400635,
"learning_rate": 4.982865882873893e-05,
"loss": 1.0967,
"sparse_loss": 1.0967,
"step": 2300
},
{
"epoch": 0.36674043629465697,
"grad_norm": 4.90541934967041,
"learning_rate": 4.9811163305868185e-05,
"loss": 1.157,
"sparse_loss": 1.157,
"step": 2320
},
{
"epoch": 0.36990199177995575,
"grad_norm": 14.786788940429688,
"learning_rate": 4.9792820817801776e-05,
"loss": 1.1239,
"sparse_loss": 1.1239,
"step": 2340
},
{
"epoch": 0.3730635472652545,
"grad_norm": 10.61219596862793,
"learning_rate": 4.977363199068724e-05,
"loss": 1.1468,
"sparse_loss": 1.1468,
"step": 2360
},
{
"epoch": 0.37622510275055326,
"grad_norm": 3.993335008621216,
"learning_rate": 4.9753597479563135e-05,
"loss": 1.1354,
"sparse_loss": 1.1354,
"step": 2380
},
{
"epoch": 0.37938665823585205,
"grad_norm": 25.283395767211914,
"learning_rate": 4.9732717968336684e-05,
"loss": 1.2344,
"sparse_loss": 1.2344,
"step": 2400
},
{
"epoch": 0.38254821372115083,
"grad_norm": 3.873542547225952,
"learning_rate": 4.971099416976041e-05,
"loss": 1.0804,
"sparse_loss": 1.0804,
"step": 2420
},
{
"epoch": 0.38570976920644956,
"grad_norm": 6.974337100982666,
"learning_rate": 4.968842682540782e-05,
"loss": 1.4594,
"sparse_loss": 1.4594,
"step": 2440
},
{
"epoch": 0.38887132469174834,
"grad_norm": 3.187790870666504,
"learning_rate": 4.966501670564807e-05,
"loss": 1.1694,
"sparse_loss": 1.1694,
"step": 2460
},
{
"epoch": 0.3920328801770471,
"grad_norm": 12.773173332214355,
"learning_rate": 4.964076460961971e-05,
"loss": 1.066,
"sparse_loss": 1.066,
"step": 2480
},
{
"epoch": 0.39519443566234586,
"grad_norm": 4.0825371742248535,
"learning_rate": 4.961567136520335e-05,
"loss": 1.0973,
"sparse_loss": 1.0973,
"step": 2500
},
{
"epoch": 0.39835599114764464,
"grad_norm": 3.0021812915802,
"learning_rate": 4.958973782899344e-05,
"loss": 1.3674,
"sparse_loss": 1.3674,
"step": 2520
},
{
"epoch": 0.4015175466329434,
"grad_norm": 4.639908790588379,
"learning_rate": 4.9562964886269005e-05,
"loss": 1.169,
"sparse_loss": 1.169,
"step": 2540
},
{
"epoch": 0.40467910211824215,
"grad_norm": 151.06048583984375,
"learning_rate": 4.953535345096344e-05,
"loss": 1.4649,
"sparse_loss": 1.4649,
"step": 2560
},
{
"epoch": 0.40784065760354093,
"grad_norm": 33.80763244628906,
"learning_rate": 4.95069044656333e-05,
"loss": 4.1995,
"sparse_loss": 4.1995,
"step": 2580
},
{
"epoch": 0.4110022130888397,
"grad_norm": 5.223995685577393,
"learning_rate": 4.947761890142615e-05,
"loss": 1.5354,
"sparse_loss": 1.5354,
"step": 2600
},
{
"epoch": 0.4141637685741385,
"grad_norm": 15.408181190490723,
"learning_rate": 4.9447497758047354e-05,
"loss": 1.2286,
"sparse_loss": 1.2286,
"step": 2620
},
{
"epoch": 0.41732532405943723,
"grad_norm": 2.977509021759033,
"learning_rate": 4.941654206372602e-05,
"loss": 1.1715,
"sparse_loss": 1.1715,
"step": 2640
},
{
"epoch": 0.420486879544736,
"grad_norm": 4.334832191467285,
"learning_rate": 4.9384752875179876e-05,
"loss": 1.1055,
"sparse_loss": 1.1055,
"step": 2660
},
{
"epoch": 0.4236484350300348,
"grad_norm": 5.943917751312256,
"learning_rate": 4.9352131277579144e-05,
"loss": 1.0437,
"sparse_loss": 1.0437,
"step": 2680
},
{
"epoch": 0.4268099905153335,
"grad_norm": 4.5376996994018555,
"learning_rate": 4.931867838450959e-05,
"loss": 1.0396,
"sparse_loss": 1.0396,
"step": 2700
},
{
"epoch": 0.4299715460006323,
"grad_norm": 8.081282615661621,
"learning_rate": 4.928439533793443e-05,
"loss": 1.0279,
"sparse_loss": 1.0279,
"step": 2720
},
{
"epoch": 0.4331331014859311,
"grad_norm": 2.9060018062591553,
"learning_rate": 4.92492833081554e-05,
"loss": 1.0892,
"sparse_loss": 1.0892,
"step": 2740
},
{
"epoch": 0.4362946569712298,
"grad_norm": 1.9138152599334717,
"learning_rate": 4.921334349377277e-05,
"loss": 1.0709,
"sparse_loss": 1.0709,
"step": 2760
},
{
"epoch": 0.4394562124565286,
"grad_norm": 3.0852744579315186,
"learning_rate": 4.917657712164445e-05,
"loss": 1.0889,
"sparse_loss": 1.0889,
"step": 2780
},
{
"epoch": 0.4426177679418274,
"grad_norm": 2.95231294631958,
"learning_rate": 4.91389854468441e-05,
"loss": 1.1733,
"sparse_loss": 1.1733,
"step": 2800
},
{
"epoch": 0.4457793234271262,
"grad_norm": 4.986962795257568,
"learning_rate": 4.910056975261829e-05,
"loss": 1.1991,
"sparse_loss": 1.1991,
"step": 2820
},
{
"epoch": 0.4489408789124249,
"grad_norm": 4.206987380981445,
"learning_rate": 4.906133135034269e-05,
"loss": 1.2295,
"sparse_loss": 1.2295,
"step": 2840
},
{
"epoch": 0.4521024343977237,
"grad_norm": 4.294253349304199,
"learning_rate": 4.902127157947732e-05,
"loss": 1.0959,
"sparse_loss": 1.0959,
"step": 2860
},
{
"epoch": 0.45526398988302247,
"grad_norm": 4.660038948059082,
"learning_rate": 4.898039180752079e-05,
"loss": 1.1378,
"sparse_loss": 1.1378,
"step": 2880
},
{
"epoch": 0.4584255453683212,
"grad_norm": 14.10377311706543,
"learning_rate": 4.893869342996367e-05,
"loss": 1.0923,
"sparse_loss": 1.0923,
"step": 2900
},
{
"epoch": 0.46158710085362,
"grad_norm": 55.42957305908203,
"learning_rate": 4.889617787024079e-05,
"loss": 1.1051,
"sparse_loss": 1.1051,
"step": 2920
},
{
"epoch": 0.46474865633891876,
"grad_norm": 2.9921209812164307,
"learning_rate": 4.885284657968272e-05,
"loss": 1.0354,
"sparse_loss": 1.0354,
"step": 2940
},
{
"epoch": 0.4679102118242175,
"grad_norm": 1.9534173011779785,
"learning_rate": 4.880870103746617e-05,
"loss": 1.1892,
"sparse_loss": 1.1892,
"step": 2960
},
{
"epoch": 0.4710717673095163,
"grad_norm": 3.1078741550445557,
"learning_rate": 4.8763742750563515e-05,
"loss": 1.1636,
"sparse_loss": 1.1636,
"step": 2980
},
{
"epoch": 0.47423332279481506,
"grad_norm": 5.104287147521973,
"learning_rate": 4.8717973253691365e-05,
"loss": 1.2148,
"sparse_loss": 1.2148,
"step": 3000
},
{
"epoch": 0.47739487828011384,
"grad_norm": 3.2280941009521484,
"learning_rate": 4.867139410925815e-05,
"loss": 1.196,
"sparse_loss": 1.196,
"step": 3020
},
{
"epoch": 0.48055643376541257,
"grad_norm": 2.9964301586151123,
"learning_rate": 4.8624006907310804e-05,
"loss": 1.1401,
"sparse_loss": 1.1401,
"step": 3040
},
{
"epoch": 0.48371798925071136,
"grad_norm": 8.02204418182373,
"learning_rate": 4.857581326548049e-05,
"loss": 1.1311,
"sparse_loss": 1.1311,
"step": 3060
},
{
"epoch": 0.48687954473601014,
"grad_norm": 4.118027687072754,
"learning_rate": 4.852681482892735e-05,
"loss": 1.191,
"sparse_loss": 1.191,
"step": 3080
},
{
"epoch": 0.49004110022130887,
"grad_norm": 2.0322353839874268,
"learning_rate": 4.847701327028439e-05,
"loss": 1.1354,
"sparse_loss": 1.1354,
"step": 3100
},
{
"epoch": 0.49320265570660765,
"grad_norm": 3.882215738296509,
"learning_rate": 4.8426410289600356e-05,
"loss": 1.1447,
"sparse_loss": 1.1447,
"step": 3120
},
{
"epoch": 0.49636421119190643,
"grad_norm": 45.387672424316406,
"learning_rate": 4.837500761428167e-05,
"loss": 0.9855,
"sparse_loss": 0.9855,
"step": 3140
},
{
"epoch": 0.49952576667720516,
"grad_norm": 4.83632755279541,
"learning_rate": 4.832280699903355e-05,
"loss": 1.1387,
"sparse_loss": 1.1387,
"step": 3160
},
{
"epoch": 0.502687322162504,
"grad_norm": 2.312962532043457,
"learning_rate": 4.826981022580001e-05,
"loss": 1.2482,
"sparse_loss": 1.2482,
"step": 3180
},
{
"epoch": 0.5058488776478027,
"grad_norm": 6.244421482086182,
"learning_rate": 4.821601910370308e-05,
"loss": 1.0939,
"sparse_loss": 1.0939,
"step": 3200
},
{
"epoch": 0.5090104331331015,
"grad_norm": 4.036285400390625,
"learning_rate": 4.8161435468981074e-05,
"loss": 1.1258,
"sparse_loss": 1.1258,
"step": 3220
},
{
"epoch": 0.5121719886184003,
"grad_norm": 7.3748884201049805,
"learning_rate": 4.8106061184925856e-05,
"loss": 1.0983,
"sparse_loss": 1.0983,
"step": 3240
},
{
"epoch": 0.515333544103699,
"grad_norm": 2.0951173305511475,
"learning_rate": 4.804989814181926e-05,
"loss": 0.9883,
"sparse_loss": 0.9883,
"step": 3260
},
{
"epoch": 0.5184950995889978,
"grad_norm": 5.343947410583496,
"learning_rate": 4.799294825686855e-05,
"loss": 1.0743,
"sparse_loss": 1.0743,
"step": 3280
},
{
"epoch": 0.5216566550742966,
"grad_norm": 4.659270286560059,
"learning_rate": 4.793521347414102e-05,
"loss": 1.0794,
"sparse_loss": 1.0794,
"step": 3300
},
{
"epoch": 0.5248182105595953,
"grad_norm": 4.440898895263672,
"learning_rate": 4.787669576449755e-05,
"loss": 1.0888,
"sparse_loss": 1.0888,
"step": 3320
},
{
"epoch": 0.527979766044894,
"grad_norm": 3.1608309745788574,
"learning_rate": 4.781739712552539e-05,
"loss": 1.0183,
"sparse_loss": 1.0183,
"step": 3340
},
{
"epoch": 0.5311413215301929,
"grad_norm": 5.888028144836426,
"learning_rate": 4.775731958146995e-05,
"loss": 1.0622,
"sparse_loss": 1.0622,
"step": 3360
},
{
"epoch": 0.5343028770154916,
"grad_norm": 5.10470724105835,
"learning_rate": 4.769646518316568e-05,
"loss": 1.0711,
"sparse_loss": 1.0711,
"step": 3380
},
{
"epoch": 0.5374644325007903,
"grad_norm": 6.20997953414917,
"learning_rate": 4.763483600796612e-05,
"loss": 1.0966,
"sparse_loss": 1.0966,
"step": 3400
},
{
"epoch": 0.5406259879860892,
"grad_norm": 4.204266548156738,
"learning_rate": 4.757243415967291e-05,
"loss": 1.0076,
"sparse_loss": 1.0076,
"step": 3420
},
{
"epoch": 0.5437875434713879,
"grad_norm": 2.871713399887085,
"learning_rate": 4.750926176846404e-05,
"loss": 1.0147,
"sparse_loss": 1.0147,
"step": 3440
},
{
"epoch": 0.5469490989566866,
"grad_norm": 3.438703775405884,
"learning_rate": 4.744532099082107e-05,
"loss": 0.948,
"sparse_loss": 0.948,
"step": 3460
},
{
"epoch": 0.5501106544419855,
"grad_norm": 2.7154433727264404,
"learning_rate": 4.7380614009455595e-05,
"loss": 1.0644,
"sparse_loss": 1.0644,
"step": 3480
},
{
"epoch": 0.5532722099272842,
"grad_norm": 34.20964813232422,
"learning_rate": 4.7315143033234654e-05,
"loss": 1.0487,
"sparse_loss": 1.0487,
"step": 3500
},
{
"epoch": 0.5564337654125829,
"grad_norm": 3.2565040588378906,
"learning_rate": 4.724891029710537e-05,
"loss": 1.0309,
"sparse_loss": 1.0309,
"step": 3520
},
{
"epoch": 0.5595953208978818,
"grad_norm": 2.995903968811035,
"learning_rate": 4.7181918062018674e-05,
"loss": 1.2914,
"sparse_loss": 1.2914,
"step": 3540
},
{
"epoch": 0.5627568763831805,
"grad_norm": 1.9953080415725708,
"learning_rate": 4.7114168614852064e-05,
"loss": 1.0632,
"sparse_loss": 1.0632,
"step": 3560
},
{
"epoch": 0.5659184318684793,
"grad_norm": 2.5321481227874756,
"learning_rate": 4.70456642683316e-05,
"loss": 1.1658,
"sparse_loss": 1.1658,
"step": 3580
},
{
"epoch": 0.5690799873537781,
"grad_norm": 4.2941131591796875,
"learning_rate": 4.697640736095292e-05,
"loss": 1.0742,
"sparse_loss": 1.0742,
"step": 3600
},
{
"epoch": 0.5722415428390768,
"grad_norm": 2.0793707370758057,
"learning_rate": 4.690640025690143e-05,
"loss": 1.1901,
"sparse_loss": 1.1901,
"step": 3620
},
{
"epoch": 0.5754030983243756,
"grad_norm": 9.403236389160156,
"learning_rate": 4.683564534597159e-05,
"loss": 0.9989,
"sparse_loss": 0.9989,
"step": 3640
},
{
"epoch": 0.5785646538096744,
"grad_norm": 4.044275760650635,
"learning_rate": 4.676414504348533e-05,
"loss": 1.0833,
"sparse_loss": 1.0833,
"step": 3660
},
{
"epoch": 0.5817262092949731,
"grad_norm": 3.725008487701416,
"learning_rate": 4.669190179020962e-05,
"loss": 1.3316,
"sparse_loss": 1.3316,
"step": 3680
},
{
"epoch": 0.5848877647802719,
"grad_norm": 2.5833184719085693,
"learning_rate": 4.661891805227313e-05,
"loss": 1.3381,
"sparse_loss": 1.3381,
"step": 3700
},
{
"epoch": 0.5880493202655707,
"grad_norm": 112.98980712890625,
"learning_rate": 4.654519632108204e-05,
"loss": 1.1105,
"sparse_loss": 1.1105,
"step": 3720
},
{
"epoch": 0.5912108757508694,
"grad_norm": 3.47724986076355,
"learning_rate": 4.6470739113235026e-05,
"loss": 1.0299,
"sparse_loss": 1.0299,
"step": 3740
},
{
"epoch": 0.5943724312361682,
"grad_norm": 5.579695701599121,
"learning_rate": 4.639554897043731e-05,
"loss": 0.9969,
"sparse_loss": 0.9969,
"step": 3760
},
{
"epoch": 0.597533986721467,
"grad_norm": 3.7847225666046143,
"learning_rate": 4.6319628459413946e-05,
"loss": 0.9187,
"sparse_loss": 0.9187,
"step": 3780
},
{
"epoch": 0.6006955422067657,
"grad_norm": 3.587329864501953,
"learning_rate": 4.6242980171822134e-05,
"loss": 1.0435,
"sparse_loss": 1.0435,
"step": 3800
},
{
"epoch": 0.6038570976920645,
"grad_norm": 8.99221420288086,
"learning_rate": 4.6165606724162816e-05,
"loss": 1.0201,
"sparse_loss": 1.0201,
"step": 3820
},
{
"epoch": 0.6070186531773633,
"grad_norm": 2.8481829166412354,
"learning_rate": 4.608751075769131e-05,
"loss": 1.2422,
"sparse_loss": 1.2422,
"step": 3840
},
{
"epoch": 0.610180208662662,
"grad_norm": 10.8621187210083,
"learning_rate": 4.600869493832718e-05,
"loss": 1.0296,
"sparse_loss": 1.0296,
"step": 3860
},
{
"epoch": 0.6133417641479608,
"grad_norm": 2.5077457427978516,
"learning_rate": 4.592916195656322e-05,
"loss": 1.0305,
"sparse_loss": 1.0305,
"step": 3880
},
{
"epoch": 0.6165033196332595,
"grad_norm": 4.512426376342773,
"learning_rate": 4.5848914527373574e-05,
"loss": 1.3777,
"sparse_loss": 1.3777,
"step": 3900
},
{
"epoch": 0.6196648751185583,
"grad_norm": 5.227989196777344,
"learning_rate": 4.576795539012114e-05,
"loss": 0.9716,
"sparse_loss": 0.9716,
"step": 3920
},
{
"epoch": 0.6228264306038571,
"grad_norm": 2.234457015991211,
"learning_rate": 4.568628730846397e-05,
"loss": 0.8175,
"sparse_loss": 0.8175,
"step": 3940
},
{
"epoch": 0.6259879860891558,
"grad_norm": 2.7576990127563477,
"learning_rate": 4.560391307026097e-05,
"loss": 1.0889,
"sparse_loss": 1.0889,
"step": 3960
},
{
"epoch": 0.6291495415744547,
"grad_norm": 3.3322880268096924,
"learning_rate": 4.5520835487476753e-05,
"loss": 0.9866,
"sparse_loss": 0.9866,
"step": 3980
},
{
"epoch": 0.6323110970597534,
"grad_norm": 2.961534261703491,
"learning_rate": 4.5437057396085584e-05,
"loss": 0.9558,
"sparse_loss": 0.9558,
"step": 4000
},
{
"epoch": 0.6354726525450521,
"grad_norm": 8.994413375854492,
"learning_rate": 4.535258165597465e-05,
"loss": 0.9634,
"sparse_loss": 0.9634,
"step": 4020
},
{
"epoch": 0.638634208030351,
"grad_norm": 9.025190353393555,
"learning_rate": 4.526741115084636e-05,
"loss": 1.0034,
"sparse_loss": 1.0034,
"step": 4040
},
{
"epoch": 0.6417957635156497,
"grad_norm": 5.431192398071289,
"learning_rate": 4.518154878811997e-05,
"loss": 1.0137,
"sparse_loss": 1.0137,
"step": 4060
},
{
"epoch": 0.6449573190009484,
"grad_norm": 7.579077243804932,
"learning_rate": 4.509499749883226e-05,
"loss": 1.0167,
"sparse_loss": 1.0167,
"step": 4080
},
{
"epoch": 0.6481188744862473,
"grad_norm": 5.1751909255981445,
"learning_rate": 4.5007760237537566e-05,
"loss": 0.9525,
"sparse_loss": 0.9525,
"step": 4100
},
{
"epoch": 0.651280429971546,
"grad_norm": 5.494132995605469,
"learning_rate": 4.491983998220686e-05,
"loss": 0.978,
"sparse_loss": 0.978,
"step": 4120
},
{
"epoch": 0.6544419854568447,
"grad_norm": 2.640233278274536,
"learning_rate": 4.483123973412611e-05,
"loss": 1.0011,
"sparse_loss": 1.0011,
"step": 4140
},
{
"epoch": 0.6576035409421436,
"grad_norm": 7.292989253997803,
"learning_rate": 4.474196251779381e-05,
"loss": 1.0074,
"sparse_loss": 1.0074,
"step": 4160
},
{
"epoch": 0.6607650964274423,
"grad_norm": 87.05461120605469,
"learning_rate": 4.465201138081778e-05,
"loss": 1.0582,
"sparse_loss": 1.0582,
"step": 4180
},
{
"epoch": 0.663926651912741,
"grad_norm": 4.470405578613281,
"learning_rate": 4.4561389393811096e-05,
"loss": 1.0093,
"sparse_loss": 1.0093,
"step": 4200
},
{
"epoch": 0.6670882073980399,
"grad_norm": 4.9339518547058105,
"learning_rate": 4.4470099650287255e-05,
"loss": 0.9241,
"sparse_loss": 0.9241,
"step": 4220
},
{
"epoch": 0.6702497628833386,
"grad_norm": 2.3951964378356934,
"learning_rate": 4.4378145266554625e-05,
"loss": 0.9304,
"sparse_loss": 0.9304,
"step": 4240
},
{
"epoch": 0.6734113183686373,
"grad_norm": 2.9704620838165283,
"learning_rate": 4.428552938161002e-05,
"loss": 1.04,
"sparse_loss": 1.04,
"step": 4260
},
{
"epoch": 0.6765728738539362,
"grad_norm": 46.931556701660156,
"learning_rate": 4.419225515703155e-05,
"loss": 0.9433,
"sparse_loss": 0.9433,
"step": 4280
},
{
"epoch": 0.6797344293392349,
"grad_norm": 2.5629138946533203,
"learning_rate": 4.4098325776870734e-05,
"loss": 1.2199,
"sparse_loss": 1.2199,
"step": 4300
},
{
"epoch": 0.6828959848245336,
"grad_norm": 40.42338180541992,
"learning_rate": 4.400374444754376e-05,
"loss": 0.9545,
"sparse_loss": 0.9545,
"step": 4320
},
{
"epoch": 0.6860575403098325,
"grad_norm": 3.68564510345459,
"learning_rate": 4.3908514397722064e-05,
"loss": 0.9973,
"sparse_loss": 0.9973,
"step": 4340
},
{
"epoch": 0.6892190957951312,
"grad_norm": 2.8047549724578857,
"learning_rate": 4.3812638878222095e-05,
"loss": 1.0985,
"sparse_loss": 1.0985,
"step": 4360
},
{
"epoch": 0.69238065128043,
"grad_norm": 4.292630672454834,
"learning_rate": 4.371612116189434e-05,
"loss": 0.8444,
"sparse_loss": 0.8444,
"step": 4380
},
{
"epoch": 0.6955422067657288,
"grad_norm": 2.901982307434082,
"learning_rate": 4.361896454351162e-05,
"loss": 0.9891,
"sparse_loss": 0.9891,
"step": 4400
},
{
"epoch": 0.6987037622510275,
"grad_norm": 5.447702407836914,
"learning_rate": 4.3521172339656616e-05,
"loss": 1.0007,
"sparse_loss": 1.0007,
"step": 4420
},
{
"epoch": 0.7018653177363263,
"grad_norm": 7.081545829772949,
"learning_rate": 4.342274788860863e-05,
"loss": 1.3366,
"sparse_loss": 1.3366,
"step": 4440
},
{
"epoch": 0.705026873221625,
"grad_norm": 4.312632083892822,
"learning_rate": 4.332369455022965e-05,
"loss": 0.9928,
"sparse_loss": 0.9928,
"step": 4460
},
{
"epoch": 0.7081884287069238,
"grad_norm": 2.817713499069214,
"learning_rate": 4.322401570584965e-05,
"loss": 0.9269,
"sparse_loss": 0.9269,
"step": 4480
},
{
"epoch": 0.7113499841922226,
"grad_norm": 3.021247148513794,
"learning_rate": 4.312371475815116e-05,
"loss": 0.8685,
"sparse_loss": 0.8685,
"step": 4500
},
{
"epoch": 0.7145115396775213,
"grad_norm": 4.620492935180664,
"learning_rate": 4.3022795131053104e-05,
"loss": 0.9361,
"sparse_loss": 0.9361,
"step": 4520
},
{
"epoch": 0.7176730951628201,
"grad_norm": 3.594322443008423,
"learning_rate": 4.2921260269593954e-05,
"loss": 1.0142,
"sparse_loss": 1.0142,
"step": 4540
},
{
"epoch": 0.7208346506481189,
"grad_norm": 2.517115831375122,
"learning_rate": 4.281911363981407e-05,
"loss": 0.9497,
"sparse_loss": 0.9497,
"step": 4560
},
{
"epoch": 0.7239962061334176,
"grad_norm": 2.5836758613586426,
"learning_rate": 4.271635872863744e-05,
"loss": 0.9003,
"sparse_loss": 0.9003,
"step": 4580
},
{
"epoch": 0.7271577616187164,
"grad_norm": 3.6526050567626953,
"learning_rate": 4.261299904375261e-05,
"loss": 1.6835,
"sparse_loss": 1.6835,
"step": 4600
},
{
"epoch": 0.7303193171040152,
"grad_norm": 3.3744492530822754,
"learning_rate": 4.250903811349297e-05,
"loss": 0.9629,
"sparse_loss": 0.9629,
"step": 4620
},
{
"epoch": 0.7334808725893139,
"grad_norm": 4.435449123382568,
"learning_rate": 4.240447948671628e-05,
"loss": 0.9577,
"sparse_loss": 0.9577,
"step": 4640
},
{
"epoch": 0.7366424280746127,
"grad_norm": 10.854448318481445,
"learning_rate": 4.2299326732683555e-05,
"loss": 0.8803,
"sparse_loss": 0.8803,
"step": 4660
},
{
"epoch": 0.7398039835599115,
"grad_norm": 13.589402198791504,
"learning_rate": 4.219358344093719e-05,
"loss": 0.8339,
"sparse_loss": 0.8339,
"step": 4680
},
{
"epoch": 0.7429655390452102,
"grad_norm": 3.220682144165039,
"learning_rate": 4.208725322117848e-05,
"loss": 0.98,
"sparse_loss": 0.98,
"step": 4700
},
{
"epoch": 0.746127094530509,
"grad_norm": 3.848068952560425,
"learning_rate": 4.1980339703144325e-05,
"loss": 0.9304,
"sparse_loss": 0.9304,
"step": 4720
},
{
"epoch": 0.7492886500158078,
"grad_norm": 4.343535423278809,
"learning_rate": 4.1872846536483377e-05,
"loss": 0.9197,
"sparse_loss": 0.9197,
"step": 4740
},
{
"epoch": 0.7524502055011065,
"grad_norm": 22.216102600097656,
"learning_rate": 4.176477739063146e-05,
"loss": 0.9199,
"sparse_loss": 0.9199,
"step": 4760
},
{
"epoch": 0.7556117609864053,
"grad_norm": 4.729385852813721,
"learning_rate": 4.165613595468624e-05,
"loss": 0.9136,
"sparse_loss": 0.9136,
"step": 4780
},
{
"epoch": 0.7587733164717041,
"grad_norm": 3.364084482192993,
"learning_rate": 4.1546925937281376e-05,
"loss": 1.0395,
"sparse_loss": 1.0395,
"step": 4800
},
{
"epoch": 0.7619348719570028,
"grad_norm": 2.467050790786743,
"learning_rate": 4.143715106645986e-05,
"loss": 0.896,
"sparse_loss": 0.896,
"step": 4820
},
{
"epoch": 0.7650964274423017,
"grad_norm": 81.41665649414062,
"learning_rate": 4.13268150895468e-05,
"loss": 1.0098,
"sparse_loss": 1.0098,
"step": 4840
},
{
"epoch": 0.7682579829276004,
"grad_norm": 2.784630298614502,
"learning_rate": 4.121592177302147e-05,
"loss": 0.9415,
"sparse_loss": 0.9415,
"step": 4860
},
{
"epoch": 0.7714195384128991,
"grad_norm": 3.2318930625915527,
"learning_rate": 4.1104474902388734e-05,
"loss": 1.0379,
"sparse_loss": 1.0379,
"step": 4880
},
{
"epoch": 0.774581093898198,
"grad_norm": 5.31190299987793,
"learning_rate": 4.099247828204984e-05,
"loss": 0.9674,
"sparse_loss": 0.9674,
"step": 4900
},
{
"epoch": 0.7777426493834967,
"grad_norm": 3.919340133666992,
"learning_rate": 4.0879935735172526e-05,
"loss": 0.9471,
"sparse_loss": 0.9471,
"step": 4920
},
{
"epoch": 0.7809042048687954,
"grad_norm": 26.871673583984375,
"learning_rate": 4.076685110356057e-05,
"loss": 0.9859,
"sparse_loss": 0.9859,
"step": 4940
},
{
"epoch": 0.7840657603540943,
"grad_norm": 9.538094520568848,
"learning_rate": 4.0653228247522545e-05,
"loss": 1.3406,
"sparse_loss": 1.3406,
"step": 4960
},
{
"epoch": 0.787227315839393,
"grad_norm": 3.3164422512054443,
"learning_rate": 4.053907104574016e-05,
"loss": 1.0039,
"sparse_loss": 1.0039,
"step": 4980
},
{
"epoch": 0.7903888713246917,
"grad_norm": 3.28509783744812,
"learning_rate": 4.042438339513573e-05,
"loss": 0.9906,
"sparse_loss": 0.9906,
"step": 5000
},
{
"epoch": 0.7935504268099906,
"grad_norm": 5.863593101501465,
"learning_rate": 4.030916921073926e-05,
"loss": 1.333,
"sparse_loss": 1.333,
"step": 5020
},
{
"epoch": 0.7967119822952893,
"grad_norm": 11.845443725585938,
"learning_rate": 4.019343242555474e-05,
"loss": 0.9501,
"sparse_loss": 0.9501,
"step": 5040
},
{
"epoch": 0.799873537780588,
"grad_norm": 3.840085506439209,
"learning_rate": 4.00771769904259e-05,
"loss": 0.9624,
"sparse_loss": 0.9624,
"step": 5060
},
{
"epoch": 0.8030350932658868,
"grad_norm": 7.45705509185791,
"learning_rate": 3.9960406873901335e-05,
"loss": 1.1257,
"sparse_loss": 1.1257,
"step": 5080
},
{
"epoch": 0.8061966487511856,
"grad_norm": 2.9892349243164062,
"learning_rate": 3.984312606209904e-05,
"loss": 1.0608,
"sparse_loss": 1.0608,
"step": 5100
},
{
"epoch": 0.8093582042364843,
"grad_norm": 4.22726583480835,
"learning_rate": 3.9725338558570335e-05,
"loss": 0.8869,
"sparse_loss": 0.8869,
"step": 5120
},
{
"epoch": 0.8125197597217831,
"grad_norm": 71.5716552734375,
"learning_rate": 3.960704838416321e-05,
"loss": 1.024,
"sparse_loss": 1.024,
"step": 5140
},
{
"epoch": 0.8156813152070819,
"grad_norm": 3.4674580097198486,
"learning_rate": 3.948825957688506e-05,
"loss": 0.9383,
"sparse_loss": 0.9383,
"step": 5160
},
{
"epoch": 0.8188428706923806,
"grad_norm": 22.559823989868164,
"learning_rate": 3.9368976191764806e-05,
"loss": 1.2298,
"sparse_loss": 1.2298,
"step": 5180
},
{
"epoch": 0.8220044261776794,
"grad_norm": 2.515516757965088,
"learning_rate": 3.924920230071456e-05,
"loss": 1.0466,
"sparse_loss": 1.0466,
"step": 5200
},
{
"epoch": 0.8251659816629782,
"grad_norm": 2.4846043586730957,
"learning_rate": 3.912894199239052e-05,
"loss": 0.8882,
"sparse_loss": 0.8882,
"step": 5220
},
{
"epoch": 0.828327537148277,
"grad_norm": 4.767989635467529,
"learning_rate": 3.900819937205348e-05,
"loss": 0.981,
"sparse_loss": 0.981,
"step": 5240
},
{
"epoch": 0.8314890926335757,
"grad_norm": 5.391225814819336,
"learning_rate": 3.888697856142861e-05,
"loss": 0.9302,
"sparse_loss": 0.9302,
"step": 5260
},
{
"epoch": 0.8346506481188745,
"grad_norm": 4.460025787353516,
"learning_rate": 3.876528369856486e-05,
"loss": 0.8613,
"sparse_loss": 0.8613,
"step": 5280
},
{
"epoch": 0.8378122036041733,
"grad_norm": 3.7380456924438477,
"learning_rate": 3.864311893769361e-05,
"loss": 1.1334,
"sparse_loss": 1.1334,
"step": 5300
},
{
"epoch": 0.840973759089472,
"grad_norm": 3.2851319313049316,
"learning_rate": 3.85204884490869e-05,
"loss": 1.0204,
"sparse_loss": 1.0204,
"step": 5320
},
{
"epoch": 0.8441353145747708,
"grad_norm": 4.777285575866699,
"learning_rate": 3.839739641891506e-05,
"loss": 0.9311,
"sparse_loss": 0.9311,
"step": 5340
},
{
"epoch": 0.8472968700600696,
"grad_norm": 5.82297945022583,
"learning_rate": 3.8273847049103816e-05,
"loss": 0.8136,
"sparse_loss": 0.8136,
"step": 5360
},
{
"epoch": 0.8504584255453683,
"grad_norm": 3.461158275604248,
"learning_rate": 3.8149844557190855e-05,
"loss": 0.8487,
"sparse_loss": 0.8487,
"step": 5380
},
{
"epoch": 0.853619981030667,
"grad_norm": 8.542764663696289,
"learning_rate": 3.802539317618185e-05,
"loss": 0.8648,
"sparse_loss": 0.8648,
"step": 5400
},
{
"epoch": 0.8567815365159659,
"grad_norm": 3.9751503467559814,
"learning_rate": 3.790049715440592e-05,
"loss": 0.8832,
"sparse_loss": 0.8832,
"step": 5420
},
{
"epoch": 0.8599430920012646,
"grad_norm": 6.192680358886719,
"learning_rate": 3.7775160755370695e-05,
"loss": 0.8357,
"sparse_loss": 0.8357,
"step": 5440
},
{
"epoch": 0.8631046474865633,
"grad_norm": 3.71183705329895,
"learning_rate": 3.764938825761671e-05,
"loss": 0.8037,
"sparse_loss": 0.8037,
"step": 5460
},
{
"epoch": 0.8662662029718622,
"grad_norm": 3.918074369430542,
"learning_rate": 3.7523183954571336e-05,
"loss": 0.9258,
"sparse_loss": 0.9258,
"step": 5480
},
{
"epoch": 0.8694277584571609,
"grad_norm": 3.448901891708374,
"learning_rate": 3.739655215440228e-05,
"loss": 0.8469,
"sparse_loss": 0.8469,
"step": 5500
},
{
"epoch": 0.8725893139424596,
"grad_norm": 3.151432752609253,
"learning_rate": 3.726949717987048e-05,
"loss": 0.8945,
"sparse_loss": 0.8945,
"step": 5520
},
{
"epoch": 0.8757508694277585,
"grad_norm": 2.4400060176849365,
"learning_rate": 3.714202336818252e-05,
"loss": 0.8608,
"sparse_loss": 0.8608,
"step": 5540
},
{
"epoch": 0.8789124249130572,
"grad_norm": 8.550524711608887,
"learning_rate": 3.701413507084264e-05,
"loss": 1.0025,
"sparse_loss": 1.0025,
"step": 5560
},
{
"epoch": 0.8820739803983559,
"grad_norm": 3.8528268337249756,
"learning_rate": 3.6885836653504124e-05,
"loss": 0.8956,
"sparse_loss": 0.8956,
"step": 5580
},
{
"epoch": 0.8852355358836548,
"grad_norm": 6.683523178100586,
"learning_rate": 3.675713249582031e-05,
"loss": 0.8487,
"sparse_loss": 0.8487,
"step": 5600
},
{
"epoch": 0.8883970913689535,
"grad_norm": 4.686063766479492,
"learning_rate": 3.662802699129508e-05,
"loss": 0.8578,
"sparse_loss": 0.8578,
"step": 5620
},
{
"epoch": 0.8915586468542523,
"grad_norm": 11.728353500366211,
"learning_rate": 3.649852454713286e-05,
"loss": 0.7899,
"sparse_loss": 0.7899,
"step": 5640
},
{
"epoch": 0.8947202023395511,
"grad_norm": 3.126070499420166,
"learning_rate": 3.636862958408818e-05,
"loss": 0.8662,
"sparse_loss": 0.8662,
"step": 5660
},
{
"epoch": 0.8978817578248498,
"grad_norm": 10.942789077758789,
"learning_rate": 3.6238346536314815e-05,
"loss": 0.8668,
"sparse_loss": 0.8668,
"step": 5680
},
{
"epoch": 0.9010433133101486,
"grad_norm": 7.786405563354492,
"learning_rate": 3.610767985121433e-05,
"loss": 0.8688,
"sparse_loss": 0.8688,
"step": 5700
},
{
"epoch": 0.9042048687954474,
"grad_norm": 4.515435218811035,
"learning_rate": 3.597663398928435e-05,
"loss": 1.035,
"sparse_loss": 1.035,
"step": 5720
},
{
"epoch": 0.9073664242807461,
"grad_norm": 5.420071601867676,
"learning_rate": 3.584521342396623e-05,
"loss": 0.8736,
"sparse_loss": 0.8736,
"step": 5740
},
{
"epoch": 0.9105279797660449,
"grad_norm": 4.7605767250061035,
"learning_rate": 3.5713422641492355e-05,
"loss": 0.9587,
"sparse_loss": 0.9587,
"step": 5760
},
{
"epoch": 0.9136895352513437,
"grad_norm": 4.38069486618042,
"learning_rate": 3.558126614073305e-05,
"loss": 0.9497,
"sparse_loss": 0.9497,
"step": 5780
},
{
"epoch": 0.9168510907366424,
"grad_norm": 3.70302152633667,
"learning_rate": 3.544874843304294e-05,
"loss": 0.8356,
"sparse_loss": 0.8356,
"step": 5800
},
{
"epoch": 0.9200126462219412,
"grad_norm": 3.1259734630584717,
"learning_rate": 3.5315874042107e-05,
"loss": 0.9136,
"sparse_loss": 0.9136,
"step": 5820
},
{
"epoch": 0.92317420170724,
"grad_norm": 9.207280158996582,
"learning_rate": 3.518264750378606e-05,
"loss": 0.8825,
"sparse_loss": 0.8825,
"step": 5840
},
{
"epoch": 0.9263357571925387,
"grad_norm": 4.420866012573242,
"learning_rate": 3.5049073365962065e-05,
"loss": 0.9497,
"sparse_loss": 0.9497,
"step": 5860
},
{
"epoch": 0.9294973126778375,
"grad_norm": 6.084704875946045,
"learning_rate": 3.491515618838275e-05,
"loss": 0.9559,
"sparse_loss": 0.9559,
"step": 5880
},
{
"epoch": 0.9326588681631363,
"grad_norm": 7.439093112945557,
"learning_rate": 3.4780900542506e-05,
"loss": 0.9589,
"sparse_loss": 0.9589,
"step": 5900
},
{
"epoch": 0.935820423648435,
"grad_norm": 2.716554880142212,
"learning_rate": 3.464631101134385e-05,
"loss": 0.8888,
"sparse_loss": 0.8888,
"step": 5920
},
{
"epoch": 0.9389819791337338,
"grad_norm": 242.3243865966797,
"learning_rate": 3.451139218930595e-05,
"loss": 0.8851,
"sparse_loss": 0.8851,
"step": 5940
},
{
"epoch": 0.9421435346190326,
"grad_norm": 4.233142852783203,
"learning_rate": 3.43761486820428e-05,
"loss": 0.9576,
"sparse_loss": 0.9576,
"step": 5960
},
{
"epoch": 0.9453050901043313,
"grad_norm": 4.0793328285217285,
"learning_rate": 3.424058510628849e-05,
"loss": 0.9139,
"sparse_loss": 0.9139,
"step": 5980
},
{
"epoch": 0.9484666455896301,
"grad_norm": 30.09717559814453,
"learning_rate": 3.410470608970313e-05,
"loss": 0.8681,
"sparse_loss": 0.8681,
"step": 6000
},
{
"epoch": 0.9516282010749288,
"grad_norm": 4.182109355926514,
"learning_rate": 3.396851627071484e-05,
"loss": 0.8576,
"sparse_loss": 0.8576,
"step": 6020
},
{
"epoch": 0.9547897565602277,
"grad_norm": 10.850983619689941,
"learning_rate": 3.383202029836145e-05,
"loss": 0.9598,
"sparse_loss": 0.9598,
"step": 6040
},
{
"epoch": 0.9579513120455264,
"grad_norm": 10.119805335998535,
"learning_rate": 3.369522283213179e-05,
"loss": 0.9393,
"sparse_loss": 0.9393,
"step": 6060
},
{
"epoch": 0.9611128675308251,
"grad_norm": 3.4797518253326416,
"learning_rate": 3.3558128541806586e-05,
"loss": 0.9382,
"sparse_loss": 0.9382,
"step": 6080
},
{
"epoch": 0.964274423016124,
"grad_norm": 3.270132064819336,
"learning_rate": 3.3420742107299117e-05,
"loss": 0.9342,
"sparse_loss": 0.9342,
"step": 6100
},
{
"epoch": 0.9674359785014227,
"grad_norm": 4.080615520477295,
"learning_rate": 3.328306821849542e-05,
"loss": 0.9381,
"sparse_loss": 0.9381,
"step": 6120
},
{
"epoch": 0.9705975339867214,
"grad_norm": 2.0365800857543945,
"learning_rate": 3.314511157509422e-05,
"loss": 0.826,
"sparse_loss": 0.826,
"step": 6140
},
{
"epoch": 0.9737590894720203,
"grad_norm": 5.458964824676514,
"learning_rate": 3.300687688644644e-05,
"loss": 0.9035,
"sparse_loss": 0.9035,
"step": 6160
},
{
"epoch": 0.976920644957319,
"grad_norm": 7.074070453643799,
"learning_rate": 3.286836887139454e-05,
"loss": 0.9065,
"sparse_loss": 0.9065,
"step": 6180
},
{
"epoch": 0.9800822004426177,
"grad_norm": 2.327319622039795,
"learning_rate": 3.272959225811132e-05,
"loss": 0.813,
"sparse_loss": 0.813,
"step": 6200
},
{
"epoch": 0.9832437559279166,
"grad_norm": 2.7987070083618164,
"learning_rate": 3.259055178393859e-05,
"loss": 0.8557,
"sparse_loss": 0.8557,
"step": 6220
},
{
"epoch": 0.9864053114132153,
"grad_norm": 3.6612040996551514,
"learning_rate": 3.2451252195225476e-05,
"loss": 0.8239,
"sparse_loss": 0.8239,
"step": 6240
},
{
"epoch": 0.989566866898514,
"grad_norm": 2.771278142929077,
"learning_rate": 3.231169824716628e-05,
"loss": 0.9103,
"sparse_loss": 0.9103,
"step": 6260
},
{
"epoch": 0.9927284223838129,
"grad_norm": 3.7233340740203857,
"learning_rate": 3.2171894703638306e-05,
"loss": 0.7789,
"sparse_loss": 0.7789,
"step": 6280
},
{
"epoch": 0.9958899778691116,
"grad_norm": 2.886253833770752,
"learning_rate": 3.2031846337039105e-05,
"loss": 1.0294,
"sparse_loss": 1.0294,
"step": 6300
},
{
"epoch": 0.9990515333544103,
"grad_norm": 3.4484128952026367,
"learning_rate": 3.189155792812366e-05,
"loss": 0.9973,
"sparse_loss": 0.9973,
"step": 6320
},
{
"epoch": 1.0022130888397092,
"grad_norm": 3.3322901725769043,
"learning_rate": 3.175103426584113e-05,
"loss": 0.851,
"sparse_loss": 0.851,
"step": 6340
},
{
"epoch": 1.005374644325008,
"grad_norm": 3.2941579818725586,
"learning_rate": 3.161028014717138e-05,
"loss": 0.8414,
"sparse_loss": 0.8414,
"step": 6360
},
{
"epoch": 1.0085361998103066,
"grad_norm": 2.743898868560791,
"learning_rate": 3.146930037696127e-05,
"loss": 0.8934,
"sparse_loss": 0.8934,
"step": 6380
},
{
"epoch": 1.0116977552956055,
"grad_norm": 5.593410015106201,
"learning_rate": 3.1328099767760584e-05,
"loss": 0.9014,
"sparse_loss": 0.9014,
"step": 6400
},
{
"epoch": 1.0148593107809043,
"grad_norm": 5.132786750793457,
"learning_rate": 3.118668313965775e-05,
"loss": 0.8435,
"sparse_loss": 0.8435,
"step": 6420
},
{
"epoch": 1.018020866266203,
"grad_norm": 23.462923049926758,
"learning_rate": 3.1045055320115356e-05,
"loss": 0.8969,
"sparse_loss": 0.8969,
"step": 6440
},
{
"epoch": 1.0211824217515018,
"grad_norm": 4.707699775695801,
"learning_rate": 3.090322114380528e-05,
"loss": 0.8335,
"sparse_loss": 0.8335,
"step": 6460
},
{
"epoch": 1.0243439772368006,
"grad_norm": 3.600597620010376,
"learning_rate": 3.076118545244371e-05,
"loss": 0.7767,
"sparse_loss": 0.7767,
"step": 6480
},
{
"epoch": 1.0275055327220992,
"grad_norm": 2.8197829723358154,
"learning_rate": 3.0618953094625856e-05,
"loss": 0.7853,
"sparse_loss": 0.7853,
"step": 6500
},
{
"epoch": 1.030667088207398,
"grad_norm": 3.2937378883361816,
"learning_rate": 3.0476528925660382e-05,
"loss": 0.8076,
"sparse_loss": 0.8076,
"step": 6520
},
{
"epoch": 1.033828643692697,
"grad_norm": 9.177017211914062,
"learning_rate": 3.033391780740374e-05,
"loss": 0.7984,
"sparse_loss": 0.7984,
"step": 6540
},
{
"epoch": 1.0369901991779955,
"grad_norm": 3.7408878803253174,
"learning_rate": 3.019112460809415e-05,
"loss": 0.8806,
"sparse_loss": 0.8806,
"step": 6560
},
{
"epoch": 1.0401517546632943,
"grad_norm": 3.173116445541382,
"learning_rate": 3.0048154202185452e-05,
"loss": 0.7925,
"sparse_loss": 0.7925,
"step": 6580
},
{
"epoch": 1.0433133101485932,
"grad_norm": 3.0256857872009277,
"learning_rate": 2.9905011470180683e-05,
"loss": 0.7768,
"sparse_loss": 0.7768,
"step": 6600
},
{
"epoch": 1.0464748656338918,
"grad_norm": 3.942674398422241,
"learning_rate": 2.9761701298465465e-05,
"loss": 0.7864,
"sparse_loss": 0.7864,
"step": 6620
},
{
"epoch": 1.0496364211191906,
"grad_norm": 2.6104464530944824,
"learning_rate": 2.9618228579141244e-05,
"loss": 0.8511,
"sparse_loss": 0.8511,
"step": 6640
},
{
"epoch": 1.0527979766044895,
"grad_norm": 4.09505033493042,
"learning_rate": 2.9474598209858262e-05,
"loss": 0.7585,
"sparse_loss": 0.7585,
"step": 6660
},
{
"epoch": 1.055959532089788,
"grad_norm": 3.442453145980835,
"learning_rate": 2.9330815093648344e-05,
"loss": 0.8245,
"sparse_loss": 0.8245,
"step": 6680
},
{
"epoch": 1.059121087575087,
"grad_norm": 4.608921527862549,
"learning_rate": 2.9186884138757596e-05,
"loss": 0.7885,
"sparse_loss": 0.7885,
"step": 6700
},
{
"epoch": 1.0622826430603858,
"grad_norm": 3.9954628944396973,
"learning_rate": 2.9042810258478785e-05,
"loss": 0.7727,
"sparse_loss": 0.7727,
"step": 6720
},
{
"epoch": 1.0654441985456844,
"grad_norm": 2.6084184646606445,
"learning_rate": 2.8898598370983642e-05,
"loss": 0.8582,
"sparse_loss": 0.8582,
"step": 6740
},
{
"epoch": 1.0686057540309832,
"grad_norm": 17.903118133544922,
"learning_rate": 2.8754253399154995e-05,
"loss": 0.7957,
"sparse_loss": 0.7957,
"step": 6760
},
{
"epoch": 1.071767309516282,
"grad_norm": 5.914791107177734,
"learning_rate": 2.8609780270418684e-05,
"loss": 0.7795,
"sparse_loss": 0.7795,
"step": 6780
},
{
"epoch": 1.0749288650015807,
"grad_norm": 3.333829641342163,
"learning_rate": 2.846518391657538e-05,
"loss": 0.8225,
"sparse_loss": 0.8225,
"step": 6800
},
{
"epoch": 1.0780904204868795,
"grad_norm": 3.164673089981079,
"learning_rate": 2.832046927363221e-05,
"loss": 0.799,
"sparse_loss": 0.799,
"step": 6820
},
{
"epoch": 1.0812519759721784,
"grad_norm": 12.461358070373535,
"learning_rate": 2.8175641281634285e-05,
"loss": 0.8586,
"sparse_loss": 0.8586,
"step": 6840
},
{
"epoch": 1.084413531457477,
"grad_norm": 11.53919792175293,
"learning_rate": 2.8030704884496056e-05,
"loss": 0.7813,
"sparse_loss": 0.7813,
"step": 6860
},
{
"epoch": 1.0875750869427758,
"grad_norm": 3.0285277366638184,
"learning_rate": 2.7885665029832515e-05,
"loss": 0.689,
"sparse_loss": 0.689,
"step": 6880
},
{
"epoch": 1.0907366424280747,
"grad_norm": 3.562075614929199,
"learning_rate": 2.7740526668790355e-05,
"loss": 0.8058,
"sparse_loss": 0.8058,
"step": 6900
},
{
"epoch": 1.0938981979133733,
"grad_norm": 3.2898313999176025,
"learning_rate": 2.7595294755878914e-05,
"loss": 0.7827,
"sparse_loss": 0.7827,
"step": 6920
},
{
"epoch": 1.0970597533986721,
"grad_norm": 3.8332738876342773,
"learning_rate": 2.744997424880107e-05,
"loss": 0.7735,
"sparse_loss": 0.7735,
"step": 6940
},
{
"epoch": 1.100221308883971,
"grad_norm": 3.5667922496795654,
"learning_rate": 2.7304570108283978e-05,
"loss": 0.7801,
"sparse_loss": 0.7801,
"step": 6960
},
{
"epoch": 1.1033828643692696,
"grad_norm": 6.848054885864258,
"learning_rate": 2.715908729790974e-05,
"loss": 0.7815,
"sparse_loss": 0.7815,
"step": 6980
},
{
"epoch": 1.1065444198545684,
"grad_norm": 22.703956604003906,
"learning_rate": 2.701353078394599e-05,
"loss": 0.9333,
"sparse_loss": 0.9333,
"step": 7000
},
{
"epoch": 1.1097059753398673,
"grad_norm": 2.80373215675354,
"learning_rate": 2.686790553517632e-05,
"loss": 0.7076,
"sparse_loss": 0.7076,
"step": 7020
},
{
"epoch": 1.112867530825166,
"grad_norm": 3.815857172012329,
"learning_rate": 2.6722216522730693e-05,
"loss": 0.785,
"sparse_loss": 0.785,
"step": 7040
},
{
"epoch": 1.1160290863104647,
"grad_norm": 3.837503433227539,
"learning_rate": 2.657646871991575e-05,
"loss": 0.8114,
"sparse_loss": 0.8114,
"step": 7060
},
{
"epoch": 1.1191906417957636,
"grad_norm": 3.5608763694763184,
"learning_rate": 2.6430667102044994e-05,
"loss": 0.8866,
"sparse_loss": 0.8866,
"step": 7080
},
{
"epoch": 1.1223521972810624,
"grad_norm": 13.049933433532715,
"learning_rate": 2.628481664626901e-05,
"loss": 0.7034,
"sparse_loss": 0.7034,
"step": 7100
},
{
"epoch": 1.125513752766361,
"grad_norm": 3.2948811054229736,
"learning_rate": 2.6138922331405545e-05,
"loss": 0.7277,
"sparse_loss": 0.7277,
"step": 7120
},
{
"epoch": 1.1286753082516598,
"grad_norm": 3.614666223526001,
"learning_rate": 2.5992989137769512e-05,
"loss": 0.7875,
"sparse_loss": 0.7875,
"step": 7140
},
{
"epoch": 1.1318368637369587,
"grad_norm": 9.70922565460205,
"learning_rate": 2.5847022047003016e-05,
"loss": 0.8039,
"sparse_loss": 0.8039,
"step": 7160
},
{
"epoch": 1.1349984192222573,
"grad_norm": 4.468625068664551,
"learning_rate": 2.5701026041905306e-05,
"loss": 0.6848,
"sparse_loss": 0.6848,
"step": 7180
},
{
"epoch": 1.1381599747075561,
"grad_norm": 2.4922146797180176,
"learning_rate": 2.555500610626264e-05,
"loss": 0.853,
"sparse_loss": 0.853,
"step": 7200
},
{
"epoch": 1.141321530192855,
"grad_norm": 4.103279113769531,
"learning_rate": 2.5408967224678203e-05,
"loss": 0.7167,
"sparse_loss": 0.7167,
"step": 7220
},
{
"epoch": 1.1444830856781536,
"grad_norm": 14.72502326965332,
"learning_rate": 2.5262914382401908e-05,
"loss": 0.8225,
"sparse_loss": 0.8225,
"step": 7240
},
{
"epoch": 1.1476446411634524,
"grad_norm": 3.0032124519348145,
"learning_rate": 2.5116852565160253e-05,
"loss": 0.7334,
"sparse_loss": 0.7334,
"step": 7260
},
{
"epoch": 1.1508061966487513,
"grad_norm": 2.6268882751464844,
"learning_rate": 2.4970786758986098e-05,
"loss": 0.771,
"sparse_loss": 0.771,
"step": 7280
},
{
"epoch": 1.15396775213405,
"grad_norm": 3.5500500202178955,
"learning_rate": 2.482472195004847e-05,
"loss": 0.7327,
"sparse_loss": 0.7327,
"step": 7300
},
{
"epoch": 1.1571293076193487,
"grad_norm": 3.703057289123535,
"learning_rate": 2.4678663124482358e-05,
"loss": 0.7744,
"sparse_loss": 0.7744,
"step": 7320
},
{
"epoch": 1.1602908631046476,
"grad_norm": 5.458900451660156,
"learning_rate": 2.4532615268218503e-05,
"loss": 0.7852,
"sparse_loss": 0.7852,
"step": 7340
},
{
"epoch": 1.1634524185899462,
"grad_norm": 3.3717634677886963,
"learning_rate": 2.438658336681319e-05,
"loss": 0.8021,
"sparse_loss": 0.8021,
"step": 7360
},
{
"epoch": 1.166613974075245,
"grad_norm": 6.050585746765137,
"learning_rate": 2.4240572405278065e-05,
"loss": 0.7909,
"sparse_loss": 0.7909,
"step": 7380
},
{
"epoch": 1.1697755295605439,
"grad_norm": 5.964054584503174,
"learning_rate": 2.4094587367909942e-05,
"loss": 0.884,
"sparse_loss": 0.884,
"step": 7400
},
{
"epoch": 1.1729370850458425,
"grad_norm": 4.546367645263672,
"learning_rate": 2.394863323812072e-05,
"loss": 0.7565,
"sparse_loss": 0.7565,
"step": 7420
},
{
"epoch": 1.1760986405311413,
"grad_norm": 1.9227008819580078,
"learning_rate": 2.3802714998267177e-05,
"loss": 0.7934,
"sparse_loss": 0.7934,
"step": 7440
},
{
"epoch": 1.1792601960164402,
"grad_norm": 4.1103010177612305,
"learning_rate": 2.365683762948094e-05,
"loss": 0.753,
"sparse_loss": 0.753,
"step": 7460
},
{
"epoch": 1.1824217515017388,
"grad_norm": 4.982396602630615,
"learning_rate": 2.3511006111498486e-05,
"loss": 0.7338,
"sparse_loss": 0.7338,
"step": 7480
},
{
"epoch": 1.1855833069870376,
"grad_norm": 4.692390441894531,
"learning_rate": 2.3365225422491045e-05,
"loss": 0.8314,
"sparse_loss": 0.8314,
"step": 7500
},
{
"epoch": 1.1887448624723365,
"grad_norm": 4.539749622344971,
"learning_rate": 2.3219500538894796e-05,
"loss": 0.766,
"sparse_loss": 0.766,
"step": 7520
},
{
"epoch": 1.191906417957635,
"grad_norm": 2.8554506301879883,
"learning_rate": 2.307383643524085e-05,
"loss": 0.8289,
"sparse_loss": 0.8289,
"step": 7540
},
{
"epoch": 1.195067973442934,
"grad_norm": 2.8660402297973633,
"learning_rate": 2.292823808398554e-05,
"loss": 0.7801,
"sparse_loss": 0.7801,
"step": 7560
},
{
"epoch": 1.1982295289282328,
"grad_norm": 2.8601577281951904,
"learning_rate": 2.2782710455340666e-05,
"loss": 0.8038,
"sparse_loss": 0.8038,
"step": 7580
},
{
"epoch": 1.2013910844135314,
"grad_norm": 3.46244478225708,
"learning_rate": 2.2637258517103754e-05,
"loss": 0.7507,
"sparse_loss": 0.7507,
"step": 7600
},
{
"epoch": 1.2045526398988302,
"grad_norm": 5.021501541137695,
"learning_rate": 2.249188723448859e-05,
"loss": 0.8116,
"sparse_loss": 0.8116,
"step": 7620
},
{
"epoch": 1.207714195384129,
"grad_norm": 2.997774839401245,
"learning_rate": 2.2346601569955622e-05,
"loss": 0.7993,
"sparse_loss": 0.7993,
"step": 7640
},
{
"epoch": 1.2108757508694277,
"grad_norm": 2.6975152492523193,
"learning_rate": 2.2201406483042592e-05,
"loss": 0.7502,
"sparse_loss": 0.7502,
"step": 7660
},
{
"epoch": 1.2140373063547265,
"grad_norm": 2.3823089599609375,
"learning_rate": 2.205630693019529e-05,
"loss": 0.7218,
"sparse_loss": 0.7218,
"step": 7680
},
{
"epoch": 1.2171988618400253,
"grad_norm": 4.206768989562988,
"learning_rate": 2.1911307864598253e-05,
"loss": 0.7008,
"sparse_loss": 0.7008,
"step": 7700
},
{
"epoch": 1.220360417325324,
"grad_norm": 3.4627437591552734,
"learning_rate": 2.1766414236005795e-05,
"loss": 0.7893,
"sparse_loss": 0.7893,
"step": 7720
},
{
"epoch": 1.2235219728106228,
"grad_norm": 2.8255398273468018,
"learning_rate": 2.162163099057295e-05,
"loss": 0.7608,
"sparse_loss": 0.7608,
"step": 7740
},
{
"epoch": 1.2266835282959216,
"grad_norm": 4.011843204498291,
"learning_rate": 2.1476963070686658e-05,
"loss": 0.807,
"sparse_loss": 0.807,
"step": 7760
},
{
"epoch": 1.2298450837812203,
"grad_norm": 1.6373748779296875,
"learning_rate": 2.1332415414797083e-05,
"loss": 0.7559,
"sparse_loss": 0.7559,
"step": 7780
},
{
"epoch": 1.233006639266519,
"grad_norm": 5.056970119476318,
"learning_rate": 2.1187992957248975e-05,
"loss": 0.7906,
"sparse_loss": 0.7906,
"step": 7800
},
{
"epoch": 1.236168194751818,
"grad_norm": 2.484866142272949,
"learning_rate": 2.1043700628113274e-05,
"loss": 0.7988,
"sparse_loss": 0.7988,
"step": 7820
},
{
"epoch": 1.2393297502371166,
"grad_norm": 3.7188453674316406,
"learning_rate": 2.0899543353018792e-05,
"loss": 0.7656,
"sparse_loss": 0.7656,
"step": 7840
},
{
"epoch": 1.2424913057224154,
"grad_norm": 6.211063861846924,
"learning_rate": 2.0755526052984048e-05,
"loss": 0.715,
"sparse_loss": 0.715,
"step": 7860
},
{
"epoch": 1.2456528612077142,
"grad_norm": 7.281302452087402,
"learning_rate": 2.0611653644249363e-05,
"loss": 0.7624,
"sparse_loss": 0.7624,
"step": 7880
},
{
"epoch": 1.2488144166930129,
"grad_norm": 8.34732723236084,
"learning_rate": 2.0467931038108933e-05,
"loss": 0.6751,
"sparse_loss": 0.6751,
"step": 7900
},
{
"epoch": 1.2519759721783117,
"grad_norm": 3.84628963470459,
"learning_rate": 2.032436314074326e-05,
"loss": 0.8031,
"sparse_loss": 0.8031,
"step": 7920
},
{
"epoch": 1.2551375276636105,
"grad_norm": 4.450046062469482,
"learning_rate": 2.01809548530516e-05,
"loss": 0.8891,
"sparse_loss": 0.8891,
"step": 7940
},
{
"epoch": 1.2582990831489091,
"grad_norm": 4.694953441619873,
"learning_rate": 2.003771107048474e-05,
"loss": 0.7813,
"sparse_loss": 0.7813,
"step": 7960
},
{
"epoch": 1.261460638634208,
"grad_norm": 3.50838303565979,
"learning_rate": 1.9894636682877812e-05,
"loss": 0.6967,
"sparse_loss": 0.6967,
"step": 7980
},
{
"epoch": 1.2646221941195068,
"grad_norm": 6.803529262542725,
"learning_rate": 1.9751736574283416e-05,
"loss": 0.7321,
"sparse_loss": 0.7321,
"step": 8000
},
{
"epoch": 1.2677837496048054,
"grad_norm": 3.093456268310547,
"learning_rate": 1.96090156228049e-05,
"loss": 0.777,
"sparse_loss": 0.777,
"step": 8020
},
{
"epoch": 1.2709453050901043,
"grad_norm": 4.572612762451172,
"learning_rate": 1.9466478700429793e-05,
"loss": 0.7862,
"sparse_loss": 0.7862,
"step": 8040
},
{
"epoch": 1.2741068605754031,
"grad_norm": 3.8324055671691895,
"learning_rate": 1.932413067286355e-05,
"loss": 0.7502,
"sparse_loss": 0.7502,
"step": 8060
},
{
"epoch": 1.2772684160607017,
"grad_norm": 4.8424763679504395,
"learning_rate": 1.9181976399363415e-05,
"loss": 0.7847,
"sparse_loss": 0.7847,
"step": 8080
},
{
"epoch": 1.2804299715460006,
"grad_norm": 3.3925790786743164,
"learning_rate": 1.904002073257254e-05,
"loss": 0.6804,
"sparse_loss": 0.6804,
"step": 8100
},
{
"epoch": 1.2835915270312994,
"grad_norm": 3.5479822158813477,
"learning_rate": 1.8898268518354383e-05,
"loss": 0.7036,
"sparse_loss": 0.7036,
"step": 8120
},
{
"epoch": 1.286753082516598,
"grad_norm": 4.0524396896362305,
"learning_rate": 1.8756724595627207e-05,
"loss": 0.7484,
"sparse_loss": 0.7484,
"step": 8140
},
{
"epoch": 1.2899146380018969,
"grad_norm": 3.455230474472046,
"learning_rate": 1.861539379619899e-05,
"loss": 0.8262,
"sparse_loss": 0.8262,
"step": 8160
},
{
"epoch": 1.2930761934871957,
"grad_norm": 3.041726589202881,
"learning_rate": 1.84742809446024e-05,
"loss": 0.732,
"sparse_loss": 0.732,
"step": 8180
},
{
"epoch": 1.2962377489724943,
"grad_norm": 4.920287132263184,
"learning_rate": 1.8333390857930144e-05,
"loss": 0.6776,
"sparse_loss": 0.6776,
"step": 8200
},
{
"epoch": 1.2993993044577932,
"grad_norm": 3.5329527854919434,
"learning_rate": 1.8192728345670547e-05,
"loss": 0.7193,
"sparse_loss": 0.7193,
"step": 8220
},
{
"epoch": 1.302560859943092,
"grad_norm": 2.954718589782715,
"learning_rate": 1.8052298209543315e-05,
"loss": 0.7685,
"sparse_loss": 0.7685,
"step": 8240
},
{
"epoch": 1.3057224154283908,
"grad_norm": 3.8826847076416016,
"learning_rate": 1.7912105243335687e-05,
"loss": 0.8173,
"sparse_loss": 0.8173,
"step": 8260
},
{
"epoch": 1.3088839709136895,
"grad_norm": 3.8182241916656494,
"learning_rate": 1.7772154232738745e-05,
"loss": 0.7295,
"sparse_loss": 0.7295,
"step": 8280
},
{
"epoch": 1.3120455263989883,
"grad_norm": 8.97372817993164,
"learning_rate": 1.763244995518406e-05,
"loss": 0.7348,
"sparse_loss": 0.7348,
"step": 8300
},
{
"epoch": 1.3152070818842871,
"grad_norm": 4.511361122131348,
"learning_rate": 1.749299717968063e-05,
"loss": 0.7708,
"sparse_loss": 0.7708,
"step": 8320
},
{
"epoch": 1.3183686373695858,
"grad_norm": 6.190827369689941,
"learning_rate": 1.7353800666652046e-05,
"loss": 0.7471,
"sparse_loss": 0.7471,
"step": 8340
},
{
"epoch": 1.3215301928548846,
"grad_norm": 5.850632667541504,
"learning_rate": 1.721486516777402e-05,
"loss": 0.6956,
"sparse_loss": 0.6956,
"step": 8360
},
{
"epoch": 1.3246917483401834,
"grad_norm": 3.439373016357422,
"learning_rate": 1.707619542581215e-05,
"loss": 0.7216,
"sparse_loss": 0.7216,
"step": 8380
},
{
"epoch": 1.327853303825482,
"grad_norm": 3.1243553161621094,
"learning_rate": 1.6937796174460044e-05,
"loss": 0.7003,
"sparse_loss": 0.7003,
"step": 8400
},
{
"epoch": 1.331014859310781,
"grad_norm": 2.189188003540039,
"learning_rate": 1.6799672138177726e-05,
"loss": 0.7786,
"sparse_loss": 0.7786,
"step": 8420
},
{
"epoch": 1.3341764147960797,
"grad_norm": 4.986023902893066,
"learning_rate": 1.6661828032030334e-05,
"loss": 0.7062,
"sparse_loss": 0.7062,
"step": 8440
},
{
"epoch": 1.3373379702813786,
"grad_norm": 4.733263969421387,
"learning_rate": 1.652426856152721e-05,
"loss": 0.728,
"sparse_loss": 0.728,
"step": 8460
},
{
"epoch": 1.3404995257666772,
"grad_norm": 3.997431516647339,
"learning_rate": 1.638699842246121e-05,
"loss": 0.7494,
"sparse_loss": 0.7494,
"step": 8480
},
{
"epoch": 1.343661081251976,
"grad_norm": 3.0536346435546875,
"learning_rate": 1.6250022300748486e-05,
"loss": 0.6844,
"sparse_loss": 0.6844,
"step": 8500
},
{
"epoch": 1.3468226367372749,
"grad_norm": 5.487432956695557,
"learning_rate": 1.611334487226842e-05,
"loss": 0.6523,
"sparse_loss": 0.6523,
"step": 8520
},
{
"epoch": 1.3499841922225735,
"grad_norm": 5.399076461791992,
"learning_rate": 1.5976970802704106e-05,
"loss": 0.7584,
"sparse_loss": 0.7584,
"step": 8540
},
{
"epoch": 1.3531457477078723,
"grad_norm": 3.9257521629333496,
"learning_rate": 1.584090474738305e-05,
"loss": 0.9882,
"sparse_loss": 0.9882,
"step": 8560
},
{
"epoch": 1.3563073031931712,
"grad_norm": 3.830868721008301,
"learning_rate": 1.5705151351118192e-05,
"loss": 0.9246,
"sparse_loss": 0.9246,
"step": 8580
},
{
"epoch": 1.3594688586784698,
"grad_norm": 3.6509013175964355,
"learning_rate": 1.5569715248049457e-05,
"loss": 0.9254,
"sparse_loss": 0.9254,
"step": 8600
},
{
"epoch": 1.3626304141637686,
"grad_norm": 4.829010009765625,
"learning_rate": 1.5434601061485477e-05,
"loss": 0.7008,
"sparse_loss": 0.7008,
"step": 8620
},
{
"epoch": 1.3657919696490675,
"grad_norm": 4.709140300750732,
"learning_rate": 1.5299813403745777e-05,
"loss": 0.7746,
"sparse_loss": 0.7746,
"step": 8640
},
{
"epoch": 1.368953525134366,
"grad_norm": 3.270906448364258,
"learning_rate": 1.5165356876003395e-05,
"loss": 0.7616,
"sparse_loss": 0.7616,
"step": 8660
},
{
"epoch": 1.372115080619665,
"grad_norm": 3.6882264614105225,
"learning_rate": 1.5031236068127701e-05,
"loss": 0.736,
"sparse_loss": 0.736,
"step": 8680
},
{
"epoch": 1.3752766361049638,
"grad_norm": 3.479408025741577,
"learning_rate": 1.4897455558527845e-05,
"loss": 0.7595,
"sparse_loss": 0.7595,
"step": 8700
},
{
"epoch": 1.3784381915902624,
"grad_norm": 3.896000623703003,
"learning_rate": 1.4764019913996355e-05,
"loss": 0.7135,
"sparse_loss": 0.7135,
"step": 8720
},
{
"epoch": 1.3815997470755612,
"grad_norm": 2.6327970027923584,
"learning_rate": 1.463093368955328e-05,
"loss": 0.8016,
"sparse_loss": 0.8016,
"step": 8740
},
{
"epoch": 1.38476130256086,
"grad_norm": 3.2666549682617188,
"learning_rate": 1.4498201428290759e-05,
"loss": 0.7027,
"sparse_loss": 0.7027,
"step": 8760
},
{
"epoch": 1.3879228580461587,
"grad_norm": 3.704594612121582,
"learning_rate": 1.4365827661217815e-05,
"loss": 0.7176,
"sparse_loss": 0.7176,
"step": 8780
},
{
"epoch": 1.3910844135314575,
"grad_norm": 4.167449474334717,
"learning_rate": 1.4233816907105808e-05,
"loss": 0.7141,
"sparse_loss": 0.7141,
"step": 8800
},
{
"epoch": 1.3942459690167563,
"grad_norm": 3.4307448863983154,
"learning_rate": 1.4102173672334087e-05,
"loss": 0.6909,
"sparse_loss": 0.6909,
"step": 8820
},
{
"epoch": 1.397407524502055,
"grad_norm": 6.434845447540283,
"learning_rate": 1.3970902450736207e-05,
"loss": 0.8462,
"sparse_loss": 0.8462,
"step": 8840
},
{
"epoch": 1.4005690799873538,
"grad_norm": 3.118427038192749,
"learning_rate": 1.3840007723446497e-05,
"loss": 1.653,
"sparse_loss": 1.653,
"step": 8860
},
{
"epoch": 1.4037306354726526,
"grad_norm": 3.181898355484009,
"learning_rate": 1.3709493958747114e-05,
"loss": 1.1503,
"sparse_loss": 1.1503,
"step": 8880
},
{
"epoch": 1.4068921909579513,
"grad_norm": 2.60162091255188,
"learning_rate": 1.3579365611915517e-05,
"loss": 0.7187,
"sparse_loss": 0.7187,
"step": 8900
},
{
"epoch": 1.41005374644325,
"grad_norm": 3.9322071075439453,
"learning_rate": 1.3449627125072348e-05,
"loss": 0.7415,
"sparse_loss": 0.7415,
"step": 8920
},
{
"epoch": 1.413215301928549,
"grad_norm": 12.794656753540039,
"learning_rate": 1.3320282927029806e-05,
"loss": 0.9116,
"sparse_loss": 0.9116,
"step": 8940
},
{
"epoch": 1.4163768574138476,
"grad_norm": 2.7025208473205566,
"learning_rate": 1.3191337433140477e-05,
"loss": 0.8108,
"sparse_loss": 0.8108,
"step": 8960
},
{
"epoch": 1.4195384128991464,
"grad_norm": 7.067229270935059,
"learning_rate": 1.3062795045146586e-05,
"loss": 0.7282,
"sparse_loss": 0.7282,
"step": 8980
},
{
"epoch": 1.4226999683844452,
"grad_norm": 17.709030151367188,
"learning_rate": 1.2934660151029787e-05,
"loss": 0.7099,
"sparse_loss": 0.7099,
"step": 9000
},
{
"epoch": 1.4258615238697439,
"grad_norm": 2.6624441146850586,
"learning_rate": 1.280693712486129e-05,
"loss": 0.709,
"sparse_loss": 0.709,
"step": 9020
},
{
"epoch": 1.4290230793550427,
"grad_norm": 4.256680011749268,
"learning_rate": 1.2679630326652637e-05,
"loss": 0.7621,
"sparse_loss": 0.7621,
"step": 9040
},
{
"epoch": 1.4321846348403415,
"grad_norm": 3.280846357345581,
"learning_rate": 1.2552744102206795e-05,
"loss": 0.7032,
"sparse_loss": 0.7032,
"step": 9060
},
{
"epoch": 1.4353461903256401,
"grad_norm": 4.684087753295898,
"learning_rate": 1.2426282782969817e-05,
"loss": 0.714,
"sparse_loss": 0.714,
"step": 9080
},
{
"epoch": 1.438507745810939,
"grad_norm": 2.5815060138702393,
"learning_rate": 1.2300250685883045e-05,
"loss": 0.6459,
"sparse_loss": 0.6459,
"step": 9100
},
{
"epoch": 1.4416693012962378,
"grad_norm": 90.20246887207031,
"learning_rate": 1.2174652113235651e-05,
"loss": 0.7376,
"sparse_loss": 0.7376,
"step": 9120
},
{
"epoch": 1.4448308567815364,
"grad_norm": 4.301577568054199,
"learning_rate": 1.2049491352517866e-05,
"loss": 0.7237,
"sparse_loss": 0.7237,
"step": 9140
},
{
"epoch": 1.4479924122668353,
"grad_norm": 3.5579445362091064,
"learning_rate": 1.1924772676274546e-05,
"loss": 0.7621,
"sparse_loss": 0.7621,
"step": 9160
},
{
"epoch": 1.4511539677521341,
"grad_norm": 2.510053873062134,
"learning_rate": 1.1800500341959317e-05,
"loss": 0.7033,
"sparse_loss": 0.7033,
"step": 9180
},
{
"epoch": 1.4543155232374327,
"grad_norm": 2.4674551486968994,
"learning_rate": 1.1676678591789341e-05,
"loss": 0.7039,
"sparse_loss": 0.7039,
"step": 9200
},
{
"epoch": 1.4574770787227316,
"grad_norm": 2.625556468963623,
"learning_rate": 1.155331165260038e-05,
"loss": 0.7147,
"sparse_loss": 0.7147,
"step": 9220
},
{
"epoch": 1.4606386342080304,
"grad_norm": 3.8454997539520264,
"learning_rate": 1.1430403735702599e-05,
"loss": 0.7117,
"sparse_loss": 0.7117,
"step": 9240
},
{
"epoch": 1.463800189693329,
"grad_norm": 4.563284873962402,
"learning_rate": 1.1307959036736754e-05,
"loss": 0.6743,
"sparse_loss": 0.6743,
"step": 9260
},
{
"epoch": 1.4669617451786279,
"grad_norm": 9.20923137664795,
"learning_rate": 1.1185981735530945e-05,
"loss": 0.7482,
"sparse_loss": 0.7482,
"step": 9280
},
{
"epoch": 1.4701233006639267,
"grad_norm": 4.920022964477539,
"learning_rate": 1.1064475995958035e-05,
"loss": 0.6762,
"sparse_loss": 0.6762,
"step": 9300
},
{
"epoch": 1.4732848561492253,
"grad_norm": 5.852681636810303,
"learning_rate": 1.0943445965793391e-05,
"loss": 0.8167,
"sparse_loss": 0.8167,
"step": 9320
},
{
"epoch": 1.4764464116345242,
"grad_norm": 2.162567138671875,
"learning_rate": 1.0822895776573386e-05,
"loss": 0.7235,
"sparse_loss": 0.7235,
"step": 9340
},
{
"epoch": 1.479607967119823,
"grad_norm": 7.650509357452393,
"learning_rate": 1.0702829543454295e-05,
"loss": 0.6727,
"sparse_loss": 0.6727,
"step": 9360
},
{
"epoch": 1.4827695226051216,
"grad_norm": 30.643972396850586,
"learning_rate": 1.0583251365071856e-05,
"loss": 0.7458,
"sparse_loss": 0.7458,
"step": 9380
},
{
"epoch": 1.4859310780904205,
"grad_norm": 2.6235289573669434,
"learning_rate": 1.0464165323401348e-05,
"loss": 0.6345,
"sparse_loss": 0.6345,
"step": 9400
},
{
"epoch": 1.4890926335757193,
"grad_norm": 4.027590751647949,
"learning_rate": 1.0345575483618236e-05,
"loss": 0.7367,
"sparse_loss": 0.7367,
"step": 9420
},
{
"epoch": 1.492254189061018,
"grad_norm": 5.625932216644287,
"learning_rate": 1.022748589395944e-05,
"loss": 0.7412,
"sparse_loss": 0.7412,
"step": 9440
},
{
"epoch": 1.4954157445463168,
"grad_norm": 3.3360841274261475,
"learning_rate": 1.0109900585585089e-05,
"loss": 0.6915,
"sparse_loss": 0.6915,
"step": 9460
},
{
"epoch": 1.4985773000316156,
"grad_norm": 3.0357189178466797,
"learning_rate": 9.992823572440936e-06,
"loss": 0.707,
"sparse_loss": 0.707,
"step": 9480
},
{
"epoch": 1.5017388555169142,
"grad_norm": 2.9004428386688232,
"learning_rate": 9.876258851121342e-06,
"loss": 0.6771,
"sparse_loss": 0.6771,
"step": 9500
},
{
"epoch": 1.504900411002213,
"grad_norm": 3.5283262729644775,
"learning_rate": 9.760210400732837e-06,
"loss": 0.75,
"sparse_loss": 0.75,
"step": 9520
},
{
"epoch": 1.508061966487512,
"grad_norm": 4.224815845489502,
"learning_rate": 9.644682182758306e-06,
"loss": 0.7227,
"sparse_loss": 0.7227,
"step": 9540
},
{
"epoch": 1.5112235219728105,
"grad_norm": 2.3723509311676025,
"learning_rate": 9.529678140921721e-06,
"loss": 0.6999,
"sparse_loss": 0.6999,
"step": 9560
},
{
"epoch": 1.5143850774581094,
"grad_norm": 4.764114856719971,
"learning_rate": 9.415202201053553e-06,
"loss": 0.7451,
"sparse_loss": 0.7451,
"step": 9580
},
{
"epoch": 1.5175466329434082,
"grad_norm": 2.386294364929199,
"learning_rate": 9.301258270956733e-06,
"loss": 0.7274,
"sparse_loss": 0.7274,
"step": 9600
},
{
"epoch": 1.5207081884287068,
"grad_norm": 2.7732250690460205,
"learning_rate": 9.187850240273263e-06,
"loss": 0.6657,
"sparse_loss": 0.6657,
"step": 9620
},
{
"epoch": 1.5238697439140056,
"grad_norm": 4.688830852508545,
"learning_rate": 9.074981980351461e-06,
"loss": 0.7123,
"sparse_loss": 0.7123,
"step": 9640
},
{
"epoch": 1.5270312993993045,
"grad_norm": 2.6799607276916504,
"learning_rate": 8.962657344113756e-06,
"loss": 0.76,
"sparse_loss": 0.76,
"step": 9660
},
{
"epoch": 1.530192854884603,
"grad_norm": 3.9289462566375732,
"learning_rate": 8.850880165925198e-06,
"loss": 0.6947,
"sparse_loss": 0.6947,
"step": 9680
},
{
"epoch": 1.533354410369902,
"grad_norm": 4.236504554748535,
"learning_rate": 8.73965426146257e-06,
"loss": 0.6947,
"sparse_loss": 0.6947,
"step": 9700
},
{
"epoch": 1.5365159658552008,
"grad_norm": 2.219120740890503,
"learning_rate": 8.628983427584104e-06,
"loss": 0.6093,
"sparse_loss": 0.6093,
"step": 9720
},
{
"epoch": 1.5396775213404994,
"grad_norm": 2.3669049739837646,
"learning_rate": 8.518871442199916e-06,
"loss": 0.7508,
"sparse_loss": 0.7508,
"step": 9740
},
{
"epoch": 1.5428390768257982,
"grad_norm": 3.3307414054870605,
"learning_rate": 8.40932206414299e-06,
"loss": 0.6202,
"sparse_loss": 0.6202,
"step": 9760
},
{
"epoch": 1.546000632311097,
"grad_norm": 3.576195001602173,
"learning_rate": 8.300339033040908e-06,
"loss": 0.6884,
"sparse_loss": 0.6884,
"step": 9780
},
{
"epoch": 1.5491621877963957,
"grad_norm": 3.074108362197876,
"learning_rate": 8.191926069188155e-06,
"loss": 0.6555,
"sparse_loss": 0.6555,
"step": 9800
},
{
"epoch": 1.5523237432816948,
"grad_norm": 2.822364330291748,
"learning_rate": 8.084086873419144e-06,
"loss": 0.6959,
"sparse_loss": 0.6959,
"step": 9820
},
{
"epoch": 1.5554852987669934,
"grad_norm": 3.8235299587249756,
"learning_rate": 7.976825126981907e-06,
"loss": 0.7085,
"sparse_loss": 0.7085,
"step": 9840
},
{
"epoch": 1.558646854252292,
"grad_norm": 3.4154646396636963,
"learning_rate": 7.87014449141236e-06,
"loss": 0.7891,
"sparse_loss": 0.7891,
"step": 9860
},
{
"epoch": 1.561808409737591,
"grad_norm": 2.6401171684265137,
"learning_rate": 7.764048608409394e-06,
"loss": 0.7563,
"sparse_loss": 0.7563,
"step": 9880
},
{
"epoch": 1.5649699652228897,
"grad_norm": 14.986632347106934,
"learning_rate": 7.65854109971048e-06,
"loss": 0.655,
"sparse_loss": 0.655,
"step": 9900
},
{
"epoch": 1.5681315207081883,
"grad_norm": 2.6173582077026367,
"learning_rate": 7.553625566968092e-06,
"loss": 0.6957,
"sparse_loss": 0.6957,
"step": 9920
},
{
"epoch": 1.5712930761934873,
"grad_norm": 2.3389580249786377,
"learning_rate": 7.44930559162676e-06,
"loss": 0.7026,
"sparse_loss": 0.7026,
"step": 9940
},
{
"epoch": 1.574454631678786,
"grad_norm": 3.2184102535247803,
"learning_rate": 7.345584734800764e-06,
"loss": 0.7599,
"sparse_loss": 0.7599,
"step": 9960
},
{
"epoch": 1.5776161871640846,
"grad_norm": 2.2282955646514893,
"learning_rate": 7.242466537152639e-06,
"loss": 0.6167,
"sparse_loss": 0.6167,
"step": 9980
},
{
"epoch": 1.5807777426493836,
"grad_norm": 4.351361274719238,
"learning_rate": 7.139954518772227e-06,
"loss": 0.808,
"sparse_loss": 0.808,
"step": 10000
},
{
"epoch": 1.5839392981346823,
"grad_norm": 2.3593902587890625,
"learning_rate": 7.038052179056573e-06,
"loss": 0.6325,
"sparse_loss": 0.6325,
"step": 10020
},
{
"epoch": 1.5871008536199809,
"grad_norm": 2.93237566947937,
"learning_rate": 6.936762996590482e-06,
"loss": 0.7235,
"sparse_loss": 0.7235,
"step": 10040
},
{
"epoch": 1.59026240910528,
"grad_norm": 8.080977439880371,
"learning_rate": 6.8360904290276975e-06,
"loss": 0.7299,
"sparse_loss": 0.7299,
"step": 10060
},
{
"epoch": 1.5934239645905786,
"grad_norm": 2.9335365295410156,
"learning_rate": 6.736037912972967e-06,
"loss": 0.6473,
"sparse_loss": 0.6473,
"step": 10080
},
{
"epoch": 1.5965855200758772,
"grad_norm": 4.444180965423584,
"learning_rate": 6.6366088638646154e-06,
"loss": 0.5555,
"sparse_loss": 0.5555,
"step": 10100
},
{
"epoch": 1.5997470755611762,
"grad_norm": 2.5193140506744385,
"learning_rate": 6.537806675858066e-06,
"loss": 0.6496,
"sparse_loss": 0.6496,
"step": 10120
},
{
"epoch": 1.6029086310464749,
"grad_norm": 4.642623424530029,
"learning_rate": 6.439634721709905e-06,
"loss": 0.6155,
"sparse_loss": 0.6155,
"step": 10140
},
{
"epoch": 1.6060701865317735,
"grad_norm": 2.8118245601654053,
"learning_rate": 6.34209635266276e-06,
"loss": 0.7121,
"sparse_loss": 0.7121,
"step": 10160
},
{
"epoch": 1.6092317420170725,
"grad_norm": 3.5233209133148193,
"learning_rate": 6.245194898330933e-06,
"loss": 0.6628,
"sparse_loss": 0.6628,
"step": 10180
},
{
"epoch": 1.6123932975023711,
"grad_norm": 3.2448441982269287,
"learning_rate": 6.148933666586693e-06,
"loss": 0.7382,
"sparse_loss": 0.7382,
"step": 10200
},
{
"epoch": 1.61555485298767,
"grad_norm": 4.133220672607422,
"learning_rate": 6.0533159434473825e-06,
"loss": 0.6039,
"sparse_loss": 0.6039,
"step": 10220
},
{
"epoch": 1.6187164084729688,
"grad_norm": 3.2340219020843506,
"learning_rate": 5.958344992963247e-06,
"loss": 0.737,
"sparse_loss": 0.737,
"step": 10240
},
{
"epoch": 1.6218779639582674,
"grad_norm": 3.0795609951019287,
"learning_rate": 5.864024057105993e-06,
"loss": 0.6226,
"sparse_loss": 0.6226,
"step": 10260
},
{
"epoch": 1.6250395194435663,
"grad_norm": 2.284395217895508,
"learning_rate": 5.770356355658155e-06,
"loss": 0.7065,
"sparse_loss": 0.7065,
"step": 10280
},
{
"epoch": 1.6282010749288651,
"grad_norm": 2.981004238128662,
"learning_rate": 5.6773450861031365e-06,
"loss": 0.6393,
"sparse_loss": 0.6393,
"step": 10300
},
{
"epoch": 1.6313626304141637,
"grad_norm": 2.6331748962402344,
"learning_rate": 5.584993423516088e-06,
"loss": 0.6835,
"sparse_loss": 0.6835,
"step": 10320
},
{
"epoch": 1.6345241858994626,
"grad_norm": 3.171314001083374,
"learning_rate": 5.49330452045552e-06,
"loss": 0.7525,
"sparse_loss": 0.7525,
"step": 10340
},
{
"epoch": 1.6376857413847614,
"grad_norm": 3.8974101543426514,
"learning_rate": 5.402281506855672e-06,
"loss": 0.7167,
"sparse_loss": 0.7167,
"step": 10360
},
{
"epoch": 1.64084729687006,
"grad_norm": 3.284602403640747,
"learning_rate": 5.3119274899196965e-06,
"loss": 0.6216,
"sparse_loss": 0.6216,
"step": 10380
},
{
"epoch": 1.6440088523553589,
"grad_norm": 3.199734926223755,
"learning_rate": 5.222245554013552e-06,
"loss": 0.7085,
"sparse_loss": 0.7085,
"step": 10400
},
{
"epoch": 1.6471704078406577,
"grad_norm": 4.375881671905518,
"learning_rate": 5.133238760560735e-06,
"loss": 0.5973,
"sparse_loss": 0.5973,
"step": 10420
},
{
"epoch": 1.6503319633259563,
"grad_norm": 3.2240214347839355,
"learning_rate": 5.044910147937778e-06,
"loss": 0.5963,
"sparse_loss": 0.5963,
"step": 10440
},
{
"epoch": 1.6534935188112552,
"grad_norm": 3.3825221061706543,
"learning_rate": 4.95726273137051e-06,
"loss": 0.7874,
"sparse_loss": 0.7874,
"step": 10460
},
{
"epoch": 1.656655074296554,
"grad_norm": 6.872448444366455,
"learning_rate": 4.870299502831163e-06,
"loss": 0.6904,
"sparse_loss": 0.6904,
"step": 10480
},
{
"epoch": 1.6598166297818526,
"grad_norm": 2.23103666305542,
"learning_rate": 4.784023430936193e-06,
"loss": 0.7489,
"sparse_loss": 0.7489,
"step": 10500
},
{
"epoch": 1.6629781852671515,
"grad_norm": 2.302872657775879,
"learning_rate": 4.698437460844976e-06,
"loss": 0.7505,
"sparse_loss": 0.7505,
"step": 10520
},
{
"epoch": 1.6661397407524503,
"grad_norm": 10.811643600463867,
"learning_rate": 4.613544514159246e-06,
"loss": 0.6442,
"sparse_loss": 0.6442,
"step": 10540
},
{
"epoch": 1.669301296237749,
"grad_norm": 6.4289021492004395,
"learning_rate": 4.52934748882338e-06,
"loss": 0.7088,
"sparse_loss": 0.7088,
"step": 10560
},
{
"epoch": 1.6724628517230478,
"grad_norm": 4.472427845001221,
"learning_rate": 4.445849259025475e-06,
"loss": 0.717,
"sparse_loss": 0.717,
"step": 10580
},
{
"epoch": 1.6756244072083466,
"grad_norm": 7.740157127380371,
"learning_rate": 4.363052675099213e-06,
"loss": 0.6795,
"sparse_loss": 0.6795,
"step": 10600
},
{
"epoch": 1.6787859626936452,
"grad_norm": 3.1040191650390625,
"learning_rate": 4.2809605634265755e-06,
"loss": 0.6909,
"sparse_loss": 0.6909,
"step": 10620
},
{
"epoch": 1.681947518178944,
"grad_norm": 18.65782356262207,
"learning_rate": 4.199575726341346e-06,
"loss": 0.6712,
"sparse_loss": 0.6712,
"step": 10640
},
{
"epoch": 1.685109073664243,
"grad_norm": 2.706319808959961,
"learning_rate": 4.118900942033491e-06,
"loss": 0.7309,
"sparse_loss": 0.7309,
"step": 10660
},
{
"epoch": 1.6882706291495415,
"grad_norm": 2.1650848388671875,
"learning_rate": 4.0389389644542586e-06,
"loss": 0.599,
"sparse_loss": 0.599,
"step": 10680
},
{
"epoch": 1.6914321846348404,
"grad_norm": 5.536136150360107,
"learning_rate": 3.9596925232222196e-06,
"loss": 0.6251,
"sparse_loss": 0.6251,
"step": 10700
},
{
"epoch": 1.6945937401201392,
"grad_norm": 4.87489652633667,
"learning_rate": 3.881164323530062e-06,
"loss": 0.6247,
"sparse_loss": 0.6247,
"step": 10720
},
{
"epoch": 1.6977552956054378,
"grad_norm": 4.932882308959961,
"learning_rate": 3.8033570460522498e-06,
"loss": 0.7014,
"sparse_loss": 0.7014,
"step": 10740
},
{
"epoch": 1.7009168510907366,
"grad_norm": 2.601973056793213,
"learning_rate": 3.7262733468535317e-06,
"loss": 0.651,
"sparse_loss": 0.651,
"step": 10760
},
{
"epoch": 1.7040784065760355,
"grad_norm": 2.757931709289551,
"learning_rate": 3.649915857298242e-06,
"loss": 0.6175,
"sparse_loss": 0.6175,
"step": 10780
},
{
"epoch": 1.707239962061334,
"grad_norm": 10.078351974487305,
"learning_rate": 3.5742871839605006e-06,
"loss": 0.6855,
"sparse_loss": 0.6855,
"step": 10800
},
{
"epoch": 1.710401517546633,
"grad_norm": 3.487138509750366,
"learning_rate": 3.499389908535222e-06,
"loss": 0.6298,
"sparse_loss": 0.6298,
"step": 10820
},
{
"epoch": 1.7135630730319318,
"grad_norm": 5.3016133308410645,
"learning_rate": 3.425226587749977e-06,
"loss": 0.6735,
"sparse_loss": 0.6735,
"step": 10840
},
{
"epoch": 1.7167246285172304,
"grad_norm": 3.6436634063720703,
"learning_rate": 3.3517997532777485e-06,
"loss": 0.6128,
"sparse_loss": 0.6128,
"step": 10860
},
{
"epoch": 1.7198861840025292,
"grad_norm": 4.625311374664307,
"learning_rate": 3.2791119116504703e-06,
"loss": 0.6725,
"sparse_loss": 0.6725,
"step": 10880
},
{
"epoch": 1.723047739487828,
"grad_norm": 3.579132318496704,
"learning_rate": 3.207165544173482e-06,
"loss": 0.7013,
"sparse_loss": 0.7013,
"step": 10900
},
{
"epoch": 1.7262092949731267,
"grad_norm": 4.380514144897461,
"learning_rate": 3.1359631068408224e-06,
"loss": 0.6835,
"sparse_loss": 0.6835,
"step": 10920
},
{
"epoch": 1.7293708504584255,
"grad_norm": 3.3078274726867676,
"learning_rate": 3.0655070302513884e-06,
"loss": 0.6052,
"sparse_loss": 0.6052,
"step": 10940
},
{
"epoch": 1.7325324059437244,
"grad_norm": 3.0710272789001465,
"learning_rate": 2.9957997195259796e-06,
"loss": 0.7106,
"sparse_loss": 0.7106,
"step": 10960
},
{
"epoch": 1.735693961429023,
"grad_norm": 3.607321262359619,
"learning_rate": 2.926843554225167e-06,
"loss": 0.7583,
"sparse_loss": 0.7583,
"step": 10980
},
{
"epoch": 1.7388555169143218,
"grad_norm": 2.6866536140441895,
"learning_rate": 2.8586408882680827e-06,
"loss": 0.8333,
"sparse_loss": 0.8333,
"step": 11000
},
{
"epoch": 1.7420170723996207,
"grad_norm": 3.9435837268829346,
"learning_rate": 2.791194049852075e-06,
"loss": 0.6172,
"sparse_loss": 0.6172,
"step": 11020
},
{
"epoch": 1.7451786278849193,
"grad_norm": 3.074887275695801,
"learning_rate": 2.7245053413731876e-06,
"loss": 0.6502,
"sparse_loss": 0.6502,
"step": 11040
},
{
"epoch": 1.7483401833702181,
"grad_norm": 4.271072864532471,
"learning_rate": 2.6585770393476288e-06,
"loss": 0.6979,
"sparse_loss": 0.6979,
"step": 11060
},
{
"epoch": 1.751501738855517,
"grad_norm": 2.711543083190918,
"learning_rate": 2.593411394334e-06,
"loss": 0.5692,
"sparse_loss": 0.5692,
"step": 11080
},
{
"epoch": 1.7546632943408156,
"grad_norm": 2.817941904067993,
"learning_rate": 2.529010630856507e-06,
"loss": 0.6522,
"sparse_loss": 0.6522,
"step": 11100
},
{
"epoch": 1.7578248498261144,
"grad_norm": 2.2273244857788086,
"learning_rate": 2.465376947329015e-06,
"loss": 0.6217,
"sparse_loss": 0.6217,
"step": 11120
},
{
"epoch": 1.7609864053114133,
"grad_norm": 9.07394027709961,
"learning_rate": 2.402512515979974e-06,
"loss": 0.656,
"sparse_loss": 0.656,
"step": 11140
},
{
"epoch": 1.7641479607967119,
"grad_norm": 5.390905857086182,
"learning_rate": 2.3404194827783223e-06,
"loss": 0.7755,
"sparse_loss": 0.7755,
"step": 11160
},
{
"epoch": 1.7673095162820107,
"grad_norm": 4.0121941566467285,
"learning_rate": 2.2790999673601736e-06,
"loss": 0.6861,
"sparse_loss": 0.6861,
"step": 11180
},
{
"epoch": 1.7704710717673096,
"grad_norm": 4.383898735046387,
"learning_rate": 2.218556062956506e-06,
"loss": 0.6274,
"sparse_loss": 0.6274,
"step": 11200
},
{
"epoch": 1.7736326272526082,
"grad_norm": 6.343504428863525,
"learning_rate": 2.158789836321673e-06,
"loss": 0.6694,
"sparse_loss": 0.6694,
"step": 11220
},
{
"epoch": 1.776794182737907,
"grad_norm": 3.90850567817688,
"learning_rate": 2.0998033276628525e-06,
"loss": 0.5824,
"sparse_loss": 0.5824,
"step": 11240
},
{
"epoch": 1.7799557382232059,
"grad_norm": 3.579479694366455,
"learning_rate": 2.0415985505704476e-06,
"loss": 0.6302,
"sparse_loss": 0.6302,
"step": 11260
},
{
"epoch": 1.7831172937085045,
"grad_norm": 2.917056083679199,
"learning_rate": 1.984177491949285e-06,
"loss": 0.6404,
"sparse_loss": 0.6404,
"step": 11280
},
{
"epoch": 1.7862788491938033,
"grad_norm": 3.728649139404297,
"learning_rate": 1.927542111950836e-06,
"loss": 0.5912,
"sparse_loss": 0.5912,
"step": 11300
},
{
"epoch": 1.7894404046791021,
"grad_norm": 3.583627700805664,
"learning_rate": 1.8716943439062883e-06,
"loss": 0.6079,
"sparse_loss": 0.6079,
"step": 11320
},
{
"epoch": 1.7926019601644008,
"grad_norm": 3.6497068405151367,
"learning_rate": 1.8166360942605348e-06,
"loss": 0.6869,
"sparse_loss": 0.6869,
"step": 11340
},
{
"epoch": 1.7957635156496996,
"grad_norm": 3.265042304992676,
"learning_rate": 1.7623692425071225e-06,
"loss": 0.6614,
"sparse_loss": 0.6614,
"step": 11360
},
{
"epoch": 1.7989250711349984,
"grad_norm": 4.365682125091553,
"learning_rate": 1.708895641124064e-06,
"loss": 0.7749,
"sparse_loss": 0.7749,
"step": 11380
},
{
"epoch": 1.802086626620297,
"grad_norm": 3.0027592182159424,
"learning_rate": 1.656217115510636e-06,
"loss": 0.665,
"sparse_loss": 0.665,
"step": 11400
},
{
"epoch": 1.805248182105596,
"grad_norm": 1.8759933710098267,
"learning_rate": 1.6043354639250301e-06,
"loss": 0.6043,
"sparse_loss": 0.6043,
"step": 11420
},
{
"epoch": 1.8084097375908947,
"grad_norm": 4.52070951461792,
"learning_rate": 1.553252457422985e-06,
"loss": 0.6661,
"sparse_loss": 0.6661,
"step": 11440
},
{
"epoch": 1.8115712930761934,
"grad_norm": 5.161223888397217,
"learning_rate": 1.5029698397973274e-06,
"loss": 0.6998,
"sparse_loss": 0.6998,
"step": 11460
},
{
"epoch": 1.8147328485614924,
"grad_norm": 13.196478843688965,
"learning_rate": 1.4534893275184397e-06,
"loss": 0.7442,
"sparse_loss": 0.7442,
"step": 11480
},
{
"epoch": 1.817894404046791,
"grad_norm": 2.45226788520813,
"learning_rate": 1.4048126096756847e-06,
"loss": 0.6336,
"sparse_loss": 0.6336,
"step": 11500
},
{
"epoch": 1.8210559595320897,
"grad_norm": 14.519777297973633,
"learning_rate": 1.3569413479197129e-06,
"loss": 0.7014,
"sparse_loss": 0.7014,
"step": 11520
},
{
"epoch": 1.8242175150173887,
"grad_norm": 3.6050455570220947,
"learning_rate": 1.3098771764057715e-06,
"loss": 0.7002,
"sparse_loss": 0.7002,
"step": 11540
},
{
"epoch": 1.8273790705026873,
"grad_norm": 2.05651593208313,
"learning_rate": 1.2636217017378992e-06,
"loss": 0.6105,
"sparse_loss": 0.6105,
"step": 11560
},
{
"epoch": 1.830540625987986,
"grad_norm": 5.284204483032227,
"learning_rate": 1.2181765029140868e-06,
"loss": 0.6764,
"sparse_loss": 0.6764,
"step": 11580
},
{
"epoch": 1.833702181473285,
"grad_norm": 2.5336947441101074,
"learning_rate": 1.173543131272395e-06,
"loss": 0.6703,
"sparse_loss": 0.6703,
"step": 11600
},
{
"epoch": 1.8368637369585836,
"grad_norm": 2.767129421234131,
"learning_rate": 1.1297231104379691e-06,
"loss": 0.7235,
"sparse_loss": 0.7235,
"step": 11620
},
{
"epoch": 1.8400252924438822,
"grad_norm": 3.1908180713653564,
"learning_rate": 1.0867179362710367e-06,
"loss": 0.7188,
"sparse_loss": 0.7188,
"step": 11640
},
{
"epoch": 1.8431868479291813,
"grad_norm": 3.3988351821899414,
"learning_rate": 1.0445290768158561e-06,
"loss": 0.6119,
"sparse_loss": 0.6119,
"step": 11660
},
{
"epoch": 1.84634840341448,
"grad_norm": 3.3586833477020264,
"learning_rate": 1.0031579722505902e-06,
"loss": 0.5684,
"sparse_loss": 0.5684,
"step": 11680
},
{
"epoch": 1.8495099588997785,
"grad_norm": 2.7259163856506348,
"learning_rate": 9.626060348381482e-07,
"loss": 0.7029,
"sparse_loss": 0.7029,
"step": 11700
},
{
"epoch": 1.8526715143850776,
"grad_norm": 4.455019474029541,
"learning_rate": 9.228746488779777e-07,
"loss": 0.6075,
"sparse_loss": 0.6075,
"step": 11720
},
{
"epoch": 1.8558330698703762,
"grad_norm": 3.4989333152770996,
"learning_rate": 8.839651706588042e-07,
"loss": 0.6807,
"sparse_loss": 0.6807,
"step": 11740
},
{
"epoch": 1.8589946253556748,
"grad_norm": 3.912893295288086,
"learning_rate": 8.458789284123359e-07,
"loss": 0.6755,
"sparse_loss": 0.6755,
"step": 11760
},
{
"epoch": 1.862156180840974,
"grad_norm": 3.059116840362549,
"learning_rate": 8.086172222679184e-07,
"loss": 0.6841,
"sparse_loss": 0.6841,
"step": 11780
},
{
"epoch": 1.8653177363262725,
"grad_norm": 5.806337356567383,
"learning_rate": 7.721813242081682e-07,
"loss": 0.6623,
"sparse_loss": 0.6623,
"step": 11800
},
{
"epoch": 1.8684792918115711,
"grad_norm": 3.4639861583709717,
"learning_rate": 7.365724780255239e-07,
"loss": 0.7208,
"sparse_loss": 0.7208,
"step": 11820
},
{
"epoch": 1.8716408472968702,
"grad_norm": 7.740152359008789,
"learning_rate": 7.017918992798272e-07,
"loss": 0.6936,
"sparse_loss": 0.6936,
"step": 11840
},
{
"epoch": 1.8748024027821688,
"grad_norm": 2.8128342628479004,
"learning_rate": 6.678407752567756e-07,
"loss": 0.615,
"sparse_loss": 0.615,
"step": 11860
},
{
"epoch": 1.8779639582674676,
"grad_norm": 4.114563465118408,
"learning_rate": 6.34720264927438e-07,
"loss": 0.635,
"sparse_loss": 0.635,
"step": 11880
},
{
"epoch": 1.8811255137527665,
"grad_norm": 3.8733608722686768,
"learning_rate": 6.024314989086788e-07,
"loss": 0.6929,
"sparse_loss": 0.6929,
"step": 11900
},
{
"epoch": 1.884287069238065,
"grad_norm": 5.899589538574219,
"learning_rate": 5.709755794245458e-07,
"loss": 0.6765,
"sparse_loss": 0.6765,
"step": 11920
},
{
"epoch": 1.887448624723364,
"grad_norm": 2.3009846210479736,
"learning_rate": 5.403535802686738e-07,
"loss": 0.6189,
"sparse_loss": 0.6189,
"step": 11940
},
{
"epoch": 1.8906101802086628,
"grad_norm": 2.454423427581787,
"learning_rate": 5.105665467675963e-07,
"loss": 0.6262,
"sparse_loss": 0.6262,
"step": 11960
},
{
"epoch": 1.8937717356939614,
"grad_norm": 3.036147356033325,
"learning_rate": 4.816154957450831e-07,
"loss": 0.5448,
"sparse_loss": 0.5448,
"step": 11980
},
{
"epoch": 1.8969332911792602,
"grad_norm": 4.604625701904297,
"learning_rate": 4.53501415487434e-07,
"loss": 0.6672,
"sparse_loss": 0.6672,
"step": 12000
},
{
"epoch": 1.900094846664559,
"grad_norm": 3.325061798095703,
"learning_rate": 4.2622526570972044e-07,
"loss": 0.6019,
"sparse_loss": 0.6019,
"step": 12020
},
{
"epoch": 1.9032564021498577,
"grad_norm": 4.258634090423584,
"learning_rate": 3.997879775230445e-07,
"loss": 0.6101,
"sparse_loss": 0.6101,
"step": 12040
},
{
"epoch": 1.9064179576351565,
"grad_norm": 2.745493173599243,
"learning_rate": 3.741904534027424e-07,
"loss": 0.6053,
"sparse_loss": 0.6053,
"step": 12060
},
{
"epoch": 1.9095795131204554,
"grad_norm": 3.0191283226013184,
"learning_rate": 3.494335671575755e-07,
"loss": 0.6834,
"sparse_loss": 0.6834,
"step": 12080
},
{
"epoch": 1.912741068605754,
"grad_norm": 2.9336516857147217,
"learning_rate": 3.255181638999211e-07,
"loss": 0.6392,
"sparse_loss": 0.6392,
"step": 12100
},
{
"epoch": 1.9159026240910528,
"grad_norm": 3.190945625305176,
"learning_rate": 3.0244506001689543e-07,
"loss": 0.6366,
"sparse_loss": 0.6366,
"step": 12120
},
{
"epoch": 1.9190641795763517,
"grad_norm": 3.285252571105957,
"learning_rate": 2.8021504314250934e-07,
"loss": 0.7014,
"sparse_loss": 0.7014,
"step": 12140
},
{
"epoch": 1.9222257350616503,
"grad_norm": 7.00942850112915,
"learning_rate": 2.588288721307619e-07,
"loss": 0.618,
"sparse_loss": 0.618,
"step": 12160
},
{
"epoch": 1.9253872905469491,
"grad_norm": 5.004833698272705,
"learning_rate": 2.3828727702975007e-07,
"loss": 0.6935,
"sparse_loss": 0.6935,
"step": 12180
},
{
"epoch": 1.928548846032248,
"grad_norm": 4.934511184692383,
"learning_rate": 2.1859095905674143e-07,
"loss": 0.6286,
"sparse_loss": 0.6286,
"step": 12200
},
{
"epoch": 1.9317104015175466,
"grad_norm": 2.8532612323760986,
"learning_rate": 1.9974059057423223e-07,
"loss": 0.661,
"sparse_loss": 0.661,
"step": 12220
},
{
"epoch": 1.9348719570028454,
"grad_norm": 3.2379302978515625,
"learning_rate": 1.8173681506701013e-07,
"loss": 0.6822,
"sparse_loss": 0.6822,
"step": 12240
},
{
"epoch": 1.9380335124881443,
"grad_norm": 3.928940534591675,
"learning_rate": 1.6458024712017182e-07,
"loss": 0.6936,
"sparse_loss": 0.6936,
"step": 12260
},
{
"epoch": 1.9411950679734429,
"grad_norm": 2.6900010108947754,
"learning_rate": 1.4827147239815097e-07,
"loss": 0.6284,
"sparse_loss": 0.6284,
"step": 12280
},
{
"epoch": 1.9443566234587417,
"grad_norm": 4.513027667999268,
"learning_rate": 1.328110476247285e-07,
"loss": 0.6469,
"sparse_loss": 0.6469,
"step": 12300
},
{
"epoch": 1.9475181789440406,
"grad_norm": 16.359464645385742,
"learning_rate": 1.181995005640174e-07,
"loss": 0.737,
"sparse_loss": 0.737,
"step": 12320
},
{
"epoch": 1.9506797344293392,
"grad_norm": 2.822321653366089,
"learning_rate": 1.0443733000246037e-07,
"loss": 0.7011,
"sparse_loss": 0.7011,
"step": 12340
},
{
"epoch": 1.953841289914638,
"grad_norm": 2.8069210052490234,
"learning_rate": 9.152500573179345e-08,
"loss": 0.6338,
"sparse_loss": 0.6338,
"step": 12360
},
{
"epoch": 1.9570028453999369,
"grad_norm": 2.046774387359619,
"learning_rate": 7.946296853300895e-08,
"loss": 0.6635,
"sparse_loss": 0.6635,
"step": 12380
},
{
"epoch": 1.9601644008852355,
"grad_norm": 3.2035796642303467,
"learning_rate": 6.825163016132007e-08,
"loss": 0.6019,
"sparse_loss": 0.6019,
"step": 12400
},
{
"epoch": 1.9633259563705343,
"grad_norm": 3.2717444896698,
"learning_rate": 5.78913733320835e-08,
"loss": 0.6897,
"sparse_loss": 0.6897,
"step": 12420
},
{
"epoch": 1.9664875118558331,
"grad_norm": 3.2161619663238525,
"learning_rate": 4.8382551707762403e-08,
"loss": 0.6334,
"sparse_loss": 0.6334,
"step": 12440
},
{
"epoch": 1.9696490673411318,
"grad_norm": 3.822545051574707,
"learning_rate": 3.972548988582792e-08,
"loss": 0.7097,
"sparse_loss": 0.7097,
"step": 12460
},
{
"epoch": 1.9728106228264306,
"grad_norm": 3.633744239807129,
"learning_rate": 3.192048338769293e-08,
"loss": 0.758,
"sparse_loss": 0.758,
"step": 12480
},
{
"epoch": 1.9759721783117294,
"grad_norm": 3.19952392578125,
"learning_rate": 2.496779864862575e-08,
"loss": 0.6445,
"sparse_loss": 0.6445,
"step": 12500
},
{
"epoch": 1.979133733797028,
"grad_norm": 2.4408233165740967,
"learning_rate": 1.886767300864345e-08,
"loss": 0.6809,
"sparse_loss": 0.6809,
"step": 12520
},
{
"epoch": 1.982295289282327,
"grad_norm": 2.962395191192627,
"learning_rate": 1.362031470441838e-08,
"loss": 0.6987,
"sparse_loss": 0.6987,
"step": 12540
},
{
"epoch": 1.9854568447676257,
"grad_norm": 2.5761423110961914,
"learning_rate": 9.225902862172731e-09,
"loss": 0.6249,
"sparse_loss": 0.6249,
"step": 12560
},
{
"epoch": 1.9886184002529244,
"grad_norm": 2.363164186477661,
"learning_rate": 5.684587491550097e-09,
"loss": 0.6844,
"sparse_loss": 0.6844,
"step": 12580
},
{
"epoch": 1.9917799557382232,
"grad_norm": 2.3563790321350098,
"learning_rate": 2.996489480514009e-09,
"loss": 0.7159,
"sparse_loss": 0.7159,
"step": 12600
},
{
"epoch": 1.994941511223522,
"grad_norm": 2.364633321762085,
"learning_rate": 1.1617005911984668e-09,
"loss": 0.7133,
"sparse_loss": 0.7133,
"step": 12620
},
{
"epoch": 1.9981030667088207,
"grad_norm": 8.201303482055664,
"learning_rate": 1.8028345680209946e-10,
"loss": 0.6341,
"sparse_loss": 0.6341,
"step": 12640
}
],
"logging_steps": 20,
"max_steps": 12652,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}