RAMP_7B_mp_2_ratio_0.1 / trainer_state.json
JJYDXFS's picture
init
1f1f5bd verified
Raw
History Blame Contribute Delete
373 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.995260663507109,
"eval_steps": 500,
"global_step": 2520,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 66560.0,
"learning_rate": 4.998025276461296e-05,
"loss": 9.856,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 8960.0,
"learning_rate": 4.996050552922591e-05,
"loss": 7.3207,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 9408.0,
"learning_rate": 4.994075829383886e-05,
"loss": 9.7708,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 622592.0,
"learning_rate": 4.9921011058451815e-05,
"loss": 8.7029,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 13120.0,
"learning_rate": 4.9901263823064776e-05,
"loss": 10.3375,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 43520.0,
"learning_rate": 4.988151658767773e-05,
"loss": 9.143,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 59648.0,
"learning_rate": 4.9861769352290685e-05,
"loss": 9.8022,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 25728.0,
"learning_rate": 4.9842022116903633e-05,
"loss": 9.2741,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 31232.0,
"learning_rate": 4.982227488151659e-05,
"loss": 8.3661,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 169984.0,
"learning_rate": 4.980252764612954e-05,
"loss": 7.6912,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 13632.0,
"learning_rate": 4.97827804107425e-05,
"loss": 8.1655,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 7392.0,
"learning_rate": 4.976303317535545e-05,
"loss": 8.4757,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 7552.0,
"learning_rate": 4.974328593996841e-05,
"loss": 8.4104,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 18688.0,
"learning_rate": 4.972353870458136e-05,
"loss": 7.8633,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 34560.0,
"learning_rate": 4.9703791469194316e-05,
"loss": 8.7962,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 10496.0,
"learning_rate": 4.968404423380727e-05,
"loss": 8.3055,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 18176.0,
"learning_rate": 4.9664296998420226e-05,
"loss": 8.28,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 11776.0,
"learning_rate": 4.9644549763033174e-05,
"loss": 7.3166,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 3408.0,
"learning_rate": 4.962480252764613e-05,
"loss": 6.392,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 932.0,
"learning_rate": 4.960505529225908e-05,
"loss": 5.6725,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 4928.0,
"learning_rate": 4.9585308056872045e-05,
"loss": 5.9983,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 17920.0,
"learning_rate": 4.9565560821485e-05,
"loss": 6.8331,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 6272.0,
"learning_rate": 4.954581358609795e-05,
"loss": 7.1968,
"step": 23
},
{
"epoch": 0.01,
"grad_norm": 110080.0,
"learning_rate": 4.95260663507109e-05,
"loss": 8.785,
"step": 24
},
{
"epoch": 0.01,
"grad_norm": 76288.0,
"learning_rate": 4.950631911532386e-05,
"loss": 8.6061,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 1848.0,
"learning_rate": 4.948657187993681e-05,
"loss": 8.5602,
"step": 26
},
{
"epoch": 0.01,
"grad_norm": 75776.0,
"learning_rate": 4.9466824644549766e-05,
"loss": 8.9401,
"step": 27
},
{
"epoch": 0.01,
"grad_norm": 4704.0,
"learning_rate": 4.9447077409162714e-05,
"loss": 8.5445,
"step": 28
},
{
"epoch": 0.01,
"grad_norm": 7264.0,
"learning_rate": 4.942733017377567e-05,
"loss": 8.7716,
"step": 29
},
{
"epoch": 0.01,
"grad_norm": 5344.0,
"learning_rate": 4.940758293838863e-05,
"loss": 8.8828,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 5056.0,
"learning_rate": 4.9387835703001585e-05,
"loss": 8.9008,
"step": 31
},
{
"epoch": 0.01,
"grad_norm": 5568.0,
"learning_rate": 4.936808846761454e-05,
"loss": 8.365,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 14336.0,
"learning_rate": 4.934834123222749e-05,
"loss": 8.3496,
"step": 33
},
{
"epoch": 0.01,
"grad_norm": 6016.0,
"learning_rate": 4.932859399684044e-05,
"loss": 8.1496,
"step": 34
},
{
"epoch": 0.01,
"grad_norm": 4864.0,
"learning_rate": 4.93088467614534e-05,
"loss": 8.3418,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 2784.0,
"learning_rate": 4.928909952606635e-05,
"loss": 8.1558,
"step": 36
},
{
"epoch": 0.01,
"grad_norm": 2272.0,
"learning_rate": 4.926935229067931e-05,
"loss": 7.4301,
"step": 37
},
{
"epoch": 0.02,
"grad_norm": 11648.0,
"learning_rate": 4.924960505529226e-05,
"loss": 8.0793,
"step": 38
},
{
"epoch": 0.02,
"grad_norm": 6848.0,
"learning_rate": 4.9229857819905216e-05,
"loss": 7.1571,
"step": 39
},
{
"epoch": 0.02,
"grad_norm": 3168.0,
"learning_rate": 4.921011058451817e-05,
"loss": 8.3708,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 13824.0,
"learning_rate": 4.9190363349131126e-05,
"loss": 8.6299,
"step": 41
},
{
"epoch": 0.02,
"grad_norm": 4448.0,
"learning_rate": 4.917061611374408e-05,
"loss": 7.973,
"step": 42
},
{
"epoch": 0.02,
"grad_norm": 2720.0,
"learning_rate": 4.915086887835703e-05,
"loss": 8.0501,
"step": 43
},
{
"epoch": 0.02,
"grad_norm": 92672.0,
"learning_rate": 4.913112164296998e-05,
"loss": 8.2457,
"step": 44
},
{
"epoch": 0.02,
"grad_norm": 6016.0,
"learning_rate": 4.911137440758294e-05,
"loss": 8.3462,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 51456.0,
"learning_rate": 4.90916271721959e-05,
"loss": 8.1244,
"step": 46
},
{
"epoch": 0.02,
"grad_norm": 868.0,
"learning_rate": 4.9071879936808854e-05,
"loss": 8.078,
"step": 47
},
{
"epoch": 0.02,
"grad_norm": 2128.0,
"learning_rate": 4.90521327014218e-05,
"loss": 7.9465,
"step": 48
},
{
"epoch": 0.02,
"grad_norm": 12992.0,
"learning_rate": 4.903238546603476e-05,
"loss": 8.2983,
"step": 49
},
{
"epoch": 0.02,
"grad_norm": 2448.0,
"learning_rate": 4.901263823064771e-05,
"loss": 7.4304,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 1344.0,
"learning_rate": 4.8992890995260666e-05,
"loss": 7.4071,
"step": 51
},
{
"epoch": 0.02,
"grad_norm": 3120.0,
"learning_rate": 4.897314375987362e-05,
"loss": 8.0153,
"step": 52
},
{
"epoch": 0.02,
"grad_norm": 32128.0,
"learning_rate": 4.895339652448657e-05,
"loss": 7.4022,
"step": 53
},
{
"epoch": 0.02,
"grad_norm": 2448.0,
"learning_rate": 4.8933649289099524e-05,
"loss": 7.9052,
"step": 54
},
{
"epoch": 0.02,
"grad_norm": 12032.0,
"learning_rate": 4.8913902053712485e-05,
"loss": 6.6824,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 85504.0,
"learning_rate": 4.889415481832544e-05,
"loss": 7.7817,
"step": 56
},
{
"epoch": 0.02,
"grad_norm": 29440.0,
"learning_rate": 4.8874407582938395e-05,
"loss": 8.0797,
"step": 57
},
{
"epoch": 0.02,
"grad_norm": 604.0,
"learning_rate": 4.885466034755134e-05,
"loss": 8.4528,
"step": 58
},
{
"epoch": 0.02,
"grad_norm": 1992.0,
"learning_rate": 4.88349131121643e-05,
"loss": 8.1409,
"step": 59
},
{
"epoch": 0.02,
"grad_norm": 5056.0,
"learning_rate": 4.881516587677725e-05,
"loss": 8.297,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 10752.0,
"learning_rate": 4.879541864139021e-05,
"loss": 7.9699,
"step": 61
},
{
"epoch": 0.02,
"grad_norm": 15232.0,
"learning_rate": 4.877567140600316e-05,
"loss": 7.8561,
"step": 62
},
{
"epoch": 0.02,
"grad_norm": 370.0,
"learning_rate": 4.8755924170616116e-05,
"loss": 7.8856,
"step": 63
},
{
"epoch": 0.03,
"grad_norm": 764.0,
"learning_rate": 4.873617693522907e-05,
"loss": 7.3657,
"step": 64
},
{
"epoch": 0.03,
"grad_norm": 824.0,
"learning_rate": 4.8716429699842026e-05,
"loss": 7.9592,
"step": 65
},
{
"epoch": 0.03,
"grad_norm": 3600.0,
"learning_rate": 4.869668246445498e-05,
"loss": 6.8459,
"step": 66
},
{
"epoch": 0.03,
"grad_norm": 350.0,
"learning_rate": 4.8676935229067935e-05,
"loss": 7.2738,
"step": 67
},
{
"epoch": 0.03,
"grad_norm": 4288.0,
"learning_rate": 4.865718799368088e-05,
"loss": 6.1513,
"step": 68
},
{
"epoch": 0.03,
"grad_norm": 9664.0,
"learning_rate": 4.863744075829384e-05,
"loss": 6.1848,
"step": 69
},
{
"epoch": 0.03,
"grad_norm": 29056.0,
"learning_rate": 4.861769352290679e-05,
"loss": 6.5639,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 15936.0,
"learning_rate": 4.8597946287519754e-05,
"loss": 6.5396,
"step": 71
},
{
"epoch": 0.03,
"grad_norm": 1720.0,
"learning_rate": 4.857819905213271e-05,
"loss": 6.4093,
"step": 72
},
{
"epoch": 0.03,
"grad_norm": 21376.0,
"learning_rate": 4.8558451816745657e-05,
"loss": 6.8805,
"step": 73
},
{
"epoch": 0.03,
"grad_norm": 784.0,
"learning_rate": 4.853870458135861e-05,
"loss": 6.3058,
"step": 74
},
{
"epoch": 0.03,
"grad_norm": 536.0,
"learning_rate": 4.8518957345971566e-05,
"loss": 5.5386,
"step": 75
},
{
"epoch": 0.03,
"grad_norm": 676.0,
"learning_rate": 4.849921011058452e-05,
"loss": 5.4448,
"step": 76
},
{
"epoch": 0.03,
"grad_norm": 1616.0,
"learning_rate": 4.8479462875197475e-05,
"loss": 5.5412,
"step": 77
},
{
"epoch": 0.03,
"grad_norm": 89.0,
"learning_rate": 4.845971563981043e-05,
"loss": 5.2232,
"step": 78
},
{
"epoch": 0.03,
"grad_norm": 161.0,
"learning_rate": 4.843996840442338e-05,
"loss": 5.2437,
"step": 79
},
{
"epoch": 0.03,
"grad_norm": 253.0,
"learning_rate": 4.842022116903634e-05,
"loss": 4.4182,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 241.0,
"learning_rate": 4.8400473933649294e-05,
"loss": 4.5345,
"step": 81
},
{
"epoch": 0.03,
"grad_norm": 72.5,
"learning_rate": 4.838072669826225e-05,
"loss": 4.1391,
"step": 82
},
{
"epoch": 0.03,
"grad_norm": 446.0,
"learning_rate": 4.83609794628752e-05,
"loss": 4.2768,
"step": 83
},
{
"epoch": 0.03,
"grad_norm": 26.0,
"learning_rate": 4.834123222748815e-05,
"loss": 3.5847,
"step": 84
},
{
"epoch": 0.03,
"grad_norm": 13.0,
"learning_rate": 4.8321484992101106e-05,
"loss": 3.6449,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 103.0,
"learning_rate": 4.830173775671406e-05,
"loss": 3.5437,
"step": 86
},
{
"epoch": 0.03,
"grad_norm": 1040.0,
"learning_rate": 4.8281990521327016e-05,
"loss": 3.6617,
"step": 87
},
{
"epoch": 0.03,
"grad_norm": 2384.0,
"learning_rate": 4.826224328593997e-05,
"loss": 3.7292,
"step": 88
},
{
"epoch": 0.04,
"grad_norm": 4000.0,
"learning_rate": 4.8242496050552925e-05,
"loss": 3.4754,
"step": 89
},
{
"epoch": 0.04,
"grad_norm": 4992.0,
"learning_rate": 4.822274881516588e-05,
"loss": 3.8297,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 6240.0,
"learning_rate": 4.8203001579778835e-05,
"loss": 3.9467,
"step": 91
},
{
"epoch": 0.04,
"grad_norm": 796.0,
"learning_rate": 4.818325434439179e-05,
"loss": 3.6178,
"step": 92
},
{
"epoch": 0.04,
"grad_norm": 2784.0,
"learning_rate": 4.816350710900474e-05,
"loss": 3.8225,
"step": 93
},
{
"epoch": 0.04,
"grad_norm": 6528.0,
"learning_rate": 4.814375987361769e-05,
"loss": 3.8475,
"step": 94
},
{
"epoch": 0.04,
"grad_norm": 412.0,
"learning_rate": 4.812401263823065e-05,
"loss": 3.6729,
"step": 95
},
{
"epoch": 0.04,
"grad_norm": 91.5,
"learning_rate": 4.810426540284361e-05,
"loss": 3.2021,
"step": 96
},
{
"epoch": 0.04,
"grad_norm": 131.0,
"learning_rate": 4.808451816745656e-05,
"loss": 3.5322,
"step": 97
},
{
"epoch": 0.04,
"grad_norm": 1192.0,
"learning_rate": 4.806477093206951e-05,
"loss": 3.6487,
"step": 98
},
{
"epoch": 0.04,
"grad_norm": 16.625,
"learning_rate": 4.8045023696682466e-05,
"loss": 3.2748,
"step": 99
},
{
"epoch": 0.04,
"grad_norm": 4.875,
"learning_rate": 4.802527646129542e-05,
"loss": 3.077,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 10.125,
"learning_rate": 4.8005529225908375e-05,
"loss": 3.2129,
"step": 101
},
{
"epoch": 0.04,
"grad_norm": 6.03125,
"learning_rate": 4.798578199052133e-05,
"loss": 3.0891,
"step": 102
},
{
"epoch": 0.04,
"grad_norm": 116.0,
"learning_rate": 4.7966034755134285e-05,
"loss": 3.1009,
"step": 103
},
{
"epoch": 0.04,
"grad_norm": 18.75,
"learning_rate": 4.794628751974723e-05,
"loss": 3.2519,
"step": 104
},
{
"epoch": 0.04,
"grad_norm": 13.9375,
"learning_rate": 4.7926540284360194e-05,
"loss": 2.8401,
"step": 105
},
{
"epoch": 0.04,
"grad_norm": 4.03125,
"learning_rate": 4.790679304897315e-05,
"loss": 3.2164,
"step": 106
},
{
"epoch": 0.04,
"grad_norm": 7.15625,
"learning_rate": 4.7887045813586104e-05,
"loss": 2.8657,
"step": 107
},
{
"epoch": 0.04,
"grad_norm": 5.71875,
"learning_rate": 4.786729857819905e-05,
"loss": 3.0181,
"step": 108
},
{
"epoch": 0.04,
"grad_norm": 10.0,
"learning_rate": 4.7847551342812006e-05,
"loss": 3.1407,
"step": 109
},
{
"epoch": 0.04,
"grad_norm": 5.0625,
"learning_rate": 4.782780410742496e-05,
"loss": 2.8146,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 2.84375,
"learning_rate": 4.7808056872037916e-05,
"loss": 3.0214,
"step": 111
},
{
"epoch": 0.04,
"grad_norm": 3.78125,
"learning_rate": 4.778830963665087e-05,
"loss": 3.1901,
"step": 112
},
{
"epoch": 0.04,
"grad_norm": 4.75,
"learning_rate": 4.7768562401263825e-05,
"loss": 2.5318,
"step": 113
},
{
"epoch": 0.05,
"grad_norm": 2.828125,
"learning_rate": 4.774881516587678e-05,
"loss": 2.8071,
"step": 114
},
{
"epoch": 0.05,
"grad_norm": 3.15625,
"learning_rate": 4.7729067930489735e-05,
"loss": 2.9719,
"step": 115
},
{
"epoch": 0.05,
"grad_norm": 2.6875,
"learning_rate": 4.770932069510269e-05,
"loss": 2.8595,
"step": 116
},
{
"epoch": 0.05,
"grad_norm": 3.625,
"learning_rate": 4.7689573459715644e-05,
"loss": 3.2301,
"step": 117
},
{
"epoch": 0.05,
"grad_norm": 2.96875,
"learning_rate": 4.766982622432859e-05,
"loss": 3.1154,
"step": 118
},
{
"epoch": 0.05,
"grad_norm": 4.6875,
"learning_rate": 4.765007898894155e-05,
"loss": 2.9694,
"step": 119
},
{
"epoch": 0.05,
"grad_norm": 3.609375,
"learning_rate": 4.76303317535545e-05,
"loss": 2.7737,
"step": 120
},
{
"epoch": 0.05,
"grad_norm": 4.5,
"learning_rate": 4.761058451816746e-05,
"loss": 2.7949,
"step": 121
},
{
"epoch": 0.05,
"grad_norm": 11.125,
"learning_rate": 4.759083728278042e-05,
"loss": 3.0712,
"step": 122
},
{
"epoch": 0.05,
"grad_norm": 2.671875,
"learning_rate": 4.7571090047393366e-05,
"loss": 2.8633,
"step": 123
},
{
"epoch": 0.05,
"grad_norm": 4.46875,
"learning_rate": 4.755134281200632e-05,
"loss": 2.9064,
"step": 124
},
{
"epoch": 0.05,
"grad_norm": 4.0625,
"learning_rate": 4.7531595576619275e-05,
"loss": 2.8369,
"step": 125
},
{
"epoch": 0.05,
"grad_norm": 4.4375,
"learning_rate": 4.751184834123223e-05,
"loss": 2.5994,
"step": 126
},
{
"epoch": 0.05,
"grad_norm": 2.5625,
"learning_rate": 4.7492101105845185e-05,
"loss": 2.583,
"step": 127
},
{
"epoch": 0.05,
"grad_norm": 2.984375,
"learning_rate": 4.747235387045814e-05,
"loss": 2.9879,
"step": 128
},
{
"epoch": 0.05,
"grad_norm": 3.765625,
"learning_rate": 4.745260663507109e-05,
"loss": 2.9709,
"step": 129
},
{
"epoch": 0.05,
"grad_norm": 3.4375,
"learning_rate": 4.743285939968405e-05,
"loss": 2.5776,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 2.3125,
"learning_rate": 4.7413112164297003e-05,
"loss": 2.8578,
"step": 131
},
{
"epoch": 0.05,
"grad_norm": 3.296875,
"learning_rate": 4.739336492890996e-05,
"loss": 3.0034,
"step": 132
},
{
"epoch": 0.05,
"grad_norm": 2.875,
"learning_rate": 4.7373617693522906e-05,
"loss": 2.3887,
"step": 133
},
{
"epoch": 0.05,
"grad_norm": 2.28125,
"learning_rate": 4.735387045813586e-05,
"loss": 3.0038,
"step": 134
},
{
"epoch": 0.05,
"grad_norm": 3.796875,
"learning_rate": 4.7334123222748816e-05,
"loss": 2.8048,
"step": 135
},
{
"epoch": 0.05,
"grad_norm": 1.734375,
"learning_rate": 4.731437598736177e-05,
"loss": 2.7049,
"step": 136
},
{
"epoch": 0.05,
"grad_norm": 4.21875,
"learning_rate": 4.7294628751974725e-05,
"loss": 2.9914,
"step": 137
},
{
"epoch": 0.05,
"grad_norm": 2.3125,
"learning_rate": 4.727488151658768e-05,
"loss": 2.6212,
"step": 138
},
{
"epoch": 0.05,
"grad_norm": 2.484375,
"learning_rate": 4.7255134281200634e-05,
"loss": 2.6307,
"step": 139
},
{
"epoch": 0.06,
"grad_norm": 2.609375,
"learning_rate": 4.723538704581359e-05,
"loss": 2.9217,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 3.578125,
"learning_rate": 4.7215639810426544e-05,
"loss": 2.7525,
"step": 141
},
{
"epoch": 0.06,
"grad_norm": 2.671875,
"learning_rate": 4.71958925750395e-05,
"loss": 2.7069,
"step": 142
},
{
"epoch": 0.06,
"grad_norm": 2.53125,
"learning_rate": 4.717614533965245e-05,
"loss": 2.4349,
"step": 143
},
{
"epoch": 0.06,
"grad_norm": 1.734375,
"learning_rate": 4.71563981042654e-05,
"loss": 2.5973,
"step": 144
},
{
"epoch": 0.06,
"grad_norm": 3.765625,
"learning_rate": 4.7136650868878356e-05,
"loss": 2.7587,
"step": 145
},
{
"epoch": 0.06,
"grad_norm": 2.140625,
"learning_rate": 4.711690363349132e-05,
"loss": 2.8174,
"step": 146
},
{
"epoch": 0.06,
"grad_norm": 2.203125,
"learning_rate": 4.709715639810427e-05,
"loss": 2.7192,
"step": 147
},
{
"epoch": 0.06,
"grad_norm": 2.34375,
"learning_rate": 4.707740916271722e-05,
"loss": 2.7889,
"step": 148
},
{
"epoch": 0.06,
"grad_norm": 2.03125,
"learning_rate": 4.7057661927330175e-05,
"loss": 2.8343,
"step": 149
},
{
"epoch": 0.06,
"grad_norm": 2.265625,
"learning_rate": 4.703791469194313e-05,
"loss": 2.7464,
"step": 150
},
{
"epoch": 0.06,
"grad_norm": 1.6796875,
"learning_rate": 4.7018167456556084e-05,
"loss": 2.6573,
"step": 151
},
{
"epoch": 0.06,
"grad_norm": 3.046875,
"learning_rate": 4.699842022116904e-05,
"loss": 2.8755,
"step": 152
},
{
"epoch": 0.06,
"grad_norm": 2.328125,
"learning_rate": 4.6978672985781994e-05,
"loss": 2.8984,
"step": 153
},
{
"epoch": 0.06,
"grad_norm": 2.4375,
"learning_rate": 4.695892575039494e-05,
"loss": 2.7545,
"step": 154
},
{
"epoch": 0.06,
"grad_norm": 2.578125,
"learning_rate": 4.69391785150079e-05,
"loss": 2.9516,
"step": 155
},
{
"epoch": 0.06,
"grad_norm": 2.484375,
"learning_rate": 4.691943127962086e-05,
"loss": 2.6453,
"step": 156
},
{
"epoch": 0.06,
"grad_norm": 1.890625,
"learning_rate": 4.689968404423381e-05,
"loss": 2.5986,
"step": 157
},
{
"epoch": 0.06,
"grad_norm": 2.78125,
"learning_rate": 4.687993680884676e-05,
"loss": 2.9487,
"step": 158
},
{
"epoch": 0.06,
"grad_norm": 1.9375,
"learning_rate": 4.6860189573459715e-05,
"loss": 2.7401,
"step": 159
},
{
"epoch": 0.06,
"grad_norm": 3.03125,
"learning_rate": 4.684044233807267e-05,
"loss": 2.8816,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": 2.1875,
"learning_rate": 4.6820695102685625e-05,
"loss": 2.8877,
"step": 161
},
{
"epoch": 0.06,
"grad_norm": 3.234375,
"learning_rate": 4.680094786729858e-05,
"loss": 2.8042,
"step": 162
},
{
"epoch": 0.06,
"grad_norm": 2.109375,
"learning_rate": 4.6781200631911534e-05,
"loss": 2.8763,
"step": 163
},
{
"epoch": 0.06,
"grad_norm": 2.171875,
"learning_rate": 4.676145339652449e-05,
"loss": 2.6943,
"step": 164
},
{
"epoch": 0.07,
"grad_norm": 1.8515625,
"learning_rate": 4.6741706161137444e-05,
"loss": 2.5086,
"step": 165
},
{
"epoch": 0.07,
"grad_norm": 1.9140625,
"learning_rate": 4.67219589257504e-05,
"loss": 2.5254,
"step": 166
},
{
"epoch": 0.07,
"grad_norm": 2.609375,
"learning_rate": 4.670221169036335e-05,
"loss": 2.6009,
"step": 167
},
{
"epoch": 0.07,
"grad_norm": 1.9921875,
"learning_rate": 4.668246445497631e-05,
"loss": 2.8622,
"step": 168
},
{
"epoch": 0.07,
"grad_norm": 1.5703125,
"learning_rate": 4.6662717219589256e-05,
"loss": 2.651,
"step": 169
},
{
"epoch": 0.07,
"grad_norm": 3.0,
"learning_rate": 4.664296998420221e-05,
"loss": 2.9756,
"step": 170
},
{
"epoch": 0.07,
"grad_norm": 1.90625,
"learning_rate": 4.662322274881517e-05,
"loss": 2.7025,
"step": 171
},
{
"epoch": 0.07,
"grad_norm": 1.6484375,
"learning_rate": 4.660347551342813e-05,
"loss": 2.76,
"step": 172
},
{
"epoch": 0.07,
"grad_norm": 1.3046875,
"learning_rate": 4.6583728278041075e-05,
"loss": 2.7927,
"step": 173
},
{
"epoch": 0.07,
"grad_norm": 2.640625,
"learning_rate": 4.656398104265403e-05,
"loss": 2.4775,
"step": 174
},
{
"epoch": 0.07,
"grad_norm": 2.125,
"learning_rate": 4.6544233807266984e-05,
"loss": 2.6717,
"step": 175
},
{
"epoch": 0.07,
"grad_norm": 2.515625,
"learning_rate": 4.652448657187994e-05,
"loss": 2.4236,
"step": 176
},
{
"epoch": 0.07,
"grad_norm": 2.578125,
"learning_rate": 4.6504739336492894e-05,
"loss": 2.8205,
"step": 177
},
{
"epoch": 0.07,
"grad_norm": 2.078125,
"learning_rate": 4.648499210110585e-05,
"loss": 2.6378,
"step": 178
},
{
"epoch": 0.07,
"grad_norm": 1.8828125,
"learning_rate": 4.6465244865718796e-05,
"loss": 2.7991,
"step": 179
},
{
"epoch": 0.07,
"grad_norm": 4.5625,
"learning_rate": 4.644549763033176e-05,
"loss": 2.7981,
"step": 180
},
{
"epoch": 0.07,
"grad_norm": 1.4609375,
"learning_rate": 4.642575039494471e-05,
"loss": 2.5205,
"step": 181
},
{
"epoch": 0.07,
"grad_norm": 4.125,
"learning_rate": 4.640600315955767e-05,
"loss": 2.6588,
"step": 182
},
{
"epoch": 0.07,
"grad_norm": 2.34375,
"learning_rate": 4.6386255924170615e-05,
"loss": 2.3992,
"step": 183
},
{
"epoch": 0.07,
"grad_norm": 3.0625,
"learning_rate": 4.636650868878357e-05,
"loss": 2.7429,
"step": 184
},
{
"epoch": 0.07,
"grad_norm": 4.875,
"learning_rate": 4.6346761453396525e-05,
"loss": 2.4167,
"step": 185
},
{
"epoch": 0.07,
"grad_norm": 3.71875,
"learning_rate": 4.632701421800948e-05,
"loss": 2.8682,
"step": 186
},
{
"epoch": 0.07,
"grad_norm": 2.65625,
"learning_rate": 4.6307266982622434e-05,
"loss": 2.8972,
"step": 187
},
{
"epoch": 0.07,
"grad_norm": 2.03125,
"learning_rate": 4.628751974723539e-05,
"loss": 2.6898,
"step": 188
},
{
"epoch": 0.07,
"grad_norm": 2.03125,
"learning_rate": 4.6267772511848343e-05,
"loss": 2.6833,
"step": 189
},
{
"epoch": 0.08,
"grad_norm": 2.609375,
"learning_rate": 4.62480252764613e-05,
"loss": 2.8257,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 1.6796875,
"learning_rate": 4.622827804107425e-05,
"loss": 2.3572,
"step": 191
},
{
"epoch": 0.08,
"grad_norm": 1.625,
"learning_rate": 4.620853080568721e-05,
"loss": 2.8499,
"step": 192
},
{
"epoch": 0.08,
"grad_norm": 1.8046875,
"learning_rate": 4.618878357030016e-05,
"loss": 2.6153,
"step": 193
},
{
"epoch": 0.08,
"grad_norm": 6.03125,
"learning_rate": 4.616903633491311e-05,
"loss": 2.3394,
"step": 194
},
{
"epoch": 0.08,
"grad_norm": 1.9140625,
"learning_rate": 4.6149289099526065e-05,
"loss": 2.5499,
"step": 195
},
{
"epoch": 0.08,
"grad_norm": 1.765625,
"learning_rate": 4.6129541864139027e-05,
"loss": 2.3042,
"step": 196
},
{
"epoch": 0.08,
"grad_norm": 1.7578125,
"learning_rate": 4.610979462875198e-05,
"loss": 2.878,
"step": 197
},
{
"epoch": 0.08,
"grad_norm": 1.953125,
"learning_rate": 4.609004739336493e-05,
"loss": 2.4411,
"step": 198
},
{
"epoch": 0.08,
"grad_norm": 2.1875,
"learning_rate": 4.6070300157977884e-05,
"loss": 2.7667,
"step": 199
},
{
"epoch": 0.08,
"grad_norm": 3.59375,
"learning_rate": 4.605055292259084e-05,
"loss": 2.7345,
"step": 200
},
{
"epoch": 0.08,
"grad_norm": 3.171875,
"learning_rate": 4.603080568720379e-05,
"loss": 2.4975,
"step": 201
},
{
"epoch": 0.08,
"grad_norm": 2.359375,
"learning_rate": 4.601105845181675e-05,
"loss": 2.7442,
"step": 202
},
{
"epoch": 0.08,
"grad_norm": 2.5625,
"learning_rate": 4.59913112164297e-05,
"loss": 2.5943,
"step": 203
},
{
"epoch": 0.08,
"grad_norm": 2.265625,
"learning_rate": 4.597156398104265e-05,
"loss": 2.6275,
"step": 204
},
{
"epoch": 0.08,
"grad_norm": 2.03125,
"learning_rate": 4.595181674565561e-05,
"loss": 3.0074,
"step": 205
},
{
"epoch": 0.08,
"grad_norm": 1.5,
"learning_rate": 4.593206951026857e-05,
"loss": 2.4229,
"step": 206
},
{
"epoch": 0.08,
"grad_norm": 1.828125,
"learning_rate": 4.591232227488152e-05,
"loss": 2.6958,
"step": 207
},
{
"epoch": 0.08,
"grad_norm": 1.6796875,
"learning_rate": 4.5892575039494476e-05,
"loss": 2.6977,
"step": 208
},
{
"epoch": 0.08,
"grad_norm": 1.9140625,
"learning_rate": 4.5872827804107424e-05,
"loss": 2.6727,
"step": 209
},
{
"epoch": 0.08,
"grad_norm": 2.921875,
"learning_rate": 4.585308056872038e-05,
"loss": 2.7467,
"step": 210
},
{
"epoch": 0.08,
"grad_norm": 3.765625,
"learning_rate": 4.5833333333333334e-05,
"loss": 2.6268,
"step": 211
},
{
"epoch": 0.08,
"grad_norm": 2.984375,
"learning_rate": 4.581358609794629e-05,
"loss": 2.547,
"step": 212
},
{
"epoch": 0.08,
"grad_norm": 1.9921875,
"learning_rate": 4.579383886255924e-05,
"loss": 2.5883,
"step": 213
},
{
"epoch": 0.08,
"grad_norm": 2.15625,
"learning_rate": 4.57740916271722e-05,
"loss": 2.3411,
"step": 214
},
{
"epoch": 0.08,
"grad_norm": 3.53125,
"learning_rate": 4.575434439178515e-05,
"loss": 2.7646,
"step": 215
},
{
"epoch": 0.09,
"grad_norm": 2.96875,
"learning_rate": 4.573459715639811e-05,
"loss": 2.7061,
"step": 216
},
{
"epoch": 0.09,
"grad_norm": 3.671875,
"learning_rate": 4.571484992101106e-05,
"loss": 2.6407,
"step": 217
},
{
"epoch": 0.09,
"grad_norm": 2.109375,
"learning_rate": 4.569510268562402e-05,
"loss": 2.5491,
"step": 218
},
{
"epoch": 0.09,
"grad_norm": 1.625,
"learning_rate": 4.5675355450236965e-05,
"loss": 2.5793,
"step": 219
},
{
"epoch": 0.09,
"grad_norm": 1.5546875,
"learning_rate": 4.565560821484992e-05,
"loss": 2.5309,
"step": 220
},
{
"epoch": 0.09,
"grad_norm": 2.21875,
"learning_rate": 4.563586097946288e-05,
"loss": 2.4251,
"step": 221
},
{
"epoch": 0.09,
"grad_norm": 2.140625,
"learning_rate": 4.5616113744075836e-05,
"loss": 2.9568,
"step": 222
},
{
"epoch": 0.09,
"grad_norm": 2.796875,
"learning_rate": 4.5596366508688784e-05,
"loss": 2.7043,
"step": 223
},
{
"epoch": 0.09,
"grad_norm": 2.90625,
"learning_rate": 4.557661927330174e-05,
"loss": 2.6535,
"step": 224
},
{
"epoch": 0.09,
"grad_norm": 2.171875,
"learning_rate": 4.555687203791469e-05,
"loss": 2.3791,
"step": 225
},
{
"epoch": 0.09,
"grad_norm": 1.7890625,
"learning_rate": 4.553712480252765e-05,
"loss": 2.8204,
"step": 226
},
{
"epoch": 0.09,
"grad_norm": 1.8515625,
"learning_rate": 4.55173775671406e-05,
"loss": 2.657,
"step": 227
},
{
"epoch": 0.09,
"grad_norm": 1.2265625,
"learning_rate": 4.549763033175356e-05,
"loss": 2.3554,
"step": 228
},
{
"epoch": 0.09,
"grad_norm": 1.890625,
"learning_rate": 4.5477883096366505e-05,
"loss": 2.7603,
"step": 229
},
{
"epoch": 0.09,
"grad_norm": 7.21875,
"learning_rate": 4.545813586097947e-05,
"loss": 2.5376,
"step": 230
},
{
"epoch": 0.09,
"grad_norm": 3.03125,
"learning_rate": 4.543838862559242e-05,
"loss": 2.4928,
"step": 231
},
{
"epoch": 0.09,
"grad_norm": 1.6953125,
"learning_rate": 4.5418641390205376e-05,
"loss": 2.4717,
"step": 232
},
{
"epoch": 0.09,
"grad_norm": 2.546875,
"learning_rate": 4.539889415481833e-05,
"loss": 2.7478,
"step": 233
},
{
"epoch": 0.09,
"grad_norm": 2.9375,
"learning_rate": 4.537914691943128e-05,
"loss": 2.3458,
"step": 234
},
{
"epoch": 0.09,
"grad_norm": 2.109375,
"learning_rate": 4.5359399684044234e-05,
"loss": 2.4139,
"step": 235
},
{
"epoch": 0.09,
"grad_norm": 2.359375,
"learning_rate": 4.533965244865719e-05,
"loss": 2.5769,
"step": 236
},
{
"epoch": 0.09,
"grad_norm": 2.15625,
"learning_rate": 4.531990521327014e-05,
"loss": 2.4584,
"step": 237
},
{
"epoch": 0.09,
"grad_norm": 3.703125,
"learning_rate": 4.53001579778831e-05,
"loss": 2.9022,
"step": 238
},
{
"epoch": 0.09,
"grad_norm": 2.890625,
"learning_rate": 4.528041074249605e-05,
"loss": 2.7332,
"step": 239
},
{
"epoch": 0.09,
"grad_norm": 1.7265625,
"learning_rate": 4.526066350710901e-05,
"loss": 2.6776,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 4.375,
"learning_rate": 4.524091627172196e-05,
"loss": 2.7312,
"step": 241
},
{
"epoch": 0.1,
"grad_norm": 4.4375,
"learning_rate": 4.522116903633492e-05,
"loss": 2.5245,
"step": 242
},
{
"epoch": 0.1,
"grad_norm": 1.921875,
"learning_rate": 4.520142180094787e-05,
"loss": 2.9007,
"step": 243
},
{
"epoch": 0.1,
"grad_norm": 2.78125,
"learning_rate": 4.518167456556082e-05,
"loss": 2.8348,
"step": 244
},
{
"epoch": 0.1,
"grad_norm": 2.5,
"learning_rate": 4.5161927330173774e-05,
"loss": 3.0287,
"step": 245
},
{
"epoch": 0.1,
"grad_norm": 2.53125,
"learning_rate": 4.5142180094786736e-05,
"loss": 2.5111,
"step": 246
},
{
"epoch": 0.1,
"grad_norm": 2.515625,
"learning_rate": 4.512243285939969e-05,
"loss": 2.5754,
"step": 247
},
{
"epoch": 0.1,
"grad_norm": 3.796875,
"learning_rate": 4.510268562401264e-05,
"loss": 2.6378,
"step": 248
},
{
"epoch": 0.1,
"grad_norm": 2.265625,
"learning_rate": 4.508293838862559e-05,
"loss": 2.7607,
"step": 249
},
{
"epoch": 0.1,
"grad_norm": 2.625,
"learning_rate": 4.506319115323855e-05,
"loss": 2.7698,
"step": 250
},
{
"epoch": 0.1,
"grad_norm": 2.546875,
"learning_rate": 4.50434439178515e-05,
"loss": 2.564,
"step": 251
},
{
"epoch": 0.1,
"grad_norm": 1.90625,
"learning_rate": 4.502369668246446e-05,
"loss": 2.6616,
"step": 252
},
{
"epoch": 0.1,
"grad_norm": 2.1875,
"learning_rate": 4.500394944707741e-05,
"loss": 2.7416,
"step": 253
},
{
"epoch": 0.1,
"grad_norm": 2.0625,
"learning_rate": 4.498420221169036e-05,
"loss": 2.7028,
"step": 254
},
{
"epoch": 0.1,
"grad_norm": 1.84375,
"learning_rate": 4.496445497630332e-05,
"loss": 2.9913,
"step": 255
},
{
"epoch": 0.1,
"grad_norm": 2.375,
"learning_rate": 4.4944707740916276e-05,
"loss": 2.6732,
"step": 256
},
{
"epoch": 0.1,
"grad_norm": 2.4375,
"learning_rate": 4.492496050552923e-05,
"loss": 2.2549,
"step": 257
},
{
"epoch": 0.1,
"grad_norm": 1.6015625,
"learning_rate": 4.4905213270142186e-05,
"loss": 2.5509,
"step": 258
},
{
"epoch": 0.1,
"grad_norm": 2.046875,
"learning_rate": 4.4885466034755133e-05,
"loss": 2.5506,
"step": 259
},
{
"epoch": 0.1,
"grad_norm": 1.9453125,
"learning_rate": 4.486571879936809e-05,
"loss": 2.4812,
"step": 260
},
{
"epoch": 0.1,
"grad_norm": 1.7890625,
"learning_rate": 4.484597156398104e-05,
"loss": 2.7874,
"step": 261
},
{
"epoch": 0.1,
"grad_norm": 3.171875,
"learning_rate": 4.4826224328594e-05,
"loss": 2.6254,
"step": 262
},
{
"epoch": 0.1,
"grad_norm": 1.890625,
"learning_rate": 4.480647709320695e-05,
"loss": 2.6186,
"step": 263
},
{
"epoch": 0.1,
"grad_norm": 3.25,
"learning_rate": 4.478672985781991e-05,
"loss": 2.7737,
"step": 264
},
{
"epoch": 0.1,
"grad_norm": 3.265625,
"learning_rate": 4.476698262243286e-05,
"loss": 2.7595,
"step": 265
},
{
"epoch": 0.11,
"grad_norm": 1.9296875,
"learning_rate": 4.4747235387045817e-05,
"loss": 2.6188,
"step": 266
},
{
"epoch": 0.11,
"grad_norm": 2.328125,
"learning_rate": 4.472748815165877e-05,
"loss": 2.8014,
"step": 267
},
{
"epoch": 0.11,
"grad_norm": 2.984375,
"learning_rate": 4.4707740916271726e-05,
"loss": 2.4501,
"step": 268
},
{
"epoch": 0.11,
"grad_norm": 1.8828125,
"learning_rate": 4.4687993680884674e-05,
"loss": 2.4877,
"step": 269
},
{
"epoch": 0.11,
"grad_norm": 2.640625,
"learning_rate": 4.466824644549763e-05,
"loss": 2.6551,
"step": 270
},
{
"epoch": 0.11,
"grad_norm": 2.25,
"learning_rate": 4.464849921011059e-05,
"loss": 2.6871,
"step": 271
},
{
"epoch": 0.11,
"grad_norm": 1.8984375,
"learning_rate": 4.4628751974723545e-05,
"loss": 2.3139,
"step": 272
},
{
"epoch": 0.11,
"grad_norm": 2.046875,
"learning_rate": 4.46090047393365e-05,
"loss": 2.4162,
"step": 273
},
{
"epoch": 0.11,
"grad_norm": 2.71875,
"learning_rate": 4.458925750394945e-05,
"loss": 2.3936,
"step": 274
},
{
"epoch": 0.11,
"grad_norm": 2.015625,
"learning_rate": 4.45695102685624e-05,
"loss": 2.3708,
"step": 275
},
{
"epoch": 0.11,
"grad_norm": 2.359375,
"learning_rate": 4.454976303317536e-05,
"loss": 2.6948,
"step": 276
},
{
"epoch": 0.11,
"grad_norm": 2.75,
"learning_rate": 4.453001579778831e-05,
"loss": 3.0324,
"step": 277
},
{
"epoch": 0.11,
"grad_norm": 2.375,
"learning_rate": 4.4510268562401266e-05,
"loss": 2.6217,
"step": 278
},
{
"epoch": 0.11,
"grad_norm": 1.5,
"learning_rate": 4.4490521327014214e-05,
"loss": 2.5678,
"step": 279
},
{
"epoch": 0.11,
"grad_norm": 2.046875,
"learning_rate": 4.4470774091627176e-05,
"loss": 2.382,
"step": 280
},
{
"epoch": 0.11,
"grad_norm": 1.7265625,
"learning_rate": 4.445102685624013e-05,
"loss": 2.5631,
"step": 281
},
{
"epoch": 0.11,
"grad_norm": 1.6328125,
"learning_rate": 4.4431279620853085e-05,
"loss": 2.4427,
"step": 282
},
{
"epoch": 0.11,
"grad_norm": 1.8671875,
"learning_rate": 4.441153238546604e-05,
"loss": 2.8178,
"step": 283
},
{
"epoch": 0.11,
"grad_norm": 3.515625,
"learning_rate": 4.439178515007899e-05,
"loss": 2.4941,
"step": 284
},
{
"epoch": 0.11,
"grad_norm": 1.8046875,
"learning_rate": 4.437203791469194e-05,
"loss": 2.6492,
"step": 285
},
{
"epoch": 0.11,
"grad_norm": 2.09375,
"learning_rate": 4.43522906793049e-05,
"loss": 2.63,
"step": 286
},
{
"epoch": 0.11,
"grad_norm": 1.78125,
"learning_rate": 4.433254344391785e-05,
"loss": 2.431,
"step": 287
},
{
"epoch": 0.11,
"grad_norm": 2.375,
"learning_rate": 4.431279620853081e-05,
"loss": 2.767,
"step": 288
},
{
"epoch": 0.11,
"grad_norm": 2.28125,
"learning_rate": 4.429304897314376e-05,
"loss": 2.5985,
"step": 289
},
{
"epoch": 0.11,
"grad_norm": 2.09375,
"learning_rate": 4.4273301737756716e-05,
"loss": 2.9324,
"step": 290
},
{
"epoch": 0.11,
"grad_norm": 2.03125,
"learning_rate": 4.425355450236967e-05,
"loss": 2.7975,
"step": 291
},
{
"epoch": 0.12,
"grad_norm": 1.6875,
"learning_rate": 4.4233807266982626e-05,
"loss": 2.6601,
"step": 292
},
{
"epoch": 0.12,
"grad_norm": 2.0625,
"learning_rate": 4.421406003159558e-05,
"loss": 2.8773,
"step": 293
},
{
"epoch": 0.12,
"grad_norm": 1.71875,
"learning_rate": 4.419431279620853e-05,
"loss": 2.3876,
"step": 294
},
{
"epoch": 0.12,
"grad_norm": 3.015625,
"learning_rate": 4.417456556082148e-05,
"loss": 2.5836,
"step": 295
},
{
"epoch": 0.12,
"grad_norm": 2.265625,
"learning_rate": 4.4154818325434445e-05,
"loss": 2.6567,
"step": 296
},
{
"epoch": 0.12,
"grad_norm": 2.734375,
"learning_rate": 4.41350710900474e-05,
"loss": 2.9493,
"step": 297
},
{
"epoch": 0.12,
"grad_norm": 3.171875,
"learning_rate": 4.4115323854660354e-05,
"loss": 2.7281,
"step": 298
},
{
"epoch": 0.12,
"grad_norm": 1.8984375,
"learning_rate": 4.40955766192733e-05,
"loss": 2.3735,
"step": 299
},
{
"epoch": 0.12,
"grad_norm": 3.03125,
"learning_rate": 4.407582938388626e-05,
"loss": 2.3528,
"step": 300
},
{
"epoch": 0.12,
"grad_norm": 2.3125,
"learning_rate": 4.405608214849921e-05,
"loss": 2.7345,
"step": 301
},
{
"epoch": 0.12,
"grad_norm": 2.03125,
"learning_rate": 4.4036334913112166e-05,
"loss": 2.615,
"step": 302
},
{
"epoch": 0.12,
"grad_norm": 2.03125,
"learning_rate": 4.401658767772512e-05,
"loss": 2.5159,
"step": 303
},
{
"epoch": 0.12,
"grad_norm": 2.171875,
"learning_rate": 4.399684044233807e-05,
"loss": 2.8403,
"step": 304
},
{
"epoch": 0.12,
"grad_norm": 2.640625,
"learning_rate": 4.397709320695103e-05,
"loss": 2.7985,
"step": 305
},
{
"epoch": 0.12,
"grad_norm": 2.09375,
"learning_rate": 4.3957345971563985e-05,
"loss": 2.6337,
"step": 306
},
{
"epoch": 0.12,
"grad_norm": 2.59375,
"learning_rate": 4.393759873617694e-05,
"loss": 2.825,
"step": 307
},
{
"epoch": 0.12,
"grad_norm": 2.296875,
"learning_rate": 4.3917851500789895e-05,
"loss": 2.4735,
"step": 308
},
{
"epoch": 0.12,
"grad_norm": 2.8125,
"learning_rate": 4.389810426540284e-05,
"loss": 2.8884,
"step": 309
},
{
"epoch": 0.12,
"grad_norm": 2.296875,
"learning_rate": 4.38783570300158e-05,
"loss": 2.8329,
"step": 310
},
{
"epoch": 0.12,
"grad_norm": 1.8125,
"learning_rate": 4.385860979462875e-05,
"loss": 2.5301,
"step": 311
},
{
"epoch": 0.12,
"grad_norm": 3.1875,
"learning_rate": 4.383886255924171e-05,
"loss": 2.5595,
"step": 312
},
{
"epoch": 0.12,
"grad_norm": 2.34375,
"learning_rate": 4.381911532385466e-05,
"loss": 2.3517,
"step": 313
},
{
"epoch": 0.12,
"grad_norm": 2.3125,
"learning_rate": 4.3799368088467616e-05,
"loss": 2.6969,
"step": 314
},
{
"epoch": 0.12,
"grad_norm": 1.5078125,
"learning_rate": 4.377962085308057e-05,
"loss": 2.3607,
"step": 315
},
{
"epoch": 0.12,
"grad_norm": 2.15625,
"learning_rate": 4.3759873617693526e-05,
"loss": 2.4543,
"step": 316
},
{
"epoch": 0.13,
"grad_norm": 2.078125,
"learning_rate": 4.374012638230648e-05,
"loss": 2.7003,
"step": 317
},
{
"epoch": 0.13,
"grad_norm": 2.203125,
"learning_rate": 4.3720379146919435e-05,
"loss": 2.7952,
"step": 318
},
{
"epoch": 0.13,
"grad_norm": 1.8671875,
"learning_rate": 4.370063191153238e-05,
"loss": 2.3314,
"step": 319
},
{
"epoch": 0.13,
"grad_norm": 2.125,
"learning_rate": 4.368088467614534e-05,
"loss": 2.5449,
"step": 320
},
{
"epoch": 0.13,
"grad_norm": 2.15625,
"learning_rate": 4.36611374407583e-05,
"loss": 2.6632,
"step": 321
},
{
"epoch": 0.13,
"grad_norm": 2.171875,
"learning_rate": 4.3641390205371254e-05,
"loss": 2.4701,
"step": 322
},
{
"epoch": 0.13,
"grad_norm": 2.875,
"learning_rate": 4.362164296998421e-05,
"loss": 2.8517,
"step": 323
},
{
"epoch": 0.13,
"grad_norm": 4.78125,
"learning_rate": 4.3601895734597157e-05,
"loss": 2.8037,
"step": 324
},
{
"epoch": 0.13,
"grad_norm": 2.578125,
"learning_rate": 4.358214849921011e-05,
"loss": 2.9109,
"step": 325
},
{
"epoch": 0.13,
"grad_norm": 2.234375,
"learning_rate": 4.3562401263823066e-05,
"loss": 2.6456,
"step": 326
},
{
"epoch": 0.13,
"grad_norm": 1.953125,
"learning_rate": 4.354265402843602e-05,
"loss": 2.7683,
"step": 327
},
{
"epoch": 0.13,
"grad_norm": 1.9453125,
"learning_rate": 4.3522906793048975e-05,
"loss": 2.8317,
"step": 328
},
{
"epoch": 0.13,
"grad_norm": 1.6640625,
"learning_rate": 4.350315955766193e-05,
"loss": 2.5437,
"step": 329
},
{
"epoch": 0.13,
"grad_norm": 2.359375,
"learning_rate": 4.3483412322274885e-05,
"loss": 2.6975,
"step": 330
},
{
"epoch": 0.13,
"grad_norm": 2.3125,
"learning_rate": 4.346366508688784e-05,
"loss": 2.6132,
"step": 331
},
{
"epoch": 0.13,
"grad_norm": 2.125,
"learning_rate": 4.3443917851500794e-05,
"loss": 3.0106,
"step": 332
},
{
"epoch": 0.13,
"grad_norm": 2.296875,
"learning_rate": 4.342417061611375e-05,
"loss": 2.6268,
"step": 333
},
{
"epoch": 0.13,
"grad_norm": 2.59375,
"learning_rate": 4.34044233807267e-05,
"loss": 2.6222,
"step": 334
},
{
"epoch": 0.13,
"grad_norm": 2.734375,
"learning_rate": 4.338467614533965e-05,
"loss": 2.8942,
"step": 335
},
{
"epoch": 0.13,
"grad_norm": 2.59375,
"learning_rate": 4.3364928909952606e-05,
"loss": 2.5479,
"step": 336
},
{
"epoch": 0.13,
"grad_norm": 3.046875,
"learning_rate": 4.334518167456556e-05,
"loss": 2.7458,
"step": 337
},
{
"epoch": 0.13,
"grad_norm": 1.984375,
"learning_rate": 4.332543443917852e-05,
"loss": 2.6692,
"step": 338
},
{
"epoch": 0.13,
"grad_norm": 1.984375,
"learning_rate": 4.330568720379147e-05,
"loss": 2.4748,
"step": 339
},
{
"epoch": 0.13,
"grad_norm": 12.375,
"learning_rate": 4.3285939968404425e-05,
"loss": 2.8305,
"step": 340
},
{
"epoch": 0.13,
"grad_norm": 2.609375,
"learning_rate": 4.326619273301738e-05,
"loss": 2.606,
"step": 341
},
{
"epoch": 0.14,
"grad_norm": 1.71875,
"learning_rate": 4.3246445497630335e-05,
"loss": 2.2643,
"step": 342
},
{
"epoch": 0.14,
"grad_norm": 2.015625,
"learning_rate": 4.322669826224329e-05,
"loss": 2.6267,
"step": 343
},
{
"epoch": 0.14,
"grad_norm": 2.65625,
"learning_rate": 4.320695102685624e-05,
"loss": 2.3359,
"step": 344
},
{
"epoch": 0.14,
"grad_norm": 2.40625,
"learning_rate": 4.318720379146919e-05,
"loss": 2.7255,
"step": 345
},
{
"epoch": 0.14,
"grad_norm": 1.6875,
"learning_rate": 4.3167456556082154e-05,
"loss": 2.3652,
"step": 346
},
{
"epoch": 0.14,
"grad_norm": 2.25,
"learning_rate": 4.314770932069511e-05,
"loss": 2.4905,
"step": 347
},
{
"epoch": 0.14,
"grad_norm": 1.609375,
"learning_rate": 4.312796208530806e-05,
"loss": 2.3889,
"step": 348
},
{
"epoch": 0.14,
"grad_norm": 1.96875,
"learning_rate": 4.310821484992101e-05,
"loss": 2.6173,
"step": 349
},
{
"epoch": 0.14,
"grad_norm": 2.234375,
"learning_rate": 4.3088467614533966e-05,
"loss": 2.4397,
"step": 350
},
{
"epoch": 0.14,
"grad_norm": 2.234375,
"learning_rate": 4.306872037914692e-05,
"loss": 2.5866,
"step": 351
},
{
"epoch": 0.14,
"grad_norm": 2.15625,
"learning_rate": 4.3048973143759875e-05,
"loss": 2.6347,
"step": 352
},
{
"epoch": 0.14,
"grad_norm": 1.859375,
"learning_rate": 4.302922590837283e-05,
"loss": 1.9974,
"step": 353
},
{
"epoch": 0.14,
"grad_norm": 2.734375,
"learning_rate": 4.3009478672985785e-05,
"loss": 2.6854,
"step": 354
},
{
"epoch": 0.14,
"grad_norm": 2.15625,
"learning_rate": 4.298973143759874e-05,
"loss": 2.4256,
"step": 355
},
{
"epoch": 0.14,
"grad_norm": 2.859375,
"learning_rate": 4.2969984202211694e-05,
"loss": 2.5534,
"step": 356
},
{
"epoch": 0.14,
"grad_norm": 2.03125,
"learning_rate": 4.295023696682465e-05,
"loss": 2.8615,
"step": 357
},
{
"epoch": 0.14,
"grad_norm": 2.1875,
"learning_rate": 4.2930489731437604e-05,
"loss": 2.851,
"step": 358
},
{
"epoch": 0.14,
"grad_norm": 2.0625,
"learning_rate": 4.291074249605055e-05,
"loss": 2.3993,
"step": 359
},
{
"epoch": 0.14,
"grad_norm": 1.84375,
"learning_rate": 4.2890995260663506e-05,
"loss": 2.3451,
"step": 360
},
{
"epoch": 0.14,
"grad_norm": 2.328125,
"learning_rate": 4.287124802527646e-05,
"loss": 2.5194,
"step": 361
},
{
"epoch": 0.14,
"grad_norm": 1.984375,
"learning_rate": 4.2851500789889416e-05,
"loss": 2.4847,
"step": 362
},
{
"epoch": 0.14,
"grad_norm": 2.328125,
"learning_rate": 4.283175355450238e-05,
"loss": 2.3711,
"step": 363
},
{
"epoch": 0.14,
"grad_norm": 3.109375,
"learning_rate": 4.2812006319115325e-05,
"loss": 2.5343,
"step": 364
},
{
"epoch": 0.14,
"grad_norm": 2.375,
"learning_rate": 4.279225908372828e-05,
"loss": 2.6558,
"step": 365
},
{
"epoch": 0.14,
"grad_norm": 2.609375,
"learning_rate": 4.2772511848341235e-05,
"loss": 2.5593,
"step": 366
},
{
"epoch": 0.14,
"grad_norm": 2.109375,
"learning_rate": 4.275276461295419e-05,
"loss": 2.4861,
"step": 367
},
{
"epoch": 0.15,
"grad_norm": 2.421875,
"learning_rate": 4.2733017377567144e-05,
"loss": 2.7165,
"step": 368
},
{
"epoch": 0.15,
"grad_norm": 2.15625,
"learning_rate": 4.271327014218009e-05,
"loss": 2.4084,
"step": 369
},
{
"epoch": 0.15,
"grad_norm": 1.8359375,
"learning_rate": 4.269352290679305e-05,
"loss": 2.6463,
"step": 370
},
{
"epoch": 0.15,
"grad_norm": 1.8984375,
"learning_rate": 4.267377567140601e-05,
"loss": 2.3626,
"step": 371
},
{
"epoch": 0.15,
"grad_norm": 2.421875,
"learning_rate": 4.265402843601896e-05,
"loss": 2.4366,
"step": 372
},
{
"epoch": 0.15,
"grad_norm": 2.828125,
"learning_rate": 4.263428120063192e-05,
"loss": 2.6347,
"step": 373
},
{
"epoch": 0.15,
"grad_norm": 2.765625,
"learning_rate": 4.2614533965244866e-05,
"loss": 2.4087,
"step": 374
},
{
"epoch": 0.15,
"grad_norm": 2.15625,
"learning_rate": 4.259478672985782e-05,
"loss": 2.3206,
"step": 375
},
{
"epoch": 0.15,
"grad_norm": 1.6328125,
"learning_rate": 4.2575039494470775e-05,
"loss": 2.3267,
"step": 376
},
{
"epoch": 0.15,
"grad_norm": 1.8203125,
"learning_rate": 4.255529225908373e-05,
"loss": 2.3858,
"step": 377
},
{
"epoch": 0.15,
"grad_norm": 4.21875,
"learning_rate": 4.2535545023696685e-05,
"loss": 2.5072,
"step": 378
},
{
"epoch": 0.15,
"grad_norm": 2.125,
"learning_rate": 4.251579778830964e-05,
"loss": 2.57,
"step": 379
},
{
"epoch": 0.15,
"grad_norm": 2.265625,
"learning_rate": 4.2496050552922594e-05,
"loss": 2.4605,
"step": 380
},
{
"epoch": 0.15,
"grad_norm": 2.921875,
"learning_rate": 4.247630331753555e-05,
"loss": 2.6674,
"step": 381
},
{
"epoch": 0.15,
"grad_norm": 2.265625,
"learning_rate": 4.2456556082148503e-05,
"loss": 2.4506,
"step": 382
},
{
"epoch": 0.15,
"grad_norm": 2.921875,
"learning_rate": 4.243680884676146e-05,
"loss": 2.3494,
"step": 383
},
{
"epoch": 0.15,
"grad_norm": 2.921875,
"learning_rate": 4.2417061611374406e-05,
"loss": 2.5204,
"step": 384
},
{
"epoch": 0.15,
"grad_norm": 3.125,
"learning_rate": 4.239731437598736e-05,
"loss": 2.5232,
"step": 385
},
{
"epoch": 0.15,
"grad_norm": 3.125,
"learning_rate": 4.2377567140600316e-05,
"loss": 2.4968,
"step": 386
},
{
"epoch": 0.15,
"grad_norm": 1.8984375,
"learning_rate": 4.235781990521327e-05,
"loss": 2.4083,
"step": 387
},
{
"epoch": 0.15,
"grad_norm": 3.3125,
"learning_rate": 4.233807266982623e-05,
"loss": 2.6494,
"step": 388
},
{
"epoch": 0.15,
"grad_norm": 2.8125,
"learning_rate": 4.231832543443918e-05,
"loss": 2.5001,
"step": 389
},
{
"epoch": 0.15,
"grad_norm": 2.0625,
"learning_rate": 4.2298578199052134e-05,
"loss": 2.4625,
"step": 390
},
{
"epoch": 0.15,
"grad_norm": 2.90625,
"learning_rate": 4.227883096366509e-05,
"loss": 2.3136,
"step": 391
},
{
"epoch": 0.15,
"grad_norm": 2.5,
"learning_rate": 4.2259083728278044e-05,
"loss": 2.5115,
"step": 392
},
{
"epoch": 0.16,
"grad_norm": 2.828125,
"learning_rate": 4.2239336492891e-05,
"loss": 2.3189,
"step": 393
},
{
"epoch": 0.16,
"grad_norm": 2.484375,
"learning_rate": 4.2219589257503947e-05,
"loss": 2.2746,
"step": 394
},
{
"epoch": 0.16,
"grad_norm": 2.640625,
"learning_rate": 4.21998420221169e-05,
"loss": 2.3084,
"step": 395
},
{
"epoch": 0.16,
"grad_norm": 2.4375,
"learning_rate": 4.218009478672986e-05,
"loss": 2.555,
"step": 396
},
{
"epoch": 0.16,
"grad_norm": 3.453125,
"learning_rate": 4.216034755134282e-05,
"loss": 2.8288,
"step": 397
},
{
"epoch": 0.16,
"grad_norm": 2.34375,
"learning_rate": 4.214060031595577e-05,
"loss": 2.7193,
"step": 398
},
{
"epoch": 0.16,
"grad_norm": 4.375,
"learning_rate": 4.212085308056872e-05,
"loss": 2.2882,
"step": 399
},
{
"epoch": 0.16,
"grad_norm": 2.40625,
"learning_rate": 4.2101105845181675e-05,
"loss": 2.5527,
"step": 400
},
{
"epoch": 0.16,
"grad_norm": 2.265625,
"learning_rate": 4.208135860979463e-05,
"loss": 2.7836,
"step": 401
},
{
"epoch": 0.16,
"grad_norm": 2.265625,
"learning_rate": 4.2061611374407584e-05,
"loss": 2.3036,
"step": 402
},
{
"epoch": 0.16,
"grad_norm": 2.375,
"learning_rate": 4.204186413902054e-05,
"loss": 2.3951,
"step": 403
},
{
"epoch": 0.16,
"grad_norm": 2.015625,
"learning_rate": 4.2022116903633494e-05,
"loss": 2.3308,
"step": 404
},
{
"epoch": 0.16,
"grad_norm": 3.203125,
"learning_rate": 4.200236966824645e-05,
"loss": 2.6641,
"step": 405
},
{
"epoch": 0.16,
"grad_norm": 1.8828125,
"learning_rate": 4.19826224328594e-05,
"loss": 2.5038,
"step": 406
},
{
"epoch": 0.16,
"grad_norm": 2.15625,
"learning_rate": 4.196287519747236e-05,
"loss": 2.5362,
"step": 407
},
{
"epoch": 0.16,
"grad_norm": 2.359375,
"learning_rate": 4.194312796208531e-05,
"loss": 2.4898,
"step": 408
},
{
"epoch": 0.16,
"grad_norm": 2.1875,
"learning_rate": 4.192338072669826e-05,
"loss": 2.3324,
"step": 409
},
{
"epoch": 0.16,
"grad_norm": 2.578125,
"learning_rate": 4.1903633491311215e-05,
"loss": 2.3356,
"step": 410
},
{
"epoch": 0.16,
"grad_norm": 2.640625,
"learning_rate": 4.188388625592417e-05,
"loss": 2.3605,
"step": 411
},
{
"epoch": 0.16,
"grad_norm": 2.421875,
"learning_rate": 4.1864139020537125e-05,
"loss": 2.337,
"step": 412
},
{
"epoch": 0.16,
"grad_norm": 4.71875,
"learning_rate": 4.1844391785150086e-05,
"loss": 2.2569,
"step": 413
},
{
"epoch": 0.16,
"grad_norm": 2.78125,
"learning_rate": 4.1824644549763034e-05,
"loss": 2.581,
"step": 414
},
{
"epoch": 0.16,
"grad_norm": 2.765625,
"learning_rate": 4.180489731437599e-05,
"loss": 2.6176,
"step": 415
},
{
"epoch": 0.16,
"grad_norm": 2.734375,
"learning_rate": 4.1785150078988944e-05,
"loss": 2.886,
"step": 416
},
{
"epoch": 0.16,
"grad_norm": 3.03125,
"learning_rate": 4.17654028436019e-05,
"loss": 2.4079,
"step": 417
},
{
"epoch": 0.17,
"grad_norm": 2.75,
"learning_rate": 4.174565560821485e-05,
"loss": 2.8401,
"step": 418
},
{
"epoch": 0.17,
"grad_norm": 15.1875,
"learning_rate": 4.17259083728278e-05,
"loss": 2.5026,
"step": 419
},
{
"epoch": 0.17,
"grad_norm": 3.0625,
"learning_rate": 4.1706161137440756e-05,
"loss": 2.5311,
"step": 420
},
{
"epoch": 0.17,
"grad_norm": 4.3125,
"learning_rate": 4.168641390205372e-05,
"loss": 2.3885,
"step": 421
},
{
"epoch": 0.17,
"grad_norm": 2.578125,
"learning_rate": 4.166666666666667e-05,
"loss": 2.3499,
"step": 422
},
{
"epoch": 0.17,
"grad_norm": 2.40625,
"learning_rate": 4.164691943127963e-05,
"loss": 2.4887,
"step": 423
},
{
"epoch": 0.17,
"grad_norm": 2.640625,
"learning_rate": 4.1627172195892575e-05,
"loss": 2.4074,
"step": 424
},
{
"epoch": 0.17,
"grad_norm": 2.359375,
"learning_rate": 4.160742496050553e-05,
"loss": 2.5399,
"step": 425
},
{
"epoch": 0.17,
"grad_norm": 2.71875,
"learning_rate": 4.1587677725118484e-05,
"loss": 2.5918,
"step": 426
},
{
"epoch": 0.17,
"grad_norm": 3.03125,
"learning_rate": 4.156793048973144e-05,
"loss": 2.6934,
"step": 427
},
{
"epoch": 0.17,
"grad_norm": 2.859375,
"learning_rate": 4.1548183254344394e-05,
"loss": 2.3809,
"step": 428
},
{
"epoch": 0.17,
"grad_norm": 2.890625,
"learning_rate": 4.152843601895735e-05,
"loss": 2.694,
"step": 429
},
{
"epoch": 0.17,
"grad_norm": 3.671875,
"learning_rate": 4.15086887835703e-05,
"loss": 2.2423,
"step": 430
},
{
"epoch": 0.17,
"grad_norm": 2.546875,
"learning_rate": 4.148894154818326e-05,
"loss": 2.2926,
"step": 431
},
{
"epoch": 0.17,
"grad_norm": 3.484375,
"learning_rate": 4.146919431279621e-05,
"loss": 2.3879,
"step": 432
},
{
"epoch": 0.17,
"grad_norm": 3.015625,
"learning_rate": 4.144944707740917e-05,
"loss": 2.3404,
"step": 433
},
{
"epoch": 0.17,
"grad_norm": 3.390625,
"learning_rate": 4.1429699842022115e-05,
"loss": 2.6603,
"step": 434
},
{
"epoch": 0.17,
"grad_norm": 3.296875,
"learning_rate": 4.140995260663507e-05,
"loss": 2.5506,
"step": 435
},
{
"epoch": 0.17,
"grad_norm": 3.390625,
"learning_rate": 4.1390205371248025e-05,
"loss": 2.1814,
"step": 436
},
{
"epoch": 0.17,
"grad_norm": 2.4375,
"learning_rate": 4.137045813586098e-05,
"loss": 2.3875,
"step": 437
},
{
"epoch": 0.17,
"grad_norm": 2.421875,
"learning_rate": 4.135071090047394e-05,
"loss": 2.4048,
"step": 438
},
{
"epoch": 0.17,
"grad_norm": 3.0,
"learning_rate": 4.133096366508689e-05,
"loss": 2.2644,
"step": 439
},
{
"epoch": 0.17,
"grad_norm": 2.640625,
"learning_rate": 4.1311216429699844e-05,
"loss": 2.3612,
"step": 440
},
{
"epoch": 0.17,
"grad_norm": 3.171875,
"learning_rate": 4.12914691943128e-05,
"loss": 2.4966,
"step": 441
},
{
"epoch": 0.17,
"grad_norm": 4.4375,
"learning_rate": 4.127172195892575e-05,
"loss": 2.5058,
"step": 442
},
{
"epoch": 0.17,
"grad_norm": 2.890625,
"learning_rate": 4.125197472353871e-05,
"loss": 2.4907,
"step": 443
},
{
"epoch": 0.18,
"grad_norm": 2.34375,
"learning_rate": 4.123222748815166e-05,
"loss": 2.4946,
"step": 444
},
{
"epoch": 0.18,
"grad_norm": 3.15625,
"learning_rate": 4.121248025276461e-05,
"loss": 2.3045,
"step": 445
},
{
"epoch": 0.18,
"grad_norm": 3.25,
"learning_rate": 4.119273301737757e-05,
"loss": 2.6534,
"step": 446
},
{
"epoch": 0.18,
"grad_norm": 2.390625,
"learning_rate": 4.1172985781990527e-05,
"loss": 2.6019,
"step": 447
},
{
"epoch": 0.18,
"grad_norm": 2.875,
"learning_rate": 4.115323854660348e-05,
"loss": 2.4286,
"step": 448
},
{
"epoch": 0.18,
"grad_norm": 1.984375,
"learning_rate": 4.113349131121643e-05,
"loss": 2.5032,
"step": 449
},
{
"epoch": 0.18,
"grad_norm": 2.21875,
"learning_rate": 4.1113744075829384e-05,
"loss": 2.1063,
"step": 450
},
{
"epoch": 0.18,
"grad_norm": 3.453125,
"learning_rate": 4.109399684044234e-05,
"loss": 2.1725,
"step": 451
},
{
"epoch": 0.18,
"grad_norm": 3.015625,
"learning_rate": 4.1074249605055293e-05,
"loss": 2.3036,
"step": 452
},
{
"epoch": 0.18,
"grad_norm": 2.671875,
"learning_rate": 4.105450236966825e-05,
"loss": 2.4136,
"step": 453
},
{
"epoch": 0.18,
"grad_norm": 2.6875,
"learning_rate": 4.10347551342812e-05,
"loss": 2.3596,
"step": 454
},
{
"epoch": 0.18,
"grad_norm": 3.09375,
"learning_rate": 4.101500789889416e-05,
"loss": 2.488,
"step": 455
},
{
"epoch": 0.18,
"grad_norm": 3.171875,
"learning_rate": 4.099526066350711e-05,
"loss": 2.3958,
"step": 456
},
{
"epoch": 0.18,
"grad_norm": 2.390625,
"learning_rate": 4.097551342812007e-05,
"loss": 2.5157,
"step": 457
},
{
"epoch": 0.18,
"grad_norm": 4.15625,
"learning_rate": 4.095576619273302e-05,
"loss": 2.7146,
"step": 458
},
{
"epoch": 0.18,
"grad_norm": 3.15625,
"learning_rate": 4.093601895734597e-05,
"loss": 2.3183,
"step": 459
},
{
"epoch": 0.18,
"grad_norm": 3.03125,
"learning_rate": 4.0916271721958924e-05,
"loss": 2.5069,
"step": 460
},
{
"epoch": 0.18,
"grad_norm": 3.375,
"learning_rate": 4.089652448657188e-05,
"loss": 2.0991,
"step": 461
},
{
"epoch": 0.18,
"grad_norm": 3.703125,
"learning_rate": 4.0876777251184834e-05,
"loss": 2.2417,
"step": 462
},
{
"epoch": 0.18,
"grad_norm": 4.03125,
"learning_rate": 4.0857030015797795e-05,
"loss": 2.2325,
"step": 463
},
{
"epoch": 0.18,
"grad_norm": 3.328125,
"learning_rate": 4.083728278041074e-05,
"loss": 2.5876,
"step": 464
},
{
"epoch": 0.18,
"grad_norm": 2.9375,
"learning_rate": 4.08175355450237e-05,
"loss": 2.2741,
"step": 465
},
{
"epoch": 0.18,
"grad_norm": 4.03125,
"learning_rate": 4.079778830963665e-05,
"loss": 2.2923,
"step": 466
},
{
"epoch": 0.18,
"grad_norm": 2.859375,
"learning_rate": 4.077804107424961e-05,
"loss": 2.4178,
"step": 467
},
{
"epoch": 0.18,
"grad_norm": 3.625,
"learning_rate": 4.075829383886256e-05,
"loss": 2.3001,
"step": 468
},
{
"epoch": 0.19,
"grad_norm": 3.359375,
"learning_rate": 4.073854660347552e-05,
"loss": 2.6521,
"step": 469
},
{
"epoch": 0.19,
"grad_norm": 2.515625,
"learning_rate": 4.0718799368088465e-05,
"loss": 2.1029,
"step": 470
},
{
"epoch": 0.19,
"grad_norm": 3.671875,
"learning_rate": 4.0699052132701426e-05,
"loss": 2.2849,
"step": 471
},
{
"epoch": 0.19,
"grad_norm": 3.3125,
"learning_rate": 4.067930489731438e-05,
"loss": 2.2918,
"step": 472
},
{
"epoch": 0.19,
"grad_norm": 5.84375,
"learning_rate": 4.0659557661927336e-05,
"loss": 2.47,
"step": 473
},
{
"epoch": 0.19,
"grad_norm": 2.6875,
"learning_rate": 4.0639810426540284e-05,
"loss": 2.2221,
"step": 474
},
{
"epoch": 0.19,
"grad_norm": 6.25,
"learning_rate": 4.062006319115324e-05,
"loss": 2.2914,
"step": 475
},
{
"epoch": 0.19,
"grad_norm": 2.65625,
"learning_rate": 4.060031595576619e-05,
"loss": 2.2545,
"step": 476
},
{
"epoch": 0.19,
"grad_norm": 2.5,
"learning_rate": 4.058056872037915e-05,
"loss": 2.2082,
"step": 477
},
{
"epoch": 0.19,
"grad_norm": 3.203125,
"learning_rate": 4.05608214849921e-05,
"loss": 2.3806,
"step": 478
},
{
"epoch": 0.19,
"grad_norm": 2.96875,
"learning_rate": 4.054107424960506e-05,
"loss": 2.3925,
"step": 479
},
{
"epoch": 0.19,
"grad_norm": 2.921875,
"learning_rate": 4.052132701421801e-05,
"loss": 2.1954,
"step": 480
},
{
"epoch": 0.19,
"grad_norm": 2.96875,
"learning_rate": 4.050157977883097e-05,
"loss": 2.4841,
"step": 481
},
{
"epoch": 0.19,
"grad_norm": 4.03125,
"learning_rate": 4.048183254344392e-05,
"loss": 2.496,
"step": 482
},
{
"epoch": 0.19,
"grad_norm": 4.0625,
"learning_rate": 4.0462085308056876e-05,
"loss": 2.498,
"step": 483
},
{
"epoch": 0.19,
"grad_norm": 5.8125,
"learning_rate": 4.0442338072669824e-05,
"loss": 2.6433,
"step": 484
},
{
"epoch": 0.19,
"grad_norm": 9.1875,
"learning_rate": 4.042259083728278e-05,
"loss": 2.2574,
"step": 485
},
{
"epoch": 0.19,
"grad_norm": 4.53125,
"learning_rate": 4.0402843601895734e-05,
"loss": 2.1073,
"step": 486
},
{
"epoch": 0.19,
"grad_norm": 8.4375,
"learning_rate": 4.038309636650869e-05,
"loss": 2.3932,
"step": 487
},
{
"epoch": 0.19,
"grad_norm": 3.015625,
"learning_rate": 4.036334913112165e-05,
"loss": 2.5867,
"step": 488
},
{
"epoch": 0.19,
"grad_norm": 3.390625,
"learning_rate": 4.03436018957346e-05,
"loss": 2.2009,
"step": 489
},
{
"epoch": 0.19,
"grad_norm": 4.40625,
"learning_rate": 4.032385466034755e-05,
"loss": 2.4256,
"step": 490
},
{
"epoch": 0.19,
"grad_norm": 3.578125,
"learning_rate": 4.030410742496051e-05,
"loss": 2.4502,
"step": 491
},
{
"epoch": 0.19,
"grad_norm": 4.03125,
"learning_rate": 4.028436018957346e-05,
"loss": 2.3308,
"step": 492
},
{
"epoch": 0.19,
"grad_norm": 2.796875,
"learning_rate": 4.026461295418642e-05,
"loss": 2.5512,
"step": 493
},
{
"epoch": 0.2,
"grad_norm": 3.25,
"learning_rate": 4.024486571879937e-05,
"loss": 2.2675,
"step": 494
},
{
"epoch": 0.2,
"grad_norm": 8.4375,
"learning_rate": 4.022511848341232e-05,
"loss": 2.506,
"step": 495
},
{
"epoch": 0.2,
"grad_norm": 3.71875,
"learning_rate": 4.020537124802528e-05,
"loss": 2.2324,
"step": 496
},
{
"epoch": 0.2,
"grad_norm": 3.015625,
"learning_rate": 4.0185624012638236e-05,
"loss": 2.7745,
"step": 497
},
{
"epoch": 0.2,
"grad_norm": 3.28125,
"learning_rate": 4.016587677725119e-05,
"loss": 2.3202,
"step": 498
},
{
"epoch": 0.2,
"grad_norm": 2.90625,
"learning_rate": 4.014612954186414e-05,
"loss": 2.225,
"step": 499
},
{
"epoch": 0.2,
"grad_norm": 2.953125,
"learning_rate": 4.012638230647709e-05,
"loss": 2.337,
"step": 500
},
{
"epoch": 0.2,
"grad_norm": 2.484375,
"learning_rate": 4.010663507109005e-05,
"loss": 2.5802,
"step": 501
},
{
"epoch": 0.2,
"grad_norm": 7.65625,
"learning_rate": 4.0086887835703e-05,
"loss": 2.711,
"step": 502
},
{
"epoch": 0.2,
"grad_norm": 3.03125,
"learning_rate": 4.006714060031596e-05,
"loss": 2.0939,
"step": 503
},
{
"epoch": 0.2,
"grad_norm": 3.375,
"learning_rate": 4.004739336492891e-05,
"loss": 2.5338,
"step": 504
},
{
"epoch": 0.2,
"grad_norm": 3.515625,
"learning_rate": 4.002764612954187e-05,
"loss": 2.3941,
"step": 505
},
{
"epoch": 0.2,
"grad_norm": 4.46875,
"learning_rate": 4.000789889415482e-05,
"loss": 2.2835,
"step": 506
},
{
"epoch": 0.2,
"grad_norm": 6.875,
"learning_rate": 3.9988151658767776e-05,
"loss": 2.3208,
"step": 507
},
{
"epoch": 0.2,
"grad_norm": 4.65625,
"learning_rate": 3.996840442338073e-05,
"loss": 2.133,
"step": 508
},
{
"epoch": 0.2,
"grad_norm": 5.65625,
"learning_rate": 3.9948657187993686e-05,
"loss": 2.4787,
"step": 509
},
{
"epoch": 0.2,
"grad_norm": 4.125,
"learning_rate": 3.9928909952606634e-05,
"loss": 2.3726,
"step": 510
},
{
"epoch": 0.2,
"grad_norm": 4.03125,
"learning_rate": 3.990916271721959e-05,
"loss": 2.0511,
"step": 511
},
{
"epoch": 0.2,
"grad_norm": 5.6875,
"learning_rate": 3.988941548183254e-05,
"loss": 2.5793,
"step": 512
},
{
"epoch": 0.2,
"grad_norm": 3.703125,
"learning_rate": 3.9869668246445504e-05,
"loss": 2.2887,
"step": 513
},
{
"epoch": 0.2,
"grad_norm": 3.015625,
"learning_rate": 3.984992101105845e-05,
"loss": 2.4867,
"step": 514
},
{
"epoch": 0.2,
"grad_norm": 2.953125,
"learning_rate": 3.983017377567141e-05,
"loss": 2.1657,
"step": 515
},
{
"epoch": 0.2,
"grad_norm": 2.984375,
"learning_rate": 3.981042654028436e-05,
"loss": 2.2717,
"step": 516
},
{
"epoch": 0.2,
"grad_norm": 3.046875,
"learning_rate": 3.9790679304897317e-05,
"loss": 2.2984,
"step": 517
},
{
"epoch": 0.2,
"grad_norm": 2.671875,
"learning_rate": 3.977093206951027e-05,
"loss": 2.3288,
"step": 518
},
{
"epoch": 0.2,
"grad_norm": 5.71875,
"learning_rate": 3.9751184834123226e-05,
"loss": 2.1387,
"step": 519
},
{
"epoch": 0.21,
"grad_norm": 3.4375,
"learning_rate": 3.9731437598736174e-05,
"loss": 2.2586,
"step": 520
},
{
"epoch": 0.21,
"grad_norm": 2.515625,
"learning_rate": 3.9711690363349135e-05,
"loss": 2.541,
"step": 521
},
{
"epoch": 0.21,
"grad_norm": 4.96875,
"learning_rate": 3.969194312796209e-05,
"loss": 2.1501,
"step": 522
},
{
"epoch": 0.21,
"grad_norm": 3.875,
"learning_rate": 3.9672195892575045e-05,
"loss": 2.1635,
"step": 523
},
{
"epoch": 0.21,
"grad_norm": 3.171875,
"learning_rate": 3.965244865718799e-05,
"loss": 2.5401,
"step": 524
},
{
"epoch": 0.21,
"grad_norm": 3.890625,
"learning_rate": 3.963270142180095e-05,
"loss": 2.4266,
"step": 525
},
{
"epoch": 0.21,
"grad_norm": 3.6875,
"learning_rate": 3.96129541864139e-05,
"loss": 2.4045,
"step": 526
},
{
"epoch": 0.21,
"grad_norm": 3.78125,
"learning_rate": 3.959320695102686e-05,
"loss": 2.6711,
"step": 527
},
{
"epoch": 0.21,
"grad_norm": 3.140625,
"learning_rate": 3.957345971563981e-05,
"loss": 2.092,
"step": 528
},
{
"epoch": 0.21,
"grad_norm": 3.375,
"learning_rate": 3.9553712480252766e-05,
"loss": 2.5062,
"step": 529
},
{
"epoch": 0.21,
"grad_norm": 3.03125,
"learning_rate": 3.953396524486572e-05,
"loss": 2.3299,
"step": 530
},
{
"epoch": 0.21,
"grad_norm": 3.3125,
"learning_rate": 3.9514218009478676e-05,
"loss": 2.2565,
"step": 531
},
{
"epoch": 0.21,
"grad_norm": 3.625,
"learning_rate": 3.949447077409163e-05,
"loss": 2.4908,
"step": 532
},
{
"epoch": 0.21,
"grad_norm": 3.15625,
"learning_rate": 3.9474723538704585e-05,
"loss": 2.2775,
"step": 533
},
{
"epoch": 0.21,
"grad_norm": 3.9375,
"learning_rate": 3.945497630331754e-05,
"loss": 2.3886,
"step": 534
},
{
"epoch": 0.21,
"grad_norm": 5.0,
"learning_rate": 3.943522906793049e-05,
"loss": 2.338,
"step": 535
},
{
"epoch": 0.21,
"grad_norm": 3.046875,
"learning_rate": 3.941548183254344e-05,
"loss": 2.1906,
"step": 536
},
{
"epoch": 0.21,
"grad_norm": 3.09375,
"learning_rate": 3.93957345971564e-05,
"loss": 2.4091,
"step": 537
},
{
"epoch": 0.21,
"grad_norm": 3.203125,
"learning_rate": 3.937598736176936e-05,
"loss": 2.4189,
"step": 538
},
{
"epoch": 0.21,
"grad_norm": 5.21875,
"learning_rate": 3.935624012638231e-05,
"loss": 2.3602,
"step": 539
},
{
"epoch": 0.21,
"grad_norm": 3.625,
"learning_rate": 3.933649289099526e-05,
"loss": 2.1972,
"step": 540
},
{
"epoch": 0.21,
"grad_norm": 6.78125,
"learning_rate": 3.9316745655608216e-05,
"loss": 2.1952,
"step": 541
},
{
"epoch": 0.21,
"grad_norm": 3.421875,
"learning_rate": 3.929699842022117e-05,
"loss": 2.1312,
"step": 542
},
{
"epoch": 0.21,
"grad_norm": 3.21875,
"learning_rate": 3.9277251184834126e-05,
"loss": 2.5984,
"step": 543
},
{
"epoch": 0.21,
"grad_norm": 3.984375,
"learning_rate": 3.925750394944708e-05,
"loss": 2.2538,
"step": 544
},
{
"epoch": 0.22,
"grad_norm": 2.9375,
"learning_rate": 3.923775671406003e-05,
"loss": 2.2859,
"step": 545
},
{
"epoch": 0.22,
"grad_norm": 2.984375,
"learning_rate": 3.921800947867299e-05,
"loss": 2.2972,
"step": 546
},
{
"epoch": 0.22,
"grad_norm": 2.890625,
"learning_rate": 3.9198262243285945e-05,
"loss": 2.1214,
"step": 547
},
{
"epoch": 0.22,
"grad_norm": 2.953125,
"learning_rate": 3.91785150078989e-05,
"loss": 2.0236,
"step": 548
},
{
"epoch": 0.22,
"grad_norm": 3.4375,
"learning_rate": 3.915876777251185e-05,
"loss": 2.4084,
"step": 549
},
{
"epoch": 0.22,
"grad_norm": 4.21875,
"learning_rate": 3.91390205371248e-05,
"loss": 2.292,
"step": 550
},
{
"epoch": 0.22,
"grad_norm": 4.4375,
"learning_rate": 3.911927330173776e-05,
"loss": 2.4264,
"step": 551
},
{
"epoch": 0.22,
"grad_norm": 2.828125,
"learning_rate": 3.909952606635071e-05,
"loss": 2.0862,
"step": 552
},
{
"epoch": 0.22,
"grad_norm": 3.78125,
"learning_rate": 3.9079778830963666e-05,
"loss": 2.1168,
"step": 553
},
{
"epoch": 0.22,
"grad_norm": 4.5,
"learning_rate": 3.906003159557662e-05,
"loss": 2.3743,
"step": 554
},
{
"epoch": 0.22,
"grad_norm": 3.8125,
"learning_rate": 3.9040284360189576e-05,
"loss": 1.9906,
"step": 555
},
{
"epoch": 0.22,
"grad_norm": 3.765625,
"learning_rate": 3.902053712480253e-05,
"loss": 2.1569,
"step": 556
},
{
"epoch": 0.22,
"grad_norm": 2.25,
"learning_rate": 3.9000789889415485e-05,
"loss": 2.2389,
"step": 557
},
{
"epoch": 0.22,
"grad_norm": 4.0,
"learning_rate": 3.898104265402844e-05,
"loss": 2.1058,
"step": 558
},
{
"epoch": 0.22,
"grad_norm": 3.125,
"learning_rate": 3.8961295418641395e-05,
"loss": 1.9809,
"step": 559
},
{
"epoch": 0.22,
"grad_norm": 3.71875,
"learning_rate": 3.894154818325434e-05,
"loss": 2.1415,
"step": 560
},
{
"epoch": 0.22,
"grad_norm": 3.25,
"learning_rate": 3.89218009478673e-05,
"loss": 2.0531,
"step": 561
},
{
"epoch": 0.22,
"grad_norm": 3.296875,
"learning_rate": 3.890205371248025e-05,
"loss": 2.0952,
"step": 562
},
{
"epoch": 0.22,
"grad_norm": 4.625,
"learning_rate": 3.8882306477093214e-05,
"loss": 2.1998,
"step": 563
},
{
"epoch": 0.22,
"grad_norm": 2.609375,
"learning_rate": 3.886255924170616e-05,
"loss": 1.9719,
"step": 564
},
{
"epoch": 0.22,
"grad_norm": 4.40625,
"learning_rate": 3.8842812006319116e-05,
"loss": 2.259,
"step": 565
},
{
"epoch": 0.22,
"grad_norm": 3.296875,
"learning_rate": 3.882306477093207e-05,
"loss": 2.3311,
"step": 566
},
{
"epoch": 0.22,
"grad_norm": 3.140625,
"learning_rate": 3.8803317535545026e-05,
"loss": 2.6632,
"step": 567
},
{
"epoch": 0.22,
"grad_norm": 2.734375,
"learning_rate": 3.878357030015798e-05,
"loss": 2.6106,
"step": 568
},
{
"epoch": 0.22,
"grad_norm": 3.578125,
"learning_rate": 3.8763823064770935e-05,
"loss": 2.6253,
"step": 569
},
{
"epoch": 0.23,
"grad_norm": 3.765625,
"learning_rate": 3.874407582938388e-05,
"loss": 2.3619,
"step": 570
},
{
"epoch": 0.23,
"grad_norm": 2.703125,
"learning_rate": 3.8724328593996845e-05,
"loss": 2.1297,
"step": 571
},
{
"epoch": 0.23,
"grad_norm": 3.015625,
"learning_rate": 3.87045813586098e-05,
"loss": 2.5538,
"step": 572
},
{
"epoch": 0.23,
"grad_norm": 3.140625,
"learning_rate": 3.8684834123222754e-05,
"loss": 2.3877,
"step": 573
},
{
"epoch": 0.23,
"grad_norm": 3.375,
"learning_rate": 3.866508688783571e-05,
"loss": 2.3093,
"step": 574
},
{
"epoch": 0.23,
"grad_norm": 5.0625,
"learning_rate": 3.864533965244866e-05,
"loss": 2.0965,
"step": 575
},
{
"epoch": 0.23,
"grad_norm": 3.03125,
"learning_rate": 3.862559241706161e-05,
"loss": 2.72,
"step": 576
},
{
"epoch": 0.23,
"grad_norm": 3.15625,
"learning_rate": 3.8605845181674566e-05,
"loss": 2.0347,
"step": 577
},
{
"epoch": 0.23,
"grad_norm": 5.375,
"learning_rate": 3.858609794628752e-05,
"loss": 2.2085,
"step": 578
},
{
"epoch": 0.23,
"grad_norm": 2.984375,
"learning_rate": 3.8566350710900476e-05,
"loss": 2.512,
"step": 579
},
{
"epoch": 0.23,
"grad_norm": 2.9375,
"learning_rate": 3.854660347551343e-05,
"loss": 1.9583,
"step": 580
},
{
"epoch": 0.23,
"grad_norm": 3.71875,
"learning_rate": 3.8526856240126385e-05,
"loss": 2.2079,
"step": 581
},
{
"epoch": 0.23,
"grad_norm": 6.5625,
"learning_rate": 3.850710900473934e-05,
"loss": 2.1276,
"step": 582
},
{
"epoch": 0.23,
"grad_norm": 4.03125,
"learning_rate": 3.8487361769352294e-05,
"loss": 2.4395,
"step": 583
},
{
"epoch": 0.23,
"grad_norm": 2.6875,
"learning_rate": 3.846761453396525e-05,
"loss": 2.1285,
"step": 584
},
{
"epoch": 0.23,
"grad_norm": 3.34375,
"learning_rate": 3.84478672985782e-05,
"loss": 2.23,
"step": 585
},
{
"epoch": 0.23,
"grad_norm": 4.15625,
"learning_rate": 3.842812006319115e-05,
"loss": 2.5419,
"step": 586
},
{
"epoch": 0.23,
"grad_norm": 3.375,
"learning_rate": 3.8408372827804107e-05,
"loss": 2.3108,
"step": 587
},
{
"epoch": 0.23,
"grad_norm": 4.5625,
"learning_rate": 3.838862559241707e-05,
"loss": 2.1337,
"step": 588
},
{
"epoch": 0.23,
"grad_norm": 3.890625,
"learning_rate": 3.8368878357030016e-05,
"loss": 2.2746,
"step": 589
},
{
"epoch": 0.23,
"grad_norm": 5.34375,
"learning_rate": 3.834913112164297e-05,
"loss": 2.3126,
"step": 590
},
{
"epoch": 0.23,
"grad_norm": 3.171875,
"learning_rate": 3.8329383886255925e-05,
"loss": 2.0393,
"step": 591
},
{
"epoch": 0.23,
"grad_norm": 3.953125,
"learning_rate": 3.830963665086888e-05,
"loss": 2.3325,
"step": 592
},
{
"epoch": 0.23,
"grad_norm": 2.890625,
"learning_rate": 3.8289889415481835e-05,
"loss": 2.3965,
"step": 593
},
{
"epoch": 0.23,
"grad_norm": 5.46875,
"learning_rate": 3.827014218009479e-05,
"loss": 2.4254,
"step": 594
},
{
"epoch": 0.23,
"grad_norm": 5.21875,
"learning_rate": 3.825039494470774e-05,
"loss": 2.0849,
"step": 595
},
{
"epoch": 0.24,
"grad_norm": 2.9375,
"learning_rate": 3.82306477093207e-05,
"loss": 2.517,
"step": 596
},
{
"epoch": 0.24,
"grad_norm": 3.9375,
"learning_rate": 3.8210900473933654e-05,
"loss": 2.1396,
"step": 597
},
{
"epoch": 0.24,
"grad_norm": 4.03125,
"learning_rate": 3.819115323854661e-05,
"loss": 2.1152,
"step": 598
},
{
"epoch": 0.24,
"grad_norm": 4.15625,
"learning_rate": 3.817140600315956e-05,
"loss": 2.3012,
"step": 599
},
{
"epoch": 0.24,
"grad_norm": 3.125,
"learning_rate": 3.815165876777251e-05,
"loss": 2.114,
"step": 600
},
{
"epoch": 0.24,
"grad_norm": 4.5625,
"learning_rate": 3.8131911532385466e-05,
"loss": 2.3197,
"step": 601
},
{
"epoch": 0.24,
"grad_norm": 3.296875,
"learning_rate": 3.811216429699842e-05,
"loss": 2.2122,
"step": 602
},
{
"epoch": 0.24,
"grad_norm": 2.515625,
"learning_rate": 3.8092417061611375e-05,
"loss": 1.9243,
"step": 603
},
{
"epoch": 0.24,
"grad_norm": 3.484375,
"learning_rate": 3.807266982622433e-05,
"loss": 2.3903,
"step": 604
},
{
"epoch": 0.24,
"grad_norm": 2.734375,
"learning_rate": 3.8052922590837285e-05,
"loss": 2.2106,
"step": 605
},
{
"epoch": 0.24,
"grad_norm": 4.78125,
"learning_rate": 3.803317535545024e-05,
"loss": 2.1726,
"step": 606
},
{
"epoch": 0.24,
"grad_norm": 2.390625,
"learning_rate": 3.8013428120063194e-05,
"loss": 2.0824,
"step": 607
},
{
"epoch": 0.24,
"grad_norm": 4.75,
"learning_rate": 3.799368088467615e-05,
"loss": 2.2388,
"step": 608
},
{
"epoch": 0.24,
"grad_norm": 3.03125,
"learning_rate": 3.7973933649289104e-05,
"loss": 2.3154,
"step": 609
},
{
"epoch": 0.24,
"grad_norm": 3.1875,
"learning_rate": 3.795418641390205e-05,
"loss": 2.4537,
"step": 610
},
{
"epoch": 0.24,
"grad_norm": 5.78125,
"learning_rate": 3.7934439178515006e-05,
"loss": 2.3807,
"step": 611
},
{
"epoch": 0.24,
"grad_norm": 2.84375,
"learning_rate": 3.791469194312796e-05,
"loss": 2.3659,
"step": 612
},
{
"epoch": 0.24,
"grad_norm": 3.96875,
"learning_rate": 3.789494470774092e-05,
"loss": 2.2103,
"step": 613
},
{
"epoch": 0.24,
"grad_norm": 4.34375,
"learning_rate": 3.787519747235387e-05,
"loss": 2.0942,
"step": 614
},
{
"epoch": 0.24,
"grad_norm": 3.96875,
"learning_rate": 3.7855450236966825e-05,
"loss": 2.416,
"step": 615
},
{
"epoch": 0.24,
"grad_norm": 2.578125,
"learning_rate": 3.783570300157978e-05,
"loss": 2.1004,
"step": 616
},
{
"epoch": 0.24,
"grad_norm": 2.9375,
"learning_rate": 3.7815955766192735e-05,
"loss": 2.2114,
"step": 617
},
{
"epoch": 0.24,
"grad_norm": 2.984375,
"learning_rate": 3.779620853080569e-05,
"loss": 2.1893,
"step": 618
},
{
"epoch": 0.24,
"grad_norm": 4.0,
"learning_rate": 3.7776461295418644e-05,
"loss": 2.3665,
"step": 619
},
{
"epoch": 0.24,
"grad_norm": 3.546875,
"learning_rate": 3.775671406003159e-05,
"loss": 2.2709,
"step": 620
},
{
"epoch": 0.25,
"grad_norm": 2.515625,
"learning_rate": 3.7736966824644554e-05,
"loss": 2.291,
"step": 621
},
{
"epoch": 0.25,
"grad_norm": 4.375,
"learning_rate": 3.771721958925751e-05,
"loss": 2.2977,
"step": 622
},
{
"epoch": 0.25,
"grad_norm": 4.4375,
"learning_rate": 3.769747235387046e-05,
"loss": 2.1841,
"step": 623
},
{
"epoch": 0.25,
"grad_norm": 5.0,
"learning_rate": 3.767772511848342e-05,
"loss": 2.102,
"step": 624
},
{
"epoch": 0.25,
"grad_norm": 3.4375,
"learning_rate": 3.7657977883096366e-05,
"loss": 1.8899,
"step": 625
},
{
"epoch": 0.25,
"grad_norm": 3.109375,
"learning_rate": 3.763823064770932e-05,
"loss": 2.0651,
"step": 626
},
{
"epoch": 0.25,
"grad_norm": 4.28125,
"learning_rate": 3.7618483412322275e-05,
"loss": 2.3477,
"step": 627
},
{
"epoch": 0.25,
"grad_norm": 3.453125,
"learning_rate": 3.759873617693523e-05,
"loss": 2.0798,
"step": 628
},
{
"epoch": 0.25,
"grad_norm": 3.140625,
"learning_rate": 3.7578988941548185e-05,
"loss": 2.4127,
"step": 629
},
{
"epoch": 0.25,
"grad_norm": 3.078125,
"learning_rate": 3.755924170616114e-05,
"loss": 2.247,
"step": 630
},
{
"epoch": 0.25,
"grad_norm": 4.125,
"learning_rate": 3.7539494470774094e-05,
"loss": 2.2454,
"step": 631
},
{
"epoch": 0.25,
"grad_norm": 2.828125,
"learning_rate": 3.751974723538705e-05,
"loss": 2.1944,
"step": 632
},
{
"epoch": 0.25,
"grad_norm": 3.921875,
"learning_rate": 3.7500000000000003e-05,
"loss": 2.315,
"step": 633
},
{
"epoch": 0.25,
"grad_norm": 3.140625,
"learning_rate": 3.748025276461296e-05,
"loss": 2.321,
"step": 634
},
{
"epoch": 0.25,
"grad_norm": 2.953125,
"learning_rate": 3.7460505529225906e-05,
"loss": 2.2275,
"step": 635
},
{
"epoch": 0.25,
"grad_norm": 5.34375,
"learning_rate": 3.744075829383886e-05,
"loss": 2.2846,
"step": 636
},
{
"epoch": 0.25,
"grad_norm": 2.9375,
"learning_rate": 3.7421011058451816e-05,
"loss": 2.0719,
"step": 637
},
{
"epoch": 0.25,
"grad_norm": 3.015625,
"learning_rate": 3.740126382306478e-05,
"loss": 2.391,
"step": 638
},
{
"epoch": 0.25,
"grad_norm": 4.1875,
"learning_rate": 3.738151658767773e-05,
"loss": 2.2538,
"step": 639
},
{
"epoch": 0.25,
"grad_norm": 4.3125,
"learning_rate": 3.736176935229068e-05,
"loss": 2.3326,
"step": 640
},
{
"epoch": 0.25,
"grad_norm": 3.75,
"learning_rate": 3.7342022116903635e-05,
"loss": 2.2493,
"step": 641
},
{
"epoch": 0.25,
"grad_norm": 3.0,
"learning_rate": 3.732227488151659e-05,
"loss": 1.8393,
"step": 642
},
{
"epoch": 0.25,
"grad_norm": 3.828125,
"learning_rate": 3.7302527646129544e-05,
"loss": 2.1447,
"step": 643
},
{
"epoch": 0.25,
"grad_norm": 4.46875,
"learning_rate": 3.72827804107425e-05,
"loss": 2.1317,
"step": 644
},
{
"epoch": 0.25,
"grad_norm": 5.4375,
"learning_rate": 3.726303317535545e-05,
"loss": 2.0121,
"step": 645
},
{
"epoch": 0.26,
"grad_norm": 3.609375,
"learning_rate": 3.724328593996841e-05,
"loss": 2.2991,
"step": 646
},
{
"epoch": 0.26,
"grad_norm": 4.875,
"learning_rate": 3.722353870458136e-05,
"loss": 2.366,
"step": 647
},
{
"epoch": 0.26,
"grad_norm": 6.3125,
"learning_rate": 3.720379146919432e-05,
"loss": 2.4566,
"step": 648
},
{
"epoch": 0.26,
"grad_norm": 3.125,
"learning_rate": 3.718404423380727e-05,
"loss": 2.3803,
"step": 649
},
{
"epoch": 0.26,
"grad_norm": 4.21875,
"learning_rate": 3.716429699842022e-05,
"loss": 2.1923,
"step": 650
},
{
"epoch": 0.26,
"grad_norm": 3.140625,
"learning_rate": 3.7144549763033175e-05,
"loss": 2.4029,
"step": 651
},
{
"epoch": 0.26,
"grad_norm": 2.90625,
"learning_rate": 3.712480252764613e-05,
"loss": 2.2696,
"step": 652
},
{
"epoch": 0.26,
"grad_norm": 3.09375,
"learning_rate": 3.7105055292259084e-05,
"loss": 2.1091,
"step": 653
},
{
"epoch": 0.26,
"grad_norm": 2.859375,
"learning_rate": 3.708530805687204e-05,
"loss": 2.3094,
"step": 654
},
{
"epoch": 0.26,
"grad_norm": 5.4375,
"learning_rate": 3.7065560821484994e-05,
"loss": 2.4479,
"step": 655
},
{
"epoch": 0.26,
"grad_norm": 3.953125,
"learning_rate": 3.704581358609795e-05,
"loss": 2.0438,
"step": 656
},
{
"epoch": 0.26,
"grad_norm": 3.84375,
"learning_rate": 3.70260663507109e-05,
"loss": 2.4032,
"step": 657
},
{
"epoch": 0.26,
"grad_norm": 3.59375,
"learning_rate": 3.700631911532386e-05,
"loss": 2.2473,
"step": 658
},
{
"epoch": 0.26,
"grad_norm": 3.328125,
"learning_rate": 3.698657187993681e-05,
"loss": 2.376,
"step": 659
},
{
"epoch": 0.26,
"grad_norm": 2.390625,
"learning_rate": 3.696682464454976e-05,
"loss": 1.9421,
"step": 660
},
{
"epoch": 0.26,
"grad_norm": 4.03125,
"learning_rate": 3.6947077409162715e-05,
"loss": 2.1651,
"step": 661
},
{
"epoch": 0.26,
"grad_norm": 3.59375,
"learning_rate": 3.692733017377567e-05,
"loss": 2.2003,
"step": 662
},
{
"epoch": 0.26,
"grad_norm": 3.3125,
"learning_rate": 3.690758293838863e-05,
"loss": 2.0253,
"step": 663
},
{
"epoch": 0.26,
"grad_norm": 3.71875,
"learning_rate": 3.6887835703001586e-05,
"loss": 2.3362,
"step": 664
},
{
"epoch": 0.26,
"grad_norm": 3.359375,
"learning_rate": 3.6868088467614534e-05,
"loss": 2.4982,
"step": 665
},
{
"epoch": 0.26,
"grad_norm": 4.375,
"learning_rate": 3.684834123222749e-05,
"loss": 2.3774,
"step": 666
},
{
"epoch": 0.26,
"grad_norm": 5.0,
"learning_rate": 3.6828593996840444e-05,
"loss": 1.5933,
"step": 667
},
{
"epoch": 0.26,
"grad_norm": 4.40625,
"learning_rate": 3.68088467614534e-05,
"loss": 2.1311,
"step": 668
},
{
"epoch": 0.26,
"grad_norm": 4.625,
"learning_rate": 3.678909952606635e-05,
"loss": 2.0043,
"step": 669
},
{
"epoch": 0.26,
"grad_norm": 4.46875,
"learning_rate": 3.67693522906793e-05,
"loss": 2.0759,
"step": 670
},
{
"epoch": 0.27,
"grad_norm": 3.046875,
"learning_rate": 3.674960505529226e-05,
"loss": 1.7726,
"step": 671
},
{
"epoch": 0.27,
"grad_norm": 5.78125,
"learning_rate": 3.672985781990522e-05,
"loss": 2.0477,
"step": 672
},
{
"epoch": 0.27,
"grad_norm": 4.78125,
"learning_rate": 3.671011058451817e-05,
"loss": 2.0617,
"step": 673
},
{
"epoch": 0.27,
"grad_norm": 4.53125,
"learning_rate": 3.669036334913113e-05,
"loss": 2.3605,
"step": 674
},
{
"epoch": 0.27,
"grad_norm": 3.640625,
"learning_rate": 3.6670616113744075e-05,
"loss": 2.28,
"step": 675
},
{
"epoch": 0.27,
"grad_norm": 3.921875,
"learning_rate": 3.665086887835703e-05,
"loss": 2.0516,
"step": 676
},
{
"epoch": 0.27,
"grad_norm": 8.25,
"learning_rate": 3.6631121642969984e-05,
"loss": 2.2003,
"step": 677
},
{
"epoch": 0.27,
"grad_norm": 2.953125,
"learning_rate": 3.661137440758294e-05,
"loss": 2.1424,
"step": 678
},
{
"epoch": 0.27,
"grad_norm": 3.140625,
"learning_rate": 3.6591627172195894e-05,
"loss": 2.3819,
"step": 679
},
{
"epoch": 0.27,
"grad_norm": 3.296875,
"learning_rate": 3.657187993680885e-05,
"loss": 2.1419,
"step": 680
},
{
"epoch": 0.27,
"grad_norm": 4.8125,
"learning_rate": 3.65521327014218e-05,
"loss": 2.2073,
"step": 681
},
{
"epoch": 0.27,
"grad_norm": 5.96875,
"learning_rate": 3.653238546603476e-05,
"loss": 2.1454,
"step": 682
},
{
"epoch": 0.27,
"grad_norm": 3.140625,
"learning_rate": 3.651263823064771e-05,
"loss": 2.1878,
"step": 683
},
{
"epoch": 0.27,
"grad_norm": 3.4375,
"learning_rate": 3.649289099526067e-05,
"loss": 2.0024,
"step": 684
},
{
"epoch": 0.27,
"grad_norm": 2.4375,
"learning_rate": 3.6473143759873615e-05,
"loss": 2.0062,
"step": 685
},
{
"epoch": 0.27,
"grad_norm": 4.65625,
"learning_rate": 3.645339652448657e-05,
"loss": 2.4761,
"step": 686
},
{
"epoch": 0.27,
"grad_norm": 5.40625,
"learning_rate": 3.6433649289099525e-05,
"loss": 2.3997,
"step": 687
},
{
"epoch": 0.27,
"grad_norm": 3.640625,
"learning_rate": 3.6413902053712486e-05,
"loss": 2.1696,
"step": 688
},
{
"epoch": 0.27,
"grad_norm": 5.875,
"learning_rate": 3.639415481832544e-05,
"loss": 2.1618,
"step": 689
},
{
"epoch": 0.27,
"grad_norm": 3.828125,
"learning_rate": 3.637440758293839e-05,
"loss": 2.0528,
"step": 690
},
{
"epoch": 0.27,
"grad_norm": 3.484375,
"learning_rate": 3.6354660347551344e-05,
"loss": 2.0864,
"step": 691
},
{
"epoch": 0.27,
"grad_norm": 2.734375,
"learning_rate": 3.63349131121643e-05,
"loss": 2.1031,
"step": 692
},
{
"epoch": 0.27,
"grad_norm": 3.515625,
"learning_rate": 3.631516587677725e-05,
"loss": 2.1598,
"step": 693
},
{
"epoch": 0.27,
"grad_norm": 3.6875,
"learning_rate": 3.629541864139021e-05,
"loss": 2.4488,
"step": 694
},
{
"epoch": 0.27,
"grad_norm": 3.765625,
"learning_rate": 3.6275671406003156e-05,
"loss": 2.3793,
"step": 695
},
{
"epoch": 0.27,
"grad_norm": 4.0625,
"learning_rate": 3.625592417061612e-05,
"loss": 2.2993,
"step": 696
},
{
"epoch": 0.28,
"grad_norm": 4.625,
"learning_rate": 3.623617693522907e-05,
"loss": 2.417,
"step": 697
},
{
"epoch": 0.28,
"grad_norm": 3.90625,
"learning_rate": 3.621642969984203e-05,
"loss": 2.4132,
"step": 698
},
{
"epoch": 0.28,
"grad_norm": 3.6875,
"learning_rate": 3.619668246445498e-05,
"loss": 2.5591,
"step": 699
},
{
"epoch": 0.28,
"grad_norm": 3.90625,
"learning_rate": 3.617693522906793e-05,
"loss": 2.142,
"step": 700
},
{
"epoch": 0.28,
"grad_norm": 3.71875,
"learning_rate": 3.6157187993680884e-05,
"loss": 2.1198,
"step": 701
},
{
"epoch": 0.28,
"grad_norm": 3.71875,
"learning_rate": 3.613744075829384e-05,
"loss": 2.3219,
"step": 702
},
{
"epoch": 0.28,
"grad_norm": 5.09375,
"learning_rate": 3.6117693522906793e-05,
"loss": 2.6155,
"step": 703
},
{
"epoch": 0.28,
"grad_norm": 4.96875,
"learning_rate": 3.6097946287519755e-05,
"loss": 2.1726,
"step": 704
},
{
"epoch": 0.28,
"grad_norm": 4.0625,
"learning_rate": 3.60781990521327e-05,
"loss": 2.5877,
"step": 705
},
{
"epoch": 0.28,
"grad_norm": 4.9375,
"learning_rate": 3.605845181674566e-05,
"loss": 2.0426,
"step": 706
},
{
"epoch": 0.28,
"grad_norm": 3.953125,
"learning_rate": 3.603870458135861e-05,
"loss": 2.2428,
"step": 707
},
{
"epoch": 0.28,
"grad_norm": 3.703125,
"learning_rate": 3.601895734597157e-05,
"loss": 1.9695,
"step": 708
},
{
"epoch": 0.28,
"grad_norm": 4.09375,
"learning_rate": 3.599921011058452e-05,
"loss": 2.4674,
"step": 709
},
{
"epoch": 0.28,
"grad_norm": 3.0625,
"learning_rate": 3.597946287519747e-05,
"loss": 2.5283,
"step": 710
},
{
"epoch": 0.28,
"grad_norm": 2.734375,
"learning_rate": 3.5959715639810424e-05,
"loss": 1.9793,
"step": 711
},
{
"epoch": 0.28,
"grad_norm": 3.265625,
"learning_rate": 3.593996840442338e-05,
"loss": 2.1185,
"step": 712
},
{
"epoch": 0.28,
"grad_norm": 4.40625,
"learning_rate": 3.592022116903634e-05,
"loss": 2.1784,
"step": 713
},
{
"epoch": 0.28,
"grad_norm": 4.4375,
"learning_rate": 3.5900473933649295e-05,
"loss": 2.4251,
"step": 714
},
{
"epoch": 0.28,
"grad_norm": 5.6875,
"learning_rate": 3.588072669826224e-05,
"loss": 2.5664,
"step": 715
},
{
"epoch": 0.28,
"grad_norm": 5.25,
"learning_rate": 3.58609794628752e-05,
"loss": 2.3782,
"step": 716
},
{
"epoch": 0.28,
"grad_norm": 4.59375,
"learning_rate": 3.584123222748815e-05,
"loss": 2.413,
"step": 717
},
{
"epoch": 0.28,
"grad_norm": 3.75,
"learning_rate": 3.582148499210111e-05,
"loss": 2.1953,
"step": 718
},
{
"epoch": 0.28,
"grad_norm": 4.21875,
"learning_rate": 3.580173775671406e-05,
"loss": 2.0919,
"step": 719
},
{
"epoch": 0.28,
"grad_norm": 7.65625,
"learning_rate": 3.578199052132701e-05,
"loss": 1.9951,
"step": 720
},
{
"epoch": 0.28,
"grad_norm": 5.1875,
"learning_rate": 3.576224328593997e-05,
"loss": 2.247,
"step": 721
},
{
"epoch": 0.29,
"grad_norm": 5.65625,
"learning_rate": 3.5742496050552926e-05,
"loss": 2.1906,
"step": 722
},
{
"epoch": 0.29,
"grad_norm": 3.21875,
"learning_rate": 3.572274881516588e-05,
"loss": 2.2582,
"step": 723
},
{
"epoch": 0.29,
"grad_norm": 6.59375,
"learning_rate": 3.5703001579778836e-05,
"loss": 2.3838,
"step": 724
},
{
"epoch": 0.29,
"grad_norm": 4.5625,
"learning_rate": 3.5683254344391784e-05,
"loss": 2.3544,
"step": 725
},
{
"epoch": 0.29,
"grad_norm": 4.78125,
"learning_rate": 3.566350710900474e-05,
"loss": 2.5825,
"step": 726
},
{
"epoch": 0.29,
"grad_norm": 16.75,
"learning_rate": 3.564375987361769e-05,
"loss": 2.2463,
"step": 727
},
{
"epoch": 0.29,
"grad_norm": 3.640625,
"learning_rate": 3.562401263823065e-05,
"loss": 2.2682,
"step": 728
},
{
"epoch": 0.29,
"grad_norm": 3.703125,
"learning_rate": 3.560426540284361e-05,
"loss": 2.4519,
"step": 729
},
{
"epoch": 0.29,
"grad_norm": 4.125,
"learning_rate": 3.558451816745656e-05,
"loss": 2.2136,
"step": 730
},
{
"epoch": 0.29,
"grad_norm": 3.5,
"learning_rate": 3.556477093206951e-05,
"loss": 2.119,
"step": 731
},
{
"epoch": 0.29,
"grad_norm": 4.34375,
"learning_rate": 3.554502369668247e-05,
"loss": 2.2532,
"step": 732
},
{
"epoch": 0.29,
"grad_norm": 3.859375,
"learning_rate": 3.552527646129542e-05,
"loss": 2.317,
"step": 733
},
{
"epoch": 0.29,
"grad_norm": 3.90625,
"learning_rate": 3.5505529225908376e-05,
"loss": 2.0767,
"step": 734
},
{
"epoch": 0.29,
"grad_norm": 3.640625,
"learning_rate": 3.5485781990521324e-05,
"loss": 1.9955,
"step": 735
},
{
"epoch": 0.29,
"grad_norm": 3.5,
"learning_rate": 3.546603475513428e-05,
"loss": 1.9777,
"step": 736
},
{
"epoch": 0.29,
"grad_norm": 2.453125,
"learning_rate": 3.5446287519747234e-05,
"loss": 2.4426,
"step": 737
},
{
"epoch": 0.29,
"grad_norm": 3.859375,
"learning_rate": 3.5426540284360195e-05,
"loss": 2.2804,
"step": 738
},
{
"epoch": 0.29,
"grad_norm": 2.515625,
"learning_rate": 3.540679304897315e-05,
"loss": 2.1437,
"step": 739
},
{
"epoch": 0.29,
"grad_norm": 3.3125,
"learning_rate": 3.53870458135861e-05,
"loss": 2.2342,
"step": 740
},
{
"epoch": 0.29,
"grad_norm": 4.15625,
"learning_rate": 3.536729857819905e-05,
"loss": 2.485,
"step": 741
},
{
"epoch": 0.29,
"grad_norm": 3.703125,
"learning_rate": 3.534755134281201e-05,
"loss": 2.0016,
"step": 742
},
{
"epoch": 0.29,
"grad_norm": 3.828125,
"learning_rate": 3.532780410742496e-05,
"loss": 2.159,
"step": 743
},
{
"epoch": 0.29,
"grad_norm": 3.828125,
"learning_rate": 3.530805687203792e-05,
"loss": 2.1591,
"step": 744
},
{
"epoch": 0.29,
"grad_norm": 3.09375,
"learning_rate": 3.5288309636650865e-05,
"loss": 2.1368,
"step": 745
},
{
"epoch": 0.29,
"grad_norm": 3.109375,
"learning_rate": 3.5268562401263826e-05,
"loss": 2.3244,
"step": 746
},
{
"epoch": 0.3,
"grad_norm": 4.625,
"learning_rate": 3.524881516587678e-05,
"loss": 2.1351,
"step": 747
},
{
"epoch": 0.3,
"grad_norm": 4.0,
"learning_rate": 3.5229067930489736e-05,
"loss": 2.5313,
"step": 748
},
{
"epoch": 0.3,
"grad_norm": 4.5625,
"learning_rate": 3.520932069510269e-05,
"loss": 2.4577,
"step": 749
},
{
"epoch": 0.3,
"grad_norm": 4.78125,
"learning_rate": 3.518957345971564e-05,
"loss": 2.1745,
"step": 750
},
{
"epoch": 0.3,
"grad_norm": 4.21875,
"learning_rate": 3.516982622432859e-05,
"loss": 2.3864,
"step": 751
},
{
"epoch": 0.3,
"grad_norm": 2.875,
"learning_rate": 3.515007898894155e-05,
"loss": 1.9815,
"step": 752
},
{
"epoch": 0.3,
"grad_norm": 4.09375,
"learning_rate": 3.51303317535545e-05,
"loss": 2.0835,
"step": 753
},
{
"epoch": 0.3,
"grad_norm": 4.125,
"learning_rate": 3.5110584518167464e-05,
"loss": 2.1332,
"step": 754
},
{
"epoch": 0.3,
"grad_norm": 3.734375,
"learning_rate": 3.509083728278041e-05,
"loss": 2.2189,
"step": 755
},
{
"epoch": 0.3,
"grad_norm": 2.953125,
"learning_rate": 3.507109004739337e-05,
"loss": 1.8231,
"step": 756
},
{
"epoch": 0.3,
"grad_norm": 3.328125,
"learning_rate": 3.505134281200632e-05,
"loss": 2.2728,
"step": 757
},
{
"epoch": 0.3,
"grad_norm": 4.125,
"learning_rate": 3.5031595576619276e-05,
"loss": 2.227,
"step": 758
},
{
"epoch": 0.3,
"grad_norm": 3.25,
"learning_rate": 3.501184834123223e-05,
"loss": 1.8384,
"step": 759
},
{
"epoch": 0.3,
"grad_norm": 4.375,
"learning_rate": 3.499210110584518e-05,
"loss": 2.7019,
"step": 760
},
{
"epoch": 0.3,
"grad_norm": 3.53125,
"learning_rate": 3.4972353870458134e-05,
"loss": 1.969,
"step": 761
},
{
"epoch": 0.3,
"grad_norm": 4.6875,
"learning_rate": 3.495260663507109e-05,
"loss": 2.4574,
"step": 762
},
{
"epoch": 0.3,
"grad_norm": 3.375,
"learning_rate": 3.493285939968405e-05,
"loss": 2.0816,
"step": 763
},
{
"epoch": 0.3,
"grad_norm": 3.703125,
"learning_rate": 3.4913112164297004e-05,
"loss": 2.2121,
"step": 764
},
{
"epoch": 0.3,
"grad_norm": 3.015625,
"learning_rate": 3.489336492890995e-05,
"loss": 1.9886,
"step": 765
},
{
"epoch": 0.3,
"grad_norm": 5.875,
"learning_rate": 3.487361769352291e-05,
"loss": 2.0416,
"step": 766
},
{
"epoch": 0.3,
"grad_norm": 5.59375,
"learning_rate": 3.485387045813586e-05,
"loss": 2.2839,
"step": 767
},
{
"epoch": 0.3,
"grad_norm": 5.0,
"learning_rate": 3.4834123222748817e-05,
"loss": 2.3425,
"step": 768
},
{
"epoch": 0.3,
"grad_norm": 2.984375,
"learning_rate": 3.481437598736177e-05,
"loss": 2.2007,
"step": 769
},
{
"epoch": 0.3,
"grad_norm": 5.28125,
"learning_rate": 3.4794628751974726e-05,
"loss": 2.3854,
"step": 770
},
{
"epoch": 0.3,
"grad_norm": 5.21875,
"learning_rate": 3.477488151658768e-05,
"loss": 2.292,
"step": 771
},
{
"epoch": 0.3,
"grad_norm": 3.78125,
"learning_rate": 3.4755134281200636e-05,
"loss": 2.02,
"step": 772
},
{
"epoch": 0.31,
"grad_norm": 4.90625,
"learning_rate": 3.473538704581359e-05,
"loss": 2.461,
"step": 773
},
{
"epoch": 0.31,
"grad_norm": 5.5,
"learning_rate": 3.4715639810426545e-05,
"loss": 2.6603,
"step": 774
},
{
"epoch": 0.31,
"grad_norm": 4.875,
"learning_rate": 3.469589257503949e-05,
"loss": 2.3989,
"step": 775
},
{
"epoch": 0.31,
"grad_norm": 3.859375,
"learning_rate": 3.467614533965245e-05,
"loss": 2.4317,
"step": 776
},
{
"epoch": 0.31,
"grad_norm": 4.46875,
"learning_rate": 3.46563981042654e-05,
"loss": 2.3054,
"step": 777
},
{
"epoch": 0.31,
"grad_norm": 6.09375,
"learning_rate": 3.463665086887836e-05,
"loss": 2.7222,
"step": 778
},
{
"epoch": 0.31,
"grad_norm": 3.6875,
"learning_rate": 3.461690363349132e-05,
"loss": 2.019,
"step": 779
},
{
"epoch": 0.31,
"grad_norm": 4.25,
"learning_rate": 3.4597156398104267e-05,
"loss": 2.2224,
"step": 780
},
{
"epoch": 0.31,
"grad_norm": 5.15625,
"learning_rate": 3.457740916271722e-05,
"loss": 2.1265,
"step": 781
},
{
"epoch": 0.31,
"grad_norm": 3.71875,
"learning_rate": 3.4557661927330176e-05,
"loss": 1.7375,
"step": 782
},
{
"epoch": 0.31,
"grad_norm": 9.5,
"learning_rate": 3.453791469194313e-05,
"loss": 2.0238,
"step": 783
},
{
"epoch": 0.31,
"grad_norm": 3.96875,
"learning_rate": 3.4518167456556085e-05,
"loss": 1.8839,
"step": 784
},
{
"epoch": 0.31,
"grad_norm": 5.0,
"learning_rate": 3.449842022116903e-05,
"loss": 2.2333,
"step": 785
},
{
"epoch": 0.31,
"grad_norm": 4.5625,
"learning_rate": 3.447867298578199e-05,
"loss": 1.672,
"step": 786
},
{
"epoch": 0.31,
"grad_norm": 5.90625,
"learning_rate": 3.445892575039494e-05,
"loss": 2.0666,
"step": 787
},
{
"epoch": 0.31,
"grad_norm": 4.09375,
"learning_rate": 3.4439178515007904e-05,
"loss": 2.0166,
"step": 788
},
{
"epoch": 0.31,
"grad_norm": 4.59375,
"learning_rate": 3.441943127962086e-05,
"loss": 2.0235,
"step": 789
},
{
"epoch": 0.31,
"grad_norm": 3.953125,
"learning_rate": 3.439968404423381e-05,
"loss": 2.463,
"step": 790
},
{
"epoch": 0.31,
"grad_norm": 4.8125,
"learning_rate": 3.437993680884676e-05,
"loss": 2.4109,
"step": 791
},
{
"epoch": 0.31,
"grad_norm": 5.21875,
"learning_rate": 3.4360189573459716e-05,
"loss": 1.9843,
"step": 792
},
{
"epoch": 0.31,
"grad_norm": 5.09375,
"learning_rate": 3.434044233807267e-05,
"loss": 2.2247,
"step": 793
},
{
"epoch": 0.31,
"grad_norm": 4.875,
"learning_rate": 3.4320695102685626e-05,
"loss": 2.3812,
"step": 794
},
{
"epoch": 0.31,
"grad_norm": 4.3125,
"learning_rate": 3.430094786729858e-05,
"loss": 1.8639,
"step": 795
},
{
"epoch": 0.31,
"grad_norm": 4.40625,
"learning_rate": 3.4281200631911535e-05,
"loss": 1.7815,
"step": 796
},
{
"epoch": 0.31,
"grad_norm": 4.40625,
"learning_rate": 3.426145339652449e-05,
"loss": 2.251,
"step": 797
},
{
"epoch": 0.32,
"grad_norm": 3.515625,
"learning_rate": 3.4241706161137445e-05,
"loss": 2.0406,
"step": 798
},
{
"epoch": 0.32,
"grad_norm": 5.40625,
"learning_rate": 3.42219589257504e-05,
"loss": 2.4374,
"step": 799
},
{
"epoch": 0.32,
"grad_norm": 4.8125,
"learning_rate": 3.420221169036335e-05,
"loss": 2.4243,
"step": 800
},
{
"epoch": 0.32,
"grad_norm": 4.4375,
"learning_rate": 3.41824644549763e-05,
"loss": 2.038,
"step": 801
},
{
"epoch": 0.32,
"grad_norm": 4.03125,
"learning_rate": 3.416271721958926e-05,
"loss": 2.0446,
"step": 802
},
{
"epoch": 0.32,
"grad_norm": 6.1875,
"learning_rate": 3.414296998420221e-05,
"loss": 2.6197,
"step": 803
},
{
"epoch": 0.32,
"grad_norm": 4.65625,
"learning_rate": 3.412322274881517e-05,
"loss": 2.2769,
"step": 804
},
{
"epoch": 0.32,
"grad_norm": 3.9375,
"learning_rate": 3.410347551342812e-05,
"loss": 1.9543,
"step": 805
},
{
"epoch": 0.32,
"grad_norm": 3.71875,
"learning_rate": 3.4083728278041076e-05,
"loss": 2.1902,
"step": 806
},
{
"epoch": 0.32,
"grad_norm": 4.5,
"learning_rate": 3.406398104265403e-05,
"loss": 2.5547,
"step": 807
},
{
"epoch": 0.32,
"grad_norm": 3.46875,
"learning_rate": 3.4044233807266985e-05,
"loss": 2.3123,
"step": 808
},
{
"epoch": 0.32,
"grad_norm": 3.515625,
"learning_rate": 3.402448657187994e-05,
"loss": 2.4583,
"step": 809
},
{
"epoch": 0.32,
"grad_norm": 4.59375,
"learning_rate": 3.400473933649289e-05,
"loss": 2.4065,
"step": 810
},
{
"epoch": 0.32,
"grad_norm": 5.5,
"learning_rate": 3.398499210110584e-05,
"loss": 2.3105,
"step": 811
},
{
"epoch": 0.32,
"grad_norm": 4.875,
"learning_rate": 3.39652448657188e-05,
"loss": 2.2501,
"step": 812
},
{
"epoch": 0.32,
"grad_norm": 3.5625,
"learning_rate": 3.394549763033176e-05,
"loss": 2.3876,
"step": 813
},
{
"epoch": 0.32,
"grad_norm": 3.34375,
"learning_rate": 3.3925750394944714e-05,
"loss": 1.8637,
"step": 814
},
{
"epoch": 0.32,
"grad_norm": 4.59375,
"learning_rate": 3.390600315955766e-05,
"loss": 2.4343,
"step": 815
},
{
"epoch": 0.32,
"grad_norm": 4.75,
"learning_rate": 3.3886255924170616e-05,
"loss": 2.2334,
"step": 816
},
{
"epoch": 0.32,
"grad_norm": 9.375,
"learning_rate": 3.386650868878357e-05,
"loss": 2.1471,
"step": 817
},
{
"epoch": 0.32,
"grad_norm": 6.5,
"learning_rate": 3.3846761453396526e-05,
"loss": 2.2564,
"step": 818
},
{
"epoch": 0.32,
"grad_norm": 4.75,
"learning_rate": 3.382701421800948e-05,
"loss": 2.0658,
"step": 819
},
{
"epoch": 0.32,
"grad_norm": 10.5,
"learning_rate": 3.3807266982622435e-05,
"loss": 2.1117,
"step": 820
},
{
"epoch": 0.32,
"grad_norm": 8.875,
"learning_rate": 3.378751974723539e-05,
"loss": 2.0247,
"step": 821
},
{
"epoch": 0.32,
"grad_norm": 4.46875,
"learning_rate": 3.3767772511848345e-05,
"loss": 1.7115,
"step": 822
},
{
"epoch": 0.33,
"grad_norm": 4.09375,
"learning_rate": 3.37480252764613e-05,
"loss": 1.9265,
"step": 823
},
{
"epoch": 0.33,
"grad_norm": 5.125,
"learning_rate": 3.3728278041074254e-05,
"loss": 2.3711,
"step": 824
},
{
"epoch": 0.33,
"grad_norm": 4.375,
"learning_rate": 3.37085308056872e-05,
"loss": 2.3025,
"step": 825
},
{
"epoch": 0.33,
"grad_norm": 3.703125,
"learning_rate": 3.368878357030016e-05,
"loss": 2.1649,
"step": 826
},
{
"epoch": 0.33,
"grad_norm": 4.21875,
"learning_rate": 3.366903633491311e-05,
"loss": 2.1055,
"step": 827
},
{
"epoch": 0.33,
"grad_norm": 4.65625,
"learning_rate": 3.3649289099526066e-05,
"loss": 2.0339,
"step": 828
},
{
"epoch": 0.33,
"grad_norm": 4.96875,
"learning_rate": 3.362954186413903e-05,
"loss": 2.6248,
"step": 829
},
{
"epoch": 0.33,
"grad_norm": 3.53125,
"learning_rate": 3.3609794628751976e-05,
"loss": 1.9992,
"step": 830
},
{
"epoch": 0.33,
"grad_norm": 3.3125,
"learning_rate": 3.359004739336493e-05,
"loss": 2.1959,
"step": 831
},
{
"epoch": 0.33,
"grad_norm": 3.09375,
"learning_rate": 3.3570300157977885e-05,
"loss": 2.1429,
"step": 832
},
{
"epoch": 0.33,
"grad_norm": 3.765625,
"learning_rate": 3.355055292259084e-05,
"loss": 2.0846,
"step": 833
},
{
"epoch": 0.33,
"grad_norm": 5.46875,
"learning_rate": 3.3530805687203794e-05,
"loss": 2.1404,
"step": 834
},
{
"epoch": 0.33,
"grad_norm": 5.40625,
"learning_rate": 3.351105845181675e-05,
"loss": 2.2914,
"step": 835
},
{
"epoch": 0.33,
"grad_norm": 4.375,
"learning_rate": 3.34913112164297e-05,
"loss": 2.2752,
"step": 836
},
{
"epoch": 0.33,
"grad_norm": 3.234375,
"learning_rate": 3.347156398104265e-05,
"loss": 2.1902,
"step": 837
},
{
"epoch": 0.33,
"grad_norm": 4.3125,
"learning_rate": 3.345181674565561e-05,
"loss": 2.0974,
"step": 838
},
{
"epoch": 0.33,
"grad_norm": 5.1875,
"learning_rate": 3.343206951026857e-05,
"loss": 2.2947,
"step": 839
},
{
"epoch": 0.33,
"grad_norm": 3.90625,
"learning_rate": 3.3412322274881516e-05,
"loss": 2.5153,
"step": 840
},
{
"epoch": 0.33,
"grad_norm": 4.28125,
"learning_rate": 3.339257503949447e-05,
"loss": 1.8428,
"step": 841
},
{
"epoch": 0.33,
"grad_norm": 3.90625,
"learning_rate": 3.3372827804107425e-05,
"loss": 2.0703,
"step": 842
},
{
"epoch": 0.33,
"grad_norm": 5.40625,
"learning_rate": 3.335308056872038e-05,
"loss": 2.208,
"step": 843
},
{
"epoch": 0.33,
"grad_norm": 4.03125,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.4716,
"step": 844
},
{
"epoch": 0.33,
"grad_norm": 11.0,
"learning_rate": 3.331358609794629e-05,
"loss": 2.4981,
"step": 845
},
{
"epoch": 0.33,
"grad_norm": 5.875,
"learning_rate": 3.3293838862559244e-05,
"loss": 2.1312,
"step": 846
},
{
"epoch": 0.33,
"grad_norm": 4.4375,
"learning_rate": 3.32740916271722e-05,
"loss": 2.0895,
"step": 847
},
{
"epoch": 0.33,
"grad_norm": 4.34375,
"learning_rate": 3.3254344391785154e-05,
"loss": 2.4012,
"step": 848
},
{
"epoch": 0.34,
"grad_norm": 3.234375,
"learning_rate": 3.323459715639811e-05,
"loss": 2.2475,
"step": 849
},
{
"epoch": 0.34,
"grad_norm": 3.9375,
"learning_rate": 3.3214849921011056e-05,
"loss": 1.8374,
"step": 850
},
{
"epoch": 0.34,
"grad_norm": 3.8125,
"learning_rate": 3.319510268562401e-05,
"loss": 2.1409,
"step": 851
},
{
"epoch": 0.34,
"grad_norm": 4.78125,
"learning_rate": 3.3175355450236966e-05,
"loss": 2.1672,
"step": 852
},
{
"epoch": 0.34,
"grad_norm": 3.984375,
"learning_rate": 3.315560821484992e-05,
"loss": 2.0682,
"step": 853
},
{
"epoch": 0.34,
"grad_norm": 4.53125,
"learning_rate": 3.313586097946288e-05,
"loss": 2.1467,
"step": 854
},
{
"epoch": 0.34,
"grad_norm": 4.25,
"learning_rate": 3.311611374407583e-05,
"loss": 2.2585,
"step": 855
},
{
"epoch": 0.34,
"grad_norm": 3.828125,
"learning_rate": 3.3096366508688785e-05,
"loss": 2.18,
"step": 856
},
{
"epoch": 0.34,
"grad_norm": 3.09375,
"learning_rate": 3.307661927330174e-05,
"loss": 1.9451,
"step": 857
},
{
"epoch": 0.34,
"grad_norm": 3.859375,
"learning_rate": 3.3056872037914694e-05,
"loss": 1.7044,
"step": 858
},
{
"epoch": 0.34,
"grad_norm": 4.375,
"learning_rate": 3.303712480252765e-05,
"loss": 2.309,
"step": 859
},
{
"epoch": 0.34,
"grad_norm": 4.78125,
"learning_rate": 3.3017377567140604e-05,
"loss": 2.1901,
"step": 860
},
{
"epoch": 0.34,
"grad_norm": 7.0625,
"learning_rate": 3.299763033175355e-05,
"loss": 2.2445,
"step": 861
},
{
"epoch": 0.34,
"grad_norm": 4.9375,
"learning_rate": 3.2977883096366506e-05,
"loss": 2.5175,
"step": 862
},
{
"epoch": 0.34,
"grad_norm": 3.890625,
"learning_rate": 3.295813586097947e-05,
"loss": 2.1367,
"step": 863
},
{
"epoch": 0.34,
"grad_norm": 4.21875,
"learning_rate": 3.293838862559242e-05,
"loss": 1.8377,
"step": 864
},
{
"epoch": 0.34,
"grad_norm": 4.53125,
"learning_rate": 3.291864139020537e-05,
"loss": 2.3408,
"step": 865
},
{
"epoch": 0.34,
"grad_norm": 3.375,
"learning_rate": 3.2898894154818325e-05,
"loss": 2.1,
"step": 866
},
{
"epoch": 0.34,
"grad_norm": 4.40625,
"learning_rate": 3.287914691943128e-05,
"loss": 2.3707,
"step": 867
},
{
"epoch": 0.34,
"grad_norm": 6.5625,
"learning_rate": 3.2859399684044235e-05,
"loss": 2.3242,
"step": 868
},
{
"epoch": 0.34,
"grad_norm": 3.90625,
"learning_rate": 3.283965244865719e-05,
"loss": 2.4301,
"step": 869
},
{
"epoch": 0.34,
"grad_norm": 4.78125,
"learning_rate": 3.2819905213270144e-05,
"loss": 2.3216,
"step": 870
},
{
"epoch": 0.34,
"grad_norm": 4.90625,
"learning_rate": 3.28001579778831e-05,
"loss": 2.3015,
"step": 871
},
{
"epoch": 0.34,
"grad_norm": 5.15625,
"learning_rate": 3.2780410742496054e-05,
"loss": 1.913,
"step": 872
},
{
"epoch": 0.34,
"grad_norm": 8.5,
"learning_rate": 3.276066350710901e-05,
"loss": 2.5307,
"step": 873
},
{
"epoch": 0.35,
"grad_norm": 4.9375,
"learning_rate": 3.274091627172196e-05,
"loss": 2.4199,
"step": 874
},
{
"epoch": 0.35,
"grad_norm": 4.3125,
"learning_rate": 3.272116903633491e-05,
"loss": 2.1487,
"step": 875
},
{
"epoch": 0.35,
"grad_norm": 4.71875,
"learning_rate": 3.2701421800947866e-05,
"loss": 2.1049,
"step": 876
},
{
"epoch": 0.35,
"grad_norm": 5.34375,
"learning_rate": 3.268167456556082e-05,
"loss": 2.2781,
"step": 877
},
{
"epoch": 0.35,
"grad_norm": 3.734375,
"learning_rate": 3.2661927330173775e-05,
"loss": 2.3567,
"step": 878
},
{
"epoch": 0.35,
"grad_norm": 4.75,
"learning_rate": 3.264218009478674e-05,
"loss": 2.2516,
"step": 879
},
{
"epoch": 0.35,
"grad_norm": 4.09375,
"learning_rate": 3.2622432859399685e-05,
"loss": 2.3146,
"step": 880
},
{
"epoch": 0.35,
"grad_norm": 4.125,
"learning_rate": 3.260268562401264e-05,
"loss": 2.3071,
"step": 881
},
{
"epoch": 0.35,
"grad_norm": 3.625,
"learning_rate": 3.2582938388625594e-05,
"loss": 2.2509,
"step": 882
},
{
"epoch": 0.35,
"grad_norm": 4.75,
"learning_rate": 3.256319115323855e-05,
"loss": 2.1284,
"step": 883
},
{
"epoch": 0.35,
"grad_norm": 3.828125,
"learning_rate": 3.2543443917851504e-05,
"loss": 1.7567,
"step": 884
},
{
"epoch": 0.35,
"grad_norm": 5.34375,
"learning_rate": 3.252369668246446e-05,
"loss": 2.2056,
"step": 885
},
{
"epoch": 0.35,
"grad_norm": 4.84375,
"learning_rate": 3.2503949447077406e-05,
"loss": 2.2379,
"step": 886
},
{
"epoch": 0.35,
"grad_norm": 5.25,
"learning_rate": 3.248420221169036e-05,
"loss": 2.3635,
"step": 887
},
{
"epoch": 0.35,
"grad_norm": 4.3125,
"learning_rate": 3.246445497630332e-05,
"loss": 2.0825,
"step": 888
},
{
"epoch": 0.35,
"grad_norm": 4.90625,
"learning_rate": 3.244470774091628e-05,
"loss": 2.0221,
"step": 889
},
{
"epoch": 0.35,
"grad_norm": 5.59375,
"learning_rate": 3.2424960505529225e-05,
"loss": 2.2305,
"step": 890
},
{
"epoch": 0.35,
"grad_norm": 2.984375,
"learning_rate": 3.240521327014218e-05,
"loss": 2.0285,
"step": 891
},
{
"epoch": 0.35,
"grad_norm": 4.8125,
"learning_rate": 3.2385466034755135e-05,
"loss": 1.9585,
"step": 892
},
{
"epoch": 0.35,
"grad_norm": 4.78125,
"learning_rate": 3.236571879936809e-05,
"loss": 2.203,
"step": 893
},
{
"epoch": 0.35,
"grad_norm": 7.125,
"learning_rate": 3.2345971563981044e-05,
"loss": 2.3145,
"step": 894
},
{
"epoch": 0.35,
"grad_norm": 4.6875,
"learning_rate": 3.2326224328594e-05,
"loss": 2.1821,
"step": 895
},
{
"epoch": 0.35,
"grad_norm": 4.5,
"learning_rate": 3.2306477093206953e-05,
"loss": 2.0778,
"step": 896
},
{
"epoch": 0.35,
"grad_norm": 3.765625,
"learning_rate": 3.228672985781991e-05,
"loss": 1.9669,
"step": 897
},
{
"epoch": 0.35,
"grad_norm": 5.5,
"learning_rate": 3.226698262243286e-05,
"loss": 2.4555,
"step": 898
},
{
"epoch": 0.36,
"grad_norm": 5.8125,
"learning_rate": 3.224723538704582e-05,
"loss": 2.3628,
"step": 899
},
{
"epoch": 0.36,
"grad_norm": 6.90625,
"learning_rate": 3.222748815165877e-05,
"loss": 2.1712,
"step": 900
},
{
"epoch": 0.36,
"grad_norm": 5.40625,
"learning_rate": 3.220774091627172e-05,
"loss": 2.0738,
"step": 901
},
{
"epoch": 0.36,
"grad_norm": 6.40625,
"learning_rate": 3.2187993680884675e-05,
"loss": 2.021,
"step": 902
},
{
"epoch": 0.36,
"grad_norm": 5.4375,
"learning_rate": 3.216824644549763e-05,
"loss": 2.0044,
"step": 903
},
{
"epoch": 0.36,
"grad_norm": 4.03125,
"learning_rate": 3.214849921011059e-05,
"loss": 2.4063,
"step": 904
},
{
"epoch": 0.36,
"grad_norm": 4.625,
"learning_rate": 3.212875197472354e-05,
"loss": 1.9643,
"step": 905
},
{
"epoch": 0.36,
"grad_norm": 6.5625,
"learning_rate": 3.2109004739336494e-05,
"loss": 2.0587,
"step": 906
},
{
"epoch": 0.36,
"grad_norm": 4.625,
"learning_rate": 3.208925750394945e-05,
"loss": 1.9222,
"step": 907
},
{
"epoch": 0.36,
"grad_norm": 10.625,
"learning_rate": 3.20695102685624e-05,
"loss": 2.012,
"step": 908
},
{
"epoch": 0.36,
"grad_norm": 4.90625,
"learning_rate": 3.204976303317536e-05,
"loss": 2.2789,
"step": 909
},
{
"epoch": 0.36,
"grad_norm": 4.03125,
"learning_rate": 3.203001579778831e-05,
"loss": 2.3514,
"step": 910
},
{
"epoch": 0.36,
"grad_norm": 4.6875,
"learning_rate": 3.201026856240126e-05,
"loss": 1.9321,
"step": 911
},
{
"epoch": 0.36,
"grad_norm": 4.96875,
"learning_rate": 3.1990521327014215e-05,
"loss": 1.7105,
"step": 912
},
{
"epoch": 0.36,
"grad_norm": 5.65625,
"learning_rate": 3.197077409162718e-05,
"loss": 2.3593,
"step": 913
},
{
"epoch": 0.36,
"grad_norm": 4.78125,
"learning_rate": 3.195102685624013e-05,
"loss": 2.2884,
"step": 914
},
{
"epoch": 0.36,
"grad_norm": 4.75,
"learning_rate": 3.193127962085308e-05,
"loss": 2.2089,
"step": 915
},
{
"epoch": 0.36,
"grad_norm": 4.53125,
"learning_rate": 3.1911532385466034e-05,
"loss": 2.2624,
"step": 916
},
{
"epoch": 0.36,
"grad_norm": 6.0625,
"learning_rate": 3.189178515007899e-05,
"loss": 2.1718,
"step": 917
},
{
"epoch": 0.36,
"grad_norm": 5.25,
"learning_rate": 3.1872037914691944e-05,
"loss": 2.1799,
"step": 918
},
{
"epoch": 0.36,
"grad_norm": 3.6875,
"learning_rate": 3.18522906793049e-05,
"loss": 2.0073,
"step": 919
},
{
"epoch": 0.36,
"grad_norm": 6.09375,
"learning_rate": 3.183254344391785e-05,
"loss": 2.0775,
"step": 920
},
{
"epoch": 0.36,
"grad_norm": 3.5,
"learning_rate": 3.181279620853081e-05,
"loss": 1.9829,
"step": 921
},
{
"epoch": 0.36,
"grad_norm": 4.21875,
"learning_rate": 3.179304897314376e-05,
"loss": 2.1558,
"step": 922
},
{
"epoch": 0.36,
"grad_norm": 5.46875,
"learning_rate": 3.177330173775672e-05,
"loss": 2.5095,
"step": 923
},
{
"epoch": 0.36,
"grad_norm": 5.03125,
"learning_rate": 3.175355450236967e-05,
"loss": 2.2827,
"step": 924
},
{
"epoch": 0.37,
"grad_norm": 8.0625,
"learning_rate": 3.173380726698263e-05,
"loss": 2.3742,
"step": 925
},
{
"epoch": 0.37,
"grad_norm": 3.5,
"learning_rate": 3.1714060031595575e-05,
"loss": 2.2247,
"step": 926
},
{
"epoch": 0.37,
"grad_norm": 6.0,
"learning_rate": 3.169431279620853e-05,
"loss": 2.2053,
"step": 927
},
{
"epoch": 0.37,
"grad_norm": 4.40625,
"learning_rate": 3.1674565560821484e-05,
"loss": 2.076,
"step": 928
},
{
"epoch": 0.37,
"grad_norm": 3.75,
"learning_rate": 3.1654818325434446e-05,
"loss": 2.1465,
"step": 929
},
{
"epoch": 0.37,
"grad_norm": 4.875,
"learning_rate": 3.1635071090047394e-05,
"loss": 2.202,
"step": 930
},
{
"epoch": 0.37,
"grad_norm": 7.78125,
"learning_rate": 3.161532385466035e-05,
"loss": 2.1006,
"step": 931
},
{
"epoch": 0.37,
"grad_norm": 3.96875,
"learning_rate": 3.15955766192733e-05,
"loss": 1.8036,
"step": 932
},
{
"epoch": 0.37,
"grad_norm": 4.6875,
"learning_rate": 3.157582938388626e-05,
"loss": 2.2658,
"step": 933
},
{
"epoch": 0.37,
"grad_norm": 5.8125,
"learning_rate": 3.155608214849921e-05,
"loss": 2.3724,
"step": 934
},
{
"epoch": 0.37,
"grad_norm": 4.28125,
"learning_rate": 3.153633491311217e-05,
"loss": 2.0821,
"step": 935
},
{
"epoch": 0.37,
"grad_norm": 5.53125,
"learning_rate": 3.1516587677725115e-05,
"loss": 2.3037,
"step": 936
},
{
"epoch": 0.37,
"grad_norm": 4.28125,
"learning_rate": 3.149684044233807e-05,
"loss": 2.2822,
"step": 937
},
{
"epoch": 0.37,
"grad_norm": 3.9375,
"learning_rate": 3.147709320695103e-05,
"loss": 1.779,
"step": 938
},
{
"epoch": 0.37,
"grad_norm": 3.828125,
"learning_rate": 3.1457345971563986e-05,
"loss": 2.1391,
"step": 939
},
{
"epoch": 0.37,
"grad_norm": 3.671875,
"learning_rate": 3.1437598736176934e-05,
"loss": 1.9459,
"step": 940
},
{
"epoch": 0.37,
"grad_norm": 9.375,
"learning_rate": 3.141785150078989e-05,
"loss": 2.0456,
"step": 941
},
{
"epoch": 0.37,
"grad_norm": 5.5625,
"learning_rate": 3.1398104265402844e-05,
"loss": 2.2285,
"step": 942
},
{
"epoch": 0.37,
"grad_norm": 3.734375,
"learning_rate": 3.13783570300158e-05,
"loss": 2.1367,
"step": 943
},
{
"epoch": 0.37,
"grad_norm": 3.484375,
"learning_rate": 3.135860979462875e-05,
"loss": 2.1075,
"step": 944
},
{
"epoch": 0.37,
"grad_norm": 4.125,
"learning_rate": 3.133886255924171e-05,
"loss": 2.2398,
"step": 945
},
{
"epoch": 0.37,
"grad_norm": 6.40625,
"learning_rate": 3.131911532385466e-05,
"loss": 1.8532,
"step": 946
},
{
"epoch": 0.37,
"grad_norm": 4.0625,
"learning_rate": 3.129936808846762e-05,
"loss": 1.9183,
"step": 947
},
{
"epoch": 0.37,
"grad_norm": 3.375,
"learning_rate": 3.127962085308057e-05,
"loss": 2.0615,
"step": 948
},
{
"epoch": 0.37,
"grad_norm": 25.5,
"learning_rate": 3.125987361769353e-05,
"loss": 2.4542,
"step": 949
},
{
"epoch": 0.38,
"grad_norm": 3.75,
"learning_rate": 3.124012638230648e-05,
"loss": 2.0015,
"step": 950
},
{
"epoch": 0.38,
"grad_norm": 5.9375,
"learning_rate": 3.122037914691943e-05,
"loss": 2.0367,
"step": 951
},
{
"epoch": 0.38,
"grad_norm": 5.6875,
"learning_rate": 3.1200631911532384e-05,
"loss": 2.2596,
"step": 952
},
{
"epoch": 0.38,
"grad_norm": 4.78125,
"learning_rate": 3.118088467614534e-05,
"loss": 2.3189,
"step": 953
},
{
"epoch": 0.38,
"grad_norm": 4.78125,
"learning_rate": 3.11611374407583e-05,
"loss": 2.2707,
"step": 954
},
{
"epoch": 0.38,
"grad_norm": 5.0625,
"learning_rate": 3.114139020537125e-05,
"loss": 2.1067,
"step": 955
},
{
"epoch": 0.38,
"grad_norm": 5.3125,
"learning_rate": 3.11216429699842e-05,
"loss": 1.9319,
"step": 956
},
{
"epoch": 0.38,
"grad_norm": 4.84375,
"learning_rate": 3.110189573459716e-05,
"loss": 2.159,
"step": 957
},
{
"epoch": 0.38,
"grad_norm": 4.84375,
"learning_rate": 3.108214849921011e-05,
"loss": 2.1685,
"step": 958
},
{
"epoch": 0.38,
"grad_norm": 5.0625,
"learning_rate": 3.106240126382307e-05,
"loss": 2.076,
"step": 959
},
{
"epoch": 0.38,
"grad_norm": 4.59375,
"learning_rate": 3.104265402843602e-05,
"loss": 2.0845,
"step": 960
},
{
"epoch": 0.38,
"grad_norm": 3.859375,
"learning_rate": 3.102290679304897e-05,
"loss": 1.9341,
"step": 961
},
{
"epoch": 0.38,
"grad_norm": 8.6875,
"learning_rate": 3.100315955766193e-05,
"loss": 2.3594,
"step": 962
},
{
"epoch": 0.38,
"grad_norm": 6.75,
"learning_rate": 3.0983412322274886e-05,
"loss": 2.5823,
"step": 963
},
{
"epoch": 0.38,
"grad_norm": 5.5625,
"learning_rate": 3.096366508688784e-05,
"loss": 2.1992,
"step": 964
},
{
"epoch": 0.38,
"grad_norm": 5.96875,
"learning_rate": 3.0943917851500795e-05,
"loss": 1.8768,
"step": 965
},
{
"epoch": 0.38,
"grad_norm": 8.375,
"learning_rate": 3.0924170616113743e-05,
"loss": 1.9477,
"step": 966
},
{
"epoch": 0.38,
"grad_norm": 5.65625,
"learning_rate": 3.09044233807267e-05,
"loss": 2.2936,
"step": 967
},
{
"epoch": 0.38,
"grad_norm": 6.0625,
"learning_rate": 3.088467614533965e-05,
"loss": 2.0172,
"step": 968
},
{
"epoch": 0.38,
"grad_norm": 13.625,
"learning_rate": 3.086492890995261e-05,
"loss": 1.981,
"step": 969
},
{
"epoch": 0.38,
"grad_norm": 6.34375,
"learning_rate": 3.084518167456556e-05,
"loss": 2.4907,
"step": 970
},
{
"epoch": 0.38,
"grad_norm": 5.59375,
"learning_rate": 3.082543443917852e-05,
"loss": 1.8809,
"step": 971
},
{
"epoch": 0.38,
"grad_norm": 6.0625,
"learning_rate": 3.080568720379147e-05,
"loss": 1.8275,
"step": 972
},
{
"epoch": 0.38,
"grad_norm": 6.875,
"learning_rate": 3.0785939968404426e-05,
"loss": 2.2381,
"step": 973
},
{
"epoch": 0.38,
"grad_norm": 5.09375,
"learning_rate": 3.076619273301738e-05,
"loss": 1.9324,
"step": 974
},
{
"epoch": 0.39,
"grad_norm": 5.125,
"learning_rate": 3.0746445497630336e-05,
"loss": 2.0858,
"step": 975
},
{
"epoch": 0.39,
"grad_norm": 6.09375,
"learning_rate": 3.0726698262243284e-05,
"loss": 2.7086,
"step": 976
},
{
"epoch": 0.39,
"grad_norm": 4.8125,
"learning_rate": 3.070695102685624e-05,
"loss": 2.0537,
"step": 977
},
{
"epoch": 0.39,
"grad_norm": 5.5625,
"learning_rate": 3.068720379146919e-05,
"loss": 2.1612,
"step": 978
},
{
"epoch": 0.39,
"grad_norm": 4.28125,
"learning_rate": 3.0667456556082155e-05,
"loss": 1.9437,
"step": 979
},
{
"epoch": 0.39,
"grad_norm": 9.5,
"learning_rate": 3.06477093206951e-05,
"loss": 2.1381,
"step": 980
},
{
"epoch": 0.39,
"grad_norm": 5.875,
"learning_rate": 3.062796208530806e-05,
"loss": 2.1126,
"step": 981
},
{
"epoch": 0.39,
"grad_norm": 4.40625,
"learning_rate": 3.060821484992101e-05,
"loss": 1.7429,
"step": 982
},
{
"epoch": 0.39,
"grad_norm": 7.03125,
"learning_rate": 3.058846761453397e-05,
"loss": 2.189,
"step": 983
},
{
"epoch": 0.39,
"grad_norm": 6.875,
"learning_rate": 3.056872037914692e-05,
"loss": 2.4304,
"step": 984
},
{
"epoch": 0.39,
"grad_norm": 5.65625,
"learning_rate": 3.0548973143759876e-05,
"loss": 2.2723,
"step": 985
},
{
"epoch": 0.39,
"grad_norm": 5.65625,
"learning_rate": 3.0529225908372824e-05,
"loss": 2.0567,
"step": 986
},
{
"epoch": 0.39,
"grad_norm": 6.5625,
"learning_rate": 3.0509478672985786e-05,
"loss": 2.3466,
"step": 987
},
{
"epoch": 0.39,
"grad_norm": 5.90625,
"learning_rate": 3.048973143759874e-05,
"loss": 2.5319,
"step": 988
},
{
"epoch": 0.39,
"grad_norm": 10.3125,
"learning_rate": 3.0469984202211692e-05,
"loss": 2.0603,
"step": 989
},
{
"epoch": 0.39,
"grad_norm": 5.25,
"learning_rate": 3.0450236966824647e-05,
"loss": 2.1215,
"step": 990
},
{
"epoch": 0.39,
"grad_norm": 4.25,
"learning_rate": 3.04304897314376e-05,
"loss": 2.3808,
"step": 991
},
{
"epoch": 0.39,
"grad_norm": 6.03125,
"learning_rate": 3.0410742496050553e-05,
"loss": 2.2583,
"step": 992
},
{
"epoch": 0.39,
"grad_norm": 4.46875,
"learning_rate": 3.0390995260663507e-05,
"loss": 1.8414,
"step": 993
},
{
"epoch": 0.39,
"grad_norm": 5.34375,
"learning_rate": 3.0371248025276462e-05,
"loss": 2.1704,
"step": 994
},
{
"epoch": 0.39,
"grad_norm": 6.5625,
"learning_rate": 3.0351500789889413e-05,
"loss": 2.1396,
"step": 995
},
{
"epoch": 0.39,
"grad_norm": 4.875,
"learning_rate": 3.0331753554502375e-05,
"loss": 2.1264,
"step": 996
},
{
"epoch": 0.39,
"grad_norm": 5.96875,
"learning_rate": 3.0312006319115326e-05,
"loss": 2.1285,
"step": 997
},
{
"epoch": 0.39,
"grad_norm": 6.625,
"learning_rate": 3.029225908372828e-05,
"loss": 2.0612,
"step": 998
},
{
"epoch": 0.39,
"grad_norm": 4.84375,
"learning_rate": 3.0272511848341232e-05,
"loss": 1.9947,
"step": 999
},
{
"epoch": 0.39,
"grad_norm": 5.40625,
"learning_rate": 3.0252764612954187e-05,
"loss": 1.9982,
"step": 1000
},
{
"epoch": 0.4,
"grad_norm": 5.40625,
"learning_rate": 3.0233017377567142e-05,
"loss": 2.4577,
"step": 1001
},
{
"epoch": 0.4,
"grad_norm": 5.4375,
"learning_rate": 3.0213270142180093e-05,
"loss": 1.9462,
"step": 1002
},
{
"epoch": 0.4,
"grad_norm": 4.25,
"learning_rate": 3.0193522906793048e-05,
"loss": 2.1077,
"step": 1003
},
{
"epoch": 0.4,
"grad_norm": 4.8125,
"learning_rate": 3.0173775671406006e-05,
"loss": 2.2788,
"step": 1004
},
{
"epoch": 0.4,
"grad_norm": 3.78125,
"learning_rate": 3.015402843601896e-05,
"loss": 1.9781,
"step": 1005
},
{
"epoch": 0.4,
"grad_norm": 4.875,
"learning_rate": 3.0134281200631915e-05,
"loss": 2.269,
"step": 1006
},
{
"epoch": 0.4,
"grad_norm": 4.46875,
"learning_rate": 3.0114533965244867e-05,
"loss": 2.0683,
"step": 1007
},
{
"epoch": 0.4,
"grad_norm": 4.78125,
"learning_rate": 3.009478672985782e-05,
"loss": 2.1516,
"step": 1008
},
{
"epoch": 0.4,
"grad_norm": 5.34375,
"learning_rate": 3.0075039494470776e-05,
"loss": 2.1074,
"step": 1009
},
{
"epoch": 0.4,
"grad_norm": 6.65625,
"learning_rate": 3.0055292259083728e-05,
"loss": 1.9805,
"step": 1010
},
{
"epoch": 0.4,
"grad_norm": 5.625,
"learning_rate": 3.0035545023696682e-05,
"loss": 2.1971,
"step": 1011
},
{
"epoch": 0.4,
"grad_norm": 3.953125,
"learning_rate": 3.001579778830964e-05,
"loss": 1.9663,
"step": 1012
},
{
"epoch": 0.4,
"grad_norm": 6.375,
"learning_rate": 2.9996050552922595e-05,
"loss": 2.362,
"step": 1013
},
{
"epoch": 0.4,
"grad_norm": 5.1875,
"learning_rate": 2.9976303317535546e-05,
"loss": 1.9182,
"step": 1014
},
{
"epoch": 0.4,
"grad_norm": 4.4375,
"learning_rate": 2.99565560821485e-05,
"loss": 2.1931,
"step": 1015
},
{
"epoch": 0.4,
"grad_norm": 12.1875,
"learning_rate": 2.9936808846761456e-05,
"loss": 1.9034,
"step": 1016
},
{
"epoch": 0.4,
"grad_norm": 6.25,
"learning_rate": 2.9917061611374407e-05,
"loss": 1.9062,
"step": 1017
},
{
"epoch": 0.4,
"grad_norm": 5.78125,
"learning_rate": 2.9897314375987362e-05,
"loss": 2.0405,
"step": 1018
},
{
"epoch": 0.4,
"grad_norm": 6.53125,
"learning_rate": 2.9877567140600317e-05,
"loss": 2.0554,
"step": 1019
},
{
"epoch": 0.4,
"grad_norm": 6.40625,
"learning_rate": 2.9857819905213268e-05,
"loss": 2.1224,
"step": 1020
},
{
"epoch": 0.4,
"grad_norm": 5.34375,
"learning_rate": 2.983807266982623e-05,
"loss": 2.1465,
"step": 1021
},
{
"epoch": 0.4,
"grad_norm": 6.09375,
"learning_rate": 2.981832543443918e-05,
"loss": 2.2829,
"step": 1022
},
{
"epoch": 0.4,
"grad_norm": 5.84375,
"learning_rate": 2.9798578199052136e-05,
"loss": 2.1614,
"step": 1023
},
{
"epoch": 0.4,
"grad_norm": 17.75,
"learning_rate": 2.977883096366509e-05,
"loss": 2.2667,
"step": 1024
},
{
"epoch": 0.4,
"grad_norm": 7.75,
"learning_rate": 2.975908372827804e-05,
"loss": 2.0269,
"step": 1025
},
{
"epoch": 0.41,
"grad_norm": 5.375,
"learning_rate": 2.9739336492890996e-05,
"loss": 2.214,
"step": 1026
},
{
"epoch": 0.41,
"grad_norm": 7.4375,
"learning_rate": 2.9719589257503948e-05,
"loss": 2.1962,
"step": 1027
},
{
"epoch": 0.41,
"grad_norm": 6.4375,
"learning_rate": 2.9699842022116902e-05,
"loss": 1.8448,
"step": 1028
},
{
"epoch": 0.41,
"grad_norm": 3.8125,
"learning_rate": 2.968009478672986e-05,
"loss": 2.1769,
"step": 1029
},
{
"epoch": 0.41,
"grad_norm": 6.78125,
"learning_rate": 2.9660347551342815e-05,
"loss": 2.0494,
"step": 1030
},
{
"epoch": 0.41,
"grad_norm": 5.28125,
"learning_rate": 2.964060031595577e-05,
"loss": 2.0102,
"step": 1031
},
{
"epoch": 0.41,
"grad_norm": 5.1875,
"learning_rate": 2.962085308056872e-05,
"loss": 2.6173,
"step": 1032
},
{
"epoch": 0.41,
"grad_norm": 11.9375,
"learning_rate": 2.9601105845181676e-05,
"loss": 2.4153,
"step": 1033
},
{
"epoch": 0.41,
"grad_norm": 4.375,
"learning_rate": 2.958135860979463e-05,
"loss": 2.3776,
"step": 1034
},
{
"epoch": 0.41,
"grad_norm": 5.53125,
"learning_rate": 2.9561611374407582e-05,
"loss": 2.327,
"step": 1035
},
{
"epoch": 0.41,
"grad_norm": 4.5,
"learning_rate": 2.9541864139020537e-05,
"loss": 2.1199,
"step": 1036
},
{
"epoch": 0.41,
"grad_norm": 5.6875,
"learning_rate": 2.9522116903633495e-05,
"loss": 2.2445,
"step": 1037
},
{
"epoch": 0.41,
"grad_norm": 5.21875,
"learning_rate": 2.950236966824645e-05,
"loss": 2.033,
"step": 1038
},
{
"epoch": 0.41,
"grad_norm": 5.71875,
"learning_rate": 2.94826224328594e-05,
"loss": 1.7764,
"step": 1039
},
{
"epoch": 0.41,
"grad_norm": 5.375,
"learning_rate": 2.9462875197472356e-05,
"loss": 1.8907,
"step": 1040
},
{
"epoch": 0.41,
"grad_norm": 5.3125,
"learning_rate": 2.944312796208531e-05,
"loss": 1.9194,
"step": 1041
},
{
"epoch": 0.41,
"grad_norm": 6.125,
"learning_rate": 2.9423380726698262e-05,
"loss": 1.8192,
"step": 1042
},
{
"epoch": 0.41,
"grad_norm": 5.625,
"learning_rate": 2.9403633491311216e-05,
"loss": 2.0285,
"step": 1043
},
{
"epoch": 0.41,
"grad_norm": 4.75,
"learning_rate": 2.938388625592417e-05,
"loss": 1.8841,
"step": 1044
},
{
"epoch": 0.41,
"grad_norm": 6.78125,
"learning_rate": 2.9364139020537123e-05,
"loss": 2.3542,
"step": 1045
},
{
"epoch": 0.41,
"grad_norm": 4.25,
"learning_rate": 2.9344391785150084e-05,
"loss": 2.0713,
"step": 1046
},
{
"epoch": 0.41,
"grad_norm": 6.78125,
"learning_rate": 2.9324644549763035e-05,
"loss": 2.0577,
"step": 1047
},
{
"epoch": 0.41,
"grad_norm": 6.28125,
"learning_rate": 2.930489731437599e-05,
"loss": 2.1367,
"step": 1048
},
{
"epoch": 0.41,
"grad_norm": 5.25,
"learning_rate": 2.9285150078988945e-05,
"loss": 2.0427,
"step": 1049
},
{
"epoch": 0.41,
"grad_norm": 5.8125,
"learning_rate": 2.9265402843601896e-05,
"loss": 1.9849,
"step": 1050
},
{
"epoch": 0.42,
"grad_norm": 5.90625,
"learning_rate": 2.924565560821485e-05,
"loss": 2.0062,
"step": 1051
},
{
"epoch": 0.42,
"grad_norm": 9.0625,
"learning_rate": 2.9225908372827802e-05,
"loss": 2.158,
"step": 1052
},
{
"epoch": 0.42,
"grad_norm": 6.125,
"learning_rate": 2.9206161137440757e-05,
"loss": 2.3439,
"step": 1053
},
{
"epoch": 0.42,
"grad_norm": 4.71875,
"learning_rate": 2.9186413902053715e-05,
"loss": 2.0509,
"step": 1054
},
{
"epoch": 0.42,
"grad_norm": 5.03125,
"learning_rate": 2.916666666666667e-05,
"loss": 1.8932,
"step": 1055
},
{
"epoch": 0.42,
"grad_norm": 4.59375,
"learning_rate": 2.9146919431279624e-05,
"loss": 2.3145,
"step": 1056
},
{
"epoch": 0.42,
"grad_norm": 6.625,
"learning_rate": 2.9127172195892576e-05,
"loss": 2.2486,
"step": 1057
},
{
"epoch": 0.42,
"grad_norm": 6.40625,
"learning_rate": 2.910742496050553e-05,
"loss": 2.0019,
"step": 1058
},
{
"epoch": 0.42,
"grad_norm": 10.9375,
"learning_rate": 2.9087677725118485e-05,
"loss": 2.2197,
"step": 1059
},
{
"epoch": 0.42,
"grad_norm": 9.625,
"learning_rate": 2.9067930489731437e-05,
"loss": 1.9787,
"step": 1060
},
{
"epoch": 0.42,
"grad_norm": 8.1875,
"learning_rate": 2.904818325434439e-05,
"loss": 2.0667,
"step": 1061
},
{
"epoch": 0.42,
"grad_norm": 9.0,
"learning_rate": 2.902843601895735e-05,
"loss": 1.9183,
"step": 1062
},
{
"epoch": 0.42,
"grad_norm": 7.53125,
"learning_rate": 2.9008688783570304e-05,
"loss": 2.2603,
"step": 1063
},
{
"epoch": 0.42,
"grad_norm": 4.78125,
"learning_rate": 2.8988941548183255e-05,
"loss": 2.2278,
"step": 1064
},
{
"epoch": 0.42,
"grad_norm": 4.875,
"learning_rate": 2.896919431279621e-05,
"loss": 2.421,
"step": 1065
},
{
"epoch": 0.42,
"grad_norm": 6.28125,
"learning_rate": 2.8949447077409165e-05,
"loss": 2.0023,
"step": 1066
},
{
"epoch": 0.42,
"grad_norm": 5.34375,
"learning_rate": 2.8929699842022116e-05,
"loss": 2.0271,
"step": 1067
},
{
"epoch": 0.42,
"grad_norm": 5.78125,
"learning_rate": 2.890995260663507e-05,
"loss": 2.5885,
"step": 1068
},
{
"epoch": 0.42,
"grad_norm": 6.28125,
"learning_rate": 2.8890205371248026e-05,
"loss": 2.1043,
"step": 1069
},
{
"epoch": 0.42,
"grad_norm": 7.96875,
"learning_rate": 2.8870458135860977e-05,
"loss": 2.0675,
"step": 1070
},
{
"epoch": 0.42,
"grad_norm": 6.1875,
"learning_rate": 2.885071090047394e-05,
"loss": 2.1342,
"step": 1071
},
{
"epoch": 0.42,
"grad_norm": 5.8125,
"learning_rate": 2.883096366508689e-05,
"loss": 1.9258,
"step": 1072
},
{
"epoch": 0.42,
"grad_norm": 4.84375,
"learning_rate": 2.8811216429699845e-05,
"loss": 1.9811,
"step": 1073
},
{
"epoch": 0.42,
"grad_norm": 5.40625,
"learning_rate": 2.87914691943128e-05,
"loss": 1.862,
"step": 1074
},
{
"epoch": 0.42,
"grad_norm": 5.15625,
"learning_rate": 2.877172195892575e-05,
"loss": 2.3008,
"step": 1075
},
{
"epoch": 0.42,
"grad_norm": 5.5,
"learning_rate": 2.8751974723538705e-05,
"loss": 2.3399,
"step": 1076
},
{
"epoch": 0.43,
"grad_norm": 8.9375,
"learning_rate": 2.873222748815166e-05,
"loss": 2.0573,
"step": 1077
},
{
"epoch": 0.43,
"grad_norm": 6.46875,
"learning_rate": 2.871248025276461e-05,
"loss": 2.0687,
"step": 1078
},
{
"epoch": 0.43,
"grad_norm": 4.625,
"learning_rate": 2.869273301737757e-05,
"loss": 2.2145,
"step": 1079
},
{
"epoch": 0.43,
"grad_norm": 6.28125,
"learning_rate": 2.8672985781990524e-05,
"loss": 2.432,
"step": 1080
},
{
"epoch": 0.43,
"grad_norm": 6.78125,
"learning_rate": 2.865323854660348e-05,
"loss": 2.3317,
"step": 1081
},
{
"epoch": 0.43,
"grad_norm": 6.53125,
"learning_rate": 2.863349131121643e-05,
"loss": 2.2845,
"step": 1082
},
{
"epoch": 0.43,
"grad_norm": 5.0625,
"learning_rate": 2.8613744075829385e-05,
"loss": 2.0983,
"step": 1083
},
{
"epoch": 0.43,
"grad_norm": 7.28125,
"learning_rate": 2.859399684044234e-05,
"loss": 1.9619,
"step": 1084
},
{
"epoch": 0.43,
"grad_norm": 7.5,
"learning_rate": 2.857424960505529e-05,
"loss": 2.1664,
"step": 1085
},
{
"epoch": 0.43,
"grad_norm": 6.0625,
"learning_rate": 2.8554502369668246e-05,
"loss": 2.0207,
"step": 1086
},
{
"epoch": 0.43,
"grad_norm": 5.4375,
"learning_rate": 2.8534755134281204e-05,
"loss": 2.2528,
"step": 1087
},
{
"epoch": 0.43,
"grad_norm": 4.46875,
"learning_rate": 2.851500789889416e-05,
"loss": 1.9941,
"step": 1088
},
{
"epoch": 0.43,
"grad_norm": 5.09375,
"learning_rate": 2.8495260663507113e-05,
"loss": 2.0533,
"step": 1089
},
{
"epoch": 0.43,
"grad_norm": 4.28125,
"learning_rate": 2.8475513428120065e-05,
"loss": 2.1984,
"step": 1090
},
{
"epoch": 0.43,
"grad_norm": 4.5,
"learning_rate": 2.845576619273302e-05,
"loss": 2.0445,
"step": 1091
},
{
"epoch": 0.43,
"grad_norm": 7.03125,
"learning_rate": 2.843601895734597e-05,
"loss": 2.5089,
"step": 1092
},
{
"epoch": 0.43,
"grad_norm": 6.1875,
"learning_rate": 2.8416271721958926e-05,
"loss": 1.994,
"step": 1093
},
{
"epoch": 0.43,
"grad_norm": 16.0,
"learning_rate": 2.839652448657188e-05,
"loss": 2.1956,
"step": 1094
},
{
"epoch": 0.43,
"grad_norm": 4.25,
"learning_rate": 2.837677725118483e-05,
"loss": 2.2476,
"step": 1095
},
{
"epoch": 0.43,
"grad_norm": 6.5,
"learning_rate": 2.8357030015797793e-05,
"loss": 1.4839,
"step": 1096
},
{
"epoch": 0.43,
"grad_norm": 4.8125,
"learning_rate": 2.8337282780410744e-05,
"loss": 1.5829,
"step": 1097
},
{
"epoch": 0.43,
"grad_norm": 3.953125,
"learning_rate": 2.83175355450237e-05,
"loss": 1.5653,
"step": 1098
},
{
"epoch": 0.43,
"grad_norm": 3.84375,
"learning_rate": 2.8297788309636654e-05,
"loss": 2.3147,
"step": 1099
},
{
"epoch": 0.43,
"grad_norm": 4.9375,
"learning_rate": 2.8278041074249605e-05,
"loss": 2.0617,
"step": 1100
},
{
"epoch": 0.43,
"grad_norm": 7.90625,
"learning_rate": 2.825829383886256e-05,
"loss": 1.8926,
"step": 1101
},
{
"epoch": 0.44,
"grad_norm": 5.4375,
"learning_rate": 2.8238546603475515e-05,
"loss": 1.8856,
"step": 1102
},
{
"epoch": 0.44,
"grad_norm": 6.84375,
"learning_rate": 2.8218799368088466e-05,
"loss": 2.1553,
"step": 1103
},
{
"epoch": 0.44,
"grad_norm": 6.5,
"learning_rate": 2.8199052132701424e-05,
"loss": 2.444,
"step": 1104
},
{
"epoch": 0.44,
"grad_norm": 4.53125,
"learning_rate": 2.817930489731438e-05,
"loss": 1.7779,
"step": 1105
},
{
"epoch": 0.44,
"grad_norm": 6.125,
"learning_rate": 2.8159557661927334e-05,
"loss": 1.7947,
"step": 1106
},
{
"epoch": 0.44,
"grad_norm": 6.75,
"learning_rate": 2.8139810426540285e-05,
"loss": 2.4653,
"step": 1107
},
{
"epoch": 0.44,
"grad_norm": 4.0,
"learning_rate": 2.812006319115324e-05,
"loss": 2.2892,
"step": 1108
},
{
"epoch": 0.44,
"grad_norm": 8.625,
"learning_rate": 2.8100315955766194e-05,
"loss": 2.2005,
"step": 1109
},
{
"epoch": 0.44,
"grad_norm": 5.25,
"learning_rate": 2.8080568720379146e-05,
"loss": 2.0186,
"step": 1110
},
{
"epoch": 0.44,
"grad_norm": 5.625,
"learning_rate": 2.80608214849921e-05,
"loss": 2.1833,
"step": 1111
},
{
"epoch": 0.44,
"grad_norm": 4.5,
"learning_rate": 2.804107424960506e-05,
"loss": 2.147,
"step": 1112
},
{
"epoch": 0.44,
"grad_norm": 4.28125,
"learning_rate": 2.8021327014218013e-05,
"loss": 1.8431,
"step": 1113
},
{
"epoch": 0.44,
"grad_norm": 4.25,
"learning_rate": 2.8001579778830968e-05,
"loss": 1.7725,
"step": 1114
},
{
"epoch": 0.44,
"grad_norm": 5.09375,
"learning_rate": 2.798183254344392e-05,
"loss": 2.2364,
"step": 1115
},
{
"epoch": 0.44,
"grad_norm": 4.96875,
"learning_rate": 2.7962085308056874e-05,
"loss": 1.9349,
"step": 1116
},
{
"epoch": 0.44,
"grad_norm": 4.46875,
"learning_rate": 2.7942338072669825e-05,
"loss": 2.3327,
"step": 1117
},
{
"epoch": 0.44,
"grad_norm": 5.09375,
"learning_rate": 2.792259083728278e-05,
"loss": 2.2791,
"step": 1118
},
{
"epoch": 0.44,
"grad_norm": 5.0625,
"learning_rate": 2.7902843601895735e-05,
"loss": 2.0492,
"step": 1119
},
{
"epoch": 0.44,
"grad_norm": 5.0625,
"learning_rate": 2.7883096366508686e-05,
"loss": 2.011,
"step": 1120
},
{
"epoch": 0.44,
"grad_norm": 3.484375,
"learning_rate": 2.7863349131121648e-05,
"loss": 2.0302,
"step": 1121
},
{
"epoch": 0.44,
"grad_norm": 4.15625,
"learning_rate": 2.78436018957346e-05,
"loss": 2.353,
"step": 1122
},
{
"epoch": 0.44,
"grad_norm": 4.1875,
"learning_rate": 2.7823854660347554e-05,
"loss": 1.9375,
"step": 1123
},
{
"epoch": 0.44,
"grad_norm": 7.3125,
"learning_rate": 2.780410742496051e-05,
"loss": 1.9524,
"step": 1124
},
{
"epoch": 0.44,
"grad_norm": 5.40625,
"learning_rate": 2.778436018957346e-05,
"loss": 2.239,
"step": 1125
},
{
"epoch": 0.44,
"grad_norm": 4.78125,
"learning_rate": 2.7764612954186414e-05,
"loss": 2.42,
"step": 1126
},
{
"epoch": 0.45,
"grad_norm": 4.3125,
"learning_rate": 2.774486571879937e-05,
"loss": 2.0267,
"step": 1127
},
{
"epoch": 0.45,
"grad_norm": 5.09375,
"learning_rate": 2.772511848341232e-05,
"loss": 2.2728,
"step": 1128
},
{
"epoch": 0.45,
"grad_norm": 5.8125,
"learning_rate": 2.770537124802528e-05,
"loss": 2.3058,
"step": 1129
},
{
"epoch": 0.45,
"grad_norm": 7.78125,
"learning_rate": 2.7685624012638233e-05,
"loss": 2.2182,
"step": 1130
},
{
"epoch": 0.45,
"grad_norm": 7.59375,
"learning_rate": 2.7665876777251188e-05,
"loss": 2.0927,
"step": 1131
},
{
"epoch": 0.45,
"grad_norm": 6.0625,
"learning_rate": 2.764612954186414e-05,
"loss": 2.356,
"step": 1132
},
{
"epoch": 0.45,
"grad_norm": 6.15625,
"learning_rate": 2.7626382306477094e-05,
"loss": 1.9799,
"step": 1133
},
{
"epoch": 0.45,
"grad_norm": 3.734375,
"learning_rate": 2.760663507109005e-05,
"loss": 2.1737,
"step": 1134
},
{
"epoch": 0.45,
"grad_norm": 4.625,
"learning_rate": 2.7586887835703e-05,
"loss": 1.949,
"step": 1135
},
{
"epoch": 0.45,
"grad_norm": 7.59375,
"learning_rate": 2.7567140600315955e-05,
"loss": 1.7392,
"step": 1136
},
{
"epoch": 0.45,
"grad_norm": 4.9375,
"learning_rate": 2.7547393364928913e-05,
"loss": 1.6365,
"step": 1137
},
{
"epoch": 0.45,
"grad_norm": 5.84375,
"learning_rate": 2.7527646129541868e-05,
"loss": 2.1883,
"step": 1138
},
{
"epoch": 0.45,
"grad_norm": 6.5625,
"learning_rate": 2.7507898894154822e-05,
"loss": 2.316,
"step": 1139
},
{
"epoch": 0.45,
"grad_norm": 5.40625,
"learning_rate": 2.7488151658767774e-05,
"loss": 1.9597,
"step": 1140
},
{
"epoch": 0.45,
"grad_norm": 4.5625,
"learning_rate": 2.746840442338073e-05,
"loss": 2.0983,
"step": 1141
},
{
"epoch": 0.45,
"grad_norm": 4.78125,
"learning_rate": 2.7448657187993683e-05,
"loss": 1.8742,
"step": 1142
},
{
"epoch": 0.45,
"grad_norm": 7.21875,
"learning_rate": 2.7428909952606635e-05,
"loss": 2.0789,
"step": 1143
},
{
"epoch": 0.45,
"grad_norm": 4.53125,
"learning_rate": 2.740916271721959e-05,
"loss": 1.8375,
"step": 1144
},
{
"epoch": 0.45,
"grad_norm": 7.21875,
"learning_rate": 2.738941548183254e-05,
"loss": 1.894,
"step": 1145
},
{
"epoch": 0.45,
"grad_norm": 5.125,
"learning_rate": 2.7369668246445502e-05,
"loss": 1.8811,
"step": 1146
},
{
"epoch": 0.45,
"grad_norm": 4.875,
"learning_rate": 2.7349921011058453e-05,
"loss": 2.1069,
"step": 1147
},
{
"epoch": 0.45,
"grad_norm": 7.0625,
"learning_rate": 2.7330173775671408e-05,
"loss": 2.2026,
"step": 1148
},
{
"epoch": 0.45,
"grad_norm": 4.4375,
"learning_rate": 2.7310426540284363e-05,
"loss": 1.825,
"step": 1149
},
{
"epoch": 0.45,
"grad_norm": 5.65625,
"learning_rate": 2.7290679304897314e-05,
"loss": 1.8051,
"step": 1150
},
{
"epoch": 0.45,
"grad_norm": 4.34375,
"learning_rate": 2.727093206951027e-05,
"loss": 2.3469,
"step": 1151
},
{
"epoch": 0.45,
"grad_norm": 5.09375,
"learning_rate": 2.7251184834123224e-05,
"loss": 1.7316,
"step": 1152
},
{
"epoch": 0.46,
"grad_norm": 6.03125,
"learning_rate": 2.7231437598736175e-05,
"loss": 1.8558,
"step": 1153
},
{
"epoch": 0.46,
"grad_norm": 5.96875,
"learning_rate": 2.7211690363349137e-05,
"loss": 2.3068,
"step": 1154
},
{
"epoch": 0.46,
"grad_norm": 7.15625,
"learning_rate": 2.7191943127962088e-05,
"loss": 2.1935,
"step": 1155
},
{
"epoch": 0.46,
"grad_norm": 4.625,
"learning_rate": 2.7172195892575043e-05,
"loss": 2.0243,
"step": 1156
},
{
"epoch": 0.46,
"grad_norm": 4.71875,
"learning_rate": 2.7152448657187994e-05,
"loss": 1.853,
"step": 1157
},
{
"epoch": 0.46,
"grad_norm": 6.78125,
"learning_rate": 2.713270142180095e-05,
"loss": 1.9556,
"step": 1158
},
{
"epoch": 0.46,
"grad_norm": 7.34375,
"learning_rate": 2.7112954186413903e-05,
"loss": 1.9928,
"step": 1159
},
{
"epoch": 0.46,
"grad_norm": 4.375,
"learning_rate": 2.7093206951026855e-05,
"loss": 1.9273,
"step": 1160
},
{
"epoch": 0.46,
"grad_norm": 4.625,
"learning_rate": 2.707345971563981e-05,
"loss": 1.5774,
"step": 1161
},
{
"epoch": 0.46,
"grad_norm": 5.78125,
"learning_rate": 2.7053712480252768e-05,
"loss": 2.3145,
"step": 1162
},
{
"epoch": 0.46,
"grad_norm": 6.09375,
"learning_rate": 2.7033965244865722e-05,
"loss": 2.1099,
"step": 1163
},
{
"epoch": 0.46,
"grad_norm": 4.6875,
"learning_rate": 2.7014218009478677e-05,
"loss": 2.4977,
"step": 1164
},
{
"epoch": 0.46,
"grad_norm": 5.65625,
"learning_rate": 2.699447077409163e-05,
"loss": 1.8065,
"step": 1165
},
{
"epoch": 0.46,
"grad_norm": 6.09375,
"learning_rate": 2.6974723538704583e-05,
"loss": 1.8172,
"step": 1166
},
{
"epoch": 0.46,
"grad_norm": 4.3125,
"learning_rate": 2.6954976303317538e-05,
"loss": 1.7515,
"step": 1167
},
{
"epoch": 0.46,
"grad_norm": 4.90625,
"learning_rate": 2.693522906793049e-05,
"loss": 1.8238,
"step": 1168
},
{
"epoch": 0.46,
"grad_norm": 7.03125,
"learning_rate": 2.6915481832543444e-05,
"loss": 2.0191,
"step": 1169
},
{
"epoch": 0.46,
"grad_norm": 5.125,
"learning_rate": 2.6895734597156395e-05,
"loss": 2.4757,
"step": 1170
},
{
"epoch": 0.46,
"grad_norm": 5.15625,
"learning_rate": 2.6875987361769357e-05,
"loss": 2.2936,
"step": 1171
},
{
"epoch": 0.46,
"grad_norm": 7.90625,
"learning_rate": 2.6856240126382308e-05,
"loss": 2.1366,
"step": 1172
},
{
"epoch": 0.46,
"grad_norm": 4.84375,
"learning_rate": 2.6836492890995263e-05,
"loss": 2.4193,
"step": 1173
},
{
"epoch": 0.46,
"grad_norm": 4.28125,
"learning_rate": 2.6816745655608217e-05,
"loss": 1.928,
"step": 1174
},
{
"epoch": 0.46,
"grad_norm": 5.03125,
"learning_rate": 2.679699842022117e-05,
"loss": 2.2683,
"step": 1175
},
{
"epoch": 0.46,
"grad_norm": 3.4375,
"learning_rate": 2.6777251184834124e-05,
"loss": 1.9684,
"step": 1176
},
{
"epoch": 0.46,
"grad_norm": 4.46875,
"learning_rate": 2.6757503949447078e-05,
"loss": 1.8001,
"step": 1177
},
{
"epoch": 0.47,
"grad_norm": 5.3125,
"learning_rate": 2.673775671406003e-05,
"loss": 2.0277,
"step": 1178
},
{
"epoch": 0.47,
"grad_norm": 8.5,
"learning_rate": 2.671800947867299e-05,
"loss": 2.3923,
"step": 1179
},
{
"epoch": 0.47,
"grad_norm": 5.5625,
"learning_rate": 2.6698262243285942e-05,
"loss": 2.1788,
"step": 1180
},
{
"epoch": 0.47,
"grad_norm": 9.375,
"learning_rate": 2.6678515007898897e-05,
"loss": 2.0082,
"step": 1181
},
{
"epoch": 0.47,
"grad_norm": 8.375,
"learning_rate": 2.665876777251185e-05,
"loss": 1.866,
"step": 1182
},
{
"epoch": 0.47,
"grad_norm": 5.28125,
"learning_rate": 2.6639020537124803e-05,
"loss": 2.032,
"step": 1183
},
{
"epoch": 0.47,
"grad_norm": 4.75,
"learning_rate": 2.6619273301737758e-05,
"loss": 2.0754,
"step": 1184
},
{
"epoch": 0.47,
"grad_norm": 6.8125,
"learning_rate": 2.659952606635071e-05,
"loss": 2.2154,
"step": 1185
},
{
"epoch": 0.47,
"grad_norm": 4.59375,
"learning_rate": 2.6579778830963664e-05,
"loss": 2.0025,
"step": 1186
},
{
"epoch": 0.47,
"grad_norm": 6.4375,
"learning_rate": 2.6560031595576622e-05,
"loss": 2.2387,
"step": 1187
},
{
"epoch": 0.47,
"grad_norm": 5.125,
"learning_rate": 2.6540284360189577e-05,
"loss": 2.1835,
"step": 1188
},
{
"epoch": 0.47,
"grad_norm": 6.34375,
"learning_rate": 2.652053712480253e-05,
"loss": 1.9426,
"step": 1189
},
{
"epoch": 0.47,
"grad_norm": 4.59375,
"learning_rate": 2.6500789889415483e-05,
"loss": 2.0679,
"step": 1190
},
{
"epoch": 0.47,
"grad_norm": 34.5,
"learning_rate": 2.6481042654028438e-05,
"loss": 1.8497,
"step": 1191
},
{
"epoch": 0.47,
"grad_norm": 4.09375,
"learning_rate": 2.6461295418641392e-05,
"loss": 2.0029,
"step": 1192
},
{
"epoch": 0.47,
"grad_norm": 5.34375,
"learning_rate": 2.6441548183254344e-05,
"loss": 2.0559,
"step": 1193
},
{
"epoch": 0.47,
"grad_norm": 5.6875,
"learning_rate": 2.64218009478673e-05,
"loss": 2.1465,
"step": 1194
},
{
"epoch": 0.47,
"grad_norm": 4.96875,
"learning_rate": 2.640205371248025e-05,
"loss": 2.0189,
"step": 1195
},
{
"epoch": 0.47,
"grad_norm": 5.4375,
"learning_rate": 2.638230647709321e-05,
"loss": 2.1505,
"step": 1196
},
{
"epoch": 0.47,
"grad_norm": 7.90625,
"learning_rate": 2.6362559241706163e-05,
"loss": 2.2322,
"step": 1197
},
{
"epoch": 0.47,
"grad_norm": 5.28125,
"learning_rate": 2.6342812006319117e-05,
"loss": 2.1453,
"step": 1198
},
{
"epoch": 0.47,
"grad_norm": 8.75,
"learning_rate": 2.6323064770932072e-05,
"loss": 2.3321,
"step": 1199
},
{
"epoch": 0.47,
"grad_norm": 6.46875,
"learning_rate": 2.6303317535545023e-05,
"loss": 2.0895,
"step": 1200
},
{
"epoch": 0.47,
"grad_norm": 5.25,
"learning_rate": 2.6283570300157978e-05,
"loss": 2.3932,
"step": 1201
},
{
"epoch": 0.47,
"grad_norm": 5.3125,
"learning_rate": 2.6263823064770933e-05,
"loss": 2.0331,
"step": 1202
},
{
"epoch": 0.48,
"grad_norm": 5.0625,
"learning_rate": 2.6244075829383884e-05,
"loss": 2.1201,
"step": 1203
},
{
"epoch": 0.48,
"grad_norm": 6.96875,
"learning_rate": 2.6224328593996846e-05,
"loss": 1.9008,
"step": 1204
},
{
"epoch": 0.48,
"grad_norm": 11.6875,
"learning_rate": 2.6204581358609797e-05,
"loss": 2.1719,
"step": 1205
},
{
"epoch": 0.48,
"grad_norm": 4.28125,
"learning_rate": 2.618483412322275e-05,
"loss": 1.9865,
"step": 1206
},
{
"epoch": 0.48,
"grad_norm": 5.625,
"learning_rate": 2.6165086887835706e-05,
"loss": 1.9773,
"step": 1207
},
{
"epoch": 0.48,
"grad_norm": 5.40625,
"learning_rate": 2.6145339652448658e-05,
"loss": 1.7541,
"step": 1208
},
{
"epoch": 0.48,
"grad_norm": 4.46875,
"learning_rate": 2.6125592417061612e-05,
"loss": 2.0981,
"step": 1209
},
{
"epoch": 0.48,
"grad_norm": 6.65625,
"learning_rate": 2.6105845181674564e-05,
"loss": 2.1391,
"step": 1210
},
{
"epoch": 0.48,
"grad_norm": 4.125,
"learning_rate": 2.608609794628752e-05,
"loss": 1.9251,
"step": 1211
},
{
"epoch": 0.48,
"grad_norm": 5.96875,
"learning_rate": 2.6066350710900477e-05,
"loss": 2.4607,
"step": 1212
},
{
"epoch": 0.48,
"grad_norm": 5.25,
"learning_rate": 2.604660347551343e-05,
"loss": 2.1851,
"step": 1213
},
{
"epoch": 0.48,
"grad_norm": 11.3125,
"learning_rate": 2.6026856240126386e-05,
"loss": 2.0093,
"step": 1214
},
{
"epoch": 0.48,
"grad_norm": 6.5625,
"learning_rate": 2.6007109004739337e-05,
"loss": 1.9003,
"step": 1215
},
{
"epoch": 0.48,
"grad_norm": 4.59375,
"learning_rate": 2.5987361769352292e-05,
"loss": 2.0282,
"step": 1216
},
{
"epoch": 0.48,
"grad_norm": 6.84375,
"learning_rate": 2.5967614533965247e-05,
"loss": 2.1637,
"step": 1217
},
{
"epoch": 0.48,
"grad_norm": 5.84375,
"learning_rate": 2.5947867298578198e-05,
"loss": 1.9705,
"step": 1218
},
{
"epoch": 0.48,
"grad_norm": 4.875,
"learning_rate": 2.5928120063191153e-05,
"loss": 1.7375,
"step": 1219
},
{
"epoch": 0.48,
"grad_norm": 9.5,
"learning_rate": 2.5908372827804108e-05,
"loss": 1.6187,
"step": 1220
},
{
"epoch": 0.48,
"grad_norm": 8.3125,
"learning_rate": 2.5888625592417066e-05,
"loss": 2.1687,
"step": 1221
},
{
"epoch": 0.48,
"grad_norm": 6.1875,
"learning_rate": 2.5868878357030017e-05,
"loss": 1.8998,
"step": 1222
},
{
"epoch": 0.48,
"grad_norm": 7.0,
"learning_rate": 2.5849131121642972e-05,
"loss": 2.0586,
"step": 1223
},
{
"epoch": 0.48,
"grad_norm": 8.875,
"learning_rate": 2.5829383886255927e-05,
"loss": 2.4329,
"step": 1224
},
{
"epoch": 0.48,
"grad_norm": 4.8125,
"learning_rate": 2.5809636650868878e-05,
"loss": 1.7362,
"step": 1225
},
{
"epoch": 0.48,
"grad_norm": 6.46875,
"learning_rate": 2.5789889415481833e-05,
"loss": 2.2331,
"step": 1226
},
{
"epoch": 0.48,
"grad_norm": 8.4375,
"learning_rate": 2.5770142180094787e-05,
"loss": 2.426,
"step": 1227
},
{
"epoch": 0.48,
"grad_norm": 6.875,
"learning_rate": 2.575039494470774e-05,
"loss": 2.3051,
"step": 1228
},
{
"epoch": 0.49,
"grad_norm": 5.78125,
"learning_rate": 2.57306477093207e-05,
"loss": 2.4133,
"step": 1229
},
{
"epoch": 0.49,
"grad_norm": 4.28125,
"learning_rate": 2.571090047393365e-05,
"loss": 2.1426,
"step": 1230
},
{
"epoch": 0.49,
"grad_norm": 5.53125,
"learning_rate": 2.5691153238546606e-05,
"loss": 2.3813,
"step": 1231
},
{
"epoch": 0.49,
"grad_norm": 4.34375,
"learning_rate": 2.567140600315956e-05,
"loss": 1.9091,
"step": 1232
},
{
"epoch": 0.49,
"grad_norm": 5.375,
"learning_rate": 2.5651658767772512e-05,
"loss": 1.8847,
"step": 1233
},
{
"epoch": 0.49,
"grad_norm": 6.1875,
"learning_rate": 2.5631911532385467e-05,
"loss": 1.9896,
"step": 1234
},
{
"epoch": 0.49,
"grad_norm": 4.8125,
"learning_rate": 2.561216429699842e-05,
"loss": 2.2392,
"step": 1235
},
{
"epoch": 0.49,
"grad_norm": 5.84375,
"learning_rate": 2.5592417061611373e-05,
"loss": 1.6979,
"step": 1236
},
{
"epoch": 0.49,
"grad_norm": 5.6875,
"learning_rate": 2.557266982622433e-05,
"loss": 1.7742,
"step": 1237
},
{
"epoch": 0.49,
"grad_norm": 5.5625,
"learning_rate": 2.5552922590837286e-05,
"loss": 2.0146,
"step": 1238
},
{
"epoch": 0.49,
"grad_norm": 6.59375,
"learning_rate": 2.553317535545024e-05,
"loss": 1.9333,
"step": 1239
},
{
"epoch": 0.49,
"grad_norm": 4.53125,
"learning_rate": 2.5513428120063192e-05,
"loss": 2.0606,
"step": 1240
},
{
"epoch": 0.49,
"grad_norm": 5.0,
"learning_rate": 2.5493680884676147e-05,
"loss": 2.1148,
"step": 1241
},
{
"epoch": 0.49,
"grad_norm": 4.84375,
"learning_rate": 2.54739336492891e-05,
"loss": 2.1905,
"step": 1242
},
{
"epoch": 0.49,
"grad_norm": 4.9375,
"learning_rate": 2.5454186413902053e-05,
"loss": 2.0304,
"step": 1243
},
{
"epoch": 0.49,
"grad_norm": 5.53125,
"learning_rate": 2.5434439178515007e-05,
"loss": 1.9301,
"step": 1244
},
{
"epoch": 0.49,
"grad_norm": 8.5,
"learning_rate": 2.5414691943127962e-05,
"loss": 2.1242,
"step": 1245
},
{
"epoch": 0.49,
"grad_norm": 4.03125,
"learning_rate": 2.539494470774092e-05,
"loss": 2.3577,
"step": 1246
},
{
"epoch": 0.49,
"grad_norm": 5.75,
"learning_rate": 2.537519747235387e-05,
"loss": 2.4534,
"step": 1247
},
{
"epoch": 0.49,
"grad_norm": 5.53125,
"learning_rate": 2.5355450236966826e-05,
"loss": 2.3253,
"step": 1248
},
{
"epoch": 0.49,
"grad_norm": 5.4375,
"learning_rate": 2.533570300157978e-05,
"loss": 2.3095,
"step": 1249
},
{
"epoch": 0.49,
"grad_norm": 9.75,
"learning_rate": 2.5315955766192732e-05,
"loss": 2.1109,
"step": 1250
},
{
"epoch": 0.49,
"grad_norm": 4.9375,
"learning_rate": 2.5296208530805687e-05,
"loss": 2.2647,
"step": 1251
},
{
"epoch": 0.49,
"grad_norm": 4.09375,
"learning_rate": 2.5276461295418642e-05,
"loss": 1.9459,
"step": 1252
},
{
"epoch": 0.49,
"grad_norm": 6.59375,
"learning_rate": 2.5256714060031593e-05,
"loss": 1.9907,
"step": 1253
},
{
"epoch": 0.5,
"grad_norm": 5.90625,
"learning_rate": 2.5236966824644555e-05,
"loss": 2.085,
"step": 1254
},
{
"epoch": 0.5,
"grad_norm": 8.5625,
"learning_rate": 2.5217219589257506e-05,
"loss": 2.198,
"step": 1255
},
{
"epoch": 0.5,
"grad_norm": 10.25,
"learning_rate": 2.519747235387046e-05,
"loss": 2.1337,
"step": 1256
},
{
"epoch": 0.5,
"grad_norm": 5.0,
"learning_rate": 2.5177725118483415e-05,
"loss": 1.9642,
"step": 1257
},
{
"epoch": 0.5,
"grad_norm": 5.5625,
"learning_rate": 2.5157977883096367e-05,
"loss": 1.9634,
"step": 1258
},
{
"epoch": 0.5,
"grad_norm": 5.90625,
"learning_rate": 2.513823064770932e-05,
"loss": 2.2505,
"step": 1259
},
{
"epoch": 0.5,
"grad_norm": 5.5,
"learning_rate": 2.5118483412322273e-05,
"loss": 2.0375,
"step": 1260
},
{
"epoch": 0.5,
"grad_norm": 5.21875,
"learning_rate": 2.5098736176935228e-05,
"loss": 1.9021,
"step": 1261
},
{
"epoch": 0.5,
"grad_norm": 5.3125,
"learning_rate": 2.5078988941548186e-05,
"loss": 1.8203,
"step": 1262
},
{
"epoch": 0.5,
"grad_norm": 5.03125,
"learning_rate": 2.505924170616114e-05,
"loss": 2.2707,
"step": 1263
},
{
"epoch": 0.5,
"grad_norm": 5.96875,
"learning_rate": 2.5039494470774095e-05,
"loss": 1.8947,
"step": 1264
},
{
"epoch": 0.5,
"grad_norm": 4.9375,
"learning_rate": 2.5019747235387046e-05,
"loss": 1.8469,
"step": 1265
},
{
"epoch": 0.5,
"grad_norm": 5.625,
"learning_rate": 2.5e-05,
"loss": 2.1186,
"step": 1266
},
{
"epoch": 0.5,
"grad_norm": 7.5625,
"learning_rate": 2.4980252764612956e-05,
"loss": 2.3777,
"step": 1267
},
{
"epoch": 0.5,
"grad_norm": 5.0625,
"learning_rate": 2.4960505529225907e-05,
"loss": 1.8658,
"step": 1268
},
{
"epoch": 0.5,
"grad_norm": 7.59375,
"learning_rate": 2.4940758293838865e-05,
"loss": 2.2078,
"step": 1269
},
{
"epoch": 0.5,
"grad_norm": 5.40625,
"learning_rate": 2.4921011058451817e-05,
"loss": 2.1179,
"step": 1270
},
{
"epoch": 0.5,
"grad_norm": 5.21875,
"learning_rate": 2.490126382306477e-05,
"loss": 1.988,
"step": 1271
},
{
"epoch": 0.5,
"grad_norm": 6.0625,
"learning_rate": 2.4881516587677726e-05,
"loss": 1.7059,
"step": 1272
},
{
"epoch": 0.5,
"grad_norm": 7.65625,
"learning_rate": 2.486176935229068e-05,
"loss": 2.1586,
"step": 1273
},
{
"epoch": 0.5,
"grad_norm": 4.25,
"learning_rate": 2.4842022116903636e-05,
"loss": 1.9436,
"step": 1274
},
{
"epoch": 0.5,
"grad_norm": 7.375,
"learning_rate": 2.4822274881516587e-05,
"loss": 1.9944,
"step": 1275
},
{
"epoch": 0.5,
"grad_norm": 6.25,
"learning_rate": 2.480252764612954e-05,
"loss": 2.2967,
"step": 1276
},
{
"epoch": 0.5,
"grad_norm": 5.84375,
"learning_rate": 2.47827804107425e-05,
"loss": 2.0202,
"step": 1277
},
{
"epoch": 0.5,
"grad_norm": 5.34375,
"learning_rate": 2.476303317535545e-05,
"loss": 1.7282,
"step": 1278
},
{
"epoch": 0.51,
"grad_norm": 3.671875,
"learning_rate": 2.4743285939968406e-05,
"loss": 2.2839,
"step": 1279
},
{
"epoch": 0.51,
"grad_norm": 4.90625,
"learning_rate": 2.4723538704581357e-05,
"loss": 1.7881,
"step": 1280
},
{
"epoch": 0.51,
"grad_norm": 9.5625,
"learning_rate": 2.4703791469194315e-05,
"loss": 1.7582,
"step": 1281
},
{
"epoch": 0.51,
"grad_norm": 6.1875,
"learning_rate": 2.468404423380727e-05,
"loss": 1.7937,
"step": 1282
},
{
"epoch": 0.51,
"grad_norm": 5.75,
"learning_rate": 2.466429699842022e-05,
"loss": 2.1018,
"step": 1283
},
{
"epoch": 0.51,
"grad_norm": 7.21875,
"learning_rate": 2.4644549763033176e-05,
"loss": 2.1169,
"step": 1284
},
{
"epoch": 0.51,
"grad_norm": 5.34375,
"learning_rate": 2.462480252764613e-05,
"loss": 1.8524,
"step": 1285
},
{
"epoch": 0.51,
"grad_norm": 5.375,
"learning_rate": 2.4605055292259086e-05,
"loss": 2.0906,
"step": 1286
},
{
"epoch": 0.51,
"grad_norm": 4.90625,
"learning_rate": 2.458530805687204e-05,
"loss": 1.6387,
"step": 1287
},
{
"epoch": 0.51,
"grad_norm": 5.71875,
"learning_rate": 2.456556082148499e-05,
"loss": 1.6387,
"step": 1288
},
{
"epoch": 0.51,
"grad_norm": 6.46875,
"learning_rate": 2.454581358609795e-05,
"loss": 1.958,
"step": 1289
},
{
"epoch": 0.51,
"grad_norm": 10.375,
"learning_rate": 2.45260663507109e-05,
"loss": 2.2495,
"step": 1290
},
{
"epoch": 0.51,
"grad_norm": 6.59375,
"learning_rate": 2.4506319115323856e-05,
"loss": 2.3776,
"step": 1291
},
{
"epoch": 0.51,
"grad_norm": 6.09375,
"learning_rate": 2.448657187993681e-05,
"loss": 2.1806,
"step": 1292
},
{
"epoch": 0.51,
"grad_norm": 5.78125,
"learning_rate": 2.4466824644549762e-05,
"loss": 1.9989,
"step": 1293
},
{
"epoch": 0.51,
"grad_norm": 8.625,
"learning_rate": 2.444707740916272e-05,
"loss": 2.0007,
"step": 1294
},
{
"epoch": 0.51,
"grad_norm": 4.96875,
"learning_rate": 2.442733017377567e-05,
"loss": 1.8216,
"step": 1295
},
{
"epoch": 0.51,
"grad_norm": 6.84375,
"learning_rate": 2.4407582938388626e-05,
"loss": 2.411,
"step": 1296
},
{
"epoch": 0.51,
"grad_norm": 4.625,
"learning_rate": 2.438783570300158e-05,
"loss": 1.9321,
"step": 1297
},
{
"epoch": 0.51,
"grad_norm": 6.5,
"learning_rate": 2.4368088467614535e-05,
"loss": 2.0865,
"step": 1298
},
{
"epoch": 0.51,
"grad_norm": 4.625,
"learning_rate": 2.434834123222749e-05,
"loss": 1.8693,
"step": 1299
},
{
"epoch": 0.51,
"grad_norm": 5.03125,
"learning_rate": 2.432859399684044e-05,
"loss": 1.8448,
"step": 1300
},
{
"epoch": 0.51,
"grad_norm": 5.21875,
"learning_rate": 2.4308846761453396e-05,
"loss": 2.111,
"step": 1301
},
{
"epoch": 0.51,
"grad_norm": 12.75,
"learning_rate": 2.4289099526066354e-05,
"loss": 2.298,
"step": 1302
},
{
"epoch": 0.51,
"grad_norm": 6.0625,
"learning_rate": 2.4269352290679306e-05,
"loss": 1.7997,
"step": 1303
},
{
"epoch": 0.52,
"grad_norm": 7.53125,
"learning_rate": 2.424960505529226e-05,
"loss": 2.0493,
"step": 1304
},
{
"epoch": 0.52,
"grad_norm": 5.5625,
"learning_rate": 2.4229857819905215e-05,
"loss": 1.9611,
"step": 1305
},
{
"epoch": 0.52,
"grad_norm": 9.125,
"learning_rate": 2.421011058451817e-05,
"loss": 2.1518,
"step": 1306
},
{
"epoch": 0.52,
"grad_norm": 6.5625,
"learning_rate": 2.4190363349131125e-05,
"loss": 2.2332,
"step": 1307
},
{
"epoch": 0.52,
"grad_norm": 5.15625,
"learning_rate": 2.4170616113744076e-05,
"loss": 2.2586,
"step": 1308
},
{
"epoch": 0.52,
"grad_norm": 5.1875,
"learning_rate": 2.415086887835703e-05,
"loss": 2.035,
"step": 1309
},
{
"epoch": 0.52,
"grad_norm": 11.5625,
"learning_rate": 2.4131121642969985e-05,
"loss": 2.1551,
"step": 1310
},
{
"epoch": 0.52,
"grad_norm": 4.40625,
"learning_rate": 2.411137440758294e-05,
"loss": 1.9999,
"step": 1311
},
{
"epoch": 0.52,
"grad_norm": 4.46875,
"learning_rate": 2.4091627172195895e-05,
"loss": 1.7868,
"step": 1312
},
{
"epoch": 0.52,
"grad_norm": 7.125,
"learning_rate": 2.4071879936808846e-05,
"loss": 1.9033,
"step": 1313
},
{
"epoch": 0.52,
"grad_norm": 6.40625,
"learning_rate": 2.4052132701421804e-05,
"loss": 2.0659,
"step": 1314
},
{
"epoch": 0.52,
"grad_norm": 5.90625,
"learning_rate": 2.4032385466034756e-05,
"loss": 2.7933,
"step": 1315
},
{
"epoch": 0.52,
"grad_norm": 6.65625,
"learning_rate": 2.401263823064771e-05,
"loss": 2.2674,
"step": 1316
},
{
"epoch": 0.52,
"grad_norm": 6.59375,
"learning_rate": 2.3992890995260665e-05,
"loss": 1.7674,
"step": 1317
},
{
"epoch": 0.52,
"grad_norm": 5.34375,
"learning_rate": 2.3973143759873616e-05,
"loss": 2.2664,
"step": 1318
},
{
"epoch": 0.52,
"grad_norm": 6.09375,
"learning_rate": 2.3953396524486574e-05,
"loss": 2.2321,
"step": 1319
},
{
"epoch": 0.52,
"grad_norm": 5.53125,
"learning_rate": 2.3933649289099526e-05,
"loss": 1.888,
"step": 1320
},
{
"epoch": 0.52,
"grad_norm": 5.8125,
"learning_rate": 2.391390205371248e-05,
"loss": 2.0791,
"step": 1321
},
{
"epoch": 0.52,
"grad_norm": 5.3125,
"learning_rate": 2.3894154818325435e-05,
"loss": 1.9902,
"step": 1322
},
{
"epoch": 0.52,
"grad_norm": 5.28125,
"learning_rate": 2.387440758293839e-05,
"loss": 2.256,
"step": 1323
},
{
"epoch": 0.52,
"grad_norm": 5.46875,
"learning_rate": 2.3854660347551345e-05,
"loss": 2.1788,
"step": 1324
},
{
"epoch": 0.52,
"grad_norm": 6.625,
"learning_rate": 2.3834913112164296e-05,
"loss": 1.8647,
"step": 1325
},
{
"epoch": 0.52,
"grad_norm": 4.5625,
"learning_rate": 2.381516587677725e-05,
"loss": 2.0878,
"step": 1326
},
{
"epoch": 0.52,
"grad_norm": 7.28125,
"learning_rate": 2.379541864139021e-05,
"loss": 1.7149,
"step": 1327
},
{
"epoch": 0.52,
"grad_norm": 6.4375,
"learning_rate": 2.377567140600316e-05,
"loss": 2.2036,
"step": 1328
},
{
"epoch": 0.52,
"grad_norm": 5.90625,
"learning_rate": 2.3755924170616115e-05,
"loss": 1.9812,
"step": 1329
},
{
"epoch": 0.53,
"grad_norm": 6.1875,
"learning_rate": 2.373617693522907e-05,
"loss": 2.2046,
"step": 1330
},
{
"epoch": 0.53,
"grad_norm": 5.875,
"learning_rate": 2.3716429699842024e-05,
"loss": 1.768,
"step": 1331
},
{
"epoch": 0.53,
"grad_norm": 8.375,
"learning_rate": 2.369668246445498e-05,
"loss": 2.072,
"step": 1332
},
{
"epoch": 0.53,
"grad_norm": 8.0,
"learning_rate": 2.367693522906793e-05,
"loss": 2.09,
"step": 1333
},
{
"epoch": 0.53,
"grad_norm": 8.8125,
"learning_rate": 2.3657187993680885e-05,
"loss": 2.127,
"step": 1334
},
{
"epoch": 0.53,
"grad_norm": 5.5,
"learning_rate": 2.363744075829384e-05,
"loss": 1.7886,
"step": 1335
},
{
"epoch": 0.53,
"grad_norm": 7.34375,
"learning_rate": 2.3617693522906795e-05,
"loss": 2.1402,
"step": 1336
},
{
"epoch": 0.53,
"grad_norm": 6.5,
"learning_rate": 2.359794628751975e-05,
"loss": 2.2388,
"step": 1337
},
{
"epoch": 0.53,
"grad_norm": 4.53125,
"learning_rate": 2.35781990521327e-05,
"loss": 2.1949,
"step": 1338
},
{
"epoch": 0.53,
"grad_norm": 7.0625,
"learning_rate": 2.355845181674566e-05,
"loss": 1.7207,
"step": 1339
},
{
"epoch": 0.53,
"grad_norm": 5.125,
"learning_rate": 2.353870458135861e-05,
"loss": 1.827,
"step": 1340
},
{
"epoch": 0.53,
"grad_norm": 8.9375,
"learning_rate": 2.3518957345971565e-05,
"loss": 2.4086,
"step": 1341
},
{
"epoch": 0.53,
"grad_norm": 6.53125,
"learning_rate": 2.349921011058452e-05,
"loss": 2.2284,
"step": 1342
},
{
"epoch": 0.53,
"grad_norm": 4.34375,
"learning_rate": 2.347946287519747e-05,
"loss": 2.2509,
"step": 1343
},
{
"epoch": 0.53,
"grad_norm": 5.3125,
"learning_rate": 2.345971563981043e-05,
"loss": 2.2758,
"step": 1344
},
{
"epoch": 0.53,
"grad_norm": 7.84375,
"learning_rate": 2.343996840442338e-05,
"loss": 2.0413,
"step": 1345
},
{
"epoch": 0.53,
"grad_norm": 6.9375,
"learning_rate": 2.3420221169036335e-05,
"loss": 2.0006,
"step": 1346
},
{
"epoch": 0.53,
"grad_norm": 5.28125,
"learning_rate": 2.340047393364929e-05,
"loss": 2.0867,
"step": 1347
},
{
"epoch": 0.53,
"grad_norm": 4.5625,
"learning_rate": 2.3380726698262244e-05,
"loss": 2.1669,
"step": 1348
},
{
"epoch": 0.53,
"grad_norm": 5.28125,
"learning_rate": 2.33609794628752e-05,
"loss": 2.0839,
"step": 1349
},
{
"epoch": 0.53,
"grad_norm": 5.03125,
"learning_rate": 2.3341232227488154e-05,
"loss": 1.9204,
"step": 1350
},
{
"epoch": 0.53,
"grad_norm": 5.5625,
"learning_rate": 2.3321484992101105e-05,
"loss": 2.0454,
"step": 1351
},
{
"epoch": 0.53,
"grad_norm": 4.84375,
"learning_rate": 2.3301737756714063e-05,
"loss": 2.0868,
"step": 1352
},
{
"epoch": 0.53,
"grad_norm": 5.0,
"learning_rate": 2.3281990521327015e-05,
"loss": 1.872,
"step": 1353
},
{
"epoch": 0.53,
"grad_norm": 9.0625,
"learning_rate": 2.326224328593997e-05,
"loss": 2.4362,
"step": 1354
},
{
"epoch": 0.54,
"grad_norm": 5.9375,
"learning_rate": 2.3242496050552924e-05,
"loss": 2.2791,
"step": 1355
},
{
"epoch": 0.54,
"grad_norm": 5.5,
"learning_rate": 2.322274881516588e-05,
"loss": 1.865,
"step": 1356
},
{
"epoch": 0.54,
"grad_norm": 5.78125,
"learning_rate": 2.3203001579778834e-05,
"loss": 1.8422,
"step": 1357
},
{
"epoch": 0.54,
"grad_norm": 5.46875,
"learning_rate": 2.3183254344391785e-05,
"loss": 1.618,
"step": 1358
},
{
"epoch": 0.54,
"grad_norm": 4.6875,
"learning_rate": 2.316350710900474e-05,
"loss": 1.9734,
"step": 1359
},
{
"epoch": 0.54,
"grad_norm": 5.15625,
"learning_rate": 2.3143759873617694e-05,
"loss": 2.1669,
"step": 1360
},
{
"epoch": 0.54,
"grad_norm": 6.8125,
"learning_rate": 2.312401263823065e-05,
"loss": 1.9085,
"step": 1361
},
{
"epoch": 0.54,
"grad_norm": 9.875,
"learning_rate": 2.3104265402843604e-05,
"loss": 2.3026,
"step": 1362
},
{
"epoch": 0.54,
"grad_norm": 5.71875,
"learning_rate": 2.3084518167456555e-05,
"loss": 1.9714,
"step": 1363
},
{
"epoch": 0.54,
"grad_norm": 7.28125,
"learning_rate": 2.3064770932069513e-05,
"loss": 2.184,
"step": 1364
},
{
"epoch": 0.54,
"grad_norm": 10.1875,
"learning_rate": 2.3045023696682465e-05,
"loss": 2.2147,
"step": 1365
},
{
"epoch": 0.54,
"grad_norm": 6.03125,
"learning_rate": 2.302527646129542e-05,
"loss": 1.9486,
"step": 1366
},
{
"epoch": 0.54,
"grad_norm": 7.09375,
"learning_rate": 2.3005529225908374e-05,
"loss": 2.0125,
"step": 1367
},
{
"epoch": 0.54,
"grad_norm": 18.125,
"learning_rate": 2.2985781990521325e-05,
"loss": 2.1586,
"step": 1368
},
{
"epoch": 0.54,
"grad_norm": 7.3125,
"learning_rate": 2.2966034755134283e-05,
"loss": 2.0031,
"step": 1369
},
{
"epoch": 0.54,
"grad_norm": 6.03125,
"learning_rate": 2.2946287519747238e-05,
"loss": 2.0109,
"step": 1370
},
{
"epoch": 0.54,
"grad_norm": 5.9375,
"learning_rate": 2.292654028436019e-05,
"loss": 2.0119,
"step": 1371
},
{
"epoch": 0.54,
"grad_norm": 6.0625,
"learning_rate": 2.2906793048973144e-05,
"loss": 1.7232,
"step": 1372
},
{
"epoch": 0.54,
"grad_norm": 8.1875,
"learning_rate": 2.28870458135861e-05,
"loss": 2.3399,
"step": 1373
},
{
"epoch": 0.54,
"grad_norm": 5.71875,
"learning_rate": 2.2867298578199054e-05,
"loss": 1.7866,
"step": 1374
},
{
"epoch": 0.54,
"grad_norm": 5.875,
"learning_rate": 2.284755134281201e-05,
"loss": 1.5963,
"step": 1375
},
{
"epoch": 0.54,
"grad_norm": 5.65625,
"learning_rate": 2.282780410742496e-05,
"loss": 2.0123,
"step": 1376
},
{
"epoch": 0.54,
"grad_norm": 6.1875,
"learning_rate": 2.2808056872037918e-05,
"loss": 1.902,
"step": 1377
},
{
"epoch": 0.54,
"grad_norm": 5.78125,
"learning_rate": 2.278830963665087e-05,
"loss": 1.9016,
"step": 1378
},
{
"epoch": 0.54,
"grad_norm": 6.34375,
"learning_rate": 2.2768562401263824e-05,
"loss": 2.2819,
"step": 1379
},
{
"epoch": 0.55,
"grad_norm": 6.0625,
"learning_rate": 2.274881516587678e-05,
"loss": 2.0796,
"step": 1380
},
{
"epoch": 0.55,
"grad_norm": 7.3125,
"learning_rate": 2.2729067930489733e-05,
"loss": 1.7969,
"step": 1381
},
{
"epoch": 0.55,
"grad_norm": 7.125,
"learning_rate": 2.2709320695102688e-05,
"loss": 1.9306,
"step": 1382
},
{
"epoch": 0.55,
"grad_norm": 5.0,
"learning_rate": 2.268957345971564e-05,
"loss": 1.8823,
"step": 1383
},
{
"epoch": 0.55,
"grad_norm": 6.15625,
"learning_rate": 2.2669826224328594e-05,
"loss": 2.0985,
"step": 1384
},
{
"epoch": 0.55,
"grad_norm": 17.75,
"learning_rate": 2.265007898894155e-05,
"loss": 1.8407,
"step": 1385
},
{
"epoch": 0.55,
"grad_norm": 5.25,
"learning_rate": 2.2630331753554504e-05,
"loss": 1.8747,
"step": 1386
},
{
"epoch": 0.55,
"grad_norm": 6.25,
"learning_rate": 2.261058451816746e-05,
"loss": 2.2043,
"step": 1387
},
{
"epoch": 0.55,
"grad_norm": 9.0625,
"learning_rate": 2.259083728278041e-05,
"loss": 1.9946,
"step": 1388
},
{
"epoch": 0.55,
"grad_norm": 6.90625,
"learning_rate": 2.2571090047393368e-05,
"loss": 1.7495,
"step": 1389
},
{
"epoch": 0.55,
"grad_norm": 6.0,
"learning_rate": 2.255134281200632e-05,
"loss": 2.0695,
"step": 1390
},
{
"epoch": 0.55,
"grad_norm": 8.6875,
"learning_rate": 2.2531595576619274e-05,
"loss": 1.6823,
"step": 1391
},
{
"epoch": 0.55,
"grad_norm": 6.8125,
"learning_rate": 2.251184834123223e-05,
"loss": 1.9839,
"step": 1392
},
{
"epoch": 0.55,
"grad_norm": 9.5,
"learning_rate": 2.249210110584518e-05,
"loss": 2.0417,
"step": 1393
},
{
"epoch": 0.55,
"grad_norm": 7.40625,
"learning_rate": 2.2472353870458138e-05,
"loss": 1.8293,
"step": 1394
},
{
"epoch": 0.55,
"grad_norm": 5.34375,
"learning_rate": 2.2452606635071093e-05,
"loss": 1.9432,
"step": 1395
},
{
"epoch": 0.55,
"grad_norm": 5.15625,
"learning_rate": 2.2432859399684044e-05,
"loss": 2.0315,
"step": 1396
},
{
"epoch": 0.55,
"grad_norm": 7.59375,
"learning_rate": 2.2413112164297e-05,
"loss": 2.1926,
"step": 1397
},
{
"epoch": 0.55,
"grad_norm": 6.25,
"learning_rate": 2.2393364928909954e-05,
"loss": 2.4238,
"step": 1398
},
{
"epoch": 0.55,
"grad_norm": 3.921875,
"learning_rate": 2.2373617693522908e-05,
"loss": 1.9422,
"step": 1399
},
{
"epoch": 0.55,
"grad_norm": 7.59375,
"learning_rate": 2.2353870458135863e-05,
"loss": 1.772,
"step": 1400
},
{
"epoch": 0.55,
"grad_norm": 4.625,
"learning_rate": 2.2334123222748814e-05,
"loss": 2.3523,
"step": 1401
},
{
"epoch": 0.55,
"grad_norm": 4.09375,
"learning_rate": 2.2314375987361772e-05,
"loss": 2.1442,
"step": 1402
},
{
"epoch": 0.55,
"grad_norm": 5.1875,
"learning_rate": 2.2294628751974724e-05,
"loss": 1.8683,
"step": 1403
},
{
"epoch": 0.55,
"grad_norm": 5.78125,
"learning_rate": 2.227488151658768e-05,
"loss": 2.3855,
"step": 1404
},
{
"epoch": 0.55,
"grad_norm": 7.3125,
"learning_rate": 2.2255134281200633e-05,
"loss": 1.9594,
"step": 1405
},
{
"epoch": 0.56,
"grad_norm": 5.3125,
"learning_rate": 2.2235387045813588e-05,
"loss": 1.8106,
"step": 1406
},
{
"epoch": 0.56,
"grad_norm": 5.21875,
"learning_rate": 2.2215639810426543e-05,
"loss": 1.6813,
"step": 1407
},
{
"epoch": 0.56,
"grad_norm": 5.59375,
"learning_rate": 2.2195892575039494e-05,
"loss": 2.0662,
"step": 1408
},
{
"epoch": 0.56,
"grad_norm": 6.125,
"learning_rate": 2.217614533965245e-05,
"loss": 2.1793,
"step": 1409
},
{
"epoch": 0.56,
"grad_norm": 9.8125,
"learning_rate": 2.2156398104265403e-05,
"loss": 2.3651,
"step": 1410
},
{
"epoch": 0.56,
"grad_norm": 5.96875,
"learning_rate": 2.2136650868878358e-05,
"loss": 1.7811,
"step": 1411
},
{
"epoch": 0.56,
"grad_norm": 7.375,
"learning_rate": 2.2116903633491313e-05,
"loss": 2.2002,
"step": 1412
},
{
"epoch": 0.56,
"grad_norm": 4.8125,
"learning_rate": 2.2097156398104264e-05,
"loss": 1.5208,
"step": 1413
},
{
"epoch": 0.56,
"grad_norm": 6.96875,
"learning_rate": 2.2077409162717222e-05,
"loss": 2.2671,
"step": 1414
},
{
"epoch": 0.56,
"grad_norm": 8.25,
"learning_rate": 2.2057661927330177e-05,
"loss": 1.9222,
"step": 1415
},
{
"epoch": 0.56,
"grad_norm": 6.53125,
"learning_rate": 2.203791469194313e-05,
"loss": 1.7868,
"step": 1416
},
{
"epoch": 0.56,
"grad_norm": 10.6875,
"learning_rate": 2.2018167456556083e-05,
"loss": 1.9241,
"step": 1417
},
{
"epoch": 0.56,
"grad_norm": 7.375,
"learning_rate": 2.1998420221169034e-05,
"loss": 2.0531,
"step": 1418
},
{
"epoch": 0.56,
"grad_norm": 13.25,
"learning_rate": 2.1978672985781993e-05,
"loss": 2.187,
"step": 1419
},
{
"epoch": 0.56,
"grad_norm": 7.0625,
"learning_rate": 2.1958925750394947e-05,
"loss": 2.1615,
"step": 1420
},
{
"epoch": 0.56,
"grad_norm": 7.8125,
"learning_rate": 2.19391785150079e-05,
"loss": 1.9472,
"step": 1421
},
{
"epoch": 0.56,
"grad_norm": 8.8125,
"learning_rate": 2.1919431279620853e-05,
"loss": 1.7138,
"step": 1422
},
{
"epoch": 0.56,
"grad_norm": 17.875,
"learning_rate": 2.1899684044233808e-05,
"loss": 2.0935,
"step": 1423
},
{
"epoch": 0.56,
"grad_norm": 7.65625,
"learning_rate": 2.1879936808846763e-05,
"loss": 2.2063,
"step": 1424
},
{
"epoch": 0.56,
"grad_norm": 7.96875,
"learning_rate": 2.1860189573459718e-05,
"loss": 2.0975,
"step": 1425
},
{
"epoch": 0.56,
"grad_norm": 13.125,
"learning_rate": 2.184044233807267e-05,
"loss": 2.2844,
"step": 1426
},
{
"epoch": 0.56,
"grad_norm": 4.4375,
"learning_rate": 2.1820695102685627e-05,
"loss": 1.8359,
"step": 1427
},
{
"epoch": 0.56,
"grad_norm": 5.59375,
"learning_rate": 2.1800947867298578e-05,
"loss": 1.964,
"step": 1428
},
{
"epoch": 0.56,
"grad_norm": 9.4375,
"learning_rate": 2.1781200631911533e-05,
"loss": 2.1198,
"step": 1429
},
{
"epoch": 0.56,
"grad_norm": 8.4375,
"learning_rate": 2.1761453396524488e-05,
"loss": 1.9694,
"step": 1430
},
{
"epoch": 0.57,
"grad_norm": 4.875,
"learning_rate": 2.1741706161137442e-05,
"loss": 2.056,
"step": 1431
},
{
"epoch": 0.57,
"grad_norm": 14.6875,
"learning_rate": 2.1721958925750397e-05,
"loss": 2.4612,
"step": 1432
},
{
"epoch": 0.57,
"grad_norm": 7.09375,
"learning_rate": 2.170221169036335e-05,
"loss": 2.3723,
"step": 1433
},
{
"epoch": 0.57,
"grad_norm": 6.96875,
"learning_rate": 2.1682464454976303e-05,
"loss": 2.3522,
"step": 1434
},
{
"epoch": 0.57,
"grad_norm": 4.96875,
"learning_rate": 2.166271721958926e-05,
"loss": 2.049,
"step": 1435
},
{
"epoch": 0.57,
"grad_norm": 6.3125,
"learning_rate": 2.1642969984202213e-05,
"loss": 1.9485,
"step": 1436
},
{
"epoch": 0.57,
"grad_norm": 7.125,
"learning_rate": 2.1623222748815167e-05,
"loss": 2.081,
"step": 1437
},
{
"epoch": 0.57,
"grad_norm": 5.84375,
"learning_rate": 2.160347551342812e-05,
"loss": 2.0336,
"step": 1438
},
{
"epoch": 0.57,
"grad_norm": 6.1875,
"learning_rate": 2.1583728278041077e-05,
"loss": 2.123,
"step": 1439
},
{
"epoch": 0.57,
"grad_norm": 5.75,
"learning_rate": 2.156398104265403e-05,
"loss": 2.0289,
"step": 1440
},
{
"epoch": 0.57,
"grad_norm": 10.1875,
"learning_rate": 2.1544233807266983e-05,
"loss": 1.8162,
"step": 1441
},
{
"epoch": 0.57,
"grad_norm": 6.125,
"learning_rate": 2.1524486571879938e-05,
"loss": 1.711,
"step": 1442
},
{
"epoch": 0.57,
"grad_norm": 6.96875,
"learning_rate": 2.1504739336492892e-05,
"loss": 1.975,
"step": 1443
},
{
"epoch": 0.57,
"grad_norm": 6.09375,
"learning_rate": 2.1484992101105847e-05,
"loss": 1.9348,
"step": 1444
},
{
"epoch": 0.57,
"grad_norm": 6.46875,
"learning_rate": 2.1465244865718802e-05,
"loss": 2.0724,
"step": 1445
},
{
"epoch": 0.57,
"grad_norm": 9.4375,
"learning_rate": 2.1445497630331753e-05,
"loss": 2.0855,
"step": 1446
},
{
"epoch": 0.57,
"grad_norm": 8.4375,
"learning_rate": 2.1425750394944708e-05,
"loss": 1.9357,
"step": 1447
},
{
"epoch": 0.57,
"grad_norm": 6.15625,
"learning_rate": 2.1406003159557663e-05,
"loss": 1.7124,
"step": 1448
},
{
"epoch": 0.57,
"grad_norm": 4.96875,
"learning_rate": 2.1386255924170617e-05,
"loss": 2.0998,
"step": 1449
},
{
"epoch": 0.57,
"grad_norm": 7.03125,
"learning_rate": 2.1366508688783572e-05,
"loss": 1.9383,
"step": 1450
},
{
"epoch": 0.57,
"grad_norm": 11.625,
"learning_rate": 2.1346761453396523e-05,
"loss": 2.2265,
"step": 1451
},
{
"epoch": 0.57,
"grad_norm": 6.5,
"learning_rate": 2.132701421800948e-05,
"loss": 1.7533,
"step": 1452
},
{
"epoch": 0.57,
"grad_norm": 5.625,
"learning_rate": 2.1307266982622433e-05,
"loss": 1.814,
"step": 1453
},
{
"epoch": 0.57,
"grad_norm": 7.8125,
"learning_rate": 2.1287519747235388e-05,
"loss": 2.4784,
"step": 1454
},
{
"epoch": 0.57,
"grad_norm": 8.8125,
"learning_rate": 2.1267772511848342e-05,
"loss": 2.2497,
"step": 1455
},
{
"epoch": 0.58,
"grad_norm": 8.1875,
"learning_rate": 2.1248025276461297e-05,
"loss": 1.9518,
"step": 1456
},
{
"epoch": 0.58,
"grad_norm": 18.0,
"learning_rate": 2.1228278041074252e-05,
"loss": 1.7681,
"step": 1457
},
{
"epoch": 0.58,
"grad_norm": 7.78125,
"learning_rate": 2.1208530805687203e-05,
"loss": 1.9108,
"step": 1458
},
{
"epoch": 0.58,
"grad_norm": 8.125,
"learning_rate": 2.1188783570300158e-05,
"loss": 2.3868,
"step": 1459
},
{
"epoch": 0.58,
"grad_norm": 4.71875,
"learning_rate": 2.1169036334913116e-05,
"loss": 2.0207,
"step": 1460
},
{
"epoch": 0.58,
"grad_norm": 9.0,
"learning_rate": 2.1149289099526067e-05,
"loss": 1.8951,
"step": 1461
},
{
"epoch": 0.58,
"grad_norm": 6.59375,
"learning_rate": 2.1129541864139022e-05,
"loss": 2.1832,
"step": 1462
},
{
"epoch": 0.58,
"grad_norm": 6.90625,
"learning_rate": 2.1109794628751973e-05,
"loss": 1.7732,
"step": 1463
},
{
"epoch": 0.58,
"grad_norm": 5.40625,
"learning_rate": 2.109004739336493e-05,
"loss": 2.161,
"step": 1464
},
{
"epoch": 0.58,
"grad_norm": 4.75,
"learning_rate": 2.1070300157977886e-05,
"loss": 2.0491,
"step": 1465
},
{
"epoch": 0.58,
"grad_norm": 5.1875,
"learning_rate": 2.1050552922590837e-05,
"loss": 1.6109,
"step": 1466
},
{
"epoch": 0.58,
"grad_norm": 8.125,
"learning_rate": 2.1030805687203792e-05,
"loss": 2.2055,
"step": 1467
},
{
"epoch": 0.58,
"grad_norm": 6.125,
"learning_rate": 2.1011058451816747e-05,
"loss": 1.9164,
"step": 1468
},
{
"epoch": 0.58,
"grad_norm": 5.1875,
"learning_rate": 2.09913112164297e-05,
"loss": 1.7696,
"step": 1469
},
{
"epoch": 0.58,
"grad_norm": 7.15625,
"learning_rate": 2.0971563981042656e-05,
"loss": 2.011,
"step": 1470
},
{
"epoch": 0.58,
"grad_norm": 6.59375,
"learning_rate": 2.0951816745655608e-05,
"loss": 2.1029,
"step": 1471
},
{
"epoch": 0.58,
"grad_norm": 6.125,
"learning_rate": 2.0932069510268562e-05,
"loss": 2.0383,
"step": 1472
},
{
"epoch": 0.58,
"grad_norm": 9.0625,
"learning_rate": 2.0912322274881517e-05,
"loss": 2.0953,
"step": 1473
},
{
"epoch": 0.58,
"grad_norm": 6.53125,
"learning_rate": 2.0892575039494472e-05,
"loss": 2.3033,
"step": 1474
},
{
"epoch": 0.58,
"grad_norm": 5.25,
"learning_rate": 2.0872827804107427e-05,
"loss": 2.0421,
"step": 1475
},
{
"epoch": 0.58,
"grad_norm": 8.125,
"learning_rate": 2.0853080568720378e-05,
"loss": 2.4017,
"step": 1476
},
{
"epoch": 0.58,
"grad_norm": 4.78125,
"learning_rate": 2.0833333333333336e-05,
"loss": 1.8599,
"step": 1477
},
{
"epoch": 0.58,
"grad_norm": 5.34375,
"learning_rate": 2.0813586097946287e-05,
"loss": 2.1698,
"step": 1478
},
{
"epoch": 0.58,
"grad_norm": 7.78125,
"learning_rate": 2.0793838862559242e-05,
"loss": 2.126,
"step": 1479
},
{
"epoch": 0.58,
"grad_norm": 4.8125,
"learning_rate": 2.0774091627172197e-05,
"loss": 1.7458,
"step": 1480
},
{
"epoch": 0.58,
"grad_norm": 4.6875,
"learning_rate": 2.075434439178515e-05,
"loss": 1.8589,
"step": 1481
},
{
"epoch": 0.59,
"grad_norm": 7.0625,
"learning_rate": 2.0734597156398106e-05,
"loss": 2.2365,
"step": 1482
},
{
"epoch": 0.59,
"grad_norm": 7.1875,
"learning_rate": 2.0714849921011058e-05,
"loss": 1.8649,
"step": 1483
},
{
"epoch": 0.59,
"grad_norm": 8.375,
"learning_rate": 2.0695102685624012e-05,
"loss": 1.9744,
"step": 1484
},
{
"epoch": 0.59,
"grad_norm": 7.625,
"learning_rate": 2.067535545023697e-05,
"loss": 1.9504,
"step": 1485
},
{
"epoch": 0.59,
"grad_norm": 6.875,
"learning_rate": 2.0655608214849922e-05,
"loss": 2.304,
"step": 1486
},
{
"epoch": 0.59,
"grad_norm": 8.5,
"learning_rate": 2.0635860979462876e-05,
"loss": 2.0815,
"step": 1487
},
{
"epoch": 0.59,
"grad_norm": 6.40625,
"learning_rate": 2.061611374407583e-05,
"loss": 1.5787,
"step": 1488
},
{
"epoch": 0.59,
"grad_norm": 7.3125,
"learning_rate": 2.0596366508688786e-05,
"loss": 2.2125,
"step": 1489
},
{
"epoch": 0.59,
"grad_norm": 4.5625,
"learning_rate": 2.057661927330174e-05,
"loss": 2.2363,
"step": 1490
},
{
"epoch": 0.59,
"grad_norm": 5.1875,
"learning_rate": 2.0556872037914692e-05,
"loss": 1.9022,
"step": 1491
},
{
"epoch": 0.59,
"grad_norm": 4.625,
"learning_rate": 2.0537124802527647e-05,
"loss": 1.855,
"step": 1492
},
{
"epoch": 0.59,
"grad_norm": 14.9375,
"learning_rate": 2.05173775671406e-05,
"loss": 1.7785,
"step": 1493
},
{
"epoch": 0.59,
"grad_norm": 7.875,
"learning_rate": 2.0497630331753556e-05,
"loss": 2.0754,
"step": 1494
},
{
"epoch": 0.59,
"grad_norm": 12.125,
"learning_rate": 2.047788309636651e-05,
"loss": 2.2401,
"step": 1495
},
{
"epoch": 0.59,
"grad_norm": 5.46875,
"learning_rate": 2.0458135860979462e-05,
"loss": 1.8243,
"step": 1496
},
{
"epoch": 0.59,
"grad_norm": 5.71875,
"learning_rate": 2.0438388625592417e-05,
"loss": 1.8719,
"step": 1497
},
{
"epoch": 0.59,
"grad_norm": 5.90625,
"learning_rate": 2.041864139020537e-05,
"loss": 2.1369,
"step": 1498
},
{
"epoch": 0.59,
"grad_norm": 8.125,
"learning_rate": 2.0398894154818326e-05,
"loss": 2.4337,
"step": 1499
},
{
"epoch": 0.59,
"grad_norm": 7.75,
"learning_rate": 2.037914691943128e-05,
"loss": 1.8668,
"step": 1500
},
{
"epoch": 0.59,
"grad_norm": 6.28125,
"learning_rate": 2.0359399684044232e-05,
"loss": 1.9654,
"step": 1501
},
{
"epoch": 0.59,
"grad_norm": 8.875,
"learning_rate": 2.033965244865719e-05,
"loss": 2.1204,
"step": 1502
},
{
"epoch": 0.59,
"grad_norm": 7.34375,
"learning_rate": 2.0319905213270142e-05,
"loss": 2.0558,
"step": 1503
},
{
"epoch": 0.59,
"grad_norm": 7.625,
"learning_rate": 2.0300157977883097e-05,
"loss": 2.3101,
"step": 1504
},
{
"epoch": 0.59,
"grad_norm": 4.5,
"learning_rate": 2.028041074249605e-05,
"loss": 2.197,
"step": 1505
},
{
"epoch": 0.59,
"grad_norm": 6.125,
"learning_rate": 2.0260663507109006e-05,
"loss": 1.9409,
"step": 1506
},
{
"epoch": 0.6,
"grad_norm": 5.53125,
"learning_rate": 2.024091627172196e-05,
"loss": 2.2683,
"step": 1507
},
{
"epoch": 0.6,
"grad_norm": 8.25,
"learning_rate": 2.0221169036334912e-05,
"loss": 2.0666,
"step": 1508
},
{
"epoch": 0.6,
"grad_norm": 4.875,
"learning_rate": 2.0201421800947867e-05,
"loss": 2.2434,
"step": 1509
},
{
"epoch": 0.6,
"grad_norm": 6.0625,
"learning_rate": 2.0181674565560825e-05,
"loss": 2.0695,
"step": 1510
},
{
"epoch": 0.6,
"grad_norm": 6.46875,
"learning_rate": 2.0161927330173776e-05,
"loss": 1.7471,
"step": 1511
},
{
"epoch": 0.6,
"grad_norm": 7.3125,
"learning_rate": 2.014218009478673e-05,
"loss": 1.8171,
"step": 1512
},
{
"epoch": 0.6,
"grad_norm": 5.03125,
"learning_rate": 2.0122432859399686e-05,
"loss": 2.1872,
"step": 1513
},
{
"epoch": 0.6,
"grad_norm": 8.5625,
"learning_rate": 2.010268562401264e-05,
"loss": 1.8838,
"step": 1514
},
{
"epoch": 0.6,
"grad_norm": 5.875,
"learning_rate": 2.0082938388625595e-05,
"loss": 1.8742,
"step": 1515
},
{
"epoch": 0.6,
"grad_norm": 9.625,
"learning_rate": 2.0063191153238547e-05,
"loss": 2.1511,
"step": 1516
},
{
"epoch": 0.6,
"grad_norm": 5.6875,
"learning_rate": 2.00434439178515e-05,
"loss": 1.8989,
"step": 1517
},
{
"epoch": 0.6,
"grad_norm": 5.0,
"learning_rate": 2.0023696682464456e-05,
"loss": 1.9555,
"step": 1518
},
{
"epoch": 0.6,
"grad_norm": 5.125,
"learning_rate": 2.000394944707741e-05,
"loss": 1.9911,
"step": 1519
},
{
"epoch": 0.6,
"grad_norm": 7.46875,
"learning_rate": 1.9984202211690365e-05,
"loss": 1.8243,
"step": 1520
},
{
"epoch": 0.6,
"grad_norm": 7.1875,
"learning_rate": 1.9964454976303317e-05,
"loss": 2.2367,
"step": 1521
},
{
"epoch": 0.6,
"grad_norm": 8.625,
"learning_rate": 1.994470774091627e-05,
"loss": 2.0971,
"step": 1522
},
{
"epoch": 0.6,
"grad_norm": 11.1875,
"learning_rate": 1.9924960505529226e-05,
"loss": 2.4126,
"step": 1523
},
{
"epoch": 0.6,
"grad_norm": 7.09375,
"learning_rate": 1.990521327014218e-05,
"loss": 2.1638,
"step": 1524
},
{
"epoch": 0.6,
"grad_norm": 10.9375,
"learning_rate": 1.9885466034755136e-05,
"loss": 1.8433,
"step": 1525
},
{
"epoch": 0.6,
"grad_norm": 5.75,
"learning_rate": 1.9865718799368087e-05,
"loss": 1.9653,
"step": 1526
},
{
"epoch": 0.6,
"grad_norm": 11.5625,
"learning_rate": 1.9845971563981045e-05,
"loss": 2.2064,
"step": 1527
},
{
"epoch": 0.6,
"grad_norm": 5.6875,
"learning_rate": 1.9826224328593996e-05,
"loss": 1.8401,
"step": 1528
},
{
"epoch": 0.6,
"grad_norm": 6.46875,
"learning_rate": 1.980647709320695e-05,
"loss": 2.3478,
"step": 1529
},
{
"epoch": 0.6,
"grad_norm": 7.0,
"learning_rate": 1.9786729857819906e-05,
"loss": 1.6537,
"step": 1530
},
{
"epoch": 0.6,
"grad_norm": 4.5625,
"learning_rate": 1.976698262243286e-05,
"loss": 1.5274,
"step": 1531
},
{
"epoch": 0.61,
"grad_norm": 9.8125,
"learning_rate": 1.9747235387045815e-05,
"loss": 1.6213,
"step": 1532
},
{
"epoch": 0.61,
"grad_norm": 6.4375,
"learning_rate": 1.972748815165877e-05,
"loss": 1.6356,
"step": 1533
},
{
"epoch": 0.61,
"grad_norm": 4.6875,
"learning_rate": 1.970774091627172e-05,
"loss": 1.7562,
"step": 1534
},
{
"epoch": 0.61,
"grad_norm": 5.78125,
"learning_rate": 1.968799368088468e-05,
"loss": 2.0014,
"step": 1535
},
{
"epoch": 0.61,
"grad_norm": 8.375,
"learning_rate": 1.966824644549763e-05,
"loss": 2.1738,
"step": 1536
},
{
"epoch": 0.61,
"grad_norm": 13.75,
"learning_rate": 1.9648499210110586e-05,
"loss": 2.0728,
"step": 1537
},
{
"epoch": 0.61,
"grad_norm": 6.25,
"learning_rate": 1.962875197472354e-05,
"loss": 1.7464,
"step": 1538
},
{
"epoch": 0.61,
"grad_norm": 6.1875,
"learning_rate": 1.9609004739336495e-05,
"loss": 1.8631,
"step": 1539
},
{
"epoch": 0.61,
"grad_norm": 6.78125,
"learning_rate": 1.958925750394945e-05,
"loss": 2.061,
"step": 1540
},
{
"epoch": 0.61,
"grad_norm": 8.375,
"learning_rate": 1.95695102685624e-05,
"loss": 2.0003,
"step": 1541
},
{
"epoch": 0.61,
"grad_norm": 9.625,
"learning_rate": 1.9549763033175356e-05,
"loss": 2.2223,
"step": 1542
},
{
"epoch": 0.61,
"grad_norm": 11.0,
"learning_rate": 1.953001579778831e-05,
"loss": 2.0986,
"step": 1543
},
{
"epoch": 0.61,
"grad_norm": 7.625,
"learning_rate": 1.9510268562401265e-05,
"loss": 1.9166,
"step": 1544
},
{
"epoch": 0.61,
"grad_norm": 6.90625,
"learning_rate": 1.949052132701422e-05,
"loss": 2.0719,
"step": 1545
},
{
"epoch": 0.61,
"grad_norm": 7.875,
"learning_rate": 1.947077409162717e-05,
"loss": 1.9695,
"step": 1546
},
{
"epoch": 0.61,
"grad_norm": 5.84375,
"learning_rate": 1.9451026856240126e-05,
"loss": 1.7976,
"step": 1547
},
{
"epoch": 0.61,
"grad_norm": 7.1875,
"learning_rate": 1.943127962085308e-05,
"loss": 1.9442,
"step": 1548
},
{
"epoch": 0.61,
"grad_norm": 7.15625,
"learning_rate": 1.9411532385466035e-05,
"loss": 2.1809,
"step": 1549
},
{
"epoch": 0.61,
"grad_norm": 6.4375,
"learning_rate": 1.939178515007899e-05,
"loss": 1.9701,
"step": 1550
},
{
"epoch": 0.61,
"grad_norm": 6.84375,
"learning_rate": 1.937203791469194e-05,
"loss": 2.3102,
"step": 1551
},
{
"epoch": 0.61,
"grad_norm": 6.21875,
"learning_rate": 1.93522906793049e-05,
"loss": 1.9105,
"step": 1552
},
{
"epoch": 0.61,
"grad_norm": 6.21875,
"learning_rate": 1.9332543443917854e-05,
"loss": 1.7592,
"step": 1553
},
{
"epoch": 0.61,
"grad_norm": 7.53125,
"learning_rate": 1.9312796208530806e-05,
"loss": 2.1038,
"step": 1554
},
{
"epoch": 0.61,
"grad_norm": 6.75,
"learning_rate": 1.929304897314376e-05,
"loss": 2.0143,
"step": 1555
},
{
"epoch": 0.61,
"grad_norm": 5.90625,
"learning_rate": 1.9273301737756715e-05,
"loss": 2.1352,
"step": 1556
},
{
"epoch": 0.61,
"grad_norm": 6.0,
"learning_rate": 1.925355450236967e-05,
"loss": 2.1001,
"step": 1557
},
{
"epoch": 0.62,
"grad_norm": 9.1875,
"learning_rate": 1.9233807266982625e-05,
"loss": 1.9158,
"step": 1558
},
{
"epoch": 0.62,
"grad_norm": 6.65625,
"learning_rate": 1.9214060031595576e-05,
"loss": 2.3518,
"step": 1559
},
{
"epoch": 0.62,
"grad_norm": 7.34375,
"learning_rate": 1.9194312796208534e-05,
"loss": 2.191,
"step": 1560
},
{
"epoch": 0.62,
"grad_norm": 9.875,
"learning_rate": 1.9174565560821485e-05,
"loss": 1.8369,
"step": 1561
},
{
"epoch": 0.62,
"grad_norm": 5.46875,
"learning_rate": 1.915481832543444e-05,
"loss": 2.1488,
"step": 1562
},
{
"epoch": 0.62,
"grad_norm": 7.96875,
"learning_rate": 1.9135071090047395e-05,
"loss": 2.0566,
"step": 1563
},
{
"epoch": 0.62,
"grad_norm": 5.96875,
"learning_rate": 1.911532385466035e-05,
"loss": 1.8963,
"step": 1564
},
{
"epoch": 0.62,
"grad_norm": 7.78125,
"learning_rate": 1.9095576619273304e-05,
"loss": 2.1597,
"step": 1565
},
{
"epoch": 0.62,
"grad_norm": 8.75,
"learning_rate": 1.9075829383886256e-05,
"loss": 2.1049,
"step": 1566
},
{
"epoch": 0.62,
"grad_norm": 6.4375,
"learning_rate": 1.905608214849921e-05,
"loss": 1.9015,
"step": 1567
},
{
"epoch": 0.62,
"grad_norm": 7.75,
"learning_rate": 1.9036334913112165e-05,
"loss": 2.0288,
"step": 1568
},
{
"epoch": 0.62,
"grad_norm": 7.21875,
"learning_rate": 1.901658767772512e-05,
"loss": 1.8377,
"step": 1569
},
{
"epoch": 0.62,
"grad_norm": 7.65625,
"learning_rate": 1.8996840442338074e-05,
"loss": 2.2305,
"step": 1570
},
{
"epoch": 0.62,
"grad_norm": 9.75,
"learning_rate": 1.8977093206951026e-05,
"loss": 2.3968,
"step": 1571
},
{
"epoch": 0.62,
"grad_norm": 4.8125,
"learning_rate": 1.895734597156398e-05,
"loss": 1.7733,
"step": 1572
},
{
"epoch": 0.62,
"grad_norm": 5.0,
"learning_rate": 1.8937598736176935e-05,
"loss": 2.0269,
"step": 1573
},
{
"epoch": 0.62,
"grad_norm": 6.8125,
"learning_rate": 1.891785150078989e-05,
"loss": 2.0282,
"step": 1574
},
{
"epoch": 0.62,
"grad_norm": 6.875,
"learning_rate": 1.8898104265402845e-05,
"loss": 1.5773,
"step": 1575
},
{
"epoch": 0.62,
"grad_norm": 10.625,
"learning_rate": 1.8878357030015796e-05,
"loss": 1.8388,
"step": 1576
},
{
"epoch": 0.62,
"grad_norm": 9.125,
"learning_rate": 1.8858609794628754e-05,
"loss": 1.8759,
"step": 1577
},
{
"epoch": 0.62,
"grad_norm": 7.34375,
"learning_rate": 1.883886255924171e-05,
"loss": 1.9554,
"step": 1578
},
{
"epoch": 0.62,
"grad_norm": 7.90625,
"learning_rate": 1.881911532385466e-05,
"loss": 2.2813,
"step": 1579
},
{
"epoch": 0.62,
"grad_norm": 9.1875,
"learning_rate": 1.8799368088467615e-05,
"loss": 2.1975,
"step": 1580
},
{
"epoch": 0.62,
"grad_norm": 7.375,
"learning_rate": 1.877962085308057e-05,
"loss": 1.9664,
"step": 1581
},
{
"epoch": 0.62,
"grad_norm": 5.90625,
"learning_rate": 1.8759873617693524e-05,
"loss": 1.9054,
"step": 1582
},
{
"epoch": 0.63,
"grad_norm": 9.125,
"learning_rate": 1.874012638230648e-05,
"loss": 1.9299,
"step": 1583
},
{
"epoch": 0.63,
"grad_norm": 7.03125,
"learning_rate": 1.872037914691943e-05,
"loss": 2.133,
"step": 1584
},
{
"epoch": 0.63,
"grad_norm": 5.90625,
"learning_rate": 1.870063191153239e-05,
"loss": 2.055,
"step": 1585
},
{
"epoch": 0.63,
"grad_norm": 5.6875,
"learning_rate": 1.868088467614534e-05,
"loss": 1.9604,
"step": 1586
},
{
"epoch": 0.63,
"grad_norm": 8.5,
"learning_rate": 1.8661137440758295e-05,
"loss": 2.2078,
"step": 1587
},
{
"epoch": 0.63,
"grad_norm": 14.0,
"learning_rate": 1.864139020537125e-05,
"loss": 2.0295,
"step": 1588
},
{
"epoch": 0.63,
"grad_norm": 7.21875,
"learning_rate": 1.8621642969984204e-05,
"loss": 2.1574,
"step": 1589
},
{
"epoch": 0.63,
"grad_norm": 5.28125,
"learning_rate": 1.860189573459716e-05,
"loss": 1.663,
"step": 1590
},
{
"epoch": 0.63,
"grad_norm": 7.1875,
"learning_rate": 1.858214849921011e-05,
"loss": 2.0338,
"step": 1591
},
{
"epoch": 0.63,
"grad_norm": 6.34375,
"learning_rate": 1.8562401263823065e-05,
"loss": 2.2551,
"step": 1592
},
{
"epoch": 0.63,
"grad_norm": 7.21875,
"learning_rate": 1.854265402843602e-05,
"loss": 2.0029,
"step": 1593
},
{
"epoch": 0.63,
"grad_norm": 8.25,
"learning_rate": 1.8522906793048974e-05,
"loss": 2.1709,
"step": 1594
},
{
"epoch": 0.63,
"grad_norm": 7.875,
"learning_rate": 1.850315955766193e-05,
"loss": 2.061,
"step": 1595
},
{
"epoch": 0.63,
"grad_norm": 5.8125,
"learning_rate": 1.848341232227488e-05,
"loss": 1.9071,
"step": 1596
},
{
"epoch": 0.63,
"grad_norm": 4.96875,
"learning_rate": 1.8463665086887835e-05,
"loss": 2.0342,
"step": 1597
},
{
"epoch": 0.63,
"grad_norm": 5.5625,
"learning_rate": 1.8443917851500793e-05,
"loss": 1.8498,
"step": 1598
},
{
"epoch": 0.63,
"grad_norm": 13.9375,
"learning_rate": 1.8424170616113745e-05,
"loss": 2.6764,
"step": 1599
},
{
"epoch": 0.63,
"grad_norm": 6.15625,
"learning_rate": 1.84044233807267e-05,
"loss": 1.8037,
"step": 1600
},
{
"epoch": 0.63,
"grad_norm": 6.40625,
"learning_rate": 1.838467614533965e-05,
"loss": 1.97,
"step": 1601
},
{
"epoch": 0.63,
"grad_norm": 7.90625,
"learning_rate": 1.836492890995261e-05,
"loss": 2.0161,
"step": 1602
},
{
"epoch": 0.63,
"grad_norm": 7.84375,
"learning_rate": 1.8345181674565563e-05,
"loss": 1.8518,
"step": 1603
},
{
"epoch": 0.63,
"grad_norm": 6.84375,
"learning_rate": 1.8325434439178515e-05,
"loss": 1.89,
"step": 1604
},
{
"epoch": 0.63,
"grad_norm": 5.15625,
"learning_rate": 1.830568720379147e-05,
"loss": 2.0131,
"step": 1605
},
{
"epoch": 0.63,
"grad_norm": 10.125,
"learning_rate": 1.8285939968404424e-05,
"loss": 1.854,
"step": 1606
},
{
"epoch": 0.63,
"grad_norm": 6.96875,
"learning_rate": 1.826619273301738e-05,
"loss": 1.6231,
"step": 1607
},
{
"epoch": 0.64,
"grad_norm": 5.3125,
"learning_rate": 1.8246445497630334e-05,
"loss": 2.2025,
"step": 1608
},
{
"epoch": 0.64,
"grad_norm": 5.125,
"learning_rate": 1.8226698262243285e-05,
"loss": 1.9041,
"step": 1609
},
{
"epoch": 0.64,
"grad_norm": 13.0,
"learning_rate": 1.8206951026856243e-05,
"loss": 2.2722,
"step": 1610
},
{
"epoch": 0.64,
"grad_norm": 5.96875,
"learning_rate": 1.8187203791469194e-05,
"loss": 1.7949,
"step": 1611
},
{
"epoch": 0.64,
"grad_norm": 20.875,
"learning_rate": 1.816745655608215e-05,
"loss": 2.0874,
"step": 1612
},
{
"epoch": 0.64,
"grad_norm": 6.59375,
"learning_rate": 1.8147709320695104e-05,
"loss": 1.8927,
"step": 1613
},
{
"epoch": 0.64,
"grad_norm": 6.28125,
"learning_rate": 1.812796208530806e-05,
"loss": 1.7369,
"step": 1614
},
{
"epoch": 0.64,
"grad_norm": 7.03125,
"learning_rate": 1.8108214849921013e-05,
"loss": 2.1272,
"step": 1615
},
{
"epoch": 0.64,
"grad_norm": 5.625,
"learning_rate": 1.8088467614533965e-05,
"loss": 1.741,
"step": 1616
},
{
"epoch": 0.64,
"grad_norm": 5.6875,
"learning_rate": 1.806872037914692e-05,
"loss": 1.7574,
"step": 1617
},
{
"epoch": 0.64,
"grad_norm": 6.53125,
"learning_rate": 1.8048973143759877e-05,
"loss": 1.9099,
"step": 1618
},
{
"epoch": 0.64,
"grad_norm": 5.4375,
"learning_rate": 1.802922590837283e-05,
"loss": 1.89,
"step": 1619
},
{
"epoch": 0.64,
"grad_norm": 7.53125,
"learning_rate": 1.8009478672985784e-05,
"loss": 1.6475,
"step": 1620
},
{
"epoch": 0.64,
"grad_norm": 6.59375,
"learning_rate": 1.7989731437598735e-05,
"loss": 1.8913,
"step": 1621
},
{
"epoch": 0.64,
"grad_norm": 6.6875,
"learning_rate": 1.796998420221169e-05,
"loss": 2.1346,
"step": 1622
},
{
"epoch": 0.64,
"grad_norm": 5.59375,
"learning_rate": 1.7950236966824648e-05,
"loss": 2.187,
"step": 1623
},
{
"epoch": 0.64,
"grad_norm": 8.75,
"learning_rate": 1.79304897314376e-05,
"loss": 1.9771,
"step": 1624
},
{
"epoch": 0.64,
"grad_norm": 5.28125,
"learning_rate": 1.7910742496050554e-05,
"loss": 2.0439,
"step": 1625
},
{
"epoch": 0.64,
"grad_norm": 7.40625,
"learning_rate": 1.7890995260663505e-05,
"loss": 2.2925,
"step": 1626
},
{
"epoch": 0.64,
"grad_norm": 8.125,
"learning_rate": 1.7871248025276463e-05,
"loss": 1.8724,
"step": 1627
},
{
"epoch": 0.64,
"grad_norm": 6.09375,
"learning_rate": 1.7851500789889418e-05,
"loss": 1.8144,
"step": 1628
},
{
"epoch": 0.64,
"grad_norm": 7.625,
"learning_rate": 1.783175355450237e-05,
"loss": 1.94,
"step": 1629
},
{
"epoch": 0.64,
"grad_norm": 6.0625,
"learning_rate": 1.7812006319115324e-05,
"loss": 2.0621,
"step": 1630
},
{
"epoch": 0.64,
"grad_norm": 5.53125,
"learning_rate": 1.779225908372828e-05,
"loss": 2.1489,
"step": 1631
},
{
"epoch": 0.64,
"grad_norm": 5.4375,
"learning_rate": 1.7772511848341233e-05,
"loss": 2.159,
"step": 1632
},
{
"epoch": 0.64,
"grad_norm": 7.0625,
"learning_rate": 1.7752764612954188e-05,
"loss": 2.1219,
"step": 1633
},
{
"epoch": 0.65,
"grad_norm": 7.84375,
"learning_rate": 1.773301737756714e-05,
"loss": 2.0931,
"step": 1634
},
{
"epoch": 0.65,
"grad_norm": 4.53125,
"learning_rate": 1.7713270142180098e-05,
"loss": 2.2745,
"step": 1635
},
{
"epoch": 0.65,
"grad_norm": 5.9375,
"learning_rate": 1.769352290679305e-05,
"loss": 2.0663,
"step": 1636
},
{
"epoch": 0.65,
"grad_norm": 7.5,
"learning_rate": 1.7673775671406004e-05,
"loss": 1.7154,
"step": 1637
},
{
"epoch": 0.65,
"grad_norm": 8.5,
"learning_rate": 1.765402843601896e-05,
"loss": 1.8384,
"step": 1638
},
{
"epoch": 0.65,
"grad_norm": 5.15625,
"learning_rate": 1.7634281200631913e-05,
"loss": 1.7438,
"step": 1639
},
{
"epoch": 0.65,
"grad_norm": 5.65625,
"learning_rate": 1.7614533965244868e-05,
"loss": 2.3129,
"step": 1640
},
{
"epoch": 0.65,
"grad_norm": 5.78125,
"learning_rate": 1.759478672985782e-05,
"loss": 2.0107,
"step": 1641
},
{
"epoch": 0.65,
"grad_norm": 8.5625,
"learning_rate": 1.7575039494470774e-05,
"loss": 2.0142,
"step": 1642
},
{
"epoch": 0.65,
"grad_norm": 5.9375,
"learning_rate": 1.7555292259083732e-05,
"loss": 1.9512,
"step": 1643
},
{
"epoch": 0.65,
"grad_norm": 15.75,
"learning_rate": 1.7535545023696683e-05,
"loss": 2.5976,
"step": 1644
},
{
"epoch": 0.65,
"grad_norm": 5.28125,
"learning_rate": 1.7515797788309638e-05,
"loss": 1.9111,
"step": 1645
},
{
"epoch": 0.65,
"grad_norm": 7.75,
"learning_rate": 1.749605055292259e-05,
"loss": 1.9428,
"step": 1646
},
{
"epoch": 0.65,
"grad_norm": 7.59375,
"learning_rate": 1.7476303317535544e-05,
"loss": 1.9428,
"step": 1647
},
{
"epoch": 0.65,
"grad_norm": 6.5625,
"learning_rate": 1.7456556082148502e-05,
"loss": 2.1427,
"step": 1648
},
{
"epoch": 0.65,
"grad_norm": 7.875,
"learning_rate": 1.7436808846761454e-05,
"loss": 1.9315,
"step": 1649
},
{
"epoch": 0.65,
"grad_norm": 5.0,
"learning_rate": 1.7417061611374408e-05,
"loss": 2.0153,
"step": 1650
},
{
"epoch": 0.65,
"grad_norm": 6.0,
"learning_rate": 1.7397314375987363e-05,
"loss": 1.8733,
"step": 1651
},
{
"epoch": 0.65,
"grad_norm": 8.0625,
"learning_rate": 1.7377567140600318e-05,
"loss": 1.8876,
"step": 1652
},
{
"epoch": 0.65,
"grad_norm": 5.46875,
"learning_rate": 1.7357819905213272e-05,
"loss": 1.8168,
"step": 1653
},
{
"epoch": 0.65,
"grad_norm": 5.71875,
"learning_rate": 1.7338072669826224e-05,
"loss": 2.1031,
"step": 1654
},
{
"epoch": 0.65,
"grad_norm": 5.8125,
"learning_rate": 1.731832543443918e-05,
"loss": 1.7932,
"step": 1655
},
{
"epoch": 0.65,
"grad_norm": 6.71875,
"learning_rate": 1.7298578199052133e-05,
"loss": 2.3144,
"step": 1656
},
{
"epoch": 0.65,
"grad_norm": 6.9375,
"learning_rate": 1.7278830963665088e-05,
"loss": 2.1288,
"step": 1657
},
{
"epoch": 0.65,
"grad_norm": 6.375,
"learning_rate": 1.7259083728278043e-05,
"loss": 1.9595,
"step": 1658
},
{
"epoch": 0.66,
"grad_norm": 12.75,
"learning_rate": 1.7239336492890994e-05,
"loss": 2.1697,
"step": 1659
},
{
"epoch": 0.66,
"grad_norm": 6.4375,
"learning_rate": 1.7219589257503952e-05,
"loss": 1.7483,
"step": 1660
},
{
"epoch": 0.66,
"grad_norm": 5.28125,
"learning_rate": 1.7199842022116903e-05,
"loss": 1.8971,
"step": 1661
},
{
"epoch": 0.66,
"grad_norm": 6.375,
"learning_rate": 1.7180094786729858e-05,
"loss": 2.1833,
"step": 1662
},
{
"epoch": 0.66,
"grad_norm": 6.3125,
"learning_rate": 1.7160347551342813e-05,
"loss": 2.0499,
"step": 1663
},
{
"epoch": 0.66,
"grad_norm": 5.71875,
"learning_rate": 1.7140600315955768e-05,
"loss": 1.8579,
"step": 1664
},
{
"epoch": 0.66,
"grad_norm": 7.28125,
"learning_rate": 1.7120853080568722e-05,
"loss": 2.1343,
"step": 1665
},
{
"epoch": 0.66,
"grad_norm": 7.625,
"learning_rate": 1.7101105845181674e-05,
"loss": 1.9643,
"step": 1666
},
{
"epoch": 0.66,
"grad_norm": 7.03125,
"learning_rate": 1.708135860979463e-05,
"loss": 1.849,
"step": 1667
},
{
"epoch": 0.66,
"grad_norm": 8.25,
"learning_rate": 1.7061611374407587e-05,
"loss": 2.0693,
"step": 1668
},
{
"epoch": 0.66,
"grad_norm": 7.53125,
"learning_rate": 1.7041864139020538e-05,
"loss": 1.7442,
"step": 1669
},
{
"epoch": 0.66,
"grad_norm": 7.1875,
"learning_rate": 1.7022116903633493e-05,
"loss": 1.8766,
"step": 1670
},
{
"epoch": 0.66,
"grad_norm": 8.0625,
"learning_rate": 1.7002369668246444e-05,
"loss": 1.7387,
"step": 1671
},
{
"epoch": 0.66,
"grad_norm": 6.625,
"learning_rate": 1.69826224328594e-05,
"loss": 2.0458,
"step": 1672
},
{
"epoch": 0.66,
"grad_norm": 6.15625,
"learning_rate": 1.6962875197472357e-05,
"loss": 1.909,
"step": 1673
},
{
"epoch": 0.66,
"grad_norm": 6.03125,
"learning_rate": 1.6943127962085308e-05,
"loss": 2.047,
"step": 1674
},
{
"epoch": 0.66,
"grad_norm": 5.28125,
"learning_rate": 1.6923380726698263e-05,
"loss": 1.795,
"step": 1675
},
{
"epoch": 0.66,
"grad_norm": 7.9375,
"learning_rate": 1.6903633491311218e-05,
"loss": 2.1553,
"step": 1676
},
{
"epoch": 0.66,
"grad_norm": 5.53125,
"learning_rate": 1.6883886255924172e-05,
"loss": 2.0107,
"step": 1677
},
{
"epoch": 0.66,
"grad_norm": 6.625,
"learning_rate": 1.6864139020537127e-05,
"loss": 1.875,
"step": 1678
},
{
"epoch": 0.66,
"grad_norm": 7.0,
"learning_rate": 1.684439178515008e-05,
"loss": 1.9681,
"step": 1679
},
{
"epoch": 0.66,
"grad_norm": 5.3125,
"learning_rate": 1.6824644549763033e-05,
"loss": 1.9516,
"step": 1680
},
{
"epoch": 0.66,
"grad_norm": 8.5625,
"learning_rate": 1.6804897314375988e-05,
"loss": 2.2798,
"step": 1681
},
{
"epoch": 0.66,
"grad_norm": 5.21875,
"learning_rate": 1.6785150078988943e-05,
"loss": 1.9815,
"step": 1682
},
{
"epoch": 0.66,
"grad_norm": 10.75,
"learning_rate": 1.6765402843601897e-05,
"loss": 2.0302,
"step": 1683
},
{
"epoch": 0.67,
"grad_norm": 9.5625,
"learning_rate": 1.674565560821485e-05,
"loss": 2.0816,
"step": 1684
},
{
"epoch": 0.67,
"grad_norm": 7.34375,
"learning_rate": 1.6725908372827807e-05,
"loss": 1.9632,
"step": 1685
},
{
"epoch": 0.67,
"grad_norm": 6.46875,
"learning_rate": 1.6706161137440758e-05,
"loss": 2.1374,
"step": 1686
},
{
"epoch": 0.67,
"grad_norm": 5.90625,
"learning_rate": 1.6686413902053713e-05,
"loss": 2.1841,
"step": 1687
},
{
"epoch": 0.67,
"grad_norm": 6.1875,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.7648,
"step": 1688
},
{
"epoch": 0.67,
"grad_norm": 6.96875,
"learning_rate": 1.6646919431279622e-05,
"loss": 2.1754,
"step": 1689
},
{
"epoch": 0.67,
"grad_norm": 4.40625,
"learning_rate": 1.6627172195892577e-05,
"loss": 2.1205,
"step": 1690
},
{
"epoch": 0.67,
"grad_norm": 6.21875,
"learning_rate": 1.6607424960505528e-05,
"loss": 1.704,
"step": 1691
},
{
"epoch": 0.67,
"grad_norm": 7.3125,
"learning_rate": 1.6587677725118483e-05,
"loss": 2.141,
"step": 1692
},
{
"epoch": 0.67,
"grad_norm": 4.875,
"learning_rate": 1.656793048973144e-05,
"loss": 1.8771,
"step": 1693
},
{
"epoch": 0.67,
"grad_norm": 5.3125,
"learning_rate": 1.6548183254344392e-05,
"loss": 1.8377,
"step": 1694
},
{
"epoch": 0.67,
"grad_norm": 6.8125,
"learning_rate": 1.6528436018957347e-05,
"loss": 2.0314,
"step": 1695
},
{
"epoch": 0.67,
"grad_norm": 6.125,
"learning_rate": 1.6508688783570302e-05,
"loss": 2.1148,
"step": 1696
},
{
"epoch": 0.67,
"grad_norm": 6.4375,
"learning_rate": 1.6488941548183253e-05,
"loss": 1.8397,
"step": 1697
},
{
"epoch": 0.67,
"grad_norm": 5.875,
"learning_rate": 1.646919431279621e-05,
"loss": 1.8593,
"step": 1698
},
{
"epoch": 0.67,
"grad_norm": 7.65625,
"learning_rate": 1.6449447077409163e-05,
"loss": 1.896,
"step": 1699
},
{
"epoch": 0.67,
"grad_norm": 9.1875,
"learning_rate": 1.6429699842022117e-05,
"loss": 1.9238,
"step": 1700
},
{
"epoch": 0.67,
"grad_norm": 7.125,
"learning_rate": 1.6409952606635072e-05,
"loss": 2.3802,
"step": 1701
},
{
"epoch": 0.67,
"grad_norm": 5.75,
"learning_rate": 1.6390205371248027e-05,
"loss": 1.8684,
"step": 1702
},
{
"epoch": 0.67,
"grad_norm": 9.4375,
"learning_rate": 1.637045813586098e-05,
"loss": 2.0416,
"step": 1703
},
{
"epoch": 0.67,
"grad_norm": 7.0625,
"learning_rate": 1.6350710900473933e-05,
"loss": 1.7253,
"step": 1704
},
{
"epoch": 0.67,
"grad_norm": 7.84375,
"learning_rate": 1.6330963665086888e-05,
"loss": 2.0437,
"step": 1705
},
{
"epoch": 0.67,
"grad_norm": 7.8125,
"learning_rate": 1.6311216429699842e-05,
"loss": 1.6338,
"step": 1706
},
{
"epoch": 0.67,
"grad_norm": 10.5625,
"learning_rate": 1.6291469194312797e-05,
"loss": 2.0307,
"step": 1707
},
{
"epoch": 0.67,
"grad_norm": 7.25,
"learning_rate": 1.6271721958925752e-05,
"loss": 2.0623,
"step": 1708
},
{
"epoch": 0.67,
"grad_norm": 6.90625,
"learning_rate": 1.6251974723538703e-05,
"loss": 1.804,
"step": 1709
},
{
"epoch": 0.68,
"grad_norm": 8.3125,
"learning_rate": 1.623222748815166e-05,
"loss": 2.0095,
"step": 1710
},
{
"epoch": 0.68,
"grad_norm": 5.96875,
"learning_rate": 1.6212480252764613e-05,
"loss": 1.6792,
"step": 1711
},
{
"epoch": 0.68,
"grad_norm": 4.78125,
"learning_rate": 1.6192733017377567e-05,
"loss": 1.911,
"step": 1712
},
{
"epoch": 0.68,
"grad_norm": 6.1875,
"learning_rate": 1.6172985781990522e-05,
"loss": 2.2004,
"step": 1713
},
{
"epoch": 0.68,
"grad_norm": 6.25,
"learning_rate": 1.6153238546603477e-05,
"loss": 2.089,
"step": 1714
},
{
"epoch": 0.68,
"grad_norm": 6.96875,
"learning_rate": 1.613349131121643e-05,
"loss": 2.326,
"step": 1715
},
{
"epoch": 0.68,
"grad_norm": 4.5625,
"learning_rate": 1.6113744075829386e-05,
"loss": 1.9775,
"step": 1716
},
{
"epoch": 0.68,
"grad_norm": 11.5,
"learning_rate": 1.6093996840442338e-05,
"loss": 1.8057,
"step": 1717
},
{
"epoch": 0.68,
"grad_norm": 10.5,
"learning_rate": 1.6074249605055296e-05,
"loss": 1.8842,
"step": 1718
},
{
"epoch": 0.68,
"grad_norm": 5.3125,
"learning_rate": 1.6054502369668247e-05,
"loss": 2.1357,
"step": 1719
},
{
"epoch": 0.68,
"grad_norm": 5.4375,
"learning_rate": 1.60347551342812e-05,
"loss": 1.9051,
"step": 1720
},
{
"epoch": 0.68,
"grad_norm": 6.96875,
"learning_rate": 1.6015007898894156e-05,
"loss": 2.158,
"step": 1721
},
{
"epoch": 0.68,
"grad_norm": 8.5,
"learning_rate": 1.5995260663507108e-05,
"loss": 1.8643,
"step": 1722
},
{
"epoch": 0.68,
"grad_norm": 12.6875,
"learning_rate": 1.5975513428120066e-05,
"loss": 1.7875,
"step": 1723
},
{
"epoch": 0.68,
"grad_norm": 6.5,
"learning_rate": 1.5955766192733017e-05,
"loss": 2.1917,
"step": 1724
},
{
"epoch": 0.68,
"grad_norm": 5.625,
"learning_rate": 1.5936018957345972e-05,
"loss": 2.1706,
"step": 1725
},
{
"epoch": 0.68,
"grad_norm": 9.5625,
"learning_rate": 1.5916271721958927e-05,
"loss": 2.4081,
"step": 1726
},
{
"epoch": 0.68,
"grad_norm": 5.59375,
"learning_rate": 1.589652448657188e-05,
"loss": 2.1498,
"step": 1727
},
{
"epoch": 0.68,
"grad_norm": 5.96875,
"learning_rate": 1.5876777251184836e-05,
"loss": 2.0651,
"step": 1728
},
{
"epoch": 0.68,
"grad_norm": 6.0625,
"learning_rate": 1.5857030015797787e-05,
"loss": 1.6913,
"step": 1729
},
{
"epoch": 0.68,
"grad_norm": 5.21875,
"learning_rate": 1.5837282780410742e-05,
"loss": 1.9087,
"step": 1730
},
{
"epoch": 0.68,
"grad_norm": 11.375,
"learning_rate": 1.5817535545023697e-05,
"loss": 2.0926,
"step": 1731
},
{
"epoch": 0.68,
"grad_norm": 7.09375,
"learning_rate": 1.579778830963665e-05,
"loss": 2.1513,
"step": 1732
},
{
"epoch": 0.68,
"grad_norm": 9.0625,
"learning_rate": 1.5778041074249606e-05,
"loss": 2.1051,
"step": 1733
},
{
"epoch": 0.68,
"grad_norm": 6.25,
"learning_rate": 1.5758293838862558e-05,
"loss": 2.177,
"step": 1734
},
{
"epoch": 0.69,
"grad_norm": 6.21875,
"learning_rate": 1.5738546603475516e-05,
"loss": 2.1557,
"step": 1735
},
{
"epoch": 0.69,
"grad_norm": 5.4375,
"learning_rate": 1.5718799368088467e-05,
"loss": 1.8361,
"step": 1736
},
{
"epoch": 0.69,
"grad_norm": 6.21875,
"learning_rate": 1.5699052132701422e-05,
"loss": 2.1246,
"step": 1737
},
{
"epoch": 0.69,
"grad_norm": 6.34375,
"learning_rate": 1.5679304897314377e-05,
"loss": 1.9968,
"step": 1738
},
{
"epoch": 0.69,
"grad_norm": 9.3125,
"learning_rate": 1.565955766192733e-05,
"loss": 1.8559,
"step": 1739
},
{
"epoch": 0.69,
"grad_norm": 5.34375,
"learning_rate": 1.5639810426540286e-05,
"loss": 1.8423,
"step": 1740
},
{
"epoch": 0.69,
"grad_norm": 9.375,
"learning_rate": 1.562006319115324e-05,
"loss": 1.9812,
"step": 1741
},
{
"epoch": 0.69,
"grad_norm": 4.84375,
"learning_rate": 1.5600315955766192e-05,
"loss": 2.0073,
"step": 1742
},
{
"epoch": 0.69,
"grad_norm": 5.84375,
"learning_rate": 1.558056872037915e-05,
"loss": 1.8293,
"step": 1743
},
{
"epoch": 0.69,
"grad_norm": 5.65625,
"learning_rate": 1.55608214849921e-05,
"loss": 2.0012,
"step": 1744
},
{
"epoch": 0.69,
"grad_norm": 6.3125,
"learning_rate": 1.5541074249605056e-05,
"loss": 1.6819,
"step": 1745
},
{
"epoch": 0.69,
"grad_norm": 7.0,
"learning_rate": 1.552132701421801e-05,
"loss": 1.8722,
"step": 1746
},
{
"epoch": 0.69,
"grad_norm": 10.8125,
"learning_rate": 1.5501579778830966e-05,
"loss": 2.0567,
"step": 1747
},
{
"epoch": 0.69,
"grad_norm": 7.8125,
"learning_rate": 1.548183254344392e-05,
"loss": 2.3906,
"step": 1748
},
{
"epoch": 0.69,
"grad_norm": 9.9375,
"learning_rate": 1.5462085308056872e-05,
"loss": 1.9552,
"step": 1749
},
{
"epoch": 0.69,
"grad_norm": 4.84375,
"learning_rate": 1.5442338072669826e-05,
"loss": 1.489,
"step": 1750
},
{
"epoch": 0.69,
"grad_norm": 5.96875,
"learning_rate": 1.542259083728278e-05,
"loss": 1.8193,
"step": 1751
},
{
"epoch": 0.69,
"grad_norm": 9.4375,
"learning_rate": 1.5402843601895736e-05,
"loss": 1.7943,
"step": 1752
},
{
"epoch": 0.69,
"grad_norm": 12.6875,
"learning_rate": 1.538309636650869e-05,
"loss": 1.801,
"step": 1753
},
{
"epoch": 0.69,
"grad_norm": 6.46875,
"learning_rate": 1.5363349131121642e-05,
"loss": 2.1374,
"step": 1754
},
{
"epoch": 0.69,
"grad_norm": 5.03125,
"learning_rate": 1.5343601895734597e-05,
"loss": 2.1053,
"step": 1755
},
{
"epoch": 0.69,
"grad_norm": 5.75,
"learning_rate": 1.532385466034755e-05,
"loss": 1.8346,
"step": 1756
},
{
"epoch": 0.69,
"grad_norm": 4.5625,
"learning_rate": 1.5304107424960506e-05,
"loss": 2.083,
"step": 1757
},
{
"epoch": 0.69,
"grad_norm": 5.5625,
"learning_rate": 1.528436018957346e-05,
"loss": 2.1806,
"step": 1758
},
{
"epoch": 0.69,
"grad_norm": 7.40625,
"learning_rate": 1.5264612954186412e-05,
"loss": 1.9365,
"step": 1759
},
{
"epoch": 0.7,
"grad_norm": 6.3125,
"learning_rate": 1.524486571879937e-05,
"loss": 2.1798,
"step": 1760
},
{
"epoch": 0.7,
"grad_norm": 5.90625,
"learning_rate": 1.5225118483412323e-05,
"loss": 1.7343,
"step": 1761
},
{
"epoch": 0.7,
"grad_norm": 6.0,
"learning_rate": 1.5205371248025276e-05,
"loss": 2.2112,
"step": 1762
},
{
"epoch": 0.7,
"grad_norm": 7.59375,
"learning_rate": 1.5185624012638231e-05,
"loss": 2.3712,
"step": 1763
},
{
"epoch": 0.7,
"grad_norm": 5.9375,
"learning_rate": 1.5165876777251187e-05,
"loss": 2.2263,
"step": 1764
},
{
"epoch": 0.7,
"grad_norm": 6.53125,
"learning_rate": 1.514612954186414e-05,
"loss": 2.3804,
"step": 1765
},
{
"epoch": 0.7,
"grad_norm": 7.1875,
"learning_rate": 1.5126382306477094e-05,
"loss": 1.9832,
"step": 1766
},
{
"epoch": 0.7,
"grad_norm": 6.5,
"learning_rate": 1.5106635071090047e-05,
"loss": 1.9412,
"step": 1767
},
{
"epoch": 0.7,
"grad_norm": 6.53125,
"learning_rate": 1.5086887835703003e-05,
"loss": 2.1359,
"step": 1768
},
{
"epoch": 0.7,
"grad_norm": 9.0625,
"learning_rate": 1.5067140600315958e-05,
"loss": 1.8093,
"step": 1769
},
{
"epoch": 0.7,
"grad_norm": 6.4375,
"learning_rate": 1.504739336492891e-05,
"loss": 2.2793,
"step": 1770
},
{
"epoch": 0.7,
"grad_norm": 7.34375,
"learning_rate": 1.5027646129541864e-05,
"loss": 2.2631,
"step": 1771
},
{
"epoch": 0.7,
"grad_norm": 10.5625,
"learning_rate": 1.500789889415482e-05,
"loss": 1.7803,
"step": 1772
},
{
"epoch": 0.7,
"grad_norm": 10.5625,
"learning_rate": 1.4988151658767773e-05,
"loss": 1.8772,
"step": 1773
},
{
"epoch": 0.7,
"grad_norm": 8.9375,
"learning_rate": 1.4968404423380728e-05,
"loss": 1.9033,
"step": 1774
},
{
"epoch": 0.7,
"grad_norm": 6.375,
"learning_rate": 1.4948657187993681e-05,
"loss": 2.0273,
"step": 1775
},
{
"epoch": 0.7,
"grad_norm": 6.28125,
"learning_rate": 1.4928909952606634e-05,
"loss": 2.169,
"step": 1776
},
{
"epoch": 0.7,
"grad_norm": 7.21875,
"learning_rate": 1.490916271721959e-05,
"loss": 1.9322,
"step": 1777
},
{
"epoch": 0.7,
"grad_norm": 9.4375,
"learning_rate": 1.4889415481832545e-05,
"loss": 2.486,
"step": 1778
},
{
"epoch": 0.7,
"grad_norm": 10.125,
"learning_rate": 1.4869668246445498e-05,
"loss": 2.1295,
"step": 1779
},
{
"epoch": 0.7,
"grad_norm": 9.875,
"learning_rate": 1.4849921011058451e-05,
"loss": 1.9163,
"step": 1780
},
{
"epoch": 0.7,
"grad_norm": 7.5625,
"learning_rate": 1.4830173775671408e-05,
"loss": 1.8977,
"step": 1781
},
{
"epoch": 0.7,
"grad_norm": 16.875,
"learning_rate": 1.481042654028436e-05,
"loss": 1.9979,
"step": 1782
},
{
"epoch": 0.7,
"grad_norm": 11.6875,
"learning_rate": 1.4790679304897315e-05,
"loss": 2.1319,
"step": 1783
},
{
"epoch": 0.7,
"grad_norm": 7.625,
"learning_rate": 1.4770932069510268e-05,
"loss": 1.9456,
"step": 1784
},
{
"epoch": 0.7,
"grad_norm": 6.9375,
"learning_rate": 1.4751184834123225e-05,
"loss": 1.8715,
"step": 1785
},
{
"epoch": 0.71,
"grad_norm": 53.75,
"learning_rate": 1.4731437598736178e-05,
"loss": 1.9031,
"step": 1786
},
{
"epoch": 0.71,
"grad_norm": 8.875,
"learning_rate": 1.4711690363349131e-05,
"loss": 1.961,
"step": 1787
},
{
"epoch": 0.71,
"grad_norm": 8.75,
"learning_rate": 1.4691943127962086e-05,
"loss": 1.8373,
"step": 1788
},
{
"epoch": 0.71,
"grad_norm": 6.75,
"learning_rate": 1.4672195892575042e-05,
"loss": 1.7478,
"step": 1789
},
{
"epoch": 0.71,
"grad_norm": 8.125,
"learning_rate": 1.4652448657187995e-05,
"loss": 2.1335,
"step": 1790
},
{
"epoch": 0.71,
"grad_norm": 7.78125,
"learning_rate": 1.4632701421800948e-05,
"loss": 1.8655,
"step": 1791
},
{
"epoch": 0.71,
"grad_norm": 18.0,
"learning_rate": 1.4612954186413901e-05,
"loss": 2.0953,
"step": 1792
},
{
"epoch": 0.71,
"grad_norm": 12.875,
"learning_rate": 1.4593206951026858e-05,
"loss": 1.7111,
"step": 1793
},
{
"epoch": 0.71,
"grad_norm": 7.4375,
"learning_rate": 1.4573459715639812e-05,
"loss": 2.1003,
"step": 1794
},
{
"epoch": 0.71,
"grad_norm": 14.0625,
"learning_rate": 1.4553712480252765e-05,
"loss": 2.2544,
"step": 1795
},
{
"epoch": 0.71,
"grad_norm": 6.375,
"learning_rate": 1.4533965244865718e-05,
"loss": 2.1063,
"step": 1796
},
{
"epoch": 0.71,
"grad_norm": 7.03125,
"learning_rate": 1.4514218009478675e-05,
"loss": 2.3819,
"step": 1797
},
{
"epoch": 0.71,
"grad_norm": 9.6875,
"learning_rate": 1.4494470774091628e-05,
"loss": 1.9134,
"step": 1798
},
{
"epoch": 0.71,
"grad_norm": 5.4375,
"learning_rate": 1.4474723538704582e-05,
"loss": 2.1191,
"step": 1799
},
{
"epoch": 0.71,
"grad_norm": 5.625,
"learning_rate": 1.4454976303317535e-05,
"loss": 2.0396,
"step": 1800
},
{
"epoch": 0.71,
"grad_norm": 10.1875,
"learning_rate": 1.4435229067930489e-05,
"loss": 2.2287,
"step": 1801
},
{
"epoch": 0.71,
"grad_norm": 9.4375,
"learning_rate": 1.4415481832543445e-05,
"loss": 2.2715,
"step": 1802
},
{
"epoch": 0.71,
"grad_norm": 8.625,
"learning_rate": 1.43957345971564e-05,
"loss": 2.0739,
"step": 1803
},
{
"epoch": 0.71,
"grad_norm": 6.5,
"learning_rate": 1.4375987361769353e-05,
"loss": 1.6744,
"step": 1804
},
{
"epoch": 0.71,
"grad_norm": 7.25,
"learning_rate": 1.4356240126382306e-05,
"loss": 2.0855,
"step": 1805
},
{
"epoch": 0.71,
"grad_norm": 6.8125,
"learning_rate": 1.4336492890995262e-05,
"loss": 1.7502,
"step": 1806
},
{
"epoch": 0.71,
"grad_norm": 5.90625,
"learning_rate": 1.4316745655608215e-05,
"loss": 2.3185,
"step": 1807
},
{
"epoch": 0.71,
"grad_norm": 4.53125,
"learning_rate": 1.429699842022117e-05,
"loss": 2.1072,
"step": 1808
},
{
"epoch": 0.71,
"grad_norm": 6.5,
"learning_rate": 1.4277251184834123e-05,
"loss": 2.11,
"step": 1809
},
{
"epoch": 0.71,
"grad_norm": 4.84375,
"learning_rate": 1.425750394944708e-05,
"loss": 2.1957,
"step": 1810
},
{
"epoch": 0.72,
"grad_norm": 18.5,
"learning_rate": 1.4237756714060032e-05,
"loss": 1.8613,
"step": 1811
},
{
"epoch": 0.72,
"grad_norm": 15.5625,
"learning_rate": 1.4218009478672985e-05,
"loss": 1.9591,
"step": 1812
},
{
"epoch": 0.72,
"grad_norm": 6.59375,
"learning_rate": 1.419826224328594e-05,
"loss": 1.6567,
"step": 1813
},
{
"epoch": 0.72,
"grad_norm": 5.625,
"learning_rate": 1.4178515007898897e-05,
"loss": 1.7977,
"step": 1814
},
{
"epoch": 0.72,
"grad_norm": 7.0,
"learning_rate": 1.415876777251185e-05,
"loss": 2.1728,
"step": 1815
},
{
"epoch": 0.72,
"grad_norm": 9.0,
"learning_rate": 1.4139020537124803e-05,
"loss": 2.2893,
"step": 1816
},
{
"epoch": 0.72,
"grad_norm": 27.375,
"learning_rate": 1.4119273301737757e-05,
"loss": 1.5096,
"step": 1817
},
{
"epoch": 0.72,
"grad_norm": 7.28125,
"learning_rate": 1.4099526066350712e-05,
"loss": 2.0754,
"step": 1818
},
{
"epoch": 0.72,
"grad_norm": 7.5,
"learning_rate": 1.4079778830963667e-05,
"loss": 2.0997,
"step": 1819
},
{
"epoch": 0.72,
"grad_norm": 6.5,
"learning_rate": 1.406003159557662e-05,
"loss": 1.8124,
"step": 1820
},
{
"epoch": 0.72,
"grad_norm": 7.21875,
"learning_rate": 1.4040284360189573e-05,
"loss": 1.8613,
"step": 1821
},
{
"epoch": 0.72,
"grad_norm": 7.8125,
"learning_rate": 1.402053712480253e-05,
"loss": 1.9453,
"step": 1822
},
{
"epoch": 0.72,
"grad_norm": 6.0625,
"learning_rate": 1.4000789889415484e-05,
"loss": 2.0384,
"step": 1823
},
{
"epoch": 0.72,
"grad_norm": 5.34375,
"learning_rate": 1.3981042654028437e-05,
"loss": 1.923,
"step": 1824
},
{
"epoch": 0.72,
"grad_norm": 5.21875,
"learning_rate": 1.396129541864139e-05,
"loss": 1.886,
"step": 1825
},
{
"epoch": 0.72,
"grad_norm": 5.125,
"learning_rate": 1.3941548183254343e-05,
"loss": 1.8744,
"step": 1826
},
{
"epoch": 0.72,
"grad_norm": 9.875,
"learning_rate": 1.39218009478673e-05,
"loss": 1.5611,
"step": 1827
},
{
"epoch": 0.72,
"grad_norm": 5.46875,
"learning_rate": 1.3902053712480254e-05,
"loss": 1.7836,
"step": 1828
},
{
"epoch": 0.72,
"grad_norm": 5.9375,
"learning_rate": 1.3882306477093207e-05,
"loss": 2.0586,
"step": 1829
},
{
"epoch": 0.72,
"grad_norm": 9.0625,
"learning_rate": 1.386255924170616e-05,
"loss": 2.1186,
"step": 1830
},
{
"epoch": 0.72,
"grad_norm": 7.03125,
"learning_rate": 1.3842812006319117e-05,
"loss": 2.0262,
"step": 1831
},
{
"epoch": 0.72,
"grad_norm": 6.03125,
"learning_rate": 1.382306477093207e-05,
"loss": 2.0731,
"step": 1832
},
{
"epoch": 0.72,
"grad_norm": 5.03125,
"learning_rate": 1.3803317535545024e-05,
"loss": 2.0797,
"step": 1833
},
{
"epoch": 0.72,
"grad_norm": 5.59375,
"learning_rate": 1.3783570300157977e-05,
"loss": 2.1194,
"step": 1834
},
{
"epoch": 0.72,
"grad_norm": 8.375,
"learning_rate": 1.3763823064770934e-05,
"loss": 2.0719,
"step": 1835
},
{
"epoch": 0.73,
"grad_norm": 8.125,
"learning_rate": 1.3744075829383887e-05,
"loss": 1.9128,
"step": 1836
},
{
"epoch": 0.73,
"grad_norm": 5.125,
"learning_rate": 1.3724328593996842e-05,
"loss": 1.7425,
"step": 1837
},
{
"epoch": 0.73,
"grad_norm": 7.8125,
"learning_rate": 1.3704581358609795e-05,
"loss": 2.2982,
"step": 1838
},
{
"epoch": 0.73,
"grad_norm": 5.8125,
"learning_rate": 1.3684834123222751e-05,
"loss": 1.9633,
"step": 1839
},
{
"epoch": 0.73,
"grad_norm": 5.40625,
"learning_rate": 1.3665086887835704e-05,
"loss": 2.1067,
"step": 1840
},
{
"epoch": 0.73,
"grad_norm": 6.6875,
"learning_rate": 1.3645339652448657e-05,
"loss": 1.8924,
"step": 1841
},
{
"epoch": 0.73,
"grad_norm": 5.625,
"learning_rate": 1.3625592417061612e-05,
"loss": 2.142,
"step": 1842
},
{
"epoch": 0.73,
"grad_norm": 5.40625,
"learning_rate": 1.3605845181674568e-05,
"loss": 1.9444,
"step": 1843
},
{
"epoch": 0.73,
"grad_norm": 5.46875,
"learning_rate": 1.3586097946287521e-05,
"loss": 1.8868,
"step": 1844
},
{
"epoch": 0.73,
"grad_norm": 7.0625,
"learning_rate": 1.3566350710900474e-05,
"loss": 2.0836,
"step": 1845
},
{
"epoch": 0.73,
"grad_norm": 5.90625,
"learning_rate": 1.3546603475513427e-05,
"loss": 2.1672,
"step": 1846
},
{
"epoch": 0.73,
"grad_norm": 7.78125,
"learning_rate": 1.3526856240126384e-05,
"loss": 2.0987,
"step": 1847
},
{
"epoch": 0.73,
"grad_norm": 7.625,
"learning_rate": 1.3507109004739339e-05,
"loss": 2.0546,
"step": 1848
},
{
"epoch": 0.73,
"grad_norm": 8.5625,
"learning_rate": 1.3487361769352292e-05,
"loss": 2.0986,
"step": 1849
},
{
"epoch": 0.73,
"grad_norm": 6.875,
"learning_rate": 1.3467614533965245e-05,
"loss": 1.8614,
"step": 1850
},
{
"epoch": 0.73,
"grad_norm": 8.375,
"learning_rate": 1.3447867298578198e-05,
"loss": 2.0437,
"step": 1851
},
{
"epoch": 0.73,
"grad_norm": 7.40625,
"learning_rate": 1.3428120063191154e-05,
"loss": 1.7998,
"step": 1852
},
{
"epoch": 0.73,
"grad_norm": 10.5625,
"learning_rate": 1.3408372827804109e-05,
"loss": 1.9433,
"step": 1853
},
{
"epoch": 0.73,
"grad_norm": 10.1875,
"learning_rate": 1.3388625592417062e-05,
"loss": 2.2971,
"step": 1854
},
{
"epoch": 0.73,
"grad_norm": 12.5625,
"learning_rate": 1.3368878357030015e-05,
"loss": 2.0303,
"step": 1855
},
{
"epoch": 0.73,
"grad_norm": 6.375,
"learning_rate": 1.3349131121642971e-05,
"loss": 1.8925,
"step": 1856
},
{
"epoch": 0.73,
"grad_norm": 7.15625,
"learning_rate": 1.3329383886255924e-05,
"loss": 2.1903,
"step": 1857
},
{
"epoch": 0.73,
"grad_norm": 7.28125,
"learning_rate": 1.3309636650868879e-05,
"loss": 2.0483,
"step": 1858
},
{
"epoch": 0.73,
"grad_norm": 6.21875,
"learning_rate": 1.3289889415481832e-05,
"loss": 1.9829,
"step": 1859
},
{
"epoch": 0.73,
"grad_norm": 4.1875,
"learning_rate": 1.3270142180094788e-05,
"loss": 1.7854,
"step": 1860
},
{
"epoch": 0.73,
"grad_norm": 5.71875,
"learning_rate": 1.3250394944707741e-05,
"loss": 1.9232,
"step": 1861
},
{
"epoch": 0.74,
"grad_norm": 8.0625,
"learning_rate": 1.3230647709320696e-05,
"loss": 2.3038,
"step": 1862
},
{
"epoch": 0.74,
"grad_norm": 6.5625,
"learning_rate": 1.321090047393365e-05,
"loss": 1.9761,
"step": 1863
},
{
"epoch": 0.74,
"grad_norm": 12.9375,
"learning_rate": 1.3191153238546606e-05,
"loss": 1.8021,
"step": 1864
},
{
"epoch": 0.74,
"grad_norm": 7.625,
"learning_rate": 1.3171406003159559e-05,
"loss": 1.9288,
"step": 1865
},
{
"epoch": 0.74,
"grad_norm": 8.0,
"learning_rate": 1.3151658767772512e-05,
"loss": 2.1939,
"step": 1866
},
{
"epoch": 0.74,
"grad_norm": 7.5,
"learning_rate": 1.3131911532385466e-05,
"loss": 1.7427,
"step": 1867
},
{
"epoch": 0.74,
"grad_norm": 7.78125,
"learning_rate": 1.3112164296998423e-05,
"loss": 2.006,
"step": 1868
},
{
"epoch": 0.74,
"grad_norm": 10.75,
"learning_rate": 1.3092417061611376e-05,
"loss": 2.2151,
"step": 1869
},
{
"epoch": 0.74,
"grad_norm": 7.53125,
"learning_rate": 1.3072669826224329e-05,
"loss": 2.0153,
"step": 1870
},
{
"epoch": 0.74,
"grad_norm": 7.375,
"learning_rate": 1.3052922590837282e-05,
"loss": 2.1438,
"step": 1871
},
{
"epoch": 0.74,
"grad_norm": 6.5625,
"learning_rate": 1.3033175355450238e-05,
"loss": 1.7192,
"step": 1872
},
{
"epoch": 0.74,
"grad_norm": 5.71875,
"learning_rate": 1.3013428120063193e-05,
"loss": 1.9628,
"step": 1873
},
{
"epoch": 0.74,
"grad_norm": 8.875,
"learning_rate": 1.2993680884676146e-05,
"loss": 1.998,
"step": 1874
},
{
"epoch": 0.74,
"grad_norm": 4.65625,
"learning_rate": 1.2973933649289099e-05,
"loss": 1.8498,
"step": 1875
},
{
"epoch": 0.74,
"grad_norm": 9.625,
"learning_rate": 1.2954186413902054e-05,
"loss": 1.8135,
"step": 1876
},
{
"epoch": 0.74,
"grad_norm": 5.90625,
"learning_rate": 1.2934439178515009e-05,
"loss": 1.8159,
"step": 1877
},
{
"epoch": 0.74,
"grad_norm": 8.0,
"learning_rate": 1.2914691943127963e-05,
"loss": 2.2036,
"step": 1878
},
{
"epoch": 0.74,
"grad_norm": 9.8125,
"learning_rate": 1.2894944707740916e-05,
"loss": 2.1619,
"step": 1879
},
{
"epoch": 0.74,
"grad_norm": 5.96875,
"learning_rate": 1.287519747235387e-05,
"loss": 2.1687,
"step": 1880
},
{
"epoch": 0.74,
"grad_norm": 10.0,
"learning_rate": 1.2855450236966826e-05,
"loss": 2.2512,
"step": 1881
},
{
"epoch": 0.74,
"grad_norm": 7.5,
"learning_rate": 1.283570300157978e-05,
"loss": 2.0232,
"step": 1882
},
{
"epoch": 0.74,
"grad_norm": 11.625,
"learning_rate": 1.2815955766192733e-05,
"loss": 1.9691,
"step": 1883
},
{
"epoch": 0.74,
"grad_norm": 7.15625,
"learning_rate": 1.2796208530805687e-05,
"loss": 2.1155,
"step": 1884
},
{
"epoch": 0.74,
"grad_norm": 4.125,
"learning_rate": 1.2776461295418643e-05,
"loss": 1.793,
"step": 1885
},
{
"epoch": 0.74,
"grad_norm": 7.34375,
"learning_rate": 1.2756714060031596e-05,
"loss": 2.3832,
"step": 1886
},
{
"epoch": 0.75,
"grad_norm": 5.5,
"learning_rate": 1.273696682464455e-05,
"loss": 2.1625,
"step": 1887
},
{
"epoch": 0.75,
"grad_norm": 8.0,
"learning_rate": 1.2717219589257504e-05,
"loss": 2.0805,
"step": 1888
},
{
"epoch": 0.75,
"grad_norm": 7.0625,
"learning_rate": 1.269747235387046e-05,
"loss": 1.7718,
"step": 1889
},
{
"epoch": 0.75,
"grad_norm": 6.5,
"learning_rate": 1.2677725118483413e-05,
"loss": 1.7337,
"step": 1890
},
{
"epoch": 0.75,
"grad_norm": 10.375,
"learning_rate": 1.2657977883096366e-05,
"loss": 2.2849,
"step": 1891
},
{
"epoch": 0.75,
"grad_norm": 7.0625,
"learning_rate": 1.2638230647709321e-05,
"loss": 2.0064,
"step": 1892
},
{
"epoch": 0.75,
"grad_norm": 15.3125,
"learning_rate": 1.2618483412322277e-05,
"loss": 2.3614,
"step": 1893
},
{
"epoch": 0.75,
"grad_norm": 5.25,
"learning_rate": 1.259873617693523e-05,
"loss": 2.4093,
"step": 1894
},
{
"epoch": 0.75,
"grad_norm": 8.375,
"learning_rate": 1.2578988941548183e-05,
"loss": 2.1306,
"step": 1895
},
{
"epoch": 0.75,
"grad_norm": 5.46875,
"learning_rate": 1.2559241706161136e-05,
"loss": 2.0067,
"step": 1896
},
{
"epoch": 0.75,
"grad_norm": 12.0,
"learning_rate": 1.2539494470774093e-05,
"loss": 1.895,
"step": 1897
},
{
"epoch": 0.75,
"grad_norm": 6.53125,
"learning_rate": 1.2519747235387048e-05,
"loss": 1.9376,
"step": 1898
},
{
"epoch": 0.75,
"grad_norm": 6.625,
"learning_rate": 1.25e-05,
"loss": 1.9902,
"step": 1899
},
{
"epoch": 0.75,
"grad_norm": 8.125,
"learning_rate": 1.2480252764612954e-05,
"loss": 2.1392,
"step": 1900
},
{
"epoch": 0.75,
"grad_norm": 6.5,
"learning_rate": 1.2460505529225908e-05,
"loss": 2.302,
"step": 1901
},
{
"epoch": 0.75,
"grad_norm": 7.21875,
"learning_rate": 1.2440758293838863e-05,
"loss": 2.1621,
"step": 1902
},
{
"epoch": 0.75,
"grad_norm": 6.0625,
"learning_rate": 1.2421011058451818e-05,
"loss": 1.9279,
"step": 1903
},
{
"epoch": 0.75,
"grad_norm": 7.84375,
"learning_rate": 1.240126382306477e-05,
"loss": 2.2973,
"step": 1904
},
{
"epoch": 0.75,
"grad_norm": 5.65625,
"learning_rate": 1.2381516587677726e-05,
"loss": 1.903,
"step": 1905
},
{
"epoch": 0.75,
"grad_norm": 8.8125,
"learning_rate": 1.2361769352290679e-05,
"loss": 1.9986,
"step": 1906
},
{
"epoch": 0.75,
"grad_norm": 5.96875,
"learning_rate": 1.2342022116903635e-05,
"loss": 2.3003,
"step": 1907
},
{
"epoch": 0.75,
"grad_norm": 10.4375,
"learning_rate": 1.2322274881516588e-05,
"loss": 2.1117,
"step": 1908
},
{
"epoch": 0.75,
"grad_norm": 6.5625,
"learning_rate": 1.2302527646129543e-05,
"loss": 2.171,
"step": 1909
},
{
"epoch": 0.75,
"grad_norm": 6.4375,
"learning_rate": 1.2282780410742496e-05,
"loss": 2.0001,
"step": 1910
},
{
"epoch": 0.75,
"grad_norm": 9.0,
"learning_rate": 1.226303317535545e-05,
"loss": 1.8008,
"step": 1911
},
{
"epoch": 0.76,
"grad_norm": 7.5625,
"learning_rate": 1.2243285939968405e-05,
"loss": 2.1734,
"step": 1912
},
{
"epoch": 0.76,
"grad_norm": 9.4375,
"learning_rate": 1.222353870458136e-05,
"loss": 2.2471,
"step": 1913
},
{
"epoch": 0.76,
"grad_norm": 8.375,
"learning_rate": 1.2203791469194313e-05,
"loss": 1.7866,
"step": 1914
},
{
"epoch": 0.76,
"grad_norm": 7.5625,
"learning_rate": 1.2184044233807268e-05,
"loss": 2.1694,
"step": 1915
},
{
"epoch": 0.76,
"grad_norm": 10.25,
"learning_rate": 1.216429699842022e-05,
"loss": 1.7106,
"step": 1916
},
{
"epoch": 0.76,
"grad_norm": 13.4375,
"learning_rate": 1.2144549763033177e-05,
"loss": 2.0336,
"step": 1917
},
{
"epoch": 0.76,
"grad_norm": 11.0,
"learning_rate": 1.212480252764613e-05,
"loss": 2.0821,
"step": 1918
},
{
"epoch": 0.76,
"grad_norm": 12.375,
"learning_rate": 1.2105055292259085e-05,
"loss": 1.8813,
"step": 1919
},
{
"epoch": 0.76,
"grad_norm": 6.5625,
"learning_rate": 1.2085308056872038e-05,
"loss": 2.4087,
"step": 1920
},
{
"epoch": 0.76,
"grad_norm": 7.0,
"learning_rate": 1.2065560821484993e-05,
"loss": 2.2356,
"step": 1921
},
{
"epoch": 0.76,
"grad_norm": 5.5625,
"learning_rate": 1.2045813586097947e-05,
"loss": 2.0653,
"step": 1922
},
{
"epoch": 0.76,
"grad_norm": 11.25,
"learning_rate": 1.2026066350710902e-05,
"loss": 2.3318,
"step": 1923
},
{
"epoch": 0.76,
"grad_norm": 7.21875,
"learning_rate": 1.2006319115323855e-05,
"loss": 2.1132,
"step": 1924
},
{
"epoch": 0.76,
"grad_norm": 5.625,
"learning_rate": 1.1986571879936808e-05,
"loss": 2.0585,
"step": 1925
},
{
"epoch": 0.76,
"grad_norm": 7.0625,
"learning_rate": 1.1966824644549763e-05,
"loss": 1.9859,
"step": 1926
},
{
"epoch": 0.76,
"grad_norm": 8.3125,
"learning_rate": 1.1947077409162718e-05,
"loss": 2.3404,
"step": 1927
},
{
"epoch": 0.76,
"grad_norm": 4.8125,
"learning_rate": 1.1927330173775672e-05,
"loss": 1.6804,
"step": 1928
},
{
"epoch": 0.76,
"grad_norm": 5.0625,
"learning_rate": 1.1907582938388625e-05,
"loss": 2.0499,
"step": 1929
},
{
"epoch": 0.76,
"grad_norm": 11.5,
"learning_rate": 1.188783570300158e-05,
"loss": 2.3225,
"step": 1930
},
{
"epoch": 0.76,
"grad_norm": 7.90625,
"learning_rate": 1.1868088467614535e-05,
"loss": 2.1614,
"step": 1931
},
{
"epoch": 0.76,
"grad_norm": 7.59375,
"learning_rate": 1.184834123222749e-05,
"loss": 1.7469,
"step": 1932
},
{
"epoch": 0.76,
"grad_norm": 6.75,
"learning_rate": 1.1828593996840443e-05,
"loss": 1.8779,
"step": 1933
},
{
"epoch": 0.76,
"grad_norm": 6.25,
"learning_rate": 1.1808846761453397e-05,
"loss": 2.2457,
"step": 1934
},
{
"epoch": 0.76,
"grad_norm": 8.875,
"learning_rate": 1.178909952606635e-05,
"loss": 2.3114,
"step": 1935
},
{
"epoch": 0.76,
"grad_norm": 8.1875,
"learning_rate": 1.1769352290679305e-05,
"loss": 2.1986,
"step": 1936
},
{
"epoch": 0.77,
"grad_norm": 5.03125,
"learning_rate": 1.174960505529226e-05,
"loss": 1.9297,
"step": 1937
},
{
"epoch": 0.77,
"grad_norm": 7.6875,
"learning_rate": 1.1729857819905214e-05,
"loss": 2.1957,
"step": 1938
},
{
"epoch": 0.77,
"grad_norm": 6.0625,
"learning_rate": 1.1710110584518168e-05,
"loss": 1.569,
"step": 1939
},
{
"epoch": 0.77,
"grad_norm": 9.8125,
"learning_rate": 1.1690363349131122e-05,
"loss": 1.9632,
"step": 1940
},
{
"epoch": 0.77,
"grad_norm": 9.375,
"learning_rate": 1.1670616113744077e-05,
"loss": 2.0101,
"step": 1941
},
{
"epoch": 0.77,
"grad_norm": 12.1875,
"learning_rate": 1.1650868878357032e-05,
"loss": 2.1009,
"step": 1942
},
{
"epoch": 0.77,
"grad_norm": 7.75,
"learning_rate": 1.1631121642969985e-05,
"loss": 2.2322,
"step": 1943
},
{
"epoch": 0.77,
"grad_norm": 9.5,
"learning_rate": 1.161137440758294e-05,
"loss": 2.1393,
"step": 1944
},
{
"epoch": 0.77,
"grad_norm": 5.78125,
"learning_rate": 1.1591627172195892e-05,
"loss": 2.0069,
"step": 1945
},
{
"epoch": 0.77,
"grad_norm": 7.03125,
"learning_rate": 1.1571879936808847e-05,
"loss": 2.0065,
"step": 1946
},
{
"epoch": 0.77,
"grad_norm": 7.875,
"learning_rate": 1.1552132701421802e-05,
"loss": 2.0254,
"step": 1947
},
{
"epoch": 0.77,
"grad_norm": 7.8125,
"learning_rate": 1.1532385466034757e-05,
"loss": 1.847,
"step": 1948
},
{
"epoch": 0.77,
"grad_norm": 9.4375,
"learning_rate": 1.151263823064771e-05,
"loss": 2.128,
"step": 1949
},
{
"epoch": 0.77,
"grad_norm": 8.25,
"learning_rate": 1.1492890995260663e-05,
"loss": 1.7533,
"step": 1950
},
{
"epoch": 0.77,
"grad_norm": 5.9375,
"learning_rate": 1.1473143759873619e-05,
"loss": 1.9421,
"step": 1951
},
{
"epoch": 0.77,
"grad_norm": 6.21875,
"learning_rate": 1.1453396524486572e-05,
"loss": 1.7383,
"step": 1952
},
{
"epoch": 0.77,
"grad_norm": 10.6875,
"learning_rate": 1.1433649289099527e-05,
"loss": 1.818,
"step": 1953
},
{
"epoch": 0.77,
"grad_norm": 7.5625,
"learning_rate": 1.141390205371248e-05,
"loss": 2.2208,
"step": 1954
},
{
"epoch": 0.77,
"grad_norm": 9.625,
"learning_rate": 1.1394154818325435e-05,
"loss": 2.059,
"step": 1955
},
{
"epoch": 0.77,
"grad_norm": 8.8125,
"learning_rate": 1.137440758293839e-05,
"loss": 1.8589,
"step": 1956
},
{
"epoch": 0.77,
"grad_norm": 10.625,
"learning_rate": 1.1354660347551344e-05,
"loss": 2.2099,
"step": 1957
},
{
"epoch": 0.77,
"grad_norm": 9.375,
"learning_rate": 1.1334913112164297e-05,
"loss": 1.8583,
"step": 1958
},
{
"epoch": 0.77,
"grad_norm": 6.21875,
"learning_rate": 1.1315165876777252e-05,
"loss": 2.058,
"step": 1959
},
{
"epoch": 0.77,
"grad_norm": 6.71875,
"learning_rate": 1.1295418641390205e-05,
"loss": 1.896,
"step": 1960
},
{
"epoch": 0.77,
"grad_norm": 7.03125,
"learning_rate": 1.127567140600316e-05,
"loss": 1.9544,
"step": 1961
},
{
"epoch": 0.77,
"grad_norm": 7.59375,
"learning_rate": 1.1255924170616114e-05,
"loss": 2.3272,
"step": 1962
},
{
"epoch": 0.78,
"grad_norm": 5.90625,
"learning_rate": 1.1236176935229069e-05,
"loss": 2.438,
"step": 1963
},
{
"epoch": 0.78,
"grad_norm": 7.1875,
"learning_rate": 1.1216429699842022e-05,
"loss": 2.1181,
"step": 1964
},
{
"epoch": 0.78,
"grad_norm": 5.4375,
"learning_rate": 1.1196682464454977e-05,
"loss": 1.9564,
"step": 1965
},
{
"epoch": 0.78,
"grad_norm": 7.59375,
"learning_rate": 1.1176935229067931e-05,
"loss": 2.1204,
"step": 1966
},
{
"epoch": 0.78,
"grad_norm": 7.34375,
"learning_rate": 1.1157187993680886e-05,
"loss": 1.9671,
"step": 1967
},
{
"epoch": 0.78,
"grad_norm": 5.53125,
"learning_rate": 1.113744075829384e-05,
"loss": 2.0545,
"step": 1968
},
{
"epoch": 0.78,
"grad_norm": 6.8125,
"learning_rate": 1.1117693522906794e-05,
"loss": 1.8034,
"step": 1969
},
{
"epoch": 0.78,
"grad_norm": 10.1875,
"learning_rate": 1.1097946287519747e-05,
"loss": 2.1094,
"step": 1970
},
{
"epoch": 0.78,
"grad_norm": 10.0,
"learning_rate": 1.1078199052132702e-05,
"loss": 2.5193,
"step": 1971
},
{
"epoch": 0.78,
"grad_norm": 7.6875,
"learning_rate": 1.1058451816745656e-05,
"loss": 1.7605,
"step": 1972
},
{
"epoch": 0.78,
"grad_norm": 9.25,
"learning_rate": 1.1038704581358611e-05,
"loss": 2.1225,
"step": 1973
},
{
"epoch": 0.78,
"grad_norm": 16.375,
"learning_rate": 1.1018957345971564e-05,
"loss": 1.8774,
"step": 1974
},
{
"epoch": 0.78,
"grad_norm": 19.0,
"learning_rate": 1.0999210110584517e-05,
"loss": 1.8156,
"step": 1975
},
{
"epoch": 0.78,
"grad_norm": 6.9375,
"learning_rate": 1.0979462875197474e-05,
"loss": 2.0692,
"step": 1976
},
{
"epoch": 0.78,
"grad_norm": 8.875,
"learning_rate": 1.0959715639810427e-05,
"loss": 2.1784,
"step": 1977
},
{
"epoch": 0.78,
"grad_norm": 6.84375,
"learning_rate": 1.0939968404423381e-05,
"loss": 2.1852,
"step": 1978
},
{
"epoch": 0.78,
"grad_norm": 6.9375,
"learning_rate": 1.0920221169036334e-05,
"loss": 1.9077,
"step": 1979
},
{
"epoch": 0.78,
"grad_norm": 6.84375,
"learning_rate": 1.0900473933649289e-05,
"loss": 2.1726,
"step": 1980
},
{
"epoch": 0.78,
"grad_norm": 7.1875,
"learning_rate": 1.0880726698262244e-05,
"loss": 1.862,
"step": 1981
},
{
"epoch": 0.78,
"grad_norm": 8.3125,
"learning_rate": 1.0860979462875199e-05,
"loss": 2.0416,
"step": 1982
},
{
"epoch": 0.78,
"grad_norm": 7.21875,
"learning_rate": 1.0841232227488152e-05,
"loss": 2.1196,
"step": 1983
},
{
"epoch": 0.78,
"grad_norm": 8.3125,
"learning_rate": 1.0821484992101106e-05,
"loss": 2.3377,
"step": 1984
},
{
"epoch": 0.78,
"grad_norm": 5.6875,
"learning_rate": 1.080173775671406e-05,
"loss": 2.2647,
"step": 1985
},
{
"epoch": 0.78,
"grad_norm": 8.0625,
"learning_rate": 1.0781990521327016e-05,
"loss": 2.1212,
"step": 1986
},
{
"epoch": 0.78,
"grad_norm": 5.4375,
"learning_rate": 1.0762243285939969e-05,
"loss": 2.0415,
"step": 1987
},
{
"epoch": 0.79,
"grad_norm": 7.03125,
"learning_rate": 1.0742496050552924e-05,
"loss": 1.9346,
"step": 1988
},
{
"epoch": 0.79,
"grad_norm": 6.03125,
"learning_rate": 1.0722748815165877e-05,
"loss": 2.1867,
"step": 1989
},
{
"epoch": 0.79,
"grad_norm": 8.625,
"learning_rate": 1.0703001579778831e-05,
"loss": 2.3294,
"step": 1990
},
{
"epoch": 0.79,
"grad_norm": 12.5625,
"learning_rate": 1.0683254344391786e-05,
"loss": 1.706,
"step": 1991
},
{
"epoch": 0.79,
"grad_norm": 5.46875,
"learning_rate": 1.066350710900474e-05,
"loss": 2.0414,
"step": 1992
},
{
"epoch": 0.79,
"grad_norm": 6.65625,
"learning_rate": 1.0643759873617694e-05,
"loss": 2.24,
"step": 1993
},
{
"epoch": 0.79,
"grad_norm": 5.46875,
"learning_rate": 1.0624012638230648e-05,
"loss": 2.2053,
"step": 1994
},
{
"epoch": 0.79,
"grad_norm": 10.9375,
"learning_rate": 1.0604265402843602e-05,
"loss": 1.9783,
"step": 1995
},
{
"epoch": 0.79,
"grad_norm": 5.90625,
"learning_rate": 1.0584518167456558e-05,
"loss": 2.2241,
"step": 1996
},
{
"epoch": 0.79,
"grad_norm": 6.53125,
"learning_rate": 1.0564770932069511e-05,
"loss": 2.0235,
"step": 1997
},
{
"epoch": 0.79,
"grad_norm": 7.34375,
"learning_rate": 1.0545023696682466e-05,
"loss": 1.8712,
"step": 1998
},
{
"epoch": 0.79,
"grad_norm": 6.28125,
"learning_rate": 1.0525276461295419e-05,
"loss": 2.101,
"step": 1999
},
{
"epoch": 0.79,
"grad_norm": 7.84375,
"learning_rate": 1.0505529225908373e-05,
"loss": 2.1008,
"step": 2000
},
{
"epoch": 0.79,
"grad_norm": 13.4375,
"learning_rate": 1.0485781990521328e-05,
"loss": 2.1133,
"step": 2001
},
{
"epoch": 0.79,
"grad_norm": 8.875,
"learning_rate": 1.0466034755134281e-05,
"loss": 2.0236,
"step": 2002
},
{
"epoch": 0.79,
"grad_norm": 5.78125,
"learning_rate": 1.0446287519747236e-05,
"loss": 1.9779,
"step": 2003
},
{
"epoch": 0.79,
"grad_norm": 7.09375,
"learning_rate": 1.0426540284360189e-05,
"loss": 2.3413,
"step": 2004
},
{
"epoch": 0.79,
"grad_norm": 5.875,
"learning_rate": 1.0406793048973144e-05,
"loss": 1.6988,
"step": 2005
},
{
"epoch": 0.79,
"grad_norm": 9.4375,
"learning_rate": 1.0387045813586098e-05,
"loss": 1.4892,
"step": 2006
},
{
"epoch": 0.79,
"grad_norm": 7.78125,
"learning_rate": 1.0367298578199053e-05,
"loss": 1.8023,
"step": 2007
},
{
"epoch": 0.79,
"grad_norm": 6.84375,
"learning_rate": 1.0347551342812006e-05,
"loss": 2.0903,
"step": 2008
},
{
"epoch": 0.79,
"grad_norm": 12.0625,
"learning_rate": 1.0327804107424961e-05,
"loss": 2.3395,
"step": 2009
},
{
"epoch": 0.79,
"grad_norm": 6.65625,
"learning_rate": 1.0308056872037916e-05,
"loss": 1.7835,
"step": 2010
},
{
"epoch": 0.79,
"grad_norm": 7.71875,
"learning_rate": 1.028830963665087e-05,
"loss": 1.8328,
"step": 2011
},
{
"epoch": 0.79,
"grad_norm": 8.0,
"learning_rate": 1.0268562401263823e-05,
"loss": 1.5441,
"step": 2012
},
{
"epoch": 0.8,
"grad_norm": 6.15625,
"learning_rate": 1.0248815165876778e-05,
"loss": 1.9938,
"step": 2013
},
{
"epoch": 0.8,
"grad_norm": 8.4375,
"learning_rate": 1.0229067930489731e-05,
"loss": 1.8775,
"step": 2014
},
{
"epoch": 0.8,
"grad_norm": 5.78125,
"learning_rate": 1.0209320695102686e-05,
"loss": 1.8333,
"step": 2015
},
{
"epoch": 0.8,
"grad_norm": 8.375,
"learning_rate": 1.018957345971564e-05,
"loss": 2.2858,
"step": 2016
},
{
"epoch": 0.8,
"grad_norm": 7.0,
"learning_rate": 1.0169826224328595e-05,
"loss": 2.2677,
"step": 2017
},
{
"epoch": 0.8,
"grad_norm": 11.6875,
"learning_rate": 1.0150078988941548e-05,
"loss": 1.8377,
"step": 2018
},
{
"epoch": 0.8,
"grad_norm": 7.65625,
"learning_rate": 1.0130331753554503e-05,
"loss": 2.0848,
"step": 2019
},
{
"epoch": 0.8,
"grad_norm": 4.78125,
"learning_rate": 1.0110584518167456e-05,
"loss": 1.6013,
"step": 2020
},
{
"epoch": 0.8,
"grad_norm": 6.84375,
"learning_rate": 1.0090837282780412e-05,
"loss": 2.103,
"step": 2021
},
{
"epoch": 0.8,
"grad_norm": 6.21875,
"learning_rate": 1.0071090047393366e-05,
"loss": 1.7709,
"step": 2022
},
{
"epoch": 0.8,
"grad_norm": 6.09375,
"learning_rate": 1.005134281200632e-05,
"loss": 1.8256,
"step": 2023
},
{
"epoch": 0.8,
"grad_norm": 8.4375,
"learning_rate": 1.0031595576619273e-05,
"loss": 2.2294,
"step": 2024
},
{
"epoch": 0.8,
"grad_norm": 5.84375,
"learning_rate": 1.0011848341232228e-05,
"loss": 1.7831,
"step": 2025
},
{
"epoch": 0.8,
"grad_norm": 8.9375,
"learning_rate": 9.992101105845183e-06,
"loss": 1.9831,
"step": 2026
},
{
"epoch": 0.8,
"grad_norm": 8.25,
"learning_rate": 9.972353870458136e-06,
"loss": 1.934,
"step": 2027
},
{
"epoch": 0.8,
"grad_norm": 5.96875,
"learning_rate": 9.95260663507109e-06,
"loss": 1.5657,
"step": 2028
},
{
"epoch": 0.8,
"grad_norm": 6.375,
"learning_rate": 9.932859399684043e-06,
"loss": 1.8867,
"step": 2029
},
{
"epoch": 0.8,
"grad_norm": 8.6875,
"learning_rate": 9.913112164296998e-06,
"loss": 2.1059,
"step": 2030
},
{
"epoch": 0.8,
"grad_norm": 6.75,
"learning_rate": 9.893364928909953e-06,
"loss": 1.8007,
"step": 2031
},
{
"epoch": 0.8,
"grad_norm": 6.84375,
"learning_rate": 9.873617693522908e-06,
"loss": 1.6901,
"step": 2032
},
{
"epoch": 0.8,
"grad_norm": 9.3125,
"learning_rate": 9.85387045813586e-06,
"loss": 1.9501,
"step": 2033
},
{
"epoch": 0.8,
"grad_norm": 6.53125,
"learning_rate": 9.834123222748815e-06,
"loss": 1.9348,
"step": 2034
},
{
"epoch": 0.8,
"grad_norm": 6.0,
"learning_rate": 9.81437598736177e-06,
"loss": 1.891,
"step": 2035
},
{
"epoch": 0.8,
"grad_norm": 6.9375,
"learning_rate": 9.794628751974725e-06,
"loss": 1.9767,
"step": 2036
},
{
"epoch": 0.8,
"grad_norm": 7.375,
"learning_rate": 9.774881516587678e-06,
"loss": 2.1645,
"step": 2037
},
{
"epoch": 0.8,
"grad_norm": 8.5625,
"learning_rate": 9.755134281200633e-06,
"loss": 2.0751,
"step": 2038
},
{
"epoch": 0.81,
"grad_norm": 7.71875,
"learning_rate": 9.735387045813586e-06,
"loss": 2.3675,
"step": 2039
},
{
"epoch": 0.81,
"grad_norm": 6.65625,
"learning_rate": 9.71563981042654e-06,
"loss": 2.1672,
"step": 2040
},
{
"epoch": 0.81,
"grad_norm": 7.59375,
"learning_rate": 9.695892575039495e-06,
"loss": 2.1401,
"step": 2041
},
{
"epoch": 0.81,
"grad_norm": 9.25,
"learning_rate": 9.67614533965245e-06,
"loss": 2.3075,
"step": 2042
},
{
"epoch": 0.81,
"grad_norm": 4.9375,
"learning_rate": 9.656398104265403e-06,
"loss": 2.205,
"step": 2043
},
{
"epoch": 0.81,
"grad_norm": 11.1875,
"learning_rate": 9.636650868878358e-06,
"loss": 1.9111,
"step": 2044
},
{
"epoch": 0.81,
"grad_norm": 6.96875,
"learning_rate": 9.616903633491312e-06,
"loss": 1.4534,
"step": 2045
},
{
"epoch": 0.81,
"grad_norm": 9.1875,
"learning_rate": 9.597156398104267e-06,
"loss": 2.3191,
"step": 2046
},
{
"epoch": 0.81,
"grad_norm": 7.25,
"learning_rate": 9.57740916271722e-06,
"loss": 1.7988,
"step": 2047
},
{
"epoch": 0.81,
"grad_norm": 6.53125,
"learning_rate": 9.557661927330175e-06,
"loss": 2.0437,
"step": 2048
},
{
"epoch": 0.81,
"grad_norm": 6.78125,
"learning_rate": 9.537914691943128e-06,
"loss": 1.8445,
"step": 2049
},
{
"epoch": 0.81,
"grad_norm": 9.0,
"learning_rate": 9.518167456556083e-06,
"loss": 1.9873,
"step": 2050
},
{
"epoch": 0.81,
"grad_norm": 8.625,
"learning_rate": 9.498420221169037e-06,
"loss": 2.2198,
"step": 2051
},
{
"epoch": 0.81,
"grad_norm": 9.375,
"learning_rate": 9.47867298578199e-06,
"loss": 1.9627,
"step": 2052
},
{
"epoch": 0.81,
"grad_norm": 7.46875,
"learning_rate": 9.458925750394945e-06,
"loss": 2.1925,
"step": 2053
},
{
"epoch": 0.81,
"grad_norm": 5.34375,
"learning_rate": 9.439178515007898e-06,
"loss": 1.9734,
"step": 2054
},
{
"epoch": 0.81,
"grad_norm": 9.0625,
"learning_rate": 9.419431279620854e-06,
"loss": 2.058,
"step": 2055
},
{
"epoch": 0.81,
"grad_norm": 6.21875,
"learning_rate": 9.399684044233807e-06,
"loss": 1.7876,
"step": 2056
},
{
"epoch": 0.81,
"grad_norm": 6.625,
"learning_rate": 9.379936808846762e-06,
"loss": 1.966,
"step": 2057
},
{
"epoch": 0.81,
"grad_norm": 7.3125,
"learning_rate": 9.360189573459715e-06,
"loss": 1.846,
"step": 2058
},
{
"epoch": 0.81,
"grad_norm": 17.375,
"learning_rate": 9.34044233807267e-06,
"loss": 2.0262,
"step": 2059
},
{
"epoch": 0.81,
"grad_norm": 5.4375,
"learning_rate": 9.320695102685625e-06,
"loss": 2.054,
"step": 2060
},
{
"epoch": 0.81,
"grad_norm": 6.03125,
"learning_rate": 9.30094786729858e-06,
"loss": 2.0099,
"step": 2061
},
{
"epoch": 0.81,
"grad_norm": 8.0625,
"learning_rate": 9.281200631911532e-06,
"loss": 2.104,
"step": 2062
},
{
"epoch": 0.81,
"grad_norm": 9.0,
"learning_rate": 9.261453396524487e-06,
"loss": 2.1306,
"step": 2063
},
{
"epoch": 0.82,
"grad_norm": 6.96875,
"learning_rate": 9.24170616113744e-06,
"loss": 2.0223,
"step": 2064
},
{
"epoch": 0.82,
"grad_norm": 6.03125,
"learning_rate": 9.221958925750397e-06,
"loss": 2.0311,
"step": 2065
},
{
"epoch": 0.82,
"grad_norm": 7.21875,
"learning_rate": 9.20221169036335e-06,
"loss": 2.325,
"step": 2066
},
{
"epoch": 0.82,
"grad_norm": 8.3125,
"learning_rate": 9.182464454976304e-06,
"loss": 1.9492,
"step": 2067
},
{
"epoch": 0.82,
"grad_norm": 6.15625,
"learning_rate": 9.162717219589257e-06,
"loss": 1.8783,
"step": 2068
},
{
"epoch": 0.82,
"grad_norm": 8.0,
"learning_rate": 9.142969984202212e-06,
"loss": 2.4551,
"step": 2069
},
{
"epoch": 0.82,
"grad_norm": 4.71875,
"learning_rate": 9.123222748815167e-06,
"loss": 2.061,
"step": 2070
},
{
"epoch": 0.82,
"grad_norm": 7.25,
"learning_rate": 9.103475513428122e-06,
"loss": 2.029,
"step": 2071
},
{
"epoch": 0.82,
"grad_norm": 8.1875,
"learning_rate": 9.083728278041075e-06,
"loss": 1.8115,
"step": 2072
},
{
"epoch": 0.82,
"grad_norm": 9.25,
"learning_rate": 9.06398104265403e-06,
"loss": 1.9698,
"step": 2073
},
{
"epoch": 0.82,
"grad_norm": 7.59375,
"learning_rate": 9.044233807266982e-06,
"loss": 2.275,
"step": 2074
},
{
"epoch": 0.82,
"grad_norm": 6.09375,
"learning_rate": 9.024486571879939e-06,
"loss": 1.9314,
"step": 2075
},
{
"epoch": 0.82,
"grad_norm": 6.34375,
"learning_rate": 9.004739336492892e-06,
"loss": 2.093,
"step": 2076
},
{
"epoch": 0.82,
"grad_norm": 7.3125,
"learning_rate": 8.984992101105845e-06,
"loss": 1.6067,
"step": 2077
},
{
"epoch": 0.82,
"grad_norm": 7.71875,
"learning_rate": 8.9652448657188e-06,
"loss": 1.6374,
"step": 2078
},
{
"epoch": 0.82,
"grad_norm": 10.9375,
"learning_rate": 8.945497630331753e-06,
"loss": 2.2058,
"step": 2079
},
{
"epoch": 0.82,
"grad_norm": 7.40625,
"learning_rate": 8.925750394944709e-06,
"loss": 2.3108,
"step": 2080
},
{
"epoch": 0.82,
"grad_norm": 10.9375,
"learning_rate": 8.906003159557662e-06,
"loss": 1.7949,
"step": 2081
},
{
"epoch": 0.82,
"grad_norm": 11.9375,
"learning_rate": 8.886255924170617e-06,
"loss": 1.7964,
"step": 2082
},
{
"epoch": 0.82,
"grad_norm": 7.5625,
"learning_rate": 8.86650868878357e-06,
"loss": 1.9182,
"step": 2083
},
{
"epoch": 0.82,
"grad_norm": 7.0625,
"learning_rate": 8.846761453396524e-06,
"loss": 1.7283,
"step": 2084
},
{
"epoch": 0.82,
"grad_norm": 10.3125,
"learning_rate": 8.82701421800948e-06,
"loss": 2.3193,
"step": 2085
},
{
"epoch": 0.82,
"grad_norm": 5.53125,
"learning_rate": 8.807266982622434e-06,
"loss": 1.6221,
"step": 2086
},
{
"epoch": 0.82,
"grad_norm": 5.96875,
"learning_rate": 8.787519747235387e-06,
"loss": 2.0812,
"step": 2087
},
{
"epoch": 0.82,
"grad_norm": 5.90625,
"learning_rate": 8.767772511848342e-06,
"loss": 2.05,
"step": 2088
},
{
"epoch": 0.83,
"grad_norm": 7.28125,
"learning_rate": 8.748025276461295e-06,
"loss": 1.5181,
"step": 2089
},
{
"epoch": 0.83,
"grad_norm": 8.375,
"learning_rate": 8.728278041074251e-06,
"loss": 1.796,
"step": 2090
},
{
"epoch": 0.83,
"grad_norm": 5.9375,
"learning_rate": 8.708530805687204e-06,
"loss": 1.7643,
"step": 2091
},
{
"epoch": 0.83,
"grad_norm": 4.0625,
"learning_rate": 8.688783570300159e-06,
"loss": 1.8834,
"step": 2092
},
{
"epoch": 0.83,
"grad_norm": 8.1875,
"learning_rate": 8.669036334913112e-06,
"loss": 2.3145,
"step": 2093
},
{
"epoch": 0.83,
"grad_norm": 8.0625,
"learning_rate": 8.649289099526067e-06,
"loss": 2.1063,
"step": 2094
},
{
"epoch": 0.83,
"grad_norm": 6.4375,
"learning_rate": 8.629541864139021e-06,
"loss": 1.9504,
"step": 2095
},
{
"epoch": 0.83,
"grad_norm": 5.75,
"learning_rate": 8.609794628751976e-06,
"loss": 2.1573,
"step": 2096
},
{
"epoch": 0.83,
"grad_norm": 6.3125,
"learning_rate": 8.590047393364929e-06,
"loss": 1.5747,
"step": 2097
},
{
"epoch": 0.83,
"grad_norm": 5.28125,
"learning_rate": 8.570300157977884e-06,
"loss": 1.9786,
"step": 2098
},
{
"epoch": 0.83,
"grad_norm": 9.6875,
"learning_rate": 8.550552922590837e-06,
"loss": 2.127,
"step": 2099
},
{
"epoch": 0.83,
"grad_norm": 6.53125,
"learning_rate": 8.530805687203793e-06,
"loss": 1.5605,
"step": 2100
},
{
"epoch": 0.83,
"grad_norm": 7.15625,
"learning_rate": 8.511058451816746e-06,
"loss": 2.2978,
"step": 2101
},
{
"epoch": 0.83,
"grad_norm": 8.0625,
"learning_rate": 8.4913112164297e-06,
"loss": 2.2952,
"step": 2102
},
{
"epoch": 0.83,
"grad_norm": 5.15625,
"learning_rate": 8.471563981042654e-06,
"loss": 2.0014,
"step": 2103
},
{
"epoch": 0.83,
"grad_norm": 5.46875,
"learning_rate": 8.451816745655609e-06,
"loss": 1.9189,
"step": 2104
},
{
"epoch": 0.83,
"grad_norm": 6.46875,
"learning_rate": 8.432069510268564e-06,
"loss": 2.1088,
"step": 2105
},
{
"epoch": 0.83,
"grad_norm": 9.5,
"learning_rate": 8.412322274881517e-06,
"loss": 2.2276,
"step": 2106
},
{
"epoch": 0.83,
"grad_norm": 8.875,
"learning_rate": 8.392575039494471e-06,
"loss": 2.1024,
"step": 2107
},
{
"epoch": 0.83,
"grad_norm": 6.5,
"learning_rate": 8.372827804107424e-06,
"loss": 1.5879,
"step": 2108
},
{
"epoch": 0.83,
"grad_norm": 8.8125,
"learning_rate": 8.353080568720379e-06,
"loss": 2.2812,
"step": 2109
},
{
"epoch": 0.83,
"grad_norm": 6.3125,
"learning_rate": 8.333333333333334e-06,
"loss": 2.2193,
"step": 2110
},
{
"epoch": 0.83,
"grad_norm": 7.46875,
"learning_rate": 8.313586097946288e-06,
"loss": 2.2847,
"step": 2111
},
{
"epoch": 0.83,
"grad_norm": 7.1875,
"learning_rate": 8.293838862559241e-06,
"loss": 1.9285,
"step": 2112
},
{
"epoch": 0.83,
"grad_norm": 8.625,
"learning_rate": 8.274091627172196e-06,
"loss": 2.0989,
"step": 2113
},
{
"epoch": 0.83,
"grad_norm": 7.09375,
"learning_rate": 8.254344391785151e-06,
"loss": 2.308,
"step": 2114
},
{
"epoch": 0.84,
"grad_norm": 8.0,
"learning_rate": 8.234597156398106e-06,
"loss": 2.4248,
"step": 2115
},
{
"epoch": 0.84,
"grad_norm": 5.09375,
"learning_rate": 8.214849921011059e-06,
"loss": 2.0533,
"step": 2116
},
{
"epoch": 0.84,
"grad_norm": 4.96875,
"learning_rate": 8.195102685624013e-06,
"loss": 1.8354,
"step": 2117
},
{
"epoch": 0.84,
"grad_norm": 8.4375,
"learning_rate": 8.175355450236966e-06,
"loss": 1.4298,
"step": 2118
},
{
"epoch": 0.84,
"grad_norm": 9.375,
"learning_rate": 8.155608214849921e-06,
"loss": 1.7867,
"step": 2119
},
{
"epoch": 0.84,
"grad_norm": 5.9375,
"learning_rate": 8.135860979462876e-06,
"loss": 1.612,
"step": 2120
},
{
"epoch": 0.84,
"grad_norm": 7.34375,
"learning_rate": 8.11611374407583e-06,
"loss": 2.1354,
"step": 2121
},
{
"epoch": 0.84,
"grad_norm": 8.0625,
"learning_rate": 8.096366508688784e-06,
"loss": 2.1376,
"step": 2122
},
{
"epoch": 0.84,
"grad_norm": 5.71875,
"learning_rate": 8.076619273301738e-06,
"loss": 2.0522,
"step": 2123
},
{
"epoch": 0.84,
"grad_norm": 6.0625,
"learning_rate": 8.056872037914693e-06,
"loss": 1.7686,
"step": 2124
},
{
"epoch": 0.84,
"grad_norm": 5.3125,
"learning_rate": 8.037124802527648e-06,
"loss": 1.6335,
"step": 2125
},
{
"epoch": 0.84,
"grad_norm": 6.03125,
"learning_rate": 8.0173775671406e-06,
"loss": 1.9068,
"step": 2126
},
{
"epoch": 0.84,
"grad_norm": 8.8125,
"learning_rate": 7.997630331753554e-06,
"loss": 1.9618,
"step": 2127
},
{
"epoch": 0.84,
"grad_norm": 7.34375,
"learning_rate": 7.977883096366509e-06,
"loss": 1.596,
"step": 2128
},
{
"epoch": 0.84,
"grad_norm": 6.21875,
"learning_rate": 7.958135860979463e-06,
"loss": 1.832,
"step": 2129
},
{
"epoch": 0.84,
"grad_norm": 7.78125,
"learning_rate": 7.938388625592418e-06,
"loss": 1.9948,
"step": 2130
},
{
"epoch": 0.84,
"grad_norm": 7.625,
"learning_rate": 7.918641390205371e-06,
"loss": 1.9932,
"step": 2131
},
{
"epoch": 0.84,
"grad_norm": 17.875,
"learning_rate": 7.898894154818326e-06,
"loss": 1.9662,
"step": 2132
},
{
"epoch": 0.84,
"grad_norm": 8.75,
"learning_rate": 7.879146919431279e-06,
"loss": 1.7035,
"step": 2133
},
{
"epoch": 0.84,
"grad_norm": 8.4375,
"learning_rate": 7.859399684044234e-06,
"loss": 2.0167,
"step": 2134
},
{
"epoch": 0.84,
"grad_norm": 10.0,
"learning_rate": 7.839652448657188e-06,
"loss": 2.1269,
"step": 2135
},
{
"epoch": 0.84,
"grad_norm": 9.8125,
"learning_rate": 7.819905213270143e-06,
"loss": 1.8755,
"step": 2136
},
{
"epoch": 0.84,
"grad_norm": 8.3125,
"learning_rate": 7.800157977883096e-06,
"loss": 1.9478,
"step": 2137
},
{
"epoch": 0.84,
"grad_norm": 12.375,
"learning_rate": 7.78041074249605e-06,
"loss": 2.0363,
"step": 2138
},
{
"epoch": 0.84,
"grad_norm": 7.8125,
"learning_rate": 7.760663507109005e-06,
"loss": 1.9998,
"step": 2139
},
{
"epoch": 0.85,
"grad_norm": 5.0,
"learning_rate": 7.74091627172196e-06,
"loss": 2.0392,
"step": 2140
},
{
"epoch": 0.85,
"grad_norm": 10.0625,
"learning_rate": 7.721169036334913e-06,
"loss": 1.9286,
"step": 2141
},
{
"epoch": 0.85,
"grad_norm": 17.5,
"learning_rate": 7.701421800947868e-06,
"loss": 2.3163,
"step": 2142
},
{
"epoch": 0.85,
"grad_norm": 7.0,
"learning_rate": 7.681674565560821e-06,
"loss": 1.8881,
"step": 2143
},
{
"epoch": 0.85,
"grad_norm": 8.125,
"learning_rate": 7.661927330173776e-06,
"loss": 2.3167,
"step": 2144
},
{
"epoch": 0.85,
"grad_norm": 7.65625,
"learning_rate": 7.64218009478673e-06,
"loss": 1.7757,
"step": 2145
},
{
"epoch": 0.85,
"grad_norm": 7.34375,
"learning_rate": 7.622432859399685e-06,
"loss": 2.645,
"step": 2146
},
{
"epoch": 0.85,
"grad_norm": 56.25,
"learning_rate": 7.602685624012638e-06,
"loss": 2.1108,
"step": 2147
},
{
"epoch": 0.85,
"grad_norm": 6.40625,
"learning_rate": 7.582938388625594e-06,
"loss": 1.8257,
"step": 2148
},
{
"epoch": 0.85,
"grad_norm": 4.1875,
"learning_rate": 7.563191153238547e-06,
"loss": 2.0506,
"step": 2149
},
{
"epoch": 0.85,
"grad_norm": 5.0625,
"learning_rate": 7.5434439178515015e-06,
"loss": 1.9331,
"step": 2150
},
{
"epoch": 0.85,
"grad_norm": 9.8125,
"learning_rate": 7.523696682464455e-06,
"loss": 2.1486,
"step": 2151
},
{
"epoch": 0.85,
"grad_norm": 10.0625,
"learning_rate": 7.50394944707741e-06,
"loss": 1.7869,
"step": 2152
},
{
"epoch": 0.85,
"grad_norm": 5.5,
"learning_rate": 7.484202211690364e-06,
"loss": 2.0863,
"step": 2153
},
{
"epoch": 0.85,
"grad_norm": 7.21875,
"learning_rate": 7.464454976303317e-06,
"loss": 2.6342,
"step": 2154
},
{
"epoch": 0.85,
"grad_norm": 6.15625,
"learning_rate": 7.4447077409162726e-06,
"loss": 1.8327,
"step": 2155
},
{
"epoch": 0.85,
"grad_norm": 8.0625,
"learning_rate": 7.424960505529226e-06,
"loss": 2.0017,
"step": 2156
},
{
"epoch": 0.85,
"grad_norm": 6.0,
"learning_rate": 7.40521327014218e-06,
"loss": 1.9452,
"step": 2157
},
{
"epoch": 0.85,
"grad_norm": 5.34375,
"learning_rate": 7.385466034755134e-06,
"loss": 1.8217,
"step": 2158
},
{
"epoch": 0.85,
"grad_norm": 6.28125,
"learning_rate": 7.365718799368089e-06,
"loss": 1.9549,
"step": 2159
},
{
"epoch": 0.85,
"grad_norm": 11.9375,
"learning_rate": 7.345971563981043e-06,
"loss": 2.0542,
"step": 2160
},
{
"epoch": 0.85,
"grad_norm": 5.65625,
"learning_rate": 7.3262243285939975e-06,
"loss": 1.8715,
"step": 2161
},
{
"epoch": 0.85,
"grad_norm": 8.5625,
"learning_rate": 7.3064770932069506e-06,
"loss": 1.5777,
"step": 2162
},
{
"epoch": 0.85,
"grad_norm": 8.6875,
"learning_rate": 7.286729857819906e-06,
"loss": 2.1786,
"step": 2163
},
{
"epoch": 0.85,
"grad_norm": 6.03125,
"learning_rate": 7.266982622432859e-06,
"loss": 2.0227,
"step": 2164
},
{
"epoch": 0.86,
"grad_norm": 7.46875,
"learning_rate": 7.247235387045814e-06,
"loss": 2.3692,
"step": 2165
},
{
"epoch": 0.86,
"grad_norm": 10.5625,
"learning_rate": 7.227488151658768e-06,
"loss": 1.7344,
"step": 2166
},
{
"epoch": 0.86,
"grad_norm": 7.9375,
"learning_rate": 7.2077409162717225e-06,
"loss": 1.8434,
"step": 2167
},
{
"epoch": 0.86,
"grad_norm": 10.1875,
"learning_rate": 7.187993680884676e-06,
"loss": 2.3043,
"step": 2168
},
{
"epoch": 0.86,
"grad_norm": 6.5,
"learning_rate": 7.168246445497631e-06,
"loss": 2.135,
"step": 2169
},
{
"epoch": 0.86,
"grad_norm": 6.09375,
"learning_rate": 7.148499210110585e-06,
"loss": 1.7323,
"step": 2170
},
{
"epoch": 0.86,
"grad_norm": 5.5625,
"learning_rate": 7.12875197472354e-06,
"loss": 2.1462,
"step": 2171
},
{
"epoch": 0.86,
"grad_norm": 5.09375,
"learning_rate": 7.109004739336493e-06,
"loss": 2.0594,
"step": 2172
},
{
"epoch": 0.86,
"grad_norm": 5.625,
"learning_rate": 7.089257503949448e-06,
"loss": 1.8832,
"step": 2173
},
{
"epoch": 0.86,
"grad_norm": 4.84375,
"learning_rate": 7.069510268562401e-06,
"loss": 1.9718,
"step": 2174
},
{
"epoch": 0.86,
"grad_norm": 20.375,
"learning_rate": 7.049763033175356e-06,
"loss": 2.1593,
"step": 2175
},
{
"epoch": 0.86,
"grad_norm": 6.25,
"learning_rate": 7.03001579778831e-06,
"loss": 2.0384,
"step": 2176
},
{
"epoch": 0.86,
"grad_norm": 7.625,
"learning_rate": 7.010268562401265e-06,
"loss": 1.7588,
"step": 2177
},
{
"epoch": 0.86,
"grad_norm": 5.78125,
"learning_rate": 6.9905213270142185e-06,
"loss": 2.1634,
"step": 2178
},
{
"epoch": 0.86,
"grad_norm": 5.40625,
"learning_rate": 6.9707740916271715e-06,
"loss": 2.0796,
"step": 2179
},
{
"epoch": 0.86,
"grad_norm": 6.875,
"learning_rate": 6.951026856240127e-06,
"loss": 1.8347,
"step": 2180
},
{
"epoch": 0.86,
"grad_norm": 6.59375,
"learning_rate": 6.93127962085308e-06,
"loss": 1.7672,
"step": 2181
},
{
"epoch": 0.86,
"grad_norm": 6.3125,
"learning_rate": 6.911532385466035e-06,
"loss": 2.0492,
"step": 2182
},
{
"epoch": 0.86,
"grad_norm": 9.375,
"learning_rate": 6.891785150078989e-06,
"loss": 1.8628,
"step": 2183
},
{
"epoch": 0.86,
"grad_norm": 6.75,
"learning_rate": 6.8720379146919435e-06,
"loss": 1.8477,
"step": 2184
},
{
"epoch": 0.86,
"grad_norm": 5.625,
"learning_rate": 6.852290679304897e-06,
"loss": 1.8249,
"step": 2185
},
{
"epoch": 0.86,
"grad_norm": 6.96875,
"learning_rate": 6.832543443917852e-06,
"loss": 1.878,
"step": 2186
},
{
"epoch": 0.86,
"grad_norm": 7.5625,
"learning_rate": 6.812796208530806e-06,
"loss": 1.9796,
"step": 2187
},
{
"epoch": 0.86,
"grad_norm": 7.21875,
"learning_rate": 6.793048973143761e-06,
"loss": 2.2177,
"step": 2188
},
{
"epoch": 0.86,
"grad_norm": 9.1875,
"learning_rate": 6.773301737756714e-06,
"loss": 1.819,
"step": 2189
},
{
"epoch": 0.86,
"grad_norm": 6.6875,
"learning_rate": 6.753554502369669e-06,
"loss": 1.8783,
"step": 2190
},
{
"epoch": 0.87,
"grad_norm": 6.1875,
"learning_rate": 6.733807266982622e-06,
"loss": 1.8832,
"step": 2191
},
{
"epoch": 0.87,
"grad_norm": 9.3125,
"learning_rate": 6.714060031595577e-06,
"loss": 2.1891,
"step": 2192
},
{
"epoch": 0.87,
"grad_norm": 5.78125,
"learning_rate": 6.694312796208531e-06,
"loss": 1.7291,
"step": 2193
},
{
"epoch": 0.87,
"grad_norm": 10.875,
"learning_rate": 6.674565560821486e-06,
"loss": 1.9273,
"step": 2194
},
{
"epoch": 0.87,
"grad_norm": 9.25,
"learning_rate": 6.6548183254344395e-06,
"loss": 1.842,
"step": 2195
},
{
"epoch": 0.87,
"grad_norm": 7.40625,
"learning_rate": 6.635071090047394e-06,
"loss": 1.6964,
"step": 2196
},
{
"epoch": 0.87,
"grad_norm": 11.75,
"learning_rate": 6.615323854660348e-06,
"loss": 2.0595,
"step": 2197
},
{
"epoch": 0.87,
"grad_norm": 10.8125,
"learning_rate": 6.595576619273303e-06,
"loss": 1.9122,
"step": 2198
},
{
"epoch": 0.87,
"grad_norm": 6.25,
"learning_rate": 6.575829383886256e-06,
"loss": 2.048,
"step": 2199
},
{
"epoch": 0.87,
"grad_norm": 6.0,
"learning_rate": 6.556082148499211e-06,
"loss": 1.5724,
"step": 2200
},
{
"epoch": 0.87,
"grad_norm": 8.0,
"learning_rate": 6.5363349131121644e-06,
"loss": 2.1274,
"step": 2201
},
{
"epoch": 0.87,
"grad_norm": 6.15625,
"learning_rate": 6.516587677725119e-06,
"loss": 1.9009,
"step": 2202
},
{
"epoch": 0.87,
"grad_norm": 5.8125,
"learning_rate": 6.496840442338073e-06,
"loss": 2.1833,
"step": 2203
},
{
"epoch": 0.87,
"grad_norm": 7.5,
"learning_rate": 6.477093206951027e-06,
"loss": 2.0501,
"step": 2204
},
{
"epoch": 0.87,
"grad_norm": 5.125,
"learning_rate": 6.457345971563982e-06,
"loss": 1.9837,
"step": 2205
},
{
"epoch": 0.87,
"grad_norm": 7.09375,
"learning_rate": 6.437598736176935e-06,
"loss": 2.1771,
"step": 2206
},
{
"epoch": 0.87,
"grad_norm": 8.5,
"learning_rate": 6.41785150078989e-06,
"loss": 2.2891,
"step": 2207
},
{
"epoch": 0.87,
"grad_norm": 5.8125,
"learning_rate": 6.398104265402843e-06,
"loss": 1.6967,
"step": 2208
},
{
"epoch": 0.87,
"grad_norm": 7.09375,
"learning_rate": 6.378357030015798e-06,
"loss": 2.3102,
"step": 2209
},
{
"epoch": 0.87,
"grad_norm": 6.28125,
"learning_rate": 6.358609794628752e-06,
"loss": 2.0689,
"step": 2210
},
{
"epoch": 0.87,
"grad_norm": 8.9375,
"learning_rate": 6.338862559241707e-06,
"loss": 1.7543,
"step": 2211
},
{
"epoch": 0.87,
"grad_norm": 9.6875,
"learning_rate": 6.3191153238546605e-06,
"loss": 2.0905,
"step": 2212
},
{
"epoch": 0.87,
"grad_norm": 8.75,
"learning_rate": 6.299368088467615e-06,
"loss": 2.0979,
"step": 2213
},
{
"epoch": 0.87,
"grad_norm": 8.375,
"learning_rate": 6.279620853080568e-06,
"loss": 2.0436,
"step": 2214
},
{
"epoch": 0.87,
"grad_norm": 9.625,
"learning_rate": 6.259873617693524e-06,
"loss": 2.0788,
"step": 2215
},
{
"epoch": 0.88,
"grad_norm": 6.34375,
"learning_rate": 6.240126382306477e-06,
"loss": 2.0478,
"step": 2216
},
{
"epoch": 0.88,
"grad_norm": 8.0,
"learning_rate": 6.2203791469194315e-06,
"loss": 1.8715,
"step": 2217
},
{
"epoch": 0.88,
"grad_norm": 7.09375,
"learning_rate": 6.200631911532385e-06,
"loss": 1.8634,
"step": 2218
},
{
"epoch": 0.88,
"grad_norm": 8.875,
"learning_rate": 6.180884676145339e-06,
"loss": 1.5869,
"step": 2219
},
{
"epoch": 0.88,
"grad_norm": 20.875,
"learning_rate": 6.161137440758294e-06,
"loss": 1.9352,
"step": 2220
},
{
"epoch": 0.88,
"grad_norm": 5.96875,
"learning_rate": 6.141390205371248e-06,
"loss": 1.8931,
"step": 2221
},
{
"epoch": 0.88,
"grad_norm": 9.875,
"learning_rate": 6.121642969984203e-06,
"loss": 1.9679,
"step": 2222
},
{
"epoch": 0.88,
"grad_norm": 8.25,
"learning_rate": 6.1018957345971565e-06,
"loss": 2.2001,
"step": 2223
},
{
"epoch": 0.88,
"grad_norm": 8.25,
"learning_rate": 6.08214849921011e-06,
"loss": 2.2332,
"step": 2224
},
{
"epoch": 0.88,
"grad_norm": 6.15625,
"learning_rate": 6.062401263823065e-06,
"loss": 2.0986,
"step": 2225
},
{
"epoch": 0.88,
"grad_norm": 6.78125,
"learning_rate": 6.042654028436019e-06,
"loss": 2.1624,
"step": 2226
},
{
"epoch": 0.88,
"grad_norm": 8.6875,
"learning_rate": 6.022906793048974e-06,
"loss": 2.0263,
"step": 2227
},
{
"epoch": 0.88,
"grad_norm": 4.90625,
"learning_rate": 6.0031595576619276e-06,
"loss": 1.8413,
"step": 2228
},
{
"epoch": 0.88,
"grad_norm": 9.0,
"learning_rate": 5.9834123222748814e-06,
"loss": 2.1582,
"step": 2229
},
{
"epoch": 0.88,
"grad_norm": 7.4375,
"learning_rate": 5.963665086887836e-06,
"loss": 1.8635,
"step": 2230
},
{
"epoch": 0.88,
"grad_norm": 8.4375,
"learning_rate": 5.94391785150079e-06,
"loss": 1.6962,
"step": 2231
},
{
"epoch": 0.88,
"grad_norm": 5.59375,
"learning_rate": 5.924170616113745e-06,
"loss": 2.0882,
"step": 2232
},
{
"epoch": 0.88,
"grad_norm": 8.0625,
"learning_rate": 5.904423380726699e-06,
"loss": 1.9359,
"step": 2233
},
{
"epoch": 0.88,
"grad_norm": 6.625,
"learning_rate": 5.8846761453396525e-06,
"loss": 1.77,
"step": 2234
},
{
"epoch": 0.88,
"grad_norm": 9.75,
"learning_rate": 5.864928909952607e-06,
"loss": 2.3509,
"step": 2235
},
{
"epoch": 0.88,
"grad_norm": 6.65625,
"learning_rate": 5.845181674565561e-06,
"loss": 1.5534,
"step": 2236
},
{
"epoch": 0.88,
"grad_norm": 6.6875,
"learning_rate": 5.825434439178516e-06,
"loss": 2.2574,
"step": 2237
},
{
"epoch": 0.88,
"grad_norm": 7.25,
"learning_rate": 5.80568720379147e-06,
"loss": 2.0624,
"step": 2238
},
{
"epoch": 0.88,
"grad_norm": 5.34375,
"learning_rate": 5.785939968404424e-06,
"loss": 2.0173,
"step": 2239
},
{
"epoch": 0.88,
"grad_norm": 6.875,
"learning_rate": 5.766192733017378e-06,
"loss": 2.0317,
"step": 2240
},
{
"epoch": 0.89,
"grad_norm": 10.5625,
"learning_rate": 5.746445497630331e-06,
"loss": 2.21,
"step": 2241
},
{
"epoch": 0.89,
"grad_norm": 11.0,
"learning_rate": 5.726698262243286e-06,
"loss": 1.7312,
"step": 2242
},
{
"epoch": 0.89,
"grad_norm": 10.125,
"learning_rate": 5.70695102685624e-06,
"loss": 2.2897,
"step": 2243
},
{
"epoch": 0.89,
"grad_norm": 5.0,
"learning_rate": 5.687203791469195e-06,
"loss": 2.1127,
"step": 2244
},
{
"epoch": 0.89,
"grad_norm": 7.71875,
"learning_rate": 5.6674565560821485e-06,
"loss": 2.1219,
"step": 2245
},
{
"epoch": 0.89,
"grad_norm": 5.75,
"learning_rate": 5.647709320695102e-06,
"loss": 1.7997,
"step": 2246
},
{
"epoch": 0.89,
"grad_norm": 8.875,
"learning_rate": 5.627962085308057e-06,
"loss": 2.0101,
"step": 2247
},
{
"epoch": 0.89,
"grad_norm": 6.25,
"learning_rate": 5.608214849921011e-06,
"loss": 1.7678,
"step": 2248
},
{
"epoch": 0.89,
"grad_norm": 6.15625,
"learning_rate": 5.588467614533966e-06,
"loss": 1.829,
"step": 2249
},
{
"epoch": 0.89,
"grad_norm": 6.65625,
"learning_rate": 5.56872037914692e-06,
"loss": 1.9869,
"step": 2250
},
{
"epoch": 0.89,
"grad_norm": 8.0625,
"learning_rate": 5.5489731437598735e-06,
"loss": 1.9587,
"step": 2251
},
{
"epoch": 0.89,
"grad_norm": 4.6875,
"learning_rate": 5.529225908372828e-06,
"loss": 2.0677,
"step": 2252
},
{
"epoch": 0.89,
"grad_norm": 7.5625,
"learning_rate": 5.509478672985782e-06,
"loss": 1.9935,
"step": 2253
},
{
"epoch": 0.89,
"grad_norm": 9.125,
"learning_rate": 5.489731437598737e-06,
"loss": 1.7763,
"step": 2254
},
{
"epoch": 0.89,
"grad_norm": 4.71875,
"learning_rate": 5.469984202211691e-06,
"loss": 1.9501,
"step": 2255
},
{
"epoch": 0.89,
"grad_norm": 5.75,
"learning_rate": 5.4502369668246446e-06,
"loss": 1.7587,
"step": 2256
},
{
"epoch": 0.89,
"grad_norm": 6.0,
"learning_rate": 5.430489731437599e-06,
"loss": 2.4377,
"step": 2257
},
{
"epoch": 0.89,
"grad_norm": 11.375,
"learning_rate": 5.410742496050553e-06,
"loss": 1.6795,
"step": 2258
},
{
"epoch": 0.89,
"grad_norm": 9.875,
"learning_rate": 5.390995260663508e-06,
"loss": 2.4074,
"step": 2259
},
{
"epoch": 0.89,
"grad_norm": 5.96875,
"learning_rate": 5.371248025276462e-06,
"loss": 1.8348,
"step": 2260
},
{
"epoch": 0.89,
"grad_norm": 8.375,
"learning_rate": 5.351500789889416e-06,
"loss": 1.7934,
"step": 2261
},
{
"epoch": 0.89,
"grad_norm": 5.5625,
"learning_rate": 5.33175355450237e-06,
"loss": 2.1144,
"step": 2262
},
{
"epoch": 0.89,
"grad_norm": 13.6875,
"learning_rate": 5.312006319115324e-06,
"loss": 1.5424,
"step": 2263
},
{
"epoch": 0.89,
"grad_norm": 8.25,
"learning_rate": 5.292259083728279e-06,
"loss": 1.8813,
"step": 2264
},
{
"epoch": 0.89,
"grad_norm": 7.1875,
"learning_rate": 5.272511848341233e-06,
"loss": 1.8377,
"step": 2265
},
{
"epoch": 0.89,
"grad_norm": 5.0625,
"learning_rate": 5.252764612954187e-06,
"loss": 2.0018,
"step": 2266
},
{
"epoch": 0.9,
"grad_norm": 11.6875,
"learning_rate": 5.233017377567141e-06,
"loss": 2.1218,
"step": 2267
},
{
"epoch": 0.9,
"grad_norm": 8.3125,
"learning_rate": 5.2132701421800945e-06,
"loss": 2.0607,
"step": 2268
},
{
"epoch": 0.9,
"grad_norm": 10.625,
"learning_rate": 5.193522906793049e-06,
"loss": 1.6292,
"step": 2269
},
{
"epoch": 0.9,
"grad_norm": 6.0,
"learning_rate": 5.173775671406003e-06,
"loss": 2.5156,
"step": 2270
},
{
"epoch": 0.9,
"grad_norm": 5.03125,
"learning_rate": 5.154028436018958e-06,
"loss": 1.7236,
"step": 2271
},
{
"epoch": 0.9,
"grad_norm": 10.3125,
"learning_rate": 5.134281200631912e-06,
"loss": 1.7977,
"step": 2272
},
{
"epoch": 0.9,
"grad_norm": 6.03125,
"learning_rate": 5.1145339652448656e-06,
"loss": 1.9854,
"step": 2273
},
{
"epoch": 0.9,
"grad_norm": 7.90625,
"learning_rate": 5.09478672985782e-06,
"loss": 2.323,
"step": 2274
},
{
"epoch": 0.9,
"grad_norm": 8.8125,
"learning_rate": 5.075039494470774e-06,
"loss": 2.0142,
"step": 2275
},
{
"epoch": 0.9,
"grad_norm": 7.9375,
"learning_rate": 5.055292259083728e-06,
"loss": 2.4279,
"step": 2276
},
{
"epoch": 0.9,
"grad_norm": 12.8125,
"learning_rate": 5.035545023696683e-06,
"loss": 2.0451,
"step": 2277
},
{
"epoch": 0.9,
"grad_norm": 12.4375,
"learning_rate": 5.015797788309637e-06,
"loss": 1.7234,
"step": 2278
},
{
"epoch": 0.9,
"grad_norm": 7.90625,
"learning_rate": 4.996050552922591e-06,
"loss": 1.9291,
"step": 2279
},
{
"epoch": 0.9,
"grad_norm": 8.625,
"learning_rate": 4.976303317535545e-06,
"loss": 2.109,
"step": 2280
},
{
"epoch": 0.9,
"grad_norm": 9.0,
"learning_rate": 4.956556082148499e-06,
"loss": 2.4016,
"step": 2281
},
{
"epoch": 0.9,
"grad_norm": 5.125,
"learning_rate": 4.936808846761454e-06,
"loss": 2.1151,
"step": 2282
},
{
"epoch": 0.9,
"grad_norm": 5.3125,
"learning_rate": 4.917061611374408e-06,
"loss": 2.2157,
"step": 2283
},
{
"epoch": 0.9,
"grad_norm": 9.4375,
"learning_rate": 4.8973143759873624e-06,
"loss": 1.9176,
"step": 2284
},
{
"epoch": 0.9,
"grad_norm": 6.625,
"learning_rate": 4.877567140600316e-06,
"loss": 1.9946,
"step": 2285
},
{
"epoch": 0.9,
"grad_norm": 6.0625,
"learning_rate": 4.85781990521327e-06,
"loss": 2.124,
"step": 2286
},
{
"epoch": 0.9,
"grad_norm": 7.09375,
"learning_rate": 4.838072669826225e-06,
"loss": 1.9942,
"step": 2287
},
{
"epoch": 0.9,
"grad_norm": 7.53125,
"learning_rate": 4.818325434439179e-06,
"loss": 1.9503,
"step": 2288
},
{
"epoch": 0.9,
"grad_norm": 4.3125,
"learning_rate": 4.7985781990521335e-06,
"loss": 2.0472,
"step": 2289
},
{
"epoch": 0.9,
"grad_norm": 12.875,
"learning_rate": 4.778830963665087e-06,
"loss": 1.8307,
"step": 2290
},
{
"epoch": 0.9,
"grad_norm": 6.15625,
"learning_rate": 4.759083728278041e-06,
"loss": 2.1098,
"step": 2291
},
{
"epoch": 0.91,
"grad_norm": 7.625,
"learning_rate": 4.739336492890995e-06,
"loss": 1.8947,
"step": 2292
},
{
"epoch": 0.91,
"grad_norm": 10.5625,
"learning_rate": 4.719589257503949e-06,
"loss": 2.0737,
"step": 2293
},
{
"epoch": 0.91,
"grad_norm": 8.9375,
"learning_rate": 4.699842022116904e-06,
"loss": 1.7753,
"step": 2294
},
{
"epoch": 0.91,
"grad_norm": 6.84375,
"learning_rate": 4.680094786729858e-06,
"loss": 1.6473,
"step": 2295
},
{
"epoch": 0.91,
"grad_norm": 9.4375,
"learning_rate": 4.660347551342812e-06,
"loss": 2.0098,
"step": 2296
},
{
"epoch": 0.91,
"grad_norm": 7.96875,
"learning_rate": 4.640600315955766e-06,
"loss": 1.74,
"step": 2297
},
{
"epoch": 0.91,
"grad_norm": 10.125,
"learning_rate": 4.62085308056872e-06,
"loss": 1.9894,
"step": 2298
},
{
"epoch": 0.91,
"grad_norm": 9.5625,
"learning_rate": 4.601105845181675e-06,
"loss": 2.0322,
"step": 2299
},
{
"epoch": 0.91,
"grad_norm": 5.46875,
"learning_rate": 4.581358609794629e-06,
"loss": 1.7562,
"step": 2300
},
{
"epoch": 0.91,
"grad_norm": 50.5,
"learning_rate": 4.561611374407583e-06,
"loss": 1.7316,
"step": 2301
},
{
"epoch": 0.91,
"grad_norm": 8.4375,
"learning_rate": 4.541864139020537e-06,
"loss": 1.9911,
"step": 2302
},
{
"epoch": 0.91,
"grad_norm": 6.53125,
"learning_rate": 4.522116903633491e-06,
"loss": 1.6662,
"step": 2303
},
{
"epoch": 0.91,
"grad_norm": 6.5,
"learning_rate": 4.502369668246446e-06,
"loss": 1.981,
"step": 2304
},
{
"epoch": 0.91,
"grad_norm": 10.3125,
"learning_rate": 4.4826224328594e-06,
"loss": 2.0215,
"step": 2305
},
{
"epoch": 0.91,
"grad_norm": 6.9375,
"learning_rate": 4.4628751974723545e-06,
"loss": 2.1085,
"step": 2306
},
{
"epoch": 0.91,
"grad_norm": 7.59375,
"learning_rate": 4.443127962085308e-06,
"loss": 1.6288,
"step": 2307
},
{
"epoch": 0.91,
"grad_norm": 9.25,
"learning_rate": 4.423380726698262e-06,
"loss": 2.0319,
"step": 2308
},
{
"epoch": 0.91,
"grad_norm": 8.0625,
"learning_rate": 4.403633491311217e-06,
"loss": 1.8166,
"step": 2309
},
{
"epoch": 0.91,
"grad_norm": 7.15625,
"learning_rate": 4.383886255924171e-06,
"loss": 2.0406,
"step": 2310
},
{
"epoch": 0.91,
"grad_norm": 6.875,
"learning_rate": 4.3641390205371256e-06,
"loss": 1.584,
"step": 2311
},
{
"epoch": 0.91,
"grad_norm": 9.125,
"learning_rate": 4.3443917851500794e-06,
"loss": 2.174,
"step": 2312
},
{
"epoch": 0.91,
"grad_norm": 8.0625,
"learning_rate": 4.324644549763033e-06,
"loss": 1.9839,
"step": 2313
},
{
"epoch": 0.91,
"grad_norm": 7.25,
"learning_rate": 4.304897314375988e-06,
"loss": 2.0373,
"step": 2314
},
{
"epoch": 0.91,
"grad_norm": 7.1875,
"learning_rate": 4.285150078988942e-06,
"loss": 1.6178,
"step": 2315
},
{
"epoch": 0.91,
"grad_norm": 8.1875,
"learning_rate": 4.265402843601897e-06,
"loss": 1.933,
"step": 2316
},
{
"epoch": 0.92,
"grad_norm": 6.21875,
"learning_rate": 4.24565560821485e-06,
"loss": 1.796,
"step": 2317
},
{
"epoch": 0.92,
"grad_norm": 8.5625,
"learning_rate": 4.225908372827804e-06,
"loss": 1.874,
"step": 2318
},
{
"epoch": 0.92,
"grad_norm": 6.5,
"learning_rate": 4.206161137440758e-06,
"loss": 1.9773,
"step": 2319
},
{
"epoch": 0.92,
"grad_norm": 8.75,
"learning_rate": 4.186413902053712e-06,
"loss": 2.16,
"step": 2320
},
{
"epoch": 0.92,
"grad_norm": 7.125,
"learning_rate": 4.166666666666667e-06,
"loss": 2.2667,
"step": 2321
},
{
"epoch": 0.92,
"grad_norm": 8.3125,
"learning_rate": 4.146919431279621e-06,
"loss": 2.1831,
"step": 2322
},
{
"epoch": 0.92,
"grad_norm": 11.0625,
"learning_rate": 4.1271721958925755e-06,
"loss": 2.2412,
"step": 2323
},
{
"epoch": 0.92,
"grad_norm": 6.625,
"learning_rate": 4.107424960505529e-06,
"loss": 2.2614,
"step": 2324
},
{
"epoch": 0.92,
"grad_norm": 7.21875,
"learning_rate": 4.087677725118483e-06,
"loss": 2.1572,
"step": 2325
},
{
"epoch": 0.92,
"grad_norm": 12.8125,
"learning_rate": 4.067930489731438e-06,
"loss": 1.9435,
"step": 2326
},
{
"epoch": 0.92,
"grad_norm": 5.375,
"learning_rate": 4.048183254344392e-06,
"loss": 2.0918,
"step": 2327
},
{
"epoch": 0.92,
"grad_norm": 5.34375,
"learning_rate": 4.0284360189573465e-06,
"loss": 2.0789,
"step": 2328
},
{
"epoch": 0.92,
"grad_norm": 6.15625,
"learning_rate": 4.0086887835703e-06,
"loss": 1.8412,
"step": 2329
},
{
"epoch": 0.92,
"grad_norm": 7.71875,
"learning_rate": 3.988941548183254e-06,
"loss": 2.0313,
"step": 2330
},
{
"epoch": 0.92,
"grad_norm": 6.625,
"learning_rate": 3.969194312796209e-06,
"loss": 1.6321,
"step": 2331
},
{
"epoch": 0.92,
"grad_norm": 7.71875,
"learning_rate": 3.949447077409163e-06,
"loss": 2.0381,
"step": 2332
},
{
"epoch": 0.92,
"grad_norm": 6.21875,
"learning_rate": 3.929699842022117e-06,
"loss": 1.9909,
"step": 2333
},
{
"epoch": 0.92,
"grad_norm": 7.15625,
"learning_rate": 3.9099526066350715e-06,
"loss": 2.0142,
"step": 2334
},
{
"epoch": 0.92,
"grad_norm": 7.4375,
"learning_rate": 3.890205371248025e-06,
"loss": 1.9865,
"step": 2335
},
{
"epoch": 0.92,
"grad_norm": 9.0625,
"learning_rate": 3.87045813586098e-06,
"loss": 2.0208,
"step": 2336
},
{
"epoch": 0.92,
"grad_norm": 8.25,
"learning_rate": 3.850710900473934e-06,
"loss": 1.8589,
"step": 2337
},
{
"epoch": 0.92,
"grad_norm": 5.3125,
"learning_rate": 3.830963665086888e-06,
"loss": 2.0689,
"step": 2338
},
{
"epoch": 0.92,
"grad_norm": 7.59375,
"learning_rate": 3.8112164296998426e-06,
"loss": 2.3421,
"step": 2339
},
{
"epoch": 0.92,
"grad_norm": 5.90625,
"learning_rate": 3.791469194312797e-06,
"loss": 1.8997,
"step": 2340
},
{
"epoch": 0.92,
"grad_norm": 7.0625,
"learning_rate": 3.7717219589257507e-06,
"loss": 2.2321,
"step": 2341
},
{
"epoch": 0.92,
"grad_norm": 6.28125,
"learning_rate": 3.751974723538705e-06,
"loss": 2.0007,
"step": 2342
},
{
"epoch": 0.93,
"grad_norm": 10.125,
"learning_rate": 3.7322274881516585e-06,
"loss": 2.2585,
"step": 2343
},
{
"epoch": 0.93,
"grad_norm": 6.28125,
"learning_rate": 3.712480252764613e-06,
"loss": 1.8483,
"step": 2344
},
{
"epoch": 0.93,
"grad_norm": 8.4375,
"learning_rate": 3.692733017377567e-06,
"loss": 2.3332,
"step": 2345
},
{
"epoch": 0.93,
"grad_norm": 7.46875,
"learning_rate": 3.6729857819905214e-06,
"loss": 1.8113,
"step": 2346
},
{
"epoch": 0.93,
"grad_norm": 7.71875,
"learning_rate": 3.6532385466034753e-06,
"loss": 1.5883,
"step": 2347
},
{
"epoch": 0.93,
"grad_norm": 8.1875,
"learning_rate": 3.6334913112164296e-06,
"loss": 2.0843,
"step": 2348
},
{
"epoch": 0.93,
"grad_norm": 7.0,
"learning_rate": 3.613744075829384e-06,
"loss": 1.9153,
"step": 2349
},
{
"epoch": 0.93,
"grad_norm": 7.65625,
"learning_rate": 3.593996840442338e-06,
"loss": 1.8961,
"step": 2350
},
{
"epoch": 0.93,
"grad_norm": 5.03125,
"learning_rate": 3.5742496050552925e-06,
"loss": 1.9811,
"step": 2351
},
{
"epoch": 0.93,
"grad_norm": 5.34375,
"learning_rate": 3.5545023696682464e-06,
"loss": 1.7207,
"step": 2352
},
{
"epoch": 0.93,
"grad_norm": 11.6875,
"learning_rate": 3.5347551342812007e-06,
"loss": 1.9094,
"step": 2353
},
{
"epoch": 0.93,
"grad_norm": 7.84375,
"learning_rate": 3.515007898894155e-06,
"loss": 2.1064,
"step": 2354
},
{
"epoch": 0.93,
"grad_norm": 6.65625,
"learning_rate": 3.4952606635071093e-06,
"loss": 1.9244,
"step": 2355
},
{
"epoch": 0.93,
"grad_norm": 8.875,
"learning_rate": 3.4755134281200636e-06,
"loss": 1.7979,
"step": 2356
},
{
"epoch": 0.93,
"grad_norm": 14.3125,
"learning_rate": 3.4557661927330174e-06,
"loss": 2.1207,
"step": 2357
},
{
"epoch": 0.93,
"grad_norm": 8.8125,
"learning_rate": 3.4360189573459717e-06,
"loss": 1.9833,
"step": 2358
},
{
"epoch": 0.93,
"grad_norm": 10.625,
"learning_rate": 3.416271721958926e-06,
"loss": 2.1655,
"step": 2359
},
{
"epoch": 0.93,
"grad_norm": 9.6875,
"learning_rate": 3.3965244865718803e-06,
"loss": 1.8125,
"step": 2360
},
{
"epoch": 0.93,
"grad_norm": 14.25,
"learning_rate": 3.3767772511848346e-06,
"loss": 1.7163,
"step": 2361
},
{
"epoch": 0.93,
"grad_norm": 9.75,
"learning_rate": 3.3570300157977885e-06,
"loss": 1.8787,
"step": 2362
},
{
"epoch": 0.93,
"grad_norm": 9.6875,
"learning_rate": 3.337282780410743e-06,
"loss": 2.0581,
"step": 2363
},
{
"epoch": 0.93,
"grad_norm": 7.15625,
"learning_rate": 3.317535545023697e-06,
"loss": 1.7655,
"step": 2364
},
{
"epoch": 0.93,
"grad_norm": 7.71875,
"learning_rate": 3.2977883096366514e-06,
"loss": 2.2519,
"step": 2365
},
{
"epoch": 0.93,
"grad_norm": 6.625,
"learning_rate": 3.2780410742496057e-06,
"loss": 1.9056,
"step": 2366
},
{
"epoch": 0.93,
"grad_norm": 6.5,
"learning_rate": 3.2582938388625596e-06,
"loss": 1.7427,
"step": 2367
},
{
"epoch": 0.94,
"grad_norm": 15.8125,
"learning_rate": 3.2385466034755135e-06,
"loss": 1.9997,
"step": 2368
},
{
"epoch": 0.94,
"grad_norm": 9.5,
"learning_rate": 3.2187993680884673e-06,
"loss": 1.9292,
"step": 2369
},
{
"epoch": 0.94,
"grad_norm": 17.75,
"learning_rate": 3.1990521327014216e-06,
"loss": 2.2422,
"step": 2370
},
{
"epoch": 0.94,
"grad_norm": 9.3125,
"learning_rate": 3.179304897314376e-06,
"loss": 1.9487,
"step": 2371
},
{
"epoch": 0.94,
"grad_norm": 7.09375,
"learning_rate": 3.1595576619273302e-06,
"loss": 2.0298,
"step": 2372
},
{
"epoch": 0.94,
"grad_norm": 11.125,
"learning_rate": 3.139810426540284e-06,
"loss": 1.9431,
"step": 2373
},
{
"epoch": 0.94,
"grad_norm": 5.71875,
"learning_rate": 3.1200631911532384e-06,
"loss": 2.1532,
"step": 2374
},
{
"epoch": 0.94,
"grad_norm": 8.25,
"learning_rate": 3.1003159557661927e-06,
"loss": 2.2057,
"step": 2375
},
{
"epoch": 0.94,
"grad_norm": 5.75,
"learning_rate": 3.080568720379147e-06,
"loss": 1.5939,
"step": 2376
},
{
"epoch": 0.94,
"grad_norm": 8.5,
"learning_rate": 3.0608214849921013e-06,
"loss": 2.2438,
"step": 2377
},
{
"epoch": 0.94,
"grad_norm": 7.0625,
"learning_rate": 3.041074249605055e-06,
"loss": 1.8359,
"step": 2378
},
{
"epoch": 0.94,
"grad_norm": 7.59375,
"learning_rate": 3.0213270142180095e-06,
"loss": 1.839,
"step": 2379
},
{
"epoch": 0.94,
"grad_norm": 7.34375,
"learning_rate": 3.0015797788309638e-06,
"loss": 2.0552,
"step": 2380
},
{
"epoch": 0.94,
"grad_norm": 10.0625,
"learning_rate": 2.981832543443918e-06,
"loss": 2.0166,
"step": 2381
},
{
"epoch": 0.94,
"grad_norm": 5.6875,
"learning_rate": 2.9620853080568724e-06,
"loss": 2.0933,
"step": 2382
},
{
"epoch": 0.94,
"grad_norm": 6.90625,
"learning_rate": 2.9423380726698263e-06,
"loss": 1.8613,
"step": 2383
},
{
"epoch": 0.94,
"grad_norm": 17.0,
"learning_rate": 2.9225908372827806e-06,
"loss": 1.7204,
"step": 2384
},
{
"epoch": 0.94,
"grad_norm": 7.71875,
"learning_rate": 2.902843601895735e-06,
"loss": 2.0525,
"step": 2385
},
{
"epoch": 0.94,
"grad_norm": 10.4375,
"learning_rate": 2.883096366508689e-06,
"loss": 1.9942,
"step": 2386
},
{
"epoch": 0.94,
"grad_norm": 7.0,
"learning_rate": 2.863349131121643e-06,
"loss": 1.9629,
"step": 2387
},
{
"epoch": 0.94,
"grad_norm": 7.0,
"learning_rate": 2.8436018957345973e-06,
"loss": 1.8319,
"step": 2388
},
{
"epoch": 0.94,
"grad_norm": 8.8125,
"learning_rate": 2.823854660347551e-06,
"loss": 1.7072,
"step": 2389
},
{
"epoch": 0.94,
"grad_norm": 6.9375,
"learning_rate": 2.8041074249605055e-06,
"loss": 1.9124,
"step": 2390
},
{
"epoch": 0.94,
"grad_norm": 13.0,
"learning_rate": 2.78436018957346e-06,
"loss": 2.0848,
"step": 2391
},
{
"epoch": 0.94,
"grad_norm": 6.125,
"learning_rate": 2.764612954186414e-06,
"loss": 2.1611,
"step": 2392
},
{
"epoch": 0.95,
"grad_norm": 11.0,
"learning_rate": 2.7448657187993684e-06,
"loss": 2.2561,
"step": 2393
},
{
"epoch": 0.95,
"grad_norm": 5.40625,
"learning_rate": 2.7251184834123223e-06,
"loss": 1.7778,
"step": 2394
},
{
"epoch": 0.95,
"grad_norm": 14.25,
"learning_rate": 2.7053712480252766e-06,
"loss": 2.0715,
"step": 2395
},
{
"epoch": 0.95,
"grad_norm": 5.46875,
"learning_rate": 2.685624012638231e-06,
"loss": 1.8563,
"step": 2396
},
{
"epoch": 0.95,
"grad_norm": 8.0,
"learning_rate": 2.665876777251185e-06,
"loss": 1.7983,
"step": 2397
},
{
"epoch": 0.95,
"grad_norm": 6.6875,
"learning_rate": 2.6461295418641395e-06,
"loss": 1.9106,
"step": 2398
},
{
"epoch": 0.95,
"grad_norm": 7.03125,
"learning_rate": 2.6263823064770934e-06,
"loss": 2.111,
"step": 2399
},
{
"epoch": 0.95,
"grad_norm": 5.40625,
"learning_rate": 2.6066350710900472e-06,
"loss": 1.5117,
"step": 2400
},
{
"epoch": 0.95,
"grad_norm": 5.40625,
"learning_rate": 2.5868878357030015e-06,
"loss": 2.0003,
"step": 2401
},
{
"epoch": 0.95,
"grad_norm": 8.25,
"learning_rate": 2.567140600315956e-06,
"loss": 2.0373,
"step": 2402
},
{
"epoch": 0.95,
"grad_norm": 6.46875,
"learning_rate": 2.54739336492891e-06,
"loss": 2.4003,
"step": 2403
},
{
"epoch": 0.95,
"grad_norm": 4.53125,
"learning_rate": 2.527646129541864e-06,
"loss": 2.2799,
"step": 2404
},
{
"epoch": 0.95,
"grad_norm": 4.90625,
"learning_rate": 2.5078988941548183e-06,
"loss": 1.9846,
"step": 2405
},
{
"epoch": 0.95,
"grad_norm": 6.15625,
"learning_rate": 2.4881516587677726e-06,
"loss": 2.1624,
"step": 2406
},
{
"epoch": 0.95,
"grad_norm": 8.6875,
"learning_rate": 2.468404423380727e-06,
"loss": 2.1508,
"step": 2407
},
{
"epoch": 0.95,
"grad_norm": 5.4375,
"learning_rate": 2.4486571879936812e-06,
"loss": 2.0976,
"step": 2408
},
{
"epoch": 0.95,
"grad_norm": 7.65625,
"learning_rate": 2.428909952606635e-06,
"loss": 1.8191,
"step": 2409
},
{
"epoch": 0.95,
"grad_norm": 7.8125,
"learning_rate": 2.4091627172195894e-06,
"loss": 2.2424,
"step": 2410
},
{
"epoch": 0.95,
"grad_norm": 6.71875,
"learning_rate": 2.3894154818325437e-06,
"loss": 2.0062,
"step": 2411
},
{
"epoch": 0.95,
"grad_norm": 4.03125,
"learning_rate": 2.3696682464454976e-06,
"loss": 2.0536,
"step": 2412
},
{
"epoch": 0.95,
"grad_norm": 15.625,
"learning_rate": 2.349921011058452e-06,
"loss": 2.3879,
"step": 2413
},
{
"epoch": 0.95,
"grad_norm": 6.75,
"learning_rate": 2.330173775671406e-06,
"loss": 2.301,
"step": 2414
},
{
"epoch": 0.95,
"grad_norm": 6.0625,
"learning_rate": 2.31042654028436e-06,
"loss": 2.1147,
"step": 2415
},
{
"epoch": 0.95,
"grad_norm": 7.34375,
"learning_rate": 2.2906793048973143e-06,
"loss": 1.8629,
"step": 2416
},
{
"epoch": 0.95,
"grad_norm": 7.625,
"learning_rate": 2.2709320695102686e-06,
"loss": 2.0537,
"step": 2417
},
{
"epoch": 0.95,
"grad_norm": 5.5,
"learning_rate": 2.251184834123223e-06,
"loss": 1.9404,
"step": 2418
},
{
"epoch": 0.96,
"grad_norm": 4.53125,
"learning_rate": 2.2314375987361772e-06,
"loss": 1.7811,
"step": 2419
},
{
"epoch": 0.96,
"grad_norm": 6.875,
"learning_rate": 2.211690363349131e-06,
"loss": 2.1955,
"step": 2420
},
{
"epoch": 0.96,
"grad_norm": 5.71875,
"learning_rate": 2.1919431279620854e-06,
"loss": 2.1887,
"step": 2421
},
{
"epoch": 0.96,
"grad_norm": 7.09375,
"learning_rate": 2.1721958925750397e-06,
"loss": 2.2411,
"step": 2422
},
{
"epoch": 0.96,
"grad_norm": 10.0625,
"learning_rate": 2.152448657187994e-06,
"loss": 1.9926,
"step": 2423
},
{
"epoch": 0.96,
"grad_norm": 5.40625,
"learning_rate": 2.1327014218009483e-06,
"loss": 2.0833,
"step": 2424
},
{
"epoch": 0.96,
"grad_norm": 5.53125,
"learning_rate": 2.112954186413902e-06,
"loss": 1.8915,
"step": 2425
},
{
"epoch": 0.96,
"grad_norm": 7.4375,
"learning_rate": 2.093206951026856e-06,
"loss": 2.2001,
"step": 2426
},
{
"epoch": 0.96,
"grad_norm": 4.625,
"learning_rate": 2.0734597156398104e-06,
"loss": 1.9952,
"step": 2427
},
{
"epoch": 0.96,
"grad_norm": 7.28125,
"learning_rate": 2.0537124802527647e-06,
"loss": 2.5361,
"step": 2428
},
{
"epoch": 0.96,
"grad_norm": 6.71875,
"learning_rate": 2.033965244865719e-06,
"loss": 2.0804,
"step": 2429
},
{
"epoch": 0.96,
"grad_norm": 8.5,
"learning_rate": 2.0142180094786733e-06,
"loss": 1.9336,
"step": 2430
},
{
"epoch": 0.96,
"grad_norm": 7.90625,
"learning_rate": 1.994470774091627e-06,
"loss": 2.4606,
"step": 2431
},
{
"epoch": 0.96,
"grad_norm": 7.1875,
"learning_rate": 1.9747235387045814e-06,
"loss": 2.1012,
"step": 2432
},
{
"epoch": 0.96,
"grad_norm": 6.96875,
"learning_rate": 1.9549763033175357e-06,
"loss": 2.2764,
"step": 2433
},
{
"epoch": 0.96,
"grad_norm": 13.1875,
"learning_rate": 1.93522906793049e-06,
"loss": 1.9863,
"step": 2434
},
{
"epoch": 0.96,
"grad_norm": 8.4375,
"learning_rate": 1.915481832543444e-06,
"loss": 2.2291,
"step": 2435
},
{
"epoch": 0.96,
"grad_norm": 5.53125,
"learning_rate": 1.8957345971563984e-06,
"loss": 1.6912,
"step": 2436
},
{
"epoch": 0.96,
"grad_norm": 6.875,
"learning_rate": 1.8759873617693525e-06,
"loss": 2.3802,
"step": 2437
},
{
"epoch": 0.96,
"grad_norm": 5.84375,
"learning_rate": 1.8562401263823064e-06,
"loss": 1.9867,
"step": 2438
},
{
"epoch": 0.96,
"grad_norm": 6.0,
"learning_rate": 1.8364928909952607e-06,
"loss": 1.8941,
"step": 2439
},
{
"epoch": 0.96,
"grad_norm": 7.84375,
"learning_rate": 1.8167456556082148e-06,
"loss": 2.0673,
"step": 2440
},
{
"epoch": 0.96,
"grad_norm": 6.96875,
"learning_rate": 1.796998420221169e-06,
"loss": 2.1157,
"step": 2441
},
{
"epoch": 0.96,
"grad_norm": 9.8125,
"learning_rate": 1.7772511848341232e-06,
"loss": 1.5521,
"step": 2442
},
{
"epoch": 0.96,
"grad_norm": 6.53125,
"learning_rate": 1.7575039494470775e-06,
"loss": 2.2369,
"step": 2443
},
{
"epoch": 0.97,
"grad_norm": 8.0,
"learning_rate": 1.7377567140600318e-06,
"loss": 1.9889,
"step": 2444
},
{
"epoch": 0.97,
"grad_norm": 6.1875,
"learning_rate": 1.7180094786729859e-06,
"loss": 1.7363,
"step": 2445
},
{
"epoch": 0.97,
"grad_norm": 6.46875,
"learning_rate": 1.6982622432859402e-06,
"loss": 2.2275,
"step": 2446
},
{
"epoch": 0.97,
"grad_norm": 17.625,
"learning_rate": 1.6785150078988943e-06,
"loss": 1.7336,
"step": 2447
},
{
"epoch": 0.97,
"grad_norm": 8.3125,
"learning_rate": 1.6587677725118486e-06,
"loss": 2.0279,
"step": 2448
},
{
"epoch": 0.97,
"grad_norm": 8.0625,
"learning_rate": 1.6390205371248029e-06,
"loss": 2.1145,
"step": 2449
},
{
"epoch": 0.97,
"grad_norm": 5.5625,
"learning_rate": 1.6192733017377567e-06,
"loss": 2.0784,
"step": 2450
},
{
"epoch": 0.97,
"grad_norm": 5.8125,
"learning_rate": 1.5995260663507108e-06,
"loss": 1.7722,
"step": 2451
},
{
"epoch": 0.97,
"grad_norm": 8.9375,
"learning_rate": 1.5797788309636651e-06,
"loss": 2.1229,
"step": 2452
},
{
"epoch": 0.97,
"grad_norm": 8.3125,
"learning_rate": 1.5600315955766192e-06,
"loss": 1.9024,
"step": 2453
},
{
"epoch": 0.97,
"grad_norm": 9.1875,
"learning_rate": 1.5402843601895735e-06,
"loss": 2.3533,
"step": 2454
},
{
"epoch": 0.97,
"grad_norm": 5.46875,
"learning_rate": 1.5205371248025276e-06,
"loss": 1.8932,
"step": 2455
},
{
"epoch": 0.97,
"grad_norm": 13.1875,
"learning_rate": 1.5007898894154819e-06,
"loss": 1.8318,
"step": 2456
},
{
"epoch": 0.97,
"grad_norm": 8.625,
"learning_rate": 1.4810426540284362e-06,
"loss": 1.9967,
"step": 2457
},
{
"epoch": 0.97,
"grad_norm": 5.15625,
"learning_rate": 1.4612954186413903e-06,
"loss": 1.923,
"step": 2458
},
{
"epoch": 0.97,
"grad_norm": 6.78125,
"learning_rate": 1.4415481832543446e-06,
"loss": 2.2625,
"step": 2459
},
{
"epoch": 0.97,
"grad_norm": 9.1875,
"learning_rate": 1.4218009478672987e-06,
"loss": 1.6955,
"step": 2460
},
{
"epoch": 0.97,
"grad_norm": 9.4375,
"learning_rate": 1.4020537124802528e-06,
"loss": 1.9177,
"step": 2461
},
{
"epoch": 0.97,
"grad_norm": 7.65625,
"learning_rate": 1.382306477093207e-06,
"loss": 2.3586,
"step": 2462
},
{
"epoch": 0.97,
"grad_norm": 8.0,
"learning_rate": 1.3625592417061611e-06,
"loss": 2.2572,
"step": 2463
},
{
"epoch": 0.97,
"grad_norm": 10.375,
"learning_rate": 1.3428120063191154e-06,
"loss": 1.6699,
"step": 2464
},
{
"epoch": 0.97,
"grad_norm": 5.65625,
"learning_rate": 1.3230647709320697e-06,
"loss": 1.985,
"step": 2465
},
{
"epoch": 0.97,
"grad_norm": 7.84375,
"learning_rate": 1.3033175355450236e-06,
"loss": 1.918,
"step": 2466
},
{
"epoch": 0.97,
"grad_norm": 7.21875,
"learning_rate": 1.283570300157978e-06,
"loss": 2.0159,
"step": 2467
},
{
"epoch": 0.97,
"grad_norm": 6.8125,
"learning_rate": 1.263823064770932e-06,
"loss": 1.9729,
"step": 2468
},
{
"epoch": 0.98,
"grad_norm": 10.625,
"learning_rate": 1.2440758293838863e-06,
"loss": 2.0205,
"step": 2469
},
{
"epoch": 0.98,
"grad_norm": 5.71875,
"learning_rate": 1.2243285939968406e-06,
"loss": 2.1024,
"step": 2470
},
{
"epoch": 0.98,
"grad_norm": 6.59375,
"learning_rate": 1.2045813586097947e-06,
"loss": 1.9047,
"step": 2471
},
{
"epoch": 0.98,
"grad_norm": 5.71875,
"learning_rate": 1.1848341232227488e-06,
"loss": 2.0228,
"step": 2472
},
{
"epoch": 0.98,
"grad_norm": 5.0625,
"learning_rate": 1.165086887835703e-06,
"loss": 1.786,
"step": 2473
},
{
"epoch": 0.98,
"grad_norm": 5.96875,
"learning_rate": 1.1453396524486572e-06,
"loss": 1.5598,
"step": 2474
},
{
"epoch": 0.98,
"grad_norm": 12.9375,
"learning_rate": 1.1255924170616115e-06,
"loss": 1.887,
"step": 2475
},
{
"epoch": 0.98,
"grad_norm": 9.6875,
"learning_rate": 1.1058451816745656e-06,
"loss": 1.8705,
"step": 2476
},
{
"epoch": 0.98,
"grad_norm": 25.875,
"learning_rate": 1.0860979462875199e-06,
"loss": 1.9548,
"step": 2477
},
{
"epoch": 0.98,
"grad_norm": 6.75,
"learning_rate": 1.0663507109004742e-06,
"loss": 2.3911,
"step": 2478
},
{
"epoch": 0.98,
"grad_norm": 5.53125,
"learning_rate": 1.046603475513428e-06,
"loss": 1.9153,
"step": 2479
},
{
"epoch": 0.98,
"grad_norm": 7.03125,
"learning_rate": 1.0268562401263823e-06,
"loss": 1.9673,
"step": 2480
},
{
"epoch": 0.98,
"grad_norm": 5.03125,
"learning_rate": 1.0071090047393366e-06,
"loss": 1.9677,
"step": 2481
},
{
"epoch": 0.98,
"grad_norm": 6.96875,
"learning_rate": 9.873617693522907e-07,
"loss": 2.0399,
"step": 2482
},
{
"epoch": 0.98,
"grad_norm": 9.75,
"learning_rate": 9.67614533965245e-07,
"loss": 1.8344,
"step": 2483
},
{
"epoch": 0.98,
"grad_norm": 6.21875,
"learning_rate": 9.478672985781992e-07,
"loss": 2.0093,
"step": 2484
},
{
"epoch": 0.98,
"grad_norm": 6.84375,
"learning_rate": 9.281200631911532e-07,
"loss": 2.0771,
"step": 2485
},
{
"epoch": 0.98,
"grad_norm": 8.125,
"learning_rate": 9.083728278041074e-07,
"loss": 1.785,
"step": 2486
},
{
"epoch": 0.98,
"grad_norm": 5.0625,
"learning_rate": 8.886255924170616e-07,
"loss": 2.1664,
"step": 2487
},
{
"epoch": 0.98,
"grad_norm": 5.96875,
"learning_rate": 8.688783570300159e-07,
"loss": 2.1429,
"step": 2488
},
{
"epoch": 0.98,
"grad_norm": 9.0,
"learning_rate": 8.491311216429701e-07,
"loss": 1.688,
"step": 2489
},
{
"epoch": 0.98,
"grad_norm": 6.75,
"learning_rate": 8.293838862559243e-07,
"loss": 1.8533,
"step": 2490
},
{
"epoch": 0.98,
"grad_norm": 9.625,
"learning_rate": 8.096366508688784e-07,
"loss": 2.0315,
"step": 2491
},
{
"epoch": 0.98,
"grad_norm": 7.75,
"learning_rate": 7.898894154818326e-07,
"loss": 2.1683,
"step": 2492
},
{
"epoch": 0.98,
"grad_norm": 7.9375,
"learning_rate": 7.701421800947868e-07,
"loss": 2.0874,
"step": 2493
},
{
"epoch": 0.98,
"grad_norm": 10.0625,
"learning_rate": 7.503949447077409e-07,
"loss": 1.8496,
"step": 2494
},
{
"epoch": 0.99,
"grad_norm": 41.25,
"learning_rate": 7.306477093206951e-07,
"loss": 2.0257,
"step": 2495
},
{
"epoch": 0.99,
"grad_norm": 6.4375,
"learning_rate": 7.109004739336493e-07,
"loss": 1.7977,
"step": 2496
},
{
"epoch": 0.99,
"grad_norm": 5.53125,
"learning_rate": 6.911532385466035e-07,
"loss": 2.0043,
"step": 2497
},
{
"epoch": 0.99,
"grad_norm": 4.78125,
"learning_rate": 6.714060031595577e-07,
"loss": 1.9992,
"step": 2498
},
{
"epoch": 0.99,
"grad_norm": 5.875,
"learning_rate": 6.516587677725118e-07,
"loss": 2.1273,
"step": 2499
},
{
"epoch": 0.99,
"grad_norm": 5.0625,
"learning_rate": 6.31911532385466e-07,
"loss": 1.702,
"step": 2500
},
{
"epoch": 0.99,
"grad_norm": 7.53125,
"learning_rate": 6.121642969984203e-07,
"loss": 2.1332,
"step": 2501
},
{
"epoch": 0.99,
"grad_norm": 7.09375,
"learning_rate": 5.924170616113744e-07,
"loss": 1.6806,
"step": 2502
},
{
"epoch": 0.99,
"grad_norm": 12.75,
"learning_rate": 5.726698262243286e-07,
"loss": 1.9727,
"step": 2503
},
{
"epoch": 0.99,
"grad_norm": 5.15625,
"learning_rate": 5.529225908372828e-07,
"loss": 2.0633,
"step": 2504
},
{
"epoch": 0.99,
"grad_norm": 5.5,
"learning_rate": 5.331753554502371e-07,
"loss": 2.0735,
"step": 2505
},
{
"epoch": 0.99,
"grad_norm": 8.625,
"learning_rate": 5.134281200631912e-07,
"loss": 2.0358,
"step": 2506
},
{
"epoch": 0.99,
"grad_norm": 6.59375,
"learning_rate": 4.936808846761454e-07,
"loss": 2.2405,
"step": 2507
},
{
"epoch": 0.99,
"grad_norm": 8.625,
"learning_rate": 4.739336492890996e-07,
"loss": 1.8639,
"step": 2508
},
{
"epoch": 0.99,
"grad_norm": 4.71875,
"learning_rate": 4.541864139020537e-07,
"loss": 1.998,
"step": 2509
},
{
"epoch": 0.99,
"grad_norm": 9.375,
"learning_rate": 4.3443917851500794e-07,
"loss": 2.1872,
"step": 2510
},
{
"epoch": 0.99,
"grad_norm": 6.84375,
"learning_rate": 4.1469194312796214e-07,
"loss": 2.4313,
"step": 2511
},
{
"epoch": 0.99,
"grad_norm": 6.71875,
"learning_rate": 3.949447077409163e-07,
"loss": 1.7067,
"step": 2512
},
{
"epoch": 0.99,
"grad_norm": 5.28125,
"learning_rate": 3.7519747235387047e-07,
"loss": 1.7594,
"step": 2513
},
{
"epoch": 0.99,
"grad_norm": 7.90625,
"learning_rate": 3.5545023696682467e-07,
"loss": 1.7605,
"step": 2514
},
{
"epoch": 0.99,
"grad_norm": 6.90625,
"learning_rate": 3.3570300157977886e-07,
"loss": 1.9571,
"step": 2515
},
{
"epoch": 0.99,
"grad_norm": 8.3125,
"learning_rate": 3.15955766192733e-07,
"loss": 1.9812,
"step": 2516
},
{
"epoch": 0.99,
"grad_norm": 12.0,
"learning_rate": 2.962085308056872e-07,
"loss": 1.7699,
"step": 2517
},
{
"epoch": 0.99,
"grad_norm": 5.75,
"learning_rate": 2.764612954186414e-07,
"loss": 2.0047,
"step": 2518
},
{
"epoch": 0.99,
"grad_norm": 7.09375,
"learning_rate": 2.567140600315956e-07,
"loss": 1.817,
"step": 2519
},
{
"epoch": 1.0,
"grad_norm": 5.59375,
"learning_rate": 2.369668246445498e-07,
"loss": 2.0036,
"step": 2520
}
],
"logging_steps": 1.0,
"max_steps": 2532,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 3.930125662594636e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}