{ "best_global_step": 12168, "best_metric": 0.0602399967610836, "best_model_checkpoint": "./models/whisper-small-ml-baseline-high-lr/checkpoint-12168", "epoch": 5.0, "eval_steps": 4056, "global_step": 20280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.034205198287964, "eval_runtime": 65.0644, "eval_samples_per_second": 244.819, "eval_steps_per_second": 1.921, "step": 0 }, { "epoch": 0.0004930966469428008, "grad_norm": 23.125, "learning_rate": 9.861932938856017e-08, "loss": 1.9732, "step": 2 }, { "epoch": 0.0009861932938856016, "grad_norm": 23.125, "learning_rate": 2.958579881656805e-07, "loss": 1.9901, "step": 4 }, { "epoch": 0.0014792899408284023, "grad_norm": 23.25, "learning_rate": 4.930966469428008e-07, "loss": 1.9936, "step": 6 }, { "epoch": 0.0019723865877712033, "grad_norm": 23.75, "learning_rate": 6.903353057199211e-07, "loss": 1.9993, "step": 8 }, { "epoch": 0.002465483234714004, "grad_norm": 22.25, "learning_rate": 8.875739644970415e-07, "loss": 1.953, "step": 10 }, { "epoch": 0.0029585798816568047, "grad_norm": 22.125, "learning_rate": 1.0848126232741618e-06, "loss": 1.972, "step": 12 }, { "epoch": 0.0034516765285996054, "grad_norm": 22.875, "learning_rate": 1.282051282051282e-06, "loss": 1.9888, "step": 14 }, { "epoch": 0.0039447731755424065, "grad_norm": 22.75, "learning_rate": 1.4792899408284024e-06, "loss": 1.9878, "step": 16 }, { "epoch": 0.004437869822485207, "grad_norm": 21.5, "learning_rate": 1.6765285996055227e-06, "loss": 1.9712, "step": 18 }, { "epoch": 0.004930966469428008, "grad_norm": 21.375, "learning_rate": 1.8737672583826429e-06, "loss": 1.9812, "step": 20 }, { "epoch": 0.005424063116370809, "grad_norm": 20.5, "learning_rate": 2.0710059171597635e-06, "loss": 1.9411, "step": 22 }, { "epoch": 0.005917159763313609, "grad_norm": 19.75, "learning_rate": 2.268244575936884e-06, "loss": 1.9677, "step": 24 }, { "epoch": 0.00641025641025641, "grad_norm": 18.875, "learning_rate": 2.465483234714004e-06, "loss": 1.9734, "step": 26 }, { "epoch": 0.006903353057199211, "grad_norm": 18.25, "learning_rate": 2.6627218934911246e-06, "loss": 1.9409, "step": 28 }, { "epoch": 0.0073964497041420114, "grad_norm": 17.625, "learning_rate": 2.859960552268245e-06, "loss": 1.9655, "step": 30 }, { "epoch": 0.007889546351084813, "grad_norm": 15.6875, "learning_rate": 3.057199211045365e-06, "loss": 1.8875, "step": 32 }, { "epoch": 0.008382642998027613, "grad_norm": 15.3125, "learning_rate": 3.2544378698224853e-06, "loss": 1.8834, "step": 34 }, { "epoch": 0.008875739644970414, "grad_norm": 15.5625, "learning_rate": 3.451676528599606e-06, "loss": 1.8718, "step": 36 }, { "epoch": 0.009368836291913214, "grad_norm": 15.8125, "learning_rate": 3.648915187376726e-06, "loss": 1.8831, "step": 38 }, { "epoch": 0.009861932938856016, "grad_norm": 15.0, "learning_rate": 3.846153846153847e-06, "loss": 1.8631, "step": 40 }, { "epoch": 0.010355029585798817, "grad_norm": 14.6875, "learning_rate": 4.043392504930966e-06, "loss": 1.8634, "step": 42 }, { "epoch": 0.010848126232741617, "grad_norm": 13.3125, "learning_rate": 4.2406311637080875e-06, "loss": 1.8314, "step": 44 }, { "epoch": 0.011341222879684419, "grad_norm": 12.0625, "learning_rate": 4.437869822485207e-06, "loss": 1.8107, "step": 46 }, { "epoch": 0.011834319526627219, "grad_norm": 11.4375, "learning_rate": 4.6351084812623274e-06, "loss": 1.7942, "step": 48 }, { "epoch": 0.01232741617357002, "grad_norm": 10.4375, "learning_rate": 4.832347140039448e-06, "loss": 1.7604, "step": 50 }, { "epoch": 0.01282051282051282, "grad_norm": 9.5625, "learning_rate": 5.029585798816568e-06, "loss": 1.7598, "step": 52 }, { "epoch": 0.013313609467455622, "grad_norm": 9.0625, "learning_rate": 5.2268244575936885e-06, "loss": 1.7374, "step": 54 }, { "epoch": 0.013806706114398421, "grad_norm": 8.4375, "learning_rate": 5.424063116370809e-06, "loss": 1.7313, "step": 56 }, { "epoch": 0.014299802761341223, "grad_norm": 8.125, "learning_rate": 5.621301775147929e-06, "loss": 1.7077, "step": 58 }, { "epoch": 0.014792899408284023, "grad_norm": 7.5625, "learning_rate": 5.81854043392505e-06, "loss": 1.677, "step": 60 }, { "epoch": 0.015285996055226824, "grad_norm": 7.0, "learning_rate": 6.01577909270217e-06, "loss": 1.6691, "step": 62 }, { "epoch": 0.015779092702169626, "grad_norm": 7.03125, "learning_rate": 6.21301775147929e-06, "loss": 1.6344, "step": 64 }, { "epoch": 0.016272189349112426, "grad_norm": 6.0625, "learning_rate": 6.41025641025641e-06, "loss": 1.5967, "step": 66 }, { "epoch": 0.016765285996055226, "grad_norm": 6.1875, "learning_rate": 6.607495069033531e-06, "loss": 1.6006, "step": 68 }, { "epoch": 0.01725838264299803, "grad_norm": 6.28125, "learning_rate": 6.8047337278106515e-06, "loss": 1.5758, "step": 70 }, { "epoch": 0.01775147928994083, "grad_norm": 5.8125, "learning_rate": 7.001972386587771e-06, "loss": 1.5521, "step": 72 }, { "epoch": 0.01824457593688363, "grad_norm": 5.0625, "learning_rate": 7.199211045364892e-06, "loss": 1.5081, "step": 74 }, { "epoch": 0.01873767258382643, "grad_norm": 5.125, "learning_rate": 7.396449704142013e-06, "loss": 1.481, "step": 76 }, { "epoch": 0.019230769230769232, "grad_norm": 5.1875, "learning_rate": 7.593688362919132e-06, "loss": 1.4716, "step": 78 }, { "epoch": 0.01972386587771203, "grad_norm": 4.1875, "learning_rate": 7.790927021696252e-06, "loss": 1.4429, "step": 80 }, { "epoch": 0.02021696252465483, "grad_norm": 4.125, "learning_rate": 7.988165680473373e-06, "loss": 1.425, "step": 82 }, { "epoch": 0.020710059171597635, "grad_norm": 3.921875, "learning_rate": 8.185404339250494e-06, "loss": 1.405, "step": 84 }, { "epoch": 0.021203155818540435, "grad_norm": 4.34375, "learning_rate": 8.382642998027614e-06, "loss": 1.3888, "step": 86 }, { "epoch": 0.021696252465483234, "grad_norm": 5.3125, "learning_rate": 8.579881656804733e-06, "loss": 1.3612, "step": 88 }, { "epoch": 0.022189349112426034, "grad_norm": 4.1875, "learning_rate": 8.777120315581854e-06, "loss": 1.3386, "step": 90 }, { "epoch": 0.022682445759368838, "grad_norm": 3.515625, "learning_rate": 8.974358974358976e-06, "loss": 1.3304, "step": 92 }, { "epoch": 0.023175542406311637, "grad_norm": 3.859375, "learning_rate": 9.171597633136095e-06, "loss": 1.3084, "step": 94 }, { "epoch": 0.023668639053254437, "grad_norm": 4.1875, "learning_rate": 9.368836291913216e-06, "loss": 1.2826, "step": 96 }, { "epoch": 0.024161735700197237, "grad_norm": 3.875, "learning_rate": 9.566074950690336e-06, "loss": 1.2691, "step": 98 }, { "epoch": 0.02465483234714004, "grad_norm": 4.40625, "learning_rate": 9.763313609467455e-06, "loss": 1.2355, "step": 100 }, { "epoch": 0.02514792899408284, "grad_norm": 4.5, "learning_rate": 9.960552268244577e-06, "loss": 1.2235, "step": 102 }, { "epoch": 0.02564102564102564, "grad_norm": 3.984375, "learning_rate": 1.0157790927021698e-05, "loss": 1.1906, "step": 104 }, { "epoch": 0.026134122287968443, "grad_norm": 4.96875, "learning_rate": 1.0355029585798817e-05, "loss": 1.1732, "step": 106 }, { "epoch": 0.026627218934911243, "grad_norm": 4.34375, "learning_rate": 1.0552268244575937e-05, "loss": 1.1496, "step": 108 }, { "epoch": 0.027120315581854043, "grad_norm": 4.21875, "learning_rate": 1.0749506903353056e-05, "loss": 1.1229, "step": 110 }, { "epoch": 0.027613412228796843, "grad_norm": 4.25, "learning_rate": 1.094674556213018e-05, "loss": 1.0984, "step": 112 }, { "epoch": 0.028106508875739646, "grad_norm": 4.53125, "learning_rate": 1.1143984220907299e-05, "loss": 1.0504, "step": 114 }, { "epoch": 0.028599605522682446, "grad_norm": 5.71875, "learning_rate": 1.1341222879684418e-05, "loss": 1.0159, "step": 116 }, { "epoch": 0.029092702169625246, "grad_norm": 4.6875, "learning_rate": 1.153846153846154e-05, "loss": 0.9707, "step": 118 }, { "epoch": 0.029585798816568046, "grad_norm": 4.6875, "learning_rate": 1.1735700197238659e-05, "loss": 0.9146, "step": 120 }, { "epoch": 0.03007889546351085, "grad_norm": 4.21875, "learning_rate": 1.193293885601578e-05, "loss": 0.8875, "step": 122 }, { "epoch": 0.03057199211045365, "grad_norm": 4.3125, "learning_rate": 1.21301775147929e-05, "loss": 0.8699, "step": 124 }, { "epoch": 0.03106508875739645, "grad_norm": 4.46875, "learning_rate": 1.2327416173570021e-05, "loss": 0.8266, "step": 126 }, { "epoch": 0.03155818540433925, "grad_norm": 4.625, "learning_rate": 1.252465483234714e-05, "loss": 0.7786, "step": 128 }, { "epoch": 0.03205128205128205, "grad_norm": 4.1875, "learning_rate": 1.2721893491124262e-05, "loss": 0.7451, "step": 130 }, { "epoch": 0.03254437869822485, "grad_norm": 4.1875, "learning_rate": 1.2919132149901381e-05, "loss": 0.7227, "step": 132 }, { "epoch": 0.033037475345167655, "grad_norm": 4.5625, "learning_rate": 1.3116370808678502e-05, "loss": 0.6885, "step": 134 }, { "epoch": 0.03353057199211045, "grad_norm": 3.359375, "learning_rate": 1.3313609467455624e-05, "loss": 0.6587, "step": 136 }, { "epoch": 0.034023668639053255, "grad_norm": 4.6875, "learning_rate": 1.3510848126232742e-05, "loss": 0.6341, "step": 138 }, { "epoch": 0.03451676528599606, "grad_norm": 5.03125, "learning_rate": 1.3708086785009863e-05, "loss": 0.6167, "step": 140 }, { "epoch": 0.035009861932938854, "grad_norm": 4.96875, "learning_rate": 1.3905325443786982e-05, "loss": 0.5948, "step": 142 }, { "epoch": 0.03550295857988166, "grad_norm": 3.1875, "learning_rate": 1.4102564102564104e-05, "loss": 0.5768, "step": 144 }, { "epoch": 0.03599605522682446, "grad_norm": 3.15625, "learning_rate": 1.4299802761341225e-05, "loss": 0.5624, "step": 146 }, { "epoch": 0.03648915187376726, "grad_norm": 3.921875, "learning_rate": 1.4497041420118343e-05, "loss": 0.5311, "step": 148 }, { "epoch": 0.03698224852071006, "grad_norm": 3.25, "learning_rate": 1.4694280078895464e-05, "loss": 0.5299, "step": 150 }, { "epoch": 0.03747534516765286, "grad_norm": 3.453125, "learning_rate": 1.4891518737672585e-05, "loss": 0.4947, "step": 152 }, { "epoch": 0.03796844181459566, "grad_norm": 2.96875, "learning_rate": 1.5088757396449705e-05, "loss": 0.4862, "step": 154 }, { "epoch": 0.038461538461538464, "grad_norm": 3.453125, "learning_rate": 1.5285996055226824e-05, "loss": 0.4677, "step": 156 }, { "epoch": 0.03895463510848126, "grad_norm": 3.265625, "learning_rate": 1.5483234714003947e-05, "loss": 0.4577, "step": 158 }, { "epoch": 0.03944773175542406, "grad_norm": 3.21875, "learning_rate": 1.5680473372781066e-05, "loss": 0.4456, "step": 160 }, { "epoch": 0.03994082840236687, "grad_norm": 3.03125, "learning_rate": 1.5877712031558186e-05, "loss": 0.4459, "step": 162 }, { "epoch": 0.04043392504930966, "grad_norm": 3.078125, "learning_rate": 1.6074950690335306e-05, "loss": 0.4249, "step": 164 }, { "epoch": 0.040927021696252466, "grad_norm": 3.859375, "learning_rate": 1.6272189349112425e-05, "loss": 0.4159, "step": 166 }, { "epoch": 0.04142011834319527, "grad_norm": 3.125, "learning_rate": 1.6469428007889548e-05, "loss": 0.4136, "step": 168 }, { "epoch": 0.041913214990138066, "grad_norm": 3.0625, "learning_rate": 1.6666666666666667e-05, "loss": 0.3989, "step": 170 }, { "epoch": 0.04240631163708087, "grad_norm": 3.71875, "learning_rate": 1.6863905325443787e-05, "loss": 0.391, "step": 172 }, { "epoch": 0.042899408284023666, "grad_norm": 3.78125, "learning_rate": 1.706114398422091e-05, "loss": 0.3874, "step": 174 }, { "epoch": 0.04339250493096647, "grad_norm": 3.546875, "learning_rate": 1.725838264299803e-05, "loss": 0.384, "step": 176 }, { "epoch": 0.04388560157790927, "grad_norm": 3.0625, "learning_rate": 1.745562130177515e-05, "loss": 0.3746, "step": 178 }, { "epoch": 0.04437869822485207, "grad_norm": 3.9375, "learning_rate": 1.7652859960552272e-05, "loss": 0.3623, "step": 180 }, { "epoch": 0.04487179487179487, "grad_norm": 2.453125, "learning_rate": 1.7850098619329388e-05, "loss": 0.3748, "step": 182 }, { "epoch": 0.045364891518737675, "grad_norm": 2.453125, "learning_rate": 1.804733727810651e-05, "loss": 0.3672, "step": 184 }, { "epoch": 0.04585798816568047, "grad_norm": 2.8125, "learning_rate": 1.824457593688363e-05, "loss": 0.3505, "step": 186 }, { "epoch": 0.046351084812623275, "grad_norm": 2.921875, "learning_rate": 1.844181459566075e-05, "loss": 0.3425, "step": 188 }, { "epoch": 0.04684418145956608, "grad_norm": 2.546875, "learning_rate": 1.8639053254437873e-05, "loss": 0.3356, "step": 190 }, { "epoch": 0.047337278106508875, "grad_norm": 2.71875, "learning_rate": 1.883629191321499e-05, "loss": 0.3191, "step": 192 }, { "epoch": 0.04783037475345168, "grad_norm": 3.046875, "learning_rate": 1.9033530571992112e-05, "loss": 0.3223, "step": 194 }, { "epoch": 0.048323471400394474, "grad_norm": 3.09375, "learning_rate": 1.923076923076923e-05, "loss": 0.3332, "step": 196 }, { "epoch": 0.04881656804733728, "grad_norm": 2.75, "learning_rate": 1.942800788954635e-05, "loss": 0.3058, "step": 198 }, { "epoch": 0.04930966469428008, "grad_norm": 2.703125, "learning_rate": 1.9625246548323474e-05, "loss": 0.3213, "step": 200 }, { "epoch": 0.04980276134122288, "grad_norm": 2.5, "learning_rate": 1.9822485207100593e-05, "loss": 0.3116, "step": 202 }, { "epoch": 0.05029585798816568, "grad_norm": 3.0, "learning_rate": 2.0019723865877713e-05, "loss": 0.308, "step": 204 }, { "epoch": 0.050788954635108484, "grad_norm": 2.625, "learning_rate": 2.0216962524654832e-05, "loss": 0.3035, "step": 206 }, { "epoch": 0.05128205128205128, "grad_norm": 2.546875, "learning_rate": 2.0414201183431952e-05, "loss": 0.3024, "step": 208 }, { "epoch": 0.051775147928994084, "grad_norm": 2.96875, "learning_rate": 2.0611439842209075e-05, "loss": 0.2876, "step": 210 }, { "epoch": 0.05226824457593689, "grad_norm": 2.984375, "learning_rate": 2.0808678500986194e-05, "loss": 0.2927, "step": 212 }, { "epoch": 0.05276134122287968, "grad_norm": 2.390625, "learning_rate": 2.1005917159763314e-05, "loss": 0.2899, "step": 214 }, { "epoch": 0.05325443786982249, "grad_norm": 3.203125, "learning_rate": 2.1203155818540433e-05, "loss": 0.2813, "step": 216 }, { "epoch": 0.05374753451676528, "grad_norm": 3.140625, "learning_rate": 2.1400394477317556e-05, "loss": 0.2734, "step": 218 }, { "epoch": 0.054240631163708086, "grad_norm": 2.953125, "learning_rate": 2.1597633136094676e-05, "loss": 0.2813, "step": 220 }, { "epoch": 0.05473372781065089, "grad_norm": 2.640625, "learning_rate": 2.1794871794871795e-05, "loss": 0.2807, "step": 222 }, { "epoch": 0.055226824457593686, "grad_norm": 3.109375, "learning_rate": 2.199211045364892e-05, "loss": 0.2588, "step": 224 }, { "epoch": 0.05571992110453649, "grad_norm": 3.875, "learning_rate": 2.2189349112426034e-05, "loss": 0.2633, "step": 226 }, { "epoch": 0.05621301775147929, "grad_norm": 3.875, "learning_rate": 2.2386587771203157e-05, "loss": 0.2616, "step": 228 }, { "epoch": 0.05670611439842209, "grad_norm": 2.390625, "learning_rate": 2.2583826429980277e-05, "loss": 0.2529, "step": 230 }, { "epoch": 0.05719921104536489, "grad_norm": 2.921875, "learning_rate": 2.2781065088757396e-05, "loss": 0.2525, "step": 232 }, { "epoch": 0.057692307692307696, "grad_norm": 2.390625, "learning_rate": 2.297830374753452e-05, "loss": 0.2611, "step": 234 }, { "epoch": 0.05818540433925049, "grad_norm": 2.6875, "learning_rate": 2.317554240631164e-05, "loss": 0.2349, "step": 236 }, { "epoch": 0.058678500986193295, "grad_norm": 2.46875, "learning_rate": 2.337278106508876e-05, "loss": 0.2351, "step": 238 }, { "epoch": 0.05917159763313609, "grad_norm": 2.421875, "learning_rate": 2.357001972386588e-05, "loss": 0.2436, "step": 240 }, { "epoch": 0.059664694280078895, "grad_norm": 2.609375, "learning_rate": 2.3767258382642997e-05, "loss": 0.2303, "step": 242 }, { "epoch": 0.0601577909270217, "grad_norm": 2.5625, "learning_rate": 2.396449704142012e-05, "loss": 0.2366, "step": 244 }, { "epoch": 0.060650887573964495, "grad_norm": 2.25, "learning_rate": 2.416173570019724e-05, "loss": 0.2173, "step": 246 }, { "epoch": 0.0611439842209073, "grad_norm": 2.953125, "learning_rate": 2.435897435897436e-05, "loss": 0.2331, "step": 248 }, { "epoch": 0.0616370808678501, "grad_norm": 3.546875, "learning_rate": 2.4556213017751482e-05, "loss": 0.2223, "step": 250 }, { "epoch": 0.0621301775147929, "grad_norm": 2.890625, "learning_rate": 2.47534516765286e-05, "loss": 0.2215, "step": 252 }, { "epoch": 0.0626232741617357, "grad_norm": 2.140625, "learning_rate": 2.495069033530572e-05, "loss": 0.223, "step": 254 }, { "epoch": 0.0631163708086785, "grad_norm": 2.21875, "learning_rate": 2.514792899408284e-05, "loss": 0.2176, "step": 256 }, { "epoch": 0.06360946745562131, "grad_norm": 2.59375, "learning_rate": 2.5345167652859964e-05, "loss": 0.2097, "step": 258 }, { "epoch": 0.0641025641025641, "grad_norm": 3.015625, "learning_rate": 2.5542406311637083e-05, "loss": 0.2082, "step": 260 }, { "epoch": 0.0645956607495069, "grad_norm": 3.65625, "learning_rate": 2.57396449704142e-05, "loss": 0.1992, "step": 262 }, { "epoch": 0.0650887573964497, "grad_norm": 2.546875, "learning_rate": 2.5936883629191322e-05, "loss": 0.2171, "step": 264 }, { "epoch": 0.0655818540433925, "grad_norm": 2.375, "learning_rate": 2.6134122287968442e-05, "loss": 0.2123, "step": 266 }, { "epoch": 0.06607495069033531, "grad_norm": 2.96875, "learning_rate": 2.6331360946745565e-05, "loss": 0.1963, "step": 268 }, { "epoch": 0.06656804733727811, "grad_norm": 2.71875, "learning_rate": 2.6528599605522688e-05, "loss": 0.1946, "step": 270 }, { "epoch": 0.0670611439842209, "grad_norm": 2.546875, "learning_rate": 2.67258382642998e-05, "loss": 0.2032, "step": 272 }, { "epoch": 0.0675542406311637, "grad_norm": 1.890625, "learning_rate": 2.6923076923076923e-05, "loss": 0.1975, "step": 274 }, { "epoch": 0.06804733727810651, "grad_norm": 3.328125, "learning_rate": 2.7120315581854043e-05, "loss": 0.1951, "step": 276 }, { "epoch": 0.06854043392504931, "grad_norm": 3.53125, "learning_rate": 2.7317554240631166e-05, "loss": 0.2055, "step": 278 }, { "epoch": 0.06903353057199212, "grad_norm": 3.796875, "learning_rate": 2.751479289940829e-05, "loss": 0.1879, "step": 280 }, { "epoch": 0.0695266272189349, "grad_norm": 2.359375, "learning_rate": 2.7712031558185408e-05, "loss": 0.1937, "step": 282 }, { "epoch": 0.07001972386587771, "grad_norm": 3.984375, "learning_rate": 2.7909270216962524e-05, "loss": 0.1867, "step": 284 }, { "epoch": 0.07051282051282051, "grad_norm": 2.75, "learning_rate": 2.8106508875739644e-05, "loss": 0.1838, "step": 286 }, { "epoch": 0.07100591715976332, "grad_norm": 3.078125, "learning_rate": 2.8303747534516767e-05, "loss": 0.1846, "step": 288 }, { "epoch": 0.07149901380670612, "grad_norm": 1.703125, "learning_rate": 2.850098619329389e-05, "loss": 0.1946, "step": 290 }, { "epoch": 0.07199211045364892, "grad_norm": 2.375, "learning_rate": 2.869822485207101e-05, "loss": 0.1826, "step": 292 }, { "epoch": 0.07248520710059171, "grad_norm": 2.265625, "learning_rate": 2.8895463510848125e-05, "loss": 0.1804, "step": 294 }, { "epoch": 0.07297830374753451, "grad_norm": 3.484375, "learning_rate": 2.9092702169625248e-05, "loss": 0.1924, "step": 296 }, { "epoch": 0.07347140039447732, "grad_norm": 2.390625, "learning_rate": 2.9289940828402368e-05, "loss": 0.1799, "step": 298 }, { "epoch": 0.07396449704142012, "grad_norm": 1.984375, "learning_rate": 2.948717948717949e-05, "loss": 0.1816, "step": 300 }, { "epoch": 0.07445759368836292, "grad_norm": 1.6171875, "learning_rate": 2.968441814595661e-05, "loss": 0.1845, "step": 302 }, { "epoch": 0.07495069033530571, "grad_norm": 1.4453125, "learning_rate": 2.9881656804733733e-05, "loss": 0.1782, "step": 304 }, { "epoch": 0.07544378698224852, "grad_norm": 2.0625, "learning_rate": 3.007889546351085e-05, "loss": 0.1648, "step": 306 }, { "epoch": 0.07593688362919132, "grad_norm": 2.6875, "learning_rate": 3.027613412228797e-05, "loss": 0.1746, "step": 308 }, { "epoch": 0.07642998027613412, "grad_norm": 2.640625, "learning_rate": 3.047337278106509e-05, "loss": 0.1809, "step": 310 }, { "epoch": 0.07692307692307693, "grad_norm": 2.140625, "learning_rate": 3.0670611439842215e-05, "loss": 0.1694, "step": 312 }, { "epoch": 0.07741617357001973, "grad_norm": 2.5, "learning_rate": 3.0867850098619334e-05, "loss": 0.1771, "step": 314 }, { "epoch": 0.07790927021696252, "grad_norm": 3.390625, "learning_rate": 3.106508875739645e-05, "loss": 0.1746, "step": 316 }, { "epoch": 0.07840236686390532, "grad_norm": 2.953125, "learning_rate": 3.126232741617357e-05, "loss": 0.174, "step": 318 }, { "epoch": 0.07889546351084813, "grad_norm": 3.9375, "learning_rate": 3.145956607495069e-05, "loss": 0.1634, "step": 320 }, { "epoch": 0.07938856015779093, "grad_norm": 2.5, "learning_rate": 3.165680473372781e-05, "loss": 0.1693, "step": 322 }, { "epoch": 0.07988165680473373, "grad_norm": 2.296875, "learning_rate": 3.185404339250493e-05, "loss": 0.1669, "step": 324 }, { "epoch": 0.08037475345167652, "grad_norm": 5.34375, "learning_rate": 3.205128205128206e-05, "loss": 0.1747, "step": 326 }, { "epoch": 0.08086785009861933, "grad_norm": 4.5, "learning_rate": 3.224852071005917e-05, "loss": 0.1636, "step": 328 }, { "epoch": 0.08136094674556213, "grad_norm": 2.21875, "learning_rate": 3.244575936883629e-05, "loss": 0.1737, "step": 330 }, { "epoch": 0.08185404339250493, "grad_norm": 7.28125, "learning_rate": 3.2642998027613417e-05, "loss": 0.1793, "step": 332 }, { "epoch": 0.08234714003944774, "grad_norm": 4.65625, "learning_rate": 3.2840236686390536e-05, "loss": 0.1743, "step": 334 }, { "epoch": 0.08284023668639054, "grad_norm": 4.21875, "learning_rate": 3.3037475345167656e-05, "loss": 0.1681, "step": 336 }, { "epoch": 0.08333333333333333, "grad_norm": 3.171875, "learning_rate": 3.3234714003944775e-05, "loss": 0.156, "step": 338 }, { "epoch": 0.08382642998027613, "grad_norm": 2.265625, "learning_rate": 3.3431952662721895e-05, "loss": 0.1664, "step": 340 }, { "epoch": 0.08431952662721894, "grad_norm": 2.09375, "learning_rate": 3.3629191321499014e-05, "loss": 0.1654, "step": 342 }, { "epoch": 0.08481262327416174, "grad_norm": 2.3125, "learning_rate": 3.3826429980276134e-05, "loss": 0.1582, "step": 344 }, { "epoch": 0.08530571992110454, "grad_norm": 3.859375, "learning_rate": 3.402366863905326e-05, "loss": 0.1593, "step": 346 }, { "epoch": 0.08579881656804733, "grad_norm": 3.0625, "learning_rate": 3.422090729783038e-05, "loss": 0.1671, "step": 348 }, { "epoch": 0.08629191321499013, "grad_norm": 1.9921875, "learning_rate": 3.441814595660749e-05, "loss": 0.1556, "step": 350 }, { "epoch": 0.08678500986193294, "grad_norm": 3.03125, "learning_rate": 3.461538461538462e-05, "loss": 0.1636, "step": 352 }, { "epoch": 0.08727810650887574, "grad_norm": 3.4375, "learning_rate": 3.481262327416174e-05, "loss": 0.1489, "step": 354 }, { "epoch": 0.08777120315581854, "grad_norm": 1.90625, "learning_rate": 3.500986193293886e-05, "loss": 0.1595, "step": 356 }, { "epoch": 0.08826429980276135, "grad_norm": 2.5625, "learning_rate": 3.520710059171598e-05, "loss": 0.1552, "step": 358 }, { "epoch": 0.08875739644970414, "grad_norm": 2.359375, "learning_rate": 3.54043392504931e-05, "loss": 0.1651, "step": 360 }, { "epoch": 0.08925049309664694, "grad_norm": 2.53125, "learning_rate": 3.5601577909270216e-05, "loss": 0.1632, "step": 362 }, { "epoch": 0.08974358974358974, "grad_norm": 2.296875, "learning_rate": 3.5798816568047336e-05, "loss": 0.1528, "step": 364 }, { "epoch": 0.09023668639053255, "grad_norm": 2.078125, "learning_rate": 3.599605522682446e-05, "loss": 0.15, "step": 366 }, { "epoch": 0.09072978303747535, "grad_norm": 1.8828125, "learning_rate": 3.619329388560158e-05, "loss": 0.1598, "step": 368 }, { "epoch": 0.09122287968441814, "grad_norm": 3.109375, "learning_rate": 3.63905325443787e-05, "loss": 0.1555, "step": 370 }, { "epoch": 0.09171597633136094, "grad_norm": 1.9609375, "learning_rate": 3.658777120315582e-05, "loss": 0.1466, "step": 372 }, { "epoch": 0.09220907297830375, "grad_norm": 3.25, "learning_rate": 3.678500986193294e-05, "loss": 0.1545, "step": 374 }, { "epoch": 0.09270216962524655, "grad_norm": 3.015625, "learning_rate": 3.698224852071006e-05, "loss": 0.1581, "step": 376 }, { "epoch": 0.09319526627218935, "grad_norm": 2.609375, "learning_rate": 3.717948717948718e-05, "loss": 0.1592, "step": 378 }, { "epoch": 0.09368836291913216, "grad_norm": 1.78125, "learning_rate": 3.7376725838264305e-05, "loss": 0.152, "step": 380 }, { "epoch": 0.09418145956607495, "grad_norm": 1.875, "learning_rate": 3.757396449704142e-05, "loss": 0.1502, "step": 382 }, { "epoch": 0.09467455621301775, "grad_norm": 2.140625, "learning_rate": 3.777120315581854e-05, "loss": 0.1627, "step": 384 }, { "epoch": 0.09516765285996055, "grad_norm": 1.65625, "learning_rate": 3.7968441814595664e-05, "loss": 0.1508, "step": 386 }, { "epoch": 0.09566074950690336, "grad_norm": 1.96875, "learning_rate": 3.8165680473372784e-05, "loss": 0.1508, "step": 388 }, { "epoch": 0.09615384615384616, "grad_norm": 2.390625, "learning_rate": 3.83629191321499e-05, "loss": 0.171, "step": 390 }, { "epoch": 0.09664694280078895, "grad_norm": 1.890625, "learning_rate": 3.856015779092703e-05, "loss": 0.1912, "step": 392 }, { "epoch": 0.09714003944773175, "grad_norm": 1.546875, "learning_rate": 3.875739644970414e-05, "loss": 0.1458, "step": 394 }, { "epoch": 0.09763313609467456, "grad_norm": 2.015625, "learning_rate": 3.895463510848126e-05, "loss": 0.1531, "step": 396 }, { "epoch": 0.09812623274161736, "grad_norm": 2.171875, "learning_rate": 3.915187376725839e-05, "loss": 0.2027, "step": 398 }, { "epoch": 0.09861932938856016, "grad_norm": 2.078125, "learning_rate": 3.934911242603551e-05, "loss": 0.1979, "step": 400 }, { "epoch": 0.09911242603550297, "grad_norm": 1.8203125, "learning_rate": 3.954635108481263e-05, "loss": 0.1393, "step": 402 }, { "epoch": 0.09960552268244575, "grad_norm": 2.65625, "learning_rate": 3.974358974358974e-05, "loss": 0.1335, "step": 404 }, { "epoch": 0.10009861932938856, "grad_norm": 2.578125, "learning_rate": 3.9940828402366866e-05, "loss": 0.1869, "step": 406 }, { "epoch": 0.10059171597633136, "grad_norm": 2.90625, "learning_rate": 4.0138067061143986e-05, "loss": 0.2058, "step": 408 }, { "epoch": 0.10108481262327416, "grad_norm": 2.65625, "learning_rate": 4.0335305719921105e-05, "loss": 0.133, "step": 410 }, { "epoch": 0.10157790927021697, "grad_norm": 2.09375, "learning_rate": 4.053254437869823e-05, "loss": 0.1377, "step": 412 }, { "epoch": 0.10207100591715976, "grad_norm": 2.15625, "learning_rate": 4.072978303747535e-05, "loss": 0.1909, "step": 414 }, { "epoch": 0.10256410256410256, "grad_norm": 2.25, "learning_rate": 4.0927021696252464e-05, "loss": 0.2018, "step": 416 }, { "epoch": 0.10305719921104536, "grad_norm": 3.28125, "learning_rate": 4.112426035502959e-05, "loss": 0.1427, "step": 418 }, { "epoch": 0.10355029585798817, "grad_norm": 2.578125, "learning_rate": 4.132149901380671e-05, "loss": 0.1332, "step": 420 }, { "epoch": 0.10404339250493097, "grad_norm": 2.734375, "learning_rate": 4.151873767258383e-05, "loss": 0.2182, "step": 422 }, { "epoch": 0.10453648915187377, "grad_norm": 1.640625, "learning_rate": 4.171597633136095e-05, "loss": 0.226, "step": 424 }, { "epoch": 0.10502958579881656, "grad_norm": 1.6328125, "learning_rate": 4.191321499013807e-05, "loss": 0.1437, "step": 426 }, { "epoch": 0.10552268244575937, "grad_norm": 1.390625, "learning_rate": 4.211045364891519e-05, "loss": 0.1255, "step": 428 }, { "epoch": 0.10601577909270217, "grad_norm": 1.6484375, "learning_rate": 4.230769230769231e-05, "loss": 0.2089, "step": 430 }, { "epoch": 0.10650887573964497, "grad_norm": 2.1875, "learning_rate": 4.2504930966469433e-05, "loss": 0.2151, "step": 432 }, { "epoch": 0.10700197238658778, "grad_norm": 1.4140625, "learning_rate": 4.270216962524655e-05, "loss": 0.136, "step": 434 }, { "epoch": 0.10749506903353057, "grad_norm": 2.015625, "learning_rate": 4.289940828402367e-05, "loss": 0.1381, "step": 436 }, { "epoch": 0.10798816568047337, "grad_norm": 2.8125, "learning_rate": 4.309664694280079e-05, "loss": 0.2191, "step": 438 }, { "epoch": 0.10848126232741617, "grad_norm": 2.875, "learning_rate": 4.329388560157791e-05, "loss": 0.217, "step": 440 }, { "epoch": 0.10897435897435898, "grad_norm": 2.109375, "learning_rate": 4.349112426035503e-05, "loss": 0.1527, "step": 442 }, { "epoch": 0.10946745562130178, "grad_norm": 1.4375, "learning_rate": 4.368836291913215e-05, "loss": 0.1538, "step": 444 }, { "epoch": 0.10996055226824458, "grad_norm": 2.046875, "learning_rate": 4.388560157790928e-05, "loss": 0.2244, "step": 446 }, { "epoch": 0.11045364891518737, "grad_norm": 2.453125, "learning_rate": 4.408284023668639e-05, "loss": 0.2452, "step": 448 }, { "epoch": 0.11094674556213018, "grad_norm": 1.921875, "learning_rate": 4.428007889546351e-05, "loss": 0.1577, "step": 450 }, { "epoch": 0.11143984220907298, "grad_norm": 2.21875, "learning_rate": 4.4477317554240635e-05, "loss": 0.1576, "step": 452 }, { "epoch": 0.11193293885601578, "grad_norm": 2.734375, "learning_rate": 4.4674556213017755e-05, "loss": 0.2118, "step": 454 }, { "epoch": 0.11242603550295859, "grad_norm": 2.296875, "learning_rate": 4.4871794871794874e-05, "loss": 0.2099, "step": 456 }, { "epoch": 0.11291913214990137, "grad_norm": 1.453125, "learning_rate": 4.5069033530571994e-05, "loss": 0.1398, "step": 458 }, { "epoch": 0.11341222879684418, "grad_norm": 1.6953125, "learning_rate": 4.5266272189349114e-05, "loss": 0.1405, "step": 460 }, { "epoch": 0.11390532544378698, "grad_norm": 2.28125, "learning_rate": 4.546351084812623e-05, "loss": 0.2018, "step": 462 }, { "epoch": 0.11439842209072978, "grad_norm": 2.125, "learning_rate": 4.566074950690335e-05, "loss": 0.1951, "step": 464 }, { "epoch": 0.11489151873767259, "grad_norm": 1.390625, "learning_rate": 4.585798816568048e-05, "loss": 0.1505, "step": 466 }, { "epoch": 0.11538461538461539, "grad_norm": 1.5546875, "learning_rate": 4.60552268244576e-05, "loss": 0.1527, "step": 468 }, { "epoch": 0.11587771203155818, "grad_norm": 2.203125, "learning_rate": 4.625246548323471e-05, "loss": 0.2001, "step": 470 }, { "epoch": 0.11637080867850098, "grad_norm": 3.140625, "learning_rate": 4.644970414201184e-05, "loss": 0.2114, "step": 472 }, { "epoch": 0.11686390532544379, "grad_norm": 2.9375, "learning_rate": 4.664694280078896e-05, "loss": 0.1362, "step": 474 }, { "epoch": 0.11735700197238659, "grad_norm": 3.0, "learning_rate": 4.6844181459566076e-05, "loss": 0.1709, "step": 476 }, { "epoch": 0.1178500986193294, "grad_norm": 1.640625, "learning_rate": 4.7041420118343196e-05, "loss": 0.1914, "step": 478 }, { "epoch": 0.11834319526627218, "grad_norm": 3.0, "learning_rate": 4.723865877712032e-05, "loss": 0.1906, "step": 480 }, { "epoch": 0.11883629191321499, "grad_norm": 2.46875, "learning_rate": 4.7435897435897435e-05, "loss": 0.1796, "step": 482 }, { "epoch": 0.11932938856015779, "grad_norm": 3.1875, "learning_rate": 4.7633136094674555e-05, "loss": 0.1555, "step": 484 }, { "epoch": 0.11982248520710059, "grad_norm": 3.234375, "learning_rate": 4.783037475345168e-05, "loss": 0.2049, "step": 486 }, { "epoch": 0.1203155818540434, "grad_norm": 2.046875, "learning_rate": 4.80276134122288e-05, "loss": 0.1921, "step": 488 }, { "epoch": 0.1208086785009862, "grad_norm": 1.78125, "learning_rate": 4.822485207100592e-05, "loss": 0.1553, "step": 490 }, { "epoch": 0.12130177514792899, "grad_norm": 2.140625, "learning_rate": 4.842209072978304e-05, "loss": 0.166, "step": 492 }, { "epoch": 0.12179487179487179, "grad_norm": 2.0625, "learning_rate": 4.861932938856016e-05, "loss": 0.1908, "step": 494 }, { "epoch": 0.1222879684418146, "grad_norm": 1.7734375, "learning_rate": 4.881656804733728e-05, "loss": 0.1874, "step": 496 }, { "epoch": 0.1227810650887574, "grad_norm": 1.7109375, "learning_rate": 4.90138067061144e-05, "loss": 0.1648, "step": 498 }, { "epoch": 0.1232741617357002, "grad_norm": 1.8515625, "learning_rate": 4.9211045364891524e-05, "loss": 0.16, "step": 500 }, { "epoch": 0.12376725838264299, "grad_norm": 1.828125, "learning_rate": 4.9408284023668644e-05, "loss": 0.196, "step": 502 }, { "epoch": 0.1242603550295858, "grad_norm": 2.625, "learning_rate": 4.9605522682445757e-05, "loss": 0.1931, "step": 504 }, { "epoch": 0.1247534516765286, "grad_norm": 1.734375, "learning_rate": 4.980276134122288e-05, "loss": 0.1683, "step": 506 }, { "epoch": 0.1252465483234714, "grad_norm": 1.9453125, "learning_rate": 5e-05, "loss": 0.1598, "step": 508 }, { "epoch": 0.1257396449704142, "grad_norm": 2.234375, "learning_rate": 5.019723865877712e-05, "loss": 0.1901, "step": 510 }, { "epoch": 0.126232741617357, "grad_norm": 2.703125, "learning_rate": 5.039447731755425e-05, "loss": 0.1916, "step": 512 }, { "epoch": 0.1267258382642998, "grad_norm": 2.375, "learning_rate": 5.059171597633137e-05, "loss": 0.1795, "step": 514 }, { "epoch": 0.12721893491124261, "grad_norm": 2.25, "learning_rate": 5.078895463510849e-05, "loss": 0.1779, "step": 516 }, { "epoch": 0.12771203155818542, "grad_norm": 2.703125, "learning_rate": 5.098619329388561e-05, "loss": 0.1942, "step": 518 }, { "epoch": 0.1282051282051282, "grad_norm": 3.015625, "learning_rate": 5.118343195266272e-05, "loss": 0.2064, "step": 520 }, { "epoch": 0.128698224852071, "grad_norm": 3.015625, "learning_rate": 5.138067061143984e-05, "loss": 0.1712, "step": 522 }, { "epoch": 0.1291913214990138, "grad_norm": 2.265625, "learning_rate": 5.157790927021696e-05, "loss": 0.1746, "step": 524 }, { "epoch": 0.1296844181459566, "grad_norm": 1.578125, "learning_rate": 5.1775147928994085e-05, "loss": 0.1744, "step": 526 }, { "epoch": 0.1301775147928994, "grad_norm": 2.515625, "learning_rate": 5.1972386587771204e-05, "loss": 0.1891, "step": 528 }, { "epoch": 0.1306706114398422, "grad_norm": 1.828125, "learning_rate": 5.2169625246548324e-05, "loss": 0.1584, "step": 530 }, { "epoch": 0.131163708086785, "grad_norm": 2.109375, "learning_rate": 5.236686390532545e-05, "loss": 0.1558, "step": 532 }, { "epoch": 0.13165680473372782, "grad_norm": 1.6484375, "learning_rate": 5.256410256410257e-05, "loss": 0.1771, "step": 534 }, { "epoch": 0.13214990138067062, "grad_norm": 1.8203125, "learning_rate": 5.276134122287969e-05, "loss": 0.1821, "step": 536 }, { "epoch": 0.13264299802761342, "grad_norm": 1.71875, "learning_rate": 5.295857988165681e-05, "loss": 0.167, "step": 538 }, { "epoch": 0.13313609467455623, "grad_norm": 1.8125, "learning_rate": 5.3155818540433935e-05, "loss": 0.1688, "step": 540 }, { "epoch": 0.133629191321499, "grad_norm": 1.734375, "learning_rate": 5.335305719921104e-05, "loss": 0.1876, "step": 542 }, { "epoch": 0.1341222879684418, "grad_norm": 1.671875, "learning_rate": 5.355029585798817e-05, "loss": 0.187, "step": 544 }, { "epoch": 0.1346153846153846, "grad_norm": 2.875, "learning_rate": 5.374753451676529e-05, "loss": 0.2008, "step": 546 }, { "epoch": 0.1351084812623274, "grad_norm": 3.34375, "learning_rate": 5.3944773175542406e-05, "loss": 0.1844, "step": 548 }, { "epoch": 0.13560157790927022, "grad_norm": 2.921875, "learning_rate": 5.4142011834319526e-05, "loss": 0.1954, "step": 550 }, { "epoch": 0.13609467455621302, "grad_norm": 2.6875, "learning_rate": 5.433925049309665e-05, "loss": 0.1857, "step": 552 }, { "epoch": 0.13658777120315582, "grad_norm": 2.21875, "learning_rate": 5.453648915187377e-05, "loss": 0.1744, "step": 554 }, { "epoch": 0.13708086785009863, "grad_norm": 2.40625, "learning_rate": 5.473372781065089e-05, "loss": 0.1661, "step": 556 }, { "epoch": 0.13757396449704143, "grad_norm": 3.46875, "learning_rate": 5.493096646942801e-05, "loss": 0.1865, "step": 558 }, { "epoch": 0.13806706114398423, "grad_norm": 1.7265625, "learning_rate": 5.512820512820514e-05, "loss": 0.1814, "step": 560 }, { "epoch": 0.13856015779092704, "grad_norm": 1.8046875, "learning_rate": 5.532544378698226e-05, "loss": 0.1552, "step": 562 }, { "epoch": 0.1390532544378698, "grad_norm": 1.8984375, "learning_rate": 5.552268244575937e-05, "loss": 0.1633, "step": 564 }, { "epoch": 0.13954635108481261, "grad_norm": 2.453125, "learning_rate": 5.571992110453649e-05, "loss": 0.1767, "step": 566 }, { "epoch": 0.14003944773175542, "grad_norm": 2.328125, "learning_rate": 5.591715976331361e-05, "loss": 0.184, "step": 568 }, { "epoch": 0.14053254437869822, "grad_norm": 2.328125, "learning_rate": 5.611439842209073e-05, "loss": 0.1704, "step": 570 }, { "epoch": 0.14102564102564102, "grad_norm": 1.984375, "learning_rate": 5.6311637080867854e-05, "loss": 0.1725, "step": 572 }, { "epoch": 0.14151873767258383, "grad_norm": 2.453125, "learning_rate": 5.6508875739644974e-05, "loss": 0.1971, "step": 574 }, { "epoch": 0.14201183431952663, "grad_norm": 1.375, "learning_rate": 5.670611439842209e-05, "loss": 0.1814, "step": 576 }, { "epoch": 0.14250493096646943, "grad_norm": 1.5390625, "learning_rate": 5.690335305719921e-05, "loss": 0.1756, "step": 578 }, { "epoch": 0.14299802761341224, "grad_norm": 1.5, "learning_rate": 5.710059171597634e-05, "loss": 0.18, "step": 580 }, { "epoch": 0.14349112426035504, "grad_norm": 2.203125, "learning_rate": 5.729783037475346e-05, "loss": 0.1866, "step": 582 }, { "epoch": 0.14398422090729784, "grad_norm": 1.8828125, "learning_rate": 5.749506903353058e-05, "loss": 0.1948, "step": 584 }, { "epoch": 0.14447731755424062, "grad_norm": 2.046875, "learning_rate": 5.769230769230769e-05, "loss": 0.1744, "step": 586 }, { "epoch": 0.14497041420118342, "grad_norm": 2.46875, "learning_rate": 5.788954635108481e-05, "loss": 0.1688, "step": 588 }, { "epoch": 0.14546351084812623, "grad_norm": 2.078125, "learning_rate": 5.808678500986193e-05, "loss": 0.1758, "step": 590 }, { "epoch": 0.14595660749506903, "grad_norm": 1.9765625, "learning_rate": 5.8284023668639056e-05, "loss": 0.1717, "step": 592 }, { "epoch": 0.14644970414201183, "grad_norm": 1.6953125, "learning_rate": 5.8481262327416176e-05, "loss": 0.1642, "step": 594 }, { "epoch": 0.14694280078895464, "grad_norm": 1.7734375, "learning_rate": 5.8678500986193295e-05, "loss": 0.1563, "step": 596 }, { "epoch": 0.14743589743589744, "grad_norm": 1.828125, "learning_rate": 5.8875739644970415e-05, "loss": 0.1838, "step": 598 }, { "epoch": 0.14792899408284024, "grad_norm": 1.8828125, "learning_rate": 5.907297830374754e-05, "loss": 0.1779, "step": 600 }, { "epoch": 0.14842209072978305, "grad_norm": 1.7890625, "learning_rate": 5.927021696252466e-05, "loss": 0.1821, "step": 602 }, { "epoch": 0.14891518737672585, "grad_norm": 1.8671875, "learning_rate": 5.946745562130178e-05, "loss": 0.1742, "step": 604 }, { "epoch": 0.14940828402366865, "grad_norm": 3.0, "learning_rate": 5.9664694280078906e-05, "loss": 0.1834, "step": 606 }, { "epoch": 0.14990138067061143, "grad_norm": 3.078125, "learning_rate": 5.986193293885601e-05, "loss": 0.1985, "step": 608 }, { "epoch": 0.15039447731755423, "grad_norm": 2.953125, "learning_rate": 6.005917159763313e-05, "loss": 0.168, "step": 610 }, { "epoch": 0.15088757396449703, "grad_norm": 2.234375, "learning_rate": 6.025641025641026e-05, "loss": 0.1617, "step": 612 }, { "epoch": 0.15138067061143984, "grad_norm": 2.21875, "learning_rate": 6.045364891518738e-05, "loss": 0.175, "step": 614 }, { "epoch": 0.15187376725838264, "grad_norm": 2.5, "learning_rate": 6.06508875739645e-05, "loss": 0.1662, "step": 616 }, { "epoch": 0.15236686390532544, "grad_norm": 2.859375, "learning_rate": 6.084812623274162e-05, "loss": 0.1673, "step": 618 }, { "epoch": 0.15285996055226825, "grad_norm": 1.8359375, "learning_rate": 6.104536489151874e-05, "loss": 0.1618, "step": 620 }, { "epoch": 0.15335305719921105, "grad_norm": 1.6484375, "learning_rate": 6.124260355029586e-05, "loss": 0.1645, "step": 622 }, { "epoch": 0.15384615384615385, "grad_norm": 2.171875, "learning_rate": 6.143984220907298e-05, "loss": 0.1726, "step": 624 }, { "epoch": 0.15433925049309666, "grad_norm": 2.875, "learning_rate": 6.16370808678501e-05, "loss": 0.1656, "step": 626 }, { "epoch": 0.15483234714003946, "grad_norm": 2.46875, "learning_rate": 6.183431952662722e-05, "loss": 0.1638, "step": 628 }, { "epoch": 0.15532544378698224, "grad_norm": 2.25, "learning_rate": 6.203155818540434e-05, "loss": 0.1641, "step": 630 }, { "epoch": 0.15581854043392504, "grad_norm": 2.546875, "learning_rate": 6.222879684418146e-05, "loss": 0.1713, "step": 632 }, { "epoch": 0.15631163708086784, "grad_norm": 2.984375, "learning_rate": 6.242603550295858e-05, "loss": 0.151, "step": 634 }, { "epoch": 0.15680473372781065, "grad_norm": 2.140625, "learning_rate": 6.26232741617357e-05, "loss": 0.1677, "step": 636 }, { "epoch": 0.15729783037475345, "grad_norm": 1.6875, "learning_rate": 6.282051282051282e-05, "loss": 0.1569, "step": 638 }, { "epoch": 0.15779092702169625, "grad_norm": 5.0625, "learning_rate": 6.301775147928994e-05, "loss": 0.1723, "step": 640 }, { "epoch": 0.15828402366863906, "grad_norm": 4.21875, "learning_rate": 6.321499013806707e-05, "loss": 0.1635, "step": 642 }, { "epoch": 0.15877712031558186, "grad_norm": 3.578125, "learning_rate": 6.341222879684419e-05, "loss": 0.1714, "step": 644 }, { "epoch": 0.15927021696252466, "grad_norm": 4.6875, "learning_rate": 6.360946745562131e-05, "loss": 0.1865, "step": 646 }, { "epoch": 0.15976331360946747, "grad_norm": 2.640625, "learning_rate": 6.380670611439843e-05, "loss": 0.1748, "step": 648 }, { "epoch": 0.16025641025641027, "grad_norm": 1.90625, "learning_rate": 6.400394477317555e-05, "loss": 0.16, "step": 650 }, { "epoch": 0.16074950690335305, "grad_norm": 3.484375, "learning_rate": 6.420118343195266e-05, "loss": 0.17, "step": 652 }, { "epoch": 0.16124260355029585, "grad_norm": 2.890625, "learning_rate": 6.439842209072979e-05, "loss": 0.1669, "step": 654 }, { "epoch": 0.16173570019723865, "grad_norm": 2.515625, "learning_rate": 6.459566074950691e-05, "loss": 0.1806, "step": 656 }, { "epoch": 0.16222879684418146, "grad_norm": 1.296875, "learning_rate": 6.479289940828403e-05, "loss": 0.1513, "step": 658 }, { "epoch": 0.16272189349112426, "grad_norm": 1.609375, "learning_rate": 6.499013806706115e-05, "loss": 0.1556, "step": 660 }, { "epoch": 0.16321499013806706, "grad_norm": 3.484375, "learning_rate": 6.518737672583827e-05, "loss": 0.1687, "step": 662 }, { "epoch": 0.16370808678500987, "grad_norm": 3.265625, "learning_rate": 6.538461538461539e-05, "loss": 0.1745, "step": 664 }, { "epoch": 0.16420118343195267, "grad_norm": 2.21875, "learning_rate": 6.55818540433925e-05, "loss": 0.1599, "step": 666 }, { "epoch": 0.16469428007889547, "grad_norm": 1.5390625, "learning_rate": 6.577909270216963e-05, "loss": 0.1574, "step": 668 }, { "epoch": 0.16518737672583828, "grad_norm": 1.7734375, "learning_rate": 6.597633136094676e-05, "loss": 0.1672, "step": 670 }, { "epoch": 0.16568047337278108, "grad_norm": 2.453125, "learning_rate": 6.617357001972388e-05, "loss": 0.1692, "step": 672 }, { "epoch": 0.16617357001972385, "grad_norm": 1.953125, "learning_rate": 6.637080867850098e-05, "loss": 0.1529, "step": 674 }, { "epoch": 0.16666666666666666, "grad_norm": 1.7734375, "learning_rate": 6.65680473372781e-05, "loss": 0.1656, "step": 676 }, { "epoch": 0.16715976331360946, "grad_norm": 1.4375, "learning_rate": 6.676528599605522e-05, "loss": 0.1726, "step": 678 }, { "epoch": 0.16765285996055226, "grad_norm": 1.3984375, "learning_rate": 6.696252465483234e-05, "loss": 0.1582, "step": 680 }, { "epoch": 0.16814595660749507, "grad_norm": 1.3984375, "learning_rate": 6.715976331360948e-05, "loss": 0.1487, "step": 682 }, { "epoch": 0.16863905325443787, "grad_norm": 1.9140625, "learning_rate": 6.73570019723866e-05, "loss": 0.1647, "step": 684 }, { "epoch": 0.16913214990138067, "grad_norm": 2.359375, "learning_rate": 6.755424063116371e-05, "loss": 0.168, "step": 686 }, { "epoch": 0.16962524654832348, "grad_norm": 3.171875, "learning_rate": 6.775147928994083e-05, "loss": 0.1776, "step": 688 }, { "epoch": 0.17011834319526628, "grad_norm": 2.625, "learning_rate": 6.794871794871795e-05, "loss": 0.167, "step": 690 }, { "epoch": 0.17061143984220908, "grad_norm": 3.4375, "learning_rate": 6.814595660749507e-05, "loss": 0.166, "step": 692 }, { "epoch": 0.1711045364891519, "grad_norm": 1.6953125, "learning_rate": 6.834319526627219e-05, "loss": 0.1699, "step": 694 }, { "epoch": 0.17159763313609466, "grad_norm": 5.9375, "learning_rate": 6.854043392504931e-05, "loss": 0.1815, "step": 696 }, { "epoch": 0.17209072978303747, "grad_norm": 2.65625, "learning_rate": 6.873767258382643e-05, "loss": 0.1712, "step": 698 }, { "epoch": 0.17258382642998027, "grad_norm": 2.28125, "learning_rate": 6.893491124260355e-05, "loss": 0.1548, "step": 700 }, { "epoch": 0.17307692307692307, "grad_norm": 3.140625, "learning_rate": 6.913214990138067e-05, "loss": 0.1726, "step": 702 }, { "epoch": 0.17357001972386588, "grad_norm": 4.15625, "learning_rate": 6.932938856015779e-05, "loss": 0.1736, "step": 704 }, { "epoch": 0.17406311637080868, "grad_norm": 2.015625, "learning_rate": 6.952662721893491e-05, "loss": 0.1688, "step": 706 }, { "epoch": 0.17455621301775148, "grad_norm": 3.8125, "learning_rate": 6.972386587771203e-05, "loss": 0.1568, "step": 708 }, { "epoch": 0.17504930966469429, "grad_norm": 2.90625, "learning_rate": 6.992110453648916e-05, "loss": 0.1761, "step": 710 }, { "epoch": 0.1755424063116371, "grad_norm": 2.1875, "learning_rate": 7.011834319526628e-05, "loss": 0.1637, "step": 712 }, { "epoch": 0.1760355029585799, "grad_norm": 2.84375, "learning_rate": 7.03155818540434e-05, "loss": 0.1639, "step": 714 }, { "epoch": 0.1765285996055227, "grad_norm": 3.453125, "learning_rate": 7.051282051282052e-05, "loss": 0.1599, "step": 716 }, { "epoch": 0.17702169625246547, "grad_norm": 2.6875, "learning_rate": 7.071005917159763e-05, "loss": 0.1634, "step": 718 }, { "epoch": 0.17751479289940827, "grad_norm": 4.0, "learning_rate": 7.090729783037475e-05, "loss": 0.1687, "step": 720 }, { "epoch": 0.17800788954635108, "grad_norm": 2.671875, "learning_rate": 7.110453648915188e-05, "loss": 0.1428, "step": 722 }, { "epoch": 0.17850098619329388, "grad_norm": 1.7734375, "learning_rate": 7.1301775147929e-05, "loss": 0.1493, "step": 724 }, { "epoch": 0.17899408284023668, "grad_norm": 2.703125, "learning_rate": 7.149901380670612e-05, "loss": 0.1508, "step": 726 }, { "epoch": 0.1794871794871795, "grad_norm": 2.859375, "learning_rate": 7.169625246548324e-05, "loss": 0.1567, "step": 728 }, { "epoch": 0.1799802761341223, "grad_norm": 1.703125, "learning_rate": 7.189349112426036e-05, "loss": 0.1558, "step": 730 }, { "epoch": 0.1804733727810651, "grad_norm": 3.515625, "learning_rate": 7.209072978303748e-05, "loss": 0.1595, "step": 732 }, { "epoch": 0.1809664694280079, "grad_norm": 2.75, "learning_rate": 7.22879684418146e-05, "loss": 0.1588, "step": 734 }, { "epoch": 0.1814595660749507, "grad_norm": 1.5078125, "learning_rate": 7.248520710059173e-05, "loss": 0.1501, "step": 736 }, { "epoch": 0.1819526627218935, "grad_norm": 3.734375, "learning_rate": 7.268244575936885e-05, "loss": 0.151, "step": 738 }, { "epoch": 0.18244575936883628, "grad_norm": 2.59375, "learning_rate": 7.287968441814596e-05, "loss": 0.1475, "step": 740 }, { "epoch": 0.18293885601577908, "grad_norm": 2.015625, "learning_rate": 7.307692307692307e-05, "loss": 0.163, "step": 742 }, { "epoch": 0.1834319526627219, "grad_norm": 3.015625, "learning_rate": 7.32741617357002e-05, "loss": 0.1619, "step": 744 }, { "epoch": 0.1839250493096647, "grad_norm": 2.15625, "learning_rate": 7.347140039447731e-05, "loss": 0.1452, "step": 746 }, { "epoch": 0.1844181459566075, "grad_norm": 1.453125, "learning_rate": 7.366863905325445e-05, "loss": 0.1522, "step": 748 }, { "epoch": 0.1849112426035503, "grad_norm": 2.484375, "learning_rate": 7.386587771203157e-05, "loss": 0.1681, "step": 750 }, { "epoch": 0.1854043392504931, "grad_norm": 2.71875, "learning_rate": 7.406311637080869e-05, "loss": 0.1552, "step": 752 }, { "epoch": 0.1858974358974359, "grad_norm": 1.5234375, "learning_rate": 7.42603550295858e-05, "loss": 0.1655, "step": 754 }, { "epoch": 0.1863905325443787, "grad_norm": 1.5546875, "learning_rate": 7.445759368836292e-05, "loss": 0.1508, "step": 756 }, { "epoch": 0.1868836291913215, "grad_norm": 1.7578125, "learning_rate": 7.465483234714004e-05, "loss": 0.1639, "step": 758 }, { "epoch": 0.1873767258382643, "grad_norm": 1.734375, "learning_rate": 7.485207100591716e-05, "loss": 0.1648, "step": 760 }, { "epoch": 0.1878698224852071, "grad_norm": 1.6953125, "learning_rate": 7.504930966469428e-05, "loss": 0.1603, "step": 762 }, { "epoch": 0.1883629191321499, "grad_norm": 1.4609375, "learning_rate": 7.52465483234714e-05, "loss": 0.1508, "step": 764 }, { "epoch": 0.1888560157790927, "grad_norm": 1.578125, "learning_rate": 7.544378698224852e-05, "loss": 0.1593, "step": 766 }, { "epoch": 0.1893491124260355, "grad_norm": 1.8515625, "learning_rate": 7.564102564102564e-05, "loss": 0.1725, "step": 768 }, { "epoch": 0.1898422090729783, "grad_norm": 1.75, "learning_rate": 7.583826429980276e-05, "loss": 0.1699, "step": 770 }, { "epoch": 0.1903353057199211, "grad_norm": 1.90625, "learning_rate": 7.603550295857988e-05, "loss": 0.1525, "step": 772 }, { "epoch": 0.1908284023668639, "grad_norm": 2.296875, "learning_rate": 7.6232741617357e-05, "loss": 0.1733, "step": 774 }, { "epoch": 0.1913214990138067, "grad_norm": 1.7109375, "learning_rate": 7.642998027613413e-05, "loss": 0.1543, "step": 776 }, { "epoch": 0.19181459566074952, "grad_norm": 1.8828125, "learning_rate": 7.662721893491125e-05, "loss": 0.1601, "step": 778 }, { "epoch": 0.19230769230769232, "grad_norm": 1.4375, "learning_rate": 7.682445759368837e-05, "loss": 0.155, "step": 780 }, { "epoch": 0.19280078895463512, "grad_norm": 1.3125, "learning_rate": 7.702169625246549e-05, "loss": 0.1585, "step": 782 }, { "epoch": 0.1932938856015779, "grad_norm": 1.515625, "learning_rate": 7.72189349112426e-05, "loss": 0.1551, "step": 784 }, { "epoch": 0.1937869822485207, "grad_norm": 1.5, "learning_rate": 7.741617357001972e-05, "loss": 0.1723, "step": 786 }, { "epoch": 0.1942800788954635, "grad_norm": 1.796875, "learning_rate": 7.761341222879685e-05, "loss": 0.1625, "step": 788 }, { "epoch": 0.1947731755424063, "grad_norm": 1.5859375, "learning_rate": 7.781065088757397e-05, "loss": 0.1655, "step": 790 }, { "epoch": 0.1952662721893491, "grad_norm": 1.4921875, "learning_rate": 7.800788954635109e-05, "loss": 0.1629, "step": 792 }, { "epoch": 0.1957593688362919, "grad_norm": 1.5234375, "learning_rate": 7.820512820512821e-05, "loss": 0.154, "step": 794 }, { "epoch": 0.19625246548323472, "grad_norm": 1.65625, "learning_rate": 7.840236686390533e-05, "loss": 0.1611, "step": 796 }, { "epoch": 0.19674556213017752, "grad_norm": 1.5859375, "learning_rate": 7.859960552268245e-05, "loss": 0.1661, "step": 798 }, { "epoch": 0.19723865877712032, "grad_norm": 1.5078125, "learning_rate": 7.879684418145957e-05, "loss": 0.1709, "step": 800 }, { "epoch": 0.19773175542406313, "grad_norm": 1.5078125, "learning_rate": 7.899408284023669e-05, "loss": 0.1501, "step": 802 }, { "epoch": 0.19822485207100593, "grad_norm": 1.2890625, "learning_rate": 7.919132149901382e-05, "loss": 0.1397, "step": 804 }, { "epoch": 0.1987179487179487, "grad_norm": 2.34375, "learning_rate": 7.938856015779093e-05, "loss": 0.1559, "step": 806 }, { "epoch": 0.1992110453648915, "grad_norm": 2.421875, "learning_rate": 7.958579881656805e-05, "loss": 0.1514, "step": 808 }, { "epoch": 0.1997041420118343, "grad_norm": 2.421875, "learning_rate": 7.978303747534517e-05, "loss": 0.1556, "step": 810 }, { "epoch": 0.20019723865877712, "grad_norm": 1.8671875, "learning_rate": 7.998027613412229e-05, "loss": 0.1565, "step": 812 }, { "epoch": 0.20069033530571992, "grad_norm": 1.65625, "learning_rate": 8.01775147928994e-05, "loss": 0.1601, "step": 814 }, { "epoch": 0.20118343195266272, "grad_norm": 1.8125, "learning_rate": 8.037475345167654e-05, "loss": 0.1457, "step": 816 }, { "epoch": 0.20167652859960553, "grad_norm": 1.5078125, "learning_rate": 8.057199211045366e-05, "loss": 0.1414, "step": 818 }, { "epoch": 0.20216962524654833, "grad_norm": 2.15625, "learning_rate": 8.076923076923078e-05, "loss": 0.1593, "step": 820 }, { "epoch": 0.20266272189349113, "grad_norm": 2.578125, "learning_rate": 8.09664694280079e-05, "loss": 0.1635, "step": 822 }, { "epoch": 0.20315581854043394, "grad_norm": 2.328125, "learning_rate": 8.116370808678502e-05, "loss": 0.152, "step": 824 }, { "epoch": 0.20364891518737674, "grad_norm": 1.9375, "learning_rate": 8.136094674556214e-05, "loss": 0.1458, "step": 826 }, { "epoch": 0.20414201183431951, "grad_norm": 1.78125, "learning_rate": 8.155818540433925e-05, "loss": 0.1348, "step": 828 }, { "epoch": 0.20463510848126232, "grad_norm": 1.5625, "learning_rate": 8.175542406311637e-05, "loss": 0.1542, "step": 830 }, { "epoch": 0.20512820512820512, "grad_norm": 1.7265625, "learning_rate": 8.19526627218935e-05, "loss": 0.1471, "step": 832 }, { "epoch": 0.20562130177514792, "grad_norm": 2.671875, "learning_rate": 8.214990138067061e-05, "loss": 0.1475, "step": 834 }, { "epoch": 0.20611439842209073, "grad_norm": 4.625, "learning_rate": 8.234714003944773e-05, "loss": 0.1558, "step": 836 }, { "epoch": 0.20660749506903353, "grad_norm": 3.125, "learning_rate": 8.254437869822485e-05, "loss": 0.1556, "step": 838 }, { "epoch": 0.20710059171597633, "grad_norm": 3.5625, "learning_rate": 8.274161735700197e-05, "loss": 0.157, "step": 840 }, { "epoch": 0.20759368836291914, "grad_norm": 2.3125, "learning_rate": 8.29388560157791e-05, "loss": 0.1477, "step": 842 }, { "epoch": 0.20808678500986194, "grad_norm": 1.28125, "learning_rate": 8.313609467455622e-05, "loss": 0.1421, "step": 844 }, { "epoch": 0.20857988165680474, "grad_norm": 3.515625, "learning_rate": 8.333333333333334e-05, "loss": 0.1524, "step": 846 }, { "epoch": 0.20907297830374755, "grad_norm": 4.4375, "learning_rate": 8.353057199211046e-05, "loss": 0.166, "step": 848 }, { "epoch": 0.20956607495069032, "grad_norm": 3.1875, "learning_rate": 8.372781065088757e-05, "loss": 0.1594, "step": 850 }, { "epoch": 0.21005917159763313, "grad_norm": 2.453125, "learning_rate": 8.392504930966469e-05, "loss": 0.1515, "step": 852 }, { "epoch": 0.21055226824457593, "grad_norm": 1.8203125, "learning_rate": 8.412228796844181e-05, "loss": 0.1649, "step": 854 }, { "epoch": 0.21104536489151873, "grad_norm": 1.0625, "learning_rate": 8.431952662721894e-05, "loss": 0.1576, "step": 856 }, { "epoch": 0.21153846153846154, "grad_norm": 2.890625, "learning_rate": 8.451676528599606e-05, "loss": 0.1414, "step": 858 }, { "epoch": 0.21203155818540434, "grad_norm": 3.71875, "learning_rate": 8.471400394477318e-05, "loss": 0.1465, "step": 860 }, { "epoch": 0.21252465483234714, "grad_norm": 2.296875, "learning_rate": 8.49112426035503e-05, "loss": 0.1575, "step": 862 }, { "epoch": 0.21301775147928995, "grad_norm": 3.109375, "learning_rate": 8.510848126232742e-05, "loss": 0.169, "step": 864 }, { "epoch": 0.21351084812623275, "grad_norm": 2.21875, "learning_rate": 8.530571992110454e-05, "loss": 0.1543, "step": 866 }, { "epoch": 0.21400394477317555, "grad_norm": 1.5390625, "learning_rate": 8.550295857988166e-05, "loss": 0.1435, "step": 868 }, { "epoch": 0.21449704142011836, "grad_norm": 2.078125, "learning_rate": 8.570019723865879e-05, "loss": 0.1407, "step": 870 }, { "epoch": 0.21499013806706113, "grad_norm": 2.21875, "learning_rate": 8.58974358974359e-05, "loss": 0.1517, "step": 872 }, { "epoch": 0.21548323471400394, "grad_norm": 1.0625, "learning_rate": 8.609467455621302e-05, "loss": 0.1461, "step": 874 }, { "epoch": 0.21597633136094674, "grad_norm": 1.5, "learning_rate": 8.629191321499014e-05, "loss": 0.1499, "step": 876 }, { "epoch": 0.21646942800788954, "grad_norm": 1.578125, "learning_rate": 8.648915187376726e-05, "loss": 0.1478, "step": 878 }, { "epoch": 0.21696252465483234, "grad_norm": 1.390625, "learning_rate": 8.668639053254438e-05, "loss": 0.147, "step": 880 }, { "epoch": 0.21745562130177515, "grad_norm": 1.671875, "learning_rate": 8.688362919132151e-05, "loss": 0.1499, "step": 882 }, { "epoch": 0.21794871794871795, "grad_norm": 1.265625, "learning_rate": 8.708086785009863e-05, "loss": 0.1557, "step": 884 }, { "epoch": 0.21844181459566075, "grad_norm": 2.28125, "learning_rate": 8.727810650887575e-05, "loss": 0.156, "step": 886 }, { "epoch": 0.21893491124260356, "grad_norm": 1.9921875, "learning_rate": 8.747534516765287e-05, "loss": 0.1606, "step": 888 }, { "epoch": 0.21942800788954636, "grad_norm": 1.8046875, "learning_rate": 8.767258382642999e-05, "loss": 0.1562, "step": 890 }, { "epoch": 0.21992110453648916, "grad_norm": 1.359375, "learning_rate": 8.78698224852071e-05, "loss": 0.1494, "step": 892 }, { "epoch": 0.22041420118343194, "grad_norm": 1.671875, "learning_rate": 8.806706114398423e-05, "loss": 0.1458, "step": 894 }, { "epoch": 0.22090729783037474, "grad_norm": 1.4609375, "learning_rate": 8.826429980276135e-05, "loss": 0.1437, "step": 896 }, { "epoch": 0.22140039447731755, "grad_norm": 1.2890625, "learning_rate": 8.846153846153847e-05, "loss": 0.1496, "step": 898 }, { "epoch": 0.22189349112426035, "grad_norm": 1.4140625, "learning_rate": 8.865877712031558e-05, "loss": 0.1478, "step": 900 }, { "epoch": 0.22238658777120315, "grad_norm": 1.3671875, "learning_rate": 8.88560157790927e-05, "loss": 0.1476, "step": 902 }, { "epoch": 0.22287968441814596, "grad_norm": 1.7734375, "learning_rate": 8.905325443786982e-05, "loss": 0.1506, "step": 904 }, { "epoch": 0.22337278106508876, "grad_norm": 1.2265625, "learning_rate": 8.925049309664694e-05, "loss": 0.1537, "step": 906 }, { "epoch": 0.22386587771203156, "grad_norm": 1.1484375, "learning_rate": 8.944773175542406e-05, "loss": 0.1467, "step": 908 }, { "epoch": 0.22435897435897437, "grad_norm": 1.6484375, "learning_rate": 8.96449704142012e-05, "loss": 0.1583, "step": 910 }, { "epoch": 0.22485207100591717, "grad_norm": 1.359375, "learning_rate": 8.984220907297832e-05, "loss": 0.1325, "step": 912 }, { "epoch": 0.22534516765285997, "grad_norm": 1.2734375, "learning_rate": 9.003944773175543e-05, "loss": 0.1554, "step": 914 }, { "epoch": 0.22583826429980275, "grad_norm": 1.484375, "learning_rate": 9.023668639053254e-05, "loss": 0.1487, "step": 916 }, { "epoch": 0.22633136094674555, "grad_norm": 2.109375, "learning_rate": 9.043392504930966e-05, "loss": 0.1478, "step": 918 }, { "epoch": 0.22682445759368836, "grad_norm": 1.703125, "learning_rate": 9.063116370808678e-05, "loss": 0.1496, "step": 920 }, { "epoch": 0.22731755424063116, "grad_norm": 1.375, "learning_rate": 9.082840236686391e-05, "loss": 0.1467, "step": 922 }, { "epoch": 0.22781065088757396, "grad_norm": 1.734375, "learning_rate": 9.102564102564103e-05, "loss": 0.1491, "step": 924 }, { "epoch": 0.22830374753451677, "grad_norm": 2.359375, "learning_rate": 9.122287968441815e-05, "loss": 0.1399, "step": 926 }, { "epoch": 0.22879684418145957, "grad_norm": 1.8984375, "learning_rate": 9.142011834319527e-05, "loss": 0.1434, "step": 928 }, { "epoch": 0.22928994082840237, "grad_norm": 1.40625, "learning_rate": 9.161735700197239e-05, "loss": 0.1514, "step": 930 }, { "epoch": 0.22978303747534518, "grad_norm": 1.203125, "learning_rate": 9.181459566074951e-05, "loss": 0.1448, "step": 932 }, { "epoch": 0.23027613412228798, "grad_norm": 1.2578125, "learning_rate": 9.201183431952663e-05, "loss": 0.153, "step": 934 }, { "epoch": 0.23076923076923078, "grad_norm": 1.421875, "learning_rate": 9.220907297830376e-05, "loss": 0.1439, "step": 936 }, { "epoch": 0.23126232741617356, "grad_norm": 1.4921875, "learning_rate": 9.240631163708087e-05, "loss": 0.1427, "step": 938 }, { "epoch": 0.23175542406311636, "grad_norm": 1.4765625, "learning_rate": 9.260355029585799e-05, "loss": 0.1484, "step": 940 }, { "epoch": 0.23224852071005916, "grad_norm": 1.2578125, "learning_rate": 9.280078895463511e-05, "loss": 0.1396, "step": 942 }, { "epoch": 0.23274161735700197, "grad_norm": 1.6640625, "learning_rate": 9.299802761341223e-05, "loss": 0.1492, "step": 944 }, { "epoch": 0.23323471400394477, "grad_norm": 1.1015625, "learning_rate": 9.319526627218935e-05, "loss": 0.1434, "step": 946 }, { "epoch": 0.23372781065088757, "grad_norm": 1.6484375, "learning_rate": 9.339250493096647e-05, "loss": 0.134, "step": 948 }, { "epoch": 0.23422090729783038, "grad_norm": 1.421875, "learning_rate": 9.35897435897436e-05, "loss": 0.144, "step": 950 }, { "epoch": 0.23471400394477318, "grad_norm": 1.3046875, "learning_rate": 9.378698224852072e-05, "loss": 0.1412, "step": 952 }, { "epoch": 0.23520710059171598, "grad_norm": 1.3828125, "learning_rate": 9.398422090729784e-05, "loss": 0.1347, "step": 954 }, { "epoch": 0.2357001972386588, "grad_norm": 1.7734375, "learning_rate": 9.418145956607496e-05, "loss": 0.1434, "step": 956 }, { "epoch": 0.2361932938856016, "grad_norm": 1.2734375, "learning_rate": 9.437869822485208e-05, "loss": 0.1425, "step": 958 }, { "epoch": 0.23668639053254437, "grad_norm": 1.28125, "learning_rate": 9.457593688362918e-05, "loss": 0.1494, "step": 960 }, { "epoch": 0.23717948717948717, "grad_norm": 1.6484375, "learning_rate": 9.477317554240632e-05, "loss": 0.1423, "step": 962 }, { "epoch": 0.23767258382642997, "grad_norm": 1.578125, "learning_rate": 9.497041420118344e-05, "loss": 0.1441, "step": 964 }, { "epoch": 0.23816568047337278, "grad_norm": 1.2890625, "learning_rate": 9.516765285996056e-05, "loss": 0.1389, "step": 966 }, { "epoch": 0.23865877712031558, "grad_norm": 1.2421875, "learning_rate": 9.536489151873768e-05, "loss": 0.1483, "step": 968 }, { "epoch": 0.23915187376725838, "grad_norm": 1.125, "learning_rate": 9.55621301775148e-05, "loss": 0.1383, "step": 970 }, { "epoch": 0.23964497041420119, "grad_norm": 1.1484375, "learning_rate": 9.575936883629191e-05, "loss": 0.14, "step": 972 }, { "epoch": 0.240138067061144, "grad_norm": 1.5078125, "learning_rate": 9.595660749506903e-05, "loss": 0.1458, "step": 974 }, { "epoch": 0.2406311637080868, "grad_norm": 1.1640625, "learning_rate": 9.615384615384617e-05, "loss": 0.1371, "step": 976 }, { "epoch": 0.2411242603550296, "grad_norm": 1.1484375, "learning_rate": 9.635108481262329e-05, "loss": 0.1391, "step": 978 }, { "epoch": 0.2416173570019724, "grad_norm": 1.34375, "learning_rate": 9.65483234714004e-05, "loss": 0.1414, "step": 980 }, { "epoch": 0.24211045364891517, "grad_norm": 1.4453125, "learning_rate": 9.674556213017751e-05, "loss": 0.1407, "step": 982 }, { "epoch": 0.24260355029585798, "grad_norm": 1.5078125, "learning_rate": 9.694280078895463e-05, "loss": 0.1461, "step": 984 }, { "epoch": 0.24309664694280078, "grad_norm": 1.3359375, "learning_rate": 9.714003944773175e-05, "loss": 0.1351, "step": 986 }, { "epoch": 0.24358974358974358, "grad_norm": 1.2109375, "learning_rate": 9.733727810650888e-05, "loss": 0.1358, "step": 988 }, { "epoch": 0.2440828402366864, "grad_norm": 1.0859375, "learning_rate": 9.7534516765286e-05, "loss": 0.1483, "step": 990 }, { "epoch": 0.2445759368836292, "grad_norm": 1.171875, "learning_rate": 9.773175542406312e-05, "loss": 0.1372, "step": 992 }, { "epoch": 0.245069033530572, "grad_norm": 1.625, "learning_rate": 9.792899408284024e-05, "loss": 0.1424, "step": 994 }, { "epoch": 0.2455621301775148, "grad_norm": 1.6640625, "learning_rate": 9.812623274161736e-05, "loss": 0.1356, "step": 996 }, { "epoch": 0.2460552268244576, "grad_norm": 1.0703125, "learning_rate": 9.832347140039448e-05, "loss": 0.1405, "step": 998 }, { "epoch": 0.2465483234714004, "grad_norm": 1.703125, "learning_rate": 9.85207100591716e-05, "loss": 0.1285, "step": 1000 }, { "epoch": 0.2470414201183432, "grad_norm": 1.390625, "learning_rate": 9.871794871794872e-05, "loss": 0.1395, "step": 1002 }, { "epoch": 0.24753451676528598, "grad_norm": 1.6484375, "learning_rate": 9.891518737672584e-05, "loss": 0.1501, "step": 1004 }, { "epoch": 0.2480276134122288, "grad_norm": 1.765625, "learning_rate": 9.911242603550296e-05, "loss": 0.133, "step": 1006 }, { "epoch": 0.2485207100591716, "grad_norm": 1.90625, "learning_rate": 9.930966469428008e-05, "loss": 0.1452, "step": 1008 }, { "epoch": 0.2490138067061144, "grad_norm": 1.2265625, "learning_rate": 9.95069033530572e-05, "loss": 0.1403, "step": 1010 }, { "epoch": 0.2495069033530572, "grad_norm": 1.5390625, "learning_rate": 9.970414201183432e-05, "loss": 0.1481, "step": 1012 }, { "epoch": 0.25, "grad_norm": 2.1875, "learning_rate": 9.990138067061144e-05, "loss": 0.1352, "step": 1014 }, { "epoch": 0.2504930966469428, "grad_norm": 1.2734375, "learning_rate": 0.00010009861932938856, "loss": 0.1382, "step": 1016 }, { "epoch": 0.2509861932938856, "grad_norm": 1.0703125, "learning_rate": 0.00010029585798816568, "loss": 0.1354, "step": 1018 }, { "epoch": 0.2514792899408284, "grad_norm": 1.1328125, "learning_rate": 0.0001004930966469428, "loss": 0.1311, "step": 1020 }, { "epoch": 0.2519723865877712, "grad_norm": 1.1875, "learning_rate": 0.00010069033530571992, "loss": 0.1415, "step": 1022 }, { "epoch": 0.252465483234714, "grad_norm": 2.3125, "learning_rate": 0.00010088757396449704, "loss": 0.135, "step": 1024 }, { "epoch": 0.2529585798816568, "grad_norm": 1.59375, "learning_rate": 0.00010108481262327416, "loss": 0.1361, "step": 1026 }, { "epoch": 0.2534516765285996, "grad_norm": 1.75, "learning_rate": 0.00010128205128205129, "loss": 0.1237, "step": 1028 }, { "epoch": 0.2539447731755424, "grad_norm": 1.5390625, "learning_rate": 0.00010147928994082841, "loss": 0.138, "step": 1030 }, { "epoch": 0.25443786982248523, "grad_norm": 1.1953125, "learning_rate": 0.00010167652859960553, "loss": 0.134, "step": 1032 }, { "epoch": 0.25493096646942803, "grad_norm": 1.296875, "learning_rate": 0.00010187376725838265, "loss": 0.1373, "step": 1034 }, { "epoch": 0.25542406311637084, "grad_norm": 1.7265625, "learning_rate": 0.00010207100591715977, "loss": 0.1427, "step": 1036 }, { "epoch": 0.2559171597633136, "grad_norm": 1.015625, "learning_rate": 0.00010226824457593689, "loss": 0.1279, "step": 1038 }, { "epoch": 0.2564102564102564, "grad_norm": 1.0546875, "learning_rate": 0.000102465483234714, "loss": 0.1439, "step": 1040 }, { "epoch": 0.2569033530571992, "grad_norm": 1.4375, "learning_rate": 0.00010266272189349113, "loss": 0.1433, "step": 1042 }, { "epoch": 0.257396449704142, "grad_norm": 1.40625, "learning_rate": 0.00010285996055226826, "loss": 0.1435, "step": 1044 }, { "epoch": 0.2578895463510848, "grad_norm": 1.4765625, "learning_rate": 0.00010305719921104538, "loss": 0.1434, "step": 1046 }, { "epoch": 0.2583826429980276, "grad_norm": 1.5546875, "learning_rate": 0.0001032544378698225, "loss": 0.1415, "step": 1048 }, { "epoch": 0.2588757396449704, "grad_norm": 1.546875, "learning_rate": 0.00010345167652859962, "loss": 0.1287, "step": 1050 }, { "epoch": 0.2593688362919132, "grad_norm": 1.3671875, "learning_rate": 0.00010364891518737674, "loss": 0.1401, "step": 1052 }, { "epoch": 0.259861932938856, "grad_norm": 1.546875, "learning_rate": 0.00010384615384615386, "loss": 0.1298, "step": 1054 }, { "epoch": 0.2603550295857988, "grad_norm": 1.265625, "learning_rate": 0.00010404339250493098, "loss": 0.1391, "step": 1056 }, { "epoch": 0.2608481262327416, "grad_norm": 1.0625, "learning_rate": 0.00010424063116370811, "loss": 0.1378, "step": 1058 }, { "epoch": 0.2613412228796844, "grad_norm": 1.078125, "learning_rate": 0.0001044378698224852, "loss": 0.1277, "step": 1060 }, { "epoch": 0.2618343195266272, "grad_norm": 1.484375, "learning_rate": 0.00010463510848126232, "loss": 0.1371, "step": 1062 }, { "epoch": 0.26232741617357, "grad_norm": 1.7265625, "learning_rate": 0.00010483234714003944, "loss": 0.138, "step": 1064 }, { "epoch": 0.26282051282051283, "grad_norm": 1.5625, "learning_rate": 0.00010502958579881656, "loss": 0.134, "step": 1066 }, { "epoch": 0.26331360946745563, "grad_norm": 1.4140625, "learning_rate": 0.00010522682445759369, "loss": 0.1365, "step": 1068 }, { "epoch": 0.26380670611439844, "grad_norm": 0.9453125, "learning_rate": 0.00010542406311637081, "loss": 0.1419, "step": 1070 }, { "epoch": 0.26429980276134124, "grad_norm": 1.2890625, "learning_rate": 0.00010562130177514793, "loss": 0.1375, "step": 1072 }, { "epoch": 0.26479289940828404, "grad_norm": 1.3046875, "learning_rate": 0.00010581854043392505, "loss": 0.1412, "step": 1074 }, { "epoch": 0.26528599605522685, "grad_norm": 0.93359375, "learning_rate": 0.00010601577909270217, "loss": 0.1358, "step": 1076 }, { "epoch": 0.26577909270216965, "grad_norm": 1.0390625, "learning_rate": 0.00010621301775147929, "loss": 0.1412, "step": 1078 }, { "epoch": 0.26627218934911245, "grad_norm": 1.4609375, "learning_rate": 0.00010641025641025641, "loss": 0.1411, "step": 1080 }, { "epoch": 0.2667652859960552, "grad_norm": 1.359375, "learning_rate": 0.00010660749506903354, "loss": 0.1345, "step": 1082 }, { "epoch": 0.267258382642998, "grad_norm": 1.1328125, "learning_rate": 0.00010680473372781066, "loss": 0.1417, "step": 1084 }, { "epoch": 0.2677514792899408, "grad_norm": 1.671875, "learning_rate": 0.00010700197238658778, "loss": 0.1371, "step": 1086 }, { "epoch": 0.2682445759368836, "grad_norm": 1.28125, "learning_rate": 0.0001071992110453649, "loss": 0.1473, "step": 1088 }, { "epoch": 0.2687376725838264, "grad_norm": 1.3671875, "learning_rate": 0.00010739644970414202, "loss": 0.131, "step": 1090 }, { "epoch": 0.2692307692307692, "grad_norm": 1.7265625, "learning_rate": 0.00010759368836291914, "loss": 0.1346, "step": 1092 }, { "epoch": 0.269723865877712, "grad_norm": 1.390625, "learning_rate": 0.00010779092702169626, "loss": 0.1338, "step": 1094 }, { "epoch": 0.2702169625246548, "grad_norm": 0.97265625, "learning_rate": 0.00010798816568047338, "loss": 0.1239, "step": 1096 }, { "epoch": 0.27071005917159763, "grad_norm": 1.3984375, "learning_rate": 0.00010818540433925051, "loss": 0.1402, "step": 1098 }, { "epoch": 0.27120315581854043, "grad_norm": 1.3359375, "learning_rate": 0.00010838264299802763, "loss": 0.1334, "step": 1100 }, { "epoch": 0.27169625246548323, "grad_norm": 1.0390625, "learning_rate": 0.00010857988165680475, "loss": 0.1224, "step": 1102 }, { "epoch": 0.27218934911242604, "grad_norm": 1.1171875, "learning_rate": 0.00010877712031558184, "loss": 0.1441, "step": 1104 }, { "epoch": 0.27268244575936884, "grad_norm": 0.953125, "learning_rate": 0.00010897435897435896, "loss": 0.1335, "step": 1106 }, { "epoch": 0.27317554240631164, "grad_norm": 0.87890625, "learning_rate": 0.0001091715976331361, "loss": 0.1385, "step": 1108 }, { "epoch": 0.27366863905325445, "grad_norm": 0.98828125, "learning_rate": 0.00010936883629191322, "loss": 0.1362, "step": 1110 }, { "epoch": 0.27416173570019725, "grad_norm": 1.0390625, "learning_rate": 0.00010956607495069034, "loss": 0.1331, "step": 1112 }, { "epoch": 0.27465483234714005, "grad_norm": 1.0078125, "learning_rate": 0.00010976331360946746, "loss": 0.1225, "step": 1114 }, { "epoch": 0.27514792899408286, "grad_norm": 1.078125, "learning_rate": 0.00010996055226824457, "loss": 0.1287, "step": 1116 }, { "epoch": 0.27564102564102566, "grad_norm": 1.4140625, "learning_rate": 0.0001101577909270217, "loss": 0.1383, "step": 1118 }, { "epoch": 0.27613412228796846, "grad_norm": 1.1484375, "learning_rate": 0.00011035502958579881, "loss": 0.1304, "step": 1120 }, { "epoch": 0.27662721893491127, "grad_norm": 1.5078125, "learning_rate": 0.00011055226824457595, "loss": 0.1328, "step": 1122 }, { "epoch": 0.27712031558185407, "grad_norm": 1.984375, "learning_rate": 0.00011074950690335307, "loss": 0.1332, "step": 1124 }, { "epoch": 0.2776134122287968, "grad_norm": 1.703125, "learning_rate": 0.00011094674556213019, "loss": 0.1343, "step": 1126 }, { "epoch": 0.2781065088757396, "grad_norm": 1.0625, "learning_rate": 0.0001111439842209073, "loss": 0.1338, "step": 1128 }, { "epoch": 0.2785996055226824, "grad_norm": 1.625, "learning_rate": 0.00011134122287968442, "loss": 0.122, "step": 1130 }, { "epoch": 0.27909270216962523, "grad_norm": 1.390625, "learning_rate": 0.00011153846153846154, "loss": 0.1247, "step": 1132 }, { "epoch": 0.27958579881656803, "grad_norm": 1.0546875, "learning_rate": 0.00011173570019723866, "loss": 0.1259, "step": 1134 }, { "epoch": 0.28007889546351084, "grad_norm": 0.984375, "learning_rate": 0.00011193293885601578, "loss": 0.1254, "step": 1136 }, { "epoch": 0.28057199211045364, "grad_norm": 1.390625, "learning_rate": 0.00011213017751479292, "loss": 0.1319, "step": 1138 }, { "epoch": 0.28106508875739644, "grad_norm": 1.078125, "learning_rate": 0.00011232741617357004, "loss": 0.1331, "step": 1140 }, { "epoch": 0.28155818540433925, "grad_norm": 1.421875, "learning_rate": 0.00011252465483234716, "loss": 0.1289, "step": 1142 }, { "epoch": 0.28205128205128205, "grad_norm": 2.203125, "learning_rate": 0.00011272189349112428, "loss": 0.1333, "step": 1144 }, { "epoch": 0.28254437869822485, "grad_norm": 1.46875, "learning_rate": 0.0001129191321499014, "loss": 0.1409, "step": 1146 }, { "epoch": 0.28303747534516766, "grad_norm": 0.97265625, "learning_rate": 0.0001131163708086785, "loss": 0.1285, "step": 1148 }, { "epoch": 0.28353057199211046, "grad_norm": 1.40625, "learning_rate": 0.00011331360946745562, "loss": 0.1388, "step": 1150 }, { "epoch": 0.28402366863905326, "grad_norm": 1.2109375, "learning_rate": 0.00011351084812623274, "loss": 0.1263, "step": 1152 }, { "epoch": 0.28451676528599606, "grad_norm": 1.265625, "learning_rate": 0.00011370808678500986, "loss": 0.1346, "step": 1154 }, { "epoch": 0.28500986193293887, "grad_norm": 1.3046875, "learning_rate": 0.00011390532544378698, "loss": 0.1284, "step": 1156 }, { "epoch": 0.28550295857988167, "grad_norm": 0.91015625, "learning_rate": 0.0001141025641025641, "loss": 0.1356, "step": 1158 }, { "epoch": 0.2859960552268245, "grad_norm": 1.3046875, "learning_rate": 0.00011429980276134122, "loss": 0.1199, "step": 1160 }, { "epoch": 0.2864891518737673, "grad_norm": 1.2578125, "learning_rate": 0.00011449704142011835, "loss": 0.1353, "step": 1162 }, { "epoch": 0.2869822485207101, "grad_norm": 0.9765625, "learning_rate": 0.00011469428007889547, "loss": 0.1319, "step": 1164 }, { "epoch": 0.2874753451676529, "grad_norm": 1.1796875, "learning_rate": 0.00011489151873767259, "loss": 0.1319, "step": 1166 }, { "epoch": 0.2879684418145957, "grad_norm": 1.203125, "learning_rate": 0.00011508875739644971, "loss": 0.1363, "step": 1168 }, { "epoch": 0.28846153846153844, "grad_norm": 1.4765625, "learning_rate": 0.00011528599605522683, "loss": 0.1276, "step": 1170 }, { "epoch": 0.28895463510848124, "grad_norm": 1.078125, "learning_rate": 0.00011548323471400395, "loss": 0.1278, "step": 1172 }, { "epoch": 0.28944773175542404, "grad_norm": 1.5625, "learning_rate": 0.00011568047337278107, "loss": 0.1265, "step": 1174 }, { "epoch": 0.28994082840236685, "grad_norm": 1.5625, "learning_rate": 0.0001158777120315582, "loss": 0.1336, "step": 1176 }, { "epoch": 0.29043392504930965, "grad_norm": 1.359375, "learning_rate": 0.00011607495069033532, "loss": 0.1248, "step": 1178 }, { "epoch": 0.29092702169625245, "grad_norm": 0.98046875, "learning_rate": 0.00011627218934911244, "loss": 0.1286, "step": 1180 }, { "epoch": 0.29142011834319526, "grad_norm": 1.2265625, "learning_rate": 0.00011646942800788956, "loss": 0.1299, "step": 1182 }, { "epoch": 0.29191321499013806, "grad_norm": 1.4921875, "learning_rate": 0.00011666666666666668, "loss": 0.1266, "step": 1184 }, { "epoch": 0.29240631163708086, "grad_norm": 1.4453125, "learning_rate": 0.0001168639053254438, "loss": 0.1235, "step": 1186 }, { "epoch": 0.29289940828402367, "grad_norm": 1.21875, "learning_rate": 0.00011706114398422092, "loss": 0.1241, "step": 1188 }, { "epoch": 0.29339250493096647, "grad_norm": 1.1796875, "learning_rate": 0.00011725838264299804, "loss": 0.1312, "step": 1190 }, { "epoch": 0.2938856015779093, "grad_norm": 1.6640625, "learning_rate": 0.00011745562130177514, "loss": 0.1195, "step": 1192 }, { "epoch": 0.2943786982248521, "grad_norm": 1.5, "learning_rate": 0.00011765285996055226, "loss": 0.1281, "step": 1194 }, { "epoch": 0.2948717948717949, "grad_norm": 0.984375, "learning_rate": 0.00011785009861932938, "loss": 0.1321, "step": 1196 }, { "epoch": 0.2953648915187377, "grad_norm": 1.453125, "learning_rate": 0.0001180473372781065, "loss": 0.1341, "step": 1198 }, { "epoch": 0.2958579881656805, "grad_norm": 1.9296875, "learning_rate": 0.00011824457593688362, "loss": 0.1284, "step": 1200 }, { "epoch": 0.2963510848126233, "grad_norm": 2.0, "learning_rate": 0.00011844181459566075, "loss": 0.1255, "step": 1202 }, { "epoch": 0.2968441814595661, "grad_norm": 0.9765625, "learning_rate": 0.00011863905325443787, "loss": 0.1295, "step": 1204 }, { "epoch": 0.2973372781065089, "grad_norm": 2.3125, "learning_rate": 0.000118836291913215, "loss": 0.1218, "step": 1206 }, { "epoch": 0.2978303747534517, "grad_norm": 1.6171875, "learning_rate": 0.00011903353057199211, "loss": 0.1285, "step": 1208 }, { "epoch": 0.2983234714003945, "grad_norm": 1.0, "learning_rate": 0.00011923076923076923, "loss": 0.1199, "step": 1210 }, { "epoch": 0.2988165680473373, "grad_norm": 1.6640625, "learning_rate": 0.00011942800788954635, "loss": 0.1196, "step": 1212 }, { "epoch": 0.29930966469428005, "grad_norm": 1.921875, "learning_rate": 0.00011962524654832347, "loss": 0.1321, "step": 1214 }, { "epoch": 0.29980276134122286, "grad_norm": 1.71875, "learning_rate": 0.0001198224852071006, "loss": 0.1291, "step": 1216 }, { "epoch": 0.30029585798816566, "grad_norm": 1.2109375, "learning_rate": 0.00012001972386587772, "loss": 0.1141, "step": 1218 }, { "epoch": 0.30078895463510846, "grad_norm": 1.25, "learning_rate": 0.00012021696252465484, "loss": 0.1292, "step": 1220 }, { "epoch": 0.30128205128205127, "grad_norm": 1.2265625, "learning_rate": 0.00012041420118343196, "loss": 0.1319, "step": 1222 }, { "epoch": 0.30177514792899407, "grad_norm": 1.2578125, "learning_rate": 0.00012061143984220908, "loss": 0.1188, "step": 1224 }, { "epoch": 0.3022682445759369, "grad_norm": 1.1171875, "learning_rate": 0.0001208086785009862, "loss": 0.1125, "step": 1226 }, { "epoch": 0.3027613412228797, "grad_norm": 0.8125, "learning_rate": 0.00012100591715976332, "loss": 0.1272, "step": 1228 }, { "epoch": 0.3032544378698225, "grad_norm": 1.6015625, "learning_rate": 0.00012120315581854044, "loss": 0.1318, "step": 1230 }, { "epoch": 0.3037475345167653, "grad_norm": 1.5, "learning_rate": 0.00012140039447731757, "loss": 0.1342, "step": 1232 }, { "epoch": 0.3042406311637081, "grad_norm": 1.3828125, "learning_rate": 0.0001215976331360947, "loss": 0.1276, "step": 1234 }, { "epoch": 0.3047337278106509, "grad_norm": 1.703125, "learning_rate": 0.00012179487179487179, "loss": 0.1244, "step": 1236 }, { "epoch": 0.3052268244575937, "grad_norm": 1.1171875, "learning_rate": 0.0001219921104536489, "loss": 0.1304, "step": 1238 }, { "epoch": 0.3057199211045365, "grad_norm": 0.66796875, "learning_rate": 0.00012218934911242604, "loss": 0.1272, "step": 1240 }, { "epoch": 0.3062130177514793, "grad_norm": 1.1015625, "learning_rate": 0.00012238658777120315, "loss": 0.1243, "step": 1242 }, { "epoch": 0.3067061143984221, "grad_norm": 1.0625, "learning_rate": 0.00012258382642998028, "loss": 0.1207, "step": 1244 }, { "epoch": 0.3071992110453649, "grad_norm": 1.28125, "learning_rate": 0.00012278106508875738, "loss": 0.1261, "step": 1246 }, { "epoch": 0.3076923076923077, "grad_norm": 0.9765625, "learning_rate": 0.00012297830374753452, "loss": 0.1288, "step": 1248 }, { "epoch": 0.3081854043392505, "grad_norm": 0.84375, "learning_rate": 0.00012317554240631165, "loss": 0.1266, "step": 1250 }, { "epoch": 0.3086785009861933, "grad_norm": 0.98828125, "learning_rate": 0.00012337278106508876, "loss": 0.122, "step": 1252 }, { "epoch": 0.3091715976331361, "grad_norm": 1.4140625, "learning_rate": 0.0001235700197238659, "loss": 0.1294, "step": 1254 }, { "epoch": 0.3096646942800789, "grad_norm": 0.96484375, "learning_rate": 0.000123767258382643, "loss": 0.1355, "step": 1256 }, { "epoch": 0.31015779092702167, "grad_norm": 1.203125, "learning_rate": 0.00012396449704142013, "loss": 0.1309, "step": 1258 }, { "epoch": 0.3106508875739645, "grad_norm": 1.6171875, "learning_rate": 0.00012416173570019723, "loss": 0.1321, "step": 1260 }, { "epoch": 0.3111439842209073, "grad_norm": 1.6328125, "learning_rate": 0.00012435897435897437, "loss": 0.1411, "step": 1262 }, { "epoch": 0.3116370808678501, "grad_norm": 1.015625, "learning_rate": 0.0001245562130177515, "loss": 0.136, "step": 1264 }, { "epoch": 0.3121301775147929, "grad_norm": 1.0234375, "learning_rate": 0.0001247534516765286, "loss": 0.1326, "step": 1266 }, { "epoch": 0.3126232741617357, "grad_norm": 1.1796875, "learning_rate": 0.00012495069033530574, "loss": 0.1317, "step": 1268 }, { "epoch": 0.3131163708086785, "grad_norm": 1.5, "learning_rate": 0.00012514792899408285, "loss": 0.1309, "step": 1270 }, { "epoch": 0.3136094674556213, "grad_norm": 0.93359375, "learning_rate": 0.00012534516765285998, "loss": 0.1244, "step": 1272 }, { "epoch": 0.3141025641025641, "grad_norm": 0.90234375, "learning_rate": 0.00012554240631163708, "loss": 0.1306, "step": 1274 }, { "epoch": 0.3145956607495069, "grad_norm": 1.546875, "learning_rate": 0.00012573964497041422, "loss": 0.1381, "step": 1276 }, { "epoch": 0.3150887573964497, "grad_norm": 1.4609375, "learning_rate": 0.00012593688362919135, "loss": 0.1394, "step": 1278 }, { "epoch": 0.3155818540433925, "grad_norm": 1.078125, "learning_rate": 0.00012613412228796843, "loss": 0.129, "step": 1280 }, { "epoch": 0.3160749506903353, "grad_norm": 1.0859375, "learning_rate": 0.00012633136094674556, "loss": 0.1218, "step": 1282 }, { "epoch": 0.3165680473372781, "grad_norm": 1.078125, "learning_rate": 0.00012652859960552267, "loss": 0.128, "step": 1284 }, { "epoch": 0.3170611439842209, "grad_norm": 1.140625, "learning_rate": 0.0001267258382642998, "loss": 0.1297, "step": 1286 }, { "epoch": 0.3175542406311637, "grad_norm": 1.40625, "learning_rate": 0.00012692307692307693, "loss": 0.123, "step": 1288 }, { "epoch": 0.3180473372781065, "grad_norm": 1.4921875, "learning_rate": 0.00012712031558185404, "loss": 0.1238, "step": 1290 }, { "epoch": 0.3185404339250493, "grad_norm": 0.82421875, "learning_rate": 0.00012731755424063117, "loss": 0.1181, "step": 1292 }, { "epoch": 0.31903353057199213, "grad_norm": 1.7265625, "learning_rate": 0.00012751479289940828, "loss": 0.1305, "step": 1294 }, { "epoch": 0.31952662721893493, "grad_norm": 1.0859375, "learning_rate": 0.0001277120315581854, "loss": 0.1229, "step": 1296 }, { "epoch": 0.32001972386587774, "grad_norm": 1.9296875, "learning_rate": 0.00012790927021696252, "loss": 0.132, "step": 1298 }, { "epoch": 0.32051282051282054, "grad_norm": 1.1953125, "learning_rate": 0.00012810650887573965, "loss": 0.132, "step": 1300 }, { "epoch": 0.3210059171597633, "grad_norm": 0.97265625, "learning_rate": 0.00012830374753451679, "loss": 0.1248, "step": 1302 }, { "epoch": 0.3214990138067061, "grad_norm": 1.4296875, "learning_rate": 0.0001285009861932939, "loss": 0.1265, "step": 1304 }, { "epoch": 0.3219921104536489, "grad_norm": 1.5390625, "learning_rate": 0.00012869822485207102, "loss": 0.1222, "step": 1306 }, { "epoch": 0.3224852071005917, "grad_norm": 1.390625, "learning_rate": 0.00012889546351084813, "loss": 0.1249, "step": 1308 }, { "epoch": 0.3229783037475345, "grad_norm": 0.76953125, "learning_rate": 0.00012909270216962526, "loss": 0.1318, "step": 1310 }, { "epoch": 0.3234714003944773, "grad_norm": 1.0546875, "learning_rate": 0.00012928994082840237, "loss": 0.1277, "step": 1312 }, { "epoch": 0.3239644970414201, "grad_norm": 1.421875, "learning_rate": 0.0001294871794871795, "loss": 0.1192, "step": 1314 }, { "epoch": 0.3244575936883629, "grad_norm": 1.46875, "learning_rate": 0.0001296844181459566, "loss": 0.1196, "step": 1316 }, { "epoch": 0.3249506903353057, "grad_norm": 1.5078125, "learning_rate": 0.00012988165680473374, "loss": 0.1317, "step": 1318 }, { "epoch": 0.3254437869822485, "grad_norm": 1.1328125, "learning_rate": 0.00013007889546351087, "loss": 0.1258, "step": 1320 }, { "epoch": 0.3259368836291913, "grad_norm": 1.09375, "learning_rate": 0.00013027613412228798, "loss": 0.1257, "step": 1322 }, { "epoch": 0.3264299802761341, "grad_norm": 1.328125, "learning_rate": 0.00013047337278106509, "loss": 0.1321, "step": 1324 }, { "epoch": 0.3269230769230769, "grad_norm": 1.3671875, "learning_rate": 0.0001306706114398422, "loss": 0.127, "step": 1326 }, { "epoch": 0.32741617357001973, "grad_norm": 1.046875, "learning_rate": 0.00013086785009861933, "loss": 0.1225, "step": 1328 }, { "epoch": 0.32790927021696253, "grad_norm": 0.94921875, "learning_rate": 0.00013106508875739646, "loss": 0.1222, "step": 1330 }, { "epoch": 0.32840236686390534, "grad_norm": 1.34375, "learning_rate": 0.00013126232741617356, "loss": 0.1253, "step": 1332 }, { "epoch": 0.32889546351084814, "grad_norm": 1.75, "learning_rate": 0.0001314595660749507, "loss": 0.127, "step": 1334 }, { "epoch": 0.32938856015779094, "grad_norm": 1.5625, "learning_rate": 0.0001316568047337278, "loss": 0.1201, "step": 1336 }, { "epoch": 0.32988165680473375, "grad_norm": 1.15625, "learning_rate": 0.00013185404339250494, "loss": 0.1292, "step": 1338 }, { "epoch": 0.33037475345167655, "grad_norm": 1.78125, "learning_rate": 0.00013205128205128204, "loss": 0.1226, "step": 1340 }, { "epoch": 0.33086785009861935, "grad_norm": 1.671875, "learning_rate": 0.00013224852071005918, "loss": 0.1267, "step": 1342 }, { "epoch": 0.33136094674556216, "grad_norm": 1.578125, "learning_rate": 0.0001324457593688363, "loss": 0.1217, "step": 1344 }, { "epoch": 0.3318540433925049, "grad_norm": 0.91796875, "learning_rate": 0.00013264299802761341, "loss": 0.1261, "step": 1346 }, { "epoch": 0.3323471400394477, "grad_norm": 1.375, "learning_rate": 0.00013284023668639055, "loss": 0.1227, "step": 1348 }, { "epoch": 0.3328402366863905, "grad_norm": 1.75, "learning_rate": 0.00013303747534516765, "loss": 0.1206, "step": 1350 }, { "epoch": 0.3333333333333333, "grad_norm": 1.2265625, "learning_rate": 0.0001332347140039448, "loss": 0.1186, "step": 1352 }, { "epoch": 0.3338264299802761, "grad_norm": 0.91015625, "learning_rate": 0.0001334319526627219, "loss": 0.1137, "step": 1354 }, { "epoch": 0.3343195266272189, "grad_norm": 0.9921875, "learning_rate": 0.00013362919132149903, "loss": 0.1135, "step": 1356 }, { "epoch": 0.3348126232741617, "grad_norm": 1.8203125, "learning_rate": 0.00013382642998027616, "loss": 0.1256, "step": 1358 }, { "epoch": 0.33530571992110453, "grad_norm": 1.4453125, "learning_rate": 0.00013402366863905326, "loss": 0.1241, "step": 1360 }, { "epoch": 0.33579881656804733, "grad_norm": 1.0078125, "learning_rate": 0.0001342209072978304, "loss": 0.1126, "step": 1362 }, { "epoch": 0.33629191321499013, "grad_norm": 0.98828125, "learning_rate": 0.0001344181459566075, "loss": 0.128, "step": 1364 }, { "epoch": 0.33678500986193294, "grad_norm": 0.765625, "learning_rate": 0.00013461538461538464, "loss": 0.1197, "step": 1366 }, { "epoch": 0.33727810650887574, "grad_norm": 1.375, "learning_rate": 0.00013481262327416174, "loss": 0.1326, "step": 1368 }, { "epoch": 0.33777120315581854, "grad_norm": 1.09375, "learning_rate": 0.00013500986193293885, "loss": 0.1232, "step": 1370 }, { "epoch": 0.33826429980276135, "grad_norm": 1.0859375, "learning_rate": 0.00013520710059171598, "loss": 0.1286, "step": 1372 }, { "epoch": 0.33875739644970415, "grad_norm": 1.140625, "learning_rate": 0.0001354043392504931, "loss": 0.1266, "step": 1374 }, { "epoch": 0.33925049309664695, "grad_norm": 0.8359375, "learning_rate": 0.00013560157790927022, "loss": 0.1202, "step": 1376 }, { "epoch": 0.33974358974358976, "grad_norm": 0.9140625, "learning_rate": 0.00013579881656804733, "loss": 0.1291, "step": 1378 }, { "epoch": 0.34023668639053256, "grad_norm": 1.0625, "learning_rate": 0.00013599605522682446, "loss": 0.1225, "step": 1380 }, { "epoch": 0.34072978303747536, "grad_norm": 1.2734375, "learning_rate": 0.0001361932938856016, "loss": 0.1282, "step": 1382 }, { "epoch": 0.34122287968441817, "grad_norm": 1.1953125, "learning_rate": 0.0001363905325443787, "loss": 0.134, "step": 1384 }, { "epoch": 0.34171597633136097, "grad_norm": 1.21875, "learning_rate": 0.00013658777120315583, "loss": 0.1168, "step": 1386 }, { "epoch": 0.3422090729783038, "grad_norm": 0.96484375, "learning_rate": 0.00013678500986193294, "loss": 0.124, "step": 1388 }, { "epoch": 0.3427021696252465, "grad_norm": 1.203125, "learning_rate": 0.00013698224852071007, "loss": 0.1212, "step": 1390 }, { "epoch": 0.3431952662721893, "grad_norm": 0.7734375, "learning_rate": 0.00013717948717948718, "loss": 0.1229, "step": 1392 }, { "epoch": 0.34368836291913213, "grad_norm": 1.1640625, "learning_rate": 0.0001373767258382643, "loss": 0.1346, "step": 1394 }, { "epoch": 0.34418145956607493, "grad_norm": 1.546875, "learning_rate": 0.00013757396449704144, "loss": 0.1272, "step": 1396 }, { "epoch": 0.34467455621301774, "grad_norm": 1.0, "learning_rate": 0.00013777120315581855, "loss": 0.136, "step": 1398 }, { "epoch": 0.34516765285996054, "grad_norm": 1.5078125, "learning_rate": 0.00013796844181459568, "loss": 0.1259, "step": 1400 }, { "epoch": 0.34566074950690334, "grad_norm": 1.3515625, "learning_rate": 0.0001381656804733728, "loss": 0.129, "step": 1402 }, { "epoch": 0.34615384615384615, "grad_norm": 1.21875, "learning_rate": 0.00013836291913214992, "loss": 0.1246, "step": 1404 }, { "epoch": 0.34664694280078895, "grad_norm": 1.3359375, "learning_rate": 0.00013856015779092703, "loss": 0.1292, "step": 1406 }, { "epoch": 0.34714003944773175, "grad_norm": 0.765625, "learning_rate": 0.00013875739644970416, "loss": 0.1219, "step": 1408 }, { "epoch": 0.34763313609467456, "grad_norm": 0.75, "learning_rate": 0.00013895463510848127, "loss": 0.1288, "step": 1410 }, { "epoch": 0.34812623274161736, "grad_norm": 1.2734375, "learning_rate": 0.00013915187376725837, "loss": 0.111, "step": 1412 }, { "epoch": 0.34861932938856016, "grad_norm": 1.125, "learning_rate": 0.0001393491124260355, "loss": 0.124, "step": 1414 }, { "epoch": 0.34911242603550297, "grad_norm": 0.79296875, "learning_rate": 0.0001395463510848126, "loss": 0.1168, "step": 1416 }, { "epoch": 0.34960552268244577, "grad_norm": 0.9140625, "learning_rate": 0.00013974358974358974, "loss": 0.1117, "step": 1418 }, { "epoch": 0.35009861932938857, "grad_norm": 0.99609375, "learning_rate": 0.00013994082840236685, "loss": 0.118, "step": 1420 }, { "epoch": 0.3505917159763314, "grad_norm": 0.82421875, "learning_rate": 0.00014013806706114398, "loss": 0.1185, "step": 1422 }, { "epoch": 0.3510848126232742, "grad_norm": 0.98046875, "learning_rate": 0.00014033530571992112, "loss": 0.1269, "step": 1424 }, { "epoch": 0.351577909270217, "grad_norm": 0.7890625, "learning_rate": 0.00014053254437869822, "loss": 0.124, "step": 1426 }, { "epoch": 0.3520710059171598, "grad_norm": 0.8125, "learning_rate": 0.00014072978303747536, "loss": 0.1184, "step": 1428 }, { "epoch": 0.3525641025641026, "grad_norm": 0.9296875, "learning_rate": 0.00014092702169625246, "loss": 0.115, "step": 1430 }, { "epoch": 0.3530571992110454, "grad_norm": 1.21875, "learning_rate": 0.0001411242603550296, "loss": 0.125, "step": 1432 }, { "epoch": 0.35355029585798814, "grad_norm": 1.203125, "learning_rate": 0.0001413214990138067, "loss": 0.1193, "step": 1434 }, { "epoch": 0.35404339250493094, "grad_norm": 1.796875, "learning_rate": 0.00014151873767258383, "loss": 0.116, "step": 1436 }, { "epoch": 0.35453648915187375, "grad_norm": 1.2421875, "learning_rate": 0.00014171597633136097, "loss": 0.122, "step": 1438 }, { "epoch": 0.35502958579881655, "grad_norm": 1.3359375, "learning_rate": 0.00014191321499013807, "loss": 0.1187, "step": 1440 }, { "epoch": 0.35552268244575935, "grad_norm": 0.76171875, "learning_rate": 0.0001421104536489152, "loss": 0.115, "step": 1442 }, { "epoch": 0.35601577909270216, "grad_norm": 1.046875, "learning_rate": 0.0001423076923076923, "loss": 0.1205, "step": 1444 }, { "epoch": 0.35650887573964496, "grad_norm": 0.9921875, "learning_rate": 0.00014250493096646944, "loss": 0.1164, "step": 1446 }, { "epoch": 0.35700197238658776, "grad_norm": 1.3203125, "learning_rate": 0.00014270216962524655, "loss": 0.1188, "step": 1448 }, { "epoch": 0.35749506903353057, "grad_norm": 1.5234375, "learning_rate": 0.00014289940828402368, "loss": 0.1307, "step": 1450 }, { "epoch": 0.35798816568047337, "grad_norm": 1.3828125, "learning_rate": 0.00014309664694280082, "loss": 0.1101, "step": 1452 }, { "epoch": 0.3584812623274162, "grad_norm": 1.34375, "learning_rate": 0.00014329388560157792, "loss": 0.1172, "step": 1454 }, { "epoch": 0.358974358974359, "grad_norm": 0.70703125, "learning_rate": 0.00014349112426035503, "loss": 0.1116, "step": 1456 }, { "epoch": 0.3594674556213018, "grad_norm": 0.93359375, "learning_rate": 0.00014368836291913214, "loss": 0.1055, "step": 1458 }, { "epoch": 0.3599605522682446, "grad_norm": 0.87890625, "learning_rate": 0.00014388560157790927, "loss": 0.1086, "step": 1460 }, { "epoch": 0.3604536489151874, "grad_norm": 0.9453125, "learning_rate": 0.0001440828402366864, "loss": 0.1245, "step": 1462 }, { "epoch": 0.3609467455621302, "grad_norm": 1.3828125, "learning_rate": 0.0001442800788954635, "loss": 0.1289, "step": 1464 }, { "epoch": 0.361439842209073, "grad_norm": 1.71875, "learning_rate": 0.00014447731755424064, "loss": 0.1195, "step": 1466 }, { "epoch": 0.3619329388560158, "grad_norm": 1.5234375, "learning_rate": 0.00014467455621301775, "loss": 0.1061, "step": 1468 }, { "epoch": 0.3624260355029586, "grad_norm": 2.0, "learning_rate": 0.00014487179487179488, "loss": 0.122, "step": 1470 }, { "epoch": 0.3629191321499014, "grad_norm": 1.1328125, "learning_rate": 0.00014506903353057199, "loss": 0.1175, "step": 1472 }, { "epoch": 0.3634122287968442, "grad_norm": 3.640625, "learning_rate": 0.00014526627218934912, "loss": 0.1194, "step": 1474 }, { "epoch": 0.363905325443787, "grad_norm": 0.7109375, "learning_rate": 0.00014546351084812625, "loss": 0.1178, "step": 1476 }, { "epoch": 0.36439842209072976, "grad_norm": 3.515625, "learning_rate": 0.00014566074950690336, "loss": 0.1174, "step": 1478 }, { "epoch": 0.36489151873767256, "grad_norm": 1.375, "learning_rate": 0.0001458579881656805, "loss": 0.1106, "step": 1480 }, { "epoch": 0.36538461538461536, "grad_norm": 1.3671875, "learning_rate": 0.0001460552268244576, "loss": 0.1156, "step": 1482 }, { "epoch": 0.36587771203155817, "grad_norm": 1.78125, "learning_rate": 0.00014625246548323473, "loss": 0.1118, "step": 1484 }, { "epoch": 0.36637080867850097, "grad_norm": 1.1328125, "learning_rate": 0.00014644970414201184, "loss": 0.1216, "step": 1486 }, { "epoch": 0.3668639053254438, "grad_norm": 0.66015625, "learning_rate": 0.00014664694280078897, "loss": 0.1094, "step": 1488 }, { "epoch": 0.3673570019723866, "grad_norm": 0.78125, "learning_rate": 0.0001468441814595661, "loss": 0.1076, "step": 1490 }, { "epoch": 0.3678500986193294, "grad_norm": 0.875, "learning_rate": 0.0001470414201183432, "loss": 0.1038, "step": 1492 }, { "epoch": 0.3683431952662722, "grad_norm": 0.7421875, "learning_rate": 0.00014723865877712034, "loss": 0.1115, "step": 1494 }, { "epoch": 0.368836291913215, "grad_norm": 0.91015625, "learning_rate": 0.00014743589743589745, "loss": 0.1114, "step": 1496 }, { "epoch": 0.3693293885601578, "grad_norm": 0.74609375, "learning_rate": 0.00014763313609467458, "loss": 0.1076, "step": 1498 }, { "epoch": 0.3698224852071006, "grad_norm": 0.94921875, "learning_rate": 0.00014783037475345169, "loss": 0.1154, "step": 1500 }, { "epoch": 0.3703155818540434, "grad_norm": 0.83203125, "learning_rate": 0.0001480276134122288, "loss": 0.1143, "step": 1502 }, { "epoch": 0.3708086785009862, "grad_norm": 0.76953125, "learning_rate": 0.00014822485207100592, "loss": 0.1142, "step": 1504 }, { "epoch": 0.371301775147929, "grad_norm": 0.890625, "learning_rate": 0.00014842209072978303, "loss": 0.1226, "step": 1506 }, { "epoch": 0.3717948717948718, "grad_norm": 0.953125, "learning_rate": 0.00014861932938856016, "loss": 0.117, "step": 1508 }, { "epoch": 0.3722879684418146, "grad_norm": 1.2421875, "learning_rate": 0.00014881656804733727, "loss": 0.1203, "step": 1510 }, { "epoch": 0.3727810650887574, "grad_norm": 1.03125, "learning_rate": 0.0001490138067061144, "loss": 0.1216, "step": 1512 }, { "epoch": 0.3732741617357002, "grad_norm": 0.89453125, "learning_rate": 0.0001492110453648915, "loss": 0.1219, "step": 1514 }, { "epoch": 0.373767258382643, "grad_norm": 1.4921875, "learning_rate": 0.00014940828402366864, "loss": 0.1226, "step": 1516 }, { "epoch": 0.3742603550295858, "grad_norm": 1.265625, "learning_rate": 0.00014960552268244577, "loss": 0.1176, "step": 1518 }, { "epoch": 0.3747534516765286, "grad_norm": 0.75390625, "learning_rate": 0.00014980276134122288, "loss": 0.1111, "step": 1520 }, { "epoch": 0.3752465483234714, "grad_norm": 0.73046875, "learning_rate": 0.00015000000000000001, "loss": 0.1142, "step": 1522 }, { "epoch": 0.3757396449704142, "grad_norm": 0.75390625, "learning_rate": 0.00015019723865877712, "loss": 0.1113, "step": 1524 }, { "epoch": 0.376232741617357, "grad_norm": 0.6796875, "learning_rate": 0.00015039447731755425, "loss": 0.12, "step": 1526 }, { "epoch": 0.3767258382642998, "grad_norm": 1.1015625, "learning_rate": 0.00015059171597633136, "loss": 0.1117, "step": 1528 }, { "epoch": 0.3772189349112426, "grad_norm": 1.2890625, "learning_rate": 0.0001507889546351085, "loss": 0.1098, "step": 1530 }, { "epoch": 0.3777120315581854, "grad_norm": 1.140625, "learning_rate": 0.00015098619329388563, "loss": 0.1061, "step": 1532 }, { "epoch": 0.3782051282051282, "grad_norm": 1.375, "learning_rate": 0.00015118343195266273, "loss": 0.1158, "step": 1534 }, { "epoch": 0.378698224852071, "grad_norm": 1.40625, "learning_rate": 0.00015138067061143986, "loss": 0.1166, "step": 1536 }, { "epoch": 0.3791913214990138, "grad_norm": 1.0390625, "learning_rate": 0.00015157790927021697, "loss": 0.1183, "step": 1538 }, { "epoch": 0.3796844181459566, "grad_norm": 1.15625, "learning_rate": 0.0001517751479289941, "loss": 0.1123, "step": 1540 }, { "epoch": 0.3801775147928994, "grad_norm": 1.265625, "learning_rate": 0.0001519723865877712, "loss": 0.1142, "step": 1542 }, { "epoch": 0.3806706114398422, "grad_norm": 1.21875, "learning_rate": 0.00015216962524654834, "loss": 0.1165, "step": 1544 }, { "epoch": 0.381163708086785, "grad_norm": 1.1328125, "learning_rate": 0.00015236686390532545, "loss": 0.1139, "step": 1546 }, { "epoch": 0.3816568047337278, "grad_norm": 1.296875, "learning_rate": 0.00015256410256410255, "loss": 0.1147, "step": 1548 }, { "epoch": 0.3821499013806706, "grad_norm": 1.421875, "learning_rate": 0.0001527613412228797, "loss": 0.1063, "step": 1550 }, { "epoch": 0.3826429980276134, "grad_norm": 0.90234375, "learning_rate": 0.0001529585798816568, "loss": 0.1122, "step": 1552 }, { "epoch": 0.3831360946745562, "grad_norm": 0.81640625, "learning_rate": 0.00015315581854043393, "loss": 0.1069, "step": 1554 }, { "epoch": 0.38362919132149903, "grad_norm": 0.6484375, "learning_rate": 0.00015335305719921106, "loss": 0.1088, "step": 1556 }, { "epoch": 0.38412228796844183, "grad_norm": 0.98828125, "learning_rate": 0.00015355029585798817, "loss": 0.117, "step": 1558 }, { "epoch": 0.38461538461538464, "grad_norm": 1.09375, "learning_rate": 0.0001537475345167653, "loss": 0.1204, "step": 1560 }, { "epoch": 0.38510848126232744, "grad_norm": 1.0859375, "learning_rate": 0.0001539447731755424, "loss": 0.1173, "step": 1562 }, { "epoch": 0.38560157790927024, "grad_norm": 1.03125, "learning_rate": 0.00015414201183431954, "loss": 0.1141, "step": 1564 }, { "epoch": 0.386094674556213, "grad_norm": 0.75, "learning_rate": 0.00015433925049309664, "loss": 0.1052, "step": 1566 }, { "epoch": 0.3865877712031558, "grad_norm": 0.8125, "learning_rate": 0.00015453648915187378, "loss": 0.1028, "step": 1568 }, { "epoch": 0.3870808678500986, "grad_norm": 0.73828125, "learning_rate": 0.0001547337278106509, "loss": 0.1164, "step": 1570 }, { "epoch": 0.3875739644970414, "grad_norm": 0.78515625, "learning_rate": 0.00015493096646942802, "loss": 0.1104, "step": 1572 }, { "epoch": 0.3880670611439842, "grad_norm": 0.8515625, "learning_rate": 0.00015512820512820515, "loss": 0.1204, "step": 1574 }, { "epoch": 0.388560157790927, "grad_norm": 0.984375, "learning_rate": 0.00015532544378698225, "loss": 0.1249, "step": 1576 }, { "epoch": 0.3890532544378698, "grad_norm": 1.046875, "learning_rate": 0.0001555226824457594, "loss": 0.1194, "step": 1578 }, { "epoch": 0.3895463510848126, "grad_norm": 1.1484375, "learning_rate": 0.0001557199211045365, "loss": 0.111, "step": 1580 }, { "epoch": 0.3900394477317554, "grad_norm": 0.84375, "learning_rate": 0.00015591715976331363, "loss": 0.1095, "step": 1582 }, { "epoch": 0.3905325443786982, "grad_norm": 1.0390625, "learning_rate": 0.00015611439842209076, "loss": 0.1218, "step": 1584 }, { "epoch": 0.391025641025641, "grad_norm": 0.79296875, "learning_rate": 0.00015631163708086787, "loss": 0.1131, "step": 1586 }, { "epoch": 0.3915187376725838, "grad_norm": 0.8671875, "learning_rate": 0.000156508875739645, "loss": 0.1128, "step": 1588 }, { "epoch": 0.39201183431952663, "grad_norm": 1.03125, "learning_rate": 0.00015670611439842208, "loss": 0.1062, "step": 1590 }, { "epoch": 0.39250493096646943, "grad_norm": 0.76953125, "learning_rate": 0.0001569033530571992, "loss": 0.1089, "step": 1592 }, { "epoch": 0.39299802761341224, "grad_norm": 0.95703125, "learning_rate": 0.00015710059171597634, "loss": 0.1064, "step": 1594 }, { "epoch": 0.39349112426035504, "grad_norm": 1.0546875, "learning_rate": 0.00015729783037475345, "loss": 0.1149, "step": 1596 }, { "epoch": 0.39398422090729784, "grad_norm": 1.0859375, "learning_rate": 0.00015749506903353058, "loss": 0.1179, "step": 1598 }, { "epoch": 0.39447731755424065, "grad_norm": 1.140625, "learning_rate": 0.0001576923076923077, "loss": 0.1149, "step": 1600 }, { "epoch": 0.39497041420118345, "grad_norm": 0.765625, "learning_rate": 0.00015788954635108482, "loss": 0.1044, "step": 1602 }, { "epoch": 0.39546351084812625, "grad_norm": 0.83203125, "learning_rate": 0.00015808678500986193, "loss": 0.1062, "step": 1604 }, { "epoch": 0.39595660749506906, "grad_norm": 1.4140625, "learning_rate": 0.00015828402366863906, "loss": 0.1162, "step": 1606 }, { "epoch": 0.39644970414201186, "grad_norm": 1.515625, "learning_rate": 0.00015848126232741617, "loss": 0.1087, "step": 1608 }, { "epoch": 0.3969428007889546, "grad_norm": 0.76171875, "learning_rate": 0.0001586785009861933, "loss": 0.1169, "step": 1610 }, { "epoch": 0.3974358974358974, "grad_norm": 0.859375, "learning_rate": 0.00015887573964497043, "loss": 0.112, "step": 1612 }, { "epoch": 0.3979289940828402, "grad_norm": 0.71875, "learning_rate": 0.00015907297830374754, "loss": 0.0958, "step": 1614 }, { "epoch": 0.398422090729783, "grad_norm": 1.1484375, "learning_rate": 0.00015927021696252467, "loss": 0.1214, "step": 1616 }, { "epoch": 0.3989151873767258, "grad_norm": 0.78125, "learning_rate": 0.00015946745562130178, "loss": 0.1087, "step": 1618 }, { "epoch": 0.3994082840236686, "grad_norm": 0.8671875, "learning_rate": 0.0001596646942800789, "loss": 0.107, "step": 1620 }, { "epoch": 0.39990138067061143, "grad_norm": 0.80859375, "learning_rate": 0.00015986193293885602, "loss": 0.1051, "step": 1622 }, { "epoch": 0.40039447731755423, "grad_norm": 0.98046875, "learning_rate": 0.00016005917159763315, "loss": 0.1053, "step": 1624 }, { "epoch": 0.40088757396449703, "grad_norm": 1.0546875, "learning_rate": 0.00016025641025641028, "loss": 0.1106, "step": 1626 }, { "epoch": 0.40138067061143984, "grad_norm": 1.4453125, "learning_rate": 0.0001604536489151874, "loss": 0.1124, "step": 1628 }, { "epoch": 0.40187376725838264, "grad_norm": 1.4140625, "learning_rate": 0.00016065088757396452, "loss": 0.1169, "step": 1630 }, { "epoch": 0.40236686390532544, "grad_norm": 0.8359375, "learning_rate": 0.00016084812623274163, "loss": 0.1133, "step": 1632 }, { "epoch": 0.40285996055226825, "grad_norm": 0.859375, "learning_rate": 0.00016104536489151873, "loss": 0.1054, "step": 1634 }, { "epoch": 0.40335305719921105, "grad_norm": 0.8828125, "learning_rate": 0.00016124260355029587, "loss": 0.1069, "step": 1636 }, { "epoch": 0.40384615384615385, "grad_norm": 0.97265625, "learning_rate": 0.00016143984220907297, "loss": 0.1104, "step": 1638 }, { "epoch": 0.40433925049309666, "grad_norm": 1.328125, "learning_rate": 0.0001616370808678501, "loss": 0.1023, "step": 1640 }, { "epoch": 0.40483234714003946, "grad_norm": 1.5390625, "learning_rate": 0.0001618343195266272, "loss": 0.1058, "step": 1642 }, { "epoch": 0.40532544378698226, "grad_norm": 1.2734375, "learning_rate": 0.00016203155818540435, "loss": 0.1085, "step": 1644 }, { "epoch": 0.40581854043392507, "grad_norm": 0.9375, "learning_rate": 0.00016222879684418145, "loss": 0.1058, "step": 1646 }, { "epoch": 0.40631163708086787, "grad_norm": 1.53125, "learning_rate": 0.00016242603550295858, "loss": 0.1099, "step": 1648 }, { "epoch": 0.4068047337278107, "grad_norm": 1.6640625, "learning_rate": 0.00016262327416173572, "loss": 0.1087, "step": 1650 }, { "epoch": 0.4072978303747535, "grad_norm": 1.328125, "learning_rate": 0.00016282051282051282, "loss": 0.108, "step": 1652 }, { "epoch": 0.4077909270216962, "grad_norm": 0.85546875, "learning_rate": 0.00016301775147928996, "loss": 0.1091, "step": 1654 }, { "epoch": 0.40828402366863903, "grad_norm": 1.2890625, "learning_rate": 0.00016321499013806706, "loss": 0.0988, "step": 1656 }, { "epoch": 0.40877712031558183, "grad_norm": 0.953125, "learning_rate": 0.0001634122287968442, "loss": 0.1052, "step": 1658 }, { "epoch": 0.40927021696252464, "grad_norm": 1.1640625, "learning_rate": 0.0001636094674556213, "loss": 0.1074, "step": 1660 }, { "epoch": 0.40976331360946744, "grad_norm": 0.7421875, "learning_rate": 0.00016380670611439843, "loss": 0.1022, "step": 1662 }, { "epoch": 0.41025641025641024, "grad_norm": 0.77734375, "learning_rate": 0.00016400394477317557, "loss": 0.1096, "step": 1664 }, { "epoch": 0.41074950690335305, "grad_norm": 0.7265625, "learning_rate": 0.00016420118343195267, "loss": 0.103, "step": 1666 }, { "epoch": 0.41124260355029585, "grad_norm": 0.875, "learning_rate": 0.0001643984220907298, "loss": 0.0972, "step": 1668 }, { "epoch": 0.41173570019723865, "grad_norm": 0.76171875, "learning_rate": 0.0001645956607495069, "loss": 0.1001, "step": 1670 }, { "epoch": 0.41222879684418146, "grad_norm": 0.66015625, "learning_rate": 0.00016479289940828405, "loss": 0.1043, "step": 1672 }, { "epoch": 0.41272189349112426, "grad_norm": 1.1328125, "learning_rate": 0.00016499013806706115, "loss": 0.1121, "step": 1674 }, { "epoch": 0.41321499013806706, "grad_norm": 1.3046875, "learning_rate": 0.00016518737672583829, "loss": 0.1105, "step": 1676 }, { "epoch": 0.41370808678500987, "grad_norm": 0.75, "learning_rate": 0.0001653846153846154, "loss": 0.1062, "step": 1678 }, { "epoch": 0.41420118343195267, "grad_norm": 0.9609375, "learning_rate": 0.0001655818540433925, "loss": 0.1064, "step": 1680 }, { "epoch": 0.41469428007889547, "grad_norm": 1.3671875, "learning_rate": 0.00016577909270216963, "loss": 0.1155, "step": 1682 }, { "epoch": 0.4151873767258383, "grad_norm": 1.3984375, "learning_rate": 0.00016597633136094674, "loss": 0.1089, "step": 1684 }, { "epoch": 0.4156804733727811, "grad_norm": 0.796875, "learning_rate": 0.00016617357001972387, "loss": 0.1021, "step": 1686 }, { "epoch": 0.4161735700197239, "grad_norm": 0.8125, "learning_rate": 0.000166370808678501, "loss": 0.105, "step": 1688 }, { "epoch": 0.4166666666666667, "grad_norm": 0.75, "learning_rate": 0.0001665680473372781, "loss": 0.1018, "step": 1690 }, { "epoch": 0.4171597633136095, "grad_norm": 0.7890625, "learning_rate": 0.00016676528599605524, "loss": 0.1072, "step": 1692 }, { "epoch": 0.4176528599605523, "grad_norm": 0.66796875, "learning_rate": 0.00016696252465483235, "loss": 0.1011, "step": 1694 }, { "epoch": 0.4181459566074951, "grad_norm": 0.78515625, "learning_rate": 0.00016715976331360948, "loss": 0.1107, "step": 1696 }, { "epoch": 0.41863905325443784, "grad_norm": 0.76171875, "learning_rate": 0.00016735700197238659, "loss": 0.0995, "step": 1698 }, { "epoch": 0.41913214990138065, "grad_norm": 0.75390625, "learning_rate": 0.00016755424063116372, "loss": 0.0969, "step": 1700 }, { "epoch": 0.41962524654832345, "grad_norm": 0.78515625, "learning_rate": 0.00016775147928994083, "loss": 0.1057, "step": 1702 }, { "epoch": 0.42011834319526625, "grad_norm": 0.8984375, "learning_rate": 0.00016794871794871796, "loss": 0.0933, "step": 1704 }, { "epoch": 0.42061143984220906, "grad_norm": 0.90625, "learning_rate": 0.0001681459566074951, "loss": 0.1105, "step": 1706 }, { "epoch": 0.42110453648915186, "grad_norm": 0.97265625, "learning_rate": 0.0001683431952662722, "loss": 0.1093, "step": 1708 }, { "epoch": 0.42159763313609466, "grad_norm": 1.4609375, "learning_rate": 0.00016854043392504933, "loss": 0.1084, "step": 1710 }, { "epoch": 0.42209072978303747, "grad_norm": 1.3359375, "learning_rate": 0.00016873767258382644, "loss": 0.1085, "step": 1712 }, { "epoch": 0.42258382642998027, "grad_norm": 1.1875, "learning_rate": 0.00016893491124260357, "loss": 0.1029, "step": 1714 }, { "epoch": 0.4230769230769231, "grad_norm": 0.69140625, "learning_rate": 0.00016913214990138068, "loss": 0.1156, "step": 1716 }, { "epoch": 0.4235700197238659, "grad_norm": 0.66796875, "learning_rate": 0.0001693293885601578, "loss": 0.1067, "step": 1718 }, { "epoch": 0.4240631163708087, "grad_norm": 0.83203125, "learning_rate": 0.00016952662721893494, "loss": 0.1072, "step": 1720 }, { "epoch": 0.4245562130177515, "grad_norm": 0.86328125, "learning_rate": 0.00016972386587771202, "loss": 0.1069, "step": 1722 }, { "epoch": 0.4250493096646943, "grad_norm": 0.78125, "learning_rate": 0.00016992110453648915, "loss": 0.0973, "step": 1724 }, { "epoch": 0.4255424063116371, "grad_norm": 0.953125, "learning_rate": 0.00017011834319526626, "loss": 0.1111, "step": 1726 }, { "epoch": 0.4260355029585799, "grad_norm": 1.21875, "learning_rate": 0.0001703155818540434, "loss": 0.1096, "step": 1728 }, { "epoch": 0.4265285996055227, "grad_norm": 1.0546875, "learning_rate": 0.00017051282051282053, "loss": 0.108, "step": 1730 }, { "epoch": 0.4270216962524655, "grad_norm": 0.8515625, "learning_rate": 0.00017071005917159763, "loss": 0.107, "step": 1732 }, { "epoch": 0.4275147928994083, "grad_norm": 1.125, "learning_rate": 0.00017090729783037476, "loss": 0.1051, "step": 1734 }, { "epoch": 0.4280078895463511, "grad_norm": 0.875, "learning_rate": 0.00017110453648915187, "loss": 0.1009, "step": 1736 }, { "epoch": 0.4285009861932939, "grad_norm": 1.140625, "learning_rate": 0.000171301775147929, "loss": 0.1054, "step": 1738 }, { "epoch": 0.4289940828402367, "grad_norm": 0.8203125, "learning_rate": 0.0001714990138067061, "loss": 0.1056, "step": 1740 }, { "epoch": 0.42948717948717946, "grad_norm": 0.75390625, "learning_rate": 0.00017169625246548324, "loss": 0.1049, "step": 1742 }, { "epoch": 0.42998027613412226, "grad_norm": 0.88671875, "learning_rate": 0.00017189349112426038, "loss": 0.1014, "step": 1744 }, { "epoch": 0.43047337278106507, "grad_norm": 1.0625, "learning_rate": 0.00017209072978303748, "loss": 0.0971, "step": 1746 }, { "epoch": 0.43096646942800787, "grad_norm": 1.4921875, "learning_rate": 0.00017228796844181461, "loss": 0.1064, "step": 1748 }, { "epoch": 0.4314595660749507, "grad_norm": 0.9140625, "learning_rate": 0.00017248520710059172, "loss": 0.1022, "step": 1750 }, { "epoch": 0.4319526627218935, "grad_norm": 0.609375, "learning_rate": 0.00017268244575936885, "loss": 0.0964, "step": 1752 }, { "epoch": 0.4324457593688363, "grad_norm": 0.63671875, "learning_rate": 0.00017287968441814596, "loss": 0.1065, "step": 1754 }, { "epoch": 0.4329388560157791, "grad_norm": 0.6328125, "learning_rate": 0.0001730769230769231, "loss": 0.097, "step": 1756 }, { "epoch": 0.4334319526627219, "grad_norm": 0.6328125, "learning_rate": 0.00017327416173570023, "loss": 0.1049, "step": 1758 }, { "epoch": 0.4339250493096647, "grad_norm": 0.86328125, "learning_rate": 0.00017347140039447733, "loss": 0.1005, "step": 1760 }, { "epoch": 0.4344181459566075, "grad_norm": 0.6953125, "learning_rate": 0.00017366863905325447, "loss": 0.1073, "step": 1762 }, { "epoch": 0.4349112426035503, "grad_norm": 0.6953125, "learning_rate": 0.00017386587771203157, "loss": 0.1075, "step": 1764 }, { "epoch": 0.4354043392504931, "grad_norm": 0.671875, "learning_rate": 0.00017406311637080868, "loss": 0.1033, "step": 1766 }, { "epoch": 0.4358974358974359, "grad_norm": 0.86328125, "learning_rate": 0.0001742603550295858, "loss": 0.105, "step": 1768 }, { "epoch": 0.4363905325443787, "grad_norm": 0.91015625, "learning_rate": 0.00017445759368836292, "loss": 0.1109, "step": 1770 }, { "epoch": 0.4368836291913215, "grad_norm": 1.609375, "learning_rate": 0.00017465483234714005, "loss": 0.1024, "step": 1772 }, { "epoch": 0.4373767258382643, "grad_norm": 1.578125, "learning_rate": 0.00017485207100591716, "loss": 0.0935, "step": 1774 }, { "epoch": 0.4378698224852071, "grad_norm": 0.7265625, "learning_rate": 0.0001750493096646943, "loss": 0.0939, "step": 1776 }, { "epoch": 0.4383629191321499, "grad_norm": 0.76171875, "learning_rate": 0.0001752465483234714, "loss": 0.0986, "step": 1778 }, { "epoch": 0.4388560157790927, "grad_norm": 0.84765625, "learning_rate": 0.00017544378698224853, "loss": 0.1001, "step": 1780 }, { "epoch": 0.4393491124260355, "grad_norm": 1.0546875, "learning_rate": 0.00017564102564102566, "loss": 0.1015, "step": 1782 }, { "epoch": 0.43984220907297833, "grad_norm": 1.28125, "learning_rate": 0.00017583826429980277, "loss": 0.0988, "step": 1784 }, { "epoch": 0.4403353057199211, "grad_norm": 1.3828125, "learning_rate": 0.0001760355029585799, "loss": 0.1007, "step": 1786 }, { "epoch": 0.4408284023668639, "grad_norm": 0.859375, "learning_rate": 0.000176232741617357, "loss": 0.1039, "step": 1788 }, { "epoch": 0.4413214990138067, "grad_norm": 0.69921875, "learning_rate": 0.00017642998027613414, "loss": 0.0953, "step": 1790 }, { "epoch": 0.4418145956607495, "grad_norm": 0.67578125, "learning_rate": 0.00017662721893491124, "loss": 0.0984, "step": 1792 }, { "epoch": 0.4423076923076923, "grad_norm": 0.96484375, "learning_rate": 0.00017682445759368838, "loss": 0.0975, "step": 1794 }, { "epoch": 0.4428007889546351, "grad_norm": 0.65625, "learning_rate": 0.00017702169625246548, "loss": 0.0921, "step": 1796 }, { "epoch": 0.4432938856015779, "grad_norm": 0.734375, "learning_rate": 0.00017721893491124262, "loss": 0.0988, "step": 1798 }, { "epoch": 0.4437869822485207, "grad_norm": 0.76953125, "learning_rate": 0.00017741617357001975, "loss": 0.0926, "step": 1800 }, { "epoch": 0.4442800788954635, "grad_norm": 0.72265625, "learning_rate": 0.00017761341222879686, "loss": 0.1028, "step": 1802 }, { "epoch": 0.4447731755424063, "grad_norm": 1.0234375, "learning_rate": 0.000177810650887574, "loss": 0.0946, "step": 1804 }, { "epoch": 0.4452662721893491, "grad_norm": 0.86328125, "learning_rate": 0.0001780078895463511, "loss": 0.1017, "step": 1806 }, { "epoch": 0.4457593688362919, "grad_norm": 0.859375, "learning_rate": 0.00017820512820512823, "loss": 0.1024, "step": 1808 }, { "epoch": 0.4462524654832347, "grad_norm": 1.0078125, "learning_rate": 0.00017840236686390533, "loss": 0.0991, "step": 1810 }, { "epoch": 0.4467455621301775, "grad_norm": 1.1484375, "learning_rate": 0.00017859960552268244, "loss": 0.0932, "step": 1812 }, { "epoch": 0.4472386587771203, "grad_norm": 1.21875, "learning_rate": 0.00017879684418145957, "loss": 0.111, "step": 1814 }, { "epoch": 0.4477317554240631, "grad_norm": 0.62109375, "learning_rate": 0.00017899408284023668, "loss": 0.0943, "step": 1816 }, { "epoch": 0.44822485207100593, "grad_norm": 1.0859375, "learning_rate": 0.0001791913214990138, "loss": 0.0949, "step": 1818 }, { "epoch": 0.44871794871794873, "grad_norm": 1.171875, "learning_rate": 0.00017938856015779092, "loss": 0.095, "step": 1820 }, { "epoch": 0.44921104536489154, "grad_norm": 0.91796875, "learning_rate": 0.00017958579881656805, "loss": 0.1036, "step": 1822 }, { "epoch": 0.44970414201183434, "grad_norm": 0.83203125, "learning_rate": 0.00017978303747534518, "loss": 0.0985, "step": 1824 }, { "epoch": 0.45019723865877714, "grad_norm": 0.70703125, "learning_rate": 0.0001799802761341223, "loss": 0.1005, "step": 1826 }, { "epoch": 0.45069033530571995, "grad_norm": 0.828125, "learning_rate": 0.00018017751479289942, "loss": 0.1037, "step": 1828 }, { "epoch": 0.4511834319526627, "grad_norm": 0.71875, "learning_rate": 0.00018037475345167653, "loss": 0.0914, "step": 1830 }, { "epoch": 0.4516765285996055, "grad_norm": 0.83203125, "learning_rate": 0.00018057199211045366, "loss": 0.1018, "step": 1832 }, { "epoch": 0.4521696252465483, "grad_norm": 1.40625, "learning_rate": 0.00018076923076923077, "loss": 0.1051, "step": 1834 }, { "epoch": 0.4526627218934911, "grad_norm": 0.8984375, "learning_rate": 0.0001809664694280079, "loss": 0.1053, "step": 1836 }, { "epoch": 0.4531558185404339, "grad_norm": 0.7734375, "learning_rate": 0.00018116370808678503, "loss": 0.0942, "step": 1838 }, { "epoch": 0.4536489151873767, "grad_norm": 0.74609375, "learning_rate": 0.00018136094674556214, "loss": 0.0964, "step": 1840 }, { "epoch": 0.4541420118343195, "grad_norm": 0.71484375, "learning_rate": 0.00018155818540433927, "loss": 0.1001, "step": 1842 }, { "epoch": 0.4546351084812623, "grad_norm": 0.5625, "learning_rate": 0.00018175542406311638, "loss": 0.1023, "step": 1844 }, { "epoch": 0.4551282051282051, "grad_norm": 0.72265625, "learning_rate": 0.0001819526627218935, "loss": 0.0977, "step": 1846 }, { "epoch": 0.4556213017751479, "grad_norm": 0.7890625, "learning_rate": 0.00018214990138067062, "loss": 0.0992, "step": 1848 }, { "epoch": 0.4561143984220907, "grad_norm": 0.9296875, "learning_rate": 0.00018234714003944775, "loss": 0.0949, "step": 1850 }, { "epoch": 0.45660749506903353, "grad_norm": 0.78515625, "learning_rate": 0.00018254437869822488, "loss": 0.0998, "step": 1852 }, { "epoch": 0.45710059171597633, "grad_norm": 0.8515625, "learning_rate": 0.00018274161735700196, "loss": 0.1033, "step": 1854 }, { "epoch": 0.45759368836291914, "grad_norm": 0.86328125, "learning_rate": 0.0001829388560157791, "loss": 0.1019, "step": 1856 }, { "epoch": 0.45808678500986194, "grad_norm": 0.9140625, "learning_rate": 0.0001831360946745562, "loss": 0.1061, "step": 1858 }, { "epoch": 0.45857988165680474, "grad_norm": 1.0234375, "learning_rate": 0.00018333333333333334, "loss": 0.0965, "step": 1860 }, { "epoch": 0.45907297830374755, "grad_norm": 1.28125, "learning_rate": 0.00018353057199211047, "loss": 0.0915, "step": 1862 }, { "epoch": 0.45956607495069035, "grad_norm": 1.1328125, "learning_rate": 0.00018372781065088757, "loss": 0.1025, "step": 1864 }, { "epoch": 0.46005917159763315, "grad_norm": 0.57421875, "learning_rate": 0.0001839250493096647, "loss": 0.0973, "step": 1866 }, { "epoch": 0.46055226824457596, "grad_norm": 0.6796875, "learning_rate": 0.0001841222879684418, "loss": 0.0975, "step": 1868 }, { "epoch": 0.46104536489151876, "grad_norm": 1.125, "learning_rate": 0.00018431952662721895, "loss": 0.0943, "step": 1870 }, { "epoch": 0.46153846153846156, "grad_norm": 1.125, "learning_rate": 0.00018451676528599605, "loss": 0.1008, "step": 1872 }, { "epoch": 0.4620315581854043, "grad_norm": 1.21875, "learning_rate": 0.00018471400394477319, "loss": 0.0985, "step": 1874 }, { "epoch": 0.4625246548323471, "grad_norm": 1.09375, "learning_rate": 0.00018491124260355032, "loss": 0.0998, "step": 1876 }, { "epoch": 0.4630177514792899, "grad_norm": 0.765625, "learning_rate": 0.00018510848126232742, "loss": 0.0986, "step": 1878 }, { "epoch": 0.4635108481262327, "grad_norm": 0.890625, "learning_rate": 0.00018530571992110456, "loss": 0.1121, "step": 1880 }, { "epoch": 0.4640039447731755, "grad_norm": 1.3125, "learning_rate": 0.00018550295857988166, "loss": 0.0962, "step": 1882 }, { "epoch": 0.46449704142011833, "grad_norm": 1.234375, "learning_rate": 0.0001857001972386588, "loss": 0.1077, "step": 1884 }, { "epoch": 0.46499013806706113, "grad_norm": 1.15625, "learning_rate": 0.0001858974358974359, "loss": 0.0981, "step": 1886 }, { "epoch": 0.46548323471400394, "grad_norm": 0.80078125, "learning_rate": 0.00018609467455621304, "loss": 0.0962, "step": 1888 }, { "epoch": 0.46597633136094674, "grad_norm": 0.8203125, "learning_rate": 0.00018629191321499014, "loss": 0.0977, "step": 1890 }, { "epoch": 0.46646942800788954, "grad_norm": 1.515625, "learning_rate": 0.00018648915187376727, "loss": 0.1, "step": 1892 }, { "epoch": 0.46696252465483234, "grad_norm": 0.79296875, "learning_rate": 0.0001866863905325444, "loss": 0.0936, "step": 1894 }, { "epoch": 0.46745562130177515, "grad_norm": 1.03125, "learning_rate": 0.00018688362919132151, "loss": 0.0998, "step": 1896 }, { "epoch": 0.46794871794871795, "grad_norm": 0.69140625, "learning_rate": 0.00018708086785009862, "loss": 0.1021, "step": 1898 }, { "epoch": 0.46844181459566075, "grad_norm": 1.140625, "learning_rate": 0.00018727810650887573, "loss": 0.1037, "step": 1900 }, { "epoch": 0.46893491124260356, "grad_norm": 1.2265625, "learning_rate": 0.00018747534516765286, "loss": 0.0964, "step": 1902 }, { "epoch": 0.46942800788954636, "grad_norm": 1.3671875, "learning_rate": 0.00018767258382643, "loss": 0.0954, "step": 1904 }, { "epoch": 0.46992110453648916, "grad_norm": 0.6875, "learning_rate": 0.0001878698224852071, "loss": 0.099, "step": 1906 }, { "epoch": 0.47041420118343197, "grad_norm": 1.140625, "learning_rate": 0.00018806706114398423, "loss": 0.0974, "step": 1908 }, { "epoch": 0.47090729783037477, "grad_norm": 1.015625, "learning_rate": 0.00018826429980276134, "loss": 0.0946, "step": 1910 }, { "epoch": 0.4714003944773176, "grad_norm": 0.96484375, "learning_rate": 0.00018846153846153847, "loss": 0.0954, "step": 1912 }, { "epoch": 0.4718934911242604, "grad_norm": 0.81640625, "learning_rate": 0.00018865877712031558, "loss": 0.0951, "step": 1914 }, { "epoch": 0.4723865877712032, "grad_norm": 0.78125, "learning_rate": 0.0001888560157790927, "loss": 0.0978, "step": 1916 }, { "epoch": 0.47287968441814593, "grad_norm": 0.70703125, "learning_rate": 0.00018905325443786984, "loss": 0.1012, "step": 1918 }, { "epoch": 0.47337278106508873, "grad_norm": 0.59765625, "learning_rate": 0.00018925049309664695, "loss": 0.0894, "step": 1920 }, { "epoch": 0.47386587771203154, "grad_norm": 0.69140625, "learning_rate": 0.00018944773175542408, "loss": 0.0941, "step": 1922 }, { "epoch": 0.47435897435897434, "grad_norm": 0.5234375, "learning_rate": 0.0001896449704142012, "loss": 0.0935, "step": 1924 }, { "epoch": 0.47485207100591714, "grad_norm": 0.5859375, "learning_rate": 0.00018984220907297832, "loss": 0.0959, "step": 1926 }, { "epoch": 0.47534516765285995, "grad_norm": 0.5703125, "learning_rate": 0.00019003944773175543, "loss": 0.1012, "step": 1928 }, { "epoch": 0.47583826429980275, "grad_norm": 0.6015625, "learning_rate": 0.00019023668639053256, "loss": 0.096, "step": 1930 }, { "epoch": 0.47633136094674555, "grad_norm": 0.7109375, "learning_rate": 0.0001904339250493097, "loss": 0.0945, "step": 1932 }, { "epoch": 0.47682445759368836, "grad_norm": 0.76171875, "learning_rate": 0.0001906311637080868, "loss": 0.0975, "step": 1934 }, { "epoch": 0.47731755424063116, "grad_norm": 0.6015625, "learning_rate": 0.00019082840236686393, "loss": 0.0996, "step": 1936 }, { "epoch": 0.47781065088757396, "grad_norm": 0.57421875, "learning_rate": 0.00019102564102564104, "loss": 0.0889, "step": 1938 }, { "epoch": 0.47830374753451677, "grad_norm": 0.6328125, "learning_rate": 0.00019122287968441817, "loss": 0.1021, "step": 1940 }, { "epoch": 0.47879684418145957, "grad_norm": 0.8515625, "learning_rate": 0.00019142011834319528, "loss": 0.0964, "step": 1942 }, { "epoch": 0.47928994082840237, "grad_norm": 0.94921875, "learning_rate": 0.00019161735700197238, "loss": 0.0911, "step": 1944 }, { "epoch": 0.4797830374753452, "grad_norm": 0.8359375, "learning_rate": 0.00019181459566074952, "loss": 0.092, "step": 1946 }, { "epoch": 0.480276134122288, "grad_norm": 0.8828125, "learning_rate": 0.00019201183431952662, "loss": 0.1005, "step": 1948 }, { "epoch": 0.4807692307692308, "grad_norm": 0.56640625, "learning_rate": 0.00019220907297830375, "loss": 0.0948, "step": 1950 }, { "epoch": 0.4812623274161736, "grad_norm": 0.56640625, "learning_rate": 0.00019240631163708086, "loss": 0.0947, "step": 1952 }, { "epoch": 0.4817554240631164, "grad_norm": 0.5390625, "learning_rate": 0.000192603550295858, "loss": 0.093, "step": 1954 }, { "epoch": 0.4822485207100592, "grad_norm": 0.5078125, "learning_rate": 0.00019280078895463513, "loss": 0.0904, "step": 1956 }, { "epoch": 0.482741617357002, "grad_norm": 0.69140625, "learning_rate": 0.00019299802761341223, "loss": 0.0892, "step": 1958 }, { "epoch": 0.4832347140039448, "grad_norm": 0.62109375, "learning_rate": 0.00019319526627218937, "loss": 0.098, "step": 1960 }, { "epoch": 0.48372781065088755, "grad_norm": 0.62890625, "learning_rate": 0.00019339250493096647, "loss": 0.0929, "step": 1962 }, { "epoch": 0.48422090729783035, "grad_norm": 1.0234375, "learning_rate": 0.0001935897435897436, "loss": 0.0985, "step": 1964 }, { "epoch": 0.48471400394477315, "grad_norm": 0.7421875, "learning_rate": 0.0001937869822485207, "loss": 0.0947, "step": 1966 }, { "epoch": 0.48520710059171596, "grad_norm": 0.734375, "learning_rate": 0.00019398422090729784, "loss": 0.1011, "step": 1968 }, { "epoch": 0.48570019723865876, "grad_norm": 0.69140625, "learning_rate": 0.00019418145956607498, "loss": 0.0946, "step": 1970 }, { "epoch": 0.48619329388560156, "grad_norm": 0.60546875, "learning_rate": 0.00019437869822485208, "loss": 0.0891, "step": 1972 }, { "epoch": 0.48668639053254437, "grad_norm": 0.98828125, "learning_rate": 0.00019457593688362922, "loss": 0.0955, "step": 1974 }, { "epoch": 0.48717948717948717, "grad_norm": 0.71875, "learning_rate": 0.00019477317554240632, "loss": 0.1009, "step": 1976 }, { "epoch": 0.48767258382643, "grad_norm": 1.0234375, "learning_rate": 0.00019497041420118345, "loss": 0.0999, "step": 1978 }, { "epoch": 0.4881656804733728, "grad_norm": 0.85546875, "learning_rate": 0.00019516765285996056, "loss": 0.092, "step": 1980 }, { "epoch": 0.4886587771203156, "grad_norm": 0.859375, "learning_rate": 0.0001953648915187377, "loss": 0.0983, "step": 1982 }, { "epoch": 0.4891518737672584, "grad_norm": 0.62109375, "learning_rate": 0.0001955621301775148, "loss": 0.0983, "step": 1984 }, { "epoch": 0.4896449704142012, "grad_norm": 0.80078125, "learning_rate": 0.0001957593688362919, "loss": 0.1015, "step": 1986 }, { "epoch": 0.490138067061144, "grad_norm": 0.66796875, "learning_rate": 0.00019595660749506904, "loss": 0.0915, "step": 1988 }, { "epoch": 0.4906311637080868, "grad_norm": 0.6171875, "learning_rate": 0.00019615384615384615, "loss": 0.0947, "step": 1990 }, { "epoch": 0.4911242603550296, "grad_norm": 0.609375, "learning_rate": 0.00019635108481262328, "loss": 0.0877, "step": 1992 }, { "epoch": 0.4916173570019724, "grad_norm": 0.6328125, "learning_rate": 0.00019654832347140038, "loss": 0.0931, "step": 1994 }, { "epoch": 0.4921104536489152, "grad_norm": 0.77734375, "learning_rate": 0.00019674556213017752, "loss": 0.0988, "step": 1996 }, { "epoch": 0.492603550295858, "grad_norm": 1.078125, "learning_rate": 0.00019694280078895465, "loss": 0.0988, "step": 1998 }, { "epoch": 0.4930966469428008, "grad_norm": 0.85546875, "learning_rate": 0.00019714003944773176, "loss": 0.098, "step": 2000 }, { "epoch": 0.4935897435897436, "grad_norm": 0.6953125, "learning_rate": 0.0001973372781065089, "loss": 0.097, "step": 2002 }, { "epoch": 0.4940828402366864, "grad_norm": 0.734375, "learning_rate": 0.000197534516765286, "loss": 0.0859, "step": 2004 }, { "epoch": 0.49457593688362916, "grad_norm": 0.66796875, "learning_rate": 0.00019773175542406313, "loss": 0.0976, "step": 2006 }, { "epoch": 0.49506903353057197, "grad_norm": 0.6875, "learning_rate": 0.00019792899408284023, "loss": 0.0956, "step": 2008 }, { "epoch": 0.49556213017751477, "grad_norm": 0.8203125, "learning_rate": 0.00019812623274161737, "loss": 0.1056, "step": 2010 }, { "epoch": 0.4960552268244576, "grad_norm": 0.6171875, "learning_rate": 0.0001983234714003945, "loss": 0.0916, "step": 2012 }, { "epoch": 0.4965483234714004, "grad_norm": 0.97265625, "learning_rate": 0.0001985207100591716, "loss": 0.1042, "step": 2014 }, { "epoch": 0.4970414201183432, "grad_norm": 0.63671875, "learning_rate": 0.00019871794871794874, "loss": 0.0963, "step": 2016 }, { "epoch": 0.497534516765286, "grad_norm": 1.0, "learning_rate": 0.00019891518737672585, "loss": 0.0916, "step": 2018 }, { "epoch": 0.4980276134122288, "grad_norm": 0.59375, "learning_rate": 0.00019911242603550298, "loss": 0.0962, "step": 2020 }, { "epoch": 0.4985207100591716, "grad_norm": 0.81640625, "learning_rate": 0.00019930966469428008, "loss": 0.0963, "step": 2022 }, { "epoch": 0.4990138067061144, "grad_norm": 0.890625, "learning_rate": 0.00019950690335305722, "loss": 0.0934, "step": 2024 }, { "epoch": 0.4995069033530572, "grad_norm": 0.83203125, "learning_rate": 0.00019970414201183435, "loss": 0.1109, "step": 2026 }, { "epoch": 0.5, "grad_norm": 0.8984375, "learning_rate": 0.00019990138067061146, "loss": 0.1064, "step": 2028 }, { "epoch": 0.5004930966469427, "grad_norm": 1.3046875, "learning_rate": 0.00019999999851868022, "loss": 0.0915, "step": 2030 }, { "epoch": 0.5009861932938856, "grad_norm": 0.78515625, "learning_rate": 0.00019999998666812206, "loss": 0.0898, "step": 2032 }, { "epoch": 0.5014792899408284, "grad_norm": 1.0, "learning_rate": 0.00019999996296700715, "loss": 0.0973, "step": 2034 }, { "epoch": 0.5019723865877712, "grad_norm": 0.6484375, "learning_rate": 0.0001999999274153383, "loss": 0.099, "step": 2036 }, { "epoch": 0.502465483234714, "grad_norm": 0.7265625, "learning_rate": 0.00019999988001311977, "loss": 0.0948, "step": 2038 }, { "epoch": 0.5029585798816568, "grad_norm": 0.875, "learning_rate": 0.00019999982076035705, "loss": 0.095, "step": 2040 }, { "epoch": 0.5034516765285996, "grad_norm": 0.75390625, "learning_rate": 0.0001999997496570573, "loss": 0.0993, "step": 2042 }, { "epoch": 0.5039447731755424, "grad_norm": 0.76171875, "learning_rate": 0.00019999966670322886, "loss": 0.0909, "step": 2044 }, { "epoch": 0.5044378698224852, "grad_norm": 0.76953125, "learning_rate": 0.0001999995718988816, "loss": 0.0937, "step": 2046 }, { "epoch": 0.504930966469428, "grad_norm": 0.5703125, "learning_rate": 0.00019999946524402678, "loss": 0.0921, "step": 2048 }, { "epoch": 0.5054240631163708, "grad_norm": 0.6953125, "learning_rate": 0.00019999934673867696, "loss": 0.0898, "step": 2050 }, { "epoch": 0.5059171597633136, "grad_norm": 0.6796875, "learning_rate": 0.00019999921638284623, "loss": 0.0971, "step": 2052 }, { "epoch": 0.5064102564102564, "grad_norm": 1.1640625, "learning_rate": 0.00019999907417655004, "loss": 0.0966, "step": 2054 }, { "epoch": 0.5069033530571992, "grad_norm": 1.34375, "learning_rate": 0.00019999892011980522, "loss": 0.094, "step": 2056 }, { "epoch": 0.507396449704142, "grad_norm": 0.6640625, "learning_rate": 0.00019999875421263007, "loss": 0.0972, "step": 2058 }, { "epoch": 0.5078895463510849, "grad_norm": 0.84765625, "learning_rate": 0.00019999857645504424, "loss": 0.0895, "step": 2060 }, { "epoch": 0.5083826429980276, "grad_norm": 0.66796875, "learning_rate": 0.00019999838684706873, "loss": 0.0905, "step": 2062 }, { "epoch": 0.5088757396449705, "grad_norm": 0.5703125, "learning_rate": 0.0001999981853887261, "loss": 0.092, "step": 2064 }, { "epoch": 0.5093688362919132, "grad_norm": 0.7421875, "learning_rate": 0.00019999797208004015, "loss": 0.0923, "step": 2066 }, { "epoch": 0.5098619329388561, "grad_norm": 0.953125, "learning_rate": 0.00019999774692103621, "loss": 0.0854, "step": 2068 }, { "epoch": 0.5103550295857988, "grad_norm": 0.7109375, "learning_rate": 0.00019999750991174097, "loss": 0.0936, "step": 2070 }, { "epoch": 0.5108481262327417, "grad_norm": 0.828125, "learning_rate": 0.00019999726105218244, "loss": 0.0993, "step": 2072 }, { "epoch": 0.5113412228796844, "grad_norm": 0.79296875, "learning_rate": 0.0001999970003423902, "loss": 0.0951, "step": 2074 }, { "epoch": 0.5118343195266272, "grad_norm": 0.9921875, "learning_rate": 0.00019999672778239506, "loss": 0.0932, "step": 2076 }, { "epoch": 0.51232741617357, "grad_norm": 0.486328125, "learning_rate": 0.0001999964433722294, "loss": 0.0897, "step": 2078 }, { "epoch": 0.5128205128205128, "grad_norm": 0.64453125, "learning_rate": 0.0001999961471119269, "loss": 0.0935, "step": 2080 }, { "epoch": 0.5133136094674556, "grad_norm": 0.70703125, "learning_rate": 0.00019999583900152265, "loss": 0.0988, "step": 2082 }, { "epoch": 0.5138067061143984, "grad_norm": 0.86328125, "learning_rate": 0.00019999551904105318, "loss": 0.0975, "step": 2084 }, { "epoch": 0.5142998027613412, "grad_norm": 0.9921875, "learning_rate": 0.0001999951872305564, "loss": 0.0878, "step": 2086 }, { "epoch": 0.514792899408284, "grad_norm": 0.86328125, "learning_rate": 0.00019999484357007162, "loss": 0.0938, "step": 2088 }, { "epoch": 0.5152859960552268, "grad_norm": 0.625, "learning_rate": 0.00019999448805963957, "loss": 0.093, "step": 2090 }, { "epoch": 0.5157790927021696, "grad_norm": 0.85546875, "learning_rate": 0.0001999941206993024, "loss": 0.0904, "step": 2092 }, { "epoch": 0.5162721893491125, "grad_norm": 1.140625, "learning_rate": 0.00019999374148910363, "loss": 0.0945, "step": 2094 }, { "epoch": 0.5167652859960552, "grad_norm": 0.9921875, "learning_rate": 0.00019999335042908823, "loss": 0.0949, "step": 2096 }, { "epoch": 0.5172583826429981, "grad_norm": 0.80859375, "learning_rate": 0.00019999294751930249, "loss": 0.0929, "step": 2098 }, { "epoch": 0.5177514792899408, "grad_norm": 0.60546875, "learning_rate": 0.00019999253275979414, "loss": 0.0937, "step": 2100 }, { "epoch": 0.5182445759368837, "grad_norm": 1.03125, "learning_rate": 0.00019999210615061243, "loss": 0.0948, "step": 2102 }, { "epoch": 0.5187376725838264, "grad_norm": 0.9765625, "learning_rate": 0.00019999166769180783, "loss": 0.086, "step": 2104 }, { "epoch": 0.5192307692307693, "grad_norm": 0.9140625, "learning_rate": 0.0001999912173834323, "loss": 0.0921, "step": 2106 }, { "epoch": 0.519723865877712, "grad_norm": 0.58984375, "learning_rate": 0.0001999907552255393, "loss": 0.0962, "step": 2108 }, { "epoch": 0.5202169625246549, "grad_norm": 0.6171875, "learning_rate": 0.0001999902812181835, "loss": 0.0926, "step": 2110 }, { "epoch": 0.5207100591715976, "grad_norm": 0.78515625, "learning_rate": 0.00019998979536142106, "loss": 0.094, "step": 2112 }, { "epoch": 0.5212031558185405, "grad_norm": 0.71875, "learning_rate": 0.00019998929765530963, "loss": 0.0861, "step": 2114 }, { "epoch": 0.5216962524654832, "grad_norm": 0.59765625, "learning_rate": 0.00019998878809990815, "loss": 0.0889, "step": 2116 }, { "epoch": 0.522189349112426, "grad_norm": 0.62109375, "learning_rate": 0.00019998826669527704, "loss": 0.0878, "step": 2118 }, { "epoch": 0.5226824457593688, "grad_norm": 0.6484375, "learning_rate": 0.00019998773344147806, "loss": 0.0948, "step": 2120 }, { "epoch": 0.5231755424063116, "grad_norm": 0.65234375, "learning_rate": 0.00019998718833857436, "loss": 0.0876, "step": 2122 }, { "epoch": 0.5236686390532544, "grad_norm": 0.88671875, "learning_rate": 0.00019998663138663062, "loss": 0.0907, "step": 2124 }, { "epoch": 0.5241617357001972, "grad_norm": 1.2421875, "learning_rate": 0.0001999860625857128, "loss": 0.0962, "step": 2126 }, { "epoch": 0.52465483234714, "grad_norm": 1.4296875, "learning_rate": 0.00019998548193588832, "loss": 0.0937, "step": 2128 }, { "epoch": 0.5251479289940828, "grad_norm": 0.625, "learning_rate": 0.000199984889437226, "loss": 0.0892, "step": 2130 }, { "epoch": 0.5256410256410257, "grad_norm": 0.96484375, "learning_rate": 0.000199984285089796, "loss": 0.0952, "step": 2132 }, { "epoch": 0.5261341222879684, "grad_norm": 0.80859375, "learning_rate": 0.00019998366889366999, "loss": 0.0915, "step": 2134 }, { "epoch": 0.5266272189349113, "grad_norm": 0.921875, "learning_rate": 0.000199983040848921, "loss": 0.0932, "step": 2136 }, { "epoch": 0.527120315581854, "grad_norm": 0.7890625, "learning_rate": 0.0001999824009556234, "loss": 0.0945, "step": 2138 }, { "epoch": 0.5276134122287969, "grad_norm": 0.80078125, "learning_rate": 0.00019998174921385308, "loss": 0.0849, "step": 2140 }, { "epoch": 0.5281065088757396, "grad_norm": 0.5859375, "learning_rate": 0.00019998108562368727, "loss": 0.0874, "step": 2142 }, { "epoch": 0.5285996055226825, "grad_norm": 0.76953125, "learning_rate": 0.00019998041018520453, "loss": 0.0911, "step": 2144 }, { "epoch": 0.5290927021696252, "grad_norm": 0.51953125, "learning_rate": 0.00019997972289848503, "loss": 0.093, "step": 2146 }, { "epoch": 0.5295857988165681, "grad_norm": 1.0703125, "learning_rate": 0.00019997902376361012, "loss": 0.0845, "step": 2148 }, { "epoch": 0.5300788954635108, "grad_norm": 0.7265625, "learning_rate": 0.0001999783127806627, "loss": 0.0938, "step": 2150 }, { "epoch": 0.5305719921104537, "grad_norm": 0.60546875, "learning_rate": 0.00019997758994972698, "loss": 0.0968, "step": 2152 }, { "epoch": 0.5310650887573964, "grad_norm": 0.60546875, "learning_rate": 0.00019997685527088865, "loss": 0.0914, "step": 2154 }, { "epoch": 0.5315581854043393, "grad_norm": 0.57421875, "learning_rate": 0.0001999761087442348, "loss": 0.0988, "step": 2156 }, { "epoch": 0.532051282051282, "grad_norm": 0.46875, "learning_rate": 0.0001999753503698538, "loss": 0.0863, "step": 2158 }, { "epoch": 0.5325443786982249, "grad_norm": 0.54296875, "learning_rate": 0.00019997458014783568, "loss": 0.0873, "step": 2160 }, { "epoch": 0.5330374753451677, "grad_norm": 0.60546875, "learning_rate": 0.00019997379807827156, "loss": 0.0884, "step": 2162 }, { "epoch": 0.5335305719921104, "grad_norm": 0.640625, "learning_rate": 0.00019997300416125423, "loss": 0.0893, "step": 2164 }, { "epoch": 0.5340236686390533, "grad_norm": 0.734375, "learning_rate": 0.0001999721983968777, "loss": 0.0854, "step": 2166 }, { "epoch": 0.534516765285996, "grad_norm": 0.7109375, "learning_rate": 0.00019997138078523753, "loss": 0.0848, "step": 2168 }, { "epoch": 0.5350098619329389, "grad_norm": 0.73828125, "learning_rate": 0.0001999705513264305, "loss": 0.0804, "step": 2170 }, { "epoch": 0.5355029585798816, "grad_norm": 0.5625, "learning_rate": 0.00019996971002055503, "loss": 0.094, "step": 2172 }, { "epoch": 0.5359960552268245, "grad_norm": 0.7734375, "learning_rate": 0.00019996885686771073, "loss": 0.0887, "step": 2174 }, { "epoch": 0.5364891518737672, "grad_norm": 0.8984375, "learning_rate": 0.00019996799186799876, "loss": 0.0894, "step": 2176 }, { "epoch": 0.5369822485207101, "grad_norm": 0.6484375, "learning_rate": 0.00019996711502152158, "loss": 0.0846, "step": 2178 }, { "epoch": 0.5374753451676528, "grad_norm": 0.515625, "learning_rate": 0.00019996622632838316, "loss": 0.0841, "step": 2180 }, { "epoch": 0.5379684418145957, "grad_norm": 0.7421875, "learning_rate": 0.00019996532578868875, "loss": 0.0875, "step": 2182 }, { "epoch": 0.5384615384615384, "grad_norm": 0.5546875, "learning_rate": 0.0001999644134025451, "loss": 0.0869, "step": 2184 }, { "epoch": 0.5389546351084813, "grad_norm": 0.78125, "learning_rate": 0.00019996348917006036, "loss": 0.0937, "step": 2186 }, { "epoch": 0.539447731755424, "grad_norm": 0.76953125, "learning_rate": 0.000199962553091344, "loss": 0.0948, "step": 2188 }, { "epoch": 0.5399408284023669, "grad_norm": 0.9140625, "learning_rate": 0.00019996160516650696, "loss": 0.0935, "step": 2190 }, { "epoch": 0.5404339250493096, "grad_norm": 0.60546875, "learning_rate": 0.00019996064539566164, "loss": 0.0864, "step": 2192 }, { "epoch": 0.5409270216962525, "grad_norm": 0.70703125, "learning_rate": 0.00019995967377892172, "loss": 0.0925, "step": 2194 }, { "epoch": 0.5414201183431953, "grad_norm": 0.87890625, "learning_rate": 0.00019995869031640235, "loss": 0.0866, "step": 2196 }, { "epoch": 0.5419132149901381, "grad_norm": 0.7109375, "learning_rate": 0.0001999576950082201, "loss": 0.0842, "step": 2198 }, { "epoch": 0.5424063116370809, "grad_norm": 0.71875, "learning_rate": 0.00019995668785449284, "loss": 0.0914, "step": 2200 }, { "epoch": 0.5428994082840237, "grad_norm": 0.703125, "learning_rate": 0.00019995566885534002, "loss": 0.0923, "step": 2202 }, { "epoch": 0.5433925049309665, "grad_norm": 0.55078125, "learning_rate": 0.00019995463801088237, "loss": 0.0869, "step": 2204 }, { "epoch": 0.5438856015779092, "grad_norm": 0.66015625, "learning_rate": 0.000199953595321242, "loss": 0.0831, "step": 2206 }, { "epoch": 0.5443786982248521, "grad_norm": 0.51953125, "learning_rate": 0.00019995254078654257, "loss": 0.0842, "step": 2208 }, { "epoch": 0.5448717948717948, "grad_norm": 0.55078125, "learning_rate": 0.00019995147440690896, "loss": 0.0827, "step": 2210 }, { "epoch": 0.5453648915187377, "grad_norm": 0.734375, "learning_rate": 0.0001999503961824676, "loss": 0.0916, "step": 2212 }, { "epoch": 0.5458579881656804, "grad_norm": 0.5546875, "learning_rate": 0.00019994930611334623, "loss": 0.0782, "step": 2214 }, { "epoch": 0.5463510848126233, "grad_norm": 0.91796875, "learning_rate": 0.000199948204199674, "loss": 0.0906, "step": 2216 }, { "epoch": 0.546844181459566, "grad_norm": 0.83984375, "learning_rate": 0.00019994709044158157, "loss": 0.0901, "step": 2218 }, { "epoch": 0.5473372781065089, "grad_norm": 0.69140625, "learning_rate": 0.0001999459648392009, "loss": 0.0936, "step": 2220 }, { "epoch": 0.5478303747534516, "grad_norm": 0.875, "learning_rate": 0.00019994482739266534, "loss": 0.0864, "step": 2222 }, { "epoch": 0.5483234714003945, "grad_norm": 0.66015625, "learning_rate": 0.00019994367810210973, "loss": 0.0926, "step": 2224 }, { "epoch": 0.5488165680473372, "grad_norm": 0.58203125, "learning_rate": 0.00019994251696767027, "loss": 0.0924, "step": 2226 }, { "epoch": 0.5493096646942801, "grad_norm": 0.59375, "learning_rate": 0.0001999413439894845, "loss": 0.0862, "step": 2228 }, { "epoch": 0.5498027613412229, "grad_norm": 0.88671875, "learning_rate": 0.0001999401591676915, "loss": 0.0905, "step": 2230 }, { "epoch": 0.5502958579881657, "grad_norm": 0.7890625, "learning_rate": 0.00019993896250243163, "loss": 0.0926, "step": 2232 }, { "epoch": 0.5507889546351085, "grad_norm": 0.486328125, "learning_rate": 0.0001999377539938467, "loss": 0.0888, "step": 2234 }, { "epoch": 0.5512820512820513, "grad_norm": 0.78125, "learning_rate": 0.00019993653364207997, "loss": 0.0802, "step": 2236 }, { "epoch": 0.5517751479289941, "grad_norm": 1.1015625, "learning_rate": 0.000199935301447276, "loss": 0.0889, "step": 2238 }, { "epoch": 0.5522682445759369, "grad_norm": 0.93359375, "learning_rate": 0.00019993405740958086, "loss": 0.0935, "step": 2240 }, { "epoch": 0.5527613412228797, "grad_norm": 0.5703125, "learning_rate": 0.00019993280152914196, "loss": 0.0866, "step": 2242 }, { "epoch": 0.5532544378698225, "grad_norm": 0.58984375, "learning_rate": 0.00019993153380610813, "loss": 0.0923, "step": 2244 }, { "epoch": 0.5537475345167653, "grad_norm": 0.88671875, "learning_rate": 0.0001999302542406296, "loss": 0.0838, "step": 2246 }, { "epoch": 0.5542406311637081, "grad_norm": 0.765625, "learning_rate": 0.00019992896283285797, "loss": 0.0841, "step": 2248 }, { "epoch": 0.5547337278106509, "grad_norm": 1.125, "learning_rate": 0.00019992765958294634, "loss": 0.0831, "step": 2250 }, { "epoch": 0.5552268244575936, "grad_norm": 0.671875, "learning_rate": 0.00019992634449104911, "loss": 0.0864, "step": 2252 }, { "epoch": 0.5557199211045365, "grad_norm": 0.95703125, "learning_rate": 0.00019992501755732215, "loss": 0.0857, "step": 2254 }, { "epoch": 0.5562130177514792, "grad_norm": 1.46875, "learning_rate": 0.0001999236787819227, "loss": 0.0845, "step": 2256 }, { "epoch": 0.5567061143984221, "grad_norm": 0.66796875, "learning_rate": 0.0001999223281650094, "loss": 0.0922, "step": 2258 }, { "epoch": 0.5571992110453649, "grad_norm": 0.80078125, "learning_rate": 0.00019992096570674235, "loss": 0.086, "step": 2260 }, { "epoch": 0.5576923076923077, "grad_norm": 0.8671875, "learning_rate": 0.00019991959140728296, "loss": 0.0849, "step": 2262 }, { "epoch": 0.5581854043392505, "grad_norm": 0.76171875, "learning_rate": 0.0001999182052667941, "loss": 0.0885, "step": 2264 }, { "epoch": 0.5586785009861933, "grad_norm": 0.67578125, "learning_rate": 0.00019991680728544004, "loss": 0.0868, "step": 2266 }, { "epoch": 0.5591715976331361, "grad_norm": 0.63671875, "learning_rate": 0.00019991539746338645, "loss": 0.0902, "step": 2268 }, { "epoch": 0.5596646942800789, "grad_norm": 0.478515625, "learning_rate": 0.00019991397580080045, "loss": 0.0829, "step": 2270 }, { "epoch": 0.5601577909270217, "grad_norm": 0.89453125, "learning_rate": 0.0001999125422978504, "loss": 0.0839, "step": 2272 }, { "epoch": 0.5606508875739645, "grad_norm": 0.97265625, "learning_rate": 0.0001999110969547063, "loss": 0.0927, "step": 2274 }, { "epoch": 0.5611439842209073, "grad_norm": 1.03125, "learning_rate": 0.00019990963977153936, "loss": 0.0916, "step": 2276 }, { "epoch": 0.5616370808678501, "grad_norm": 0.796875, "learning_rate": 0.00019990817074852228, "loss": 0.0853, "step": 2278 }, { "epoch": 0.5621301775147929, "grad_norm": 0.51171875, "learning_rate": 0.00019990668988582915, "loss": 0.0868, "step": 2280 }, { "epoch": 0.5626232741617357, "grad_norm": 1.265625, "learning_rate": 0.00019990519718363547, "loss": 0.0931, "step": 2282 }, { "epoch": 0.5631163708086785, "grad_norm": 1.6015625, "learning_rate": 0.0001999036926421181, "loss": 0.0897, "step": 2284 }, { "epoch": 0.5636094674556213, "grad_norm": 0.84375, "learning_rate": 0.0001999021762614554, "loss": 0.0776, "step": 2286 }, { "epoch": 0.5641025641025641, "grad_norm": 0.84765625, "learning_rate": 0.00019990064804182702, "loss": 0.0831, "step": 2288 }, { "epoch": 0.564595660749507, "grad_norm": 0.8984375, "learning_rate": 0.00019989910798341406, "loss": 0.0853, "step": 2290 }, { "epoch": 0.5650887573964497, "grad_norm": 0.6015625, "learning_rate": 0.00019989755608639903, "loss": 0.087, "step": 2292 }, { "epoch": 0.5655818540433925, "grad_norm": 0.859375, "learning_rate": 0.00019989599235096587, "loss": 0.0811, "step": 2294 }, { "epoch": 0.5660749506903353, "grad_norm": 0.609375, "learning_rate": 0.00019989441677729984, "loss": 0.0813, "step": 2296 }, { "epoch": 0.5665680473372781, "grad_norm": 0.71484375, "learning_rate": 0.00019989282936558773, "loss": 0.0825, "step": 2298 }, { "epoch": 0.5670611439842209, "grad_norm": 0.498046875, "learning_rate": 0.00019989123011601758, "loss": 0.0858, "step": 2300 }, { "epoch": 0.5675542406311637, "grad_norm": 0.68359375, "learning_rate": 0.00019988961902877894, "loss": 0.0892, "step": 2302 }, { "epoch": 0.5680473372781065, "grad_norm": 0.84765625, "learning_rate": 0.00019988799610406272, "loss": 0.0863, "step": 2304 }, { "epoch": 0.5685404339250493, "grad_norm": 0.53125, "learning_rate": 0.00019988636134206128, "loss": 0.0853, "step": 2306 }, { "epoch": 0.5690335305719921, "grad_norm": 0.82421875, "learning_rate": 0.00019988471474296831, "loss": 0.0846, "step": 2308 }, { "epoch": 0.5695266272189349, "grad_norm": 0.6796875, "learning_rate": 0.000199883056306979, "loss": 0.0869, "step": 2310 }, { "epoch": 0.5700197238658777, "grad_norm": 0.7734375, "learning_rate": 0.0001998813860342898, "loss": 0.0916, "step": 2312 }, { "epoch": 0.5705128205128205, "grad_norm": 0.62890625, "learning_rate": 0.0001998797039250987, "loss": 0.0888, "step": 2314 }, { "epoch": 0.5710059171597633, "grad_norm": 0.51953125, "learning_rate": 0.00019987800997960502, "loss": 0.0941, "step": 2316 }, { "epoch": 0.5714990138067061, "grad_norm": 0.59765625, "learning_rate": 0.00019987630419800954, "loss": 0.0955, "step": 2318 }, { "epoch": 0.571992110453649, "grad_norm": 0.53125, "learning_rate": 0.00019987458658051434, "loss": 0.082, "step": 2320 }, { "epoch": 0.5724852071005917, "grad_norm": 0.455078125, "learning_rate": 0.000199872857127323, "loss": 0.0864, "step": 2322 }, { "epoch": 0.5729783037475346, "grad_norm": 0.65625, "learning_rate": 0.0001998711158386405, "loss": 0.0893, "step": 2324 }, { "epoch": 0.5734714003944773, "grad_norm": 0.61328125, "learning_rate": 0.00019986936271467314, "loss": 0.0886, "step": 2326 }, { "epoch": 0.5739644970414202, "grad_norm": 0.6484375, "learning_rate": 0.00019986759775562872, "loss": 0.0911, "step": 2328 }, { "epoch": 0.5744575936883629, "grad_norm": 0.83984375, "learning_rate": 0.00019986582096171636, "loss": 0.0916, "step": 2330 }, { "epoch": 0.5749506903353058, "grad_norm": 0.65625, "learning_rate": 0.00019986403233314667, "loss": 0.088, "step": 2332 }, { "epoch": 0.5754437869822485, "grad_norm": 0.447265625, "learning_rate": 0.00019986223187013153, "loss": 0.0778, "step": 2334 }, { "epoch": 0.5759368836291914, "grad_norm": 0.578125, "learning_rate": 0.00019986041957288438, "loss": 0.0883, "step": 2336 }, { "epoch": 0.5764299802761341, "grad_norm": 0.828125, "learning_rate": 0.00019985859544161994, "loss": 0.0875, "step": 2338 }, { "epoch": 0.5769230769230769, "grad_norm": 1.03125, "learning_rate": 0.00019985675947655442, "loss": 0.0936, "step": 2340 }, { "epoch": 0.5774161735700197, "grad_norm": 0.59375, "learning_rate": 0.00019985491167790538, "loss": 0.0814, "step": 2342 }, { "epoch": 0.5779092702169625, "grad_norm": 0.79296875, "learning_rate": 0.00019985305204589177, "loss": 0.0912, "step": 2344 }, { "epoch": 0.5784023668639053, "grad_norm": 0.64453125, "learning_rate": 0.000199851180580734, "loss": 0.0939, "step": 2346 }, { "epoch": 0.5788954635108481, "grad_norm": 0.54296875, "learning_rate": 0.00019984929728265383, "loss": 0.0924, "step": 2348 }, { "epoch": 0.5793885601577909, "grad_norm": 0.478515625, "learning_rate": 0.00019984740215187445, "loss": 0.086, "step": 2350 }, { "epoch": 0.5798816568047337, "grad_norm": 0.70703125, "learning_rate": 0.00019984549518862042, "loss": 0.0842, "step": 2352 }, { "epoch": 0.5803747534516766, "grad_norm": 0.5859375, "learning_rate": 0.00019984357639311778, "loss": 0.089, "step": 2354 }, { "epoch": 0.5808678500986193, "grad_norm": 0.6015625, "learning_rate": 0.00019984164576559383, "loss": 0.0846, "step": 2356 }, { "epoch": 0.5813609467455622, "grad_norm": 0.5234375, "learning_rate": 0.00019983970330627747, "loss": 0.0867, "step": 2358 }, { "epoch": 0.5818540433925049, "grad_norm": 0.53515625, "learning_rate": 0.00019983774901539877, "loss": 0.0819, "step": 2360 }, { "epoch": 0.5823471400394478, "grad_norm": 0.51171875, "learning_rate": 0.00019983578289318942, "loss": 0.0876, "step": 2362 }, { "epoch": 0.5828402366863905, "grad_norm": 0.55078125, "learning_rate": 0.00019983380493988242, "loss": 0.0841, "step": 2364 }, { "epoch": 0.5833333333333334, "grad_norm": 0.49609375, "learning_rate": 0.0001998318151557121, "loss": 0.0822, "step": 2366 }, { "epoch": 0.5838264299802761, "grad_norm": 0.64453125, "learning_rate": 0.00019982981354091427, "loss": 0.0802, "step": 2368 }, { "epoch": 0.584319526627219, "grad_norm": 0.63671875, "learning_rate": 0.00019982780009572619, "loss": 0.0909, "step": 2370 }, { "epoch": 0.5848126232741617, "grad_norm": 0.43359375, "learning_rate": 0.00019982577482038645, "loss": 0.0904, "step": 2372 }, { "epoch": 0.5853057199211046, "grad_norm": 0.59375, "learning_rate": 0.000199823737715135, "loss": 0.0771, "step": 2374 }, { "epoch": 0.5857988165680473, "grad_norm": 0.91015625, "learning_rate": 0.0001998216887802133, "loss": 0.0875, "step": 2376 }, { "epoch": 0.5862919132149902, "grad_norm": 0.640625, "learning_rate": 0.00019981962801586414, "loss": 0.0924, "step": 2378 }, { "epoch": 0.5867850098619329, "grad_norm": 0.69140625, "learning_rate": 0.00019981755542233177, "loss": 0.0929, "step": 2380 }, { "epoch": 0.5872781065088757, "grad_norm": 0.55859375, "learning_rate": 0.00019981547099986172, "loss": 0.08, "step": 2382 }, { "epoch": 0.5877712031558185, "grad_norm": 0.55078125, "learning_rate": 0.0001998133747487011, "loss": 0.0841, "step": 2384 }, { "epoch": 0.5882642998027613, "grad_norm": 0.5703125, "learning_rate": 0.00019981126666909828, "loss": 0.0886, "step": 2386 }, { "epoch": 0.5887573964497042, "grad_norm": 0.6015625, "learning_rate": 0.00019980914676130307, "loss": 0.0778, "step": 2388 }, { "epoch": 0.5892504930966469, "grad_norm": 0.6015625, "learning_rate": 0.0001998070150255667, "loss": 0.0902, "step": 2390 }, { "epoch": 0.5897435897435898, "grad_norm": 0.9609375, "learning_rate": 0.00019980487146214183, "loss": 0.0862, "step": 2392 }, { "epoch": 0.5902366863905325, "grad_norm": 0.8125, "learning_rate": 0.00019980271607128244, "loss": 0.09, "step": 2394 }, { "epoch": 0.5907297830374754, "grad_norm": 0.55078125, "learning_rate": 0.00019980054885324395, "loss": 0.0829, "step": 2396 }, { "epoch": 0.5912228796844181, "grad_norm": 0.8359375, "learning_rate": 0.00019979836980828323, "loss": 0.0773, "step": 2398 }, { "epoch": 0.591715976331361, "grad_norm": 0.87109375, "learning_rate": 0.00019979617893665848, "loss": 0.0871, "step": 2400 }, { "epoch": 0.5922090729783037, "grad_norm": 1.1953125, "learning_rate": 0.00019979397623862933, "loss": 0.09, "step": 2402 }, { "epoch": 0.5927021696252466, "grad_norm": 0.625, "learning_rate": 0.0001997917617144568, "loss": 0.0822, "step": 2404 }, { "epoch": 0.5931952662721893, "grad_norm": 0.875, "learning_rate": 0.00019978953536440336, "loss": 0.0907, "step": 2406 }, { "epoch": 0.5936883629191322, "grad_norm": 1.2265625, "learning_rate": 0.00019978729718873281, "loss": 0.0882, "step": 2408 }, { "epoch": 0.5941814595660749, "grad_norm": 0.6328125, "learning_rate": 0.00019978504718771043, "loss": 0.084, "step": 2410 }, { "epoch": 0.5946745562130178, "grad_norm": 0.9921875, "learning_rate": 0.0001997827853616028, "loss": 0.0812, "step": 2412 }, { "epoch": 0.5951676528599605, "grad_norm": 1.296875, "learning_rate": 0.00019978051171067805, "loss": 0.0883, "step": 2414 }, { "epoch": 0.5956607495069034, "grad_norm": 0.6640625, "learning_rate": 0.00019977822623520547, "loss": 0.0898, "step": 2416 }, { "epoch": 0.5961538461538461, "grad_norm": 1.2265625, "learning_rate": 0.00019977592893545608, "loss": 0.0817, "step": 2418 }, { "epoch": 0.596646942800789, "grad_norm": 1.0859375, "learning_rate": 0.00019977361981170195, "loss": 0.0907, "step": 2420 }, { "epoch": 0.5971400394477318, "grad_norm": 0.474609375, "learning_rate": 0.00019977129886421686, "loss": 0.0854, "step": 2422 }, { "epoch": 0.5976331360946746, "grad_norm": 0.63671875, "learning_rate": 0.0001997689660932758, "loss": 0.084, "step": 2424 }, { "epoch": 0.5981262327416174, "grad_norm": 0.53515625, "learning_rate": 0.00019976662149915518, "loss": 0.0796, "step": 2426 }, { "epoch": 0.5986193293885601, "grad_norm": 0.482421875, "learning_rate": 0.00019976426508213293, "loss": 0.0814, "step": 2428 }, { "epoch": 0.599112426035503, "grad_norm": 0.71875, "learning_rate": 0.00019976189684248825, "loss": 0.0837, "step": 2430 }, { "epoch": 0.5996055226824457, "grad_norm": 1.0078125, "learning_rate": 0.0001997595167805018, "loss": 0.0807, "step": 2432 }, { "epoch": 0.6000986193293886, "grad_norm": 0.8984375, "learning_rate": 0.0001997571248964556, "loss": 0.0884, "step": 2434 }, { "epoch": 0.6005917159763313, "grad_norm": 0.4921875, "learning_rate": 0.00019975472119063313, "loss": 0.0856, "step": 2436 }, { "epoch": 0.6010848126232742, "grad_norm": 1.0390625, "learning_rate": 0.00019975230566331926, "loss": 0.0865, "step": 2438 }, { "epoch": 0.6015779092702169, "grad_norm": 1.25, "learning_rate": 0.00019974987831480022, "loss": 0.0864, "step": 2440 }, { "epoch": 0.6020710059171598, "grad_norm": 0.419921875, "learning_rate": 0.00019974743914536364, "loss": 0.0879, "step": 2442 }, { "epoch": 0.6025641025641025, "grad_norm": 0.7734375, "learning_rate": 0.00019974498815529863, "loss": 0.0873, "step": 2444 }, { "epoch": 0.6030571992110454, "grad_norm": 1.046875, "learning_rate": 0.0001997425253448956, "loss": 0.0895, "step": 2446 }, { "epoch": 0.6035502958579881, "grad_norm": 0.578125, "learning_rate": 0.00019974005071444646, "loss": 0.0858, "step": 2448 }, { "epoch": 0.604043392504931, "grad_norm": 0.439453125, "learning_rate": 0.00019973756426424442, "loss": 0.0848, "step": 2450 }, { "epoch": 0.6045364891518737, "grad_norm": 0.45703125, "learning_rate": 0.0001997350659945841, "loss": 0.0822, "step": 2452 }, { "epoch": 0.6050295857988166, "grad_norm": 0.51171875, "learning_rate": 0.00019973255590576168, "loss": 0.0827, "step": 2454 }, { "epoch": 0.6055226824457594, "grad_norm": 0.7734375, "learning_rate": 0.0001997300339980745, "loss": 0.0863, "step": 2456 }, { "epoch": 0.6060157790927022, "grad_norm": 0.625, "learning_rate": 0.00019972750027182152, "loss": 0.0924, "step": 2458 }, { "epoch": 0.606508875739645, "grad_norm": 0.58203125, "learning_rate": 0.00019972495472730294, "loss": 0.0838, "step": 2460 }, { "epoch": 0.6070019723865878, "grad_norm": 0.578125, "learning_rate": 0.00019972239736482038, "loss": 0.0857, "step": 2462 }, { "epoch": 0.6074950690335306, "grad_norm": 0.5859375, "learning_rate": 0.000199719828184677, "loss": 0.0835, "step": 2464 }, { "epoch": 0.6079881656804734, "grad_norm": 0.69140625, "learning_rate": 0.0001997172471871772, "loss": 0.0854, "step": 2466 }, { "epoch": 0.6084812623274162, "grad_norm": 0.80859375, "learning_rate": 0.00019971465437262686, "loss": 0.0832, "step": 2468 }, { "epoch": 0.6089743589743589, "grad_norm": 0.455078125, "learning_rate": 0.00019971204974133326, "loss": 0.0793, "step": 2470 }, { "epoch": 0.6094674556213018, "grad_norm": 0.6484375, "learning_rate": 0.000199709433293605, "loss": 0.0868, "step": 2472 }, { "epoch": 0.6099605522682445, "grad_norm": 0.60546875, "learning_rate": 0.00019970680502975224, "loss": 0.0853, "step": 2474 }, { "epoch": 0.6104536489151874, "grad_norm": 0.640625, "learning_rate": 0.00019970416495008637, "loss": 0.0878, "step": 2476 }, { "epoch": 0.6109467455621301, "grad_norm": 0.6328125, "learning_rate": 0.0001997015130549203, "loss": 0.0744, "step": 2478 }, { "epoch": 0.611439842209073, "grad_norm": 0.625, "learning_rate": 0.00019969884934456823, "loss": 0.085, "step": 2480 }, { "epoch": 0.6119329388560157, "grad_norm": 0.5546875, "learning_rate": 0.00019969617381934587, "loss": 0.0876, "step": 2482 }, { "epoch": 0.6124260355029586, "grad_norm": 0.6484375, "learning_rate": 0.0001996934864795703, "loss": 0.0836, "step": 2484 }, { "epoch": 0.6129191321499013, "grad_norm": 0.69921875, "learning_rate": 0.00019969078732555998, "loss": 0.0823, "step": 2486 }, { "epoch": 0.6134122287968442, "grad_norm": 0.5390625, "learning_rate": 0.0001996880763576347, "loss": 0.0812, "step": 2488 }, { "epoch": 0.613905325443787, "grad_norm": 0.48828125, "learning_rate": 0.00019968535357611585, "loss": 0.0789, "step": 2490 }, { "epoch": 0.6143984220907298, "grad_norm": 0.73828125, "learning_rate": 0.00019968261898132598, "loss": 0.0906, "step": 2492 }, { "epoch": 0.6148915187376726, "grad_norm": 0.5546875, "learning_rate": 0.00019967987257358925, "loss": 0.0843, "step": 2494 }, { "epoch": 0.6153846153846154, "grad_norm": 0.69921875, "learning_rate": 0.00019967711435323103, "loss": 0.0852, "step": 2496 }, { "epoch": 0.6158777120315582, "grad_norm": 0.47265625, "learning_rate": 0.0001996743443205783, "loss": 0.0789, "step": 2498 }, { "epoch": 0.616370808678501, "grad_norm": 0.6484375, "learning_rate": 0.0001996715624759592, "loss": 0.0873, "step": 2500 }, { "epoch": 0.6168639053254438, "grad_norm": 0.6484375, "learning_rate": 0.00019966876881970347, "loss": 0.0807, "step": 2502 }, { "epoch": 0.6173570019723866, "grad_norm": 0.6640625, "learning_rate": 0.00019966596335214216, "loss": 0.0819, "step": 2504 }, { "epoch": 0.6178500986193294, "grad_norm": 0.51171875, "learning_rate": 0.00019966314607360773, "loss": 0.085, "step": 2506 }, { "epoch": 0.6183431952662722, "grad_norm": 0.73828125, "learning_rate": 0.00019966031698443403, "loss": 0.0888, "step": 2508 }, { "epoch": 0.618836291913215, "grad_norm": 0.5703125, "learning_rate": 0.00019965747608495632, "loss": 0.0802, "step": 2510 }, { "epoch": 0.6193293885601578, "grad_norm": 0.51171875, "learning_rate": 0.0001996546233755113, "loss": 0.0752, "step": 2512 }, { "epoch": 0.6198224852071006, "grad_norm": 0.8046875, "learning_rate": 0.00019965175885643704, "loss": 0.0799, "step": 2514 }, { "epoch": 0.6203155818540433, "grad_norm": 0.62890625, "learning_rate": 0.00019964888252807294, "loss": 0.0829, "step": 2516 }, { "epoch": 0.6208086785009862, "grad_norm": 0.5390625, "learning_rate": 0.0001996459943907599, "loss": 0.088, "step": 2518 }, { "epoch": 0.621301775147929, "grad_norm": 0.74609375, "learning_rate": 0.00019964309444484016, "loss": 0.0804, "step": 2520 }, { "epoch": 0.6217948717948718, "grad_norm": 0.7734375, "learning_rate": 0.0001996401826906574, "loss": 0.0932, "step": 2522 }, { "epoch": 0.6222879684418146, "grad_norm": 0.8671875, "learning_rate": 0.00019963725912855665, "loss": 0.0843, "step": 2524 }, { "epoch": 0.6227810650887574, "grad_norm": 0.765625, "learning_rate": 0.00019963432375888441, "loss": 0.0828, "step": 2526 }, { "epoch": 0.6232741617357002, "grad_norm": 0.91796875, "learning_rate": 0.00019963137658198852, "loss": 0.0776, "step": 2528 }, { "epoch": 0.623767258382643, "grad_norm": 0.90234375, "learning_rate": 0.00019962841759821825, "loss": 0.0827, "step": 2530 }, { "epoch": 0.6242603550295858, "grad_norm": 0.53125, "learning_rate": 0.00019962544680792422, "loss": 0.0854, "step": 2532 }, { "epoch": 0.6247534516765286, "grad_norm": 0.90625, "learning_rate": 0.0001996224642114585, "loss": 0.0788, "step": 2534 }, { "epoch": 0.6252465483234714, "grad_norm": 0.59375, "learning_rate": 0.00019961946980917456, "loss": 0.0778, "step": 2536 }, { "epoch": 0.6257396449704142, "grad_norm": 0.84765625, "learning_rate": 0.00019961646360142725, "loss": 0.0859, "step": 2538 }, { "epoch": 0.626232741617357, "grad_norm": 0.875, "learning_rate": 0.0001996134455885728, "loss": 0.0843, "step": 2540 }, { "epoch": 0.6267258382642998, "grad_norm": 0.62109375, "learning_rate": 0.0001996104157709689, "loss": 0.0834, "step": 2542 }, { "epoch": 0.6272189349112426, "grad_norm": 0.6640625, "learning_rate": 0.00019960737414897458, "loss": 0.0819, "step": 2544 }, { "epoch": 0.6277120315581854, "grad_norm": 0.62109375, "learning_rate": 0.00019960432072295024, "loss": 0.0799, "step": 2546 }, { "epoch": 0.6282051282051282, "grad_norm": 1.0078125, "learning_rate": 0.0001996012554932578, "loss": 0.0848, "step": 2548 }, { "epoch": 0.628698224852071, "grad_norm": 0.5234375, "learning_rate": 0.0001995981784602605, "loss": 0.0874, "step": 2550 }, { "epoch": 0.6291913214990138, "grad_norm": 0.63671875, "learning_rate": 0.00019959508962432297, "loss": 0.0879, "step": 2552 }, { "epoch": 0.6296844181459567, "grad_norm": 0.5546875, "learning_rate": 0.00019959198898581124, "loss": 0.0874, "step": 2554 }, { "epoch": 0.6301775147928994, "grad_norm": 0.423828125, "learning_rate": 0.00019958887654509275, "loss": 0.0798, "step": 2556 }, { "epoch": 0.6306706114398422, "grad_norm": 0.412109375, "learning_rate": 0.00019958575230253635, "loss": 0.079, "step": 2558 }, { "epoch": 0.631163708086785, "grad_norm": 0.76171875, "learning_rate": 0.00019958261625851233, "loss": 0.0796, "step": 2560 }, { "epoch": 0.6316568047337278, "grad_norm": 0.640625, "learning_rate": 0.00019957946841339227, "loss": 0.0891, "step": 2562 }, { "epoch": 0.6321499013806706, "grad_norm": 0.515625, "learning_rate": 0.0001995763087675492, "loss": 0.0797, "step": 2564 }, { "epoch": 0.6326429980276134, "grad_norm": 0.46484375, "learning_rate": 0.00019957313732135758, "loss": 0.0831, "step": 2566 }, { "epoch": 0.6331360946745562, "grad_norm": 0.53515625, "learning_rate": 0.00019956995407519328, "loss": 0.0806, "step": 2568 }, { "epoch": 0.633629191321499, "grad_norm": 0.53125, "learning_rate": 0.00019956675902943344, "loss": 0.0754, "step": 2570 }, { "epoch": 0.6341222879684418, "grad_norm": 0.470703125, "learning_rate": 0.00019956355218445678, "loss": 0.0818, "step": 2572 }, { "epoch": 0.6346153846153846, "grad_norm": 0.5, "learning_rate": 0.00019956033354064329, "loss": 0.0854, "step": 2574 }, { "epoch": 0.6351084812623274, "grad_norm": 0.65625, "learning_rate": 0.0001995571030983744, "loss": 0.0803, "step": 2576 }, { "epoch": 0.6356015779092702, "grad_norm": 0.64453125, "learning_rate": 0.00019955386085803293, "loss": 0.0823, "step": 2578 }, { "epoch": 0.636094674556213, "grad_norm": 0.41796875, "learning_rate": 0.00019955060682000313, "loss": 0.0848, "step": 2580 }, { "epoch": 0.6365877712031558, "grad_norm": 0.62109375, "learning_rate": 0.00019954734098467057, "loss": 0.0788, "step": 2582 }, { "epoch": 0.6370808678500987, "grad_norm": 0.392578125, "learning_rate": 0.00019954406335242233, "loss": 0.0808, "step": 2584 }, { "epoch": 0.6375739644970414, "grad_norm": 0.57421875, "learning_rate": 0.0001995407739236468, "loss": 0.0861, "step": 2586 }, { "epoch": 0.6380670611439843, "grad_norm": 0.61328125, "learning_rate": 0.0001995374726987338, "loss": 0.0813, "step": 2588 }, { "epoch": 0.638560157790927, "grad_norm": 0.54296875, "learning_rate": 0.00019953415967807452, "loss": 0.0866, "step": 2590 }, { "epoch": 0.6390532544378699, "grad_norm": 0.84765625, "learning_rate": 0.0001995308348620616, "loss": 0.0847, "step": 2592 }, { "epoch": 0.6395463510848126, "grad_norm": 0.5859375, "learning_rate": 0.00019952749825108903, "loss": 0.081, "step": 2594 }, { "epoch": 0.6400394477317555, "grad_norm": 0.640625, "learning_rate": 0.00019952414984555222, "loss": 0.0782, "step": 2596 }, { "epoch": 0.6405325443786982, "grad_norm": 0.546875, "learning_rate": 0.00019952078964584802, "loss": 0.0794, "step": 2598 }, { "epoch": 0.6410256410256411, "grad_norm": 0.69921875, "learning_rate": 0.00019951741765237454, "loss": 0.0746, "step": 2600 }, { "epoch": 0.6415187376725838, "grad_norm": 0.5234375, "learning_rate": 0.00019951403386553147, "loss": 0.0818, "step": 2602 }, { "epoch": 0.6420118343195266, "grad_norm": 0.47265625, "learning_rate": 0.0001995106382857198, "loss": 0.0809, "step": 2604 }, { "epoch": 0.6425049309664694, "grad_norm": 0.482421875, "learning_rate": 0.00019950723091334186, "loss": 0.0822, "step": 2606 }, { "epoch": 0.6429980276134122, "grad_norm": 0.58984375, "learning_rate": 0.00019950381174880147, "loss": 0.0835, "step": 2608 }, { "epoch": 0.643491124260355, "grad_norm": 0.55859375, "learning_rate": 0.00019950038079250384, "loss": 0.0778, "step": 2610 }, { "epoch": 0.6439842209072978, "grad_norm": 0.515625, "learning_rate": 0.00019949693804485554, "loss": 0.0785, "step": 2612 }, { "epoch": 0.6444773175542406, "grad_norm": 0.478515625, "learning_rate": 0.00019949348350626456, "loss": 0.0837, "step": 2614 }, { "epoch": 0.6449704142011834, "grad_norm": 0.46484375, "learning_rate": 0.0001994900171771403, "loss": 0.0765, "step": 2616 }, { "epoch": 0.6454635108481263, "grad_norm": 0.484375, "learning_rate": 0.00019948653905789353, "loss": 0.0724, "step": 2618 }, { "epoch": 0.645956607495069, "grad_norm": 0.7578125, "learning_rate": 0.00019948304914893639, "loss": 0.0758, "step": 2620 }, { "epoch": 0.6464497041420119, "grad_norm": 0.546875, "learning_rate": 0.00019947954745068252, "loss": 0.0789, "step": 2622 }, { "epoch": 0.6469428007889546, "grad_norm": 0.58203125, "learning_rate": 0.00019947603396354682, "loss": 0.0834, "step": 2624 }, { "epoch": 0.6474358974358975, "grad_norm": 0.470703125, "learning_rate": 0.0001994725086879457, "loss": 0.0808, "step": 2626 }, { "epoch": 0.6479289940828402, "grad_norm": 0.51171875, "learning_rate": 0.00019946897162429695, "loss": 0.0804, "step": 2628 }, { "epoch": 0.6484220907297831, "grad_norm": 0.55078125, "learning_rate": 0.00019946542277301968, "loss": 0.0808, "step": 2630 }, { "epoch": 0.6489151873767258, "grad_norm": 0.58984375, "learning_rate": 0.00019946186213453445, "loss": 0.0754, "step": 2632 }, { "epoch": 0.6494082840236687, "grad_norm": 0.68359375, "learning_rate": 0.00019945828970926327, "loss": 0.0828, "step": 2634 }, { "epoch": 0.6499013806706114, "grad_norm": 0.58203125, "learning_rate": 0.00019945470549762942, "loss": 0.0869, "step": 2636 }, { "epoch": 0.6503944773175543, "grad_norm": 0.55078125, "learning_rate": 0.0001994511095000577, "loss": 0.0782, "step": 2638 }, { "epoch": 0.650887573964497, "grad_norm": 0.50390625, "learning_rate": 0.00019944750171697423, "loss": 0.0807, "step": 2640 }, { "epoch": 0.6513806706114399, "grad_norm": 0.482421875, "learning_rate": 0.00019944388214880653, "loss": 0.0821, "step": 2642 }, { "epoch": 0.6518737672583826, "grad_norm": 0.431640625, "learning_rate": 0.00019944025079598362, "loss": 0.0837, "step": 2644 }, { "epoch": 0.6523668639053254, "grad_norm": 0.59375, "learning_rate": 0.0001994366076589358, "loss": 0.0821, "step": 2646 }, { "epoch": 0.6528599605522682, "grad_norm": 0.64453125, "learning_rate": 0.00019943295273809475, "loss": 0.0886, "step": 2648 }, { "epoch": 0.653353057199211, "grad_norm": 0.6484375, "learning_rate": 0.00019942928603389366, "loss": 0.0819, "step": 2650 }, { "epoch": 0.6538461538461539, "grad_norm": 0.6328125, "learning_rate": 0.00019942560754676703, "loss": 0.0827, "step": 2652 }, { "epoch": 0.6543392504930966, "grad_norm": 0.5703125, "learning_rate": 0.00019942191727715075, "loss": 0.0773, "step": 2654 }, { "epoch": 0.6548323471400395, "grad_norm": 0.439453125, "learning_rate": 0.00019941821522548222, "loss": 0.0818, "step": 2656 }, { "epoch": 0.6553254437869822, "grad_norm": 0.6875, "learning_rate": 0.0001994145013922001, "loss": 0.0822, "step": 2658 }, { "epoch": 0.6558185404339251, "grad_norm": 0.490234375, "learning_rate": 0.0001994107757777445, "loss": 0.0815, "step": 2660 }, { "epoch": 0.6563116370808678, "grad_norm": 0.5078125, "learning_rate": 0.00019940703838255692, "loss": 0.0807, "step": 2662 }, { "epoch": 0.6568047337278107, "grad_norm": 0.68359375, "learning_rate": 0.00019940328920708028, "loss": 0.0856, "step": 2664 }, { "epoch": 0.6572978303747534, "grad_norm": 0.46875, "learning_rate": 0.00019939952825175888, "loss": 0.0797, "step": 2666 }, { "epoch": 0.6577909270216963, "grad_norm": 0.58203125, "learning_rate": 0.0001993957555170384, "loss": 0.0778, "step": 2668 }, { "epoch": 0.658284023668639, "grad_norm": 0.40234375, "learning_rate": 0.00019939197100336592, "loss": 0.0759, "step": 2670 }, { "epoch": 0.6587771203155819, "grad_norm": 0.515625, "learning_rate": 0.00019938817471118996, "loss": 0.0734, "step": 2672 }, { "epoch": 0.6592702169625246, "grad_norm": 0.46875, "learning_rate": 0.0001993843666409604, "loss": 0.0833, "step": 2674 }, { "epoch": 0.6597633136094675, "grad_norm": 0.498046875, "learning_rate": 0.00019938054679312844, "loss": 0.0792, "step": 2676 }, { "epoch": 0.6602564102564102, "grad_norm": 0.486328125, "learning_rate": 0.0001993767151681469, "loss": 0.0792, "step": 2678 }, { "epoch": 0.6607495069033531, "grad_norm": 0.56640625, "learning_rate": 0.0001993728717664697, "loss": 0.0856, "step": 2680 }, { "epoch": 0.6612426035502958, "grad_norm": 0.60546875, "learning_rate": 0.0001993690165885524, "loss": 0.081, "step": 2682 }, { "epoch": 0.6617357001972387, "grad_norm": 0.671875, "learning_rate": 0.00019936514963485183, "loss": 0.074, "step": 2684 }, { "epoch": 0.6622287968441815, "grad_norm": 0.640625, "learning_rate": 0.0001993612709058262, "loss": 0.0796, "step": 2686 }, { "epoch": 0.6627218934911243, "grad_norm": 0.52734375, "learning_rate": 0.00019935738040193526, "loss": 0.0777, "step": 2688 }, { "epoch": 0.6632149901380671, "grad_norm": 0.6015625, "learning_rate": 0.00019935347812363998, "loss": 0.0838, "step": 2690 }, { "epoch": 0.6637080867850098, "grad_norm": 0.8515625, "learning_rate": 0.00019934956407140285, "loss": 0.0854, "step": 2692 }, { "epoch": 0.6642011834319527, "grad_norm": 1.2109375, "learning_rate": 0.0001993456382456876, "loss": 0.0824, "step": 2694 }, { "epoch": 0.6646942800788954, "grad_norm": 1.2890625, "learning_rate": 0.00019934170064695963, "loss": 0.08, "step": 2696 }, { "epoch": 0.6651873767258383, "grad_norm": 0.703125, "learning_rate": 0.00019933775127568547, "loss": 0.0877, "step": 2698 }, { "epoch": 0.665680473372781, "grad_norm": 0.71875, "learning_rate": 0.00019933379013233312, "loss": 0.0816, "step": 2700 }, { "epoch": 0.6661735700197239, "grad_norm": 0.78125, "learning_rate": 0.00019932981721737202, "loss": 0.0971, "step": 2702 }, { "epoch": 0.6666666666666666, "grad_norm": 0.65234375, "learning_rate": 0.000199325832531273, "loss": 0.075, "step": 2704 }, { "epoch": 0.6671597633136095, "grad_norm": 0.76953125, "learning_rate": 0.0001993218360745083, "loss": 0.0827, "step": 2706 }, { "epoch": 0.6676528599605522, "grad_norm": 0.466796875, "learning_rate": 0.00019931782784755146, "loss": 0.0743, "step": 2708 }, { "epoch": 0.6681459566074951, "grad_norm": 0.63671875, "learning_rate": 0.0001993138078508775, "loss": 0.0861, "step": 2710 }, { "epoch": 0.6686390532544378, "grad_norm": 0.546875, "learning_rate": 0.0001993097760849628, "loss": 0.0726, "step": 2712 }, { "epoch": 0.6691321499013807, "grad_norm": 0.4609375, "learning_rate": 0.00019930573255028518, "loss": 0.0796, "step": 2714 }, { "epoch": 0.6696252465483234, "grad_norm": 0.494140625, "learning_rate": 0.00019930167724732377, "loss": 0.0779, "step": 2716 }, { "epoch": 0.6701183431952663, "grad_norm": 0.59375, "learning_rate": 0.0001992976101765592, "loss": 0.0779, "step": 2718 }, { "epoch": 0.6706114398422091, "grad_norm": 0.53515625, "learning_rate": 0.00019929353133847343, "loss": 0.0753, "step": 2720 }, { "epoch": 0.6711045364891519, "grad_norm": 1.0, "learning_rate": 0.00019928944073354982, "loss": 0.0921, "step": 2722 }, { "epoch": 0.6715976331360947, "grad_norm": 0.921875, "learning_rate": 0.00019928533836227307, "loss": 0.0875, "step": 2724 }, { "epoch": 0.6720907297830375, "grad_norm": 0.6640625, "learning_rate": 0.0001992812242251294, "loss": 0.0768, "step": 2726 }, { "epoch": 0.6725838264299803, "grad_norm": 0.82421875, "learning_rate": 0.0001992770983226064, "loss": 0.084, "step": 2728 }, { "epoch": 0.6730769230769231, "grad_norm": 0.62109375, "learning_rate": 0.00019927296065519289, "loss": 0.0807, "step": 2730 }, { "epoch": 0.6735700197238659, "grad_norm": 0.439453125, "learning_rate": 0.0001992688112233793, "loss": 0.0761, "step": 2732 }, { "epoch": 0.6740631163708086, "grad_norm": 0.55859375, "learning_rate": 0.00019926465002765732, "loss": 0.0834, "step": 2734 }, { "epoch": 0.6745562130177515, "grad_norm": 0.59375, "learning_rate": 0.00019926047706852012, "loss": 0.0825, "step": 2736 }, { "epoch": 0.6750493096646942, "grad_norm": 0.46484375, "learning_rate": 0.00019925629234646217, "loss": 0.0822, "step": 2738 }, { "epoch": 0.6755424063116371, "grad_norm": 0.44921875, "learning_rate": 0.0001992520958619794, "loss": 0.0788, "step": 2740 }, { "epoch": 0.6760355029585798, "grad_norm": 0.58203125, "learning_rate": 0.0001992478876155691, "loss": 0.0808, "step": 2742 }, { "epoch": 0.6765285996055227, "grad_norm": 0.546875, "learning_rate": 0.00019924366760773002, "loss": 0.072, "step": 2744 }, { "epoch": 0.6770216962524654, "grad_norm": 0.44140625, "learning_rate": 0.0001992394358389622, "loss": 0.0786, "step": 2746 }, { "epoch": 0.6775147928994083, "grad_norm": 0.5859375, "learning_rate": 0.00019923519230976715, "loss": 0.0833, "step": 2748 }, { "epoch": 0.678007889546351, "grad_norm": 0.380859375, "learning_rate": 0.00019923093702064776, "loss": 0.0807, "step": 2750 }, { "epoch": 0.6785009861932939, "grad_norm": 0.427734375, "learning_rate": 0.00019922666997210833, "loss": 0.0818, "step": 2752 }, { "epoch": 0.6789940828402367, "grad_norm": 0.6484375, "learning_rate": 0.00019922239116465447, "loss": 0.0845, "step": 2754 }, { "epoch": 0.6794871794871795, "grad_norm": 0.6484375, "learning_rate": 0.00019921810059879324, "loss": 0.0784, "step": 2756 }, { "epoch": 0.6799802761341223, "grad_norm": 0.72265625, "learning_rate": 0.00019921379827503316, "loss": 0.0837, "step": 2758 }, { "epoch": 0.6804733727810651, "grad_norm": 0.67578125, "learning_rate": 0.00019920948419388404, "loss": 0.0927, "step": 2760 }, { "epoch": 0.6809664694280079, "grad_norm": 0.453125, "learning_rate": 0.00019920515835585714, "loss": 0.0786, "step": 2762 }, { "epoch": 0.6814595660749507, "grad_norm": 0.80859375, "learning_rate": 0.00019920082076146508, "loss": 0.083, "step": 2764 }, { "epoch": 0.6819526627218935, "grad_norm": 0.578125, "learning_rate": 0.00019919647141122186, "loss": 0.0798, "step": 2766 }, { "epoch": 0.6824457593688363, "grad_norm": 0.6015625, "learning_rate": 0.00019919211030564298, "loss": 0.0786, "step": 2768 }, { "epoch": 0.6829388560157791, "grad_norm": 0.92578125, "learning_rate": 0.0001991877374452452, "loss": 0.0792, "step": 2770 }, { "epoch": 0.6834319526627219, "grad_norm": 0.5625, "learning_rate": 0.00019918335283054673, "loss": 0.0769, "step": 2772 }, { "epoch": 0.6839250493096647, "grad_norm": 0.640625, "learning_rate": 0.00019917895646206715, "loss": 0.0726, "step": 2774 }, { "epoch": 0.6844181459566075, "grad_norm": 0.6015625, "learning_rate": 0.0001991745483403275, "loss": 0.0834, "step": 2776 }, { "epoch": 0.6849112426035503, "grad_norm": 0.4609375, "learning_rate": 0.0001991701284658502, "loss": 0.0736, "step": 2778 }, { "epoch": 0.685404339250493, "grad_norm": 0.64453125, "learning_rate": 0.00019916569683915895, "loss": 0.076, "step": 2780 }, { "epoch": 0.6858974358974359, "grad_norm": 0.55078125, "learning_rate": 0.00019916125346077898, "loss": 0.0776, "step": 2782 }, { "epoch": 0.6863905325443787, "grad_norm": 0.52734375, "learning_rate": 0.00019915679833123676, "loss": 0.0731, "step": 2784 }, { "epoch": 0.6868836291913215, "grad_norm": 0.6796875, "learning_rate": 0.00019915233145106038, "loss": 0.0791, "step": 2786 }, { "epoch": 0.6873767258382643, "grad_norm": 0.6328125, "learning_rate": 0.0001991478528207791, "loss": 0.0797, "step": 2788 }, { "epoch": 0.6878698224852071, "grad_norm": 0.490234375, "learning_rate": 0.0001991433624409237, "loss": 0.0789, "step": 2790 }, { "epoch": 0.6883629191321499, "grad_norm": 0.625, "learning_rate": 0.0001991388603120263, "loss": 0.0784, "step": 2792 }, { "epoch": 0.6888560157790927, "grad_norm": 0.490234375, "learning_rate": 0.00019913434643462045, "loss": 0.0792, "step": 2794 }, { "epoch": 0.6893491124260355, "grad_norm": 0.59375, "learning_rate": 0.00019912982080924103, "loss": 0.0745, "step": 2796 }, { "epoch": 0.6898422090729783, "grad_norm": 0.67578125, "learning_rate": 0.00019912528343642437, "loss": 0.082, "step": 2798 }, { "epoch": 0.6903353057199211, "grad_norm": 0.5234375, "learning_rate": 0.00019912073431670822, "loss": 0.0706, "step": 2800 }, { "epoch": 0.6908284023668639, "grad_norm": 0.85546875, "learning_rate": 0.0001991161734506316, "loss": 0.0774, "step": 2802 }, { "epoch": 0.6913214990138067, "grad_norm": 0.640625, "learning_rate": 0.00019911160083873505, "loss": 0.0744, "step": 2804 }, { "epoch": 0.6918145956607495, "grad_norm": 0.486328125, "learning_rate": 0.0001991070164815604, "loss": 0.0797, "step": 2806 }, { "epoch": 0.6923076923076923, "grad_norm": 0.6015625, "learning_rate": 0.00019910242037965098, "loss": 0.0755, "step": 2808 }, { "epoch": 0.6928007889546351, "grad_norm": 0.6484375, "learning_rate": 0.00019909781253355142, "loss": 0.0758, "step": 2810 }, { "epoch": 0.6932938856015779, "grad_norm": 0.71484375, "learning_rate": 0.00019909319294380778, "loss": 0.088, "step": 2812 }, { "epoch": 0.6937869822485208, "grad_norm": 0.80078125, "learning_rate": 0.0001990885616109675, "loss": 0.0805, "step": 2814 }, { "epoch": 0.6942800788954635, "grad_norm": 0.56640625, "learning_rate": 0.00019908391853557946, "loss": 0.0784, "step": 2816 }, { "epoch": 0.6947731755424064, "grad_norm": 0.7890625, "learning_rate": 0.00019907926371819383, "loss": 0.0873, "step": 2818 }, { "epoch": 0.6952662721893491, "grad_norm": 0.91796875, "learning_rate": 0.00019907459715936228, "loss": 0.0805, "step": 2820 }, { "epoch": 0.6957593688362919, "grad_norm": 0.609375, "learning_rate": 0.0001990699188596378, "loss": 0.0784, "step": 2822 }, { "epoch": 0.6962524654832347, "grad_norm": 0.7578125, "learning_rate": 0.0001990652288195748, "loss": 0.0779, "step": 2824 }, { "epoch": 0.6967455621301775, "grad_norm": 0.79296875, "learning_rate": 0.0001990605270397291, "loss": 0.0843, "step": 2826 }, { "epoch": 0.6972386587771203, "grad_norm": 0.64453125, "learning_rate": 0.0001990558135206578, "loss": 0.0756, "step": 2828 }, { "epoch": 0.6977317554240631, "grad_norm": 0.5234375, "learning_rate": 0.0001990510882629196, "loss": 0.0751, "step": 2830 }, { "epoch": 0.6982248520710059, "grad_norm": 1.0703125, "learning_rate": 0.00019904635126707438, "loss": 0.0788, "step": 2832 }, { "epoch": 0.6987179487179487, "grad_norm": 0.69921875, "learning_rate": 0.00019904160253368354, "loss": 0.0777, "step": 2834 }, { "epoch": 0.6992110453648915, "grad_norm": 0.462890625, "learning_rate": 0.00019903684206330982, "loss": 0.0751, "step": 2836 }, { "epoch": 0.6997041420118343, "grad_norm": 0.44921875, "learning_rate": 0.00019903206985651738, "loss": 0.0772, "step": 2838 }, { "epoch": 0.7001972386587771, "grad_norm": 0.5859375, "learning_rate": 0.0001990272859138717, "loss": 0.0793, "step": 2840 }, { "epoch": 0.7006903353057199, "grad_norm": 0.96484375, "learning_rate": 0.0001990224902359398, "loss": 0.0829, "step": 2842 }, { "epoch": 0.7011834319526628, "grad_norm": 0.5078125, "learning_rate": 0.00019901768282328986, "loss": 0.0804, "step": 2844 }, { "epoch": 0.7016765285996055, "grad_norm": 0.71484375, "learning_rate": 0.00019901286367649172, "loss": 0.079, "step": 2846 }, { "epoch": 0.7021696252465484, "grad_norm": 0.71875, "learning_rate": 0.0001990080327961164, "loss": 0.0794, "step": 2848 }, { "epoch": 0.7026627218934911, "grad_norm": 0.80859375, "learning_rate": 0.0001990031901827364, "loss": 0.0822, "step": 2850 }, { "epoch": 0.703155818540434, "grad_norm": 0.60546875, "learning_rate": 0.00019899833583692563, "loss": 0.0777, "step": 2852 }, { "epoch": 0.7036489151873767, "grad_norm": 0.83984375, "learning_rate": 0.0001989934697592593, "loss": 0.0764, "step": 2854 }, { "epoch": 0.7041420118343196, "grad_norm": 0.5546875, "learning_rate": 0.00019898859195031408, "loss": 0.0804, "step": 2856 }, { "epoch": 0.7046351084812623, "grad_norm": 0.4609375, "learning_rate": 0.00019898370241066806, "loss": 0.0779, "step": 2858 }, { "epoch": 0.7051282051282052, "grad_norm": 0.52734375, "learning_rate": 0.00019897880114090065, "loss": 0.0744, "step": 2860 }, { "epoch": 0.7056213017751479, "grad_norm": 0.5078125, "learning_rate": 0.00019897388814159265, "loss": 0.0717, "step": 2862 }, { "epoch": 0.7061143984220908, "grad_norm": 0.609375, "learning_rate": 0.00019896896341332633, "loss": 0.0733, "step": 2864 }, { "epoch": 0.7066074950690335, "grad_norm": 0.58984375, "learning_rate": 0.00019896402695668527, "loss": 0.0765, "step": 2866 }, { "epoch": 0.7071005917159763, "grad_norm": 0.451171875, "learning_rate": 0.00019895907877225447, "loss": 0.0752, "step": 2868 }, { "epoch": 0.7075936883629191, "grad_norm": 0.61328125, "learning_rate": 0.00019895411886062031, "loss": 0.0739, "step": 2870 }, { "epoch": 0.7080867850098619, "grad_norm": 0.48828125, "learning_rate": 0.0001989491472223706, "loss": 0.0721, "step": 2872 }, { "epoch": 0.7085798816568047, "grad_norm": 0.59765625, "learning_rate": 0.00019894416385809444, "loss": 0.078, "step": 2874 }, { "epoch": 0.7090729783037475, "grad_norm": 0.53125, "learning_rate": 0.0001989391687683825, "loss": 0.0861, "step": 2876 }, { "epoch": 0.7095660749506904, "grad_norm": 0.9140625, "learning_rate": 0.0001989341619538266, "loss": 0.0793, "step": 2878 }, { "epoch": 0.7100591715976331, "grad_norm": 0.58203125, "learning_rate": 0.00019892914341502013, "loss": 0.0785, "step": 2880 }, { "epoch": 0.710552268244576, "grad_norm": 0.6015625, "learning_rate": 0.00019892411315255783, "loss": 0.081, "step": 2882 }, { "epoch": 0.7110453648915187, "grad_norm": 0.73046875, "learning_rate": 0.00019891907116703582, "loss": 0.0845, "step": 2884 }, { "epoch": 0.7115384615384616, "grad_norm": 0.43359375, "learning_rate": 0.00019891401745905152, "loss": 0.0762, "step": 2886 }, { "epoch": 0.7120315581854043, "grad_norm": 0.494140625, "learning_rate": 0.00019890895202920395, "loss": 0.0778, "step": 2888 }, { "epoch": 0.7125246548323472, "grad_norm": 0.54296875, "learning_rate": 0.0001989038748780933, "loss": 0.0727, "step": 2890 }, { "epoch": 0.7130177514792899, "grad_norm": 0.59765625, "learning_rate": 0.00019889878600632126, "loss": 0.0844, "step": 2892 }, { "epoch": 0.7135108481262328, "grad_norm": 0.44921875, "learning_rate": 0.0001988936854144909, "loss": 0.0816, "step": 2894 }, { "epoch": 0.7140039447731755, "grad_norm": 0.41796875, "learning_rate": 0.0001988885731032067, "loss": 0.0808, "step": 2896 }, { "epoch": 0.7144970414201184, "grad_norm": 0.734375, "learning_rate": 0.00019888344907307442, "loss": 0.0813, "step": 2898 }, { "epoch": 0.7149901380670611, "grad_norm": 0.68359375, "learning_rate": 0.00019887831332470138, "loss": 0.0823, "step": 2900 }, { "epoch": 0.715483234714004, "grad_norm": 0.55078125, "learning_rate": 0.0001988731658586961, "loss": 0.081, "step": 2902 }, { "epoch": 0.7159763313609467, "grad_norm": 0.478515625, "learning_rate": 0.00019886800667566864, "loss": 0.0805, "step": 2904 }, { "epoch": 0.7164694280078896, "grad_norm": 0.6484375, "learning_rate": 0.0001988628357762304, "loss": 0.0791, "step": 2906 }, { "epoch": 0.7169625246548323, "grad_norm": 0.447265625, "learning_rate": 0.0001988576531609941, "loss": 0.0666, "step": 2908 }, { "epoch": 0.7174556213017751, "grad_norm": 0.455078125, "learning_rate": 0.00019885245883057403, "loss": 0.0778, "step": 2910 }, { "epoch": 0.717948717948718, "grad_norm": 0.72265625, "learning_rate": 0.00019884725278558558, "loss": 0.08, "step": 2912 }, { "epoch": 0.7184418145956607, "grad_norm": 0.8125, "learning_rate": 0.00019884203502664583, "loss": 0.0796, "step": 2914 }, { "epoch": 0.7189349112426036, "grad_norm": 0.85546875, "learning_rate": 0.00019883680555437305, "loss": 0.0781, "step": 2916 }, { "epoch": 0.7194280078895463, "grad_norm": 0.62109375, "learning_rate": 0.00019883156436938698, "loss": 0.0781, "step": 2918 }, { "epoch": 0.7199211045364892, "grad_norm": 0.48828125, "learning_rate": 0.00019882631147230876, "loss": 0.0787, "step": 2920 }, { "epoch": 0.7204142011834319, "grad_norm": 0.6796875, "learning_rate": 0.0001988210468637608, "loss": 0.0884, "step": 2922 }, { "epoch": 0.7209072978303748, "grad_norm": 0.57421875, "learning_rate": 0.00019881577054436705, "loss": 0.076, "step": 2924 }, { "epoch": 0.7214003944773175, "grad_norm": 0.6171875, "learning_rate": 0.0001988104825147528, "loss": 0.0768, "step": 2926 }, { "epoch": 0.7218934911242604, "grad_norm": 0.490234375, "learning_rate": 0.00019880518277554468, "loss": 0.0783, "step": 2928 }, { "epoch": 0.7223865877712031, "grad_norm": 0.58203125, "learning_rate": 0.00019879987132737073, "loss": 0.0774, "step": 2930 }, { "epoch": 0.722879684418146, "grad_norm": 0.73046875, "learning_rate": 0.0001987945481708604, "loss": 0.0725, "step": 2932 }, { "epoch": 0.7233727810650887, "grad_norm": 0.4375, "learning_rate": 0.00019878921330664453, "loss": 0.0828, "step": 2934 }, { "epoch": 0.7238658777120316, "grad_norm": 0.8359375, "learning_rate": 0.0001987838667353553, "loss": 0.0778, "step": 2936 }, { "epoch": 0.7243589743589743, "grad_norm": 0.6328125, "learning_rate": 0.00019877850845762633, "loss": 0.075, "step": 2938 }, { "epoch": 0.7248520710059172, "grad_norm": 0.56640625, "learning_rate": 0.00019877313847409261, "loss": 0.083, "step": 2940 }, { "epoch": 0.72534516765286, "grad_norm": 0.458984375, "learning_rate": 0.0001987677567853905, "loss": 0.0787, "step": 2942 }, { "epoch": 0.7258382642998028, "grad_norm": 0.56640625, "learning_rate": 0.00019876236339215773, "loss": 0.0824, "step": 2944 }, { "epoch": 0.7263313609467456, "grad_norm": 0.42578125, "learning_rate": 0.0001987569582950335, "loss": 0.0777, "step": 2946 }, { "epoch": 0.7268244575936884, "grad_norm": 0.546875, "learning_rate": 0.00019875154149465833, "loss": 0.0786, "step": 2948 }, { "epoch": 0.7273175542406312, "grad_norm": 0.44140625, "learning_rate": 0.00019874611299167412, "loss": 0.0736, "step": 2950 }, { "epoch": 0.727810650887574, "grad_norm": 0.4921875, "learning_rate": 0.0001987406727867242, "loss": 0.0754, "step": 2952 }, { "epoch": 0.7283037475345168, "grad_norm": 0.419921875, "learning_rate": 0.0001987352208804533, "loss": 0.0764, "step": 2954 }, { "epoch": 0.7287968441814595, "grad_norm": 0.58984375, "learning_rate": 0.0001987297572735074, "loss": 0.0795, "step": 2956 }, { "epoch": 0.7292899408284024, "grad_norm": 0.3984375, "learning_rate": 0.00019872428196653403, "loss": 0.0767, "step": 2958 }, { "epoch": 0.7297830374753451, "grad_norm": 0.51171875, "learning_rate": 0.00019871879496018206, "loss": 0.0712, "step": 2960 }, { "epoch": 0.730276134122288, "grad_norm": 0.55078125, "learning_rate": 0.00019871329625510172, "loss": 0.0753, "step": 2962 }, { "epoch": 0.7307692307692307, "grad_norm": 0.498046875, "learning_rate": 0.00019870778585194461, "loss": 0.0739, "step": 2964 }, { "epoch": 0.7312623274161736, "grad_norm": 0.56640625, "learning_rate": 0.00019870226375136378, "loss": 0.0756, "step": 2966 }, { "epoch": 0.7317554240631163, "grad_norm": 0.4765625, "learning_rate": 0.0001986967299540136, "loss": 0.0747, "step": 2968 }, { "epoch": 0.7322485207100592, "grad_norm": 0.474609375, "learning_rate": 0.00019869118446054986, "loss": 0.0756, "step": 2970 }, { "epoch": 0.7327416173570019, "grad_norm": 0.703125, "learning_rate": 0.00019868562727162975, "loss": 0.081, "step": 2972 }, { "epoch": 0.7332347140039448, "grad_norm": 0.5625, "learning_rate": 0.00019868005838791184, "loss": 0.0706, "step": 2974 }, { "epoch": 0.7337278106508875, "grad_norm": 0.84375, "learning_rate": 0.00019867447781005604, "loss": 0.0789, "step": 2976 }, { "epoch": 0.7342209072978304, "grad_norm": 0.5546875, "learning_rate": 0.00019866888553872366, "loss": 0.0795, "step": 2978 }, { "epoch": 0.7347140039447732, "grad_norm": 0.87890625, "learning_rate": 0.00019866328157457748, "loss": 0.0777, "step": 2980 }, { "epoch": 0.735207100591716, "grad_norm": 0.49609375, "learning_rate": 0.00019865766591828157, "loss": 0.0812, "step": 2982 }, { "epoch": 0.7357001972386588, "grad_norm": 0.6171875, "learning_rate": 0.0001986520385705014, "loss": 0.0742, "step": 2984 }, { "epoch": 0.7361932938856016, "grad_norm": 0.8203125, "learning_rate": 0.00019864639953190387, "loss": 0.075, "step": 2986 }, { "epoch": 0.7366863905325444, "grad_norm": 0.72265625, "learning_rate": 0.0001986407488031572, "loss": 0.0854, "step": 2988 }, { "epoch": 0.7371794871794872, "grad_norm": 0.90625, "learning_rate": 0.0001986350863849311, "loss": 0.0714, "step": 2990 }, { "epoch": 0.73767258382643, "grad_norm": 0.51171875, "learning_rate": 0.00019862941227789653, "loss": 0.0722, "step": 2992 }, { "epoch": 0.7381656804733728, "grad_norm": 0.78125, "learning_rate": 0.0001986237264827259, "loss": 0.0778, "step": 2994 }, { "epoch": 0.7386587771203156, "grad_norm": 0.6015625, "learning_rate": 0.00019861802900009309, "loss": 0.0694, "step": 2996 }, { "epoch": 0.7391518737672583, "grad_norm": 0.42578125, "learning_rate": 0.0001986123198306732, "loss": 0.0775, "step": 2998 }, { "epoch": 0.7396449704142012, "grad_norm": 0.5703125, "learning_rate": 0.00019860659897514286, "loss": 0.0784, "step": 3000 }, { "epoch": 0.7401380670611439, "grad_norm": 0.439453125, "learning_rate": 0.00019860086643417994, "loss": 0.076, "step": 3002 }, { "epoch": 0.7406311637080868, "grad_norm": 0.97265625, "learning_rate": 0.00019859512220846387, "loss": 0.0745, "step": 3004 }, { "epoch": 0.7411242603550295, "grad_norm": 1.0546875, "learning_rate": 0.0001985893662986753, "loss": 0.0803, "step": 3006 }, { "epoch": 0.7416173570019724, "grad_norm": 0.5234375, "learning_rate": 0.0001985835987054964, "loss": 0.081, "step": 3008 }, { "epoch": 0.7421104536489151, "grad_norm": 0.71875, "learning_rate": 0.00019857781942961064, "loss": 0.0872, "step": 3010 }, { "epoch": 0.742603550295858, "grad_norm": 0.50390625, "learning_rate": 0.00019857202847170288, "loss": 0.0729, "step": 3012 }, { "epoch": 0.7430966469428008, "grad_norm": 0.58203125, "learning_rate": 0.00019856622583245938, "loss": 0.0741, "step": 3014 }, { "epoch": 0.7435897435897436, "grad_norm": 0.7265625, "learning_rate": 0.0001985604115125678, "loss": 0.0802, "step": 3016 }, { "epoch": 0.7440828402366864, "grad_norm": 0.51171875, "learning_rate": 0.0001985545855127172, "loss": 0.0777, "step": 3018 }, { "epoch": 0.7445759368836292, "grad_norm": 0.7421875, "learning_rate": 0.0001985487478335979, "loss": 0.0765, "step": 3020 }, { "epoch": 0.745069033530572, "grad_norm": 0.5078125, "learning_rate": 0.00019854289847590176, "loss": 0.069, "step": 3022 }, { "epoch": 0.7455621301775148, "grad_norm": 0.6953125, "learning_rate": 0.00019853703744032202, "loss": 0.0711, "step": 3024 }, { "epoch": 0.7460552268244576, "grad_norm": 0.5390625, "learning_rate": 0.00019853116472755313, "loss": 0.079, "step": 3026 }, { "epoch": 0.7465483234714004, "grad_norm": 0.478515625, "learning_rate": 0.0001985252803382911, "loss": 0.0663, "step": 3028 }, { "epoch": 0.7470414201183432, "grad_norm": 0.6484375, "learning_rate": 0.00019851938427323327, "loss": 0.0762, "step": 3030 }, { "epoch": 0.747534516765286, "grad_norm": 0.478515625, "learning_rate": 0.0001985134765330783, "loss": 0.0746, "step": 3032 }, { "epoch": 0.7480276134122288, "grad_norm": 0.482421875, "learning_rate": 0.0001985075571185264, "loss": 0.0771, "step": 3034 }, { "epoch": 0.7485207100591716, "grad_norm": 0.53125, "learning_rate": 0.00019850162603027896, "loss": 0.0779, "step": 3036 }, { "epoch": 0.7490138067061144, "grad_norm": 0.6953125, "learning_rate": 0.00019849568326903884, "loss": 0.0713, "step": 3038 }, { "epoch": 0.7495069033530573, "grad_norm": 0.453125, "learning_rate": 0.00019848972883551035, "loss": 0.0774, "step": 3040 }, { "epoch": 0.75, "grad_norm": 0.546875, "learning_rate": 0.00019848376273039908, "loss": 0.0802, "step": 3042 }, { "epoch": 0.7504930966469427, "grad_norm": 0.60546875, "learning_rate": 0.00019847778495441206, "loss": 0.077, "step": 3044 }, { "epoch": 0.7509861932938856, "grad_norm": 0.4375, "learning_rate": 0.00019847179550825774, "loss": 0.0797, "step": 3046 }, { "epoch": 0.7514792899408284, "grad_norm": 0.75390625, "learning_rate": 0.00019846579439264584, "loss": 0.0758, "step": 3048 }, { "epoch": 0.7519723865877712, "grad_norm": 0.453125, "learning_rate": 0.0001984597816082875, "loss": 0.0828, "step": 3050 }, { "epoch": 0.752465483234714, "grad_norm": 0.70703125, "learning_rate": 0.00019845375715589535, "loss": 0.0751, "step": 3052 }, { "epoch": 0.7529585798816568, "grad_norm": 0.69921875, "learning_rate": 0.00019844772103618327, "loss": 0.0802, "step": 3054 }, { "epoch": 0.7534516765285996, "grad_norm": 0.5234375, "learning_rate": 0.00019844167324986657, "loss": 0.0705, "step": 3056 }, { "epoch": 0.7539447731755424, "grad_norm": 0.6953125, "learning_rate": 0.000198435613797662, "loss": 0.0727, "step": 3058 }, { "epoch": 0.7544378698224852, "grad_norm": 0.57421875, "learning_rate": 0.00019842954268028758, "loss": 0.0735, "step": 3060 }, { "epoch": 0.754930966469428, "grad_norm": 0.55078125, "learning_rate": 0.00019842345989846277, "loss": 0.0762, "step": 3062 }, { "epoch": 0.7554240631163708, "grad_norm": 0.490234375, "learning_rate": 0.00019841736545290846, "loss": 0.0781, "step": 3064 }, { "epoch": 0.7559171597633136, "grad_norm": 0.515625, "learning_rate": 0.00019841125934434684, "loss": 0.0792, "step": 3066 }, { "epoch": 0.7564102564102564, "grad_norm": 0.6328125, "learning_rate": 0.00019840514157350153, "loss": 0.08, "step": 3068 }, { "epoch": 0.7569033530571992, "grad_norm": 0.435546875, "learning_rate": 0.00019839901214109754, "loss": 0.0717, "step": 3070 }, { "epoch": 0.757396449704142, "grad_norm": 0.5, "learning_rate": 0.0001983928710478612, "loss": 0.0808, "step": 3072 }, { "epoch": 0.7578895463510849, "grad_norm": 0.48828125, "learning_rate": 0.00019838671829452032, "loss": 0.0788, "step": 3074 }, { "epoch": 0.7583826429980276, "grad_norm": 0.48828125, "learning_rate": 0.00019838055388180396, "loss": 0.0773, "step": 3076 }, { "epoch": 0.7588757396449705, "grad_norm": 0.486328125, "learning_rate": 0.00019837437781044268, "loss": 0.0678, "step": 3078 }, { "epoch": 0.7593688362919132, "grad_norm": 0.515625, "learning_rate": 0.0001983681900811684, "loss": 0.0684, "step": 3080 }, { "epoch": 0.7598619329388561, "grad_norm": 0.482421875, "learning_rate": 0.00019836199069471437, "loss": 0.0761, "step": 3082 }, { "epoch": 0.7603550295857988, "grad_norm": 0.498046875, "learning_rate": 0.00019835577965181527, "loss": 0.075, "step": 3084 }, { "epoch": 0.7608481262327417, "grad_norm": 0.578125, "learning_rate": 0.00019834955695320709, "loss": 0.0764, "step": 3086 }, { "epoch": 0.7613412228796844, "grad_norm": 0.50390625, "learning_rate": 0.00019834332259962734, "loss": 0.0756, "step": 3088 }, { "epoch": 0.7618343195266272, "grad_norm": 0.45703125, "learning_rate": 0.00019833707659181477, "loss": 0.0708, "step": 3090 }, { "epoch": 0.76232741617357, "grad_norm": 0.4296875, "learning_rate": 0.00019833081893050957, "loss": 0.0788, "step": 3092 }, { "epoch": 0.7628205128205128, "grad_norm": 0.5, "learning_rate": 0.00019832454961645334, "loss": 0.0746, "step": 3094 }, { "epoch": 0.7633136094674556, "grad_norm": 0.609375, "learning_rate": 0.000198318268650389, "loss": 0.0777, "step": 3096 }, { "epoch": 0.7638067061143984, "grad_norm": 0.396484375, "learning_rate": 0.00019831197603306086, "loss": 0.0707, "step": 3098 }, { "epoch": 0.7642998027613412, "grad_norm": 0.69921875, "learning_rate": 0.0001983056717652147, "loss": 0.0781, "step": 3100 }, { "epoch": 0.764792899408284, "grad_norm": 1.0, "learning_rate": 0.00019829935584759754, "loss": 0.0767, "step": 3102 }, { "epoch": 0.7652859960552268, "grad_norm": 0.431640625, "learning_rate": 0.0001982930282809579, "loss": 0.0676, "step": 3104 }, { "epoch": 0.7657790927021696, "grad_norm": 0.6796875, "learning_rate": 0.00019828668906604556, "loss": 0.078, "step": 3106 }, { "epoch": 0.7662721893491125, "grad_norm": 0.5, "learning_rate": 0.00019828033820361185, "loss": 0.0797, "step": 3108 }, { "epoch": 0.7667652859960552, "grad_norm": 0.66015625, "learning_rate": 0.00019827397569440933, "loss": 0.066, "step": 3110 }, { "epoch": 0.7672583826429981, "grad_norm": 0.431640625, "learning_rate": 0.00019826760153919198, "loss": 0.0745, "step": 3112 }, { "epoch": 0.7677514792899408, "grad_norm": 0.78125, "learning_rate": 0.0001982612157387152, "loss": 0.076, "step": 3114 }, { "epoch": 0.7682445759368837, "grad_norm": 0.5703125, "learning_rate": 0.00019825481829373575, "loss": 0.0718, "step": 3116 }, { "epoch": 0.7687376725838264, "grad_norm": 0.64453125, "learning_rate": 0.00019824840920501174, "loss": 0.0726, "step": 3118 }, { "epoch": 0.7692307692307693, "grad_norm": 0.486328125, "learning_rate": 0.00019824198847330266, "loss": 0.0697, "step": 3120 }, { "epoch": 0.769723865877712, "grad_norm": 0.49609375, "learning_rate": 0.0001982355560993695, "loss": 0.0788, "step": 3122 }, { "epoch": 0.7702169625246549, "grad_norm": 0.5390625, "learning_rate": 0.00019822911208397441, "loss": 0.0712, "step": 3124 }, { "epoch": 0.7707100591715976, "grad_norm": 0.60546875, "learning_rate": 0.00019822265642788111, "loss": 0.0706, "step": 3126 }, { "epoch": 0.7712031558185405, "grad_norm": 0.6640625, "learning_rate": 0.00019821618913185463, "loss": 0.0752, "step": 3128 }, { "epoch": 0.7716962524654832, "grad_norm": 0.5234375, "learning_rate": 0.00019820971019666135, "loss": 0.0776, "step": 3130 }, { "epoch": 0.772189349112426, "grad_norm": 0.57421875, "learning_rate": 0.00019820321962306907, "loss": 0.0778, "step": 3132 }, { "epoch": 0.7726824457593688, "grad_norm": 0.59765625, "learning_rate": 0.000198196717411847, "loss": 0.0741, "step": 3134 }, { "epoch": 0.7731755424063116, "grad_norm": 0.62890625, "learning_rate": 0.00019819020356376564, "loss": 0.0738, "step": 3136 }, { "epoch": 0.7736686390532544, "grad_norm": 0.56640625, "learning_rate": 0.00019818367807959692, "loss": 0.0742, "step": 3138 }, { "epoch": 0.7741617357001972, "grad_norm": 0.53125, "learning_rate": 0.00019817714096011417, "loss": 0.0846, "step": 3140 }, { "epoch": 0.77465483234714, "grad_norm": 0.72265625, "learning_rate": 0.0001981705922060921, "loss": 0.0813, "step": 3142 }, { "epoch": 0.7751479289940828, "grad_norm": 0.56640625, "learning_rate": 0.00019816403181830667, "loss": 0.0763, "step": 3144 }, { "epoch": 0.7756410256410257, "grad_norm": 0.703125, "learning_rate": 0.00019815745979753543, "loss": 0.0786, "step": 3146 }, { "epoch": 0.7761341222879684, "grad_norm": 0.6875, "learning_rate": 0.00019815087614455718, "loss": 0.0764, "step": 3148 }, { "epoch": 0.7766272189349113, "grad_norm": 0.70703125, "learning_rate": 0.0001981442808601521, "loss": 0.0768, "step": 3150 }, { "epoch": 0.777120315581854, "grad_norm": 0.84765625, "learning_rate": 0.00019813767394510174, "loss": 0.075, "step": 3152 }, { "epoch": 0.7776134122287969, "grad_norm": 0.74609375, "learning_rate": 0.00019813105540018912, "loss": 0.0782, "step": 3154 }, { "epoch": 0.7781065088757396, "grad_norm": 0.58203125, "learning_rate": 0.0001981244252261985, "loss": 0.0763, "step": 3156 }, { "epoch": 0.7785996055226825, "grad_norm": 0.44921875, "learning_rate": 0.00019811778342391568, "loss": 0.0757, "step": 3158 }, { "epoch": 0.7790927021696252, "grad_norm": 0.5078125, "learning_rate": 0.00019811112999412766, "loss": 0.0736, "step": 3160 }, { "epoch": 0.7795857988165681, "grad_norm": 0.578125, "learning_rate": 0.00019810446493762301, "loss": 0.0772, "step": 3162 }, { "epoch": 0.7800788954635108, "grad_norm": 0.58984375, "learning_rate": 0.00019809778825519148, "loss": 0.074, "step": 3164 }, { "epoch": 0.7805719921104537, "grad_norm": 0.58984375, "learning_rate": 0.00019809109994762433, "loss": 0.0744, "step": 3166 }, { "epoch": 0.7810650887573964, "grad_norm": 0.44921875, "learning_rate": 0.00019808440001571417, "loss": 0.0724, "step": 3168 }, { "epoch": 0.7815581854043393, "grad_norm": 0.5234375, "learning_rate": 0.000198077688460255, "loss": 0.0761, "step": 3170 }, { "epoch": 0.782051282051282, "grad_norm": 0.5625, "learning_rate": 0.00019807096528204213, "loss": 0.0778, "step": 3172 }, { "epoch": 0.7825443786982249, "grad_norm": 0.482421875, "learning_rate": 0.00019806423048187232, "loss": 0.0788, "step": 3174 }, { "epoch": 0.7830374753451677, "grad_norm": 0.439453125, "learning_rate": 0.00019805748406054367, "loss": 0.0699, "step": 3176 }, { "epoch": 0.7835305719921104, "grad_norm": 0.5546875, "learning_rate": 0.00019805072601885567, "loss": 0.0826, "step": 3178 }, { "epoch": 0.7840236686390533, "grad_norm": 0.6015625, "learning_rate": 0.00019804395635760922, "loss": 0.0694, "step": 3180 }, { "epoch": 0.784516765285996, "grad_norm": 0.66015625, "learning_rate": 0.0001980371750776065, "loss": 0.0718, "step": 3182 }, { "epoch": 0.7850098619329389, "grad_norm": 0.48046875, "learning_rate": 0.0001980303821796512, "loss": 0.0762, "step": 3184 }, { "epoch": 0.7855029585798816, "grad_norm": 0.515625, "learning_rate": 0.00019802357766454827, "loss": 0.0686, "step": 3186 }, { "epoch": 0.7859960552268245, "grad_norm": 0.42578125, "learning_rate": 0.00019801676153310407, "loss": 0.0703, "step": 3188 }, { "epoch": 0.7864891518737672, "grad_norm": 0.4375, "learning_rate": 0.0001980099337861264, "loss": 0.0731, "step": 3190 }, { "epoch": 0.7869822485207101, "grad_norm": 0.5, "learning_rate": 0.00019800309442442434, "loss": 0.0741, "step": 3192 }, { "epoch": 0.7874753451676528, "grad_norm": 0.4296875, "learning_rate": 0.00019799624344880843, "loss": 0.0781, "step": 3194 }, { "epoch": 0.7879684418145957, "grad_norm": 0.546875, "learning_rate": 0.0001979893808600905, "loss": 0.0795, "step": 3196 }, { "epoch": 0.7884615384615384, "grad_norm": 0.439453125, "learning_rate": 0.00019798250665908386, "loss": 0.0816, "step": 3198 }, { "epoch": 0.7889546351084813, "grad_norm": 0.486328125, "learning_rate": 0.00019797562084660313, "loss": 0.0738, "step": 3200 }, { "epoch": 0.789447731755424, "grad_norm": 0.6953125, "learning_rate": 0.00019796872342346425, "loss": 0.0757, "step": 3202 }, { "epoch": 0.7899408284023669, "grad_norm": 0.494140625, "learning_rate": 0.0001979618143904847, "loss": 0.078, "step": 3204 }, { "epoch": 0.7904339250493096, "grad_norm": 0.40234375, "learning_rate": 0.0001979548937484832, "loss": 0.072, "step": 3206 }, { "epoch": 0.7909270216962525, "grad_norm": 0.6171875, "learning_rate": 0.00019794796149827986, "loss": 0.0769, "step": 3208 }, { "epoch": 0.7914201183431953, "grad_norm": 0.68359375, "learning_rate": 0.0001979410176406962, "loss": 0.0769, "step": 3210 }, { "epoch": 0.7919132149901381, "grad_norm": 0.53125, "learning_rate": 0.00019793406217655517, "loss": 0.0769, "step": 3212 }, { "epoch": 0.7924063116370809, "grad_norm": 0.75, "learning_rate": 0.00019792709510668094, "loss": 0.0731, "step": 3214 }, { "epoch": 0.7928994082840237, "grad_norm": 0.474609375, "learning_rate": 0.00019792011643189917, "loss": 0.0721, "step": 3216 }, { "epoch": 0.7933925049309665, "grad_norm": 0.76953125, "learning_rate": 0.0001979131261530369, "loss": 0.0725, "step": 3218 }, { "epoch": 0.7938856015779092, "grad_norm": 0.71484375, "learning_rate": 0.00019790612427092254, "loss": 0.0743, "step": 3220 }, { "epoch": 0.7943786982248521, "grad_norm": 0.75390625, "learning_rate": 0.00019789911078638576, "loss": 0.0719, "step": 3222 }, { "epoch": 0.7948717948717948, "grad_norm": 0.75390625, "learning_rate": 0.00019789208570025779, "loss": 0.0788, "step": 3224 }, { "epoch": 0.7953648915187377, "grad_norm": 0.48828125, "learning_rate": 0.0001978850490133711, "loss": 0.0746, "step": 3226 }, { "epoch": 0.7958579881656804, "grad_norm": 0.50390625, "learning_rate": 0.00019787800072655956, "loss": 0.0774, "step": 3228 }, { "epoch": 0.7963510848126233, "grad_norm": 0.41796875, "learning_rate": 0.0001978709408406585, "loss": 0.0777, "step": 3230 }, { "epoch": 0.796844181459566, "grad_norm": 0.70703125, "learning_rate": 0.00019786386935650448, "loss": 0.0771, "step": 3232 }, { "epoch": 0.7973372781065089, "grad_norm": 0.396484375, "learning_rate": 0.0001978567862749355, "loss": 0.0764, "step": 3234 }, { "epoch": 0.7978303747534516, "grad_norm": 0.74609375, "learning_rate": 0.00019784969159679106, "loss": 0.0775, "step": 3236 }, { "epoch": 0.7983234714003945, "grad_norm": 0.578125, "learning_rate": 0.0001978425853229118, "loss": 0.0782, "step": 3238 }, { "epoch": 0.7988165680473372, "grad_norm": 0.482421875, "learning_rate": 0.0001978354674541399, "loss": 0.0734, "step": 3240 }, { "epoch": 0.7993096646942801, "grad_norm": 0.609375, "learning_rate": 0.00019782833799131887, "loss": 0.0768, "step": 3242 }, { "epoch": 0.7998027613412229, "grad_norm": 0.53125, "learning_rate": 0.00019782119693529358, "loss": 0.0724, "step": 3244 }, { "epoch": 0.8002958579881657, "grad_norm": 0.4921875, "learning_rate": 0.00019781404428691028, "loss": 0.0763, "step": 3246 }, { "epoch": 0.8007889546351085, "grad_norm": 0.75, "learning_rate": 0.00019780688004701662, "loss": 0.0803, "step": 3248 }, { "epoch": 0.8012820512820513, "grad_norm": 0.87890625, "learning_rate": 0.0001977997042164616, "loss": 0.08, "step": 3250 }, { "epoch": 0.8017751479289941, "grad_norm": 0.46875, "learning_rate": 0.00019779251679609558, "loss": 0.0739, "step": 3252 }, { "epoch": 0.8022682445759369, "grad_norm": 0.59765625, "learning_rate": 0.0001977853177867703, "loss": 0.073, "step": 3254 }, { "epoch": 0.8027613412228797, "grad_norm": 0.4140625, "learning_rate": 0.00019777810718933888, "loss": 0.0704, "step": 3256 }, { "epoch": 0.8032544378698225, "grad_norm": 0.671875, "learning_rate": 0.00019777088500465588, "loss": 0.08, "step": 3258 }, { "epoch": 0.8037475345167653, "grad_norm": 0.61328125, "learning_rate": 0.00019776365123357713, "loss": 0.0704, "step": 3260 }, { "epoch": 0.8042406311637081, "grad_norm": 0.435546875, "learning_rate": 0.00019775640587695983, "loss": 0.0729, "step": 3262 }, { "epoch": 0.8047337278106509, "grad_norm": 0.625, "learning_rate": 0.00019774914893566265, "loss": 0.0691, "step": 3264 }, { "epoch": 0.8052268244575936, "grad_norm": 0.4296875, "learning_rate": 0.00019774188041054555, "loss": 0.0759, "step": 3266 }, { "epoch": 0.8057199211045365, "grad_norm": 0.3828125, "learning_rate": 0.0001977346003024699, "loss": 0.0804, "step": 3268 }, { "epoch": 0.8062130177514792, "grad_norm": 0.470703125, "learning_rate": 0.00019772730861229848, "loss": 0.0771, "step": 3270 }, { "epoch": 0.8067061143984221, "grad_norm": 0.56640625, "learning_rate": 0.00019772000534089529, "loss": 0.0768, "step": 3272 }, { "epoch": 0.8071992110453649, "grad_norm": 0.58203125, "learning_rate": 0.00019771269048912587, "loss": 0.0743, "step": 3274 }, { "epoch": 0.8076923076923077, "grad_norm": 0.57421875, "learning_rate": 0.0001977053640578571, "loss": 0.078, "step": 3276 }, { "epoch": 0.8081854043392505, "grad_norm": 0.6875, "learning_rate": 0.00019769802604795717, "loss": 0.0707, "step": 3278 }, { "epoch": 0.8086785009861933, "grad_norm": 0.73046875, "learning_rate": 0.00019769067646029567, "loss": 0.0739, "step": 3280 }, { "epoch": 0.8091715976331361, "grad_norm": 0.45703125, "learning_rate": 0.00019768331529574357, "loss": 0.0757, "step": 3282 }, { "epoch": 0.8096646942800789, "grad_norm": 0.875, "learning_rate": 0.0001976759425551732, "loss": 0.0747, "step": 3284 }, { "epoch": 0.8101577909270217, "grad_norm": 0.482421875, "learning_rate": 0.0001976685582394583, "loss": 0.0719, "step": 3286 }, { "epoch": 0.8106508875739645, "grad_norm": 0.5, "learning_rate": 0.00019766116234947397, "loss": 0.0763, "step": 3288 }, { "epoch": 0.8111439842209073, "grad_norm": 0.439453125, "learning_rate": 0.00019765375488609658, "loss": 0.0731, "step": 3290 }, { "epoch": 0.8116370808678501, "grad_norm": 0.6796875, "learning_rate": 0.00019764633585020404, "loss": 0.0733, "step": 3292 }, { "epoch": 0.8121301775147929, "grad_norm": 0.458984375, "learning_rate": 0.0001976389052426755, "loss": 0.0719, "step": 3294 }, { "epoch": 0.8126232741617357, "grad_norm": 0.53125, "learning_rate": 0.00019763146306439152, "loss": 0.0692, "step": 3296 }, { "epoch": 0.8131163708086785, "grad_norm": 0.4375, "learning_rate": 0.00019762400931623412, "loss": 0.0745, "step": 3298 }, { "epoch": 0.8136094674556213, "grad_norm": 0.490234375, "learning_rate": 0.00019761654399908651, "loss": 0.0731, "step": 3300 }, { "epoch": 0.8141025641025641, "grad_norm": 0.58203125, "learning_rate": 0.00019760906711383344, "loss": 0.0732, "step": 3302 }, { "epoch": 0.814595660749507, "grad_norm": 0.46875, "learning_rate": 0.00019760157866136093, "loss": 0.0789, "step": 3304 }, { "epoch": 0.8150887573964497, "grad_norm": 0.5859375, "learning_rate": 0.00019759407864255638, "loss": 0.0796, "step": 3306 }, { "epoch": 0.8155818540433925, "grad_norm": 0.484375, "learning_rate": 0.00019758656705830865, "loss": 0.0696, "step": 3308 }, { "epoch": 0.8160749506903353, "grad_norm": 0.609375, "learning_rate": 0.00019757904390950787, "loss": 0.0791, "step": 3310 }, { "epoch": 0.8165680473372781, "grad_norm": 0.5625, "learning_rate": 0.00019757150919704554, "loss": 0.0745, "step": 3312 }, { "epoch": 0.8170611439842209, "grad_norm": 0.435546875, "learning_rate": 0.00019756396292181462, "loss": 0.0726, "step": 3314 }, { "epoch": 0.8175542406311637, "grad_norm": 0.58984375, "learning_rate": 0.00019755640508470942, "loss": 0.0748, "step": 3316 }, { "epoch": 0.8180473372781065, "grad_norm": 0.58203125, "learning_rate": 0.00019754883568662545, "loss": 0.0776, "step": 3318 }, { "epoch": 0.8185404339250493, "grad_norm": 0.65234375, "learning_rate": 0.00019754125472845987, "loss": 0.0742, "step": 3320 }, { "epoch": 0.8190335305719921, "grad_norm": 0.87109375, "learning_rate": 0.00019753366221111096, "loss": 0.077, "step": 3322 }, { "epoch": 0.8195266272189349, "grad_norm": 0.435546875, "learning_rate": 0.00019752605813547853, "loss": 0.0729, "step": 3324 }, { "epoch": 0.8200197238658777, "grad_norm": 0.43359375, "learning_rate": 0.0001975184425024637, "loss": 0.0765, "step": 3326 }, { "epoch": 0.8205128205128205, "grad_norm": 0.56640625, "learning_rate": 0.00019751081531296896, "loss": 0.0717, "step": 3328 }, { "epoch": 0.8210059171597633, "grad_norm": 0.640625, "learning_rate": 0.00019750317656789818, "loss": 0.0828, "step": 3330 }, { "epoch": 0.8214990138067061, "grad_norm": 0.482421875, "learning_rate": 0.00019749552626815658, "loss": 0.0685, "step": 3332 }, { "epoch": 0.821992110453649, "grad_norm": 0.55078125, "learning_rate": 0.0001974878644146508, "loss": 0.0774, "step": 3334 }, { "epoch": 0.8224852071005917, "grad_norm": 0.5390625, "learning_rate": 0.00019748019100828876, "loss": 0.0783, "step": 3336 }, { "epoch": 0.8229783037475346, "grad_norm": 0.435546875, "learning_rate": 0.00019747250604997983, "loss": 0.0708, "step": 3338 }, { "epoch": 0.8234714003944773, "grad_norm": 0.44140625, "learning_rate": 0.0001974648095406347, "loss": 0.0743, "step": 3340 }, { "epoch": 0.8239644970414202, "grad_norm": 0.56640625, "learning_rate": 0.00019745710148116546, "loss": 0.0735, "step": 3342 }, { "epoch": 0.8244575936883629, "grad_norm": 0.54296875, "learning_rate": 0.0001974493818724856, "loss": 0.0777, "step": 3344 }, { "epoch": 0.8249506903353058, "grad_norm": 0.423828125, "learning_rate": 0.00019744165071550987, "loss": 0.0802, "step": 3346 }, { "epoch": 0.8254437869822485, "grad_norm": 0.4921875, "learning_rate": 0.0001974339080111545, "loss": 0.0743, "step": 3348 }, { "epoch": 0.8259368836291914, "grad_norm": 0.52734375, "learning_rate": 0.00019742615376033707, "loss": 0.0759, "step": 3350 }, { "epoch": 0.8264299802761341, "grad_norm": 0.50390625, "learning_rate": 0.0001974183879639764, "loss": 0.0779, "step": 3352 }, { "epoch": 0.8269230769230769, "grad_norm": 0.439453125, "learning_rate": 0.00019741061062299284, "loss": 0.0713, "step": 3354 }, { "epoch": 0.8274161735700197, "grad_norm": 0.474609375, "learning_rate": 0.00019740282173830807, "loss": 0.0666, "step": 3356 }, { "epoch": 0.8279092702169625, "grad_norm": 0.37109375, "learning_rate": 0.00019739502131084513, "loss": 0.0695, "step": 3358 }, { "epoch": 0.8284023668639053, "grad_norm": 0.478515625, "learning_rate": 0.0001973872093415283, "loss": 0.0704, "step": 3360 }, { "epoch": 0.8288954635108481, "grad_norm": 0.6328125, "learning_rate": 0.0001973793858312835, "loss": 0.0806, "step": 3362 }, { "epoch": 0.8293885601577909, "grad_norm": 0.8828125, "learning_rate": 0.00019737155078103778, "loss": 0.0735, "step": 3364 }, { "epoch": 0.8298816568047337, "grad_norm": 0.365234375, "learning_rate": 0.0001973637041917196, "loss": 0.0696, "step": 3366 }, { "epoch": 0.8303747534516766, "grad_norm": 0.62890625, "learning_rate": 0.00019735584606425888, "loss": 0.0763, "step": 3368 }, { "epoch": 0.8308678500986193, "grad_norm": 0.6875, "learning_rate": 0.0001973479763995868, "loss": 0.0743, "step": 3370 }, { "epoch": 0.8313609467455622, "grad_norm": 0.5703125, "learning_rate": 0.00019734009519863603, "loss": 0.0751, "step": 3372 }, { "epoch": 0.8318540433925049, "grad_norm": 0.380859375, "learning_rate": 0.00019733220246234053, "loss": 0.0688, "step": 3374 }, { "epoch": 0.8323471400394478, "grad_norm": 0.53125, "learning_rate": 0.00019732429819163555, "loss": 0.0832, "step": 3376 }, { "epoch": 0.8328402366863905, "grad_norm": 0.58203125, "learning_rate": 0.00019731638238745789, "loss": 0.0688, "step": 3378 }, { "epoch": 0.8333333333333334, "grad_norm": 0.609375, "learning_rate": 0.00019730845505074553, "loss": 0.0731, "step": 3380 }, { "epoch": 0.8338264299802761, "grad_norm": 0.57421875, "learning_rate": 0.000197300516182438, "loss": 0.0754, "step": 3382 }, { "epoch": 0.834319526627219, "grad_norm": 0.61328125, "learning_rate": 0.00019729256578347599, "loss": 0.0721, "step": 3384 }, { "epoch": 0.8348126232741617, "grad_norm": 0.5, "learning_rate": 0.00019728460385480174, "loss": 0.0757, "step": 3386 }, { "epoch": 0.8353057199211046, "grad_norm": 0.6484375, "learning_rate": 0.00019727663039735877, "loss": 0.08, "step": 3388 }, { "epoch": 0.8357988165680473, "grad_norm": 1.03125, "learning_rate": 0.00019726864541209197, "loss": 0.0768, "step": 3390 }, { "epoch": 0.8362919132149902, "grad_norm": 0.359375, "learning_rate": 0.00019726064889994763, "loss": 0.0755, "step": 3392 }, { "epoch": 0.8367850098619329, "grad_norm": 0.85546875, "learning_rate": 0.00019725264086187334, "loss": 0.0778, "step": 3394 }, { "epoch": 0.8372781065088757, "grad_norm": 0.51953125, "learning_rate": 0.00019724462129881812, "loss": 0.0756, "step": 3396 }, { "epoch": 0.8377712031558185, "grad_norm": 1.015625, "learning_rate": 0.00019723659021173235, "loss": 0.0712, "step": 3398 }, { "epoch": 0.8382642998027613, "grad_norm": 0.921875, "learning_rate": 0.0001972285476015677, "loss": 0.0799, "step": 3400 }, { "epoch": 0.8387573964497042, "grad_norm": 0.5078125, "learning_rate": 0.00019722049346927736, "loss": 0.0737, "step": 3402 }, { "epoch": 0.8392504930966469, "grad_norm": 0.890625, "learning_rate": 0.0001972124278158157, "loss": 0.0777, "step": 3404 }, { "epoch": 0.8397435897435898, "grad_norm": 0.67578125, "learning_rate": 0.0001972043506421386, "loss": 0.0757, "step": 3406 }, { "epoch": 0.8402366863905325, "grad_norm": 0.61328125, "learning_rate": 0.0001971962619492032, "loss": 0.0716, "step": 3408 }, { "epoch": 0.8407297830374754, "grad_norm": 0.55078125, "learning_rate": 0.0001971881617379681, "loss": 0.0715, "step": 3410 }, { "epoch": 0.8412228796844181, "grad_norm": 0.49609375, "learning_rate": 0.0001971800500093932, "loss": 0.0786, "step": 3412 }, { "epoch": 0.841715976331361, "grad_norm": 0.5703125, "learning_rate": 0.0001971719267644398, "loss": 0.0712, "step": 3414 }, { "epoch": 0.8422090729783037, "grad_norm": 0.51171875, "learning_rate": 0.00019716379200407054, "loss": 0.0729, "step": 3416 }, { "epoch": 0.8427021696252466, "grad_norm": 0.73828125, "learning_rate": 0.00019715564572924941, "loss": 0.0804, "step": 3418 }, { "epoch": 0.8431952662721893, "grad_norm": 0.4140625, "learning_rate": 0.00019714748794094186, "loss": 0.0728, "step": 3420 }, { "epoch": 0.8436883629191322, "grad_norm": 0.427734375, "learning_rate": 0.00019713931864011452, "loss": 0.0727, "step": 3422 }, { "epoch": 0.8441814595660749, "grad_norm": 0.45703125, "learning_rate": 0.00019713113782773563, "loss": 0.0698, "step": 3424 }, { "epoch": 0.8446745562130178, "grad_norm": 0.52734375, "learning_rate": 0.00019712294550477456, "loss": 0.0732, "step": 3426 }, { "epoch": 0.8451676528599605, "grad_norm": 0.51953125, "learning_rate": 0.0001971147416722022, "loss": 0.0696, "step": 3428 }, { "epoch": 0.8456607495069034, "grad_norm": 0.609375, "learning_rate": 0.0001971065263309907, "loss": 0.0728, "step": 3430 }, { "epoch": 0.8461538461538461, "grad_norm": 0.412109375, "learning_rate": 0.00019709829948211368, "loss": 0.0702, "step": 3432 }, { "epoch": 0.846646942800789, "grad_norm": 0.6484375, "learning_rate": 0.00019709006112654604, "loss": 0.0766, "step": 3434 }, { "epoch": 0.8471400394477318, "grad_norm": 0.43359375, "learning_rate": 0.0001970818112652641, "loss": 0.0717, "step": 3436 }, { "epoch": 0.8476331360946746, "grad_norm": 0.55078125, "learning_rate": 0.00019707354989924547, "loss": 0.0686, "step": 3438 }, { "epoch": 0.8481262327416174, "grad_norm": 0.43359375, "learning_rate": 0.00019706527702946917, "loss": 0.0753, "step": 3440 }, { "epoch": 0.8486193293885601, "grad_norm": 0.478515625, "learning_rate": 0.00019705699265691563, "loss": 0.0719, "step": 3442 }, { "epoch": 0.849112426035503, "grad_norm": 0.6484375, "learning_rate": 0.00019704869678256656, "loss": 0.0755, "step": 3444 }, { "epoch": 0.8496055226824457, "grad_norm": 0.7109375, "learning_rate": 0.00019704038940740505, "loss": 0.0732, "step": 3446 }, { "epoch": 0.8500986193293886, "grad_norm": 0.40625, "learning_rate": 0.00019703207053241558, "loss": 0.072, "step": 3448 }, { "epoch": 0.8505917159763313, "grad_norm": 0.47265625, "learning_rate": 0.00019702374015858403, "loss": 0.0702, "step": 3450 }, { "epoch": 0.8510848126232742, "grad_norm": 0.5859375, "learning_rate": 0.00019701539828689753, "loss": 0.0747, "step": 3452 }, { "epoch": 0.8515779092702169, "grad_norm": 0.78515625, "learning_rate": 0.0001970070449183447, "loss": 0.0798, "step": 3454 }, { "epoch": 0.8520710059171598, "grad_norm": 0.451171875, "learning_rate": 0.00019699868005391543, "loss": 0.0713, "step": 3456 }, { "epoch": 0.8525641025641025, "grad_norm": 0.83203125, "learning_rate": 0.00019699030369460096, "loss": 0.0765, "step": 3458 }, { "epoch": 0.8530571992110454, "grad_norm": 0.431640625, "learning_rate": 0.000196981915841394, "loss": 0.0658, "step": 3460 }, { "epoch": 0.8535502958579881, "grad_norm": 0.609375, "learning_rate": 0.00019697351649528854, "loss": 0.0698, "step": 3462 }, { "epoch": 0.854043392504931, "grad_norm": 0.546875, "learning_rate": 0.00019696510565727994, "loss": 0.0796, "step": 3464 }, { "epoch": 0.8545364891518737, "grad_norm": 0.69921875, "learning_rate": 0.00019695668332836495, "loss": 0.0836, "step": 3466 }, { "epoch": 0.8550295857988166, "grad_norm": 0.66796875, "learning_rate": 0.00019694824950954165, "loss": 0.0745, "step": 3468 }, { "epoch": 0.8555226824457594, "grad_norm": 0.6171875, "learning_rate": 0.00019693980420180945, "loss": 0.0778, "step": 3470 }, { "epoch": 0.8560157790927022, "grad_norm": 0.6640625, "learning_rate": 0.00019693134740616925, "loss": 0.0807, "step": 3472 }, { "epoch": 0.856508875739645, "grad_norm": 0.58984375, "learning_rate": 0.0001969228791236232, "loss": 0.0705, "step": 3474 }, { "epoch": 0.8570019723865878, "grad_norm": 0.486328125, "learning_rate": 0.0001969143993551748, "loss": 0.0719, "step": 3476 }, { "epoch": 0.8574950690335306, "grad_norm": 0.6796875, "learning_rate": 0.00019690590810182896, "loss": 0.0819, "step": 3478 }, { "epoch": 0.8579881656804734, "grad_norm": 0.490234375, "learning_rate": 0.00019689740536459199, "loss": 0.075, "step": 3480 }, { "epoch": 0.8584812623274162, "grad_norm": 0.80078125, "learning_rate": 0.00019688889114447146, "loss": 0.0724, "step": 3482 }, { "epoch": 0.8589743589743589, "grad_norm": 0.8203125, "learning_rate": 0.00019688036544247635, "loss": 0.0671, "step": 3484 }, { "epoch": 0.8594674556213018, "grad_norm": 0.41015625, "learning_rate": 0.00019687182825961705, "loss": 0.0712, "step": 3486 }, { "epoch": 0.8599605522682445, "grad_norm": 0.44140625, "learning_rate": 0.00019686327959690526, "loss": 0.0724, "step": 3488 }, { "epoch": 0.8604536489151874, "grad_norm": 0.419921875, "learning_rate": 0.00019685471945535399, "loss": 0.0738, "step": 3490 }, { "epoch": 0.8609467455621301, "grad_norm": 0.4609375, "learning_rate": 0.0001968461478359777, "loss": 0.0752, "step": 3492 }, { "epoch": 0.861439842209073, "grad_norm": 0.4140625, "learning_rate": 0.00019683756473979217, "loss": 0.0636, "step": 3494 }, { "epoch": 0.8619329388560157, "grad_norm": 0.65625, "learning_rate": 0.0001968289701678146, "loss": 0.0711, "step": 3496 }, { "epoch": 0.8624260355029586, "grad_norm": 0.58984375, "learning_rate": 0.00019682036412106335, "loss": 0.0714, "step": 3498 }, { "epoch": 0.8629191321499013, "grad_norm": 0.703125, "learning_rate": 0.00019681174660055845, "loss": 0.0765, "step": 3500 }, { "epoch": 0.8634122287968442, "grad_norm": 0.5625, "learning_rate": 0.00019680311760732102, "loss": 0.0661, "step": 3502 }, { "epoch": 0.863905325443787, "grad_norm": 0.5703125, "learning_rate": 0.00019679447714237368, "loss": 0.07, "step": 3504 }, { "epoch": 0.8643984220907298, "grad_norm": 0.484375, "learning_rate": 0.00019678582520674036, "loss": 0.0786, "step": 3506 }, { "epoch": 0.8648915187376726, "grad_norm": 0.46484375, "learning_rate": 0.00019677716180144638, "loss": 0.0708, "step": 3508 }, { "epoch": 0.8653846153846154, "grad_norm": 0.7265625, "learning_rate": 0.00019676848692751838, "loss": 0.072, "step": 3510 }, { "epoch": 0.8658777120315582, "grad_norm": 0.6171875, "learning_rate": 0.0001967598005859844, "loss": 0.0761, "step": 3512 }, { "epoch": 0.866370808678501, "grad_norm": 0.65625, "learning_rate": 0.0001967511027778738, "loss": 0.0755, "step": 3514 }, { "epoch": 0.8668639053254438, "grad_norm": 0.8671875, "learning_rate": 0.00019674239350421735, "loss": 0.0731, "step": 3516 }, { "epoch": 0.8673570019723866, "grad_norm": 0.765625, "learning_rate": 0.0001967336727660471, "loss": 0.0691, "step": 3518 }, { "epoch": 0.8678500986193294, "grad_norm": 0.55859375, "learning_rate": 0.00019672494056439659, "loss": 0.0739, "step": 3520 }, { "epoch": 0.8683431952662722, "grad_norm": 0.494140625, "learning_rate": 0.00019671619690030055, "loss": 0.0788, "step": 3522 }, { "epoch": 0.868836291913215, "grad_norm": 0.458984375, "learning_rate": 0.00019670744177479517, "loss": 0.0668, "step": 3524 }, { "epoch": 0.8693293885601578, "grad_norm": 0.76953125, "learning_rate": 0.00019669867518891802, "loss": 0.0805, "step": 3526 }, { "epoch": 0.8698224852071006, "grad_norm": 0.443359375, "learning_rate": 0.00019668989714370793, "loss": 0.0747, "step": 3528 }, { "epoch": 0.8703155818540433, "grad_norm": 0.55859375, "learning_rate": 0.00019668110764020522, "loss": 0.0656, "step": 3530 }, { "epoch": 0.8708086785009862, "grad_norm": 0.58203125, "learning_rate": 0.00019667230667945145, "loss": 0.0782, "step": 3532 }, { "epoch": 0.871301775147929, "grad_norm": 0.921875, "learning_rate": 0.00019666349426248955, "loss": 0.0786, "step": 3534 }, { "epoch": 0.8717948717948718, "grad_norm": 0.5234375, "learning_rate": 0.00019665467039036393, "loss": 0.0728, "step": 3536 }, { "epoch": 0.8722879684418146, "grad_norm": 0.494140625, "learning_rate": 0.0001966458350641202, "loss": 0.0662, "step": 3538 }, { "epoch": 0.8727810650887574, "grad_norm": 0.474609375, "learning_rate": 0.0001966369882848054, "loss": 0.0676, "step": 3540 }, { "epoch": 0.8732741617357002, "grad_norm": 0.5625, "learning_rate": 0.00019662813005346793, "loss": 0.0747, "step": 3542 }, { "epoch": 0.873767258382643, "grad_norm": 0.48828125, "learning_rate": 0.00019661926037115756, "loss": 0.0726, "step": 3544 }, { "epoch": 0.8742603550295858, "grad_norm": 0.515625, "learning_rate": 0.0001966103792389254, "loss": 0.0711, "step": 3546 }, { "epoch": 0.8747534516765286, "grad_norm": 0.41796875, "learning_rate": 0.00019660148665782387, "loss": 0.0693, "step": 3548 }, { "epoch": 0.8752465483234714, "grad_norm": 0.486328125, "learning_rate": 0.00019659258262890683, "loss": 0.0761, "step": 3550 }, { "epoch": 0.8757396449704142, "grad_norm": 0.482421875, "learning_rate": 0.00019658366715322946, "loss": 0.0786, "step": 3552 }, { "epoch": 0.876232741617357, "grad_norm": 0.484375, "learning_rate": 0.00019657474023184826, "loss": 0.0727, "step": 3554 }, { "epoch": 0.8767258382642998, "grad_norm": 0.5078125, "learning_rate": 0.00019656580186582114, "loss": 0.0679, "step": 3556 }, { "epoch": 0.8772189349112426, "grad_norm": 0.40625, "learning_rate": 0.00019655685205620732, "loss": 0.0741, "step": 3558 }, { "epoch": 0.8777120315581854, "grad_norm": 0.427734375, "learning_rate": 0.0001965478908040675, "loss": 0.0703, "step": 3560 }, { "epoch": 0.8782051282051282, "grad_norm": 0.57421875, "learning_rate": 0.0001965389181104635, "loss": 0.0772, "step": 3562 }, { "epoch": 0.878698224852071, "grad_norm": 0.5859375, "learning_rate": 0.0001965299339764587, "loss": 0.0781, "step": 3564 }, { "epoch": 0.8791913214990138, "grad_norm": 0.431640625, "learning_rate": 0.0001965209384031178, "loss": 0.0703, "step": 3566 }, { "epoch": 0.8796844181459567, "grad_norm": 0.408203125, "learning_rate": 0.0001965119313915068, "loss": 0.0722, "step": 3568 }, { "epoch": 0.8801775147928994, "grad_norm": 0.66015625, "learning_rate": 0.00019650291294269305, "loss": 0.0715, "step": 3570 }, { "epoch": 0.8806706114398422, "grad_norm": 0.4296875, "learning_rate": 0.0001964938830577453, "loss": 0.0703, "step": 3572 }, { "epoch": 0.881163708086785, "grad_norm": 0.58203125, "learning_rate": 0.00019648484173773368, "loss": 0.0713, "step": 3574 }, { "epoch": 0.8816568047337278, "grad_norm": 0.361328125, "learning_rate": 0.0001964757889837296, "loss": 0.0681, "step": 3576 }, { "epoch": 0.8821499013806706, "grad_norm": 0.75390625, "learning_rate": 0.0001964667247968059, "loss": 0.0759, "step": 3578 }, { "epoch": 0.8826429980276134, "grad_norm": 0.435546875, "learning_rate": 0.0001964576491780367, "loss": 0.0667, "step": 3580 }, { "epoch": 0.8831360946745562, "grad_norm": 0.75, "learning_rate": 0.0001964485621284975, "loss": 0.0791, "step": 3582 }, { "epoch": 0.883629191321499, "grad_norm": 0.51171875, "learning_rate": 0.0001964394636492652, "loss": 0.0707, "step": 3584 }, { "epoch": 0.8841222879684418, "grad_norm": 0.55078125, "learning_rate": 0.000196430353741418, "loss": 0.0668, "step": 3586 }, { "epoch": 0.8846153846153846, "grad_norm": 0.95703125, "learning_rate": 0.0001964212324060355, "loss": 0.071, "step": 3588 }, { "epoch": 0.8851084812623274, "grad_norm": 0.55078125, "learning_rate": 0.00019641209964419863, "loss": 0.0784, "step": 3590 }, { "epoch": 0.8856015779092702, "grad_norm": 0.6484375, "learning_rate": 0.0001964029554569896, "loss": 0.0706, "step": 3592 }, { "epoch": 0.886094674556213, "grad_norm": 0.6015625, "learning_rate": 0.00019639379984549217, "loss": 0.0678, "step": 3594 }, { "epoch": 0.8865877712031558, "grad_norm": 0.59375, "learning_rate": 0.00019638463281079123, "loss": 0.0663, "step": 3596 }, { "epoch": 0.8870808678500987, "grad_norm": 0.70703125, "learning_rate": 0.00019637545435397314, "loss": 0.0761, "step": 3598 }, { "epoch": 0.8875739644970414, "grad_norm": 0.5078125, "learning_rate": 0.00019636626447612563, "loss": 0.0706, "step": 3600 }, { "epoch": 0.8880670611439843, "grad_norm": 0.44140625, "learning_rate": 0.00019635706317833776, "loss": 0.0685, "step": 3602 }, { "epoch": 0.888560157790927, "grad_norm": 0.54296875, "learning_rate": 0.00019634785046169988, "loss": 0.0738, "step": 3604 }, { "epoch": 0.8890532544378699, "grad_norm": 0.76171875, "learning_rate": 0.0001963386263273038, "loss": 0.0713, "step": 3606 }, { "epoch": 0.8895463510848126, "grad_norm": 0.49609375, "learning_rate": 0.00019632939077624263, "loss": 0.0717, "step": 3608 }, { "epoch": 0.8900394477317555, "grad_norm": 0.60546875, "learning_rate": 0.0001963201438096108, "loss": 0.0755, "step": 3610 }, { "epoch": 0.8905325443786982, "grad_norm": 0.45703125, "learning_rate": 0.00019631088542850417, "loss": 0.0741, "step": 3612 }, { "epoch": 0.8910256410256411, "grad_norm": 0.56640625, "learning_rate": 0.00019630161563401985, "loss": 0.0702, "step": 3614 }, { "epoch": 0.8915187376725838, "grad_norm": 0.53125, "learning_rate": 0.00019629233442725643, "loss": 0.0685, "step": 3616 }, { "epoch": 0.8920118343195266, "grad_norm": 1.0078125, "learning_rate": 0.00019628304180931375, "loss": 0.0747, "step": 3618 }, { "epoch": 0.8925049309664694, "grad_norm": 0.388671875, "learning_rate": 0.00019627373778129302, "loss": 0.074, "step": 3620 }, { "epoch": 0.8929980276134122, "grad_norm": 0.76953125, "learning_rate": 0.00019626442234429683, "loss": 0.0707, "step": 3622 }, { "epoch": 0.893491124260355, "grad_norm": 0.5625, "learning_rate": 0.00019625509549942916, "loss": 0.067, "step": 3624 }, { "epoch": 0.8939842209072978, "grad_norm": 0.43359375, "learning_rate": 0.0001962457572477952, "loss": 0.0746, "step": 3626 }, { "epoch": 0.8944773175542406, "grad_norm": 0.52734375, "learning_rate": 0.00019623640759050168, "loss": 0.0789, "step": 3628 }, { "epoch": 0.8949704142011834, "grad_norm": 0.32421875, "learning_rate": 0.00019622704652865654, "loss": 0.0654, "step": 3630 }, { "epoch": 0.8954635108481263, "grad_norm": 0.6171875, "learning_rate": 0.00019621767406336914, "loss": 0.0749, "step": 3632 }, { "epoch": 0.895956607495069, "grad_norm": 0.37890625, "learning_rate": 0.0001962082901957501, "loss": 0.0741, "step": 3634 }, { "epoch": 0.8964497041420119, "grad_norm": 0.51171875, "learning_rate": 0.00019619889492691154, "loss": 0.0769, "step": 3636 }, { "epoch": 0.8969428007889546, "grad_norm": 0.404296875, "learning_rate": 0.00019618948825796684, "loss": 0.0718, "step": 3638 }, { "epoch": 0.8974358974358975, "grad_norm": 0.408203125, "learning_rate": 0.00019618007019003072, "loss": 0.0666, "step": 3640 }, { "epoch": 0.8979289940828402, "grad_norm": 0.357421875, "learning_rate": 0.0001961706407242193, "loss": 0.0696, "step": 3642 }, { "epoch": 0.8984220907297831, "grad_norm": 0.482421875, "learning_rate": 0.00019616119986164997, "loss": 0.0733, "step": 3644 }, { "epoch": 0.8989151873767258, "grad_norm": 0.333984375, "learning_rate": 0.00019615174760344157, "loss": 0.071, "step": 3646 }, { "epoch": 0.8994082840236687, "grad_norm": 0.451171875, "learning_rate": 0.00019614228395071424, "loss": 0.0789, "step": 3648 }, { "epoch": 0.8999013806706114, "grad_norm": 0.55078125, "learning_rate": 0.00019613280890458948, "loss": 0.0737, "step": 3650 }, { "epoch": 0.9003944773175543, "grad_norm": 0.45703125, "learning_rate": 0.00019612332246619012, "loss": 0.0755, "step": 3652 }, { "epoch": 0.900887573964497, "grad_norm": 0.4609375, "learning_rate": 0.00019611382463664037, "loss": 0.0661, "step": 3654 }, { "epoch": 0.9013806706114399, "grad_norm": 0.37890625, "learning_rate": 0.0001961043154170658, "loss": 0.0683, "step": 3656 }, { "epoch": 0.9018737672583826, "grad_norm": 0.392578125, "learning_rate": 0.00019609479480859322, "loss": 0.0692, "step": 3658 }, { "epoch": 0.9023668639053254, "grad_norm": 0.390625, "learning_rate": 0.00019608526281235093, "loss": 0.0606, "step": 3660 }, { "epoch": 0.9028599605522682, "grad_norm": 0.419921875, "learning_rate": 0.00019607571942946854, "loss": 0.0723, "step": 3662 }, { "epoch": 0.903353057199211, "grad_norm": 0.431640625, "learning_rate": 0.00019606616466107699, "loss": 0.0706, "step": 3664 }, { "epoch": 0.9038461538461539, "grad_norm": 0.51171875, "learning_rate": 0.00019605659850830857, "loss": 0.0737, "step": 3666 }, { "epoch": 0.9043392504930966, "grad_norm": 0.5546875, "learning_rate": 0.0001960470209722969, "loss": 0.0824, "step": 3668 }, { "epoch": 0.9048323471400395, "grad_norm": 0.439453125, "learning_rate": 0.000196037432054177, "loss": 0.068, "step": 3670 }, { "epoch": 0.9053254437869822, "grad_norm": 0.39453125, "learning_rate": 0.0001960278317550852, "loss": 0.0703, "step": 3672 }, { "epoch": 0.9058185404339251, "grad_norm": 0.5, "learning_rate": 0.00019601822007615915, "loss": 0.0746, "step": 3674 }, { "epoch": 0.9063116370808678, "grad_norm": 0.4765625, "learning_rate": 0.00019600859701853794, "loss": 0.0689, "step": 3676 }, { "epoch": 0.9068047337278107, "grad_norm": 0.54296875, "learning_rate": 0.000195998962583362, "loss": 0.0723, "step": 3678 }, { "epoch": 0.9072978303747534, "grad_norm": 0.431640625, "learning_rate": 0.0001959893167717729, "loss": 0.074, "step": 3680 }, { "epoch": 0.9077909270216963, "grad_norm": 0.466796875, "learning_rate": 0.0001959796595849139, "loss": 0.0684, "step": 3682 }, { "epoch": 0.908284023668639, "grad_norm": 0.482421875, "learning_rate": 0.00019596999102392934, "loss": 0.0697, "step": 3684 }, { "epoch": 0.9087771203155819, "grad_norm": 0.42578125, "learning_rate": 0.00019596031108996503, "loss": 0.0702, "step": 3686 }, { "epoch": 0.9092702169625246, "grad_norm": 0.44140625, "learning_rate": 0.00019595061978416807, "loss": 0.0685, "step": 3688 }, { "epoch": 0.9097633136094675, "grad_norm": 0.39453125, "learning_rate": 0.0001959409171076869, "loss": 0.076, "step": 3690 }, { "epoch": 0.9102564102564102, "grad_norm": 0.5078125, "learning_rate": 0.00019593120306167147, "loss": 0.0687, "step": 3692 }, { "epoch": 0.9107495069033531, "grad_norm": 0.34765625, "learning_rate": 0.00019592147764727283, "loss": 0.0746, "step": 3694 }, { "epoch": 0.9112426035502958, "grad_norm": 0.3984375, "learning_rate": 0.0001959117408656435, "loss": 0.0717, "step": 3696 }, { "epoch": 0.9117357001972387, "grad_norm": 0.42578125, "learning_rate": 0.0001959019927179374, "loss": 0.0624, "step": 3698 }, { "epoch": 0.9122287968441815, "grad_norm": 0.3984375, "learning_rate": 0.00019589223320530975, "loss": 0.0751, "step": 3700 }, { "epoch": 0.9127218934911243, "grad_norm": 0.69140625, "learning_rate": 0.000195882462328917, "loss": 0.0672, "step": 3702 }, { "epoch": 0.9132149901380671, "grad_norm": 0.478515625, "learning_rate": 0.0001958726800899172, "loss": 0.0747, "step": 3704 }, { "epoch": 0.9137080867850098, "grad_norm": 0.62890625, "learning_rate": 0.00019586288648946947, "loss": 0.0678, "step": 3706 }, { "epoch": 0.9142011834319527, "grad_norm": 0.5234375, "learning_rate": 0.00019585308152873448, "loss": 0.0722, "step": 3708 }, { "epoch": 0.9146942800788954, "grad_norm": 0.4296875, "learning_rate": 0.00019584326520887414, "loss": 0.0743, "step": 3710 }, { "epoch": 0.9151873767258383, "grad_norm": 0.47265625, "learning_rate": 0.00019583343753105177, "loss": 0.0636, "step": 3712 }, { "epoch": 0.915680473372781, "grad_norm": 0.453125, "learning_rate": 0.000195823598496432, "loss": 0.0694, "step": 3714 }, { "epoch": 0.9161735700197239, "grad_norm": 0.41015625, "learning_rate": 0.00019581374810618077, "loss": 0.0713, "step": 3716 }, { "epoch": 0.9166666666666666, "grad_norm": 0.5, "learning_rate": 0.00019580388636146543, "loss": 0.0667, "step": 3718 }, { "epoch": 0.9171597633136095, "grad_norm": 0.443359375, "learning_rate": 0.00019579401326345468, "loss": 0.0698, "step": 3720 }, { "epoch": 0.9176528599605522, "grad_norm": 0.423828125, "learning_rate": 0.0001957841288133185, "loss": 0.0692, "step": 3722 }, { "epoch": 0.9181459566074951, "grad_norm": 0.380859375, "learning_rate": 0.00019577423301222826, "loss": 0.0653, "step": 3724 }, { "epoch": 0.9186390532544378, "grad_norm": 0.396484375, "learning_rate": 0.0001957643258613567, "loss": 0.0655, "step": 3726 }, { "epoch": 0.9191321499013807, "grad_norm": 0.44140625, "learning_rate": 0.00019575440736187781, "loss": 0.074, "step": 3728 }, { "epoch": 0.9196252465483234, "grad_norm": 0.37890625, "learning_rate": 0.00019574447751496705, "loss": 0.0739, "step": 3730 }, { "epoch": 0.9201183431952663, "grad_norm": 0.4453125, "learning_rate": 0.00019573453632180113, "loss": 0.073, "step": 3732 }, { "epoch": 0.9206114398422091, "grad_norm": 0.38671875, "learning_rate": 0.00019572458378355813, "loss": 0.0701, "step": 3734 }, { "epoch": 0.9211045364891519, "grad_norm": 0.65234375, "learning_rate": 0.00019571461990141754, "loss": 0.0692, "step": 3736 }, { "epoch": 0.9215976331360947, "grad_norm": 0.404296875, "learning_rate": 0.00019570464467656001, "loss": 0.072, "step": 3738 }, { "epoch": 0.9220907297830375, "grad_norm": 0.35546875, "learning_rate": 0.0001956946581101678, "loss": 0.0703, "step": 3740 }, { "epoch": 0.9225838264299803, "grad_norm": 0.6015625, "learning_rate": 0.0001956846602034243, "loss": 0.0741, "step": 3742 }, { "epoch": 0.9230769230769231, "grad_norm": 0.419921875, "learning_rate": 0.0001956746509575143, "loss": 0.0679, "step": 3744 }, { "epoch": 0.9235700197238659, "grad_norm": 0.6328125, "learning_rate": 0.00019566463037362403, "loss": 0.0723, "step": 3746 }, { "epoch": 0.9240631163708086, "grad_norm": 0.796875, "learning_rate": 0.00019565459845294092, "loss": 0.0746, "step": 3748 }, { "epoch": 0.9245562130177515, "grad_norm": 0.55078125, "learning_rate": 0.00019564455519665384, "loss": 0.0815, "step": 3750 }, { "epoch": 0.9250493096646942, "grad_norm": 0.69140625, "learning_rate": 0.0001956345006059529, "loss": 0.0688, "step": 3752 }, { "epoch": 0.9255424063116371, "grad_norm": 0.49609375, "learning_rate": 0.00019562443468202975, "loss": 0.074, "step": 3754 }, { "epoch": 0.9260355029585798, "grad_norm": 0.68359375, "learning_rate": 0.00019561435742607717, "loss": 0.0737, "step": 3756 }, { "epoch": 0.9265285996055227, "grad_norm": 0.50390625, "learning_rate": 0.0001956042688392894, "loss": 0.0759, "step": 3758 }, { "epoch": 0.9270216962524654, "grad_norm": 0.80078125, "learning_rate": 0.00019559416892286197, "loss": 0.0731, "step": 3760 }, { "epoch": 0.9275147928994083, "grad_norm": 0.455078125, "learning_rate": 0.0001955840576779918, "loss": 0.0703, "step": 3762 }, { "epoch": 0.928007889546351, "grad_norm": 0.6484375, "learning_rate": 0.0001955739351058771, "loss": 0.0761, "step": 3764 }, { "epoch": 0.9285009861932939, "grad_norm": 0.64453125, "learning_rate": 0.00019556380120771752, "loss": 0.0748, "step": 3766 }, { "epoch": 0.9289940828402367, "grad_norm": 0.478515625, "learning_rate": 0.0001955536559847139, "loss": 0.0734, "step": 3768 }, { "epoch": 0.9294871794871795, "grad_norm": 0.62109375, "learning_rate": 0.00019554349943806857, "loss": 0.0679, "step": 3770 }, { "epoch": 0.9299802761341223, "grad_norm": 0.41015625, "learning_rate": 0.0001955333315689851, "loss": 0.067, "step": 3772 }, { "epoch": 0.9304733727810651, "grad_norm": 0.44140625, "learning_rate": 0.00019552315237866843, "loss": 0.0718, "step": 3774 }, { "epoch": 0.9309664694280079, "grad_norm": 0.6015625, "learning_rate": 0.00019551296186832488, "loss": 0.0705, "step": 3776 }, { "epoch": 0.9314595660749507, "grad_norm": 0.80078125, "learning_rate": 0.00019550276003916207, "loss": 0.0818, "step": 3778 }, { "epoch": 0.9319526627218935, "grad_norm": 0.44140625, "learning_rate": 0.00019549254689238898, "loss": 0.0736, "step": 3780 }, { "epoch": 0.9324457593688363, "grad_norm": 0.54296875, "learning_rate": 0.00019548232242921595, "loss": 0.0683, "step": 3782 }, { "epoch": 0.9329388560157791, "grad_norm": 0.498046875, "learning_rate": 0.00019547208665085457, "loss": 0.0709, "step": 3784 }, { "epoch": 0.9334319526627219, "grad_norm": 0.73828125, "learning_rate": 0.00019546183955851789, "loss": 0.0711, "step": 3786 }, { "epoch": 0.9339250493096647, "grad_norm": 0.59375, "learning_rate": 0.00019545158115342025, "loss": 0.0738, "step": 3788 }, { "epoch": 0.9344181459566075, "grad_norm": 0.376953125, "learning_rate": 0.00019544131143677727, "loss": 0.0729, "step": 3790 }, { "epoch": 0.9349112426035503, "grad_norm": 0.60546875, "learning_rate": 0.00019543103040980603, "loss": 0.0731, "step": 3792 }, { "epoch": 0.935404339250493, "grad_norm": 0.5859375, "learning_rate": 0.00019542073807372485, "loss": 0.0714, "step": 3794 }, { "epoch": 0.9358974358974359, "grad_norm": 0.7421875, "learning_rate": 0.00019541043442975346, "loss": 0.0654, "step": 3796 }, { "epoch": 0.9363905325443787, "grad_norm": 0.71875, "learning_rate": 0.0001954001194791129, "loss": 0.0769, "step": 3798 }, { "epoch": 0.9368836291913215, "grad_norm": 0.4453125, "learning_rate": 0.0001953897932230255, "loss": 0.0668, "step": 3800 }, { "epoch": 0.9373767258382643, "grad_norm": 0.455078125, "learning_rate": 0.00019537945566271505, "loss": 0.0732, "step": 3802 }, { "epoch": 0.9378698224852071, "grad_norm": 0.34765625, "learning_rate": 0.00019536910679940656, "loss": 0.0672, "step": 3804 }, { "epoch": 0.9383629191321499, "grad_norm": 0.404296875, "learning_rate": 0.00019535874663432644, "loss": 0.0683, "step": 3806 }, { "epoch": 0.9388560157790927, "grad_norm": 0.45703125, "learning_rate": 0.00019534837516870245, "loss": 0.0746, "step": 3808 }, { "epoch": 0.9393491124260355, "grad_norm": 0.361328125, "learning_rate": 0.00019533799240376362, "loss": 0.0656, "step": 3810 }, { "epoch": 0.9398422090729783, "grad_norm": 0.61328125, "learning_rate": 0.00019532759834074037, "loss": 0.0706, "step": 3812 }, { "epoch": 0.9403353057199211, "grad_norm": 0.51171875, "learning_rate": 0.0001953171929808645, "loss": 0.0735, "step": 3814 }, { "epoch": 0.9408284023668639, "grad_norm": 0.4375, "learning_rate": 0.00019530677632536908, "loss": 0.0714, "step": 3816 }, { "epoch": 0.9413214990138067, "grad_norm": 0.56640625, "learning_rate": 0.00019529634837548855, "loss": 0.0712, "step": 3818 }, { "epoch": 0.9418145956607495, "grad_norm": 0.5078125, "learning_rate": 0.00019528590913245867, "loss": 0.0686, "step": 3820 }, { "epoch": 0.9423076923076923, "grad_norm": 0.46484375, "learning_rate": 0.00019527545859751656, "loss": 0.0729, "step": 3822 }, { "epoch": 0.9428007889546351, "grad_norm": 0.46875, "learning_rate": 0.00019526499677190062, "loss": 0.0714, "step": 3824 }, { "epoch": 0.9432938856015779, "grad_norm": 0.51953125, "learning_rate": 0.0001952545236568507, "loss": 0.0723, "step": 3826 }, { "epoch": 0.9437869822485208, "grad_norm": 0.55859375, "learning_rate": 0.00019524403925360787, "loss": 0.0732, "step": 3828 }, { "epoch": 0.9442800788954635, "grad_norm": 0.478515625, "learning_rate": 0.00019523354356341464, "loss": 0.0694, "step": 3830 }, { "epoch": 0.9447731755424064, "grad_norm": 0.4453125, "learning_rate": 0.00019522303658751477, "loss": 0.0747, "step": 3832 }, { "epoch": 0.9452662721893491, "grad_norm": 0.46484375, "learning_rate": 0.0001952125183271534, "loss": 0.0711, "step": 3834 }, { "epoch": 0.9457593688362919, "grad_norm": 0.4296875, "learning_rate": 0.00019520198878357703, "loss": 0.0732, "step": 3836 }, { "epoch": 0.9462524654832347, "grad_norm": 0.36328125, "learning_rate": 0.0001951914479580334, "loss": 0.0664, "step": 3838 }, { "epoch": 0.9467455621301775, "grad_norm": 0.427734375, "learning_rate": 0.00019518089585177174, "loss": 0.0709, "step": 3840 }, { "epoch": 0.9472386587771203, "grad_norm": 0.5859375, "learning_rate": 0.0001951703324660425, "loss": 0.0692, "step": 3842 }, { "epoch": 0.9477317554240631, "grad_norm": 0.388671875, "learning_rate": 0.00019515975780209753, "loss": 0.068, "step": 3844 }, { "epoch": 0.9482248520710059, "grad_norm": 0.5546875, "learning_rate": 0.0001951491718611899, "loss": 0.0784, "step": 3846 }, { "epoch": 0.9487179487179487, "grad_norm": 0.52734375, "learning_rate": 0.0001951385746445742, "loss": 0.068, "step": 3848 }, { "epoch": 0.9492110453648915, "grad_norm": 0.46484375, "learning_rate": 0.0001951279661535062, "loss": 0.0677, "step": 3850 }, { "epoch": 0.9497041420118343, "grad_norm": 0.58203125, "learning_rate": 0.00019511734638924308, "loss": 0.0691, "step": 3852 }, { "epoch": 0.9501972386587771, "grad_norm": 0.44140625, "learning_rate": 0.00019510671535304333, "loss": 0.0691, "step": 3854 }, { "epoch": 0.9506903353057199, "grad_norm": 0.61328125, "learning_rate": 0.00019509607304616687, "loss": 0.0784, "step": 3856 }, { "epoch": 0.9511834319526628, "grad_norm": 0.51171875, "learning_rate": 0.00019508541946987475, "loss": 0.0766, "step": 3858 }, { "epoch": 0.9516765285996055, "grad_norm": 0.6484375, "learning_rate": 0.00019507475462542955, "loss": 0.0723, "step": 3860 }, { "epoch": 0.9521696252465484, "grad_norm": 0.373046875, "learning_rate": 0.0001950640785140951, "loss": 0.0721, "step": 3862 }, { "epoch": 0.9526627218934911, "grad_norm": 0.451171875, "learning_rate": 0.00019505339113713656, "loss": 0.0758, "step": 3864 }, { "epoch": 0.953155818540434, "grad_norm": 0.57421875, "learning_rate": 0.00019504269249582046, "loss": 0.0674, "step": 3866 }, { "epoch": 0.9536489151873767, "grad_norm": 0.51171875, "learning_rate": 0.00019503198259141467, "loss": 0.0742, "step": 3868 }, { "epoch": 0.9541420118343196, "grad_norm": 0.546875, "learning_rate": 0.00019502126142518833, "loss": 0.0662, "step": 3870 }, { "epoch": 0.9546351084812623, "grad_norm": 0.5078125, "learning_rate": 0.00019501052899841201, "loss": 0.0694, "step": 3872 }, { "epoch": 0.9551282051282052, "grad_norm": 0.4921875, "learning_rate": 0.0001949997853123575, "loss": 0.0726, "step": 3874 }, { "epoch": 0.9556213017751479, "grad_norm": 0.5078125, "learning_rate": 0.00019498903036829806, "loss": 0.0704, "step": 3876 }, { "epoch": 0.9561143984220908, "grad_norm": 0.4140625, "learning_rate": 0.0001949782641675081, "loss": 0.0604, "step": 3878 }, { "epoch": 0.9566074950690335, "grad_norm": 0.69140625, "learning_rate": 0.00019496748671126362, "loss": 0.0699, "step": 3880 }, { "epoch": 0.9571005917159763, "grad_norm": 0.5859375, "learning_rate": 0.0001949566980008417, "loss": 0.0721, "step": 3882 }, { "epoch": 0.9575936883629191, "grad_norm": 0.4375, "learning_rate": 0.0001949458980375209, "loss": 0.0709, "step": 3884 }, { "epoch": 0.9580867850098619, "grad_norm": 0.546875, "learning_rate": 0.00019493508682258108, "loss": 0.0718, "step": 3886 }, { "epoch": 0.9585798816568047, "grad_norm": 0.7578125, "learning_rate": 0.0001949242643573034, "loss": 0.0802, "step": 3888 }, { "epoch": 0.9590729783037475, "grad_norm": 0.439453125, "learning_rate": 0.00019491343064297043, "loss": 0.0672, "step": 3890 }, { "epoch": 0.9595660749506904, "grad_norm": 0.5546875, "learning_rate": 0.000194902585680866, "loss": 0.0725, "step": 3892 }, { "epoch": 0.9600591715976331, "grad_norm": 0.484375, "learning_rate": 0.00019489172947227527, "loss": 0.0781, "step": 3894 }, { "epoch": 0.960552268244576, "grad_norm": 0.88671875, "learning_rate": 0.0001948808620184848, "loss": 0.071, "step": 3896 }, { "epoch": 0.9610453648915187, "grad_norm": 0.408203125, "learning_rate": 0.00019486998332078246, "loss": 0.0726, "step": 3898 }, { "epoch": 0.9615384615384616, "grad_norm": 0.55859375, "learning_rate": 0.0001948590933804574, "loss": 0.0745, "step": 3900 }, { "epoch": 0.9620315581854043, "grad_norm": 0.5703125, "learning_rate": 0.00019484819219880013, "loss": 0.0673, "step": 3902 }, { "epoch": 0.9625246548323472, "grad_norm": 0.41796875, "learning_rate": 0.00019483727977710253, "loss": 0.0744, "step": 3904 }, { "epoch": 0.9630177514792899, "grad_norm": 0.73828125, "learning_rate": 0.00019482635611665775, "loss": 0.0733, "step": 3906 }, { "epoch": 0.9635108481262328, "grad_norm": 0.41796875, "learning_rate": 0.00019481542121876038, "loss": 0.0672, "step": 3908 }, { "epoch": 0.9640039447731755, "grad_norm": 0.62109375, "learning_rate": 0.0001948044750847062, "loss": 0.0751, "step": 3910 }, { "epoch": 0.9644970414201184, "grad_norm": 0.5859375, "learning_rate": 0.00019479351771579235, "loss": 0.0732, "step": 3912 }, { "epoch": 0.9649901380670611, "grad_norm": 0.640625, "learning_rate": 0.00019478254911331743, "loss": 0.071, "step": 3914 }, { "epoch": 0.965483234714004, "grad_norm": 0.515625, "learning_rate": 0.00019477156927858125, "loss": 0.0668, "step": 3916 }, { "epoch": 0.9659763313609467, "grad_norm": 0.56640625, "learning_rate": 0.00019476057821288495, "loss": 0.0728, "step": 3918 }, { "epoch": 0.9664694280078896, "grad_norm": 0.5234375, "learning_rate": 0.00019474957591753106, "loss": 0.0706, "step": 3920 }, { "epoch": 0.9669625246548323, "grad_norm": 0.5, "learning_rate": 0.00019473856239382345, "loss": 0.0694, "step": 3922 }, { "epoch": 0.9674556213017751, "grad_norm": 0.60546875, "learning_rate": 0.0001947275376430672, "loss": 0.0709, "step": 3924 }, { "epoch": 0.967948717948718, "grad_norm": 0.515625, "learning_rate": 0.00019471650166656888, "loss": 0.0679, "step": 3926 }, { "epoch": 0.9684418145956607, "grad_norm": 0.50390625, "learning_rate": 0.00019470545446563625, "loss": 0.063, "step": 3928 }, { "epoch": 0.9689349112426036, "grad_norm": 0.41796875, "learning_rate": 0.00019469439604157852, "loss": 0.0715, "step": 3930 }, { "epoch": 0.9694280078895463, "grad_norm": 0.640625, "learning_rate": 0.00019468332639570614, "loss": 0.0802, "step": 3932 }, { "epoch": 0.9699211045364892, "grad_norm": 0.48046875, "learning_rate": 0.00019467224552933095, "loss": 0.0689, "step": 3934 }, { "epoch": 0.9704142011834319, "grad_norm": 0.72265625, "learning_rate": 0.00019466115344376607, "loss": 0.0646, "step": 3936 }, { "epoch": 0.9709072978303748, "grad_norm": 0.482421875, "learning_rate": 0.00019465005014032602, "loss": 0.066, "step": 3938 }, { "epoch": 0.9714003944773175, "grad_norm": 0.51171875, "learning_rate": 0.00019463893562032654, "loss": 0.0749, "step": 3940 }, { "epoch": 0.9718934911242604, "grad_norm": 0.50390625, "learning_rate": 0.0001946278098850848, "loss": 0.0667, "step": 3942 }, { "epoch": 0.9723865877712031, "grad_norm": 0.53125, "learning_rate": 0.00019461667293591925, "loss": 0.0764, "step": 3944 }, { "epoch": 0.972879684418146, "grad_norm": 0.373046875, "learning_rate": 0.0001946055247741497, "loss": 0.0618, "step": 3946 }, { "epoch": 0.9733727810650887, "grad_norm": 0.52734375, "learning_rate": 0.00019459436540109724, "loss": 0.0739, "step": 3948 }, { "epoch": 0.9738658777120316, "grad_norm": 0.6015625, "learning_rate": 0.00019458319481808434, "loss": 0.077, "step": 3950 }, { "epoch": 0.9743589743589743, "grad_norm": 0.439453125, "learning_rate": 0.00019457201302643476, "loss": 0.077, "step": 3952 }, { "epoch": 0.9748520710059172, "grad_norm": 0.482421875, "learning_rate": 0.0001945608200274736, "loss": 0.0685, "step": 3954 }, { "epoch": 0.97534516765286, "grad_norm": 0.462890625, "learning_rate": 0.00019454961582252733, "loss": 0.0702, "step": 3956 }, { "epoch": 0.9758382642998028, "grad_norm": 0.443359375, "learning_rate": 0.00019453840041292367, "loss": 0.0721, "step": 3958 }, { "epoch": 0.9763313609467456, "grad_norm": 0.443359375, "learning_rate": 0.00019452717379999176, "loss": 0.0696, "step": 3960 }, { "epoch": 0.9768244575936884, "grad_norm": 0.42578125, "learning_rate": 0.00019451593598506195, "loss": 0.0645, "step": 3962 }, { "epoch": 0.9773175542406312, "grad_norm": 0.52734375, "learning_rate": 0.00019450468696946603, "loss": 0.0712, "step": 3964 }, { "epoch": 0.977810650887574, "grad_norm": 0.61328125, "learning_rate": 0.00019449342675453707, "loss": 0.0659, "step": 3966 }, { "epoch": 0.9783037475345168, "grad_norm": 0.427734375, "learning_rate": 0.00019448215534160945, "loss": 0.067, "step": 3968 }, { "epoch": 0.9787968441814595, "grad_norm": 0.4921875, "learning_rate": 0.0001944708727320189, "loss": 0.0751, "step": 3970 }, { "epoch": 0.9792899408284024, "grad_norm": 0.51171875, "learning_rate": 0.00019445957892710245, "loss": 0.0716, "step": 3972 }, { "epoch": 0.9797830374753451, "grad_norm": 0.466796875, "learning_rate": 0.00019444827392819852, "loss": 0.0727, "step": 3974 }, { "epoch": 0.980276134122288, "grad_norm": 0.3984375, "learning_rate": 0.0001944369577366468, "loss": 0.0702, "step": 3976 }, { "epoch": 0.9807692307692307, "grad_norm": 0.40234375, "learning_rate": 0.00019442563035378831, "loss": 0.0744, "step": 3978 }, { "epoch": 0.9812623274161736, "grad_norm": 0.44140625, "learning_rate": 0.00019441429178096542, "loss": 0.0678, "step": 3980 }, { "epoch": 0.9817554240631163, "grad_norm": 0.625, "learning_rate": 0.00019440294201952184, "loss": 0.067, "step": 3982 }, { "epoch": 0.9822485207100592, "grad_norm": 0.5078125, "learning_rate": 0.0001943915810708025, "loss": 0.0729, "step": 3984 }, { "epoch": 0.9827416173570019, "grad_norm": 0.5625, "learning_rate": 0.00019438020893615383, "loss": 0.0654, "step": 3986 }, { "epoch": 0.9832347140039448, "grad_norm": 0.486328125, "learning_rate": 0.00019436882561692343, "loss": 0.0758, "step": 3988 }, { "epoch": 0.9837278106508875, "grad_norm": 0.69921875, "learning_rate": 0.0001943574311144603, "loss": 0.0684, "step": 3990 }, { "epoch": 0.9842209072978304, "grad_norm": 0.5390625, "learning_rate": 0.00019434602543011477, "loss": 0.0786, "step": 3992 }, { "epoch": 0.9847140039447732, "grad_norm": 0.4765625, "learning_rate": 0.00019433460856523849, "loss": 0.0722, "step": 3994 }, { "epoch": 0.985207100591716, "grad_norm": 0.4140625, "learning_rate": 0.00019432318052118436, "loss": 0.0714, "step": 3996 }, { "epoch": 0.9857001972386588, "grad_norm": 0.50390625, "learning_rate": 0.00019431174129930673, "loss": 0.0708, "step": 3998 }, { "epoch": 0.9861932938856016, "grad_norm": 0.44140625, "learning_rate": 0.00019430029090096115, "loss": 0.0628, "step": 4000 }, { "epoch": 0.9866863905325444, "grad_norm": 0.455078125, "learning_rate": 0.00019428882932750465, "loss": 0.0698, "step": 4002 }, { "epoch": 0.9871794871794872, "grad_norm": 0.400390625, "learning_rate": 0.00019427735658029537, "loss": 0.0728, "step": 4004 }, { "epoch": 0.98767258382643, "grad_norm": 0.6640625, "learning_rate": 0.000194265872660693, "loss": 0.0675, "step": 4006 }, { "epoch": 0.9881656804733728, "grad_norm": 0.4453125, "learning_rate": 0.0001942543775700584, "loss": 0.0706, "step": 4008 }, { "epoch": 0.9886587771203156, "grad_norm": 0.396484375, "learning_rate": 0.0001942428713097538, "loss": 0.0733, "step": 4010 }, { "epoch": 0.9891518737672583, "grad_norm": 0.419921875, "learning_rate": 0.0001942313538811428, "loss": 0.0684, "step": 4012 }, { "epoch": 0.9896449704142012, "grad_norm": 0.48046875, "learning_rate": 0.0001942198252855902, "loss": 0.0731, "step": 4014 }, { "epoch": 0.9901380670611439, "grad_norm": 0.5234375, "learning_rate": 0.00019420828552446228, "loss": 0.068, "step": 4016 }, { "epoch": 0.9906311637080868, "grad_norm": 0.515625, "learning_rate": 0.0001941967345991265, "loss": 0.0719, "step": 4018 }, { "epoch": 0.9911242603550295, "grad_norm": 0.51953125, "learning_rate": 0.00019418517251095177, "loss": 0.0718, "step": 4020 }, { "epoch": 0.9916173570019724, "grad_norm": 0.42578125, "learning_rate": 0.0001941735992613082, "loss": 0.072, "step": 4022 }, { "epoch": 0.9921104536489151, "grad_norm": 0.416015625, "learning_rate": 0.00019416201485156735, "loss": 0.0654, "step": 4024 }, { "epoch": 0.992603550295858, "grad_norm": 0.546875, "learning_rate": 0.000194150419283102, "loss": 0.0665, "step": 4026 }, { "epoch": 0.9930966469428008, "grad_norm": 0.4296875, "learning_rate": 0.0001941388125572863, "loss": 0.0703, "step": 4028 }, { "epoch": 0.9935897435897436, "grad_norm": 0.609375, "learning_rate": 0.00019412719467549571, "loss": 0.0736, "step": 4030 }, { "epoch": 0.9940828402366864, "grad_norm": 0.54296875, "learning_rate": 0.000194115565639107, "loss": 0.0733, "step": 4032 }, { "epoch": 0.9945759368836292, "grad_norm": 0.392578125, "learning_rate": 0.00019410392544949828, "loss": 0.0635, "step": 4034 }, { "epoch": 0.995069033530572, "grad_norm": 0.44140625, "learning_rate": 0.000194092274108049, "loss": 0.0796, "step": 4036 }, { "epoch": 0.9955621301775148, "grad_norm": 0.53125, "learning_rate": 0.0001940806116161399, "loss": 0.068, "step": 4038 }, { "epoch": 0.9960552268244576, "grad_norm": 0.318359375, "learning_rate": 0.00019406893797515305, "loss": 0.0656, "step": 4040 }, { "epoch": 0.9965483234714004, "grad_norm": 0.482421875, "learning_rate": 0.00019405725318647182, "loss": 0.0709, "step": 4042 }, { "epoch": 0.9970414201183432, "grad_norm": 0.515625, "learning_rate": 0.00019404555725148094, "loss": 0.0741, "step": 4044 }, { "epoch": 0.997534516765286, "grad_norm": 0.46484375, "learning_rate": 0.00019403385017156642, "loss": 0.0703, "step": 4046 }, { "epoch": 0.9980276134122288, "grad_norm": 0.55078125, "learning_rate": 0.0001940221319481157, "loss": 0.074, "step": 4048 }, { "epoch": 0.9985207100591716, "grad_norm": 0.48828125, "learning_rate": 0.00019401040258251734, "loss": 0.0785, "step": 4050 }, { "epoch": 0.9990138067061144, "grad_norm": 0.359375, "learning_rate": 0.0001939986620761614, "loss": 0.0653, "step": 4052 }, { "epoch": 0.9995069033530573, "grad_norm": 0.5, "learning_rate": 0.0001939869104304392, "loss": 0.066, "step": 4054 }, { "epoch": 1.0, "grad_norm": 0.6953125, "learning_rate": 0.00019397514764674334, "loss": 0.0715, "step": 4056 }, { "epoch": 1.0, "eval_loss": 0.07079669833183289, "eval_runtime": 61.6782, "eval_samples_per_second": 258.26, "eval_steps_per_second": 2.027, "step": 4056 }, { "epoch": 1.0004930966469427, "grad_norm": 0.337890625, "learning_rate": 0.0001939633737264678, "loss": 0.0537, "step": 4058 }, { "epoch": 1.0009861932938855, "grad_norm": 0.46875, "learning_rate": 0.00019395158867100784, "loss": 0.0474, "step": 4060 }, { "epoch": 1.0014792899408285, "grad_norm": 0.462890625, "learning_rate": 0.00019393979248176005, "loss": 0.0485, "step": 4062 }, { "epoch": 1.0019723865877712, "grad_norm": 0.322265625, "learning_rate": 0.00019392798516012239, "loss": 0.0542, "step": 4064 }, { "epoch": 1.002465483234714, "grad_norm": 0.345703125, "learning_rate": 0.00019391616670749406, "loss": 0.0498, "step": 4066 }, { "epoch": 1.0029585798816567, "grad_norm": 0.3828125, "learning_rate": 0.0001939043371252756, "loss": 0.0454, "step": 4068 }, { "epoch": 1.0034516765285997, "grad_norm": 0.44140625, "learning_rate": 0.0001938924964148689, "loss": 0.058, "step": 4070 }, { "epoch": 1.0039447731755424, "grad_norm": 0.404296875, "learning_rate": 0.00019388064457767713, "loss": 0.0587, "step": 4072 }, { "epoch": 1.0044378698224852, "grad_norm": 0.5625, "learning_rate": 0.00019386878161510484, "loss": 0.0557, "step": 4074 }, { "epoch": 1.004930966469428, "grad_norm": 0.41796875, "learning_rate": 0.00019385690752855782, "loss": 0.0514, "step": 4076 }, { "epoch": 1.005424063116371, "grad_norm": 0.421875, "learning_rate": 0.00019384502231944325, "loss": 0.051, "step": 4078 }, { "epoch": 1.0059171597633136, "grad_norm": 0.34765625, "learning_rate": 0.00019383312598916954, "loss": 0.0546, "step": 4080 }, { "epoch": 1.0064102564102564, "grad_norm": 0.3203125, "learning_rate": 0.00019382121853914652, "loss": 0.0532, "step": 4082 }, { "epoch": 1.0069033530571991, "grad_norm": 0.345703125, "learning_rate": 0.00019380929997078527, "loss": 0.0474, "step": 4084 }, { "epoch": 1.007396449704142, "grad_norm": 0.46875, "learning_rate": 0.0001937973702854982, "loss": 0.0529, "step": 4086 }, { "epoch": 1.0078895463510849, "grad_norm": 0.3125, "learning_rate": 0.00019378542948469905, "loss": 0.0503, "step": 4088 }, { "epoch": 1.0083826429980276, "grad_norm": 0.29296875, "learning_rate": 0.0001937734775698029, "loss": 0.0522, "step": 4090 }, { "epoch": 1.0088757396449703, "grad_norm": 0.310546875, "learning_rate": 0.00019376151454222608, "loss": 0.0497, "step": 4092 }, { "epoch": 1.0093688362919133, "grad_norm": 0.578125, "learning_rate": 0.0001937495404033863, "loss": 0.0528, "step": 4094 }, { "epoch": 1.009861932938856, "grad_norm": 0.3515625, "learning_rate": 0.00019373755515470254, "loss": 0.0521, "step": 4096 }, { "epoch": 1.0103550295857988, "grad_norm": 0.447265625, "learning_rate": 0.00019372555879759514, "loss": 0.0491, "step": 4098 }, { "epoch": 1.0108481262327416, "grad_norm": 0.50390625, "learning_rate": 0.0001937135513334857, "loss": 0.046, "step": 4100 }, { "epoch": 1.0113412228796843, "grad_norm": 0.4375, "learning_rate": 0.00019370153276379722, "loss": 0.0551, "step": 4102 }, { "epoch": 1.0118343195266273, "grad_norm": 0.439453125, "learning_rate": 0.00019368950308995395, "loss": 0.048, "step": 4104 }, { "epoch": 1.01232741617357, "grad_norm": 0.337890625, "learning_rate": 0.00019367746231338146, "loss": 0.0468, "step": 4106 }, { "epoch": 1.0128205128205128, "grad_norm": 0.45703125, "learning_rate": 0.00019366541043550666, "loss": 0.0518, "step": 4108 }, { "epoch": 1.0133136094674555, "grad_norm": 0.365234375, "learning_rate": 0.00019365334745775775, "loss": 0.0498, "step": 4110 }, { "epoch": 1.0138067061143985, "grad_norm": 0.455078125, "learning_rate": 0.00019364127338156428, "loss": 0.0498, "step": 4112 }, { "epoch": 1.0142998027613412, "grad_norm": 0.466796875, "learning_rate": 0.0001936291882083571, "loss": 0.047, "step": 4114 }, { "epoch": 1.014792899408284, "grad_norm": 0.349609375, "learning_rate": 0.00019361709193956836, "loss": 0.0491, "step": 4116 }, { "epoch": 1.0152859960552267, "grad_norm": 0.625, "learning_rate": 0.0001936049845766315, "loss": 0.0542, "step": 4118 }, { "epoch": 1.0157790927021697, "grad_norm": 0.322265625, "learning_rate": 0.00019359286612098139, "loss": 0.0482, "step": 4120 }, { "epoch": 1.0162721893491125, "grad_norm": 0.5234375, "learning_rate": 0.00019358073657405406, "loss": 0.0569, "step": 4122 }, { "epoch": 1.0167652859960552, "grad_norm": 0.3203125, "learning_rate": 0.00019356859593728699, "loss": 0.0434, "step": 4124 }, { "epoch": 1.017258382642998, "grad_norm": 0.515625, "learning_rate": 0.00019355644421211886, "loss": 0.0462, "step": 4126 }, { "epoch": 1.017751479289941, "grad_norm": 0.41015625, "learning_rate": 0.0001935442813999897, "loss": 0.0523, "step": 4128 }, { "epoch": 1.0182445759368837, "grad_norm": 0.5234375, "learning_rate": 0.000193532107502341, "loss": 0.048, "step": 4130 }, { "epoch": 1.0187376725838264, "grad_norm": 0.4140625, "learning_rate": 0.00019351992252061525, "loss": 0.0467, "step": 4132 }, { "epoch": 1.0192307692307692, "grad_norm": 0.412109375, "learning_rate": 0.00019350772645625658, "loss": 0.0484, "step": 4134 }, { "epoch": 1.0197238658777121, "grad_norm": 0.369140625, "learning_rate": 0.00019349551931071023, "loss": 0.0545, "step": 4136 }, { "epoch": 1.0202169625246549, "grad_norm": 0.337890625, "learning_rate": 0.00019348330108542286, "loss": 0.0468, "step": 4138 }, { "epoch": 1.0207100591715976, "grad_norm": 0.40234375, "learning_rate": 0.00019347107178184234, "loss": 0.0528, "step": 4140 }, { "epoch": 1.0212031558185404, "grad_norm": 0.353515625, "learning_rate": 0.00019345883140141793, "loss": 0.0454, "step": 4142 }, { "epoch": 1.0216962524654831, "grad_norm": 0.3359375, "learning_rate": 0.00019344657994560023, "loss": 0.0472, "step": 4144 }, { "epoch": 1.022189349112426, "grad_norm": 0.28515625, "learning_rate": 0.000193434317415841, "loss": 0.0483, "step": 4146 }, { "epoch": 1.0226824457593688, "grad_norm": 0.3515625, "learning_rate": 0.00019342204381359355, "loss": 0.0449, "step": 4148 }, { "epoch": 1.0231755424063116, "grad_norm": 0.41796875, "learning_rate": 0.0001934097591403123, "loss": 0.0551, "step": 4150 }, { "epoch": 1.0236686390532543, "grad_norm": 0.3515625, "learning_rate": 0.00019339746339745305, "loss": 0.0505, "step": 4152 }, { "epoch": 1.0241617357001973, "grad_norm": 0.43359375, "learning_rate": 0.00019338515658647292, "loss": 0.0474, "step": 4154 }, { "epoch": 1.02465483234714, "grad_norm": 0.365234375, "learning_rate": 0.0001933728387088303, "loss": 0.0471, "step": 4156 }, { "epoch": 1.0251479289940828, "grad_norm": 0.333984375, "learning_rate": 0.00019336050976598502, "loss": 0.0469, "step": 4158 }, { "epoch": 1.0256410256410255, "grad_norm": 0.302734375, "learning_rate": 0.00019334816975939803, "loss": 0.0447, "step": 4160 }, { "epoch": 1.0261341222879685, "grad_norm": 0.5078125, "learning_rate": 0.00019333581869053178, "loss": 0.0494, "step": 4162 }, { "epoch": 1.0266272189349113, "grad_norm": 0.314453125, "learning_rate": 0.00019332345656084988, "loss": 0.0477, "step": 4164 }, { "epoch": 1.027120315581854, "grad_norm": 0.337890625, "learning_rate": 0.0001933110833718173, "loss": 0.043, "step": 4166 }, { "epoch": 1.0276134122287968, "grad_norm": 0.318359375, "learning_rate": 0.0001932986991249004, "loss": 0.051, "step": 4168 }, { "epoch": 1.0281065088757397, "grad_norm": 0.353515625, "learning_rate": 0.0001932863038215667, "loss": 0.0443, "step": 4170 }, { "epoch": 1.0285996055226825, "grad_norm": 0.42578125, "learning_rate": 0.00019327389746328518, "loss": 0.0553, "step": 4172 }, { "epoch": 1.0290927021696252, "grad_norm": 0.353515625, "learning_rate": 0.00019326148005152606, "loss": 0.0435, "step": 4174 }, { "epoch": 1.029585798816568, "grad_norm": 0.46484375, "learning_rate": 0.00019324905158776078, "loss": 0.0429, "step": 4176 }, { "epoch": 1.030078895463511, "grad_norm": 0.25390625, "learning_rate": 0.00019323661207346229, "loss": 0.0455, "step": 4178 }, { "epoch": 1.0305719921104537, "grad_norm": 0.306640625, "learning_rate": 0.0001932241615101047, "loss": 0.049, "step": 4180 }, { "epoch": 1.0310650887573964, "grad_norm": 0.51953125, "learning_rate": 0.0001932116998991635, "loss": 0.0501, "step": 4182 }, { "epoch": 1.0315581854043392, "grad_norm": 0.4375, "learning_rate": 0.0001931992272421154, "loss": 0.0428, "step": 4184 }, { "epoch": 1.032051282051282, "grad_norm": 0.5078125, "learning_rate": 0.0001931867435404385, "loss": 0.046, "step": 4186 }, { "epoch": 1.032544378698225, "grad_norm": 0.388671875, "learning_rate": 0.00019317424879561224, "loss": 0.0476, "step": 4188 }, { "epoch": 1.0330374753451677, "grad_norm": 0.5, "learning_rate": 0.00019316174300911727, "loss": 0.0447, "step": 4190 }, { "epoch": 1.0335305719921104, "grad_norm": 0.384765625, "learning_rate": 0.0001931492261824356, "loss": 0.0467, "step": 4192 }, { "epoch": 1.0340236686390532, "grad_norm": 0.376953125, "learning_rate": 0.00019313669831705054, "loss": 0.0454, "step": 4194 }, { "epoch": 1.0345167652859961, "grad_norm": 0.4375, "learning_rate": 0.0001931241594144467, "loss": 0.0495, "step": 4196 }, { "epoch": 1.0350098619329389, "grad_norm": 0.306640625, "learning_rate": 0.00019311160947611006, "loss": 0.0468, "step": 4198 }, { "epoch": 1.0355029585798816, "grad_norm": 0.384765625, "learning_rate": 0.00019309904850352784, "loss": 0.0446, "step": 4200 }, { "epoch": 1.0359960552268244, "grad_norm": 0.28125, "learning_rate": 0.00019308647649818856, "loss": 0.0459, "step": 4202 }, { "epoch": 1.0364891518737673, "grad_norm": 0.408203125, "learning_rate": 0.0001930738934615821, "loss": 0.0449, "step": 4204 }, { "epoch": 1.03698224852071, "grad_norm": 0.306640625, "learning_rate": 0.0001930612993951996, "loss": 0.0498, "step": 4206 }, { "epoch": 1.0374753451676528, "grad_norm": 0.294921875, "learning_rate": 0.00019304869430053351, "loss": 0.0436, "step": 4208 }, { "epoch": 1.0379684418145956, "grad_norm": 0.376953125, "learning_rate": 0.00019303607817907766, "loss": 0.0392, "step": 4210 }, { "epoch": 1.0384615384615385, "grad_norm": 0.26953125, "learning_rate": 0.00019302345103232709, "loss": 0.0441, "step": 4212 }, { "epoch": 1.0389546351084813, "grad_norm": 0.34765625, "learning_rate": 0.00019301081286177818, "loss": 0.0433, "step": 4214 }, { "epoch": 1.039447731755424, "grad_norm": 0.34765625, "learning_rate": 0.00019299816366892864, "loss": 0.0406, "step": 4216 }, { "epoch": 1.0399408284023668, "grad_norm": 0.3046875, "learning_rate": 0.00019298550345527748, "loss": 0.0501, "step": 4218 }, { "epoch": 1.0404339250493098, "grad_norm": 0.349609375, "learning_rate": 0.000192972832222325, "loss": 0.0427, "step": 4220 }, { "epoch": 1.0409270216962525, "grad_norm": 0.298828125, "learning_rate": 0.0001929601499715728, "loss": 0.0413, "step": 4222 }, { "epoch": 1.0414201183431953, "grad_norm": 0.39453125, "learning_rate": 0.00019294745670452384, "loss": 0.0473, "step": 4224 }, { "epoch": 1.041913214990138, "grad_norm": 0.41015625, "learning_rate": 0.00019293475242268223, "loss": 0.043, "step": 4226 }, { "epoch": 1.042406311637081, "grad_norm": 0.259765625, "learning_rate": 0.00019292203712755366, "loss": 0.0456, "step": 4228 }, { "epoch": 1.0428994082840237, "grad_norm": 0.380859375, "learning_rate": 0.00019290931082064484, "loss": 0.0435, "step": 4230 }, { "epoch": 1.0433925049309665, "grad_norm": 0.392578125, "learning_rate": 0.00019289657350346395, "loss": 0.0489, "step": 4232 }, { "epoch": 1.0438856015779092, "grad_norm": 0.5625, "learning_rate": 0.00019288382517752045, "loss": 0.0459, "step": 4234 }, { "epoch": 1.044378698224852, "grad_norm": 0.275390625, "learning_rate": 0.00019287106584432503, "loss": 0.0447, "step": 4236 }, { "epoch": 1.044871794871795, "grad_norm": 0.33203125, "learning_rate": 0.00019285829550538984, "loss": 0.0474, "step": 4238 }, { "epoch": 1.0453648915187377, "grad_norm": 0.328125, "learning_rate": 0.00019284551416222812, "loss": 0.0466, "step": 4240 }, { "epoch": 1.0458579881656804, "grad_norm": 0.380859375, "learning_rate": 0.0001928327218163546, "loss": 0.046, "step": 4242 }, { "epoch": 1.0463510848126232, "grad_norm": 0.5078125, "learning_rate": 0.00019281991846928527, "loss": 0.0458, "step": 4244 }, { "epoch": 1.0468441814595661, "grad_norm": 0.41796875, "learning_rate": 0.00019280710412253733, "loss": 0.0468, "step": 4246 }, { "epoch": 1.047337278106509, "grad_norm": 0.458984375, "learning_rate": 0.00019279427877762938, "loss": 0.0429, "step": 4248 }, { "epoch": 1.0478303747534516, "grad_norm": 0.35546875, "learning_rate": 0.0001927814424360813, "loss": 0.0416, "step": 4250 }, { "epoch": 1.0483234714003944, "grad_norm": 0.353515625, "learning_rate": 0.0001927685950994143, "loss": 0.048, "step": 4252 }, { "epoch": 1.0488165680473374, "grad_norm": 0.2490234375, "learning_rate": 0.0001927557367691508, "loss": 0.0389, "step": 4254 }, { "epoch": 1.04930966469428, "grad_norm": 0.44921875, "learning_rate": 0.00019274286744681463, "loss": 0.0434, "step": 4256 }, { "epoch": 1.0498027613412229, "grad_norm": 0.39453125, "learning_rate": 0.00019272998713393082, "loss": 0.0471, "step": 4258 }, { "epoch": 1.0502958579881656, "grad_norm": 0.337890625, "learning_rate": 0.00019271709583202582, "loss": 0.0438, "step": 4260 }, { "epoch": 1.0507889546351086, "grad_norm": 0.298828125, "learning_rate": 0.0001927041935426273, "loss": 0.0451, "step": 4262 }, { "epoch": 1.0512820512820513, "grad_norm": 0.330078125, "learning_rate": 0.00019269128026726425, "loss": 0.0452, "step": 4264 }, { "epoch": 1.051775147928994, "grad_norm": 0.470703125, "learning_rate": 0.00019267835600746697, "loss": 0.0446, "step": 4266 }, { "epoch": 1.0522682445759368, "grad_norm": 0.458984375, "learning_rate": 0.00019266542076476704, "loss": 0.0444, "step": 4268 }, { "epoch": 1.0527613412228798, "grad_norm": 0.5, "learning_rate": 0.00019265247454069737, "loss": 0.0477, "step": 4270 }, { "epoch": 1.0532544378698225, "grad_norm": 0.396484375, "learning_rate": 0.00019263951733679218, "loss": 0.0452, "step": 4272 }, { "epoch": 1.0537475345167653, "grad_norm": 0.369140625, "learning_rate": 0.00019262654915458693, "loss": 0.0415, "step": 4274 }, { "epoch": 1.054240631163708, "grad_norm": 0.2890625, "learning_rate": 0.00019261356999561845, "loss": 0.0447, "step": 4276 }, { "epoch": 1.054733727810651, "grad_norm": 0.3671875, "learning_rate": 0.00019260057986142485, "loss": 0.0472, "step": 4278 }, { "epoch": 1.0552268244575937, "grad_norm": 0.5078125, "learning_rate": 0.00019258757875354553, "loss": 0.0403, "step": 4280 }, { "epoch": 1.0557199211045365, "grad_norm": 0.365234375, "learning_rate": 0.00019257456667352117, "loss": 0.0434, "step": 4282 }, { "epoch": 1.0562130177514792, "grad_norm": 0.412109375, "learning_rate": 0.0001925615436228938, "loss": 0.0478, "step": 4284 }, { "epoch": 1.056706114398422, "grad_norm": 0.30078125, "learning_rate": 0.0001925485096032067, "loss": 0.0473, "step": 4286 }, { "epoch": 1.057199211045365, "grad_norm": 0.302734375, "learning_rate": 0.0001925354646160045, "loss": 0.0448, "step": 4288 }, { "epoch": 1.0576923076923077, "grad_norm": 0.408203125, "learning_rate": 0.00019252240866283308, "loss": 0.0442, "step": 4290 }, { "epoch": 1.0581854043392505, "grad_norm": 0.3125, "learning_rate": 0.00019250934174523966, "loss": 0.0408, "step": 4292 }, { "epoch": 1.0586785009861932, "grad_norm": 0.369140625, "learning_rate": 0.00019249626386477273, "loss": 0.0417, "step": 4294 }, { "epoch": 1.0591715976331362, "grad_norm": 0.287109375, "learning_rate": 0.0001924831750229821, "loss": 0.041, "step": 4296 }, { "epoch": 1.059664694280079, "grad_norm": 0.294921875, "learning_rate": 0.00019247007522141886, "loss": 0.0433, "step": 4298 }, { "epoch": 1.0601577909270217, "grad_norm": 0.37109375, "learning_rate": 0.00019245696446163545, "loss": 0.0458, "step": 4300 }, { "epoch": 1.0606508875739644, "grad_norm": 0.3203125, "learning_rate": 0.00019244384274518553, "loss": 0.0382, "step": 4302 }, { "epoch": 1.0611439842209074, "grad_norm": 0.314453125, "learning_rate": 0.00019243071007362408, "loss": 0.0476, "step": 4304 }, { "epoch": 1.0616370808678501, "grad_norm": 0.3828125, "learning_rate": 0.00019241756644850744, "loss": 0.0434, "step": 4306 }, { "epoch": 1.0621301775147929, "grad_norm": 0.3671875, "learning_rate": 0.00019240441187139316, "loss": 0.0475, "step": 4308 }, { "epoch": 1.0626232741617356, "grad_norm": 0.416015625, "learning_rate": 0.0001923912463438402, "loss": 0.0439, "step": 4310 }, { "epoch": 1.0631163708086786, "grad_norm": 0.326171875, "learning_rate": 0.00019237806986740865, "loss": 0.0408, "step": 4312 }, { "epoch": 1.0636094674556213, "grad_norm": 0.408203125, "learning_rate": 0.00019236488244366007, "loss": 0.0418, "step": 4314 }, { "epoch": 1.064102564102564, "grad_norm": 0.38671875, "learning_rate": 0.0001923516840741572, "loss": 0.0449, "step": 4316 }, { "epoch": 1.0645956607495068, "grad_norm": 0.380859375, "learning_rate": 0.0001923384747604642, "loss": 0.0396, "step": 4318 }, { "epoch": 1.0650887573964498, "grad_norm": 0.423828125, "learning_rate": 0.00019232525450414633, "loss": 0.046, "step": 4320 }, { "epoch": 1.0655818540433926, "grad_norm": 0.3125, "learning_rate": 0.00019231202330677034, "loss": 0.0442, "step": 4322 }, { "epoch": 1.0660749506903353, "grad_norm": 0.33203125, "learning_rate": 0.0001922987811699042, "loss": 0.0382, "step": 4324 }, { "epoch": 1.066568047337278, "grad_norm": 0.37109375, "learning_rate": 0.00019228552809511714, "loss": 0.0422, "step": 4326 }, { "epoch": 1.0670611439842208, "grad_norm": 0.3515625, "learning_rate": 0.00019227226408397976, "loss": 0.046, "step": 4328 }, { "epoch": 1.0675542406311638, "grad_norm": 0.29296875, "learning_rate": 0.00019225898913806388, "loss": 0.0407, "step": 4330 }, { "epoch": 1.0680473372781065, "grad_norm": 0.36328125, "learning_rate": 0.00019224570325894268, "loss": 0.0445, "step": 4332 }, { "epoch": 1.0685404339250493, "grad_norm": 0.267578125, "learning_rate": 0.00019223240644819061, "loss": 0.0487, "step": 4334 }, { "epoch": 1.069033530571992, "grad_norm": 0.3125, "learning_rate": 0.00019221909870738345, "loss": 0.0414, "step": 4336 }, { "epoch": 1.069526627218935, "grad_norm": 0.26953125, "learning_rate": 0.0001922057800380982, "loss": 0.0416, "step": 4338 }, { "epoch": 1.0700197238658777, "grad_norm": 0.373046875, "learning_rate": 0.00019219245044191317, "loss": 0.0403, "step": 4340 }, { "epoch": 1.0705128205128205, "grad_norm": 0.298828125, "learning_rate": 0.00019217910992040807, "loss": 0.0413, "step": 4342 }, { "epoch": 1.0710059171597632, "grad_norm": 0.345703125, "learning_rate": 0.00019216575847516375, "loss": 0.0414, "step": 4344 }, { "epoch": 1.0714990138067062, "grad_norm": 0.400390625, "learning_rate": 0.0001921523961077625, "loss": 0.0454, "step": 4346 }, { "epoch": 1.071992110453649, "grad_norm": 0.302734375, "learning_rate": 0.00019213902281978775, "loss": 0.043, "step": 4348 }, { "epoch": 1.0724852071005917, "grad_norm": 0.31640625, "learning_rate": 0.00019212563861282437, "loss": 0.0432, "step": 4350 }, { "epoch": 1.0729783037475344, "grad_norm": 0.353515625, "learning_rate": 0.00019211224348845846, "loss": 0.0472, "step": 4352 }, { "epoch": 1.0734714003944774, "grad_norm": 0.271484375, "learning_rate": 0.00019209883744827742, "loss": 0.0427, "step": 4354 }, { "epoch": 1.0739644970414202, "grad_norm": 0.275390625, "learning_rate": 0.0001920854204938699, "loss": 0.0431, "step": 4356 }, { "epoch": 1.074457593688363, "grad_norm": 0.30859375, "learning_rate": 0.00019207199262682593, "loss": 0.0433, "step": 4358 }, { "epoch": 1.0749506903353057, "grad_norm": 0.275390625, "learning_rate": 0.00019205855384873675, "loss": 0.044, "step": 4360 }, { "epoch": 1.0754437869822486, "grad_norm": 0.294921875, "learning_rate": 0.00019204510416119498, "loss": 0.0374, "step": 4362 }, { "epoch": 1.0759368836291914, "grad_norm": 0.478515625, "learning_rate": 0.00019203164356579447, "loss": 0.0442, "step": 4364 }, { "epoch": 1.0764299802761341, "grad_norm": 0.361328125, "learning_rate": 0.0001920181720641303, "loss": 0.0448, "step": 4366 }, { "epoch": 1.0769230769230769, "grad_norm": 0.37109375, "learning_rate": 0.00019200468965779904, "loss": 0.0419, "step": 4368 }, { "epoch": 1.0774161735700196, "grad_norm": 0.328125, "learning_rate": 0.00019199119634839831, "loss": 0.0461, "step": 4370 }, { "epoch": 1.0779092702169626, "grad_norm": 0.30859375, "learning_rate": 0.00019197769213752722, "loss": 0.0433, "step": 4372 }, { "epoch": 1.0784023668639053, "grad_norm": 0.3046875, "learning_rate": 0.00019196417702678606, "loss": 0.0449, "step": 4374 }, { "epoch": 1.078895463510848, "grad_norm": 0.345703125, "learning_rate": 0.0001919506510177765, "loss": 0.0424, "step": 4376 }, { "epoch": 1.0793885601577908, "grad_norm": 0.318359375, "learning_rate": 0.00019193711411210138, "loss": 0.0434, "step": 4378 }, { "epoch": 1.0798816568047338, "grad_norm": 0.291015625, "learning_rate": 0.00019192356631136492, "loss": 0.0441, "step": 4380 }, { "epoch": 1.0803747534516766, "grad_norm": 0.24609375, "learning_rate": 0.00019191000761717265, "loss": 0.0426, "step": 4382 }, { "epoch": 1.0808678500986193, "grad_norm": 0.2890625, "learning_rate": 0.00019189643803113127, "loss": 0.0387, "step": 4384 }, { "epoch": 1.081360946745562, "grad_norm": 0.310546875, "learning_rate": 0.00019188285755484893, "loss": 0.0418, "step": 4386 }, { "epoch": 1.081854043392505, "grad_norm": 0.31640625, "learning_rate": 0.00019186926618993498, "loss": 0.046, "step": 4388 }, { "epoch": 1.0823471400394478, "grad_norm": 0.3046875, "learning_rate": 0.000191855663938, "loss": 0.0436, "step": 4390 }, { "epoch": 1.0828402366863905, "grad_norm": 0.287109375, "learning_rate": 0.00019184205080065604, "loss": 0.0424, "step": 4392 }, { "epoch": 1.0833333333333333, "grad_norm": 0.30078125, "learning_rate": 0.00019182842677951625, "loss": 0.0408, "step": 4394 }, { "epoch": 1.0838264299802762, "grad_norm": 0.302734375, "learning_rate": 0.00019181479187619518, "loss": 0.0475, "step": 4396 }, { "epoch": 1.084319526627219, "grad_norm": 0.33203125, "learning_rate": 0.00019180114609230865, "loss": 0.043, "step": 4398 }, { "epoch": 1.0848126232741617, "grad_norm": 0.30859375, "learning_rate": 0.00019178748942947377, "loss": 0.044, "step": 4400 }, { "epoch": 1.0853057199211045, "grad_norm": 0.296875, "learning_rate": 0.0001917738218893089, "loss": 0.0458, "step": 4402 }, { "epoch": 1.0857988165680474, "grad_norm": 0.40234375, "learning_rate": 0.00019176014347343373, "loss": 0.0433, "step": 4404 }, { "epoch": 1.0862919132149902, "grad_norm": 0.26171875, "learning_rate": 0.00019174645418346925, "loss": 0.0448, "step": 4406 }, { "epoch": 1.086785009861933, "grad_norm": 0.451171875, "learning_rate": 0.00019173275402103773, "loss": 0.0421, "step": 4408 }, { "epoch": 1.0872781065088757, "grad_norm": 0.265625, "learning_rate": 0.00019171904298776265, "loss": 0.0387, "step": 4410 }, { "epoch": 1.0877712031558184, "grad_norm": 0.296875, "learning_rate": 0.00019170532108526888, "loss": 0.0441, "step": 4412 }, { "epoch": 1.0882642998027614, "grad_norm": 0.333984375, "learning_rate": 0.00019169158831518256, "loss": 0.0385, "step": 4414 }, { "epoch": 1.0887573964497042, "grad_norm": 0.31640625, "learning_rate": 0.00019167784467913107, "loss": 0.0457, "step": 4416 }, { "epoch": 1.089250493096647, "grad_norm": 0.392578125, "learning_rate": 0.00019166409017874313, "loss": 0.0468, "step": 4418 }, { "epoch": 1.0897435897435896, "grad_norm": 0.404296875, "learning_rate": 0.00019165032481564874, "loss": 0.042, "step": 4420 }, { "epoch": 1.0902366863905326, "grad_norm": 0.2890625, "learning_rate": 0.0001916365485914791, "loss": 0.0403, "step": 4422 }, { "epoch": 1.0907297830374754, "grad_norm": 0.2392578125, "learning_rate": 0.00019162276150786687, "loss": 0.0438, "step": 4424 }, { "epoch": 1.0912228796844181, "grad_norm": 0.46875, "learning_rate": 0.00019160896356644583, "loss": 0.0434, "step": 4426 }, { "epoch": 1.0917159763313609, "grad_norm": 0.45703125, "learning_rate": 0.00019159515476885113, "loss": 0.0385, "step": 4428 }, { "epoch": 1.0922090729783038, "grad_norm": 0.322265625, "learning_rate": 0.00019158133511671917, "loss": 0.04, "step": 4430 }, { "epoch": 1.0927021696252466, "grad_norm": 0.46484375, "learning_rate": 0.00019156750461168768, "loss": 0.0415, "step": 4432 }, { "epoch": 1.0931952662721893, "grad_norm": 0.34375, "learning_rate": 0.00019155366325539563, "loss": 0.043, "step": 4434 }, { "epoch": 1.093688362919132, "grad_norm": 0.439453125, "learning_rate": 0.00019153981104948336, "loss": 0.043, "step": 4436 }, { "epoch": 1.094181459566075, "grad_norm": 0.291015625, "learning_rate": 0.00019152594799559233, "loss": 0.0436, "step": 4438 }, { "epoch": 1.0946745562130178, "grad_norm": 0.408203125, "learning_rate": 0.0001915120740953655, "loss": 0.0448, "step": 4440 }, { "epoch": 1.0951676528599605, "grad_norm": 0.5, "learning_rate": 0.00019149818935044693, "loss": 0.042, "step": 4442 }, { "epoch": 1.0956607495069033, "grad_norm": 0.5078125, "learning_rate": 0.00019148429376248207, "loss": 0.0416, "step": 4444 }, { "epoch": 1.0961538461538463, "grad_norm": 0.31640625, "learning_rate": 0.0001914703873331176, "loss": 0.0445, "step": 4446 }, { "epoch": 1.096646942800789, "grad_norm": 0.37109375, "learning_rate": 0.00019145647006400154, "loss": 0.0515, "step": 4448 }, { "epoch": 1.0971400394477318, "grad_norm": 0.341796875, "learning_rate": 0.00019144254195678314, "loss": 0.0417, "step": 4450 }, { "epoch": 1.0976331360946745, "grad_norm": 0.341796875, "learning_rate": 0.000191428603013113, "loss": 0.0422, "step": 4452 }, { "epoch": 1.0981262327416172, "grad_norm": 0.3984375, "learning_rate": 0.00019141465323464286, "loss": 0.0492, "step": 4454 }, { "epoch": 1.0986193293885602, "grad_norm": 0.29296875, "learning_rate": 0.00019140069262302598, "loss": 0.0487, "step": 4456 }, { "epoch": 1.099112426035503, "grad_norm": 0.328125, "learning_rate": 0.0001913867211799167, "loss": 0.0397, "step": 4458 }, { "epoch": 1.0996055226824457, "grad_norm": 0.29296875, "learning_rate": 0.0001913727389069707, "loss": 0.0376, "step": 4460 }, { "epoch": 1.1000986193293885, "grad_norm": 0.337890625, "learning_rate": 0.00019135874580584499, "loss": 0.0481, "step": 4462 }, { "epoch": 1.1005917159763314, "grad_norm": 0.423828125, "learning_rate": 0.00019134474187819782, "loss": 0.0448, "step": 4464 }, { "epoch": 1.1010848126232742, "grad_norm": 0.328125, "learning_rate": 0.00019133072712568873, "loss": 0.039, "step": 4466 }, { "epoch": 1.101577909270217, "grad_norm": 0.384765625, "learning_rate": 0.00019131670154997856, "loss": 0.039, "step": 4468 }, { "epoch": 1.1020710059171597, "grad_norm": 0.298828125, "learning_rate": 0.00019130266515272938, "loss": 0.042, "step": 4470 }, { "epoch": 1.1025641025641026, "grad_norm": 0.306640625, "learning_rate": 0.00019128861793560464, "loss": 0.0525, "step": 4472 }, { "epoch": 1.1030571992110454, "grad_norm": 0.279296875, "learning_rate": 0.00019127455990026898, "loss": 0.0426, "step": 4474 }, { "epoch": 1.1035502958579881, "grad_norm": 0.31640625, "learning_rate": 0.00019126049104838837, "loss": 0.04, "step": 4476 }, { "epoch": 1.1040433925049309, "grad_norm": 0.380859375, "learning_rate": 0.00019124641138163002, "loss": 0.0548, "step": 4478 }, { "epoch": 1.1045364891518739, "grad_norm": 0.37109375, "learning_rate": 0.00019123232090166248, "loss": 0.0555, "step": 4480 }, { "epoch": 1.1050295857988166, "grad_norm": 0.40625, "learning_rate": 0.00019121821961015553, "loss": 0.0422, "step": 4482 }, { "epoch": 1.1055226824457594, "grad_norm": 0.40625, "learning_rate": 0.00019120410750878027, "loss": 0.0372, "step": 4484 }, { "epoch": 1.106015779092702, "grad_norm": 0.48046875, "learning_rate": 0.00019118998459920902, "loss": 0.0516, "step": 4486 }, { "epoch": 1.106508875739645, "grad_norm": 0.52734375, "learning_rate": 0.00019117585088311548, "loss": 0.0574, "step": 4488 }, { "epoch": 1.1070019723865878, "grad_norm": 0.380859375, "learning_rate": 0.00019116170636217453, "loss": 0.0427, "step": 4490 }, { "epoch": 1.1074950690335306, "grad_norm": 0.302734375, "learning_rate": 0.00019114755103806242, "loss": 0.0416, "step": 4492 }, { "epoch": 1.1079881656804733, "grad_norm": 0.3828125, "learning_rate": 0.00019113338491245658, "loss": 0.0554, "step": 4494 }, { "epoch": 1.108481262327416, "grad_norm": 0.384765625, "learning_rate": 0.00019111920798703584, "loss": 0.0547, "step": 4496 }, { "epoch": 1.108974358974359, "grad_norm": 0.41796875, "learning_rate": 0.0001911050202634802, "loss": 0.0446, "step": 4498 }, { "epoch": 1.1094674556213018, "grad_norm": 0.36328125, "learning_rate": 0.00019109082174347097, "loss": 0.0465, "step": 4500 }, { "epoch": 1.1099605522682445, "grad_norm": 0.470703125, "learning_rate": 0.00019107661242869078, "loss": 0.0568, "step": 4502 }, { "epoch": 1.1104536489151873, "grad_norm": 0.51171875, "learning_rate": 0.00019106239232082354, "loss": 0.0678, "step": 4504 }, { "epoch": 1.1109467455621302, "grad_norm": 0.35546875, "learning_rate": 0.00019104816142155438, "loss": 0.0456, "step": 4506 }, { "epoch": 1.111439842209073, "grad_norm": 0.458984375, "learning_rate": 0.0001910339197325697, "loss": 0.0479, "step": 4508 }, { "epoch": 1.1119329388560157, "grad_norm": 0.4453125, "learning_rate": 0.00019101966725555732, "loss": 0.0581, "step": 4510 }, { "epoch": 1.1124260355029585, "grad_norm": 0.498046875, "learning_rate": 0.00019100540399220615, "loss": 0.0602, "step": 4512 }, { "epoch": 1.1129191321499015, "grad_norm": 0.314453125, "learning_rate": 0.0001909911299442065, "loss": 0.0435, "step": 4514 }, { "epoch": 1.1134122287968442, "grad_norm": 0.4140625, "learning_rate": 0.00019097684511324993, "loss": 0.0421, "step": 4516 }, { "epoch": 1.113905325443787, "grad_norm": 0.349609375, "learning_rate": 0.00019096254950102927, "loss": 0.0583, "step": 4518 }, { "epoch": 1.1143984220907297, "grad_norm": 0.37109375, "learning_rate": 0.0001909482431092386, "loss": 0.0575, "step": 4520 }, { "epoch": 1.1148915187376727, "grad_norm": 0.423828125, "learning_rate": 0.00019093392593957334, "loss": 0.0454, "step": 4522 }, { "epoch": 1.1153846153846154, "grad_norm": 0.330078125, "learning_rate": 0.00019091959799373014, "loss": 0.0446, "step": 4524 }, { "epoch": 1.1158777120315582, "grad_norm": 0.375, "learning_rate": 0.00019090525927340695, "loss": 0.0555, "step": 4526 }, { "epoch": 1.116370808678501, "grad_norm": 0.373046875, "learning_rate": 0.00019089090978030302, "loss": 0.0619, "step": 4528 }, { "epoch": 1.1168639053254439, "grad_norm": 0.515625, "learning_rate": 0.00019087654951611876, "loss": 0.04, "step": 4530 }, { "epoch": 1.1173570019723866, "grad_norm": 0.271484375, "learning_rate": 0.00019086217848255602, "loss": 0.0546, "step": 4532 }, { "epoch": 1.1178500986193294, "grad_norm": 0.412109375, "learning_rate": 0.00019084779668131782, "loss": 0.0586, "step": 4534 }, { "epoch": 1.1183431952662721, "grad_norm": 0.30859375, "learning_rate": 0.00019083340411410848, "loss": 0.0548, "step": 4536 }, { "epoch": 1.1188362919132149, "grad_norm": 0.49609375, "learning_rate": 0.0001908190007826336, "loss": 0.0587, "step": 4538 }, { "epoch": 1.1193293885601578, "grad_norm": 0.3125, "learning_rate": 0.00019080458668860008, "loss": 0.0457, "step": 4540 }, { "epoch": 1.1198224852071006, "grad_norm": 0.419921875, "learning_rate": 0.00019079016183371603, "loss": 0.059, "step": 4542 }, { "epoch": 1.1203155818540433, "grad_norm": 0.416015625, "learning_rate": 0.0001907757262196909, "loss": 0.0568, "step": 4544 }, { "epoch": 1.1208086785009863, "grad_norm": 0.279296875, "learning_rate": 0.0001907612798482354, "loss": 0.0469, "step": 4546 }, { "epoch": 1.121301775147929, "grad_norm": 0.40234375, "learning_rate": 0.00019074682272106146, "loss": 0.0503, "step": 4548 }, { "epoch": 1.1217948717948718, "grad_norm": 0.359375, "learning_rate": 0.0001907323548398824, "loss": 0.0612, "step": 4550 }, { "epoch": 1.1222879684418146, "grad_norm": 0.53515625, "learning_rate": 0.00019071787620641265, "loss": 0.0612, "step": 4552 }, { "epoch": 1.1227810650887573, "grad_norm": 0.3203125, "learning_rate": 0.00019070338682236813, "loss": 0.055, "step": 4554 }, { "epoch": 1.1232741617357003, "grad_norm": 0.375, "learning_rate": 0.00019068888668946583, "loss": 0.0516, "step": 4556 }, { "epoch": 1.123767258382643, "grad_norm": 0.6796875, "learning_rate": 0.00019067437580942408, "loss": 0.061, "step": 4558 }, { "epoch": 1.1242603550295858, "grad_norm": 0.42578125, "learning_rate": 0.00019065985418396258, "loss": 0.0594, "step": 4560 }, { "epoch": 1.1247534516765285, "grad_norm": 0.375, "learning_rate": 0.00019064532181480216, "loss": 0.0588, "step": 4562 }, { "epoch": 1.1252465483234715, "grad_norm": 0.5625, "learning_rate": 0.000190630778703665, "loss": 0.0541, "step": 4564 }, { "epoch": 1.1257396449704142, "grad_norm": 0.44921875, "learning_rate": 0.00019061622485227458, "loss": 0.0606, "step": 4566 }, { "epoch": 1.126232741617357, "grad_norm": 0.427734375, "learning_rate": 0.00019060166026235552, "loss": 0.0621, "step": 4568 }, { "epoch": 1.1267258382642997, "grad_norm": 0.400390625, "learning_rate": 0.00019058708493563392, "loss": 0.0608, "step": 4570 }, { "epoch": 1.1272189349112427, "grad_norm": 0.310546875, "learning_rate": 0.00019057249887383695, "loss": 0.0631, "step": 4572 }, { "epoch": 1.1277120315581854, "grad_norm": 0.54296875, "learning_rate": 0.00019055790207869318, "loss": 0.0638, "step": 4574 }, { "epoch": 1.1282051282051282, "grad_norm": 0.423828125, "learning_rate": 0.0001905432945519324, "loss": 0.0711, "step": 4576 }, { "epoch": 1.128698224852071, "grad_norm": 0.439453125, "learning_rate": 0.00019052867629528567, "loss": 0.0579, "step": 4578 }, { "epoch": 1.1291913214990137, "grad_norm": 0.396484375, "learning_rate": 0.0001905140473104854, "loss": 0.0574, "step": 4580 }, { "epoch": 1.1296844181459567, "grad_norm": 0.400390625, "learning_rate": 0.0001904994075992651, "loss": 0.0592, "step": 4582 }, { "epoch": 1.1301775147928994, "grad_norm": 0.38671875, "learning_rate": 0.00019048475716335975, "loss": 0.0643, "step": 4584 }, { "epoch": 1.1306706114398422, "grad_norm": 0.38671875, "learning_rate": 0.00019047009600450546, "loss": 0.0562, "step": 4586 }, { "epoch": 1.1311637080867851, "grad_norm": 0.51171875, "learning_rate": 0.00019045542412443972, "loss": 0.051, "step": 4588 }, { "epoch": 1.1316568047337279, "grad_norm": 0.451171875, "learning_rate": 0.00019044074152490113, "loss": 0.0605, "step": 4590 }, { "epoch": 1.1321499013806706, "grad_norm": 0.6875, "learning_rate": 0.00019042604820762975, "loss": 0.0672, "step": 4592 }, { "epoch": 1.1326429980276134, "grad_norm": 0.609375, "learning_rate": 0.00019041134417436674, "loss": 0.059, "step": 4594 }, { "epoch": 1.1331360946745561, "grad_norm": 0.4609375, "learning_rate": 0.00019039662942685467, "loss": 0.0588, "step": 4596 }, { "epoch": 1.133629191321499, "grad_norm": 0.640625, "learning_rate": 0.0001903819039668373, "loss": 0.0636, "step": 4598 }, { "epoch": 1.1341222879684418, "grad_norm": 0.7734375, "learning_rate": 0.0001903671677960597, "loss": 0.0673, "step": 4600 }, { "epoch": 1.1346153846153846, "grad_norm": 0.9140625, "learning_rate": 0.00019035242091626814, "loss": 0.0736, "step": 4602 }, { "epoch": 1.1351084812623273, "grad_norm": 0.48828125, "learning_rate": 0.00019033766332921028, "loss": 0.064, "step": 4604 }, { "epoch": 1.1356015779092703, "grad_norm": 0.7109375, "learning_rate": 0.0001903228950366349, "loss": 0.066, "step": 4606 }, { "epoch": 1.136094674556213, "grad_norm": 0.423828125, "learning_rate": 0.00019030811604029218, "loss": 0.065, "step": 4608 }, { "epoch": 1.1365877712031558, "grad_norm": 0.396484375, "learning_rate": 0.0001902933263419335, "loss": 0.0618, "step": 4610 }, { "epoch": 1.1370808678500985, "grad_norm": 0.3828125, "learning_rate": 0.00019027852594331147, "loss": 0.06, "step": 4612 }, { "epoch": 1.1375739644970415, "grad_norm": 0.54296875, "learning_rate": 0.0001902637148461801, "loss": 0.0709, "step": 4614 }, { "epoch": 1.1380670611439843, "grad_norm": 0.421875, "learning_rate": 0.00019024889305229456, "loss": 0.0653, "step": 4616 }, { "epoch": 1.138560157790927, "grad_norm": 0.515625, "learning_rate": 0.00019023406056341128, "loss": 0.0576, "step": 4618 }, { "epoch": 1.1390532544378698, "grad_norm": 0.40234375, "learning_rate": 0.00019021921738128806, "loss": 0.0599, "step": 4620 }, { "epoch": 1.1395463510848125, "grad_norm": 0.490234375, "learning_rate": 0.00019020436350768383, "loss": 0.0652, "step": 4622 }, { "epoch": 1.1400394477317555, "grad_norm": 0.380859375, "learning_rate": 0.0001901894989443589, "loss": 0.0719, "step": 4624 }, { "epoch": 1.1405325443786982, "grad_norm": 0.431640625, "learning_rate": 0.00019017462369307476, "loss": 0.0631, "step": 4626 }, { "epoch": 1.141025641025641, "grad_norm": 0.388671875, "learning_rate": 0.0001901597377555943, "loss": 0.0677, "step": 4628 }, { "epoch": 1.141518737672584, "grad_norm": 0.62109375, "learning_rate": 0.00019014484113368148, "loss": 0.0802, "step": 4630 }, { "epoch": 1.1420118343195267, "grad_norm": 0.41015625, "learning_rate": 0.0001901299338291017, "loss": 0.0675, "step": 4632 }, { "epoch": 1.1425049309664694, "grad_norm": 0.7578125, "learning_rate": 0.00019011501584362153, "loss": 0.067, "step": 4634 }, { "epoch": 1.1429980276134122, "grad_norm": 0.58984375, "learning_rate": 0.00019010008717900884, "loss": 0.0705, "step": 4636 }, { "epoch": 1.143491124260355, "grad_norm": 0.439453125, "learning_rate": 0.00019008514783703278, "loss": 0.0717, "step": 4638 }, { "epoch": 1.143984220907298, "grad_norm": 0.578125, "learning_rate": 0.00019007019781946375, "loss": 0.0727, "step": 4640 }, { "epoch": 1.1444773175542406, "grad_norm": 0.3515625, "learning_rate": 0.00019005523712807337, "loss": 0.0636, "step": 4642 }, { "epoch": 1.1449704142011834, "grad_norm": 0.4921875, "learning_rate": 0.00019004026576463457, "loss": 0.0594, "step": 4644 }, { "epoch": 1.1454635108481261, "grad_norm": 0.447265625, "learning_rate": 0.0001900252837309216, "loss": 0.0692, "step": 4646 }, { "epoch": 1.1459566074950691, "grad_norm": 0.51953125, "learning_rate": 0.00019001029102870982, "loss": 0.0664, "step": 4648 }, { "epoch": 1.1464497041420119, "grad_norm": 0.5625, "learning_rate": 0.000189995287659776, "loss": 0.0637, "step": 4650 }, { "epoch": 1.1469428007889546, "grad_norm": 0.427734375, "learning_rate": 0.00018998027362589816, "loss": 0.0608, "step": 4652 }, { "epoch": 1.1474358974358974, "grad_norm": 0.498046875, "learning_rate": 0.0001899652489288555, "loss": 0.0697, "step": 4654 }, { "epoch": 1.1479289940828403, "grad_norm": 0.49609375, "learning_rate": 0.00018995021357042852, "loss": 0.0698, "step": 4656 }, { "epoch": 1.148422090729783, "grad_norm": 0.404296875, "learning_rate": 0.00018993516755239903, "loss": 0.072, "step": 4658 }, { "epoch": 1.1489151873767258, "grad_norm": 0.55078125, "learning_rate": 0.00018992011087655007, "loss": 0.0687, "step": 4660 }, { "epoch": 1.1494082840236686, "grad_norm": 0.5390625, "learning_rate": 0.0001899050435446659, "loss": 0.071, "step": 4662 }, { "epoch": 1.1499013806706113, "grad_norm": 0.59765625, "learning_rate": 0.00018988996555853208, "loss": 0.0766, "step": 4664 }, { "epoch": 1.1503944773175543, "grad_norm": 0.640625, "learning_rate": 0.00018987487691993552, "loss": 0.067, "step": 4666 }, { "epoch": 1.150887573964497, "grad_norm": 0.439453125, "learning_rate": 0.0001898597776306642, "loss": 0.0649, "step": 4668 }, { "epoch": 1.1513806706114398, "grad_norm": 0.625, "learning_rate": 0.00018984466769250753, "loss": 0.0708, "step": 4670 }, { "epoch": 1.1518737672583828, "grad_norm": 0.396484375, "learning_rate": 0.00018982954710725612, "loss": 0.0659, "step": 4672 }, { "epoch": 1.1523668639053255, "grad_norm": 0.46875, "learning_rate": 0.00018981441587670182, "loss": 0.069, "step": 4674 }, { "epoch": 1.1528599605522682, "grad_norm": 0.380859375, "learning_rate": 0.0001897992740026378, "loss": 0.0639, "step": 4676 }, { "epoch": 1.153353057199211, "grad_norm": 0.70703125, "learning_rate": 0.0001897841214868584, "loss": 0.0703, "step": 4678 }, { "epoch": 1.1538461538461537, "grad_norm": 0.4765625, "learning_rate": 0.00018976895833115935, "loss": 0.0734, "step": 4680 }, { "epoch": 1.1543392504930967, "grad_norm": 0.50390625, "learning_rate": 0.00018975378453733752, "loss": 0.0643, "step": 4682 }, { "epoch": 1.1548323471400395, "grad_norm": 0.43359375, "learning_rate": 0.00018973860010719108, "loss": 0.0673, "step": 4684 }, { "epoch": 1.1553254437869822, "grad_norm": 0.466796875, "learning_rate": 0.0001897234050425195, "loss": 0.0673, "step": 4686 }, { "epoch": 1.155818540433925, "grad_norm": 0.494140625, "learning_rate": 0.0001897081993451235, "loss": 0.0781, "step": 4688 }, { "epoch": 1.156311637080868, "grad_norm": 0.423828125, "learning_rate": 0.00018969298301680499, "loss": 0.0617, "step": 4690 }, { "epoch": 1.1568047337278107, "grad_norm": 0.49609375, "learning_rate": 0.0001896777560593672, "loss": 0.0709, "step": 4692 }, { "epoch": 1.1572978303747534, "grad_norm": 0.482421875, "learning_rate": 0.00018966251847461462, "loss": 0.0645, "step": 4694 }, { "epoch": 1.1577909270216962, "grad_norm": 0.421875, "learning_rate": 0.00018964727026435303, "loss": 0.0709, "step": 4696 }, { "epoch": 1.1582840236686391, "grad_norm": 0.38671875, "learning_rate": 0.00018963201143038937, "loss": 0.0683, "step": 4698 }, { "epoch": 1.1587771203155819, "grad_norm": 0.4296875, "learning_rate": 0.00018961674197453187, "loss": 0.0708, "step": 4700 }, { "epoch": 1.1592702169625246, "grad_norm": 0.48046875, "learning_rate": 0.00018960146189859014, "loss": 0.0782, "step": 4702 }, { "epoch": 1.1597633136094674, "grad_norm": 0.56640625, "learning_rate": 0.0001895861712043749, "loss": 0.0732, "step": 4704 }, { "epoch": 1.1602564102564104, "grad_norm": 0.435546875, "learning_rate": 0.00018957086989369816, "loss": 0.0656, "step": 4706 }, { "epoch": 1.160749506903353, "grad_norm": 0.462890625, "learning_rate": 0.00018955555796837324, "loss": 0.0712, "step": 4708 }, { "epoch": 1.1612426035502958, "grad_norm": 0.66015625, "learning_rate": 0.00018954023543021472, "loss": 0.0722, "step": 4710 }, { "epoch": 1.1617357001972386, "grad_norm": 0.439453125, "learning_rate": 0.00018952490228103837, "loss": 0.0817, "step": 4712 }, { "epoch": 1.1622287968441816, "grad_norm": 0.4921875, "learning_rate": 0.0001895095585226612, "loss": 0.0674, "step": 4714 }, { "epoch": 1.1627218934911243, "grad_norm": 0.62109375, "learning_rate": 0.00018949420415690166, "loss": 0.0661, "step": 4716 }, { "epoch": 1.163214990138067, "grad_norm": 0.52734375, "learning_rate": 0.0001894788391855792, "loss": 0.0707, "step": 4718 }, { "epoch": 1.1637080867850098, "grad_norm": 0.390625, "learning_rate": 0.00018946346361051474, "loss": 0.0748, "step": 4720 }, { "epoch": 1.1642011834319526, "grad_norm": 0.61328125, "learning_rate": 0.00018944807743353034, "loss": 0.0676, "step": 4722 }, { "epoch": 1.1646942800788955, "grad_norm": 0.578125, "learning_rate": 0.00018943268065644935, "loss": 0.0647, "step": 4724 }, { "epoch": 1.1651873767258383, "grad_norm": 0.341796875, "learning_rate": 0.00018941727328109637, "loss": 0.0695, "step": 4726 }, { "epoch": 1.165680473372781, "grad_norm": 0.58203125, "learning_rate": 0.00018940185530929725, "loss": 0.0728, "step": 4728 }, { "epoch": 1.1661735700197238, "grad_norm": 0.43359375, "learning_rate": 0.00018938642674287914, "loss": 0.0646, "step": 4730 }, { "epoch": 1.1666666666666667, "grad_norm": 0.455078125, "learning_rate": 0.0001893709875836704, "loss": 0.0707, "step": 4732 }, { "epoch": 1.1671597633136095, "grad_norm": 0.419921875, "learning_rate": 0.00018935553783350063, "loss": 0.0729, "step": 4734 }, { "epoch": 1.1676528599605522, "grad_norm": 0.52734375, "learning_rate": 0.00018934007749420073, "loss": 0.0726, "step": 4736 }, { "epoch": 1.168145956607495, "grad_norm": 0.5, "learning_rate": 0.00018932460656760283, "loss": 0.0621, "step": 4738 }, { "epoch": 1.168639053254438, "grad_norm": 0.498046875, "learning_rate": 0.00018930912505554033, "loss": 0.0696, "step": 4740 }, { "epoch": 1.1691321499013807, "grad_norm": 0.73046875, "learning_rate": 0.0001892936329598479, "loss": 0.0746, "step": 4742 }, { "epoch": 1.1696252465483234, "grad_norm": 1.078125, "learning_rate": 0.00018927813028236137, "loss": 0.0762, "step": 4744 }, { "epoch": 1.1701183431952662, "grad_norm": 0.6015625, "learning_rate": 0.00018926261702491797, "loss": 0.0712, "step": 4746 }, { "epoch": 1.1706114398422092, "grad_norm": 0.7109375, "learning_rate": 0.00018924709318935605, "loss": 0.0744, "step": 4748 }, { "epoch": 1.171104536489152, "grad_norm": 0.62890625, "learning_rate": 0.0001892315587775153, "loss": 0.0718, "step": 4750 }, { "epoch": 1.1715976331360947, "grad_norm": 0.52734375, "learning_rate": 0.00018921601379123665, "loss": 0.0778, "step": 4752 }, { "epoch": 1.1720907297830374, "grad_norm": 0.43359375, "learning_rate": 0.00018920045823236223, "loss": 0.0751, "step": 4754 }, { "epoch": 1.1725838264299804, "grad_norm": 0.388671875, "learning_rate": 0.0001891848921027355, "loss": 0.0646, "step": 4756 }, { "epoch": 1.1730769230769231, "grad_norm": 0.5703125, "learning_rate": 0.00018916931540420107, "loss": 0.0806, "step": 4758 }, { "epoch": 1.1735700197238659, "grad_norm": 0.71875, "learning_rate": 0.00018915372813860492, "loss": 0.0801, "step": 4760 }, { "epoch": 1.1740631163708086, "grad_norm": 0.46875, "learning_rate": 0.00018913813030779425, "loss": 0.0718, "step": 4762 }, { "epoch": 1.1745562130177514, "grad_norm": 0.55859375, "learning_rate": 0.00018912252191361742, "loss": 0.064, "step": 4764 }, { "epoch": 1.1750493096646943, "grad_norm": 0.5390625, "learning_rate": 0.00018910690295792412, "loss": 0.0831, "step": 4766 }, { "epoch": 1.175542406311637, "grad_norm": 0.55078125, "learning_rate": 0.00018909127344256537, "loss": 0.0735, "step": 4768 }, { "epoch": 1.1760355029585798, "grad_norm": 0.4453125, "learning_rate": 0.00018907563336939326, "loss": 0.0768, "step": 4770 }, { "epoch": 1.1765285996055228, "grad_norm": 0.51171875, "learning_rate": 0.00018905998274026126, "loss": 0.0739, "step": 4772 }, { "epoch": 1.1770216962524656, "grad_norm": 0.388671875, "learning_rate": 0.00018904432155702406, "loss": 0.0755, "step": 4774 }, { "epoch": 1.1775147928994083, "grad_norm": 0.46875, "learning_rate": 0.00018902864982153764, "loss": 0.0777, "step": 4776 }, { "epoch": 1.178007889546351, "grad_norm": 0.38671875, "learning_rate": 0.0001890129675356591, "loss": 0.0651, "step": 4778 }, { "epoch": 1.1785009861932938, "grad_norm": 0.4453125, "learning_rate": 0.00018899727470124693, "loss": 0.0659, "step": 4780 }, { "epoch": 1.1789940828402368, "grad_norm": 0.484375, "learning_rate": 0.00018898157132016082, "loss": 0.0674, "step": 4782 }, { "epoch": 1.1794871794871795, "grad_norm": 0.59375, "learning_rate": 0.00018896585739426169, "loss": 0.0723, "step": 4784 }, { "epoch": 1.1799802761341223, "grad_norm": 0.380859375, "learning_rate": 0.00018895013292541174, "loss": 0.0694, "step": 4786 }, { "epoch": 1.180473372781065, "grad_norm": 0.69921875, "learning_rate": 0.0001889343979154744, "loss": 0.0726, "step": 4788 }, { "epoch": 1.180966469428008, "grad_norm": 0.40625, "learning_rate": 0.00018891865236631437, "loss": 0.0713, "step": 4790 }, { "epoch": 1.1814595660749507, "grad_norm": 0.796875, "learning_rate": 0.00018890289627979755, "loss": 0.0701, "step": 4792 }, { "epoch": 1.1819526627218935, "grad_norm": 0.44140625, "learning_rate": 0.00018888712965779117, "loss": 0.0671, "step": 4794 }, { "epoch": 1.1824457593688362, "grad_norm": 0.46875, "learning_rate": 0.00018887135250216365, "loss": 0.0671, "step": 4796 }, { "epoch": 1.1829388560157792, "grad_norm": 0.52734375, "learning_rate": 0.00018885556481478466, "loss": 0.0755, "step": 4798 }, { "epoch": 1.183431952662722, "grad_norm": 0.5234375, "learning_rate": 0.00018883976659752512, "loss": 0.0732, "step": 4800 }, { "epoch": 1.1839250493096647, "grad_norm": 0.50390625, "learning_rate": 0.00018882395785225725, "loss": 0.0666, "step": 4802 }, { "epoch": 1.1844181459566074, "grad_norm": 0.5546875, "learning_rate": 0.0001888081385808544, "loss": 0.0721, "step": 4804 }, { "epoch": 1.1849112426035502, "grad_norm": 0.64453125, "learning_rate": 0.0001887923087851913, "loss": 0.079, "step": 4806 }, { "epoch": 1.1854043392504932, "grad_norm": 0.6328125, "learning_rate": 0.00018877646846714388, "loss": 0.0718, "step": 4808 }, { "epoch": 1.185897435897436, "grad_norm": 0.71484375, "learning_rate": 0.0001887606176285893, "loss": 0.078, "step": 4810 }, { "epoch": 1.1863905325443787, "grad_norm": 0.54296875, "learning_rate": 0.0001887447562714059, "loss": 0.0703, "step": 4812 }, { "epoch": 1.1868836291913216, "grad_norm": 0.59765625, "learning_rate": 0.00018872888439747345, "loss": 0.0738, "step": 4814 }, { "epoch": 1.1873767258382644, "grad_norm": 0.6015625, "learning_rate": 0.00018871300200867274, "loss": 0.0784, "step": 4816 }, { "epoch": 1.1878698224852071, "grad_norm": 0.482421875, "learning_rate": 0.00018869710910688603, "loss": 0.0754, "step": 4818 }, { "epoch": 1.1883629191321499, "grad_norm": 0.458984375, "learning_rate": 0.00018868120569399664, "loss": 0.0677, "step": 4820 }, { "epoch": 1.1888560157790926, "grad_norm": 0.640625, "learning_rate": 0.00018866529177188928, "loss": 0.0749, "step": 4822 }, { "epoch": 1.1893491124260356, "grad_norm": 0.83203125, "learning_rate": 0.00018864936734244977, "loss": 0.0796, "step": 4824 }, { "epoch": 1.1898422090729783, "grad_norm": 0.416015625, "learning_rate": 0.00018863343240756527, "loss": 0.0821, "step": 4826 }, { "epoch": 1.190335305719921, "grad_norm": 0.67578125, "learning_rate": 0.00018861748696912418, "loss": 0.0697, "step": 4828 }, { "epoch": 1.1908284023668638, "grad_norm": 0.62109375, "learning_rate": 0.0001886015310290161, "loss": 0.0794, "step": 4830 }, { "epoch": 1.1913214990138068, "grad_norm": 1.4375, "learning_rate": 0.00018858556458913191, "loss": 0.0729, "step": 4832 }, { "epoch": 1.1918145956607495, "grad_norm": 0.515625, "learning_rate": 0.00018856958765136374, "loss": 0.0744, "step": 4834 }, { "epoch": 1.1923076923076923, "grad_norm": 1.109375, "learning_rate": 0.0001885536002176049, "loss": 0.0759, "step": 4836 }, { "epoch": 1.192800788954635, "grad_norm": 0.796875, "learning_rate": 0.00018853760228975002, "loss": 0.0744, "step": 4838 }, { "epoch": 1.193293885601578, "grad_norm": 0.7109375, "learning_rate": 0.00018852159386969494, "loss": 0.0729, "step": 4840 }, { "epoch": 1.1937869822485208, "grad_norm": 0.48046875, "learning_rate": 0.00018850557495933673, "loss": 0.0781, "step": 4842 }, { "epoch": 1.1942800788954635, "grad_norm": 0.671875, "learning_rate": 0.00018848954556057373, "loss": 0.0743, "step": 4844 }, { "epoch": 1.1947731755424063, "grad_norm": 0.375, "learning_rate": 0.00018847350567530554, "loss": 0.0758, "step": 4846 }, { "epoch": 1.195266272189349, "grad_norm": 0.498046875, "learning_rate": 0.00018845745530543295, "loss": 0.0777, "step": 4848 }, { "epoch": 1.195759368836292, "grad_norm": 0.44140625, "learning_rate": 0.000188441394452858, "loss": 0.0739, "step": 4850 }, { "epoch": 1.1962524654832347, "grad_norm": 0.478515625, "learning_rate": 0.00018842532311948404, "loss": 0.0791, "step": 4852 }, { "epoch": 1.1967455621301775, "grad_norm": 0.484375, "learning_rate": 0.0001884092413072156, "loss": 0.0791, "step": 4854 }, { "epoch": 1.1972386587771204, "grad_norm": 0.494140625, "learning_rate": 0.00018839314901795842, "loss": 0.0815, "step": 4856 }, { "epoch": 1.1977317554240632, "grad_norm": 0.625, "learning_rate": 0.00018837704625361958, "loss": 0.0682, "step": 4858 }, { "epoch": 1.198224852071006, "grad_norm": 0.34765625, "learning_rate": 0.00018836093301610732, "loss": 0.0622, "step": 4860 }, { "epoch": 1.1987179487179487, "grad_norm": 0.8984375, "learning_rate": 0.00018834480930733115, "loss": 0.0731, "step": 4862 }, { "epoch": 1.1992110453648914, "grad_norm": 0.578125, "learning_rate": 0.00018832867512920181, "loss": 0.0712, "step": 4864 }, { "epoch": 1.1997041420118344, "grad_norm": 0.703125, "learning_rate": 0.00018831253048363133, "loss": 0.0709, "step": 4866 }, { "epoch": 1.2001972386587771, "grad_norm": 0.474609375, "learning_rate": 0.00018829637537253288, "loss": 0.0741, "step": 4868 }, { "epoch": 1.20069033530572, "grad_norm": 0.61328125, "learning_rate": 0.000188280209797821, "loss": 0.0795, "step": 4870 }, { "epoch": 1.2011834319526626, "grad_norm": 0.443359375, "learning_rate": 0.00018826403376141136, "loss": 0.0676, "step": 4872 }, { "epoch": 1.2016765285996056, "grad_norm": 0.462890625, "learning_rate": 0.0001882478472652209, "loss": 0.0682, "step": 4874 }, { "epoch": 1.2021696252465484, "grad_norm": 0.421875, "learning_rate": 0.0001882316503111678, "loss": 0.0781, "step": 4876 }, { "epoch": 1.202662721893491, "grad_norm": 0.453125, "learning_rate": 0.00018821544290117153, "loss": 0.08, "step": 4878 }, { "epoch": 1.2031558185404339, "grad_norm": 0.5078125, "learning_rate": 0.0001881992250371528, "loss": 0.0741, "step": 4880 }, { "epoch": 1.2036489151873768, "grad_norm": 0.56640625, "learning_rate": 0.0001881829967210334, "loss": 0.0683, "step": 4882 }, { "epoch": 1.2041420118343196, "grad_norm": 0.57421875, "learning_rate": 0.00018816675795473655, "loss": 0.0651, "step": 4884 }, { "epoch": 1.2046351084812623, "grad_norm": 0.734375, "learning_rate": 0.00018815050874018666, "loss": 0.0752, "step": 4886 }, { "epoch": 1.205128205128205, "grad_norm": 0.44140625, "learning_rate": 0.00018813424907930925, "loss": 0.0733, "step": 4888 }, { "epoch": 1.2056213017751478, "grad_norm": 0.55078125, "learning_rate": 0.0001881179789740313, "loss": 0.0738, "step": 4890 }, { "epoch": 1.2061143984220908, "grad_norm": 0.5390625, "learning_rate": 0.00018810169842628085, "loss": 0.0766, "step": 4892 }, { "epoch": 1.2066074950690335, "grad_norm": 0.5234375, "learning_rate": 0.0001880854074379872, "loss": 0.0742, "step": 4894 }, { "epoch": 1.2071005917159763, "grad_norm": 0.484375, "learning_rate": 0.00018806910601108098, "loss": 0.077, "step": 4896 }, { "epoch": 1.2075936883629192, "grad_norm": 0.60546875, "learning_rate": 0.00018805279414749402, "loss": 0.0733, "step": 4898 }, { "epoch": 1.208086785009862, "grad_norm": 0.49609375, "learning_rate": 0.0001880364718491593, "loss": 0.0707, "step": 4900 }, { "epoch": 1.2085798816568047, "grad_norm": 0.6328125, "learning_rate": 0.00018802013911801112, "loss": 0.0762, "step": 4902 }, { "epoch": 1.2090729783037475, "grad_norm": 0.8203125, "learning_rate": 0.00018800379595598502, "loss": 0.0831, "step": 4904 }, { "epoch": 1.2095660749506902, "grad_norm": 0.64453125, "learning_rate": 0.00018798744236501777, "loss": 0.0805, "step": 4906 }, { "epoch": 1.2100591715976332, "grad_norm": 0.484375, "learning_rate": 0.00018797107834704732, "loss": 0.0786, "step": 4908 }, { "epoch": 1.210552268244576, "grad_norm": 0.57421875, "learning_rate": 0.0001879547039040129, "loss": 0.0845, "step": 4910 }, { "epoch": 1.2110453648915187, "grad_norm": 0.474609375, "learning_rate": 0.00018793831903785504, "loss": 0.0824, "step": 4912 }, { "epoch": 1.2115384615384615, "grad_norm": 0.470703125, "learning_rate": 0.00018792192375051536, "loss": 0.0706, "step": 4914 }, { "epoch": 1.2120315581854044, "grad_norm": 0.51171875, "learning_rate": 0.00018790551804393683, "loss": 0.0769, "step": 4916 }, { "epoch": 1.2125246548323472, "grad_norm": 0.67578125, "learning_rate": 0.00018788910192006362, "loss": 0.0758, "step": 4918 }, { "epoch": 1.21301775147929, "grad_norm": 0.93359375, "learning_rate": 0.00018787267538084108, "loss": 0.0851, "step": 4920 }, { "epoch": 1.2135108481262327, "grad_norm": 0.62890625, "learning_rate": 0.00018785623842821594, "loss": 0.0782, "step": 4922 }, { "epoch": 1.2140039447731756, "grad_norm": 0.79296875, "learning_rate": 0.000187839791064136, "loss": 0.0769, "step": 4924 }, { "epoch": 1.2144970414201184, "grad_norm": 0.625, "learning_rate": 0.00018782333329055037, "loss": 0.0713, "step": 4926 }, { "epoch": 1.2149901380670611, "grad_norm": 0.69921875, "learning_rate": 0.00018780686510940943, "loss": 0.0785, "step": 4928 }, { "epoch": 1.2154832347140039, "grad_norm": 0.435546875, "learning_rate": 0.00018779038652266472, "loss": 0.0735, "step": 4930 }, { "epoch": 1.2159763313609466, "grad_norm": 0.60546875, "learning_rate": 0.000187773897532269, "loss": 0.0793, "step": 4932 }, { "epoch": 1.2164694280078896, "grad_norm": 0.54296875, "learning_rate": 0.0001877573981401764, "loss": 0.0755, "step": 4934 }, { "epoch": 1.2169625246548323, "grad_norm": 0.734375, "learning_rate": 0.00018774088834834217, "loss": 0.0744, "step": 4936 }, { "epoch": 1.217455621301775, "grad_norm": 0.478515625, "learning_rate": 0.00018772436815872275, "loss": 0.0745, "step": 4938 }, { "epoch": 1.217948717948718, "grad_norm": 0.98828125, "learning_rate": 0.00018770783757327594, "loss": 0.0779, "step": 4940 }, { "epoch": 1.2184418145956608, "grad_norm": 0.447265625, "learning_rate": 0.00018769129659396063, "loss": 0.0768, "step": 4942 }, { "epoch": 1.2189349112426036, "grad_norm": 0.59765625, "learning_rate": 0.00018767474522273712, "loss": 0.0804, "step": 4944 }, { "epoch": 1.2194280078895463, "grad_norm": 0.4609375, "learning_rate": 0.00018765818346156676, "loss": 0.0791, "step": 4946 }, { "epoch": 1.219921104536489, "grad_norm": 0.8359375, "learning_rate": 0.00018764161131241223, "loss": 0.0749, "step": 4948 }, { "epoch": 1.220414201183432, "grad_norm": 0.7109375, "learning_rate": 0.00018762502877723745, "loss": 0.0741, "step": 4950 }, { "epoch": 1.2209072978303748, "grad_norm": 0.7578125, "learning_rate": 0.0001876084358580075, "loss": 0.0734, "step": 4952 }, { "epoch": 1.2214003944773175, "grad_norm": 0.40234375, "learning_rate": 0.0001875918325566888, "loss": 0.0772, "step": 4954 }, { "epoch": 1.2218934911242603, "grad_norm": 0.75390625, "learning_rate": 0.00018757521887524886, "loss": 0.0728, "step": 4956 }, { "epoch": 1.2223865877712032, "grad_norm": 0.55859375, "learning_rate": 0.00018755859481565655, "loss": 0.0737, "step": 4958 }, { "epoch": 1.222879684418146, "grad_norm": 0.90234375, "learning_rate": 0.00018754196037988185, "loss": 0.0757, "step": 4960 }, { "epoch": 1.2233727810650887, "grad_norm": 0.62890625, "learning_rate": 0.00018752531556989613, "loss": 0.0774, "step": 4962 }, { "epoch": 1.2238658777120315, "grad_norm": 0.6640625, "learning_rate": 0.00018750866038767181, "loss": 0.0721, "step": 4964 }, { "epoch": 1.2243589743589745, "grad_norm": 0.400390625, "learning_rate": 0.00018749199483518263, "loss": 0.0805, "step": 4966 }, { "epoch": 1.2248520710059172, "grad_norm": 0.734375, "learning_rate": 0.00018747531891440358, "loss": 0.0653, "step": 4968 }, { "epoch": 1.22534516765286, "grad_norm": 0.51953125, "learning_rate": 0.00018745863262731085, "loss": 0.0799, "step": 4970 }, { "epoch": 1.2258382642998027, "grad_norm": 0.7421875, "learning_rate": 0.00018744193597588184, "loss": 0.0782, "step": 4972 }, { "epoch": 1.2263313609467454, "grad_norm": 0.494140625, "learning_rate": 0.0001874252289620952, "loss": 0.0773, "step": 4974 }, { "epoch": 1.2268244575936884, "grad_norm": 0.71875, "learning_rate": 0.0001874085115879308, "loss": 0.0758, "step": 4976 }, { "epoch": 1.2273175542406312, "grad_norm": 0.43359375, "learning_rate": 0.00018739178385536977, "loss": 0.0785, "step": 4978 }, { "epoch": 1.227810650887574, "grad_norm": 0.7109375, "learning_rate": 0.00018737504576639443, "loss": 0.0782, "step": 4980 }, { "epoch": 1.2283037475345169, "grad_norm": 0.5859375, "learning_rate": 0.0001873582973229883, "loss": 0.0777, "step": 4982 }, { "epoch": 1.2287968441814596, "grad_norm": 0.83203125, "learning_rate": 0.00018734153852713618, "loss": 0.0753, "step": 4984 }, { "epoch": 1.2292899408284024, "grad_norm": 0.6953125, "learning_rate": 0.00018732476938082413, "loss": 0.0792, "step": 4986 }, { "epoch": 1.2297830374753451, "grad_norm": 0.859375, "learning_rate": 0.00018730798988603932, "loss": 0.0767, "step": 4988 }, { "epoch": 1.2302761341222879, "grad_norm": 0.72265625, "learning_rate": 0.00018729120004477027, "loss": 0.0833, "step": 4990 }, { "epoch": 1.2307692307692308, "grad_norm": 0.478515625, "learning_rate": 0.0001872743998590066, "loss": 0.0772, "step": 4992 }, { "epoch": 1.2312623274161736, "grad_norm": 0.73046875, "learning_rate": 0.0001872575893307393, "loss": 0.0753, "step": 4994 }, { "epoch": 1.2317554240631163, "grad_norm": 0.5234375, "learning_rate": 0.0001872407684619605, "loss": 0.0767, "step": 4996 }, { "epoch": 1.232248520710059, "grad_norm": 0.63671875, "learning_rate": 0.0001872239372546635, "loss": 0.0726, "step": 4998 }, { "epoch": 1.232741617357002, "grad_norm": 0.53125, "learning_rate": 0.00018720709571084298, "loss": 0.0792, "step": 5000 }, { "epoch": 1.2332347140039448, "grad_norm": 0.51953125, "learning_rate": 0.00018719024383249467, "loss": 0.0777, "step": 5002 }, { "epoch": 1.2337278106508875, "grad_norm": 0.51953125, "learning_rate": 0.00018717338162161572, "loss": 0.0721, "step": 5004 }, { "epoch": 1.2342209072978303, "grad_norm": 0.52734375, "learning_rate": 0.00018715650908020427, "loss": 0.0749, "step": 5006 }, { "epoch": 1.2347140039447733, "grad_norm": 0.53125, "learning_rate": 0.0001871396262102599, "loss": 0.0717, "step": 5008 }, { "epoch": 1.235207100591716, "grad_norm": 0.32421875, "learning_rate": 0.0001871227330137833, "loss": 0.0712, "step": 5010 }, { "epoch": 1.2357001972386588, "grad_norm": 0.6484375, "learning_rate": 0.0001871058294927764, "loss": 0.0766, "step": 5012 }, { "epoch": 1.2361932938856015, "grad_norm": 0.458984375, "learning_rate": 0.0001870889156492424, "loss": 0.0718, "step": 5014 }, { "epoch": 1.2366863905325443, "grad_norm": 0.58984375, "learning_rate": 0.00018707199148518565, "loss": 0.0809, "step": 5016 }, { "epoch": 1.2371794871794872, "grad_norm": 0.6328125, "learning_rate": 0.00018705505700261171, "loss": 0.0776, "step": 5018 }, { "epoch": 1.23767258382643, "grad_norm": 0.412109375, "learning_rate": 0.00018703811220352752, "loss": 0.076, "step": 5020 }, { "epoch": 1.2381656804733727, "grad_norm": 0.62890625, "learning_rate": 0.00018702115708994104, "loss": 0.0755, "step": 5022 }, { "epoch": 1.2386587771203157, "grad_norm": 0.453125, "learning_rate": 0.0001870041916638616, "loss": 0.0811, "step": 5024 }, { "epoch": 1.2391518737672584, "grad_norm": 0.515625, "learning_rate": 0.00018698721592729967, "loss": 0.0737, "step": 5026 }, { "epoch": 1.2396449704142012, "grad_norm": 0.447265625, "learning_rate": 0.00018697022988226698, "loss": 0.0757, "step": 5028 }, { "epoch": 1.240138067061144, "grad_norm": 0.4921875, "learning_rate": 0.00018695323353077646, "loss": 0.0786, "step": 5030 }, { "epoch": 1.2406311637080867, "grad_norm": 0.43359375, "learning_rate": 0.00018693622687484228, "loss": 0.0752, "step": 5032 }, { "epoch": 1.2411242603550297, "grad_norm": 0.455078125, "learning_rate": 0.00018691920991647986, "loss": 0.0746, "step": 5034 }, { "epoch": 1.2416173570019724, "grad_norm": 0.51953125, "learning_rate": 0.0001869021826577057, "loss": 0.076, "step": 5036 }, { "epoch": 1.2421104536489151, "grad_norm": 0.474609375, "learning_rate": 0.00018688514510053778, "loss": 0.0756, "step": 5038 }, { "epoch": 1.242603550295858, "grad_norm": 0.376953125, "learning_rate": 0.00018686809724699502, "loss": 0.0794, "step": 5040 }, { "epoch": 1.2430966469428009, "grad_norm": 0.486328125, "learning_rate": 0.00018685103909909772, "loss": 0.0705, "step": 5042 }, { "epoch": 1.2435897435897436, "grad_norm": 0.41015625, "learning_rate": 0.00018683397065886737, "loss": 0.075, "step": 5044 }, { "epoch": 1.2440828402366864, "grad_norm": 0.443359375, "learning_rate": 0.0001868168919283267, "loss": 0.0768, "step": 5046 }, { "epoch": 1.244575936883629, "grad_norm": 0.35546875, "learning_rate": 0.0001867998029094996, "loss": 0.0735, "step": 5048 }, { "epoch": 1.245069033530572, "grad_norm": 0.6875, "learning_rate": 0.00018678270360441123, "loss": 0.0768, "step": 5050 }, { "epoch": 1.2455621301775148, "grad_norm": 0.431640625, "learning_rate": 0.00018676559401508796, "loss": 0.0698, "step": 5052 }, { "epoch": 1.2460552268244576, "grad_norm": 0.546875, "learning_rate": 0.00018674847414355735, "loss": 0.0785, "step": 5054 }, { "epoch": 1.2465483234714003, "grad_norm": 0.5546875, "learning_rate": 0.00018673134399184823, "loss": 0.0705, "step": 5056 }, { "epoch": 1.2470414201183433, "grad_norm": 0.7734375, "learning_rate": 0.00018671420356199058, "loss": 0.0744, "step": 5058 }, { "epoch": 1.247534516765286, "grad_norm": 0.400390625, "learning_rate": 0.00018669705285601564, "loss": 0.0835, "step": 5060 }, { "epoch": 1.2480276134122288, "grad_norm": 0.5703125, "learning_rate": 0.0001866798918759559, "loss": 0.0723, "step": 5062 }, { "epoch": 1.2485207100591715, "grad_norm": 0.451171875, "learning_rate": 0.00018666272062384503, "loss": 0.0826, "step": 5064 }, { "epoch": 1.2490138067061145, "grad_norm": 0.53125, "learning_rate": 0.0001866455391017179, "loss": 0.0772, "step": 5066 }, { "epoch": 1.2495069033530573, "grad_norm": 0.5234375, "learning_rate": 0.0001866283473116106, "loss": 0.0829, "step": 5068 }, { "epoch": 1.25, "grad_norm": 0.62109375, "learning_rate": 0.0001866111452555605, "loss": 0.0761, "step": 5070 }, { "epoch": 1.2504930966469427, "grad_norm": 0.4140625, "learning_rate": 0.0001865939329356061, "loss": 0.0741, "step": 5072 }, { "epoch": 1.2509861932938855, "grad_norm": 0.72265625, "learning_rate": 0.00018657671035378717, "loss": 0.0723, "step": 5074 }, { "epoch": 1.2514792899408285, "grad_norm": 0.482421875, "learning_rate": 0.00018655947751214467, "loss": 0.0723, "step": 5076 }, { "epoch": 1.2519723865877712, "grad_norm": 1.21875, "learning_rate": 0.0001865422344127208, "loss": 0.0812, "step": 5078 }, { "epoch": 1.252465483234714, "grad_norm": 0.5859375, "learning_rate": 0.00018652498105755898, "loss": 0.0743, "step": 5080 }, { "epoch": 1.252958579881657, "grad_norm": 0.71875, "learning_rate": 0.00018650771744870378, "loss": 0.0751, "step": 5082 }, { "epoch": 1.2534516765285997, "grad_norm": 0.4921875, "learning_rate": 0.0001864904435882011, "loss": 0.0686, "step": 5084 }, { "epoch": 1.2539447731755424, "grad_norm": 0.63671875, "learning_rate": 0.00018647315947809792, "loss": 0.0759, "step": 5086 }, { "epoch": 1.2544378698224852, "grad_norm": 0.37890625, "learning_rate": 0.00018645586512044257, "loss": 0.0751, "step": 5088 }, { "epoch": 1.254930966469428, "grad_norm": 0.50390625, "learning_rate": 0.00018643856051728448, "loss": 0.0759, "step": 5090 }, { "epoch": 1.255424063116371, "grad_norm": 0.423828125, "learning_rate": 0.00018642124567067435, "loss": 0.0829, "step": 5092 }, { "epoch": 1.2559171597633136, "grad_norm": 0.421875, "learning_rate": 0.0001864039205826641, "loss": 0.0715, "step": 5094 }, { "epoch": 1.2564102564102564, "grad_norm": 0.40625, "learning_rate": 0.00018638658525530686, "loss": 0.0826, "step": 5096 }, { "epoch": 1.2569033530571991, "grad_norm": 0.59765625, "learning_rate": 0.00018636923969065695, "loss": 0.0797, "step": 5098 }, { "epoch": 1.2573964497041419, "grad_norm": 0.490234375, "learning_rate": 0.00018635188389076987, "loss": 0.0788, "step": 5100 }, { "epoch": 1.2578895463510849, "grad_norm": 0.423828125, "learning_rate": 0.00018633451785770246, "loss": 0.0834, "step": 5102 }, { "epoch": 1.2583826429980276, "grad_norm": 0.5546875, "learning_rate": 0.00018631714159351263, "loss": 0.0797, "step": 5104 }, { "epoch": 1.2588757396449703, "grad_norm": 0.5546875, "learning_rate": 0.00018629975510025962, "loss": 0.0724, "step": 5106 }, { "epoch": 1.2593688362919133, "grad_norm": 0.5859375, "learning_rate": 0.00018628235838000375, "loss": 0.0815, "step": 5108 }, { "epoch": 1.259861932938856, "grad_norm": 0.6171875, "learning_rate": 0.00018626495143480668, "loss": 0.0742, "step": 5110 }, { "epoch": 1.2603550295857988, "grad_norm": 0.53515625, "learning_rate": 0.00018624753426673126, "loss": 0.0801, "step": 5112 }, { "epoch": 1.2608481262327416, "grad_norm": 0.484375, "learning_rate": 0.00018623010687784147, "loss": 0.0799, "step": 5114 }, { "epoch": 1.2613412228796843, "grad_norm": 0.357421875, "learning_rate": 0.00018621266927020254, "loss": 0.0698, "step": 5116 }, { "epoch": 1.2618343195266273, "grad_norm": 0.5859375, "learning_rate": 0.00018619522144588098, "loss": 0.0765, "step": 5118 }, { "epoch": 1.26232741617357, "grad_norm": 0.47265625, "learning_rate": 0.00018617776340694442, "loss": 0.0759, "step": 5120 }, { "epoch": 1.2628205128205128, "grad_norm": 0.35546875, "learning_rate": 0.00018616029515546175, "loss": 0.0718, "step": 5122 }, { "epoch": 1.2633136094674557, "grad_norm": 0.43359375, "learning_rate": 0.000186142816693503, "loss": 0.0731, "step": 5124 }, { "epoch": 1.2638067061143985, "grad_norm": 0.48828125, "learning_rate": 0.00018612532802313956, "loss": 0.0785, "step": 5126 }, { "epoch": 1.2642998027613412, "grad_norm": 0.369140625, "learning_rate": 0.00018610782914644389, "loss": 0.078, "step": 5128 }, { "epoch": 1.264792899408284, "grad_norm": 0.376953125, "learning_rate": 0.00018609032006548968, "loss": 0.0809, "step": 5130 }, { "epoch": 1.2652859960552267, "grad_norm": 0.56640625, "learning_rate": 0.00018607280078235188, "loss": 0.0751, "step": 5132 }, { "epoch": 1.2657790927021697, "grad_norm": 0.5078125, "learning_rate": 0.00018605527129910663, "loss": 0.0837, "step": 5134 }, { "epoch": 1.2662721893491125, "grad_norm": 0.484375, "learning_rate": 0.00018603773161783124, "loss": 0.0762, "step": 5136 }, { "epoch": 1.2667652859960552, "grad_norm": 0.5390625, "learning_rate": 0.0001860201817406043, "loss": 0.0741, "step": 5138 }, { "epoch": 1.267258382642998, "grad_norm": 0.490234375, "learning_rate": 0.00018600262166950553, "loss": 0.08, "step": 5140 }, { "epoch": 1.2677514792899407, "grad_norm": 0.45703125, "learning_rate": 0.00018598505140661592, "loss": 0.0774, "step": 5142 }, { "epoch": 1.2682445759368837, "grad_norm": 0.46875, "learning_rate": 0.00018596747095401766, "loss": 0.0871, "step": 5144 }, { "epoch": 1.2687376725838264, "grad_norm": 0.466796875, "learning_rate": 0.0001859498803137941, "loss": 0.0755, "step": 5146 }, { "epoch": 1.2692307692307692, "grad_norm": 0.482421875, "learning_rate": 0.00018593227948802981, "loss": 0.0743, "step": 5148 }, { "epoch": 1.2697238658777121, "grad_norm": 0.474609375, "learning_rate": 0.00018591466847881066, "loss": 0.0781, "step": 5150 }, { "epoch": 1.2702169625246549, "grad_norm": 0.462890625, "learning_rate": 0.0001858970472882236, "loss": 0.0703, "step": 5152 }, { "epoch": 1.2707100591715976, "grad_norm": 0.466796875, "learning_rate": 0.00018587941591835682, "loss": 0.0804, "step": 5154 }, { "epoch": 1.2712031558185404, "grad_norm": 0.51171875, "learning_rate": 0.00018586177437129978, "loss": 0.0762, "step": 5156 }, { "epoch": 1.2716962524654831, "grad_norm": 0.419921875, "learning_rate": 0.0001858441226491431, "loss": 0.0692, "step": 5158 }, { "epoch": 1.272189349112426, "grad_norm": 0.53515625, "learning_rate": 0.00018582646075397858, "loss": 0.0822, "step": 5160 }, { "epoch": 1.2726824457593688, "grad_norm": 0.423828125, "learning_rate": 0.00018580878868789928, "loss": 0.0767, "step": 5162 }, { "epoch": 1.2731755424063116, "grad_norm": 0.53125, "learning_rate": 0.0001857911064529994, "loss": 0.0819, "step": 5164 }, { "epoch": 1.2736686390532546, "grad_norm": 0.625, "learning_rate": 0.00018577341405137443, "loss": 0.079, "step": 5166 }, { "epoch": 1.2741617357001973, "grad_norm": 0.470703125, "learning_rate": 0.000185755711485121, "loss": 0.0785, "step": 5168 }, { "epoch": 1.27465483234714, "grad_norm": 0.5546875, "learning_rate": 0.00018573799875633694, "loss": 0.0685, "step": 5170 }, { "epoch": 1.2751479289940828, "grad_norm": 0.4140625, "learning_rate": 0.00018572027586712136, "loss": 0.0738, "step": 5172 }, { "epoch": 1.2756410256410255, "grad_norm": 0.4375, "learning_rate": 0.00018570254281957444, "loss": 0.0813, "step": 5174 }, { "epoch": 1.2761341222879685, "grad_norm": 0.5234375, "learning_rate": 0.00018568479961579774, "loss": 0.074, "step": 5176 }, { "epoch": 1.2766272189349113, "grad_norm": 0.5703125, "learning_rate": 0.00018566704625789387, "loss": 0.0769, "step": 5178 }, { "epoch": 1.277120315581854, "grad_norm": 0.4609375, "learning_rate": 0.0001856492827479667, "loss": 0.076, "step": 5180 }, { "epoch": 1.2776134122287968, "grad_norm": 0.53515625, "learning_rate": 0.00018563150908812133, "loss": 0.0768, "step": 5182 }, { "epoch": 1.2781065088757395, "grad_norm": 0.3984375, "learning_rate": 0.00018561372528046402, "loss": 0.0782, "step": 5184 }, { "epoch": 1.2785996055226825, "grad_norm": 0.48828125, "learning_rate": 0.00018559593132710229, "loss": 0.0665, "step": 5186 }, { "epoch": 1.2790927021696252, "grad_norm": 0.515625, "learning_rate": 0.00018557812723014476, "loss": 0.0697, "step": 5188 }, { "epoch": 1.279585798816568, "grad_norm": 0.45703125, "learning_rate": 0.00018556031299170136, "loss": 0.0719, "step": 5190 }, { "epoch": 1.280078895463511, "grad_norm": 0.435546875, "learning_rate": 0.00018554248861388315, "loss": 0.0729, "step": 5192 }, { "epoch": 1.2805719921104537, "grad_norm": 0.404296875, "learning_rate": 0.00018552465409880245, "loss": 0.0758, "step": 5194 }, { "epoch": 1.2810650887573964, "grad_norm": 0.63671875, "learning_rate": 0.0001855068094485727, "loss": 0.0772, "step": 5196 }, { "epoch": 1.2815581854043392, "grad_norm": 0.76171875, "learning_rate": 0.00018548895466530865, "loss": 0.0746, "step": 5198 }, { "epoch": 1.282051282051282, "grad_norm": 0.44140625, "learning_rate": 0.00018547108975112617, "loss": 0.0763, "step": 5200 }, { "epoch": 1.282544378698225, "grad_norm": 0.6171875, "learning_rate": 0.00018545321470814233, "loss": 0.0841, "step": 5202 }, { "epoch": 1.2830374753451677, "grad_norm": 0.451171875, "learning_rate": 0.00018543532953847544, "loss": 0.0771, "step": 5204 }, { "epoch": 1.2835305719921104, "grad_norm": 0.66015625, "learning_rate": 0.00018541743424424498, "loss": 0.0809, "step": 5206 }, { "epoch": 1.2840236686390534, "grad_norm": 0.546875, "learning_rate": 0.00018539952882757165, "loss": 0.0759, "step": 5208 }, { "epoch": 1.2845167652859961, "grad_norm": 0.52734375, "learning_rate": 0.00018538161329057737, "loss": 0.0793, "step": 5210 }, { "epoch": 1.2850098619329389, "grad_norm": 0.455078125, "learning_rate": 0.00018536368763538518, "loss": 0.0737, "step": 5212 }, { "epoch": 1.2855029585798816, "grad_norm": 0.54296875, "learning_rate": 0.0001853457518641194, "loss": 0.0775, "step": 5214 }, { "epoch": 1.2859960552268244, "grad_norm": 0.453125, "learning_rate": 0.00018532780597890552, "loss": 0.0695, "step": 5216 }, { "epoch": 1.2864891518737673, "grad_norm": 0.5703125, "learning_rate": 0.00018530984998187022, "loss": 0.0784, "step": 5218 }, { "epoch": 1.28698224852071, "grad_norm": 0.58984375, "learning_rate": 0.0001852918838751414, "loss": 0.079, "step": 5220 }, { "epoch": 1.2874753451676528, "grad_norm": 0.6328125, "learning_rate": 0.0001852739076608481, "loss": 0.0761, "step": 5222 }, { "epoch": 1.2879684418145958, "grad_norm": 0.54296875, "learning_rate": 0.00018525592134112064, "loss": 0.0815, "step": 5224 }, { "epoch": 1.2884615384615383, "grad_norm": 0.498046875, "learning_rate": 0.00018523792491809054, "loss": 0.0742, "step": 5226 }, { "epoch": 1.2889546351084813, "grad_norm": 0.46875, "learning_rate": 0.0001852199183938904, "loss": 0.0739, "step": 5228 }, { "epoch": 1.289447731755424, "grad_norm": 0.5625, "learning_rate": 0.0001852019017706541, "loss": 0.0721, "step": 5230 }, { "epoch": 1.2899408284023668, "grad_norm": 0.4375, "learning_rate": 0.00018518387505051675, "loss": 0.0758, "step": 5232 }, { "epoch": 1.2904339250493098, "grad_norm": 0.37890625, "learning_rate": 0.0001851658382356146, "loss": 0.0719, "step": 5234 }, { "epoch": 1.2909270216962525, "grad_norm": 0.5234375, "learning_rate": 0.00018514779132808513, "loss": 0.078, "step": 5236 }, { "epoch": 1.2914201183431953, "grad_norm": 0.5546875, "learning_rate": 0.00018512973433006696, "loss": 0.0755, "step": 5238 }, { "epoch": 1.291913214990138, "grad_norm": 0.671875, "learning_rate": 0.00018511166724369997, "loss": 0.0739, "step": 5240 }, { "epoch": 1.2924063116370808, "grad_norm": 0.67578125, "learning_rate": 0.00018509359007112523, "loss": 0.0744, "step": 5242 }, { "epoch": 1.2928994082840237, "grad_norm": 0.45703125, "learning_rate": 0.00018507550281448497, "loss": 0.0735, "step": 5244 }, { "epoch": 1.2933925049309665, "grad_norm": 0.59375, "learning_rate": 0.0001850574054759226, "loss": 0.0766, "step": 5246 }, { "epoch": 1.2938856015779092, "grad_norm": 0.482421875, "learning_rate": 0.00018503929805758278, "loss": 0.0704, "step": 5248 }, { "epoch": 1.2943786982248522, "grad_norm": 0.87890625, "learning_rate": 0.00018502118056161135, "loss": 0.0777, "step": 5250 }, { "epoch": 1.294871794871795, "grad_norm": 0.578125, "learning_rate": 0.00018500305299015533, "loss": 0.0762, "step": 5252 }, { "epoch": 1.2953648915187377, "grad_norm": 0.58203125, "learning_rate": 0.00018498491534536292, "loss": 0.0775, "step": 5254 }, { "epoch": 1.2958579881656804, "grad_norm": 0.431640625, "learning_rate": 0.0001849667676293836, "loss": 0.0746, "step": 5256 }, { "epoch": 1.2963510848126232, "grad_norm": 0.6796875, "learning_rate": 0.00018494860984436784, "loss": 0.0767, "step": 5258 }, { "epoch": 1.2968441814595661, "grad_norm": 0.4921875, "learning_rate": 0.00018493044199246756, "loss": 0.0753, "step": 5260 }, { "epoch": 1.297337278106509, "grad_norm": 0.59765625, "learning_rate": 0.00018491226407583574, "loss": 0.0686, "step": 5262 }, { "epoch": 1.2978303747534516, "grad_norm": 0.5, "learning_rate": 0.0001848940760966265, "loss": 0.0766, "step": 5264 }, { "epoch": 1.2983234714003946, "grad_norm": 0.671875, "learning_rate": 0.00018487587805699526, "loss": 0.0732, "step": 5266 }, { "epoch": 1.2988165680473374, "grad_norm": 0.412109375, "learning_rate": 0.0001848576699590986, "loss": 0.0709, "step": 5268 }, { "epoch": 1.29930966469428, "grad_norm": 0.79296875, "learning_rate": 0.00018483945180509424, "loss": 0.0757, "step": 5270 }, { "epoch": 1.2998027613412229, "grad_norm": 0.5859375, "learning_rate": 0.0001848212235971412, "loss": 0.0747, "step": 5272 }, { "epoch": 1.3002958579881656, "grad_norm": 0.66015625, "learning_rate": 0.00018480298533739956, "loss": 0.0697, "step": 5274 }, { "epoch": 1.3007889546351086, "grad_norm": 0.53515625, "learning_rate": 0.00018478473702803069, "loss": 0.0754, "step": 5276 }, { "epoch": 1.3012820512820513, "grad_norm": 0.5546875, "learning_rate": 0.00018476647867119708, "loss": 0.078, "step": 5278 }, { "epoch": 1.301775147928994, "grad_norm": 0.52734375, "learning_rate": 0.00018474821026906246, "loss": 0.0713, "step": 5280 }, { "epoch": 1.3022682445759368, "grad_norm": 0.73046875, "learning_rate": 0.00018472993182379178, "loss": 0.0703, "step": 5282 }, { "epoch": 1.3027613412228796, "grad_norm": 0.5703125, "learning_rate": 0.0001847116433375511, "loss": 0.0791, "step": 5284 }, { "epoch": 1.3032544378698225, "grad_norm": 0.6640625, "learning_rate": 0.00018469334481250772, "loss": 0.0786, "step": 5286 }, { "epoch": 1.3037475345167653, "grad_norm": 0.8359375, "learning_rate": 0.00018467503625083009, "loss": 0.0861, "step": 5288 }, { "epoch": 1.304240631163708, "grad_norm": 0.77734375, "learning_rate": 0.00018465671765468793, "loss": 0.08, "step": 5290 }, { "epoch": 1.304733727810651, "grad_norm": 0.671875, "learning_rate": 0.00018463838902625207, "loss": 0.0743, "step": 5292 }, { "epoch": 1.3052268244575937, "grad_norm": 1.265625, "learning_rate": 0.00018462005036769453, "loss": 0.0772, "step": 5294 }, { "epoch": 1.3057199211045365, "grad_norm": 0.50390625, "learning_rate": 0.00018460170168118855, "loss": 0.0776, "step": 5296 }, { "epoch": 1.3062130177514792, "grad_norm": 0.81640625, "learning_rate": 0.00018458334296890858, "loss": 0.0759, "step": 5298 }, { "epoch": 1.306706114398422, "grad_norm": 0.82421875, "learning_rate": 0.0001845649742330302, "loss": 0.0703, "step": 5300 }, { "epoch": 1.307199211045365, "grad_norm": 0.9921875, "learning_rate": 0.00018454659547573025, "loss": 0.0741, "step": 5302 }, { "epoch": 1.3076923076923077, "grad_norm": 0.404296875, "learning_rate": 0.0001845282066991867, "loss": 0.0727, "step": 5304 }, { "epoch": 1.3081854043392505, "grad_norm": 0.65625, "learning_rate": 0.00018450980790557866, "loss": 0.078, "step": 5306 }, { "epoch": 1.3086785009861934, "grad_norm": 0.56640625, "learning_rate": 0.0001844913990970866, "loss": 0.0699, "step": 5308 }, { "epoch": 1.3091715976331362, "grad_norm": 0.59375, "learning_rate": 0.00018447298027589196, "loss": 0.0729, "step": 5310 }, { "epoch": 1.309664694280079, "grad_norm": 0.62109375, "learning_rate": 0.00018445455144417753, "loss": 0.0803, "step": 5312 }, { "epoch": 1.3101577909270217, "grad_norm": 0.578125, "learning_rate": 0.00018443611260412723, "loss": 0.0803, "step": 5314 }, { "epoch": 1.3106508875739644, "grad_norm": 0.5, "learning_rate": 0.00018441766375792615, "loss": 0.0747, "step": 5316 }, { "epoch": 1.3111439842209074, "grad_norm": 0.47265625, "learning_rate": 0.00018439920490776062, "loss": 0.0801, "step": 5318 }, { "epoch": 1.3116370808678501, "grad_norm": 0.455078125, "learning_rate": 0.00018438073605581801, "loss": 0.0792, "step": 5320 }, { "epoch": 1.3121301775147929, "grad_norm": 0.54296875, "learning_rate": 0.00018436225720428712, "loss": 0.0841, "step": 5322 }, { "epoch": 1.3126232741617356, "grad_norm": 0.5078125, "learning_rate": 0.00018434376835535773, "loss": 0.0726, "step": 5324 }, { "epoch": 1.3131163708086784, "grad_norm": 0.40234375, "learning_rate": 0.00018432526951122084, "loss": 0.0756, "step": 5326 }, { "epoch": 1.3136094674556213, "grad_norm": 0.455078125, "learning_rate": 0.0001843067606740687, "loss": 0.0693, "step": 5328 }, { "epoch": 1.314102564102564, "grad_norm": 0.59765625, "learning_rate": 0.00018428824184609475, "loss": 0.0739, "step": 5330 }, { "epoch": 1.3145956607495068, "grad_norm": 0.494140625, "learning_rate": 0.00018426971302949353, "loss": 0.0789, "step": 5332 }, { "epoch": 1.3150887573964498, "grad_norm": 0.462890625, "learning_rate": 0.00018425117422646075, "loss": 0.076, "step": 5334 }, { "epoch": 1.3155818540433926, "grad_norm": 0.43359375, "learning_rate": 0.0001842326254391935, "loss": 0.0713, "step": 5336 }, { "epoch": 1.3160749506903353, "grad_norm": 0.404296875, "learning_rate": 0.0001842140666698898, "loss": 0.0705, "step": 5338 }, { "epoch": 1.316568047337278, "grad_norm": 0.408203125, "learning_rate": 0.00018419549792074905, "loss": 0.0738, "step": 5340 }, { "epoch": 1.3170611439842208, "grad_norm": 0.5, "learning_rate": 0.00018417691919397168, "loss": 0.0713, "step": 5342 }, { "epoch": 1.3175542406311638, "grad_norm": 0.515625, "learning_rate": 0.00018415833049175941, "loss": 0.0705, "step": 5344 }, { "epoch": 1.3180473372781065, "grad_norm": 0.6328125, "learning_rate": 0.0001841397318163151, "loss": 0.0758, "step": 5346 }, { "epoch": 1.3185404339250493, "grad_norm": 0.37890625, "learning_rate": 0.00018412112316984277, "loss": 0.0704, "step": 5348 }, { "epoch": 1.3190335305719922, "grad_norm": 0.4765625, "learning_rate": 0.0001841025045545477, "loss": 0.0741, "step": 5350 }, { "epoch": 1.319526627218935, "grad_norm": 0.380859375, "learning_rate": 0.00018408387597263626, "loss": 0.0732, "step": 5352 }, { "epoch": 1.3200197238658777, "grad_norm": 0.81640625, "learning_rate": 0.00018406523742631604, "loss": 0.0802, "step": 5354 }, { "epoch": 1.3205128205128205, "grad_norm": 0.76953125, "learning_rate": 0.00018404658891779584, "loss": 0.0752, "step": 5356 }, { "epoch": 1.3210059171597632, "grad_norm": 0.6875, "learning_rate": 0.0001840279304492856, "loss": 0.0721, "step": 5358 }, { "epoch": 1.3214990138067062, "grad_norm": 0.64453125, "learning_rate": 0.00018400926202299643, "loss": 0.0728, "step": 5360 }, { "epoch": 1.321992110453649, "grad_norm": 0.671875, "learning_rate": 0.00018399058364114068, "loss": 0.0729, "step": 5362 }, { "epoch": 1.3224852071005917, "grad_norm": 0.84375, "learning_rate": 0.00018397189530593183, "loss": 0.0735, "step": 5364 }, { "epoch": 1.3229783037475344, "grad_norm": 0.458984375, "learning_rate": 0.00018395319701958453, "loss": 0.0757, "step": 5366 }, { "epoch": 1.3234714003944772, "grad_norm": 0.51171875, "learning_rate": 0.00018393448878431466, "loss": 0.0746, "step": 5368 }, { "epoch": 1.3239644970414202, "grad_norm": 0.49609375, "learning_rate": 0.00018391577060233925, "loss": 0.0695, "step": 5370 }, { "epoch": 1.324457593688363, "grad_norm": 0.44140625, "learning_rate": 0.0001838970424758765, "loss": 0.0649, "step": 5372 }, { "epoch": 1.3249506903353057, "grad_norm": 0.443359375, "learning_rate": 0.00018387830440714576, "loss": 0.0751, "step": 5374 }, { "epoch": 1.3254437869822486, "grad_norm": 0.458984375, "learning_rate": 0.00018385955639836765, "loss": 0.0709, "step": 5376 }, { "epoch": 1.3259368836291914, "grad_norm": 0.466796875, "learning_rate": 0.0001838407984517639, "loss": 0.0714, "step": 5378 }, { "epoch": 1.3264299802761341, "grad_norm": 0.57421875, "learning_rate": 0.0001838220305695574, "loss": 0.078, "step": 5380 }, { "epoch": 1.3269230769230769, "grad_norm": 0.48828125, "learning_rate": 0.0001838032527539723, "loss": 0.0707, "step": 5382 }, { "epoch": 1.3274161735700196, "grad_norm": 0.470703125, "learning_rate": 0.00018378446500723388, "loss": 0.0686, "step": 5384 }, { "epoch": 1.3279092702169626, "grad_norm": 0.53125, "learning_rate": 0.00018376566733156852, "loss": 0.0718, "step": 5386 }, { "epoch": 1.3284023668639053, "grad_norm": 0.5234375, "learning_rate": 0.00018374685972920388, "loss": 0.0754, "step": 5388 }, { "epoch": 1.328895463510848, "grad_norm": 0.447265625, "learning_rate": 0.00018372804220236882, "loss": 0.0728, "step": 5390 }, { "epoch": 1.329388560157791, "grad_norm": 0.458984375, "learning_rate": 0.00018370921475329327, "loss": 0.0699, "step": 5392 }, { "epoch": 1.3298816568047338, "grad_norm": 0.61328125, "learning_rate": 0.00018369037738420844, "loss": 0.0757, "step": 5394 }, { "epoch": 1.3303747534516766, "grad_norm": 0.38671875, "learning_rate": 0.00018367153009734655, "loss": 0.069, "step": 5396 }, { "epoch": 1.3308678500986193, "grad_norm": 0.45703125, "learning_rate": 0.0001836526728949412, "loss": 0.074, "step": 5398 }, { "epoch": 1.331360946745562, "grad_norm": 0.46875, "learning_rate": 0.00018363380577922708, "loss": 0.0684, "step": 5400 }, { "epoch": 1.331854043392505, "grad_norm": 0.458984375, "learning_rate": 0.00018361492875244002, "loss": 0.0769, "step": 5402 }, { "epoch": 1.3323471400394478, "grad_norm": 0.52734375, "learning_rate": 0.00018359604181681705, "loss": 0.0741, "step": 5404 }, { "epoch": 1.3328402366863905, "grad_norm": 0.458984375, "learning_rate": 0.00018357714497459638, "loss": 0.072, "step": 5406 }, { "epoch": 1.3333333333333333, "grad_norm": 0.427734375, "learning_rate": 0.0001835582382280174, "loss": 0.0743, "step": 5408 }, { "epoch": 1.333826429980276, "grad_norm": 0.404296875, "learning_rate": 0.00018353932157932063, "loss": 0.0691, "step": 5410 }, { "epoch": 1.334319526627219, "grad_norm": 0.5078125, "learning_rate": 0.00018352039503074786, "loss": 0.0637, "step": 5412 }, { "epoch": 1.3348126232741617, "grad_norm": 0.52734375, "learning_rate": 0.00018350145858454195, "loss": 0.0704, "step": 5414 }, { "epoch": 1.3353057199211045, "grad_norm": 0.447265625, "learning_rate": 0.000183482512242947, "loss": 0.0721, "step": 5416 }, { "epoch": 1.3357988165680474, "grad_norm": 0.447265625, "learning_rate": 0.00018346355600820823, "loss": 0.0657, "step": 5418 }, { "epoch": 1.3362919132149902, "grad_norm": 0.53125, "learning_rate": 0.0001834445898825721, "loss": 0.0725, "step": 5420 }, { "epoch": 1.336785009861933, "grad_norm": 0.390625, "learning_rate": 0.00018342561386828615, "loss": 0.0698, "step": 5422 }, { "epoch": 1.3372781065088757, "grad_norm": 0.61328125, "learning_rate": 0.00018340662796759914, "loss": 0.0755, "step": 5424 }, { "epoch": 1.3377712031558184, "grad_norm": 0.45703125, "learning_rate": 0.00018338763218276108, "loss": 0.0737, "step": 5426 }, { "epoch": 1.3382642998027614, "grad_norm": 0.5625, "learning_rate": 0.000183368626516023, "loss": 0.0759, "step": 5428 }, { "epoch": 1.3387573964497042, "grad_norm": 0.44140625, "learning_rate": 0.00018334961096963724, "loss": 0.0697, "step": 5430 }, { "epoch": 1.339250493096647, "grad_norm": 0.515625, "learning_rate": 0.00018333058554585718, "loss": 0.0687, "step": 5432 }, { "epoch": 1.3397435897435899, "grad_norm": 0.37109375, "learning_rate": 0.0001833115502469375, "loss": 0.0738, "step": 5434 }, { "epoch": 1.3402366863905326, "grad_norm": 0.494140625, "learning_rate": 0.00018329250507513397, "loss": 0.0695, "step": 5436 }, { "epoch": 1.3407297830374754, "grad_norm": 0.3828125, "learning_rate": 0.00018327345003270348, "loss": 0.0688, "step": 5438 }, { "epoch": 1.3412228796844181, "grad_norm": 0.4375, "learning_rate": 0.00018325438512190427, "loss": 0.0767, "step": 5440 }, { "epoch": 1.3417159763313609, "grad_norm": 0.431640625, "learning_rate": 0.00018323531034499562, "loss": 0.0689, "step": 5442 }, { "epoch": 1.3422090729783038, "grad_norm": 0.486328125, "learning_rate": 0.00018321622570423792, "loss": 0.0727, "step": 5444 }, { "epoch": 1.3427021696252466, "grad_norm": 0.515625, "learning_rate": 0.00018319713120189287, "loss": 0.0686, "step": 5446 }, { "epoch": 1.3431952662721893, "grad_norm": 0.453125, "learning_rate": 0.00018317802684022325, "loss": 0.0711, "step": 5448 }, { "epoch": 1.343688362919132, "grad_norm": 0.53125, "learning_rate": 0.00018315891262149305, "loss": 0.0803, "step": 5450 }, { "epoch": 1.3441814595660748, "grad_norm": 0.4921875, "learning_rate": 0.0001831397885479674, "loss": 0.0756, "step": 5452 }, { "epoch": 1.3446745562130178, "grad_norm": 0.60546875, "learning_rate": 0.00018312065462191263, "loss": 0.0773, "step": 5454 }, { "epoch": 1.3451676528599605, "grad_norm": 0.37109375, "learning_rate": 0.00018310151084559618, "loss": 0.0744, "step": 5456 }, { "epoch": 1.3456607495069033, "grad_norm": 0.435546875, "learning_rate": 0.00018308235722128672, "loss": 0.0752, "step": 5458 }, { "epoch": 1.3461538461538463, "grad_norm": 0.4375, "learning_rate": 0.00018306319375125407, "loss": 0.0751, "step": 5460 }, { "epoch": 1.346646942800789, "grad_norm": 0.458984375, "learning_rate": 0.0001830440204377692, "loss": 0.0757, "step": 5462 }, { "epoch": 1.3471400394477318, "grad_norm": 0.40625, "learning_rate": 0.00018302483728310423, "loss": 0.0709, "step": 5464 }, { "epoch": 1.3476331360946745, "grad_norm": 0.53125, "learning_rate": 0.00018300564428953248, "loss": 0.0758, "step": 5466 }, { "epoch": 1.3481262327416172, "grad_norm": 0.44921875, "learning_rate": 0.00018298644145932848, "loss": 0.0631, "step": 5468 }, { "epoch": 1.3486193293885602, "grad_norm": 0.40234375, "learning_rate": 0.00018296722879476777, "loss": 0.07, "step": 5470 }, { "epoch": 1.349112426035503, "grad_norm": 0.6875, "learning_rate": 0.00018294800629812727, "loss": 0.0678, "step": 5472 }, { "epoch": 1.3496055226824457, "grad_norm": 0.48046875, "learning_rate": 0.00018292877397168487, "loss": 0.0662, "step": 5474 }, { "epoch": 1.3500986193293887, "grad_norm": 0.51953125, "learning_rate": 0.00018290953181771978, "loss": 0.0664, "step": 5476 }, { "epoch": 1.3505917159763314, "grad_norm": 0.41796875, "learning_rate": 0.00018289027983851223, "loss": 0.0712, "step": 5478 }, { "epoch": 1.3510848126232742, "grad_norm": 0.4375, "learning_rate": 0.00018287101803634372, "loss": 0.0728, "step": 5480 }, { "epoch": 1.351577909270217, "grad_norm": 0.57421875, "learning_rate": 0.0001828517464134969, "loss": 0.0728, "step": 5482 }, { "epoch": 1.3520710059171597, "grad_norm": 0.57421875, "learning_rate": 0.00018283246497225554, "loss": 0.0712, "step": 5484 }, { "epoch": 1.3525641025641026, "grad_norm": 0.412109375, "learning_rate": 0.0001828131737149046, "loss": 0.0685, "step": 5486 }, { "epoch": 1.3530571992110454, "grad_norm": 0.48828125, "learning_rate": 0.00018279387264373017, "loss": 0.073, "step": 5488 }, { "epoch": 1.3535502958579881, "grad_norm": 0.73046875, "learning_rate": 0.0001827745617610196, "loss": 0.0702, "step": 5490 }, { "epoch": 1.3540433925049309, "grad_norm": 0.396484375, "learning_rate": 0.0001827552410690613, "loss": 0.0662, "step": 5492 }, { "epoch": 1.3545364891518736, "grad_norm": 0.85546875, "learning_rate": 0.0001827359105701449, "loss": 0.0679, "step": 5494 }, { "epoch": 1.3550295857988166, "grad_norm": 0.515625, "learning_rate": 0.00018271657026656114, "loss": 0.069, "step": 5496 }, { "epoch": 1.3555226824457594, "grad_norm": 0.421875, "learning_rate": 0.000182697220160602, "loss": 0.0661, "step": 5498 }, { "epoch": 1.356015779092702, "grad_norm": 0.51953125, "learning_rate": 0.00018267786025456052, "loss": 0.0705, "step": 5500 }, { "epoch": 1.356508875739645, "grad_norm": 0.39453125, "learning_rate": 0.00018265849055073102, "loss": 0.065, "step": 5502 }, { "epoch": 1.3570019723865878, "grad_norm": 0.375, "learning_rate": 0.00018263911105140886, "loss": 0.0698, "step": 5504 }, { "epoch": 1.3574950690335306, "grad_norm": 0.53125, "learning_rate": 0.00018261972175889063, "loss": 0.0785, "step": 5506 }, { "epoch": 1.3579881656804733, "grad_norm": 0.412109375, "learning_rate": 0.00018260032267547412, "loss": 0.0644, "step": 5508 }, { "epoch": 1.358481262327416, "grad_norm": 0.376953125, "learning_rate": 0.00018258091380345817, "loss": 0.0705, "step": 5510 }, { "epoch": 1.358974358974359, "grad_norm": 0.44140625, "learning_rate": 0.00018256149514514285, "loss": 0.069, "step": 5512 }, { "epoch": 1.3594674556213018, "grad_norm": 0.41796875, "learning_rate": 0.00018254206670282938, "loss": 0.0612, "step": 5514 }, { "epoch": 1.3599605522682445, "grad_norm": 0.373046875, "learning_rate": 0.0001825226284788202, "loss": 0.0632, "step": 5516 }, { "epoch": 1.3604536489151875, "grad_norm": 0.7890625, "learning_rate": 0.00018250318047541877, "loss": 0.0736, "step": 5518 }, { "epoch": 1.3609467455621302, "grad_norm": 0.435546875, "learning_rate": 0.00018248372269492983, "loss": 0.0748, "step": 5520 }, { "epoch": 1.361439842209073, "grad_norm": 0.59375, "learning_rate": 0.0001824642551396592, "loss": 0.0714, "step": 5522 }, { "epoch": 1.3619329388560157, "grad_norm": 0.44140625, "learning_rate": 0.00018244477781191392, "loss": 0.0589, "step": 5524 }, { "epoch": 1.3624260355029585, "grad_norm": 0.72265625, "learning_rate": 0.00018242529071400214, "loss": 0.0706, "step": 5526 }, { "epoch": 1.3629191321499015, "grad_norm": 0.625, "learning_rate": 0.00018240579384823323, "loss": 0.0674, "step": 5528 }, { "epoch": 1.3634122287968442, "grad_norm": 0.76171875, "learning_rate": 0.00018238628721691766, "loss": 0.0702, "step": 5530 }, { "epoch": 1.363905325443787, "grad_norm": 0.578125, "learning_rate": 0.00018236677082236703, "loss": 0.0697, "step": 5532 }, { "epoch": 1.3643984220907297, "grad_norm": 0.54296875, "learning_rate": 0.00018234724466689421, "loss": 0.0707, "step": 5534 }, { "epoch": 1.3648915187376724, "grad_norm": 0.365234375, "learning_rate": 0.00018232770875281314, "loss": 0.0653, "step": 5536 }, { "epoch": 1.3653846153846154, "grad_norm": 0.474609375, "learning_rate": 0.0001823081630824389, "loss": 0.0733, "step": 5538 }, { "epoch": 1.3658777120315582, "grad_norm": 0.482421875, "learning_rate": 0.00018228860765808782, "loss": 0.0691, "step": 5540 }, { "epoch": 1.366370808678501, "grad_norm": 0.54296875, "learning_rate": 0.00018226904248207725, "loss": 0.0729, "step": 5542 }, { "epoch": 1.3668639053254439, "grad_norm": 0.5390625, "learning_rate": 0.00018224946755672584, "loss": 0.0677, "step": 5544 }, { "epoch": 1.3673570019723866, "grad_norm": 0.625, "learning_rate": 0.00018222988288435328, "loss": 0.0659, "step": 5546 }, { "epoch": 1.3678500986193294, "grad_norm": 0.478515625, "learning_rate": 0.0001822102884672805, "loss": 0.0615, "step": 5548 }, { "epoch": 1.3683431952662721, "grad_norm": 0.54296875, "learning_rate": 0.00018219068430782952, "loss": 0.0643, "step": 5550 }, { "epoch": 1.3688362919132149, "grad_norm": 0.341796875, "learning_rate": 0.00018217107040832358, "loss": 0.0632, "step": 5552 }, { "epoch": 1.3693293885601578, "grad_norm": 0.400390625, "learning_rate": 0.00018215144677108698, "loss": 0.0632, "step": 5554 }, { "epoch": 1.3698224852071006, "grad_norm": 0.6171875, "learning_rate": 0.0001821318133984453, "loss": 0.0685, "step": 5556 }, { "epoch": 1.3703155818540433, "grad_norm": 0.578125, "learning_rate": 0.00018211217029272513, "loss": 0.0657, "step": 5558 }, { "epoch": 1.3708086785009863, "grad_norm": 0.47265625, "learning_rate": 0.00018209251745625435, "loss": 0.0677, "step": 5560 }, { "epoch": 1.371301775147929, "grad_norm": 0.61328125, "learning_rate": 0.0001820728548913619, "loss": 0.0717, "step": 5562 }, { "epoch": 1.3717948717948718, "grad_norm": 0.42578125, "learning_rate": 0.0001820531826003779, "loss": 0.0679, "step": 5564 }, { "epoch": 1.3722879684418146, "grad_norm": 0.43359375, "learning_rate": 0.00018203350058563364, "loss": 0.0671, "step": 5566 }, { "epoch": 1.3727810650887573, "grad_norm": 0.6015625, "learning_rate": 0.00018201380884946158, "loss": 0.0716, "step": 5568 }, { "epoch": 1.3732741617357003, "grad_norm": 0.7578125, "learning_rate": 0.00018199410739419523, "loss": 0.0738, "step": 5570 }, { "epoch": 1.373767258382643, "grad_norm": 0.41796875, "learning_rate": 0.00018197439622216934, "loss": 0.0696, "step": 5572 }, { "epoch": 1.3742603550295858, "grad_norm": 0.734375, "learning_rate": 0.00018195467533571984, "loss": 0.0704, "step": 5574 }, { "epoch": 1.3747534516765287, "grad_norm": 0.578125, "learning_rate": 0.00018193494473718374, "loss": 0.0624, "step": 5576 }, { "epoch": 1.3752465483234713, "grad_norm": 0.5390625, "learning_rate": 0.0001819152044288992, "loss": 0.0686, "step": 5578 }, { "epoch": 1.3757396449704142, "grad_norm": 0.46875, "learning_rate": 0.00018189545441320556, "loss": 0.0683, "step": 5580 }, { "epoch": 1.376232741617357, "grad_norm": 0.6640625, "learning_rate": 0.00018187569469244337, "loss": 0.0714, "step": 5582 }, { "epoch": 1.3767258382642997, "grad_norm": 0.369140625, "learning_rate": 0.00018185592526895423, "loss": 0.065, "step": 5584 }, { "epoch": 1.3772189349112427, "grad_norm": 0.6015625, "learning_rate": 0.00018183614614508087, "loss": 0.0646, "step": 5586 }, { "epoch": 1.3777120315581854, "grad_norm": 0.45703125, "learning_rate": 0.00018181635732316732, "loss": 0.0604, "step": 5588 }, { "epoch": 1.3782051282051282, "grad_norm": 0.5078125, "learning_rate": 0.00018179655880555862, "loss": 0.0654, "step": 5590 }, { "epoch": 1.378698224852071, "grad_norm": 0.419921875, "learning_rate": 0.000181776750594601, "loss": 0.0679, "step": 5592 }, { "epoch": 1.3791913214990137, "grad_norm": 0.46875, "learning_rate": 0.00018175693269264183, "loss": 0.0716, "step": 5594 }, { "epoch": 1.3796844181459567, "grad_norm": 0.59765625, "learning_rate": 0.00018173710510202968, "loss": 0.0644, "step": 5596 }, { "epoch": 1.3801775147928994, "grad_norm": 0.50390625, "learning_rate": 0.0001817172678251142, "loss": 0.0699, "step": 5598 }, { "epoch": 1.3806706114398422, "grad_norm": 0.412109375, "learning_rate": 0.0001816974208642463, "loss": 0.0689, "step": 5600 }, { "epoch": 1.3811637080867851, "grad_norm": 0.41015625, "learning_rate": 0.0001816775642217778, "loss": 0.0663, "step": 5602 }, { "epoch": 1.3816568047337279, "grad_norm": 0.375, "learning_rate": 0.00018165769790006199, "loss": 0.0656, "step": 5604 }, { "epoch": 1.3821499013806706, "grad_norm": 0.37890625, "learning_rate": 0.000181637821901453, "loss": 0.0637, "step": 5606 }, { "epoch": 1.3826429980276134, "grad_norm": 0.42578125, "learning_rate": 0.00018161793622830633, "loss": 0.0674, "step": 5608 }, { "epoch": 1.3831360946745561, "grad_norm": 0.447265625, "learning_rate": 0.0001815980408829785, "loss": 0.0628, "step": 5610 }, { "epoch": 1.383629191321499, "grad_norm": 0.392578125, "learning_rate": 0.00018157813586782727, "loss": 0.0646, "step": 5612 }, { "epoch": 1.3841222879684418, "grad_norm": 0.53515625, "learning_rate": 0.0001815582211852114, "loss": 0.0701, "step": 5614 }, { "epoch": 1.3846153846153846, "grad_norm": 0.625, "learning_rate": 0.00018153829683749105, "loss": 0.0723, "step": 5616 }, { "epoch": 1.3851084812623276, "grad_norm": 0.6328125, "learning_rate": 0.00018151836282702722, "loss": 0.0678, "step": 5618 }, { "epoch": 1.3856015779092703, "grad_norm": 0.6015625, "learning_rate": 0.0001814984191561823, "loss": 0.0648, "step": 5620 }, { "epoch": 1.386094674556213, "grad_norm": 0.50390625, "learning_rate": 0.00018147846582731962, "loss": 0.0602, "step": 5622 }, { "epoch": 1.3865877712031558, "grad_norm": 0.490234375, "learning_rate": 0.00018145850284280384, "loss": 0.0597, "step": 5624 }, { "epoch": 1.3870808678500985, "grad_norm": 0.47265625, "learning_rate": 0.0001814385302050007, "loss": 0.0695, "step": 5626 }, { "epoch": 1.3875739644970415, "grad_norm": 0.44921875, "learning_rate": 0.00018141854791627702, "loss": 0.0636, "step": 5628 }, { "epoch": 1.3880670611439843, "grad_norm": 0.6875, "learning_rate": 0.0001813985559790008, "loss": 0.0698, "step": 5630 }, { "epoch": 1.388560157790927, "grad_norm": 0.55078125, "learning_rate": 0.00018137855439554124, "loss": 0.0736, "step": 5632 }, { "epoch": 1.3890532544378698, "grad_norm": 0.67578125, "learning_rate": 0.00018135854316826864, "loss": 0.0692, "step": 5634 }, { "epoch": 1.3895463510848125, "grad_norm": 0.435546875, "learning_rate": 0.00018133852229955443, "loss": 0.0627, "step": 5636 }, { "epoch": 1.3900394477317555, "grad_norm": 0.671875, "learning_rate": 0.00018131849179177115, "loss": 0.0638, "step": 5638 }, { "epoch": 1.3905325443786982, "grad_norm": 0.3984375, "learning_rate": 0.00018129845164729263, "loss": 0.0718, "step": 5640 }, { "epoch": 1.391025641025641, "grad_norm": 0.8203125, "learning_rate": 0.00018127840186849364, "loss": 0.0647, "step": 5642 }, { "epoch": 1.391518737672584, "grad_norm": 0.46875, "learning_rate": 0.0001812583424577502, "loss": 0.0661, "step": 5644 }, { "epoch": 1.3920118343195267, "grad_norm": 0.44140625, "learning_rate": 0.00018123827341743956, "loss": 0.0626, "step": 5646 }, { "epoch": 1.3925049309664694, "grad_norm": 0.5546875, "learning_rate": 0.0001812181947499399, "loss": 0.0633, "step": 5648 }, { "epoch": 1.3929980276134122, "grad_norm": 0.50390625, "learning_rate": 0.0001811981064576307, "loss": 0.064, "step": 5650 }, { "epoch": 1.393491124260355, "grad_norm": 0.419921875, "learning_rate": 0.00018117800854289254, "loss": 0.0688, "step": 5652 }, { "epoch": 1.393984220907298, "grad_norm": 0.47265625, "learning_rate": 0.0001811579010081071, "loss": 0.0713, "step": 5654 }, { "epoch": 1.3944773175542406, "grad_norm": 0.36328125, "learning_rate": 0.00018113778385565733, "loss": 0.0676, "step": 5656 }, { "epoch": 1.3949704142011834, "grad_norm": 0.478515625, "learning_rate": 0.0001811176570879271, "loss": 0.0616, "step": 5658 }, { "epoch": 1.3954635108481264, "grad_norm": 0.43359375, "learning_rate": 0.00018109752070730162, "loss": 0.0602, "step": 5660 }, { "epoch": 1.3959566074950691, "grad_norm": 0.373046875, "learning_rate": 0.00018107737471616714, "loss": 0.0678, "step": 5662 }, { "epoch": 1.3964497041420119, "grad_norm": 0.38671875, "learning_rate": 0.0001810572191169111, "loss": 0.0658, "step": 5664 }, { "epoch": 1.3969428007889546, "grad_norm": 0.4296875, "learning_rate": 0.00018103705391192199, "loss": 0.0668, "step": 5666 }, { "epoch": 1.3974358974358974, "grad_norm": 0.40625, "learning_rate": 0.00018101687910358956, "loss": 0.0664, "step": 5668 }, { "epoch": 1.3979289940828403, "grad_norm": 0.52734375, "learning_rate": 0.0001809966946943046, "loss": 0.0553, "step": 5670 }, { "epoch": 1.398422090729783, "grad_norm": 0.439453125, "learning_rate": 0.00018097650068645913, "loss": 0.075, "step": 5672 }, { "epoch": 1.3989151873767258, "grad_norm": 0.3828125, "learning_rate": 0.0001809562970824462, "loss": 0.0655, "step": 5674 }, { "epoch": 1.3994082840236686, "grad_norm": 0.6640625, "learning_rate": 0.00018093608388466005, "loss": 0.0649, "step": 5676 }, { "epoch": 1.3999013806706113, "grad_norm": 0.42578125, "learning_rate": 0.00018091586109549604, "loss": 0.0619, "step": 5678 }, { "epoch": 1.4003944773175543, "grad_norm": 0.56640625, "learning_rate": 0.00018089562871735076, "loss": 0.0614, "step": 5680 }, { "epoch": 1.400887573964497, "grad_norm": 0.470703125, "learning_rate": 0.0001808753867526218, "loss": 0.0666, "step": 5682 }, { "epoch": 1.4013806706114398, "grad_norm": 0.4140625, "learning_rate": 0.00018085513520370796, "loss": 0.0695, "step": 5684 }, { "epoch": 1.4018737672583828, "grad_norm": 0.447265625, "learning_rate": 0.00018083487407300917, "loss": 0.071, "step": 5686 }, { "epoch": 1.4023668639053255, "grad_norm": 0.56640625, "learning_rate": 0.00018081460336292647, "loss": 0.0709, "step": 5688 }, { "epoch": 1.4028599605522682, "grad_norm": 0.46875, "learning_rate": 0.00018079432307586206, "loss": 0.0633, "step": 5690 }, { "epoch": 1.403353057199211, "grad_norm": 0.703125, "learning_rate": 0.00018077403321421927, "loss": 0.0632, "step": 5692 }, { "epoch": 1.4038461538461537, "grad_norm": 0.396484375, "learning_rate": 0.00018075373378040256, "loss": 0.0645, "step": 5694 }, { "epoch": 1.4043392504930967, "grad_norm": 0.51171875, "learning_rate": 0.0001807334247768175, "loss": 0.0622, "step": 5696 }, { "epoch": 1.4048323471400395, "grad_norm": 0.412109375, "learning_rate": 0.0001807131062058709, "loss": 0.0618, "step": 5698 }, { "epoch": 1.4053254437869822, "grad_norm": 0.3671875, "learning_rate": 0.00018069277806997052, "loss": 0.0616, "step": 5700 }, { "epoch": 1.4058185404339252, "grad_norm": 0.396484375, "learning_rate": 0.00018067244037152545, "loss": 0.0623, "step": 5702 }, { "epoch": 1.406311637080868, "grad_norm": 0.47265625, "learning_rate": 0.00018065209311294574, "loss": 0.0679, "step": 5704 }, { "epoch": 1.4068047337278107, "grad_norm": 0.357421875, "learning_rate": 0.00018063173629664271, "loss": 0.0667, "step": 5706 }, { "epoch": 1.4072978303747534, "grad_norm": 0.447265625, "learning_rate": 0.00018061136992502874, "loss": 0.065, "step": 5708 }, { "epoch": 1.4077909270216962, "grad_norm": 0.4375, "learning_rate": 0.00018059099400051734, "loss": 0.0638, "step": 5710 }, { "epoch": 1.4082840236686391, "grad_norm": 0.357421875, "learning_rate": 0.0001805706085255232, "loss": 0.0592, "step": 5712 }, { "epoch": 1.4087771203155819, "grad_norm": 0.353515625, "learning_rate": 0.00018055021350246213, "loss": 0.0625, "step": 5714 }, { "epoch": 1.4092702169625246, "grad_norm": 0.38671875, "learning_rate": 0.000180529808933751, "loss": 0.0667, "step": 5716 }, { "epoch": 1.4097633136094674, "grad_norm": 0.50390625, "learning_rate": 0.0001805093948218079, "loss": 0.0613, "step": 5718 }, { "epoch": 1.4102564102564101, "grad_norm": 0.4765625, "learning_rate": 0.000180488971169052, "loss": 0.0658, "step": 5720 }, { "epoch": 1.410749506903353, "grad_norm": 0.3828125, "learning_rate": 0.0001804685379779036, "loss": 0.0631, "step": 5722 }, { "epoch": 1.4112426035502958, "grad_norm": 0.3984375, "learning_rate": 0.00018044809525078422, "loss": 0.0556, "step": 5724 }, { "epoch": 1.4117357001972386, "grad_norm": 0.34765625, "learning_rate": 0.00018042764299011635, "loss": 0.0601, "step": 5726 }, { "epoch": 1.4122287968441816, "grad_norm": 0.3828125, "learning_rate": 0.00018040718119832376, "loss": 0.0627, "step": 5728 }, { "epoch": 1.4127218934911243, "grad_norm": 0.38671875, "learning_rate": 0.00018038670987783123, "loss": 0.0682, "step": 5730 }, { "epoch": 1.413214990138067, "grad_norm": 0.447265625, "learning_rate": 0.00018036622903106476, "loss": 0.069, "step": 5732 }, { "epoch": 1.4137080867850098, "grad_norm": 0.3515625, "learning_rate": 0.00018034573866045146, "loss": 0.0643, "step": 5734 }, { "epoch": 1.4142011834319526, "grad_norm": 0.478515625, "learning_rate": 0.00018032523876841957, "loss": 0.0629, "step": 5736 }, { "epoch": 1.4146942800788955, "grad_norm": 0.416015625, "learning_rate": 0.00018030472935739834, "loss": 0.0715, "step": 5738 }, { "epoch": 1.4151873767258383, "grad_norm": 0.40625, "learning_rate": 0.00018028421042981836, "loss": 0.0681, "step": 5740 }, { "epoch": 1.415680473372781, "grad_norm": 0.5078125, "learning_rate": 0.00018026368198811117, "loss": 0.0629, "step": 5742 }, { "epoch": 1.416173570019724, "grad_norm": 0.404296875, "learning_rate": 0.00018024314403470955, "loss": 0.0663, "step": 5744 }, { "epoch": 1.4166666666666667, "grad_norm": 0.3984375, "learning_rate": 0.0001802225965720473, "loss": 0.0644, "step": 5746 }, { "epoch": 1.4171597633136095, "grad_norm": 0.45703125, "learning_rate": 0.00018020203960255953, "loss": 0.0663, "step": 5748 }, { "epoch": 1.4176528599605522, "grad_norm": 0.49609375, "learning_rate": 0.00018018147312868222, "loss": 0.0601, "step": 5750 }, { "epoch": 1.418145956607495, "grad_norm": 0.53515625, "learning_rate": 0.00018016089715285267, "loss": 0.0669, "step": 5752 }, { "epoch": 1.418639053254438, "grad_norm": 0.515625, "learning_rate": 0.00018014031167750923, "loss": 0.0582, "step": 5754 }, { "epoch": 1.4191321499013807, "grad_norm": 0.443359375, "learning_rate": 0.00018011971670509148, "loss": 0.0569, "step": 5756 }, { "epoch": 1.4196252465483234, "grad_norm": 0.361328125, "learning_rate": 0.0001800991122380399, "loss": 0.0603, "step": 5758 }, { "epoch": 1.4201183431952662, "grad_norm": 0.423828125, "learning_rate": 0.00018007849827879634, "loss": 0.0546, "step": 5760 }, { "epoch": 1.420611439842209, "grad_norm": 0.3671875, "learning_rate": 0.00018005787482980358, "loss": 0.0651, "step": 5762 }, { "epoch": 1.421104536489152, "grad_norm": 0.474609375, "learning_rate": 0.00018003724189350574, "loss": 0.064, "step": 5764 }, { "epoch": 1.4215976331360947, "grad_norm": 0.419921875, "learning_rate": 0.0001800165994723478, "loss": 0.0652, "step": 5766 }, { "epoch": 1.4220907297830374, "grad_norm": 0.435546875, "learning_rate": 0.00017999594756877612, "loss": 0.0653, "step": 5768 }, { "epoch": 1.4225838264299804, "grad_norm": 0.5390625, "learning_rate": 0.000179975286185238, "loss": 0.0618, "step": 5770 }, { "epoch": 1.4230769230769231, "grad_norm": 0.47265625, "learning_rate": 0.00017995461532418193, "loss": 0.0685, "step": 5772 }, { "epoch": 1.4235700197238659, "grad_norm": 0.4921875, "learning_rate": 0.00017993393498805753, "loss": 0.0649, "step": 5774 }, { "epoch": 1.4240631163708086, "grad_norm": 0.451171875, "learning_rate": 0.00017991324517931557, "loss": 0.0654, "step": 5776 }, { "epoch": 1.4245562130177514, "grad_norm": 0.54296875, "learning_rate": 0.00017989254590040785, "loss": 0.0648, "step": 5778 }, { "epoch": 1.4250493096646943, "grad_norm": 0.38671875, "learning_rate": 0.00017987183715378736, "loss": 0.0587, "step": 5780 }, { "epoch": 1.425542406311637, "grad_norm": 0.443359375, "learning_rate": 0.00017985111894190827, "loss": 0.068, "step": 5782 }, { "epoch": 1.4260355029585798, "grad_norm": 0.361328125, "learning_rate": 0.00017983039126722572, "loss": 0.0671, "step": 5784 }, { "epoch": 1.4265285996055228, "grad_norm": 0.384765625, "learning_rate": 0.0001798096541321961, "loss": 0.0646, "step": 5786 }, { "epoch": 1.4270216962524656, "grad_norm": 0.408203125, "learning_rate": 0.00017978890753927686, "loss": 0.0655, "step": 5788 }, { "epoch": 1.4275147928994083, "grad_norm": 0.416015625, "learning_rate": 0.0001797681514909266, "loss": 0.0613, "step": 5790 }, { "epoch": 1.428007889546351, "grad_norm": 0.462890625, "learning_rate": 0.000179747385989605, "loss": 0.0627, "step": 5792 }, { "epoch": 1.4285009861932938, "grad_norm": 0.369140625, "learning_rate": 0.00017972661103777293, "loss": 0.0621, "step": 5794 }, { "epoch": 1.4289940828402368, "grad_norm": 0.59375, "learning_rate": 0.0001797058266378923, "loss": 0.0625, "step": 5796 }, { "epoch": 1.4294871794871795, "grad_norm": 0.400390625, "learning_rate": 0.0001796850327924262, "loss": 0.0633, "step": 5798 }, { "epoch": 1.4299802761341223, "grad_norm": 0.45703125, "learning_rate": 0.00017966422950383882, "loss": 0.0601, "step": 5800 }, { "epoch": 1.430473372781065, "grad_norm": 0.40625, "learning_rate": 0.00017964341677459541, "loss": 0.0582, "step": 5802 }, { "epoch": 1.4309664694280078, "grad_norm": 0.427734375, "learning_rate": 0.00017962259460716252, "loss": 0.0623, "step": 5804 }, { "epoch": 1.4314595660749507, "grad_norm": 0.419921875, "learning_rate": 0.00017960176300400757, "loss": 0.0616, "step": 5806 }, { "epoch": 1.4319526627218935, "grad_norm": 0.291015625, "learning_rate": 0.00017958092196759923, "loss": 0.0559, "step": 5808 }, { "epoch": 1.4324457593688362, "grad_norm": 0.62109375, "learning_rate": 0.00017956007150040737, "loss": 0.0653, "step": 5810 }, { "epoch": 1.4329388560157792, "grad_norm": 0.2890625, "learning_rate": 0.00017953921160490278, "loss": 0.0601, "step": 5812 }, { "epoch": 1.433431952662722, "grad_norm": 0.466796875, "learning_rate": 0.00017951834228355757, "loss": 0.0643, "step": 5814 }, { "epoch": 1.4339250493096647, "grad_norm": 0.37109375, "learning_rate": 0.00017949746353884482, "loss": 0.0616, "step": 5816 }, { "epoch": 1.4344181459566074, "grad_norm": 0.38671875, "learning_rate": 0.00017947657537323877, "loss": 0.0679, "step": 5818 }, { "epoch": 1.4349112426035502, "grad_norm": 0.57421875, "learning_rate": 0.0001794556777892148, "loss": 0.0669, "step": 5820 }, { "epoch": 1.4354043392504932, "grad_norm": 0.388671875, "learning_rate": 0.0001794347707892494, "loss": 0.0614, "step": 5822 }, { "epoch": 1.435897435897436, "grad_norm": 0.76953125, "learning_rate": 0.00017941385437582016, "loss": 0.0654, "step": 5824 }, { "epoch": 1.4363905325443787, "grad_norm": 0.63671875, "learning_rate": 0.00017939292855140575, "loss": 0.0671, "step": 5826 }, { "epoch": 1.4368836291913216, "grad_norm": 0.447265625, "learning_rate": 0.00017937199331848603, "loss": 0.0648, "step": 5828 }, { "epoch": 1.4373767258382644, "grad_norm": 0.310546875, "learning_rate": 0.000179351048679542, "loss": 0.0572, "step": 5830 }, { "epoch": 1.4378698224852071, "grad_norm": 0.36328125, "learning_rate": 0.0001793300946370556, "loss": 0.0553, "step": 5832 }, { "epoch": 1.4383629191321499, "grad_norm": 0.443359375, "learning_rate": 0.0001793091311935101, "loss": 0.061, "step": 5834 }, { "epoch": 1.4388560157790926, "grad_norm": 0.5078125, "learning_rate": 0.00017928815835138978, "loss": 0.0618, "step": 5836 }, { "epoch": 1.4393491124260356, "grad_norm": 0.56640625, "learning_rate": 0.00017926717611317994, "loss": 0.0608, "step": 5838 }, { "epoch": 1.4398422090729783, "grad_norm": 0.62890625, "learning_rate": 0.00017924618448136718, "loss": 0.0614, "step": 5840 }, { "epoch": 1.440335305719921, "grad_norm": 0.65234375, "learning_rate": 0.00017922518345843912, "loss": 0.0631, "step": 5842 }, { "epoch": 1.4408284023668638, "grad_norm": 0.43359375, "learning_rate": 0.0001792041730468845, "loss": 0.0625, "step": 5844 }, { "epoch": 1.4413214990138066, "grad_norm": 0.515625, "learning_rate": 0.00017918315324919312, "loss": 0.0596, "step": 5846 }, { "epoch": 1.4418145956607495, "grad_norm": 0.4140625, "learning_rate": 0.000179162124067856, "loss": 0.061, "step": 5848 }, { "epoch": 1.4423076923076923, "grad_norm": 0.875, "learning_rate": 0.00017914108550536517, "loss": 0.0616, "step": 5850 }, { "epoch": 1.442800788954635, "grad_norm": 0.40625, "learning_rate": 0.00017912003756421384, "loss": 0.0561, "step": 5852 }, { "epoch": 1.443293885601578, "grad_norm": 0.7265625, "learning_rate": 0.00017909898024689633, "loss": 0.0611, "step": 5854 }, { "epoch": 1.4437869822485208, "grad_norm": 0.51953125, "learning_rate": 0.000179077913555908, "loss": 0.0556, "step": 5856 }, { "epoch": 1.4442800788954635, "grad_norm": 0.64453125, "learning_rate": 0.00017905683749374544, "loss": 0.0625, "step": 5858 }, { "epoch": 1.4447731755424063, "grad_norm": 0.58984375, "learning_rate": 0.00017903575206290625, "loss": 0.0587, "step": 5860 }, { "epoch": 1.445266272189349, "grad_norm": 0.546875, "learning_rate": 0.0001790146572658891, "loss": 0.0619, "step": 5862 }, { "epoch": 1.445759368836292, "grad_norm": 0.53125, "learning_rate": 0.00017899355310519393, "loss": 0.0613, "step": 5864 }, { "epoch": 1.4462524654832347, "grad_norm": 0.46875, "learning_rate": 0.00017897243958332168, "loss": 0.0602, "step": 5866 }, { "epoch": 1.4467455621301775, "grad_norm": 0.61328125, "learning_rate": 0.0001789513167027744, "loss": 0.0558, "step": 5868 }, { "epoch": 1.4472386587771204, "grad_norm": 0.44921875, "learning_rate": 0.00017893018446605533, "loss": 0.0684, "step": 5870 }, { "epoch": 1.4477317554240632, "grad_norm": 0.5078125, "learning_rate": 0.00017890904287566866, "loss": 0.0575, "step": 5872 }, { "epoch": 1.448224852071006, "grad_norm": 0.47265625, "learning_rate": 0.00017888789193411982, "loss": 0.058, "step": 5874 }, { "epoch": 1.4487179487179487, "grad_norm": 0.6484375, "learning_rate": 0.0001788667316439154, "loss": 0.057, "step": 5876 }, { "epoch": 1.4492110453648914, "grad_norm": 0.431640625, "learning_rate": 0.0001788455620075629, "loss": 0.0629, "step": 5878 }, { "epoch": 1.4497041420118344, "grad_norm": 0.66015625, "learning_rate": 0.0001788243830275711, "loss": 0.0605, "step": 5880 }, { "epoch": 1.4501972386587771, "grad_norm": 0.45703125, "learning_rate": 0.0001788031947064498, "loss": 0.0614, "step": 5882 }, { "epoch": 1.45069033530572, "grad_norm": 0.57421875, "learning_rate": 0.00017878199704670996, "loss": 0.0672, "step": 5884 }, { "epoch": 1.4511834319526626, "grad_norm": 0.68359375, "learning_rate": 0.0001787607900508636, "loss": 0.0541, "step": 5886 }, { "epoch": 1.4516765285996054, "grad_norm": 0.515625, "learning_rate": 0.00017873957372142388, "loss": 0.0628, "step": 5888 }, { "epoch": 1.4521696252465484, "grad_norm": 0.53515625, "learning_rate": 0.00017871834806090501, "loss": 0.0672, "step": 5890 }, { "epoch": 1.452662721893491, "grad_norm": 0.546875, "learning_rate": 0.00017869711307182245, "loss": 0.0644, "step": 5892 }, { "epoch": 1.4531558185404339, "grad_norm": 0.66015625, "learning_rate": 0.00017867586875669255, "loss": 0.0559, "step": 5894 }, { "epoch": 1.4536489151873768, "grad_norm": 0.55078125, "learning_rate": 0.00017865461511803297, "loss": 0.0601, "step": 5896 }, { "epoch": 1.4541420118343196, "grad_norm": 0.44921875, "learning_rate": 0.00017863335215836235, "loss": 0.0628, "step": 5898 }, { "epoch": 1.4546351084812623, "grad_norm": 0.396484375, "learning_rate": 0.00017861207988020044, "loss": 0.0648, "step": 5900 }, { "epoch": 1.455128205128205, "grad_norm": 0.486328125, "learning_rate": 0.00017859079828606817, "loss": 0.0598, "step": 5902 }, { "epoch": 1.4556213017751478, "grad_norm": 0.380859375, "learning_rate": 0.00017856950737848748, "loss": 0.0605, "step": 5904 }, { "epoch": 1.4561143984220908, "grad_norm": 0.59375, "learning_rate": 0.0001785482071599815, "loss": 0.057, "step": 5906 }, { "epoch": 1.4566074950690335, "grad_norm": 0.62109375, "learning_rate": 0.00017852689763307443, "loss": 0.0613, "step": 5908 }, { "epoch": 1.4571005917159763, "grad_norm": 0.462890625, "learning_rate": 0.00017850557880029158, "loss": 0.0629, "step": 5910 }, { "epoch": 1.4575936883629192, "grad_norm": 0.447265625, "learning_rate": 0.00017848425066415926, "loss": 0.0635, "step": 5912 }, { "epoch": 1.458086785009862, "grad_norm": 0.330078125, "learning_rate": 0.00017846291322720507, "loss": 0.0643, "step": 5914 }, { "epoch": 1.4585798816568047, "grad_norm": 0.447265625, "learning_rate": 0.00017844156649195759, "loss": 0.0584, "step": 5916 }, { "epoch": 1.4590729783037475, "grad_norm": 0.39453125, "learning_rate": 0.00017842021046094648, "loss": 0.0557, "step": 5918 }, { "epoch": 1.4595660749506902, "grad_norm": 0.3984375, "learning_rate": 0.00017839884513670264, "loss": 0.0615, "step": 5920 }, { "epoch": 1.4600591715976332, "grad_norm": 0.4375, "learning_rate": 0.0001783774705217579, "loss": 0.0593, "step": 5922 }, { "epoch": 1.460552268244576, "grad_norm": 0.380859375, "learning_rate": 0.0001783560866186453, "loss": 0.0604, "step": 5924 }, { "epoch": 1.4610453648915187, "grad_norm": 0.365234375, "learning_rate": 0.00017833469342989894, "loss": 0.057, "step": 5926 }, { "epoch": 1.4615384615384617, "grad_norm": 0.3671875, "learning_rate": 0.00017831329095805408, "loss": 0.0609, "step": 5928 }, { "epoch": 1.4620315581854042, "grad_norm": 0.5234375, "learning_rate": 0.00017829187920564697, "loss": 0.0628, "step": 5930 }, { "epoch": 1.4625246548323472, "grad_norm": 0.361328125, "learning_rate": 0.00017827045817521504, "loss": 0.063, "step": 5932 }, { "epoch": 1.46301775147929, "grad_norm": 0.423828125, "learning_rate": 0.00017824902786929683, "loss": 0.0599, "step": 5934 }, { "epoch": 1.4635108481262327, "grad_norm": 0.443359375, "learning_rate": 0.00017822758829043193, "loss": 0.0715, "step": 5936 }, { "epoch": 1.4640039447731756, "grad_norm": 0.412109375, "learning_rate": 0.00017820613944116102, "loss": 0.06, "step": 5938 }, { "epoch": 1.4644970414201184, "grad_norm": 0.490234375, "learning_rate": 0.00017818468132402597, "loss": 0.067, "step": 5940 }, { "epoch": 1.4649901380670611, "grad_norm": 0.41796875, "learning_rate": 0.00017816321394156966, "loss": 0.0627, "step": 5942 }, { "epoch": 1.4654832347140039, "grad_norm": 0.423828125, "learning_rate": 0.00017814173729633606, "loss": 0.0631, "step": 5944 }, { "epoch": 1.4659763313609466, "grad_norm": 0.5625, "learning_rate": 0.00017812025139087031, "loss": 0.0616, "step": 5946 }, { "epoch": 1.4664694280078896, "grad_norm": 0.65234375, "learning_rate": 0.00017809875622771864, "loss": 0.064, "step": 5948 }, { "epoch": 1.4669625246548323, "grad_norm": 0.5390625, "learning_rate": 0.0001780772518094283, "loss": 0.0592, "step": 5950 }, { "epoch": 1.467455621301775, "grad_norm": 0.375, "learning_rate": 0.00017805573813854767, "loss": 0.063, "step": 5952 }, { "epoch": 1.467948717948718, "grad_norm": 0.439453125, "learning_rate": 0.00017803421521762626, "loss": 0.0658, "step": 5954 }, { "epoch": 1.4684418145956608, "grad_norm": 0.490234375, "learning_rate": 0.00017801268304921467, "loss": 0.0677, "step": 5956 }, { "epoch": 1.4689349112426036, "grad_norm": 0.361328125, "learning_rate": 0.00017799114163586456, "loss": 0.0609, "step": 5958 }, { "epoch": 1.4694280078895463, "grad_norm": 0.384765625, "learning_rate": 0.00017796959098012873, "loss": 0.0571, "step": 5960 }, { "epoch": 1.469921104536489, "grad_norm": 0.296875, "learning_rate": 0.00017794803108456104, "loss": 0.0619, "step": 5962 }, { "epoch": 1.470414201183432, "grad_norm": 0.41015625, "learning_rate": 0.00017792646195171648, "loss": 0.061, "step": 5964 }, { "epoch": 1.4709072978303748, "grad_norm": 0.361328125, "learning_rate": 0.00017790488358415107, "loss": 0.0594, "step": 5966 }, { "epoch": 1.4714003944773175, "grad_norm": 0.3671875, "learning_rate": 0.00017788329598442197, "loss": 0.061, "step": 5968 }, { "epoch": 1.4718934911242605, "grad_norm": 0.373046875, "learning_rate": 0.0001778616991550875, "loss": 0.0587, "step": 5970 }, { "epoch": 1.4723865877712032, "grad_norm": 0.375, "learning_rate": 0.00017784009309870693, "loss": 0.0613, "step": 5972 }, { "epoch": 1.472879684418146, "grad_norm": 0.39453125, "learning_rate": 0.0001778184778178407, "loss": 0.0631, "step": 5974 }, { "epoch": 1.4733727810650887, "grad_norm": 0.35546875, "learning_rate": 0.00017779685331505043, "loss": 0.0566, "step": 5976 }, { "epoch": 1.4738658777120315, "grad_norm": 0.3515625, "learning_rate": 0.00017777521959289866, "loss": 0.0577, "step": 5978 }, { "epoch": 1.4743589743589745, "grad_norm": 0.40234375, "learning_rate": 0.0001777535766539491, "loss": 0.0574, "step": 5980 }, { "epoch": 1.4748520710059172, "grad_norm": 0.37890625, "learning_rate": 0.00017773192450076656, "loss": 0.0607, "step": 5982 }, { "epoch": 1.47534516765286, "grad_norm": 0.341796875, "learning_rate": 0.000177710263135917, "loss": 0.0622, "step": 5984 }, { "epoch": 1.4758382642998027, "grad_norm": 0.400390625, "learning_rate": 0.0001776885925619674, "loss": 0.058, "step": 5986 }, { "epoch": 1.4763313609467454, "grad_norm": 0.396484375, "learning_rate": 0.00017766691278148583, "loss": 0.0598, "step": 5988 }, { "epoch": 1.4768244575936884, "grad_norm": 0.58203125, "learning_rate": 0.00017764522379704143, "loss": 0.0608, "step": 5990 }, { "epoch": 1.4773175542406312, "grad_norm": 0.6953125, "learning_rate": 0.00017762352561120448, "loss": 0.0611, "step": 5992 }, { "epoch": 1.477810650887574, "grad_norm": 0.6484375, "learning_rate": 0.00017760181822654638, "loss": 0.0565, "step": 5994 }, { "epoch": 1.4783037475345169, "grad_norm": 0.5, "learning_rate": 0.00017758010164563954, "loss": 0.064, "step": 5996 }, { "epoch": 1.4787968441814596, "grad_norm": 0.47265625, "learning_rate": 0.0001775583758710575, "loss": 0.0593, "step": 5998 }, { "epoch": 1.4792899408284024, "grad_norm": 0.60546875, "learning_rate": 0.0001775366409053749, "loss": 0.0567, "step": 6000 }, { "epoch": 1.4797830374753451, "grad_norm": 0.51953125, "learning_rate": 0.00017751489675116742, "loss": 0.0581, "step": 6002 }, { "epoch": 1.4802761341222879, "grad_norm": 0.498046875, "learning_rate": 0.0001774931434110119, "loss": 0.0616, "step": 6004 }, { "epoch": 1.4807692307692308, "grad_norm": 0.427734375, "learning_rate": 0.00017747138088748622, "loss": 0.0608, "step": 6006 }, { "epoch": 1.4812623274161736, "grad_norm": 0.578125, "learning_rate": 0.00017744960918316936, "loss": 0.0625, "step": 6008 }, { "epoch": 1.4817554240631163, "grad_norm": 0.56640625, "learning_rate": 0.0001774278283006414, "loss": 0.0593, "step": 6010 }, { "epoch": 1.4822485207100593, "grad_norm": 0.482421875, "learning_rate": 0.00017740603824248347, "loss": 0.0575, "step": 6012 }, { "epoch": 1.482741617357002, "grad_norm": 0.3359375, "learning_rate": 0.00017738423901127783, "loss": 0.0539, "step": 6014 }, { "epoch": 1.4832347140039448, "grad_norm": 0.546875, "learning_rate": 0.0001773624306096078, "loss": 0.0624, "step": 6016 }, { "epoch": 1.4837278106508875, "grad_norm": 0.375, "learning_rate": 0.00017734061304005785, "loss": 0.0592, "step": 6018 }, { "epoch": 1.4842209072978303, "grad_norm": 0.337890625, "learning_rate": 0.0001773187863052134, "loss": 0.0592, "step": 6020 }, { "epoch": 1.4847140039447733, "grad_norm": 0.357421875, "learning_rate": 0.0001772969504076611, "loss": 0.0588, "step": 6022 }, { "epoch": 1.485207100591716, "grad_norm": 0.578125, "learning_rate": 0.0001772751053499886, "loss": 0.0611, "step": 6024 }, { "epoch": 1.4857001972386588, "grad_norm": 0.3828125, "learning_rate": 0.00017725325113478464, "loss": 0.0583, "step": 6026 }, { "epoch": 1.4861932938856015, "grad_norm": 0.310546875, "learning_rate": 0.00017723138776463913, "loss": 0.0534, "step": 6028 }, { "epoch": 1.4866863905325443, "grad_norm": 0.50390625, "learning_rate": 0.00017720951524214293, "loss": 0.0584, "step": 6030 }, { "epoch": 1.4871794871794872, "grad_norm": 0.515625, "learning_rate": 0.0001771876335698881, "loss": 0.0616, "step": 6032 }, { "epoch": 1.48767258382643, "grad_norm": 0.435546875, "learning_rate": 0.00017716574275046773, "loss": 0.0612, "step": 6034 }, { "epoch": 1.4881656804733727, "grad_norm": 0.3984375, "learning_rate": 0.00017714384278647605, "loss": 0.0568, "step": 6036 }, { "epoch": 1.4886587771203157, "grad_norm": 0.373046875, "learning_rate": 0.00017712193368050823, "loss": 0.0603, "step": 6038 }, { "epoch": 1.4891518737672584, "grad_norm": 0.43359375, "learning_rate": 0.0001771000154351607, "loss": 0.0606, "step": 6040 }, { "epoch": 1.4896449704142012, "grad_norm": 0.462890625, "learning_rate": 0.00017707808805303085, "loss": 0.0642, "step": 6042 }, { "epoch": 1.490138067061144, "grad_norm": 0.400390625, "learning_rate": 0.00017705615153671724, "loss": 0.0546, "step": 6044 }, { "epoch": 1.4906311637080867, "grad_norm": 0.384765625, "learning_rate": 0.00017703420588881946, "loss": 0.0587, "step": 6046 }, { "epoch": 1.4911242603550297, "grad_norm": 0.44921875, "learning_rate": 0.00017701225111193815, "loss": 0.0536, "step": 6048 }, { "epoch": 1.4916173570019724, "grad_norm": 0.4375, "learning_rate": 0.0001769902872086751, "loss": 0.0582, "step": 6050 }, { "epoch": 1.4921104536489151, "grad_norm": 0.5390625, "learning_rate": 0.0001769683141816332, "loss": 0.0592, "step": 6052 }, { "epoch": 1.4926035502958581, "grad_norm": 0.431640625, "learning_rate": 0.0001769463320334163, "loss": 0.06, "step": 6054 }, { "epoch": 1.4930966469428009, "grad_norm": 0.458984375, "learning_rate": 0.0001769243407666294, "loss": 0.0585, "step": 6056 }, { "epoch": 1.4935897435897436, "grad_norm": 0.373046875, "learning_rate": 0.00017690234038387868, "loss": 0.0618, "step": 6058 }, { "epoch": 1.4940828402366864, "grad_norm": 0.376953125, "learning_rate": 0.00017688033088777126, "loss": 0.0503, "step": 6060 }, { "epoch": 1.494575936883629, "grad_norm": 0.51953125, "learning_rate": 0.00017685831228091535, "loss": 0.0594, "step": 6062 }, { "epoch": 1.495069033530572, "grad_norm": 0.359375, "learning_rate": 0.00017683628456592035, "loss": 0.0589, "step": 6064 }, { "epoch": 1.4955621301775148, "grad_norm": 0.53125, "learning_rate": 0.00017681424774539662, "loss": 0.067, "step": 6066 }, { "epoch": 1.4960552268244576, "grad_norm": 0.42578125, "learning_rate": 0.00017679220182195563, "loss": 0.0567, "step": 6068 }, { "epoch": 1.4965483234714003, "grad_norm": 0.7109375, "learning_rate": 0.00017677014679821, "loss": 0.0654, "step": 6070 }, { "epoch": 1.497041420118343, "grad_norm": 0.408203125, "learning_rate": 0.00017674808267677334, "loss": 0.0604, "step": 6072 }, { "epoch": 1.497534516765286, "grad_norm": 0.53515625, "learning_rate": 0.00017672600946026036, "loss": 0.0568, "step": 6074 }, { "epoch": 1.4980276134122288, "grad_norm": 0.546875, "learning_rate": 0.0001767039271512869, "loss": 0.0596, "step": 6076 }, { "epoch": 1.4985207100591715, "grad_norm": 0.404296875, "learning_rate": 0.0001766818357524698, "loss": 0.0604, "step": 6078 }, { "epoch": 1.4990138067061145, "grad_norm": 0.53125, "learning_rate": 0.00017665973526642703, "loss": 0.0594, "step": 6080 }, { "epoch": 1.4995069033530573, "grad_norm": 0.3671875, "learning_rate": 0.00017663762569577763, "loss": 0.0705, "step": 6082 }, { "epoch": 1.5, "grad_norm": 0.396484375, "learning_rate": 0.00017661550704314166, "loss": 0.0694, "step": 6084 }, { "epoch": 1.5004930966469427, "grad_norm": 0.349609375, "learning_rate": 0.00017659337931114037, "loss": 0.0585, "step": 6086 }, { "epoch": 1.5009861932938855, "grad_norm": 0.298828125, "learning_rate": 0.00017657124250239597, "loss": 0.0563, "step": 6088 }, { "epoch": 1.5014792899408285, "grad_norm": 0.337890625, "learning_rate": 0.00017654909661953184, "loss": 0.0601, "step": 6090 }, { "epoch": 1.5019723865877712, "grad_norm": 0.4140625, "learning_rate": 0.00017652694166517234, "loss": 0.0623, "step": 6092 }, { "epoch": 1.502465483234714, "grad_norm": 0.4375, "learning_rate": 0.00017650477764194296, "loss": 0.0601, "step": 6094 }, { "epoch": 1.502958579881657, "grad_norm": 0.408203125, "learning_rate": 0.00017648260455247034, "loss": 0.0584, "step": 6096 }, { "epoch": 1.5034516765285995, "grad_norm": 0.361328125, "learning_rate": 0.00017646042239938202, "loss": 0.0652, "step": 6098 }, { "epoch": 1.5039447731755424, "grad_norm": 0.5703125, "learning_rate": 0.00017643823118530675, "loss": 0.0574, "step": 6100 }, { "epoch": 1.5044378698224852, "grad_norm": 0.30078125, "learning_rate": 0.0001764160309128743, "loss": 0.0588, "step": 6102 }, { "epoch": 1.504930966469428, "grad_norm": 0.349609375, "learning_rate": 0.00017639382158471555, "loss": 0.0574, "step": 6104 }, { "epoch": 1.505424063116371, "grad_norm": 0.46484375, "learning_rate": 0.0001763716032034624, "loss": 0.057, "step": 6106 }, { "epoch": 1.5059171597633136, "grad_norm": 0.396484375, "learning_rate": 0.0001763493757717479, "loss": 0.0625, "step": 6108 }, { "epoch": 1.5064102564102564, "grad_norm": 0.384765625, "learning_rate": 0.00017632713929220605, "loss": 0.0601, "step": 6110 }, { "epoch": 1.5069033530571994, "grad_norm": 0.455078125, "learning_rate": 0.00017630489376747204, "loss": 0.0626, "step": 6112 }, { "epoch": 1.5073964497041419, "grad_norm": 0.416015625, "learning_rate": 0.00017628263920018214, "loss": 0.0622, "step": 6114 }, { "epoch": 1.5078895463510849, "grad_norm": 0.33984375, "learning_rate": 0.00017626037559297352, "loss": 0.0559, "step": 6116 }, { "epoch": 1.5083826429980276, "grad_norm": 0.41015625, "learning_rate": 0.00017623810294848465, "loss": 0.0575, "step": 6118 }, { "epoch": 1.5088757396449703, "grad_norm": 0.39453125, "learning_rate": 0.0001762158212693549, "loss": 0.0591, "step": 6120 }, { "epoch": 1.5093688362919133, "grad_norm": 0.416015625, "learning_rate": 0.0001761935305582248, "loss": 0.058, "step": 6122 }, { "epoch": 1.509861932938856, "grad_norm": 0.361328125, "learning_rate": 0.00017617123081773591, "loss": 0.0521, "step": 6124 }, { "epoch": 1.5103550295857988, "grad_norm": 0.365234375, "learning_rate": 0.00017614892205053092, "loss": 0.0593, "step": 6126 }, { "epoch": 1.5108481262327418, "grad_norm": 0.44921875, "learning_rate": 0.00017612660425925347, "loss": 0.0624, "step": 6128 }, { "epoch": 1.5113412228796843, "grad_norm": 0.47265625, "learning_rate": 0.00017610427744654838, "loss": 0.06, "step": 6130 }, { "epoch": 1.5118343195266273, "grad_norm": 0.498046875, "learning_rate": 0.0001760819416150615, "loss": 0.0588, "step": 6132 }, { "epoch": 1.51232741617357, "grad_norm": 0.4921875, "learning_rate": 0.00017605959676743977, "loss": 0.0587, "step": 6134 }, { "epoch": 1.5128205128205128, "grad_norm": 0.52734375, "learning_rate": 0.00017603724290633115, "loss": 0.0598, "step": 6136 }, { "epoch": 1.5133136094674557, "grad_norm": 0.7734375, "learning_rate": 0.00017601488003438472, "loss": 0.0644, "step": 6138 }, { "epoch": 1.5138067061143983, "grad_norm": 0.71484375, "learning_rate": 0.00017599250815425056, "loss": 0.0625, "step": 6140 }, { "epoch": 1.5142998027613412, "grad_norm": 0.51171875, "learning_rate": 0.00017597012726857994, "loss": 0.0552, "step": 6142 }, { "epoch": 1.514792899408284, "grad_norm": 0.396484375, "learning_rate": 0.00017594773738002504, "loss": 0.0592, "step": 6144 }, { "epoch": 1.5152859960552267, "grad_norm": 0.65625, "learning_rate": 0.00017592533849123925, "loss": 0.0589, "step": 6146 }, { "epoch": 1.5157790927021697, "grad_norm": 0.34375, "learning_rate": 0.0001759029306048769, "loss": 0.0571, "step": 6148 }, { "epoch": 1.5162721893491125, "grad_norm": 0.66796875, "learning_rate": 0.00017588051372359353, "loss": 0.0596, "step": 6150 }, { "epoch": 1.5167652859960552, "grad_norm": 0.431640625, "learning_rate": 0.00017585808785004558, "loss": 0.063, "step": 6152 }, { "epoch": 1.5172583826429982, "grad_norm": 0.39453125, "learning_rate": 0.00017583565298689071, "loss": 0.0586, "step": 6154 }, { "epoch": 1.5177514792899407, "grad_norm": 0.5546875, "learning_rate": 0.00017581320913678754, "loss": 0.0583, "step": 6156 }, { "epoch": 1.5182445759368837, "grad_norm": 0.37890625, "learning_rate": 0.00017579075630239581, "loss": 0.0612, "step": 6158 }, { "epoch": 1.5187376725838264, "grad_norm": 0.447265625, "learning_rate": 0.00017576829448637629, "loss": 0.055, "step": 6160 }, { "epoch": 1.5192307692307692, "grad_norm": 0.42578125, "learning_rate": 0.00017574582369139083, "loss": 0.0605, "step": 6162 }, { "epoch": 1.5197238658777121, "grad_norm": 0.57421875, "learning_rate": 0.00017572334392010236, "loss": 0.0614, "step": 6164 }, { "epoch": 1.5202169625246549, "grad_norm": 0.443359375, "learning_rate": 0.00017570085517517483, "loss": 0.0598, "step": 6166 }, { "epoch": 1.5207100591715976, "grad_norm": 0.45703125, "learning_rate": 0.00017567835745927333, "loss": 0.0601, "step": 6168 }, { "epoch": 1.5212031558185406, "grad_norm": 0.482421875, "learning_rate": 0.00017565585077506394, "loss": 0.0542, "step": 6170 }, { "epoch": 1.5216962524654831, "grad_norm": 0.388671875, "learning_rate": 0.00017563333512521378, "loss": 0.0562, "step": 6172 }, { "epoch": 1.522189349112426, "grad_norm": 0.328125, "learning_rate": 0.00017561081051239118, "loss": 0.0566, "step": 6174 }, { "epoch": 1.5226824457593688, "grad_norm": 0.380859375, "learning_rate": 0.00017558827693926534, "loss": 0.0602, "step": 6176 }, { "epoch": 1.5231755424063116, "grad_norm": 0.48828125, "learning_rate": 0.00017556573440850667, "loss": 0.0561, "step": 6178 }, { "epoch": 1.5236686390532546, "grad_norm": 0.546875, "learning_rate": 0.00017554318292278657, "loss": 0.0599, "step": 6180 }, { "epoch": 1.524161735700197, "grad_norm": 0.400390625, "learning_rate": 0.0001755206224847775, "loss": 0.0632, "step": 6182 }, { "epoch": 1.52465483234714, "grad_norm": 0.734375, "learning_rate": 0.00017549805309715304, "loss": 0.0615, "step": 6184 }, { "epoch": 1.5251479289940828, "grad_norm": 0.42578125, "learning_rate": 0.00017547547476258775, "loss": 0.0577, "step": 6186 }, { "epoch": 1.5256410256410255, "grad_norm": 0.447265625, "learning_rate": 0.0001754528874837573, "loss": 0.0604, "step": 6188 }, { "epoch": 1.5261341222879685, "grad_norm": 0.66796875, "learning_rate": 0.00017543029126333836, "loss": 0.0589, "step": 6190 }, { "epoch": 1.5266272189349113, "grad_norm": 0.38671875, "learning_rate": 0.00017540768610400882, "loss": 0.0605, "step": 6192 }, { "epoch": 1.527120315581854, "grad_norm": 0.38671875, "learning_rate": 0.00017538507200844742, "loss": 0.0621, "step": 6194 }, { "epoch": 1.527613412228797, "grad_norm": 0.349609375, "learning_rate": 0.0001753624489793341, "loss": 0.0541, "step": 6196 }, { "epoch": 1.5281065088757395, "grad_norm": 0.48046875, "learning_rate": 0.0001753398170193498, "loss": 0.0557, "step": 6198 }, { "epoch": 1.5285996055226825, "grad_norm": 0.359375, "learning_rate": 0.00017531717613117654, "loss": 0.0589, "step": 6200 }, { "epoch": 1.5290927021696252, "grad_norm": 0.419921875, "learning_rate": 0.00017529452631749742, "loss": 0.0592, "step": 6202 }, { "epoch": 1.529585798816568, "grad_norm": 0.322265625, "learning_rate": 0.00017527186758099653, "loss": 0.053, "step": 6204 }, { "epoch": 1.530078895463511, "grad_norm": 0.45703125, "learning_rate": 0.000175249199924359, "loss": 0.0633, "step": 6206 }, { "epoch": 1.5305719921104537, "grad_norm": 0.4765625, "learning_rate": 0.00017522652335027122, "loss": 0.0621, "step": 6208 }, { "epoch": 1.5310650887573964, "grad_norm": 0.36328125, "learning_rate": 0.00017520383786142038, "loss": 0.0589, "step": 6210 }, { "epoch": 1.5315581854043394, "grad_norm": 0.482421875, "learning_rate": 0.0001751811434604949, "loss": 0.0661, "step": 6212 }, { "epoch": 1.532051282051282, "grad_norm": 0.3359375, "learning_rate": 0.00017515844015018414, "loss": 0.0581, "step": 6214 }, { "epoch": 1.532544378698225, "grad_norm": 0.33984375, "learning_rate": 0.0001751357279331786, "loss": 0.0573, "step": 6216 }, { "epoch": 1.5330374753451677, "grad_norm": 0.392578125, "learning_rate": 0.00017511300681216977, "loss": 0.0564, "step": 6218 }, { "epoch": 1.5335305719921104, "grad_norm": 0.41796875, "learning_rate": 0.00017509027678985027, "loss": 0.0564, "step": 6220 }, { "epoch": 1.5340236686390534, "grad_norm": 0.396484375, "learning_rate": 0.0001750675378689137, "loss": 0.0562, "step": 6222 }, { "epoch": 1.534516765285996, "grad_norm": 0.318359375, "learning_rate": 0.0001750447900520548, "loss": 0.054, "step": 6224 }, { "epoch": 1.5350098619329389, "grad_norm": 0.404296875, "learning_rate": 0.0001750220333419693, "loss": 0.0524, "step": 6226 }, { "epoch": 1.5355029585798816, "grad_norm": 0.42578125, "learning_rate": 0.00017499926774135395, "loss": 0.0618, "step": 6228 }, { "epoch": 1.5359960552268244, "grad_norm": 0.373046875, "learning_rate": 0.00017497649325290662, "loss": 0.058, "step": 6230 }, { "epoch": 1.5364891518737673, "grad_norm": 0.408203125, "learning_rate": 0.00017495370987932624, "loss": 0.0574, "step": 6232 }, { "epoch": 1.53698224852071, "grad_norm": 0.388671875, "learning_rate": 0.00017493091762331276, "loss": 0.0544, "step": 6234 }, { "epoch": 1.5374753451676528, "grad_norm": 0.421875, "learning_rate": 0.00017490811648756716, "loss": 0.0554, "step": 6236 }, { "epoch": 1.5379684418145958, "grad_norm": 0.3203125, "learning_rate": 0.00017488530647479154, "loss": 0.0562, "step": 6238 }, { "epoch": 1.5384615384615383, "grad_norm": 0.431640625, "learning_rate": 0.00017486248758768897, "loss": 0.0562, "step": 6240 }, { "epoch": 1.5389546351084813, "grad_norm": 0.65625, "learning_rate": 0.00017483965982896367, "loss": 0.0604, "step": 6242 }, { "epoch": 1.539447731755424, "grad_norm": 0.61328125, "learning_rate": 0.00017481682320132083, "loss": 0.0629, "step": 6244 }, { "epoch": 1.5399408284023668, "grad_norm": 0.6171875, "learning_rate": 0.0001747939777074667, "loss": 0.0618, "step": 6246 }, { "epoch": 1.5404339250493098, "grad_norm": 0.66015625, "learning_rate": 0.00017477112335010864, "loss": 0.0574, "step": 6248 }, { "epoch": 1.5409270216962525, "grad_norm": 0.396484375, "learning_rate": 0.00017474826013195495, "loss": 0.0617, "step": 6250 }, { "epoch": 1.5414201183431953, "grad_norm": 0.474609375, "learning_rate": 0.00017472538805571514, "loss": 0.057, "step": 6252 }, { "epoch": 1.5419132149901382, "grad_norm": 0.388671875, "learning_rate": 0.0001747025071240996, "loss": 0.0568, "step": 6254 }, { "epoch": 1.5424063116370808, "grad_norm": 0.546875, "learning_rate": 0.0001746796173398199, "loss": 0.0583, "step": 6256 }, { "epoch": 1.5428994082840237, "grad_norm": 0.42578125, "learning_rate": 0.00017465671870558856, "loss": 0.0609, "step": 6258 }, { "epoch": 1.5433925049309665, "grad_norm": 0.400390625, "learning_rate": 0.00017463381122411926, "loss": 0.057, "step": 6260 }, { "epoch": 1.5438856015779092, "grad_norm": 0.43359375, "learning_rate": 0.00017461089489812661, "loss": 0.0545, "step": 6262 }, { "epoch": 1.5443786982248522, "grad_norm": 0.435546875, "learning_rate": 0.00017458796973032637, "loss": 0.0546, "step": 6264 }, { "epoch": 1.5448717948717947, "grad_norm": 0.345703125, "learning_rate": 0.00017456503572343523, "loss": 0.054, "step": 6266 }, { "epoch": 1.5453648915187377, "grad_norm": 0.42578125, "learning_rate": 0.00017454209288017105, "loss": 0.0599, "step": 6268 }, { "epoch": 1.5458579881656804, "grad_norm": 0.42578125, "learning_rate": 0.0001745191412032527, "loss": 0.0514, "step": 6270 }, { "epoch": 1.5463510848126232, "grad_norm": 0.3359375, "learning_rate": 0.00017449618069540003, "loss": 0.0613, "step": 6272 }, { "epoch": 1.5468441814595661, "grad_norm": 0.40625, "learning_rate": 0.000174473211359334, "loss": 0.06, "step": 6274 }, { "epoch": 1.547337278106509, "grad_norm": 0.37109375, "learning_rate": 0.00017445023319777664, "loss": 0.0633, "step": 6276 }, { "epoch": 1.5478303747534516, "grad_norm": 0.376953125, "learning_rate": 0.00017442724621345097, "loss": 0.0552, "step": 6278 }, { "epoch": 1.5483234714003946, "grad_norm": 0.373046875, "learning_rate": 0.00017440425040908108, "loss": 0.0614, "step": 6280 }, { "epoch": 1.5488165680473371, "grad_norm": 0.333984375, "learning_rate": 0.00017438124578739208, "loss": 0.0592, "step": 6282 }, { "epoch": 1.54930966469428, "grad_norm": 0.328125, "learning_rate": 0.00017435823235111017, "loss": 0.0554, "step": 6284 }, { "epoch": 1.5498027613412229, "grad_norm": 0.34765625, "learning_rate": 0.00017433521010296255, "loss": 0.0594, "step": 6286 }, { "epoch": 1.5502958579881656, "grad_norm": 0.423828125, "learning_rate": 0.00017431217904567752, "loss": 0.0633, "step": 6288 }, { "epoch": 1.5507889546351086, "grad_norm": 0.38671875, "learning_rate": 0.00017428913918198435, "loss": 0.0569, "step": 6290 }, { "epoch": 1.5512820512820513, "grad_norm": 0.380859375, "learning_rate": 0.00017426609051461345, "loss": 0.0525, "step": 6292 }, { "epoch": 1.551775147928994, "grad_norm": 0.404296875, "learning_rate": 0.00017424303304629612, "loss": 0.0578, "step": 6294 }, { "epoch": 1.552268244575937, "grad_norm": 0.337890625, "learning_rate": 0.00017421996677976487, "loss": 0.0594, "step": 6296 }, { "epoch": 1.5527613412228796, "grad_norm": 0.451171875, "learning_rate": 0.00017419689171775318, "loss": 0.0574, "step": 6298 }, { "epoch": 1.5532544378698225, "grad_norm": 0.375, "learning_rate": 0.00017417380786299556, "loss": 0.062, "step": 6300 }, { "epoch": 1.5537475345167653, "grad_norm": 0.39453125, "learning_rate": 0.00017415071521822756, "loss": 0.0554, "step": 6302 }, { "epoch": 1.554240631163708, "grad_norm": 0.38671875, "learning_rate": 0.0001741276137861858, "loss": 0.0558, "step": 6304 }, { "epoch": 1.554733727810651, "grad_norm": 0.453125, "learning_rate": 0.00017410450356960795, "loss": 0.0552, "step": 6306 }, { "epoch": 1.5552268244575935, "grad_norm": 0.384765625, "learning_rate": 0.00017408138457123266, "loss": 0.0594, "step": 6308 }, { "epoch": 1.5557199211045365, "grad_norm": 0.4296875, "learning_rate": 0.0001740582567937997, "loss": 0.0563, "step": 6310 }, { "epoch": 1.5562130177514792, "grad_norm": 0.376953125, "learning_rate": 0.00017403512024004983, "loss": 0.0568, "step": 6312 }, { "epoch": 1.556706114398422, "grad_norm": 0.482421875, "learning_rate": 0.00017401197491272483, "loss": 0.0607, "step": 6314 }, { "epoch": 1.557199211045365, "grad_norm": 0.392578125, "learning_rate": 0.00017398882081456755, "loss": 0.0571, "step": 6316 }, { "epoch": 1.5576923076923077, "grad_norm": 0.3359375, "learning_rate": 0.00017396565794832193, "loss": 0.0574, "step": 6318 }, { "epoch": 1.5581854043392505, "grad_norm": 0.337890625, "learning_rate": 0.00017394248631673288, "loss": 0.0591, "step": 6320 }, { "epoch": 1.5586785009861934, "grad_norm": 0.384765625, "learning_rate": 0.00017391930592254636, "loss": 0.0574, "step": 6322 }, { "epoch": 1.559171597633136, "grad_norm": 0.53515625, "learning_rate": 0.00017389611676850938, "loss": 0.0619, "step": 6324 }, { "epoch": 1.559664694280079, "grad_norm": 0.345703125, "learning_rate": 0.00017387291885736996, "loss": 0.0542, "step": 6326 }, { "epoch": 1.5601577909270217, "grad_norm": 0.69921875, "learning_rate": 0.0001738497121918772, "loss": 0.0557, "step": 6328 }, { "epoch": 1.5606508875739644, "grad_norm": 0.361328125, "learning_rate": 0.00017382649677478122, "loss": 0.0618, "step": 6330 }, { "epoch": 1.5611439842209074, "grad_norm": 0.37109375, "learning_rate": 0.00017380327260883317, "loss": 0.0608, "step": 6332 }, { "epoch": 1.5616370808678501, "grad_norm": 0.41015625, "learning_rate": 0.00017378003969678528, "loss": 0.0559, "step": 6334 }, { "epoch": 1.5621301775147929, "grad_norm": 0.421875, "learning_rate": 0.00017375679804139076, "loss": 0.0549, "step": 6336 }, { "epoch": 1.5626232741617359, "grad_norm": 0.5703125, "learning_rate": 0.00017373354764540385, "loss": 0.0603, "step": 6338 }, { "epoch": 1.5631163708086784, "grad_norm": 0.515625, "learning_rate": 0.00017371028851157985, "loss": 0.0611, "step": 6340 }, { "epoch": 1.5636094674556213, "grad_norm": 0.37109375, "learning_rate": 0.00017368702064267516, "loss": 0.0478, "step": 6342 }, { "epoch": 1.564102564102564, "grad_norm": 0.490234375, "learning_rate": 0.00017366374404144706, "loss": 0.0556, "step": 6344 }, { "epoch": 1.5645956607495068, "grad_norm": 0.4140625, "learning_rate": 0.000173640458710654, "loss": 0.056, "step": 6346 }, { "epoch": 1.5650887573964498, "grad_norm": 0.48828125, "learning_rate": 0.00017361716465305547, "loss": 0.0577, "step": 6348 }, { "epoch": 1.5655818540433923, "grad_norm": 0.357421875, "learning_rate": 0.0001735938618714119, "loss": 0.0542, "step": 6350 }, { "epoch": 1.5660749506903353, "grad_norm": 0.38671875, "learning_rate": 0.00017357055036848476, "loss": 0.0551, "step": 6352 }, { "epoch": 1.566568047337278, "grad_norm": 0.3125, "learning_rate": 0.00017354723014703665, "loss": 0.055, "step": 6354 }, { "epoch": 1.5670611439842208, "grad_norm": 0.3828125, "learning_rate": 0.00017352390120983118, "loss": 0.0572, "step": 6356 }, { "epoch": 1.5675542406311638, "grad_norm": 0.373046875, "learning_rate": 0.00017350056355963287, "loss": 0.0583, "step": 6358 }, { "epoch": 1.5680473372781065, "grad_norm": 0.39453125, "learning_rate": 0.0001734772171992074, "loss": 0.0573, "step": 6360 }, { "epoch": 1.5685404339250493, "grad_norm": 0.3671875, "learning_rate": 0.00017345386213132147, "loss": 0.057, "step": 6362 }, { "epoch": 1.5690335305719922, "grad_norm": 0.345703125, "learning_rate": 0.00017343049835874275, "loss": 0.0547, "step": 6364 }, { "epoch": 1.5695266272189348, "grad_norm": 0.400390625, "learning_rate": 0.00017340712588424, "loss": 0.0564, "step": 6366 }, { "epoch": 1.5700197238658777, "grad_norm": 0.40234375, "learning_rate": 0.00017338374471058296, "loss": 0.0612, "step": 6368 }, { "epoch": 1.5705128205128205, "grad_norm": 0.443359375, "learning_rate": 0.00017336035484054248, "loss": 0.0606, "step": 6370 }, { "epoch": 1.5710059171597632, "grad_norm": 0.48046875, "learning_rate": 0.00017333695627689034, "loss": 0.0643, "step": 6372 }, { "epoch": 1.5714990138067062, "grad_norm": 0.369140625, "learning_rate": 0.00017331354902239943, "loss": 0.0645, "step": 6374 }, { "epoch": 1.571992110453649, "grad_norm": 0.5546875, "learning_rate": 0.0001732901330798436, "loss": 0.0552, "step": 6376 }, { "epoch": 1.5724852071005917, "grad_norm": 0.51171875, "learning_rate": 0.00017326670845199785, "loss": 0.0577, "step": 6378 }, { "epoch": 1.5729783037475347, "grad_norm": 0.388671875, "learning_rate": 0.00017324327514163802, "loss": 0.0598, "step": 6380 }, { "epoch": 1.5734714003944772, "grad_norm": 0.376953125, "learning_rate": 0.0001732198331515412, "loss": 0.0605, "step": 6382 }, { "epoch": 1.5739644970414202, "grad_norm": 0.5390625, "learning_rate": 0.00017319638248448533, "loss": 0.0597, "step": 6384 }, { "epoch": 1.574457593688363, "grad_norm": 0.328125, "learning_rate": 0.00017317292314324945, "loss": 0.0604, "step": 6386 }, { "epoch": 1.5749506903353057, "grad_norm": 0.392578125, "learning_rate": 0.00017314945513061364, "loss": 0.0596, "step": 6388 }, { "epoch": 1.5754437869822486, "grad_norm": 0.419921875, "learning_rate": 0.000173125978449359, "loss": 0.0525, "step": 6390 }, { "epoch": 1.5759368836291914, "grad_norm": 0.369140625, "learning_rate": 0.00017310249310226758, "loss": 0.0592, "step": 6392 }, { "epoch": 1.5764299802761341, "grad_norm": 0.35546875, "learning_rate": 0.0001730789990921226, "loss": 0.0586, "step": 6394 }, { "epoch": 1.5769230769230769, "grad_norm": 0.3515625, "learning_rate": 0.00017305549642170819, "loss": 0.0615, "step": 6396 }, { "epoch": 1.5774161735700196, "grad_norm": 0.396484375, "learning_rate": 0.0001730319850938096, "loss": 0.0535, "step": 6398 }, { "epoch": 1.5779092702169626, "grad_norm": 0.474609375, "learning_rate": 0.00017300846511121302, "loss": 0.0619, "step": 6400 }, { "epoch": 1.5784023668639053, "grad_norm": 0.357421875, "learning_rate": 0.0001729849364767057, "loss": 0.0636, "step": 6402 }, { "epoch": 1.578895463510848, "grad_norm": 0.44921875, "learning_rate": 0.00017296139919307585, "loss": 0.0618, "step": 6404 }, { "epoch": 1.579388560157791, "grad_norm": 0.4375, "learning_rate": 0.00017293785326311288, "loss": 0.0607, "step": 6406 }, { "epoch": 1.5798816568047336, "grad_norm": 0.390625, "learning_rate": 0.00017291429868960707, "loss": 0.0557, "step": 6408 }, { "epoch": 1.5803747534516766, "grad_norm": 0.345703125, "learning_rate": 0.00017289073547534976, "loss": 0.0608, "step": 6410 }, { "epoch": 1.5808678500986193, "grad_norm": 0.365234375, "learning_rate": 0.00017286716362313332, "loss": 0.0567, "step": 6412 }, { "epoch": 1.581360946745562, "grad_norm": 0.412109375, "learning_rate": 0.00017284358313575116, "loss": 0.0599, "step": 6414 }, { "epoch": 1.581854043392505, "grad_norm": 0.341796875, "learning_rate": 0.00017281999401599766, "loss": 0.0548, "step": 6416 }, { "epoch": 1.5823471400394478, "grad_norm": 0.34375, "learning_rate": 0.0001727963962666683, "loss": 0.0591, "step": 6418 }, { "epoch": 1.5828402366863905, "grad_norm": 0.33984375, "learning_rate": 0.00017277278989055958, "loss": 0.0554, "step": 6420 }, { "epoch": 1.5833333333333335, "grad_norm": 0.3359375, "learning_rate": 0.00017274917489046892, "loss": 0.0538, "step": 6422 }, { "epoch": 1.583826429980276, "grad_norm": 0.474609375, "learning_rate": 0.00017272555126919488, "loss": 0.0548, "step": 6424 }, { "epoch": 1.584319526627219, "grad_norm": 0.42578125, "learning_rate": 0.00017270191902953695, "loss": 0.061, "step": 6426 }, { "epoch": 1.5848126232741617, "grad_norm": 0.5234375, "learning_rate": 0.00017267827817429567, "loss": 0.062, "step": 6428 }, { "epoch": 1.5853057199211045, "grad_norm": 0.33984375, "learning_rate": 0.0001726546287062727, "loss": 0.0517, "step": 6430 }, { "epoch": 1.5857988165680474, "grad_norm": 0.34375, "learning_rate": 0.00017263097062827053, "loss": 0.0584, "step": 6432 }, { "epoch": 1.5862919132149902, "grad_norm": 0.3984375, "learning_rate": 0.00017260730394309283, "loss": 0.065, "step": 6434 }, { "epoch": 1.586785009861933, "grad_norm": 0.41015625, "learning_rate": 0.00017258362865354426, "loss": 0.0635, "step": 6436 }, { "epoch": 1.5872781065088757, "grad_norm": 0.3671875, "learning_rate": 0.0001725599447624304, "loss": 0.054, "step": 6438 }, { "epoch": 1.5877712031558184, "grad_norm": 0.3828125, "learning_rate": 0.00017253625227255798, "loss": 0.0576, "step": 6440 }, { "epoch": 1.5882642998027614, "grad_norm": 0.458984375, "learning_rate": 0.00017251255118673467, "loss": 0.0611, "step": 6442 }, { "epoch": 1.5887573964497042, "grad_norm": 0.361328125, "learning_rate": 0.0001724888415077692, "loss": 0.0537, "step": 6444 }, { "epoch": 1.589250493096647, "grad_norm": 0.5390625, "learning_rate": 0.00017246512323847125, "loss": 0.0619, "step": 6446 }, { "epoch": 1.5897435897435899, "grad_norm": 0.333984375, "learning_rate": 0.00017244139638165163, "loss": 0.0591, "step": 6448 }, { "epoch": 1.5902366863905324, "grad_norm": 0.546875, "learning_rate": 0.00017241766094012206, "loss": 0.0605, "step": 6450 }, { "epoch": 1.5907297830374754, "grad_norm": 0.4453125, "learning_rate": 0.00017239391691669536, "loss": 0.057, "step": 6452 }, { "epoch": 1.5912228796844181, "grad_norm": 0.365234375, "learning_rate": 0.0001723701643141853, "loss": 0.0507, "step": 6454 }, { "epoch": 1.5917159763313609, "grad_norm": 0.4375, "learning_rate": 0.00017234640313540674, "loss": 0.0583, "step": 6456 }, { "epoch": 1.5922090729783038, "grad_norm": 0.515625, "learning_rate": 0.00017232263338317542, "loss": 0.0614, "step": 6458 }, { "epoch": 1.5927021696252466, "grad_norm": 0.51953125, "learning_rate": 0.00017229885506030827, "loss": 0.057, "step": 6460 }, { "epoch": 1.5931952662721893, "grad_norm": 0.365234375, "learning_rate": 0.00017227506816962316, "loss": 0.0614, "step": 6462 }, { "epoch": 1.5936883629191323, "grad_norm": 0.50390625, "learning_rate": 0.0001722512727139389, "loss": 0.0623, "step": 6464 }, { "epoch": 1.5941814595660748, "grad_norm": 0.380859375, "learning_rate": 0.00017222746869607542, "loss": 0.0579, "step": 6466 }, { "epoch": 1.5946745562130178, "grad_norm": 0.423828125, "learning_rate": 0.00017220365611885367, "loss": 0.055, "step": 6468 }, { "epoch": 1.5951676528599605, "grad_norm": 0.43359375, "learning_rate": 0.0001721798349850955, "loss": 0.0608, "step": 6470 }, { "epoch": 1.5956607495069033, "grad_norm": 0.40234375, "learning_rate": 0.0001721560052976239, "loss": 0.0612, "step": 6472 }, { "epoch": 1.5961538461538463, "grad_norm": 0.359375, "learning_rate": 0.00017213216705926282, "loss": 0.056, "step": 6474 }, { "epoch": 1.596646942800789, "grad_norm": 0.359375, "learning_rate": 0.0001721083202728372, "loss": 0.0617, "step": 6476 }, { "epoch": 1.5971400394477318, "grad_norm": 0.328125, "learning_rate": 0.000172084464941173, "loss": 0.0576, "step": 6478 }, { "epoch": 1.5976331360946747, "grad_norm": 0.416015625, "learning_rate": 0.00017206060106709727, "loss": 0.0576, "step": 6480 }, { "epoch": 1.5981262327416172, "grad_norm": 0.34375, "learning_rate": 0.00017203672865343794, "loss": 0.0523, "step": 6482 }, { "epoch": 1.5986193293885602, "grad_norm": 0.400390625, "learning_rate": 0.0001720128477030241, "loss": 0.0558, "step": 6484 }, { "epoch": 1.599112426035503, "grad_norm": 0.3515625, "learning_rate": 0.00017198895821868572, "loss": 0.0568, "step": 6486 }, { "epoch": 1.5996055226824457, "grad_norm": 0.357421875, "learning_rate": 0.00017196506020325387, "loss": 0.0549, "step": 6488 }, { "epoch": 1.6000986193293887, "grad_norm": 0.5625, "learning_rate": 0.00017194115365956058, "loss": 0.0605, "step": 6490 }, { "epoch": 1.6005917159763312, "grad_norm": 0.359375, "learning_rate": 0.00017191723859043888, "loss": 0.0584, "step": 6492 }, { "epoch": 1.6010848126232742, "grad_norm": 0.38671875, "learning_rate": 0.0001718933149987229, "loss": 0.0587, "step": 6494 }, { "epoch": 1.601577909270217, "grad_norm": 0.57421875, "learning_rate": 0.0001718693828872477, "loss": 0.0585, "step": 6496 }, { "epoch": 1.6020710059171597, "grad_norm": 0.46484375, "learning_rate": 0.00017184544225884937, "loss": 0.0595, "step": 6498 }, { "epoch": 1.6025641025641026, "grad_norm": 0.373046875, "learning_rate": 0.000171821493116365, "loss": 0.0594, "step": 6500 }, { "epoch": 1.6030571992110454, "grad_norm": 0.39453125, "learning_rate": 0.00017179753546263266, "loss": 0.062, "step": 6502 }, { "epoch": 1.6035502958579881, "grad_norm": 0.40234375, "learning_rate": 0.00017177356930049152, "loss": 0.058, "step": 6504 }, { "epoch": 1.604043392504931, "grad_norm": 0.498046875, "learning_rate": 0.00017174959463278172, "loss": 0.0588, "step": 6506 }, { "epoch": 1.6045364891518736, "grad_norm": 0.34765625, "learning_rate": 0.0001717256114623443, "loss": 0.0556, "step": 6508 }, { "epoch": 1.6050295857988166, "grad_norm": 0.33984375, "learning_rate": 0.00017170161979202149, "loss": 0.0567, "step": 6510 }, { "epoch": 1.6055226824457594, "grad_norm": 0.435546875, "learning_rate": 0.00017167761962465637, "loss": 0.0584, "step": 6512 }, { "epoch": 1.606015779092702, "grad_norm": 0.40234375, "learning_rate": 0.00017165361096309318, "loss": 0.0622, "step": 6514 }, { "epoch": 1.606508875739645, "grad_norm": 0.380859375, "learning_rate": 0.000171629593810177, "loss": 0.0563, "step": 6516 }, { "epoch": 1.6070019723865878, "grad_norm": 0.41015625, "learning_rate": 0.000171605568168754, "loss": 0.0567, "step": 6518 }, { "epoch": 1.6074950690335306, "grad_norm": 0.369140625, "learning_rate": 0.00017158153404167142, "loss": 0.0576, "step": 6520 }, { "epoch": 1.6079881656804735, "grad_norm": 0.384765625, "learning_rate": 0.0001715574914317774, "loss": 0.0581, "step": 6522 }, { "epoch": 1.608481262327416, "grad_norm": 0.337890625, "learning_rate": 0.0001715334403419211, "loss": 0.0567, "step": 6524 }, { "epoch": 1.608974358974359, "grad_norm": 0.37109375, "learning_rate": 0.00017150938077495272, "loss": 0.054, "step": 6526 }, { "epoch": 1.6094674556213018, "grad_norm": 0.388671875, "learning_rate": 0.00017148531273372347, "loss": 0.0589, "step": 6528 }, { "epoch": 1.6099605522682445, "grad_norm": 0.35546875, "learning_rate": 0.00017146123622108553, "loss": 0.0595, "step": 6530 }, { "epoch": 1.6104536489151875, "grad_norm": 0.498046875, "learning_rate": 0.00017143715123989212, "loss": 0.0609, "step": 6532 }, { "epoch": 1.61094674556213, "grad_norm": 0.294921875, "learning_rate": 0.00017141305779299745, "loss": 0.0504, "step": 6534 }, { "epoch": 1.611439842209073, "grad_norm": 0.453125, "learning_rate": 0.00017138895588325668, "loss": 0.0569, "step": 6536 }, { "epoch": 1.6119329388560157, "grad_norm": 0.435546875, "learning_rate": 0.00017136484551352608, "loss": 0.0582, "step": 6538 }, { "epoch": 1.6124260355029585, "grad_norm": 0.380859375, "learning_rate": 0.0001713407266866628, "loss": 0.059, "step": 6540 }, { "epoch": 1.6129191321499015, "grad_norm": 0.37109375, "learning_rate": 0.0001713165994055251, "loss": 0.0561, "step": 6542 }, { "epoch": 1.6134122287968442, "grad_norm": 0.328125, "learning_rate": 0.00017129246367297222, "loss": 0.0555, "step": 6544 }, { "epoch": 1.613905325443787, "grad_norm": 0.3515625, "learning_rate": 0.00017126831949186433, "loss": 0.0537, "step": 6546 }, { "epoch": 1.61439842209073, "grad_norm": 0.51171875, "learning_rate": 0.00017124416686506268, "loss": 0.0614, "step": 6548 }, { "epoch": 1.6148915187376724, "grad_norm": 0.32421875, "learning_rate": 0.00017122000579542946, "loss": 0.0571, "step": 6550 }, { "epoch": 1.6153846153846154, "grad_norm": 0.32421875, "learning_rate": 0.0001711958362858279, "loss": 0.058, "step": 6552 }, { "epoch": 1.6158777120315582, "grad_norm": 0.3828125, "learning_rate": 0.0001711716583391223, "loss": 0.0537, "step": 6554 }, { "epoch": 1.616370808678501, "grad_norm": 0.416015625, "learning_rate": 0.00017114747195817777, "loss": 0.0609, "step": 6556 }, { "epoch": 1.6168639053254439, "grad_norm": 0.421875, "learning_rate": 0.0001711232771458606, "loss": 0.0544, "step": 6558 }, { "epoch": 1.6173570019723866, "grad_norm": 0.53515625, "learning_rate": 0.00017109907390503796, "loss": 0.055, "step": 6560 }, { "epoch": 1.6178500986193294, "grad_norm": 0.515625, "learning_rate": 0.00017107486223857807, "loss": 0.0579, "step": 6562 }, { "epoch": 1.6183431952662723, "grad_norm": 0.4453125, "learning_rate": 0.0001710506421493502, "loss": 0.0603, "step": 6564 }, { "epoch": 1.6188362919132149, "grad_norm": 0.390625, "learning_rate": 0.00017102641364022457, "loss": 0.0561, "step": 6566 }, { "epoch": 1.6193293885601578, "grad_norm": 0.35546875, "learning_rate": 0.00017100217671407232, "loss": 0.0515, "step": 6568 }, { "epoch": 1.6198224852071006, "grad_norm": 0.33984375, "learning_rate": 0.0001709779313737657, "loss": 0.0546, "step": 6570 }, { "epoch": 1.6203155818540433, "grad_norm": 0.322265625, "learning_rate": 0.00017095367762217796, "loss": 0.0559, "step": 6572 }, { "epoch": 1.6208086785009863, "grad_norm": 0.5, "learning_rate": 0.00017092941546218324, "loss": 0.0616, "step": 6574 }, { "epoch": 1.6213017751479288, "grad_norm": 0.69921875, "learning_rate": 0.0001709051448966568, "loss": 0.0543, "step": 6576 }, { "epoch": 1.6217948717948718, "grad_norm": 0.54296875, "learning_rate": 0.00017088086592847478, "loss": 0.0647, "step": 6578 }, { "epoch": 1.6222879684418146, "grad_norm": 0.373046875, "learning_rate": 0.00017085657856051438, "loss": 0.0577, "step": 6580 }, { "epoch": 1.6227810650887573, "grad_norm": 0.46875, "learning_rate": 0.00017083228279565383, "loss": 0.0564, "step": 6582 }, { "epoch": 1.6232741617357003, "grad_norm": 0.435546875, "learning_rate": 0.00017080797863677232, "loss": 0.0503, "step": 6584 }, { "epoch": 1.623767258382643, "grad_norm": 0.404296875, "learning_rate": 0.00017078366608674997, "loss": 0.0555, "step": 6586 }, { "epoch": 1.6242603550295858, "grad_norm": 0.51171875, "learning_rate": 0.00017075934514846798, "loss": 0.0587, "step": 6588 }, { "epoch": 1.6247534516765287, "grad_norm": 0.515625, "learning_rate": 0.00017073501582480854, "loss": 0.0533, "step": 6590 }, { "epoch": 1.6252465483234713, "grad_norm": 0.443359375, "learning_rate": 0.00017071067811865476, "loss": 0.0523, "step": 6592 }, { "epoch": 1.6257396449704142, "grad_norm": 0.384765625, "learning_rate": 0.00017068633203289085, "loss": 0.0589, "step": 6594 }, { "epoch": 1.626232741617357, "grad_norm": 0.458984375, "learning_rate": 0.00017066197757040192, "loss": 0.0569, "step": 6596 }, { "epoch": 1.6267258382642997, "grad_norm": 0.33984375, "learning_rate": 0.00017063761473407412, "loss": 0.0568, "step": 6598 }, { "epoch": 1.6272189349112427, "grad_norm": 0.390625, "learning_rate": 0.0001706132435267946, "loss": 0.0567, "step": 6600 }, { "epoch": 1.6277120315581854, "grad_norm": 0.318359375, "learning_rate": 0.00017058886395145145, "loss": 0.0544, "step": 6602 }, { "epoch": 1.6282051282051282, "grad_norm": 0.59765625, "learning_rate": 0.00017056447601093382, "loss": 0.0579, "step": 6604 }, { "epoch": 1.6286982248520712, "grad_norm": 0.3671875, "learning_rate": 0.00017054007970813178, "loss": 0.0624, "step": 6606 }, { "epoch": 1.6291913214990137, "grad_norm": 0.421875, "learning_rate": 0.00017051567504593648, "loss": 0.0625, "step": 6608 }, { "epoch": 1.6296844181459567, "grad_norm": 0.369140625, "learning_rate": 0.00017049126202723998, "loss": 0.06, "step": 6610 }, { "epoch": 1.6301775147928994, "grad_norm": 0.32421875, "learning_rate": 0.00017046684065493532, "loss": 0.0566, "step": 6612 }, { "epoch": 1.6306706114398422, "grad_norm": 0.361328125, "learning_rate": 0.00017044241093191662, "loss": 0.0532, "step": 6614 }, { "epoch": 1.6311637080867851, "grad_norm": 0.486328125, "learning_rate": 0.00017041797286107893, "loss": 0.055, "step": 6616 }, { "epoch": 1.6316568047337277, "grad_norm": 0.392578125, "learning_rate": 0.0001703935264453183, "loss": 0.061, "step": 6618 }, { "epoch": 1.6321499013806706, "grad_norm": 0.37109375, "learning_rate": 0.00017036907168753175, "loss": 0.0569, "step": 6620 }, { "epoch": 1.6326429980276134, "grad_norm": 0.392578125, "learning_rate": 0.0001703446085906173, "loss": 0.0579, "step": 6622 }, { "epoch": 1.6331360946745561, "grad_norm": 0.314453125, "learning_rate": 0.00017032013715747403, "loss": 0.0572, "step": 6624 }, { "epoch": 1.633629191321499, "grad_norm": 0.439453125, "learning_rate": 0.00017029565739100185, "loss": 0.0512, "step": 6626 }, { "epoch": 1.6341222879684418, "grad_norm": 0.439453125, "learning_rate": 0.0001702711692941018, "loss": 0.0567, "step": 6628 }, { "epoch": 1.6346153846153846, "grad_norm": 0.482421875, "learning_rate": 0.0001702466728696758, "loss": 0.0589, "step": 6630 }, { "epoch": 1.6351084812623276, "grad_norm": 0.60546875, "learning_rate": 0.00017022216812062691, "loss": 0.0563, "step": 6632 }, { "epoch": 1.63560157790927, "grad_norm": 0.27734375, "learning_rate": 0.000170197655049859, "loss": 0.0564, "step": 6634 }, { "epoch": 1.636094674556213, "grad_norm": 0.40234375, "learning_rate": 0.00017017313366027705, "loss": 0.0581, "step": 6636 }, { "epoch": 1.6365877712031558, "grad_norm": 0.359375, "learning_rate": 0.00017014860395478696, "loss": 0.0527, "step": 6638 }, { "epoch": 1.6370808678500985, "grad_norm": 0.384765625, "learning_rate": 0.00017012406593629563, "loss": 0.057, "step": 6640 }, { "epoch": 1.6375739644970415, "grad_norm": 0.419921875, "learning_rate": 0.000170099519607711, "loss": 0.0608, "step": 6642 }, { "epoch": 1.6380670611439843, "grad_norm": 0.443359375, "learning_rate": 0.00017007496497194188, "loss": 0.0567, "step": 6644 }, { "epoch": 1.638560157790927, "grad_norm": 0.48828125, "learning_rate": 0.00017005040203189818, "loss": 0.0594, "step": 6646 }, { "epoch": 1.63905325443787, "grad_norm": 0.6171875, "learning_rate": 0.00017002583079049068, "loss": 0.0597, "step": 6648 }, { "epoch": 1.6395463510848125, "grad_norm": 0.494140625, "learning_rate": 0.0001700012512506313, "loss": 0.0561, "step": 6650 }, { "epoch": 1.6400394477317555, "grad_norm": 0.455078125, "learning_rate": 0.0001699766634152328, "loss": 0.0518, "step": 6652 }, { "epoch": 1.6405325443786982, "grad_norm": 0.4140625, "learning_rate": 0.000169952067287209, "loss": 0.0565, "step": 6654 }, { "epoch": 1.641025641025641, "grad_norm": 0.62109375, "learning_rate": 0.00016992746286947466, "loss": 0.0504, "step": 6656 }, { "epoch": 1.641518737672584, "grad_norm": 0.390625, "learning_rate": 0.0001699028501649455, "loss": 0.057, "step": 6658 }, { "epoch": 1.6420118343195265, "grad_norm": 0.470703125, "learning_rate": 0.00016987822917653839, "loss": 0.0569, "step": 6660 }, { "epoch": 1.6425049309664694, "grad_norm": 0.3203125, "learning_rate": 0.00016985359990717092, "loss": 0.0557, "step": 6662 }, { "epoch": 1.6429980276134122, "grad_norm": 0.4375, "learning_rate": 0.00016982896235976185, "loss": 0.057, "step": 6664 }, { "epoch": 1.643491124260355, "grad_norm": 0.4609375, "learning_rate": 0.00016980431653723085, "loss": 0.0537, "step": 6666 }, { "epoch": 1.643984220907298, "grad_norm": 0.3984375, "learning_rate": 0.00016977966244249864, "loss": 0.0558, "step": 6668 }, { "epoch": 1.6444773175542406, "grad_norm": 0.53125, "learning_rate": 0.0001697550000784868, "loss": 0.057, "step": 6670 }, { "epoch": 1.6449704142011834, "grad_norm": 0.390625, "learning_rate": 0.00016973032944811803, "loss": 0.0546, "step": 6672 }, { "epoch": 1.6454635108481264, "grad_norm": 0.5859375, "learning_rate": 0.00016970565055431588, "loss": 0.0506, "step": 6674 }, { "epoch": 1.645956607495069, "grad_norm": 0.390625, "learning_rate": 0.00016968096340000492, "loss": 0.0521, "step": 6676 }, { "epoch": 1.6464497041420119, "grad_norm": 0.359375, "learning_rate": 0.00016965626798811075, "loss": 0.0547, "step": 6678 }, { "epoch": 1.6469428007889546, "grad_norm": 0.453125, "learning_rate": 0.00016963156432155994, "loss": 0.06, "step": 6680 }, { "epoch": 1.6474358974358974, "grad_norm": 0.361328125, "learning_rate": 0.00016960685240327997, "loss": 0.056, "step": 6682 }, { "epoch": 1.6479289940828403, "grad_norm": 0.34375, "learning_rate": 0.0001695821322361994, "loss": 0.0551, "step": 6684 }, { "epoch": 1.648422090729783, "grad_norm": 0.515625, "learning_rate": 0.00016955740382324763, "loss": 0.0568, "step": 6686 }, { "epoch": 1.6489151873767258, "grad_norm": 0.43359375, "learning_rate": 0.00016953266716735515, "loss": 0.0504, "step": 6688 }, { "epoch": 1.6494082840236688, "grad_norm": 0.5, "learning_rate": 0.00016950792227145338, "loss": 0.0577, "step": 6690 }, { "epoch": 1.6499013806706113, "grad_norm": 0.546875, "learning_rate": 0.00016948316913847474, "loss": 0.0598, "step": 6692 }, { "epoch": 1.6503944773175543, "grad_norm": 0.271484375, "learning_rate": 0.00016945840777135262, "loss": 0.0525, "step": 6694 }, { "epoch": 1.650887573964497, "grad_norm": 0.435546875, "learning_rate": 0.00016943363817302135, "loss": 0.0557, "step": 6696 }, { "epoch": 1.6513806706114398, "grad_norm": 0.357421875, "learning_rate": 0.00016940886034641633, "loss": 0.059, "step": 6698 }, { "epoch": 1.6518737672583828, "grad_norm": 0.38671875, "learning_rate": 0.0001693840742944738, "loss": 0.0583, "step": 6700 }, { "epoch": 1.6523668639053253, "grad_norm": 0.41796875, "learning_rate": 0.00016935928002013108, "loss": 0.0592, "step": 6702 }, { "epoch": 1.6528599605522682, "grad_norm": 0.33984375, "learning_rate": 0.00016933447752632644, "loss": 0.0632, "step": 6704 }, { "epoch": 1.653353057199211, "grad_norm": 0.32421875, "learning_rate": 0.00016930966681599907, "loss": 0.0572, "step": 6706 }, { "epoch": 1.6538461538461537, "grad_norm": 0.447265625, "learning_rate": 0.00016928484789208924, "loss": 0.0568, "step": 6708 }, { "epoch": 1.6543392504930967, "grad_norm": 0.30859375, "learning_rate": 0.00016926002075753807, "loss": 0.0542, "step": 6710 }, { "epoch": 1.6548323471400395, "grad_norm": 0.392578125, "learning_rate": 0.00016923518541528775, "loss": 0.0579, "step": 6712 }, { "epoch": 1.6553254437869822, "grad_norm": 0.458984375, "learning_rate": 0.0001692103418682814, "loss": 0.0575, "step": 6714 }, { "epoch": 1.6558185404339252, "grad_norm": 0.40625, "learning_rate": 0.0001691854901194631, "loss": 0.0572, "step": 6716 }, { "epoch": 1.6563116370808677, "grad_norm": 0.3671875, "learning_rate": 0.000169160630171778, "loss": 0.0566, "step": 6718 }, { "epoch": 1.6568047337278107, "grad_norm": 0.396484375, "learning_rate": 0.00016913576202817204, "loss": 0.0606, "step": 6720 }, { "epoch": 1.6572978303747534, "grad_norm": 0.388671875, "learning_rate": 0.00016911088569159226, "loss": 0.0552, "step": 6722 }, { "epoch": 1.6577909270216962, "grad_norm": 0.3359375, "learning_rate": 0.0001690860011649867, "loss": 0.0548, "step": 6724 }, { "epoch": 1.6582840236686391, "grad_norm": 0.345703125, "learning_rate": 0.0001690611084513042, "loss": 0.0539, "step": 6726 }, { "epoch": 1.6587771203155819, "grad_norm": 0.345703125, "learning_rate": 0.0001690362075534948, "loss": 0.0514, "step": 6728 }, { "epoch": 1.6592702169625246, "grad_norm": 0.384765625, "learning_rate": 0.00016901129847450938, "loss": 0.0579, "step": 6730 }, { "epoch": 1.6597633136094676, "grad_norm": 0.38671875, "learning_rate": 0.00016898638121729976, "loss": 0.0569, "step": 6732 }, { "epoch": 1.6602564102564101, "grad_norm": 0.44921875, "learning_rate": 0.00016896145578481882, "loss": 0.0536, "step": 6734 }, { "epoch": 1.660749506903353, "grad_norm": 0.419921875, "learning_rate": 0.00016893652218002032, "loss": 0.0592, "step": 6736 }, { "epoch": 1.6612426035502958, "grad_norm": 0.640625, "learning_rate": 0.00016891158040585903, "loss": 0.0562, "step": 6738 }, { "epoch": 1.6617357001972386, "grad_norm": 0.4765625, "learning_rate": 0.00016888663046529074, "loss": 0.0514, "step": 6740 }, { "epoch": 1.6622287968441816, "grad_norm": 0.6875, "learning_rate": 0.0001688616723612721, "loss": 0.0559, "step": 6742 }, { "epoch": 1.6627218934911243, "grad_norm": 0.64453125, "learning_rate": 0.00016883670609676082, "loss": 0.0544, "step": 6744 }, { "epoch": 1.663214990138067, "grad_norm": 0.466796875, "learning_rate": 0.00016881173167471555, "loss": 0.0584, "step": 6746 }, { "epoch": 1.6637080867850098, "grad_norm": 0.42578125, "learning_rate": 0.00016878674909809587, "loss": 0.0618, "step": 6748 }, { "epoch": 1.6642011834319526, "grad_norm": 0.46484375, "learning_rate": 0.00016876175836986237, "loss": 0.0605, "step": 6750 }, { "epoch": 1.6646942800788955, "grad_norm": 0.68359375, "learning_rate": 0.00016873675949297657, "loss": 0.0577, "step": 6752 }, { "epoch": 1.6651873767258383, "grad_norm": 0.515625, "learning_rate": 0.000168711752470401, "loss": 0.0615, "step": 6754 }, { "epoch": 1.665680473372781, "grad_norm": 0.37890625, "learning_rate": 0.00016868673730509914, "loss": 0.0589, "step": 6756 }, { "epoch": 1.666173570019724, "grad_norm": 0.76953125, "learning_rate": 0.0001686617140000354, "loss": 0.0697, "step": 6758 }, { "epoch": 1.6666666666666665, "grad_norm": 0.34375, "learning_rate": 0.00016863668255817518, "loss": 0.0532, "step": 6760 }, { "epoch": 1.6671597633136095, "grad_norm": 0.46484375, "learning_rate": 0.0001686116429824849, "loss": 0.0581, "step": 6762 }, { "epoch": 1.6676528599605522, "grad_norm": 0.396484375, "learning_rate": 0.0001685865952759318, "loss": 0.0521, "step": 6764 }, { "epoch": 1.668145956607495, "grad_norm": 0.396484375, "learning_rate": 0.00016856153944148425, "loss": 0.0596, "step": 6766 }, { "epoch": 1.668639053254438, "grad_norm": 0.455078125, "learning_rate": 0.00016853647548211148, "loss": 0.0508, "step": 6768 }, { "epoch": 1.6691321499013807, "grad_norm": 0.3359375, "learning_rate": 0.00016851140340078368, "loss": 0.0554, "step": 6770 }, { "epoch": 1.6696252465483234, "grad_norm": 0.349609375, "learning_rate": 0.00016848632320047208, "loss": 0.054, "step": 6772 }, { "epoch": 1.6701183431952664, "grad_norm": 0.369140625, "learning_rate": 0.00016846123488414876, "loss": 0.055, "step": 6774 }, { "epoch": 1.670611439842209, "grad_norm": 0.318359375, "learning_rate": 0.00016843613845478694, "loss": 0.052, "step": 6776 }, { "epoch": 1.671104536489152, "grad_norm": 0.59765625, "learning_rate": 0.00016841103391536058, "loss": 0.0652, "step": 6778 }, { "epoch": 1.6715976331360947, "grad_norm": 0.375, "learning_rate": 0.00016838592126884475, "loss": 0.0624, "step": 6780 }, { "epoch": 1.6720907297830374, "grad_norm": 0.40625, "learning_rate": 0.00016836080051821543, "loss": 0.0547, "step": 6782 }, { "epoch": 1.6725838264299804, "grad_norm": 0.419921875, "learning_rate": 0.00016833567166644957, "loss": 0.0597, "step": 6784 }, { "epoch": 1.6730769230769231, "grad_norm": 0.408203125, "learning_rate": 0.00016831053471652507, "loss": 0.0563, "step": 6786 }, { "epoch": 1.6735700197238659, "grad_norm": 0.33203125, "learning_rate": 0.00016828538967142081, "loss": 0.0538, "step": 6788 }, { "epoch": 1.6740631163708086, "grad_norm": 0.34765625, "learning_rate": 0.0001682602365341166, "loss": 0.0603, "step": 6790 }, { "epoch": 1.6745562130177514, "grad_norm": 0.384765625, "learning_rate": 0.00016823507530759328, "loss": 0.0601, "step": 6792 }, { "epoch": 1.6750493096646943, "grad_norm": 0.34375, "learning_rate": 0.00016820990599483252, "loss": 0.0589, "step": 6794 }, { "epoch": 1.675542406311637, "grad_norm": 0.37109375, "learning_rate": 0.0001681847285988171, "loss": 0.0535, "step": 6796 }, { "epoch": 1.6760355029585798, "grad_norm": 0.35546875, "learning_rate": 0.00016815954312253065, "loss": 0.0569, "step": 6798 }, { "epoch": 1.6765285996055228, "grad_norm": 0.416015625, "learning_rate": 0.00016813434956895775, "loss": 0.0498, "step": 6800 }, { "epoch": 1.6770216962524653, "grad_norm": 0.416015625, "learning_rate": 0.00016810914794108402, "loss": 0.0552, "step": 6802 }, { "epoch": 1.6775147928994083, "grad_norm": 0.419921875, "learning_rate": 0.00016808393824189604, "loss": 0.0575, "step": 6804 }, { "epoch": 1.678007889546351, "grad_norm": 0.404296875, "learning_rate": 0.00016805872047438117, "loss": 0.0586, "step": 6806 }, { "epoch": 1.6785009861932938, "grad_norm": 0.3515625, "learning_rate": 0.00016803349464152798, "loss": 0.0589, "step": 6808 }, { "epoch": 1.6789940828402368, "grad_norm": 0.5625, "learning_rate": 0.0001680082607463258, "loss": 0.06, "step": 6810 }, { "epoch": 1.6794871794871795, "grad_norm": 0.443359375, "learning_rate": 0.00016798301879176504, "loss": 0.0547, "step": 6812 }, { "epoch": 1.6799802761341223, "grad_norm": 0.51171875, "learning_rate": 0.00016795776878083694, "loss": 0.0594, "step": 6814 }, { "epoch": 1.6804733727810652, "grad_norm": 0.451171875, "learning_rate": 0.00016793251071653387, "loss": 0.0655, "step": 6816 }, { "epoch": 1.6809664694280078, "grad_norm": 0.30859375, "learning_rate": 0.000167907244601849, "loss": 0.0533, "step": 6818 }, { "epoch": 1.6814595660749507, "grad_norm": 0.443359375, "learning_rate": 0.00016788197043977646, "loss": 0.0599, "step": 6820 }, { "epoch": 1.6819526627218935, "grad_norm": 0.341796875, "learning_rate": 0.00016785668823331138, "loss": 0.0547, "step": 6822 }, { "epoch": 1.6824457593688362, "grad_norm": 0.4140625, "learning_rate": 0.00016783139798544995, "loss": 0.0564, "step": 6824 }, { "epoch": 1.6829388560157792, "grad_norm": 0.61328125, "learning_rate": 0.0001678060996991891, "loss": 0.0544, "step": 6826 }, { "epoch": 1.683431952662722, "grad_norm": 0.34375, "learning_rate": 0.00016778079337752687, "loss": 0.053, "step": 6828 }, { "epoch": 1.6839250493096647, "grad_norm": 0.47265625, "learning_rate": 0.00016775547902346215, "loss": 0.0501, "step": 6830 }, { "epoch": 1.6844181459566077, "grad_norm": 0.59375, "learning_rate": 0.00016773015663999492, "loss": 0.0579, "step": 6832 }, { "epoch": 1.6849112426035502, "grad_norm": 0.453125, "learning_rate": 0.00016770482623012595, "loss": 0.0516, "step": 6834 }, { "epoch": 1.6854043392504932, "grad_norm": 0.42578125, "learning_rate": 0.00016767948779685706, "loss": 0.0544, "step": 6836 }, { "epoch": 1.685897435897436, "grad_norm": 0.30078125, "learning_rate": 0.000167654141343191, "loss": 0.0548, "step": 6838 }, { "epoch": 1.6863905325443787, "grad_norm": 0.408203125, "learning_rate": 0.00016762878687213143, "loss": 0.0489, "step": 6840 }, { "epoch": 1.6868836291913216, "grad_norm": 0.38671875, "learning_rate": 0.00016760342438668305, "loss": 0.0544, "step": 6842 }, { "epoch": 1.6873767258382641, "grad_norm": 0.365234375, "learning_rate": 0.00016757805388985142, "loss": 0.0553, "step": 6844 }, { "epoch": 1.6878698224852071, "grad_norm": 0.330078125, "learning_rate": 0.00016755267538464308, "loss": 0.0546, "step": 6846 }, { "epoch": 1.6883629191321499, "grad_norm": 0.369140625, "learning_rate": 0.0001675272888740656, "loss": 0.0537, "step": 6848 }, { "epoch": 1.6888560157790926, "grad_norm": 0.578125, "learning_rate": 0.00016750189436112728, "loss": 0.0555, "step": 6850 }, { "epoch": 1.6893491124260356, "grad_norm": 0.38671875, "learning_rate": 0.00016747649184883768, "loss": 0.0519, "step": 6852 }, { "epoch": 1.6898422090729783, "grad_norm": 0.40625, "learning_rate": 0.000167451081340207, "loss": 0.0571, "step": 6854 }, { "epoch": 1.690335305719921, "grad_norm": 0.3125, "learning_rate": 0.0001674256628382466, "loss": 0.0471, "step": 6856 }, { "epoch": 1.690828402366864, "grad_norm": 0.3984375, "learning_rate": 0.00016740023634596868, "loss": 0.0516, "step": 6858 }, { "epoch": 1.6913214990138066, "grad_norm": 0.376953125, "learning_rate": 0.00016737480186638644, "loss": 0.0527, "step": 6860 }, { "epoch": 1.6918145956607495, "grad_norm": 0.400390625, "learning_rate": 0.00016734935940251403, "loss": 0.0538, "step": 6862 }, { "epoch": 1.6923076923076923, "grad_norm": 0.45703125, "learning_rate": 0.0001673239089573665, "loss": 0.0538, "step": 6864 }, { "epoch": 1.692800788954635, "grad_norm": 0.3359375, "learning_rate": 0.00016729845053395988, "loss": 0.0513, "step": 6866 }, { "epoch": 1.693293885601578, "grad_norm": 0.42578125, "learning_rate": 0.0001672729841353111, "loss": 0.0619, "step": 6868 }, { "epoch": 1.6937869822485208, "grad_norm": 0.376953125, "learning_rate": 0.00016724750976443808, "loss": 0.057, "step": 6870 }, { "epoch": 1.6942800788954635, "grad_norm": 0.5078125, "learning_rate": 0.0001672220274243597, "loss": 0.055, "step": 6872 }, { "epoch": 1.6947731755424065, "grad_norm": 0.5, "learning_rate": 0.00016719653711809579, "loss": 0.0609, "step": 6874 }, { "epoch": 1.695266272189349, "grad_norm": 0.458984375, "learning_rate": 0.00016717103884866702, "loss": 0.0559, "step": 6876 }, { "epoch": 1.695759368836292, "grad_norm": 0.46875, "learning_rate": 0.00016714553261909512, "loss": 0.0577, "step": 6878 }, { "epoch": 1.6962524654832347, "grad_norm": 0.55859375, "learning_rate": 0.0001671200184324027, "loss": 0.0564, "step": 6880 }, { "epoch": 1.6967455621301775, "grad_norm": 0.50390625, "learning_rate": 0.00016709449629161337, "loss": 0.0595, "step": 6882 }, { "epoch": 1.6972386587771204, "grad_norm": 0.66796875, "learning_rate": 0.0001670689661997516, "loss": 0.0545, "step": 6884 }, { "epoch": 1.697731755424063, "grad_norm": 0.494140625, "learning_rate": 0.0001670434281598429, "loss": 0.0531, "step": 6886 }, { "epoch": 1.698224852071006, "grad_norm": 0.60546875, "learning_rate": 0.0001670178821749136, "loss": 0.0558, "step": 6888 }, { "epoch": 1.6987179487179487, "grad_norm": 0.4140625, "learning_rate": 0.0001669923282479911, "loss": 0.0558, "step": 6890 }, { "epoch": 1.6992110453648914, "grad_norm": 0.4375, "learning_rate": 0.00016696676638210364, "loss": 0.0533, "step": 6892 }, { "epoch": 1.6997041420118344, "grad_norm": 0.6875, "learning_rate": 0.00016694119658028052, "loss": 0.0556, "step": 6894 }, { "epoch": 1.7001972386587771, "grad_norm": 0.39453125, "learning_rate": 0.0001669156188455518, "loss": 0.056, "step": 6896 }, { "epoch": 1.70069033530572, "grad_norm": 0.486328125, "learning_rate": 0.00016689003318094868, "loss": 0.0603, "step": 6898 }, { "epoch": 1.7011834319526629, "grad_norm": 0.46875, "learning_rate": 0.00016686443958950317, "loss": 0.0579, "step": 6900 }, { "epoch": 1.7016765285996054, "grad_norm": 0.349609375, "learning_rate": 0.0001668388380742482, "loss": 0.0544, "step": 6902 }, { "epoch": 1.7021696252465484, "grad_norm": 0.482421875, "learning_rate": 0.00016681322863821776, "loss": 0.0546, "step": 6904 }, { "epoch": 1.702662721893491, "grad_norm": 0.58984375, "learning_rate": 0.0001667876112844467, "loss": 0.0569, "step": 6906 }, { "epoch": 1.7031558185404339, "grad_norm": 0.546875, "learning_rate": 0.0001667619860159708, "loss": 0.0553, "step": 6908 }, { "epoch": 1.7036489151873768, "grad_norm": 0.416015625, "learning_rate": 0.00016673635283582678, "loss": 0.0537, "step": 6910 }, { "epoch": 1.7041420118343196, "grad_norm": 0.546875, "learning_rate": 0.00016671071174705236, "loss": 0.0592, "step": 6912 }, { "epoch": 1.7046351084812623, "grad_norm": 0.435546875, "learning_rate": 0.00016668506275268618, "loss": 0.0549, "step": 6914 }, { "epoch": 1.7051282051282053, "grad_norm": 0.45703125, "learning_rate": 0.00016665940585576769, "loss": 0.0525, "step": 6916 }, { "epoch": 1.7056213017751478, "grad_norm": 0.35546875, "learning_rate": 0.00016663374105933745, "loss": 0.0507, "step": 6918 }, { "epoch": 1.7061143984220908, "grad_norm": 0.435546875, "learning_rate": 0.00016660806836643683, "loss": 0.0513, "step": 6920 }, { "epoch": 1.7066074950690335, "grad_norm": 0.4140625, "learning_rate": 0.0001665823877801082, "loss": 0.0538, "step": 6922 }, { "epoch": 1.7071005917159763, "grad_norm": 0.29296875, "learning_rate": 0.00016655669930339492, "loss": 0.0535, "step": 6924 }, { "epoch": 1.7075936883629192, "grad_norm": 0.416015625, "learning_rate": 0.00016653100293934116, "loss": 0.0513, "step": 6926 }, { "epoch": 1.7080867850098618, "grad_norm": 0.34765625, "learning_rate": 0.00016650529869099208, "loss": 0.0503, "step": 6928 }, { "epoch": 1.7085798816568047, "grad_norm": 0.404296875, "learning_rate": 0.00016647958656139378, "loss": 0.0562, "step": 6930 }, { "epoch": 1.7090729783037475, "grad_norm": 0.78515625, "learning_rate": 0.0001664538665535933, "loss": 0.0628, "step": 6932 }, { "epoch": 1.7095660749506902, "grad_norm": 0.33984375, "learning_rate": 0.00016642813867063865, "loss": 0.0554, "step": 6934 }, { "epoch": 1.7100591715976332, "grad_norm": 0.51953125, "learning_rate": 0.00016640240291557862, "loss": 0.0556, "step": 6936 }, { "epoch": 1.710552268244576, "grad_norm": 0.380859375, "learning_rate": 0.00016637665929146312, "loss": 0.0571, "step": 6938 }, { "epoch": 1.7110453648915187, "grad_norm": 0.384765625, "learning_rate": 0.0001663509078013429, "loss": 0.0618, "step": 6940 }, { "epoch": 1.7115384615384617, "grad_norm": 0.3359375, "learning_rate": 0.00016632514844826962, "loss": 0.0539, "step": 6942 }, { "epoch": 1.7120315581854042, "grad_norm": 0.408203125, "learning_rate": 0.00016629938123529598, "loss": 0.0535, "step": 6944 }, { "epoch": 1.7125246548323472, "grad_norm": 0.33203125, "learning_rate": 0.00016627360616547547, "loss": 0.0517, "step": 6946 }, { "epoch": 1.71301775147929, "grad_norm": 0.37109375, "learning_rate": 0.00016624782324186263, "loss": 0.0588, "step": 6948 }, { "epoch": 1.7135108481262327, "grad_norm": 0.408203125, "learning_rate": 0.00016622203246751283, "loss": 0.0588, "step": 6950 }, { "epoch": 1.7140039447731756, "grad_norm": 0.345703125, "learning_rate": 0.00016619623384548244, "loss": 0.0593, "step": 6952 }, { "epoch": 1.7144970414201184, "grad_norm": 0.466796875, "learning_rate": 0.00016617042737882874, "loss": 0.058, "step": 6954 }, { "epoch": 1.7149901380670611, "grad_norm": 0.48828125, "learning_rate": 0.00016614461307061, "loss": 0.0575, "step": 6956 }, { "epoch": 1.715483234714004, "grad_norm": 0.369140625, "learning_rate": 0.00016611879092388526, "loss": 0.0562, "step": 6958 }, { "epoch": 1.7159763313609466, "grad_norm": 0.40625, "learning_rate": 0.00016609296094171464, "loss": 0.0558, "step": 6960 }, { "epoch": 1.7164694280078896, "grad_norm": 0.341796875, "learning_rate": 0.00016606712312715915, "loss": 0.0545, "step": 6962 }, { "epoch": 1.7169625246548323, "grad_norm": 0.30859375, "learning_rate": 0.0001660412774832807, "loss": 0.0462, "step": 6964 }, { "epoch": 1.717455621301775, "grad_norm": 0.400390625, "learning_rate": 0.0001660154240131421, "loss": 0.0561, "step": 6966 }, { "epoch": 1.717948717948718, "grad_norm": 0.369140625, "learning_rate": 0.00016598956271980723, "loss": 0.0575, "step": 6968 }, { "epoch": 1.7184418145956606, "grad_norm": 0.3671875, "learning_rate": 0.0001659636936063407, "loss": 0.0578, "step": 6970 }, { "epoch": 1.7189349112426036, "grad_norm": 0.37109375, "learning_rate": 0.00016593781667580818, "loss": 0.0561, "step": 6972 }, { "epoch": 1.7194280078895463, "grad_norm": 0.390625, "learning_rate": 0.00016591193193127627, "loss": 0.0547, "step": 6974 }, { "epoch": 1.719921104536489, "grad_norm": 0.337890625, "learning_rate": 0.00016588603937581236, "loss": 0.0564, "step": 6976 }, { "epoch": 1.720414201183432, "grad_norm": 0.376953125, "learning_rate": 0.000165860139012485, "loss": 0.0626, "step": 6978 }, { "epoch": 1.7209072978303748, "grad_norm": 0.486328125, "learning_rate": 0.0001658342308443634, "loss": 0.0544, "step": 6980 }, { "epoch": 1.7214003944773175, "grad_norm": 0.400390625, "learning_rate": 0.00016580831487451788, "loss": 0.0554, "step": 6982 }, { "epoch": 1.7218934911242605, "grad_norm": 0.376953125, "learning_rate": 0.00016578239110601965, "loss": 0.0556, "step": 6984 }, { "epoch": 1.722386587771203, "grad_norm": 0.353515625, "learning_rate": 0.00016575645954194077, "loss": 0.055, "step": 6986 }, { "epoch": 1.722879684418146, "grad_norm": 0.302734375, "learning_rate": 0.0001657305201853543, "loss": 0.0509, "step": 6988 }, { "epoch": 1.7233727810650887, "grad_norm": 0.392578125, "learning_rate": 0.0001657045730393342, "loss": 0.0568, "step": 6990 }, { "epoch": 1.7238658777120315, "grad_norm": 0.408203125, "learning_rate": 0.00016567861810695532, "loss": 0.0529, "step": 6992 }, { "epoch": 1.7243589743589745, "grad_norm": 0.322265625, "learning_rate": 0.0001656526553912935, "loss": 0.0517, "step": 6994 }, { "epoch": 1.7248520710059172, "grad_norm": 0.369140625, "learning_rate": 0.0001656266848954255, "loss": 0.0584, "step": 6996 }, { "epoch": 1.72534516765286, "grad_norm": 0.3671875, "learning_rate": 0.0001656007066224289, "loss": 0.0579, "step": 6998 }, { "epoch": 1.725838264299803, "grad_norm": 0.3671875, "learning_rate": 0.00016557472057538228, "loss": 0.06, "step": 7000 }, { "epoch": 1.7263313609467454, "grad_norm": 0.427734375, "learning_rate": 0.00016554872675736518, "loss": 0.0564, "step": 7002 }, { "epoch": 1.7268244575936884, "grad_norm": 0.404296875, "learning_rate": 0.00016552272517145794, "loss": 0.0555, "step": 7004 }, { "epoch": 1.7273175542406312, "grad_norm": 0.326171875, "learning_rate": 0.00016549671582074198, "loss": 0.0521, "step": 7006 }, { "epoch": 1.727810650887574, "grad_norm": 0.4296875, "learning_rate": 0.0001654706987082995, "loss": 0.0533, "step": 7008 }, { "epoch": 1.7283037475345169, "grad_norm": 0.58203125, "learning_rate": 0.00016544467383721367, "loss": 0.0552, "step": 7010 }, { "epoch": 1.7287968441814594, "grad_norm": 0.34765625, "learning_rate": 0.00016541864121056858, "loss": 0.0572, "step": 7012 }, { "epoch": 1.7292899408284024, "grad_norm": 0.40625, "learning_rate": 0.0001653926008314493, "loss": 0.0551, "step": 7014 }, { "epoch": 1.7297830374753451, "grad_norm": 0.48046875, "learning_rate": 0.00016536655270294167, "loss": 0.0517, "step": 7016 }, { "epoch": 1.7302761341222879, "grad_norm": 0.5234375, "learning_rate": 0.00016534049682813263, "loss": 0.0548, "step": 7018 }, { "epoch": 1.7307692307692308, "grad_norm": 0.431640625, "learning_rate": 0.00016531443321010983, "loss": 0.0537, "step": 7020 }, { "epoch": 1.7312623274161736, "grad_norm": 0.384765625, "learning_rate": 0.0001652883618519621, "loss": 0.0533, "step": 7022 }, { "epoch": 1.7317554240631163, "grad_norm": 0.51171875, "learning_rate": 0.00016526228275677893, "loss": 0.0532, "step": 7024 }, { "epoch": 1.7322485207100593, "grad_norm": 0.33984375, "learning_rate": 0.0001652361959276509, "loss": 0.0541, "step": 7026 }, { "epoch": 1.7327416173570018, "grad_norm": 0.36328125, "learning_rate": 0.00016521010136766938, "loss": 0.0571, "step": 7028 }, { "epoch": 1.7332347140039448, "grad_norm": 0.263671875, "learning_rate": 0.0001651839990799268, "loss": 0.0502, "step": 7030 }, { "epoch": 1.7337278106508875, "grad_norm": 0.390625, "learning_rate": 0.00016515788906751636, "loss": 0.057, "step": 7032 }, { "epoch": 1.7342209072978303, "grad_norm": 0.3515625, "learning_rate": 0.0001651317713335323, "loss": 0.0547, "step": 7034 }, { "epoch": 1.7347140039447733, "grad_norm": 0.357421875, "learning_rate": 0.00016510564588106968, "loss": 0.0553, "step": 7036 }, { "epoch": 1.735207100591716, "grad_norm": 0.404296875, "learning_rate": 0.0001650795127132245, "loss": 0.0588, "step": 7038 }, { "epoch": 1.7357001972386588, "grad_norm": 0.359375, "learning_rate": 0.00016505337183309371, "loss": 0.0541, "step": 7040 }, { "epoch": 1.7361932938856017, "grad_norm": 0.3671875, "learning_rate": 0.00016502722324377515, "loss": 0.0533, "step": 7042 }, { "epoch": 1.7366863905325443, "grad_norm": 0.453125, "learning_rate": 0.00016500106694836757, "loss": 0.0602, "step": 7044 }, { "epoch": 1.7371794871794872, "grad_norm": 0.34375, "learning_rate": 0.00016497490294997068, "loss": 0.0488, "step": 7046 }, { "epoch": 1.73767258382643, "grad_norm": 0.30078125, "learning_rate": 0.00016494873125168496, "loss": 0.0505, "step": 7048 }, { "epoch": 1.7381656804733727, "grad_norm": 0.3984375, "learning_rate": 0.00016492255185661194, "loss": 0.0546, "step": 7050 }, { "epoch": 1.7386587771203157, "grad_norm": 0.330078125, "learning_rate": 0.0001648963647678541, "loss": 0.0475, "step": 7052 }, { "epoch": 1.7391518737672582, "grad_norm": 0.455078125, "learning_rate": 0.00016487016998851468, "loss": 0.0556, "step": 7054 }, { "epoch": 1.7396449704142012, "grad_norm": 0.46484375, "learning_rate": 0.0001648439675216979, "loss": 0.058, "step": 7056 }, { "epoch": 1.740138067061144, "grad_norm": 0.33203125, "learning_rate": 0.00016481775737050896, "loss": 0.0554, "step": 7058 }, { "epoch": 1.7406311637080867, "grad_norm": 0.51953125, "learning_rate": 0.0001647915395380539, "loss": 0.0521, "step": 7060 }, { "epoch": 1.7411242603550297, "grad_norm": 0.373046875, "learning_rate": 0.0001647653140274396, "loss": 0.058, "step": 7062 }, { "epoch": 1.7416173570019724, "grad_norm": 0.421875, "learning_rate": 0.000164739080841774, "loss": 0.0581, "step": 7064 }, { "epoch": 1.7421104536489151, "grad_norm": 0.4765625, "learning_rate": 0.0001647128399841659, "loss": 0.0615, "step": 7066 }, { "epoch": 1.7426035502958581, "grad_norm": 0.46875, "learning_rate": 0.0001646865914577249, "loss": 0.0515, "step": 7068 }, { "epoch": 1.7430966469428006, "grad_norm": 0.365234375, "learning_rate": 0.00016466033526556172, "loss": 0.0515, "step": 7070 }, { "epoch": 1.7435897435897436, "grad_norm": 0.3359375, "learning_rate": 0.00016463407141078772, "loss": 0.0568, "step": 7072 }, { "epoch": 1.7440828402366864, "grad_norm": 0.37890625, "learning_rate": 0.00016460779989651542, "loss": 0.0564, "step": 7074 }, { "epoch": 1.744575936883629, "grad_norm": 0.318359375, "learning_rate": 0.0001645815207258581, "loss": 0.0555, "step": 7076 }, { "epoch": 1.745069033530572, "grad_norm": 0.390625, "learning_rate": 0.00016455523390192998, "loss": 0.0482, "step": 7078 }, { "epoch": 1.7455621301775148, "grad_norm": 0.333984375, "learning_rate": 0.00016452893942784627, "loss": 0.0513, "step": 7080 }, { "epoch": 1.7460552268244576, "grad_norm": 0.373046875, "learning_rate": 0.00016450263730672288, "loss": 0.058, "step": 7082 }, { "epoch": 1.7465483234714005, "grad_norm": 0.3125, "learning_rate": 0.00016447632754167688, "loss": 0.047, "step": 7084 }, { "epoch": 1.747041420118343, "grad_norm": 0.365234375, "learning_rate": 0.00016445001013582608, "loss": 0.0529, "step": 7086 }, { "epoch": 1.747534516765286, "grad_norm": 0.48046875, "learning_rate": 0.0001644236850922892, "loss": 0.0535, "step": 7088 }, { "epoch": 1.7480276134122288, "grad_norm": 0.333984375, "learning_rate": 0.00016439735241418596, "loss": 0.056, "step": 7090 }, { "epoch": 1.7485207100591715, "grad_norm": 0.43359375, "learning_rate": 0.00016437101210463692, "loss": 0.057, "step": 7092 }, { "epoch": 1.7490138067061145, "grad_norm": 0.349609375, "learning_rate": 0.00016434466416676355, "loss": 0.0511, "step": 7094 }, { "epoch": 1.7495069033530573, "grad_norm": 0.3359375, "learning_rate": 0.00016431830860368823, "loss": 0.0578, "step": 7096 }, { "epoch": 1.75, "grad_norm": 0.337890625, "learning_rate": 0.00016429194541853418, "loss": 0.0565, "step": 7098 }, { "epoch": 1.7504930966469427, "grad_norm": 0.37890625, "learning_rate": 0.00016426557461442568, "loss": 0.0544, "step": 7100 }, { "epoch": 1.7509861932938855, "grad_norm": 0.333984375, "learning_rate": 0.00016423919619448776, "loss": 0.0572, "step": 7102 }, { "epoch": 1.7514792899408285, "grad_norm": 0.427734375, "learning_rate": 0.00016421281016184642, "loss": 0.0542, "step": 7104 }, { "epoch": 1.7519723865877712, "grad_norm": 0.447265625, "learning_rate": 0.00016418641651962855, "loss": 0.0589, "step": 7106 }, { "epoch": 1.752465483234714, "grad_norm": 0.37109375, "learning_rate": 0.00016416001527096199, "loss": 0.0546, "step": 7108 }, { "epoch": 1.752958579881657, "grad_norm": 0.330078125, "learning_rate": 0.00016413360641897537, "loss": 0.0586, "step": 7110 }, { "epoch": 1.7534516765285995, "grad_norm": 0.376953125, "learning_rate": 0.00016410718996679832, "loss": 0.0498, "step": 7112 }, { "epoch": 1.7539447731755424, "grad_norm": 0.296875, "learning_rate": 0.00016408076591756135, "loss": 0.0533, "step": 7114 }, { "epoch": 1.7544378698224852, "grad_norm": 0.490234375, "learning_rate": 0.0001640543342743958, "loss": 0.0527, "step": 7116 }, { "epoch": 1.754930966469428, "grad_norm": 0.498046875, "learning_rate": 0.00016402789504043401, "loss": 0.0548, "step": 7118 }, { "epoch": 1.755424063116371, "grad_norm": 0.427734375, "learning_rate": 0.0001640014482188092, "loss": 0.0571, "step": 7120 }, { "epoch": 1.7559171597633136, "grad_norm": 0.478515625, "learning_rate": 0.0001639749938126554, "loss": 0.0566, "step": 7122 }, { "epoch": 1.7564102564102564, "grad_norm": 0.376953125, "learning_rate": 0.00016394853182510768, "loss": 0.0557, "step": 7124 }, { "epoch": 1.7569033530571994, "grad_norm": 0.5, "learning_rate": 0.00016392206225930187, "loss": 0.0492, "step": 7126 }, { "epoch": 1.7573964497041419, "grad_norm": 0.5, "learning_rate": 0.0001638955851183748, "loss": 0.0568, "step": 7128 }, { "epoch": 1.7578895463510849, "grad_norm": 0.392578125, "learning_rate": 0.00016386910040546416, "loss": 0.0561, "step": 7130 }, { "epoch": 1.7583826429980276, "grad_norm": 0.5625, "learning_rate": 0.0001638426081237085, "loss": 0.0551, "step": 7132 }, { "epoch": 1.7588757396449703, "grad_norm": 0.322265625, "learning_rate": 0.00016381610827624732, "loss": 0.0483, "step": 7134 }, { "epoch": 1.7593688362919133, "grad_norm": 0.451171875, "learning_rate": 0.00016378960086622102, "loss": 0.0503, "step": 7136 }, { "epoch": 1.759861932938856, "grad_norm": 0.392578125, "learning_rate": 0.00016376308589677083, "loss": 0.0555, "step": 7138 }, { "epoch": 1.7603550295857988, "grad_norm": 0.40234375, "learning_rate": 0.000163736563371039, "loss": 0.0525, "step": 7140 }, { "epoch": 1.7608481262327418, "grad_norm": 0.396484375, "learning_rate": 0.00016371003329216852, "loss": 0.0542, "step": 7142 }, { "epoch": 1.7613412228796843, "grad_norm": 0.625, "learning_rate": 0.00016368349566330342, "loss": 0.0533, "step": 7144 }, { "epoch": 1.7618343195266273, "grad_norm": 0.384765625, "learning_rate": 0.0001636569504875885, "loss": 0.0507, "step": 7146 }, { "epoch": 1.76232741617357, "grad_norm": 0.30078125, "learning_rate": 0.0001636303977681695, "loss": 0.0549, "step": 7148 }, { "epoch": 1.7628205128205128, "grad_norm": 0.44140625, "learning_rate": 0.00016360383750819315, "loss": 0.0547, "step": 7150 }, { "epoch": 1.7633136094674557, "grad_norm": 0.4296875, "learning_rate": 0.0001635772697108069, "loss": 0.0557, "step": 7152 }, { "epoch": 1.7638067061143983, "grad_norm": 0.416015625, "learning_rate": 0.00016355069437915925, "loss": 0.0516, "step": 7154 }, { "epoch": 1.7642998027613412, "grad_norm": 0.6171875, "learning_rate": 0.00016352411151639946, "loss": 0.0568, "step": 7156 }, { "epoch": 1.764792899408284, "grad_norm": 0.78515625, "learning_rate": 0.0001634975211256778, "loss": 0.0564, "step": 7158 }, { "epoch": 1.7652859960552267, "grad_norm": 0.62109375, "learning_rate": 0.00016347092321014535, "loss": 0.0489, "step": 7160 }, { "epoch": 1.7657790927021697, "grad_norm": 0.3125, "learning_rate": 0.00016344431777295414, "loss": 0.0561, "step": 7162 }, { "epoch": 1.7662721893491125, "grad_norm": 0.55859375, "learning_rate": 0.00016341770481725706, "loss": 0.0563, "step": 7164 }, { "epoch": 1.7667652859960552, "grad_norm": 0.435546875, "learning_rate": 0.00016339108434620787, "loss": 0.0479, "step": 7166 }, { "epoch": 1.7672583826429982, "grad_norm": 0.408203125, "learning_rate": 0.00016336445636296122, "loss": 0.0551, "step": 7168 }, { "epoch": 1.7677514792899407, "grad_norm": 0.5859375, "learning_rate": 0.00016333782087067276, "loss": 0.0557, "step": 7170 }, { "epoch": 1.7682445759368837, "grad_norm": 0.423828125, "learning_rate": 0.00016331117787249884, "loss": 0.0517, "step": 7172 }, { "epoch": 1.7687376725838264, "grad_norm": 0.33203125, "learning_rate": 0.00016328452737159692, "loss": 0.0533, "step": 7174 }, { "epoch": 1.7692307692307692, "grad_norm": 0.48828125, "learning_rate": 0.00016325786937112512, "loss": 0.0467, "step": 7176 }, { "epoch": 1.7697238658777121, "grad_norm": 0.4375, "learning_rate": 0.00016323120387424264, "loss": 0.0584, "step": 7178 }, { "epoch": 1.7702169625246549, "grad_norm": 0.51953125, "learning_rate": 0.00016320453088410942, "loss": 0.0522, "step": 7180 }, { "epoch": 1.7707100591715976, "grad_norm": 0.388671875, "learning_rate": 0.00016317785040388647, "loss": 0.0517, "step": 7182 }, { "epoch": 1.7712031558185406, "grad_norm": 0.314453125, "learning_rate": 0.00016315116243673545, "loss": 0.0534, "step": 7184 }, { "epoch": 1.7716962524654831, "grad_norm": 0.61328125, "learning_rate": 0.00016312446698581913, "loss": 0.0554, "step": 7186 }, { "epoch": 1.772189349112426, "grad_norm": 0.345703125, "learning_rate": 0.000163097764054301, "loss": 0.0564, "step": 7188 }, { "epoch": 1.7726824457593688, "grad_norm": 0.625, "learning_rate": 0.00016307105364534556, "loss": 0.0532, "step": 7190 }, { "epoch": 1.7731755424063116, "grad_norm": 0.310546875, "learning_rate": 0.00016304433576211808, "loss": 0.054, "step": 7192 }, { "epoch": 1.7736686390532546, "grad_norm": 0.44140625, "learning_rate": 0.00016301761040778487, "loss": 0.0527, "step": 7194 }, { "epoch": 1.774161735700197, "grad_norm": 0.423828125, "learning_rate": 0.00016299087758551298, "loss": 0.0631, "step": 7196 }, { "epoch": 1.77465483234714, "grad_norm": 0.380859375, "learning_rate": 0.00016296413729847034, "loss": 0.0598, "step": 7198 }, { "epoch": 1.7751479289940828, "grad_norm": 0.32421875, "learning_rate": 0.00016293738954982592, "loss": 0.0553, "step": 7200 }, { "epoch": 1.7756410256410255, "grad_norm": 0.4453125, "learning_rate": 0.00016291063434274947, "loss": 0.0581, "step": 7202 }, { "epoch": 1.7761341222879685, "grad_norm": 0.515625, "learning_rate": 0.00016288387168041156, "loss": 0.0534, "step": 7204 }, { "epoch": 1.7766272189349113, "grad_norm": 0.427734375, "learning_rate": 0.00016285710156598376, "loss": 0.0544, "step": 7206 }, { "epoch": 1.777120315581854, "grad_norm": 0.61328125, "learning_rate": 0.00016283032400263848, "loss": 0.0538, "step": 7208 }, { "epoch": 1.777613412228797, "grad_norm": 0.447265625, "learning_rate": 0.00016280353899354903, "loss": 0.0551, "step": 7210 }, { "epoch": 1.7781065088757395, "grad_norm": 0.5390625, "learning_rate": 0.00016277674654188953, "loss": 0.0563, "step": 7212 }, { "epoch": 1.7785996055226825, "grad_norm": 0.376953125, "learning_rate": 0.0001627499466508351, "loss": 0.0556, "step": 7214 }, { "epoch": 1.7790927021696252, "grad_norm": 0.3515625, "learning_rate": 0.00016272313932356162, "loss": 0.0529, "step": 7216 }, { "epoch": 1.779585798816568, "grad_norm": 0.365234375, "learning_rate": 0.00016269632456324592, "loss": 0.0562, "step": 7218 }, { "epoch": 1.780078895463511, "grad_norm": 0.41015625, "learning_rate": 0.0001626695023730657, "loss": 0.0526, "step": 7220 }, { "epoch": 1.7805719921104537, "grad_norm": 0.314453125, "learning_rate": 0.00016264267275619958, "loss": 0.0523, "step": 7222 }, { "epoch": 1.7810650887573964, "grad_norm": 0.3671875, "learning_rate": 0.00016261583571582696, "loss": 0.0526, "step": 7224 }, { "epoch": 1.7815581854043394, "grad_norm": 0.330078125, "learning_rate": 0.00016258899125512821, "loss": 0.0541, "step": 7226 }, { "epoch": 1.782051282051282, "grad_norm": 0.296875, "learning_rate": 0.00016256213937728455, "loss": 0.0575, "step": 7228 }, { "epoch": 1.782544378698225, "grad_norm": 0.421875, "learning_rate": 0.00016253528008547806, "loss": 0.0579, "step": 7230 }, { "epoch": 1.7830374753451677, "grad_norm": 0.34375, "learning_rate": 0.00016250841338289176, "loss": 0.0507, "step": 7232 }, { "epoch": 1.7835305719921104, "grad_norm": 0.41796875, "learning_rate": 0.00016248153927270943, "loss": 0.0594, "step": 7234 }, { "epoch": 1.7840236686390534, "grad_norm": 0.392578125, "learning_rate": 0.00016245465775811585, "loss": 0.0491, "step": 7236 }, { "epoch": 1.784516765285996, "grad_norm": 0.322265625, "learning_rate": 0.00016242776884229664, "loss": 0.0524, "step": 7238 }, { "epoch": 1.7850098619329389, "grad_norm": 0.40625, "learning_rate": 0.00016240087252843827, "loss": 0.0568, "step": 7240 }, { "epoch": 1.7855029585798816, "grad_norm": 0.349609375, "learning_rate": 0.0001623739688197281, "loss": 0.0498, "step": 7242 }, { "epoch": 1.7859960552268244, "grad_norm": 0.37890625, "learning_rate": 0.00016234705771935438, "loss": 0.0505, "step": 7244 }, { "epoch": 1.7864891518737673, "grad_norm": 0.341796875, "learning_rate": 0.00016232013923050622, "loss": 0.0521, "step": 7246 }, { "epoch": 1.78698224852071, "grad_norm": 0.396484375, "learning_rate": 0.0001622932133563736, "loss": 0.0533, "step": 7248 }, { "epoch": 1.7874753451676528, "grad_norm": 0.330078125, "learning_rate": 0.00016226628010014742, "loss": 0.0549, "step": 7250 }, { "epoch": 1.7879684418145958, "grad_norm": 0.36328125, "learning_rate": 0.00016223933946501936, "loss": 0.0573, "step": 7252 }, { "epoch": 1.7884615384615383, "grad_norm": 0.41015625, "learning_rate": 0.00016221239145418207, "loss": 0.0592, "step": 7254 }, { "epoch": 1.7889546351084813, "grad_norm": 0.373046875, "learning_rate": 0.00016218543607082907, "loss": 0.053, "step": 7256 }, { "epoch": 1.789447731755424, "grad_norm": 0.330078125, "learning_rate": 0.0001621584733181547, "loss": 0.0565, "step": 7258 }, { "epoch": 1.7899408284023668, "grad_norm": 0.32421875, "learning_rate": 0.00016213150319935418, "loss": 0.0565, "step": 7260 }, { "epoch": 1.7904339250493098, "grad_norm": 0.36328125, "learning_rate": 0.00016210452571762364, "loss": 0.0513, "step": 7262 }, { "epoch": 1.7909270216962525, "grad_norm": 0.49609375, "learning_rate": 0.00016207754087616005, "loss": 0.0571, "step": 7264 }, { "epoch": 1.7914201183431953, "grad_norm": 0.384765625, "learning_rate": 0.00016205054867816126, "loss": 0.0555, "step": 7266 }, { "epoch": 1.7919132149901382, "grad_norm": 0.439453125, "learning_rate": 0.000162023549126826, "loss": 0.055, "step": 7268 }, { "epoch": 1.7924063116370808, "grad_norm": 0.46484375, "learning_rate": 0.0001619965422253539, "loss": 0.0547, "step": 7270 }, { "epoch": 1.7928994082840237, "grad_norm": 0.384765625, "learning_rate": 0.0001619695279769454, "loss": 0.052, "step": 7272 }, { "epoch": 1.7933925049309665, "grad_norm": 0.486328125, "learning_rate": 0.0001619425063848018, "loss": 0.0524, "step": 7274 }, { "epoch": 1.7938856015779092, "grad_norm": 0.353515625, "learning_rate": 0.0001619154774521254, "loss": 0.054, "step": 7276 }, { "epoch": 1.7943786982248522, "grad_norm": 0.53515625, "learning_rate": 0.00016188844118211924, "loss": 0.052, "step": 7278 }, { "epoch": 1.7948717948717947, "grad_norm": 0.38671875, "learning_rate": 0.00016186139757798723, "loss": 0.0561, "step": 7280 }, { "epoch": 1.7953648915187377, "grad_norm": 0.314453125, "learning_rate": 0.00016183434664293421, "loss": 0.0546, "step": 7282 }, { "epoch": 1.7958579881656804, "grad_norm": 0.427734375, "learning_rate": 0.0001618072883801659, "loss": 0.0549, "step": 7284 }, { "epoch": 1.7963510848126232, "grad_norm": 0.486328125, "learning_rate": 0.0001617802227928888, "loss": 0.0559, "step": 7286 }, { "epoch": 1.7968441814595661, "grad_norm": 0.400390625, "learning_rate": 0.00016175314988431036, "loss": 0.0566, "step": 7288 }, { "epoch": 1.797337278106509, "grad_norm": 0.515625, "learning_rate": 0.0001617260696576389, "loss": 0.0541, "step": 7290 }, { "epoch": 1.7978303747534516, "grad_norm": 0.41015625, "learning_rate": 0.00016169898211608352, "loss": 0.0547, "step": 7292 }, { "epoch": 1.7983234714003946, "grad_norm": 0.37109375, "learning_rate": 0.00016167188726285434, "loss": 0.0549, "step": 7294 }, { "epoch": 1.7988165680473371, "grad_norm": 0.373046875, "learning_rate": 0.00016164478510116213, "loss": 0.0509, "step": 7296 }, { "epoch": 1.79930966469428, "grad_norm": 0.52734375, "learning_rate": 0.0001616176756342187, "loss": 0.0551, "step": 7298 }, { "epoch": 1.7998027613412229, "grad_norm": 0.43359375, "learning_rate": 0.0001615905588652367, "loss": 0.0517, "step": 7300 }, { "epoch": 1.8002958579881656, "grad_norm": 0.40625, "learning_rate": 0.00016156343479742963, "loss": 0.0568, "step": 7302 }, { "epoch": 1.8007889546351086, "grad_norm": 0.427734375, "learning_rate": 0.00016153630343401174, "loss": 0.0578, "step": 7304 }, { "epoch": 1.8012820512820513, "grad_norm": 0.404296875, "learning_rate": 0.00016150916477819836, "loss": 0.0573, "step": 7306 }, { "epoch": 1.801775147928994, "grad_norm": 0.39453125, "learning_rate": 0.00016148201883320554, "loss": 0.0543, "step": 7308 }, { "epoch": 1.802268244575937, "grad_norm": 0.33984375, "learning_rate": 0.00016145486560225017, "loss": 0.0515, "step": 7310 }, { "epoch": 1.8027613412228796, "grad_norm": 0.388671875, "learning_rate": 0.00016142770508855017, "loss": 0.0515, "step": 7312 }, { "epoch": 1.8032544378698225, "grad_norm": 0.431640625, "learning_rate": 0.00016140053729532408, "loss": 0.0571, "step": 7314 }, { "epoch": 1.8037475345167653, "grad_norm": 0.3515625, "learning_rate": 0.00016137336222579154, "loss": 0.0512, "step": 7316 }, { "epoch": 1.804240631163708, "grad_norm": 0.380859375, "learning_rate": 0.00016134617988317293, "loss": 0.0549, "step": 7318 }, { "epoch": 1.804733727810651, "grad_norm": 0.345703125, "learning_rate": 0.00016131899027068945, "loss": 0.0492, "step": 7320 }, { "epoch": 1.8052268244575935, "grad_norm": 0.33203125, "learning_rate": 0.0001612917933915633, "loss": 0.0548, "step": 7322 }, { "epoch": 1.8057199211045365, "grad_norm": 0.4609375, "learning_rate": 0.00016126458924901738, "loss": 0.0591, "step": 7324 }, { "epoch": 1.8062130177514792, "grad_norm": 0.37890625, "learning_rate": 0.00016123737784627562, "loss": 0.0581, "step": 7326 }, { "epoch": 1.806706114398422, "grad_norm": 0.3984375, "learning_rate": 0.00016121015918656268, "loss": 0.0568, "step": 7328 }, { "epoch": 1.807199211045365, "grad_norm": 0.49609375, "learning_rate": 0.00016118293327310406, "loss": 0.0536, "step": 7330 }, { "epoch": 1.8076923076923077, "grad_norm": 0.53515625, "learning_rate": 0.0001611557001091263, "loss": 0.0558, "step": 7332 }, { "epoch": 1.8081854043392505, "grad_norm": 0.6171875, "learning_rate": 0.00016112845969785665, "loss": 0.0497, "step": 7334 }, { "epoch": 1.8086785009861934, "grad_norm": 0.30859375, "learning_rate": 0.00016110121204252316, "loss": 0.0527, "step": 7336 }, { "epoch": 1.809171597633136, "grad_norm": 0.3671875, "learning_rate": 0.00016107395714635493, "loss": 0.0545, "step": 7338 }, { "epoch": 1.809664694280079, "grad_norm": 0.40625, "learning_rate": 0.0001610466950125818, "loss": 0.0558, "step": 7340 }, { "epoch": 1.8101577909270217, "grad_norm": 0.36328125, "learning_rate": 0.00016101942564443445, "loss": 0.0518, "step": 7342 }, { "epoch": 1.8106508875739644, "grad_norm": 0.32421875, "learning_rate": 0.0001609921490451445, "loss": 0.0549, "step": 7344 }, { "epoch": 1.8111439842209074, "grad_norm": 0.359375, "learning_rate": 0.00016096486521794434, "loss": 0.0522, "step": 7346 }, { "epoch": 1.8116370808678501, "grad_norm": 0.470703125, "learning_rate": 0.00016093757416606725, "loss": 0.0521, "step": 7348 }, { "epoch": 1.8121301775147929, "grad_norm": 0.359375, "learning_rate": 0.0001609102758927474, "loss": 0.0524, "step": 7350 }, { "epoch": 1.8126232741617359, "grad_norm": 0.314453125, "learning_rate": 0.00016088297040121977, "loss": 0.0514, "step": 7352 }, { "epoch": 1.8131163708086784, "grad_norm": 0.328125, "learning_rate": 0.00016085565769472022, "loss": 0.0528, "step": 7354 }, { "epoch": 1.8136094674556213, "grad_norm": 0.341796875, "learning_rate": 0.00016082833777648548, "loss": 0.0533, "step": 7356 }, { "epoch": 1.814102564102564, "grad_norm": 0.427734375, "learning_rate": 0.00016080101064975305, "loss": 0.0531, "step": 7358 }, { "epoch": 1.8145956607495068, "grad_norm": 0.359375, "learning_rate": 0.0001607736763177614, "loss": 0.0567, "step": 7360 }, { "epoch": 1.8150887573964498, "grad_norm": 0.4453125, "learning_rate": 0.00016074633478374982, "loss": 0.0576, "step": 7362 }, { "epoch": 1.8155818540433923, "grad_norm": 0.45703125, "learning_rate": 0.0001607189860509584, "loss": 0.051, "step": 7364 }, { "epoch": 1.8160749506903353, "grad_norm": 0.34765625, "learning_rate": 0.0001606916301226281, "loss": 0.059, "step": 7366 }, { "epoch": 1.816568047337278, "grad_norm": 0.45703125, "learning_rate": 0.00016066426700200078, "loss": 0.0538, "step": 7368 }, { "epoch": 1.8170611439842208, "grad_norm": 0.5, "learning_rate": 0.0001606368966923191, "loss": 0.0524, "step": 7370 }, { "epoch": 1.8175542406311638, "grad_norm": 0.44140625, "learning_rate": 0.00016060951919682665, "loss": 0.055, "step": 7372 }, { "epoch": 1.8180473372781065, "grad_norm": 0.423828125, "learning_rate": 0.00016058213451876772, "loss": 0.0557, "step": 7374 }, { "epoch": 1.8185404339250493, "grad_norm": 0.412109375, "learning_rate": 0.0001605547426613877, "loss": 0.054, "step": 7376 }, { "epoch": 1.8190335305719922, "grad_norm": 0.453125, "learning_rate": 0.00016052734362793247, "loss": 0.0579, "step": 7378 }, { "epoch": 1.8195266272189348, "grad_norm": 0.3359375, "learning_rate": 0.0001604999374216491, "loss": 0.0515, "step": 7380 }, { "epoch": 1.8200197238658777, "grad_norm": 0.423828125, "learning_rate": 0.00016047252404578538, "loss": 0.0553, "step": 7382 }, { "epoch": 1.8205128205128205, "grad_norm": 0.5078125, "learning_rate": 0.00016044510350358995, "loss": 0.0514, "step": 7384 }, { "epoch": 1.8210059171597632, "grad_norm": 0.337890625, "learning_rate": 0.00016041767579831224, "loss": 0.0586, "step": 7386 }, { "epoch": 1.8214990138067062, "grad_norm": 0.3359375, "learning_rate": 0.00016039024093320262, "loss": 0.0482, "step": 7388 }, { "epoch": 1.821992110453649, "grad_norm": 0.365234375, "learning_rate": 0.00016036279891151227, "loss": 0.0568, "step": 7390 }, { "epoch": 1.8224852071005917, "grad_norm": 0.396484375, "learning_rate": 0.0001603353497364932, "loss": 0.0579, "step": 7392 }, { "epoch": 1.8229783037475347, "grad_norm": 0.337890625, "learning_rate": 0.0001603078934113984, "loss": 0.0526, "step": 7394 }, { "epoch": 1.8234714003944772, "grad_norm": 0.388671875, "learning_rate": 0.00016028042993948148, "loss": 0.0552, "step": 7396 }, { "epoch": 1.8239644970414202, "grad_norm": 0.349609375, "learning_rate": 0.00016025295932399703, "loss": 0.0534, "step": 7398 }, { "epoch": 1.824457593688363, "grad_norm": 0.373046875, "learning_rate": 0.00016022548156820047, "loss": 0.0572, "step": 7400 }, { "epoch": 1.8249506903353057, "grad_norm": 0.337890625, "learning_rate": 0.00016019799667534813, "loss": 0.0586, "step": 7402 }, { "epoch": 1.8254437869822486, "grad_norm": 0.3359375, "learning_rate": 0.00016017050464869705, "loss": 0.0546, "step": 7404 }, { "epoch": 1.8259368836291914, "grad_norm": 0.375, "learning_rate": 0.00016014300549150521, "loss": 0.0551, "step": 7406 }, { "epoch": 1.8264299802761341, "grad_norm": 0.3515625, "learning_rate": 0.0001601154992070315, "loss": 0.0562, "step": 7408 }, { "epoch": 1.8269230769230769, "grad_norm": 0.34765625, "learning_rate": 0.00016008798579853544, "loss": 0.052, "step": 7410 }, { "epoch": 1.8274161735700196, "grad_norm": 0.376953125, "learning_rate": 0.00016006046526927757, "loss": 0.0508, "step": 7412 }, { "epoch": 1.8279092702169626, "grad_norm": 0.29296875, "learning_rate": 0.00016003293762251923, "loss": 0.0507, "step": 7414 }, { "epoch": 1.8284023668639053, "grad_norm": 0.404296875, "learning_rate": 0.00016000540286152258, "loss": 0.051, "step": 7416 }, { "epoch": 1.828895463510848, "grad_norm": 0.4140625, "learning_rate": 0.0001599778609895507, "loss": 0.056, "step": 7418 }, { "epoch": 1.829388560157791, "grad_norm": 0.353515625, "learning_rate": 0.00015995031200986742, "loss": 0.0543, "step": 7420 }, { "epoch": 1.8298816568047336, "grad_norm": 0.48828125, "learning_rate": 0.00015992275592573743, "loss": 0.0519, "step": 7422 }, { "epoch": 1.8303747534516766, "grad_norm": 0.498046875, "learning_rate": 0.0001598951927404263, "loss": 0.0558, "step": 7424 }, { "epoch": 1.8308678500986193, "grad_norm": 0.357421875, "learning_rate": 0.00015986762245720043, "loss": 0.0542, "step": 7426 }, { "epoch": 1.831360946745562, "grad_norm": 0.5234375, "learning_rate": 0.00015984004507932703, "loss": 0.0566, "step": 7428 }, { "epoch": 1.831854043392505, "grad_norm": 0.39453125, "learning_rate": 0.0001598124606100742, "loss": 0.0507, "step": 7430 }, { "epoch": 1.8323471400394478, "grad_norm": 0.365234375, "learning_rate": 0.00015978486905271082, "loss": 0.0595, "step": 7432 }, { "epoch": 1.8328402366863905, "grad_norm": 0.34375, "learning_rate": 0.00015975727041050669, "loss": 0.0486, "step": 7434 }, { "epoch": 1.8333333333333335, "grad_norm": 0.40234375, "learning_rate": 0.00015972966468673233, "loss": 0.0541, "step": 7436 }, { "epoch": 1.833826429980276, "grad_norm": 0.42578125, "learning_rate": 0.00015970205188465923, "loss": 0.0563, "step": 7438 }, { "epoch": 1.834319526627219, "grad_norm": 0.365234375, "learning_rate": 0.00015967443200755964, "loss": 0.0521, "step": 7440 }, { "epoch": 1.8348126232741617, "grad_norm": 0.419921875, "learning_rate": 0.00015964680505870673, "loss": 0.055, "step": 7442 }, { "epoch": 1.8353057199211045, "grad_norm": 0.408203125, "learning_rate": 0.00015961917104137437, "loss": 0.0581, "step": 7444 }, { "epoch": 1.8357988165680474, "grad_norm": 0.46484375, "learning_rate": 0.00015959152995883734, "loss": 0.0554, "step": 7446 }, { "epoch": 1.8362919132149902, "grad_norm": 0.328125, "learning_rate": 0.0001595638818143713, "loss": 0.0562, "step": 7448 }, { "epoch": 1.836785009861933, "grad_norm": 0.5078125, "learning_rate": 0.00015953622661125274, "loss": 0.0576, "step": 7450 }, { "epoch": 1.8372781065088757, "grad_norm": 0.33203125, "learning_rate": 0.00015950856435275888, "loss": 0.0545, "step": 7452 }, { "epoch": 1.8377712031558184, "grad_norm": 0.54296875, "learning_rate": 0.0001594808950421679, "loss": 0.0521, "step": 7454 }, { "epoch": 1.8382642998027614, "grad_norm": 0.38671875, "learning_rate": 0.00015945321868275877, "loss": 0.0578, "step": 7456 }, { "epoch": 1.8387573964497042, "grad_norm": 0.36328125, "learning_rate": 0.00015942553527781126, "loss": 0.0532, "step": 7458 }, { "epoch": 1.839250493096647, "grad_norm": 0.47265625, "learning_rate": 0.00015939784483060606, "loss": 0.0558, "step": 7460 }, { "epoch": 1.8397435897435899, "grad_norm": 0.419921875, "learning_rate": 0.0001593701473444246, "loss": 0.0551, "step": 7462 }, { "epoch": 1.8402366863905324, "grad_norm": 0.44921875, "learning_rate": 0.0001593424428225492, "loss": 0.0519, "step": 7464 }, { "epoch": 1.8407297830374754, "grad_norm": 0.37890625, "learning_rate": 0.00015931473126826298, "loss": 0.0511, "step": 7466 }, { "epoch": 1.8412228796844181, "grad_norm": 0.333984375, "learning_rate": 0.00015928701268484995, "loss": 0.0582, "step": 7468 }, { "epoch": 1.8417159763313609, "grad_norm": 0.375, "learning_rate": 0.0001592592870755949, "loss": 0.0529, "step": 7470 }, { "epoch": 1.8422090729783038, "grad_norm": 0.47265625, "learning_rate": 0.0001592315544437835, "loss": 0.0539, "step": 7472 }, { "epoch": 1.8427021696252466, "grad_norm": 0.318359375, "learning_rate": 0.00015920381479270215, "loss": 0.0581, "step": 7474 }, { "epoch": 1.8431952662721893, "grad_norm": 0.375, "learning_rate": 0.0001591760681256382, "loss": 0.0537, "step": 7476 }, { "epoch": 1.8436883629191323, "grad_norm": 0.375, "learning_rate": 0.0001591483144458798, "loss": 0.0546, "step": 7478 }, { "epoch": 1.8441814595660748, "grad_norm": 0.40625, "learning_rate": 0.0001591205537567159, "loss": 0.0518, "step": 7480 }, { "epoch": 1.8446745562130178, "grad_norm": 0.62109375, "learning_rate": 0.00015909278606143627, "loss": 0.0539, "step": 7482 }, { "epoch": 1.8451676528599605, "grad_norm": 0.3671875, "learning_rate": 0.00015906501136333156, "loss": 0.0519, "step": 7484 }, { "epoch": 1.8456607495069033, "grad_norm": 0.341796875, "learning_rate": 0.00015903722966569323, "loss": 0.0532, "step": 7486 }, { "epoch": 1.8461538461538463, "grad_norm": 0.412109375, "learning_rate": 0.00015900944097181356, "loss": 0.0502, "step": 7488 }, { "epoch": 1.846646942800789, "grad_norm": 0.36328125, "learning_rate": 0.00015898164528498567, "loss": 0.0564, "step": 7490 }, { "epoch": 1.8471400394477318, "grad_norm": 0.4375, "learning_rate": 0.00015895384260850354, "loss": 0.0521, "step": 7492 }, { "epoch": 1.8476331360946747, "grad_norm": 0.51953125, "learning_rate": 0.00015892603294566187, "loss": 0.0505, "step": 7494 }, { "epoch": 1.8481262327416172, "grad_norm": 0.3359375, "learning_rate": 0.00015889821629975627, "loss": 0.0551, "step": 7496 }, { "epoch": 1.8486193293885602, "grad_norm": 0.51171875, "learning_rate": 0.00015887039267408325, "loss": 0.0513, "step": 7498 }, { "epoch": 1.849112426035503, "grad_norm": 0.35546875, "learning_rate": 0.00015884256207193996, "loss": 0.0549, "step": 7500 }, { "epoch": 1.8496055226824457, "grad_norm": 0.37109375, "learning_rate": 0.00015881472449662455, "loss": 0.0523, "step": 7502 }, { "epoch": 1.8500986193293887, "grad_norm": 0.32421875, "learning_rate": 0.00015878687995143592, "loss": 0.0517, "step": 7504 }, { "epoch": 1.8505917159763312, "grad_norm": 0.32421875, "learning_rate": 0.00015875902843967375, "loss": 0.0506, "step": 7506 }, { "epoch": 1.8510848126232742, "grad_norm": 0.451171875, "learning_rate": 0.00015873116996463867, "loss": 0.0536, "step": 7508 }, { "epoch": 1.851577909270217, "grad_norm": 0.35546875, "learning_rate": 0.00015870330452963204, "loss": 0.0568, "step": 7510 }, { "epoch": 1.8520710059171597, "grad_norm": 0.490234375, "learning_rate": 0.00015867543213795608, "loss": 0.0525, "step": 7512 }, { "epoch": 1.8525641025641026, "grad_norm": 0.376953125, "learning_rate": 0.00015864755279291378, "loss": 0.0558, "step": 7514 }, { "epoch": 1.8530571992110454, "grad_norm": 0.294921875, "learning_rate": 0.00015861966649780907, "loss": 0.0482, "step": 7516 }, { "epoch": 1.8535502958579881, "grad_norm": 0.349609375, "learning_rate": 0.00015859177325594655, "loss": 0.0519, "step": 7518 }, { "epoch": 1.854043392504931, "grad_norm": 0.34375, "learning_rate": 0.00015856387307063176, "loss": 0.0579, "step": 7520 }, { "epoch": 1.8545364891518736, "grad_norm": 0.58203125, "learning_rate": 0.00015853596594517103, "loss": 0.0613, "step": 7522 }, { "epoch": 1.8550295857988166, "grad_norm": 0.345703125, "learning_rate": 0.00015850805188287156, "loss": 0.0558, "step": 7524 }, { "epoch": 1.8555226824457594, "grad_norm": 0.470703125, "learning_rate": 0.00015848013088704126, "loss": 0.0575, "step": 7526 }, { "epoch": 1.856015779092702, "grad_norm": 0.345703125, "learning_rate": 0.0001584522029609889, "loss": 0.0604, "step": 7528 }, { "epoch": 1.856508875739645, "grad_norm": 0.390625, "learning_rate": 0.00015842426810802414, "loss": 0.0529, "step": 7530 }, { "epoch": 1.8570019723865878, "grad_norm": 0.333984375, "learning_rate": 0.00015839632633145745, "loss": 0.0524, "step": 7532 }, { "epoch": 1.8574950690335306, "grad_norm": 0.3984375, "learning_rate": 0.0001583683776346, "loss": 0.0586, "step": 7534 }, { "epoch": 1.8579881656804735, "grad_norm": 0.291015625, "learning_rate": 0.00015834042202076393, "loss": 0.0546, "step": 7536 }, { "epoch": 1.858481262327416, "grad_norm": 0.375, "learning_rate": 0.00015831245949326212, "loss": 0.0511, "step": 7538 }, { "epoch": 1.858974358974359, "grad_norm": 0.46484375, "learning_rate": 0.0001582844900554083, "loss": 0.0502, "step": 7540 }, { "epoch": 1.8594674556213018, "grad_norm": 0.287109375, "learning_rate": 0.000158256513710517, "loss": 0.0516, "step": 7542 }, { "epoch": 1.8599605522682445, "grad_norm": 0.326171875, "learning_rate": 0.00015822853046190355, "loss": 0.0532, "step": 7544 }, { "epoch": 1.8604536489151875, "grad_norm": 0.359375, "learning_rate": 0.0001582005403128841, "loss": 0.0539, "step": 7546 }, { "epoch": 1.86094674556213, "grad_norm": 0.400390625, "learning_rate": 0.0001581725432667757, "loss": 0.0553, "step": 7548 }, { "epoch": 1.861439842209073, "grad_norm": 0.40625, "learning_rate": 0.00015814453932689614, "loss": 0.0476, "step": 7550 }, { "epoch": 1.8619329388560157, "grad_norm": 0.57421875, "learning_rate": 0.000158116528496564, "loss": 0.0523, "step": 7552 }, { "epoch": 1.8624260355029585, "grad_norm": 0.453125, "learning_rate": 0.00015808851077909878, "loss": 0.0514, "step": 7554 }, { "epoch": 1.8629191321499015, "grad_norm": 0.365234375, "learning_rate": 0.0001580604861778207, "loss": 0.0551, "step": 7556 }, { "epoch": 1.8634122287968442, "grad_norm": 0.3203125, "learning_rate": 0.00015803245469605087, "loss": 0.049, "step": 7558 }, { "epoch": 1.863905325443787, "grad_norm": 0.318359375, "learning_rate": 0.00015800441633711108, "loss": 0.0517, "step": 7560 }, { "epoch": 1.86439842209073, "grad_norm": 0.47265625, "learning_rate": 0.00015797637110432416, "loss": 0.059, "step": 7562 }, { "epoch": 1.8648915187376724, "grad_norm": 0.3359375, "learning_rate": 0.00015794831900101352, "loss": 0.0518, "step": 7564 }, { "epoch": 1.8653846153846154, "grad_norm": 0.28515625, "learning_rate": 0.00015792026003050356, "loss": 0.0538, "step": 7566 }, { "epoch": 1.8658777120315582, "grad_norm": 0.361328125, "learning_rate": 0.00015789219419611942, "loss": 0.0568, "step": 7568 }, { "epoch": 1.866370808678501, "grad_norm": 0.392578125, "learning_rate": 0.00015786412150118698, "loss": 0.0534, "step": 7570 }, { "epoch": 1.8668639053254439, "grad_norm": 0.3984375, "learning_rate": 0.00015783604194903313, "loss": 0.0549, "step": 7572 }, { "epoch": 1.8673570019723866, "grad_norm": 0.423828125, "learning_rate": 0.00015780795554298536, "loss": 0.0509, "step": 7574 }, { "epoch": 1.8678500986193294, "grad_norm": 0.431640625, "learning_rate": 0.00015777986228637212, "loss": 0.054, "step": 7576 }, { "epoch": 1.8683431952662723, "grad_norm": 0.38671875, "learning_rate": 0.00015775176218252258, "loss": 0.0569, "step": 7578 }, { "epoch": 1.8688362919132149, "grad_norm": 0.3828125, "learning_rate": 0.00015772365523476677, "loss": 0.0463, "step": 7580 }, { "epoch": 1.8693293885601578, "grad_norm": 0.369140625, "learning_rate": 0.00015769554144643558, "loss": 0.0593, "step": 7582 }, { "epoch": 1.8698224852071006, "grad_norm": 0.451171875, "learning_rate": 0.00015766742082086053, "loss": 0.0541, "step": 7584 }, { "epoch": 1.8703155818540433, "grad_norm": 0.3125, "learning_rate": 0.00015763929336137417, "loss": 0.0475, "step": 7586 }, { "epoch": 1.8708086785009863, "grad_norm": 0.310546875, "learning_rate": 0.00015761115907130975, "loss": 0.0554, "step": 7588 }, { "epoch": 1.8713017751479288, "grad_norm": 0.4609375, "learning_rate": 0.0001575830179540013, "loss": 0.0566, "step": 7590 }, { "epoch": 1.8717948717948718, "grad_norm": 0.37109375, "learning_rate": 0.0001575548700127837, "loss": 0.0533, "step": 7592 }, { "epoch": 1.8722879684418146, "grad_norm": 0.384765625, "learning_rate": 0.0001575267152509927, "loss": 0.0487, "step": 7594 }, { "epoch": 1.8727810650887573, "grad_norm": 0.294921875, "learning_rate": 0.00015749855367196472, "loss": 0.0505, "step": 7596 }, { "epoch": 1.8732741617357003, "grad_norm": 0.34765625, "learning_rate": 0.00015747038527903713, "loss": 0.0541, "step": 7598 }, { "epoch": 1.873767258382643, "grad_norm": 0.39453125, "learning_rate": 0.00015744221007554798, "loss": 0.0552, "step": 7600 }, { "epoch": 1.8742603550295858, "grad_norm": 0.4375, "learning_rate": 0.00015741402806483622, "loss": 0.0519, "step": 7602 }, { "epoch": 1.8747534516765287, "grad_norm": 0.32421875, "learning_rate": 0.0001573858392502416, "loss": 0.0499, "step": 7604 }, { "epoch": 1.8752465483234713, "grad_norm": 0.34375, "learning_rate": 0.0001573576436351046, "loss": 0.0565, "step": 7606 }, { "epoch": 1.8757396449704142, "grad_norm": 0.470703125, "learning_rate": 0.00015732944122276663, "loss": 0.0594, "step": 7608 }, { "epoch": 1.876232741617357, "grad_norm": 0.37890625, "learning_rate": 0.00015730123201656974, "loss": 0.052, "step": 7610 }, { "epoch": 1.8767258382642997, "grad_norm": 0.4140625, "learning_rate": 0.00015727301601985695, "loss": 0.0505, "step": 7612 }, { "epoch": 1.8772189349112427, "grad_norm": 0.671875, "learning_rate": 0.00015724479323597198, "loss": 0.0549, "step": 7614 }, { "epoch": 1.8777120315581854, "grad_norm": 0.35546875, "learning_rate": 0.00015721656366825943, "loss": 0.051, "step": 7616 }, { "epoch": 1.8782051282051282, "grad_norm": 0.490234375, "learning_rate": 0.00015718832732006458, "loss": 0.0562, "step": 7618 }, { "epoch": 1.8786982248520712, "grad_norm": 0.48828125, "learning_rate": 0.00015716008419473367, "loss": 0.0566, "step": 7620 }, { "epoch": 1.8791913214990137, "grad_norm": 0.486328125, "learning_rate": 0.00015713183429561366, "loss": 0.0513, "step": 7622 }, { "epoch": 1.8796844181459567, "grad_norm": 0.494140625, "learning_rate": 0.00015710357762605229, "loss": 0.054, "step": 7624 }, { "epoch": 1.8801775147928994, "grad_norm": 0.3359375, "learning_rate": 0.00015707531418939811, "loss": 0.0527, "step": 7626 }, { "epoch": 1.8806706114398422, "grad_norm": 0.37109375, "learning_rate": 0.00015704704398900057, "loss": 0.052, "step": 7628 }, { "epoch": 1.8811637080867851, "grad_norm": 0.439453125, "learning_rate": 0.00015701876702820978, "loss": 0.0524, "step": 7630 }, { "epoch": 1.8816568047337277, "grad_norm": 0.357421875, "learning_rate": 0.00015699048331037675, "loss": 0.0493, "step": 7632 }, { "epoch": 1.8821499013806706, "grad_norm": 0.5859375, "learning_rate": 0.00015696219283885328, "loss": 0.0542, "step": 7634 }, { "epoch": 1.8826429980276134, "grad_norm": 0.69921875, "learning_rate": 0.0001569338956169919, "loss": 0.0501, "step": 7636 }, { "epoch": 1.8831360946745561, "grad_norm": 0.68359375, "learning_rate": 0.00015690559164814602, "loss": 0.0595, "step": 7638 }, { "epoch": 1.883629191321499, "grad_norm": 0.5703125, "learning_rate": 0.0001568772809356698, "loss": 0.0526, "step": 7640 }, { "epoch": 1.8841222879684418, "grad_norm": 0.3671875, "learning_rate": 0.00015684896348291827, "loss": 0.0484, "step": 7642 }, { "epoch": 1.8846153846153846, "grad_norm": 0.4921875, "learning_rate": 0.00015682063929324712, "loss": 0.053, "step": 7644 }, { "epoch": 1.8851084812623276, "grad_norm": 0.439453125, "learning_rate": 0.00015679230837001295, "loss": 0.058, "step": 7646 }, { "epoch": 1.88560157790927, "grad_norm": 0.36328125, "learning_rate": 0.00015676397071657318, "loss": 0.0526, "step": 7648 }, { "epoch": 1.886094674556213, "grad_norm": 0.388671875, "learning_rate": 0.00015673562633628595, "loss": 0.0493, "step": 7650 }, { "epoch": 1.8865877712031558, "grad_norm": 0.423828125, "learning_rate": 0.00015670727523251022, "loss": 0.0476, "step": 7652 }, { "epoch": 1.8870808678500985, "grad_norm": 0.400390625, "learning_rate": 0.00015667891740860575, "loss": 0.0578, "step": 7654 }, { "epoch": 1.8875739644970415, "grad_norm": 0.41015625, "learning_rate": 0.00015665055286793314, "loss": 0.0543, "step": 7656 }, { "epoch": 1.8880670611439843, "grad_norm": 0.73046875, "learning_rate": 0.00015662218161385374, "loss": 0.0503, "step": 7658 }, { "epoch": 1.888560157790927, "grad_norm": 0.404296875, "learning_rate": 0.00015659380364972965, "loss": 0.0553, "step": 7660 }, { "epoch": 1.88905325443787, "grad_norm": 0.40234375, "learning_rate": 0.00015656541897892382, "loss": 0.0517, "step": 7662 }, { "epoch": 1.8895463510848125, "grad_norm": 0.52734375, "learning_rate": 0.00015653702760480006, "loss": 0.0521, "step": 7664 }, { "epoch": 1.8900394477317555, "grad_norm": 0.482421875, "learning_rate": 0.00015650862953072285, "loss": 0.0559, "step": 7666 }, { "epoch": 1.8905325443786982, "grad_norm": 0.322265625, "learning_rate": 0.00015648022476005754, "loss": 0.0555, "step": 7668 }, { "epoch": 1.891025641025641, "grad_norm": 0.44140625, "learning_rate": 0.00015645181329617023, "loss": 0.0525, "step": 7670 }, { "epoch": 1.891518737672584, "grad_norm": 0.322265625, "learning_rate": 0.0001564233951424279, "loss": 0.0516, "step": 7672 }, { "epoch": 1.8920118343195265, "grad_norm": 0.67578125, "learning_rate": 0.0001563949703021982, "loss": 0.0563, "step": 7674 }, { "epoch": 1.8925049309664694, "grad_norm": 0.359375, "learning_rate": 0.00015636653877884965, "loss": 0.053, "step": 7676 }, { "epoch": 1.8929980276134122, "grad_norm": 0.3203125, "learning_rate": 0.00015633810057575156, "loss": 0.0518, "step": 7678 }, { "epoch": 1.893491124260355, "grad_norm": 0.3359375, "learning_rate": 0.00015630965569627395, "loss": 0.0495, "step": 7680 }, { "epoch": 1.893984220907298, "grad_norm": 0.404296875, "learning_rate": 0.00015628120414378776, "loss": 0.0537, "step": 7682 }, { "epoch": 1.8944773175542406, "grad_norm": 0.314453125, "learning_rate": 0.00015625274592166467, "loss": 0.0576, "step": 7684 }, { "epoch": 1.8949704142011834, "grad_norm": 0.3125, "learning_rate": 0.0001562242810332771, "loss": 0.0489, "step": 7686 }, { "epoch": 1.8954635108481264, "grad_norm": 0.458984375, "learning_rate": 0.00015619580948199834, "loss": 0.0555, "step": 7688 }, { "epoch": 1.895956607495069, "grad_norm": 0.333984375, "learning_rate": 0.0001561673312712024, "loss": 0.055, "step": 7690 }, { "epoch": 1.8964497041420119, "grad_norm": 0.34375, "learning_rate": 0.0001561388464042641, "loss": 0.0568, "step": 7692 }, { "epoch": 1.8969428007889546, "grad_norm": 0.421875, "learning_rate": 0.00015611035488455902, "loss": 0.0521, "step": 7694 }, { "epoch": 1.8974358974358974, "grad_norm": 0.361328125, "learning_rate": 0.0001560818567154636, "loss": 0.0484, "step": 7696 }, { "epoch": 1.8979289940828403, "grad_norm": 0.30078125, "learning_rate": 0.0001560533519003551, "loss": 0.0528, "step": 7698 }, { "epoch": 1.898422090729783, "grad_norm": 0.484375, "learning_rate": 0.00015602484044261142, "loss": 0.0539, "step": 7700 }, { "epoch": 1.8989151873767258, "grad_norm": 0.36328125, "learning_rate": 0.0001559963223456113, "loss": 0.0515, "step": 7702 }, { "epoch": 1.8994082840236688, "grad_norm": 0.361328125, "learning_rate": 0.00015596779761273437, "loss": 0.0578, "step": 7704 }, { "epoch": 1.8999013806706113, "grad_norm": 0.4609375, "learning_rate": 0.00015593926624736098, "loss": 0.0549, "step": 7706 }, { "epoch": 1.9003944773175543, "grad_norm": 0.34375, "learning_rate": 0.00015591072825287214, "loss": 0.0571, "step": 7708 }, { "epoch": 1.900887573964497, "grad_norm": 0.345703125, "learning_rate": 0.0001558821836326499, "loss": 0.0474, "step": 7710 }, { "epoch": 1.9013806706114398, "grad_norm": 0.30859375, "learning_rate": 0.00015585363239007684, "loss": 0.0504, "step": 7712 }, { "epoch": 1.9018737672583828, "grad_norm": 0.29296875, "learning_rate": 0.0001558250745285365, "loss": 0.0508, "step": 7714 }, { "epoch": 1.9023668639053253, "grad_norm": 0.326171875, "learning_rate": 0.00015579651005141317, "loss": 0.044, "step": 7716 }, { "epoch": 1.9028599605522682, "grad_norm": 0.337890625, "learning_rate": 0.00015576793896209188, "loss": 0.0532, "step": 7718 }, { "epoch": 1.903353057199211, "grad_norm": 0.34375, "learning_rate": 0.00015573936126395843, "loss": 0.0528, "step": 7720 }, { "epoch": 1.9038461538461537, "grad_norm": 0.408203125, "learning_rate": 0.00015571077696039946, "loss": 0.0556, "step": 7722 }, { "epoch": 1.9043392504930967, "grad_norm": 0.48828125, "learning_rate": 0.00015568218605480237, "loss": 0.0629, "step": 7724 }, { "epoch": 1.9048323471400395, "grad_norm": 0.3359375, "learning_rate": 0.00015565358855055536, "loss": 0.0501, "step": 7726 }, { "epoch": 1.9053254437869822, "grad_norm": 0.322265625, "learning_rate": 0.0001556249844510474, "loss": 0.0527, "step": 7728 }, { "epoch": 1.9058185404339252, "grad_norm": 0.455078125, "learning_rate": 0.00015559637375966816, "loss": 0.0541, "step": 7730 }, { "epoch": 1.9063116370808677, "grad_norm": 0.357421875, "learning_rate": 0.00015556775647980825, "loss": 0.0517, "step": 7732 }, { "epoch": 1.9068047337278107, "grad_norm": 0.314453125, "learning_rate": 0.00015553913261485893, "loss": 0.0541, "step": 7734 }, { "epoch": 1.9072978303747534, "grad_norm": 0.36328125, "learning_rate": 0.0001555105021682123, "loss": 0.0548, "step": 7736 }, { "epoch": 1.9077909270216962, "grad_norm": 0.36328125, "learning_rate": 0.00015548186514326124, "loss": 0.0513, "step": 7738 }, { "epoch": 1.9082840236686391, "grad_norm": 0.33203125, "learning_rate": 0.0001554532215433994, "loss": 0.0497, "step": 7740 }, { "epoch": 1.9087771203155819, "grad_norm": 0.287109375, "learning_rate": 0.0001554245713720212, "loss": 0.052, "step": 7742 }, { "epoch": 1.9092702169625246, "grad_norm": 0.376953125, "learning_rate": 0.0001553959146325218, "loss": 0.0511, "step": 7744 }, { "epoch": 1.9097633136094676, "grad_norm": 0.375, "learning_rate": 0.0001553672513282973, "loss": 0.0555, "step": 7746 }, { "epoch": 1.9102564102564101, "grad_norm": 0.3671875, "learning_rate": 0.0001553385814627443, "loss": 0.0505, "step": 7748 }, { "epoch": 1.910749506903353, "grad_norm": 0.302734375, "learning_rate": 0.00015530990503926048, "loss": 0.0546, "step": 7750 }, { "epoch": 1.9112426035502958, "grad_norm": 0.314453125, "learning_rate": 0.0001552812220612441, "loss": 0.0532, "step": 7752 }, { "epoch": 1.9117357001972386, "grad_norm": 0.314453125, "learning_rate": 0.00015525253253209425, "loss": 0.0462, "step": 7754 }, { "epoch": 1.9122287968441816, "grad_norm": 0.353515625, "learning_rate": 0.0001552238364552108, "loss": 0.055, "step": 7756 }, { "epoch": 1.9127218934911243, "grad_norm": 0.33984375, "learning_rate": 0.00015519513383399438, "loss": 0.0494, "step": 7758 }, { "epoch": 1.913214990138067, "grad_norm": 0.373046875, "learning_rate": 0.00015516642467184645, "loss": 0.0538, "step": 7760 }, { "epoch": 1.9137080867850098, "grad_norm": 0.34375, "learning_rate": 0.00015513770897216918, "loss": 0.0501, "step": 7762 }, { "epoch": 1.9142011834319526, "grad_norm": 0.310546875, "learning_rate": 0.00015510898673836554, "loss": 0.0525, "step": 7764 }, { "epoch": 1.9146942800788955, "grad_norm": 0.404296875, "learning_rate": 0.0001550802579738393, "loss": 0.0547, "step": 7766 }, { "epoch": 1.9151873767258383, "grad_norm": 0.294921875, "learning_rate": 0.00015505152268199495, "loss": 0.0473, "step": 7768 }, { "epoch": 1.915680473372781, "grad_norm": 0.337890625, "learning_rate": 0.0001550227808662378, "loss": 0.0501, "step": 7770 }, { "epoch": 1.916173570019724, "grad_norm": 0.33984375, "learning_rate": 0.0001549940325299739, "loss": 0.0525, "step": 7772 }, { "epoch": 1.9166666666666665, "grad_norm": 0.443359375, "learning_rate": 0.00015496527767661013, "loss": 0.0485, "step": 7774 }, { "epoch": 1.9171597633136095, "grad_norm": 0.404296875, "learning_rate": 0.00015493651630955405, "loss": 0.0521, "step": 7776 }, { "epoch": 1.9176528599605522, "grad_norm": 0.4140625, "learning_rate": 0.00015490774843221404, "loss": 0.0525, "step": 7778 }, { "epoch": 1.918145956607495, "grad_norm": 0.333984375, "learning_rate": 0.00015487897404799933, "loss": 0.0484, "step": 7780 }, { "epoch": 1.918639053254438, "grad_norm": 0.265625, "learning_rate": 0.0001548501931603197, "loss": 0.0479, "step": 7782 }, { "epoch": 1.9191321499013807, "grad_norm": 0.486328125, "learning_rate": 0.000154821405772586, "loss": 0.0543, "step": 7784 }, { "epoch": 1.9196252465483234, "grad_norm": 0.46875, "learning_rate": 0.00015479261188820962, "loss": 0.0558, "step": 7786 }, { "epoch": 1.9201183431952664, "grad_norm": 0.37890625, "learning_rate": 0.00015476381151060278, "loss": 0.0529, "step": 7788 }, { "epoch": 1.920611439842209, "grad_norm": 0.33203125, "learning_rate": 0.00015473500464317858, "loss": 0.0515, "step": 7790 }, { "epoch": 1.921104536489152, "grad_norm": 0.5078125, "learning_rate": 0.00015470619128935068, "loss": 0.0513, "step": 7792 }, { "epoch": 1.9215976331360947, "grad_norm": 0.3671875, "learning_rate": 0.00015467737145253373, "loss": 0.054, "step": 7794 }, { "epoch": 1.9220907297830374, "grad_norm": 0.35546875, "learning_rate": 0.0001546485451361429, "loss": 0.0512, "step": 7796 }, { "epoch": 1.9225838264299804, "grad_norm": 0.37890625, "learning_rate": 0.0001546197123435944, "loss": 0.054, "step": 7798 }, { "epoch": 1.9230769230769231, "grad_norm": 0.412109375, "learning_rate": 0.000154590873078305, "loss": 0.05, "step": 7800 }, { "epoch": 1.9235700197238659, "grad_norm": 0.4609375, "learning_rate": 0.00015456202734369238, "loss": 0.0535, "step": 7802 }, { "epoch": 1.9240631163708086, "grad_norm": 0.37109375, "learning_rate": 0.00015453317514317486, "loss": 0.0556, "step": 7804 }, { "epoch": 1.9245562130177514, "grad_norm": 0.412109375, "learning_rate": 0.00015450431648017163, "loss": 0.0597, "step": 7806 }, { "epoch": 1.9250493096646943, "grad_norm": 0.36328125, "learning_rate": 0.0001544754513581026, "loss": 0.0495, "step": 7808 }, { "epoch": 1.925542406311637, "grad_norm": 0.380859375, "learning_rate": 0.00015444657978038836, "loss": 0.0539, "step": 7810 }, { "epoch": 1.9260355029585798, "grad_norm": 0.34375, "learning_rate": 0.00015441770175045048, "loss": 0.0538, "step": 7812 }, { "epoch": 1.9265285996055228, "grad_norm": 0.3125, "learning_rate": 0.0001543888172717111, "loss": 0.0565, "step": 7814 }, { "epoch": 1.9270216962524653, "grad_norm": 0.376953125, "learning_rate": 0.00015435992634759321, "loss": 0.0557, "step": 7816 }, { "epoch": 1.9275147928994083, "grad_norm": 0.3359375, "learning_rate": 0.0001543310289815205, "loss": 0.0514, "step": 7818 }, { "epoch": 1.928007889546351, "grad_norm": 0.388671875, "learning_rate": 0.00015430212517691757, "loss": 0.0563, "step": 7820 }, { "epoch": 1.9285009861932938, "grad_norm": 0.37109375, "learning_rate": 0.00015427321493720962, "loss": 0.054, "step": 7822 }, { "epoch": 1.9289940828402368, "grad_norm": 0.365234375, "learning_rate": 0.00015424429826582268, "loss": 0.0547, "step": 7824 }, { "epoch": 1.9294871794871795, "grad_norm": 0.396484375, "learning_rate": 0.0001542153751661835, "loss": 0.0492, "step": 7826 }, { "epoch": 1.9299802761341223, "grad_norm": 0.337890625, "learning_rate": 0.00015418644564171968, "loss": 0.0501, "step": 7828 }, { "epoch": 1.9304733727810652, "grad_norm": 0.357421875, "learning_rate": 0.0001541575096958595, "loss": 0.0542, "step": 7830 }, { "epoch": 1.9309664694280078, "grad_norm": 0.4921875, "learning_rate": 0.00015412856733203206, "loss": 0.0515, "step": 7832 }, { "epoch": 1.9314595660749507, "grad_norm": 0.369140625, "learning_rate": 0.00015409961855366718, "loss": 0.0611, "step": 7834 }, { "epoch": 1.9319526627218935, "grad_norm": 0.34375, "learning_rate": 0.0001540706633641954, "loss": 0.0547, "step": 7836 }, { "epoch": 1.9324457593688362, "grad_norm": 0.30859375, "learning_rate": 0.00015404170176704817, "loss": 0.0504, "step": 7838 }, { "epoch": 1.9329388560157792, "grad_norm": 0.3125, "learning_rate": 0.00015401273376565756, "loss": 0.0527, "step": 7840 }, { "epoch": 1.933431952662722, "grad_norm": 0.455078125, "learning_rate": 0.0001539837593634564, "loss": 0.0521, "step": 7842 }, { "epoch": 1.9339250493096647, "grad_norm": 0.345703125, "learning_rate": 0.00015395477856387834, "loss": 0.0554, "step": 7844 }, { "epoch": 1.9344181459566077, "grad_norm": 0.3828125, "learning_rate": 0.0001539257913703578, "loss": 0.0542, "step": 7846 }, { "epoch": 1.9349112426035502, "grad_norm": 0.36328125, "learning_rate": 0.00015389679778632988, "loss": 0.0566, "step": 7848 }, { "epoch": 1.9354043392504932, "grad_norm": 0.59375, "learning_rate": 0.00015386779781523052, "loss": 0.0528, "step": 7850 }, { "epoch": 1.935897435897436, "grad_norm": 0.30859375, "learning_rate": 0.00015383879146049634, "loss": 0.0492, "step": 7852 }, { "epoch": 1.9363905325443787, "grad_norm": 0.53515625, "learning_rate": 0.0001538097787255648, "loss": 0.0571, "step": 7854 }, { "epoch": 1.9368836291913216, "grad_norm": 0.40625, "learning_rate": 0.00015378075961387403, "loss": 0.0496, "step": 7856 }, { "epoch": 1.9373767258382641, "grad_norm": 0.462890625, "learning_rate": 0.00015375173412886296, "loss": 0.053, "step": 7858 }, { "epoch": 1.9378698224852071, "grad_norm": 0.431640625, "learning_rate": 0.0001537227022739713, "loss": 0.0478, "step": 7860 }, { "epoch": 1.9383629191321499, "grad_norm": 0.3671875, "learning_rate": 0.00015369366405263948, "loss": 0.05, "step": 7862 }, { "epoch": 1.9388560157790926, "grad_norm": 0.3984375, "learning_rate": 0.00015366461946830865, "loss": 0.0566, "step": 7864 }, { "epoch": 1.9393491124260356, "grad_norm": 0.34765625, "learning_rate": 0.00015363556852442085, "loss": 0.0483, "step": 7866 }, { "epoch": 1.9398422090729783, "grad_norm": 0.314453125, "learning_rate": 0.00015360651122441868, "loss": 0.0529, "step": 7868 }, { "epoch": 1.940335305719921, "grad_norm": 0.41015625, "learning_rate": 0.00015357744757174563, "loss": 0.0556, "step": 7870 }, { "epoch": 1.940828402366864, "grad_norm": 0.38671875, "learning_rate": 0.0001535483775698459, "loss": 0.0537, "step": 7872 }, { "epoch": 1.9413214990138066, "grad_norm": 0.404296875, "learning_rate": 0.00015351930122216442, "loss": 0.0544, "step": 7874 }, { "epoch": 1.9418145956607495, "grad_norm": 0.318359375, "learning_rate": 0.000153490218532147, "loss": 0.0497, "step": 7876 }, { "epoch": 1.9423076923076923, "grad_norm": 0.3984375, "learning_rate": 0.00015346112950323997, "loss": 0.0538, "step": 7878 }, { "epoch": 1.942800788954635, "grad_norm": 0.388671875, "learning_rate": 0.00015343203413889064, "loss": 0.0533, "step": 7880 }, { "epoch": 1.943293885601578, "grad_norm": 0.380859375, "learning_rate": 0.00015340293244254694, "loss": 0.053, "step": 7882 }, { "epoch": 1.9437869822485208, "grad_norm": 0.380859375, "learning_rate": 0.00015337382441765756, "loss": 0.0554, "step": 7884 }, { "epoch": 1.9442800788954635, "grad_norm": 0.44140625, "learning_rate": 0.00015334471006767198, "loss": 0.0515, "step": 7886 }, { "epoch": 1.9447731755424065, "grad_norm": 0.373046875, "learning_rate": 0.00015331558939604047, "loss": 0.0561, "step": 7888 }, { "epoch": 1.945266272189349, "grad_norm": 0.40625, "learning_rate": 0.0001532864624062139, "loss": 0.0521, "step": 7890 }, { "epoch": 1.945759368836292, "grad_norm": 0.34765625, "learning_rate": 0.000153257329101644, "loss": 0.0545, "step": 7892 }, { "epoch": 1.9462524654832347, "grad_norm": 0.302734375, "learning_rate": 0.0001532281894857833, "loss": 0.0499, "step": 7894 }, { "epoch": 1.9467455621301775, "grad_norm": 0.33984375, "learning_rate": 0.00015319904356208498, "loss": 0.0516, "step": 7896 }, { "epoch": 1.9472386587771204, "grad_norm": 0.37109375, "learning_rate": 0.0001531698913340029, "loss": 0.0522, "step": 7898 }, { "epoch": 1.947731755424063, "grad_norm": 0.30078125, "learning_rate": 0.00015314073280499186, "loss": 0.0499, "step": 7900 }, { "epoch": 1.948224852071006, "grad_norm": 0.38671875, "learning_rate": 0.00015311156797850726, "loss": 0.0609, "step": 7902 }, { "epoch": 1.9487179487179487, "grad_norm": 0.306640625, "learning_rate": 0.00015308239685800538, "loss": 0.0496, "step": 7904 }, { "epoch": 1.9492110453648914, "grad_norm": 0.419921875, "learning_rate": 0.00015305321944694304, "loss": 0.0518, "step": 7906 }, { "epoch": 1.9497041420118344, "grad_norm": 0.296875, "learning_rate": 0.00015302403574877805, "loss": 0.0513, "step": 7908 }, { "epoch": 1.9501972386587771, "grad_norm": 0.376953125, "learning_rate": 0.00015299484576696871, "loss": 0.052, "step": 7910 }, { "epoch": 1.95069033530572, "grad_norm": 0.333984375, "learning_rate": 0.00015296564950497427, "loss": 0.0575, "step": 7912 }, { "epoch": 1.9511834319526629, "grad_norm": 0.345703125, "learning_rate": 0.00015293644696625464, "loss": 0.0576, "step": 7914 }, { "epoch": 1.9516765285996054, "grad_norm": 0.421875, "learning_rate": 0.00015290723815427053, "loss": 0.0561, "step": 7916 }, { "epoch": 1.9521696252465484, "grad_norm": 0.328125, "learning_rate": 0.00015287802307248325, "loss": 0.0534, "step": 7918 }, { "epoch": 1.952662721893491, "grad_norm": 0.38671875, "learning_rate": 0.00015284880172435506, "loss": 0.0575, "step": 7920 }, { "epoch": 1.9531558185404339, "grad_norm": 0.326171875, "learning_rate": 0.00015281957411334874, "loss": 0.0514, "step": 7922 }, { "epoch": 1.9536489151873768, "grad_norm": 0.361328125, "learning_rate": 0.000152790340242928, "loss": 0.0558, "step": 7924 }, { "epoch": 1.9541420118343196, "grad_norm": 0.427734375, "learning_rate": 0.0001527611001165572, "loss": 0.048, "step": 7926 }, { "epoch": 1.9546351084812623, "grad_norm": 0.5, "learning_rate": 0.00015273185373770146, "loss": 0.0514, "step": 7928 }, { "epoch": 1.9551282051282053, "grad_norm": 0.5703125, "learning_rate": 0.0001527026011098266, "loss": 0.0538, "step": 7930 }, { "epoch": 1.9556213017751478, "grad_norm": 0.421875, "learning_rate": 0.0001526733422363993, "loss": 0.0523, "step": 7932 }, { "epoch": 1.9561143984220908, "grad_norm": 0.453125, "learning_rate": 0.00015264407712088684, "loss": 0.0449, "step": 7934 }, { "epoch": 1.9566074950690335, "grad_norm": 0.55859375, "learning_rate": 0.00015261480576675733, "loss": 0.0527, "step": 7936 }, { "epoch": 1.9571005917159763, "grad_norm": 0.54296875, "learning_rate": 0.00015258552817747954, "loss": 0.0544, "step": 7938 }, { "epoch": 1.9575936883629192, "grad_norm": 0.37890625, "learning_rate": 0.00015255624435652311, "loss": 0.0519, "step": 7940 }, { "epoch": 1.9580867850098618, "grad_norm": 0.56640625, "learning_rate": 0.00015252695430735823, "loss": 0.0543, "step": 7942 }, { "epoch": 1.9585798816568047, "grad_norm": 0.419921875, "learning_rate": 0.000152497658033456, "loss": 0.0628, "step": 7944 }, { "epoch": 1.9590729783037475, "grad_norm": 0.421875, "learning_rate": 0.00015246835553828821, "loss": 0.0511, "step": 7946 }, { "epoch": 1.9595660749506902, "grad_norm": 0.37890625, "learning_rate": 0.00015243904682532734, "loss": 0.0544, "step": 7948 }, { "epoch": 1.9600591715976332, "grad_norm": 0.3359375, "learning_rate": 0.00015240973189804664, "loss": 0.0582, "step": 7950 }, { "epoch": 1.960552268244576, "grad_norm": 0.337890625, "learning_rate": 0.00015238041075992008, "loss": 0.0539, "step": 7952 }, { "epoch": 1.9610453648915187, "grad_norm": 0.388671875, "learning_rate": 0.0001523510834144224, "loss": 0.0543, "step": 7954 }, { "epoch": 1.9615384615384617, "grad_norm": 0.39453125, "learning_rate": 0.00015232174986502906, "loss": 0.0569, "step": 7956 }, { "epoch": 1.9620315581854042, "grad_norm": 0.3359375, "learning_rate": 0.0001522924101152162, "loss": 0.0505, "step": 7958 }, { "epoch": 1.9625246548323472, "grad_norm": 0.349609375, "learning_rate": 0.00015226306416846078, "loss": 0.0554, "step": 7960 }, { "epoch": 1.96301775147929, "grad_norm": 0.40625, "learning_rate": 0.00015223371202824048, "loss": 0.0538, "step": 7962 }, { "epoch": 1.9635108481262327, "grad_norm": 0.4453125, "learning_rate": 0.00015220435369803367, "loss": 0.0508, "step": 7964 }, { "epoch": 1.9640039447731756, "grad_norm": 0.494140625, "learning_rate": 0.00015217498918131944, "loss": 0.0564, "step": 7966 }, { "epoch": 1.9644970414201184, "grad_norm": 0.365234375, "learning_rate": 0.0001521456184815777, "loss": 0.0532, "step": 7968 }, { "epoch": 1.9649901380670611, "grad_norm": 0.341796875, "learning_rate": 0.00015211624160228906, "loss": 0.0526, "step": 7970 }, { "epoch": 1.965483234714004, "grad_norm": 0.34765625, "learning_rate": 0.00015208685854693475, "loss": 0.05, "step": 7972 }, { "epoch": 1.9659763313609466, "grad_norm": 0.37109375, "learning_rate": 0.00015205746931899693, "loss": 0.0535, "step": 7974 }, { "epoch": 1.9664694280078896, "grad_norm": 0.34765625, "learning_rate": 0.00015202807392195836, "loss": 0.0529, "step": 7976 }, { "epoch": 1.9669625246548323, "grad_norm": 0.376953125, "learning_rate": 0.0001519986723593025, "loss": 0.051, "step": 7978 }, { "epoch": 1.967455621301775, "grad_norm": 0.310546875, "learning_rate": 0.00015196926463451367, "loss": 0.0524, "step": 7980 }, { "epoch": 1.967948717948718, "grad_norm": 0.373046875, "learning_rate": 0.0001519398507510768, "loss": 0.0521, "step": 7982 }, { "epoch": 1.9684418145956606, "grad_norm": 0.3828125, "learning_rate": 0.0001519104307124776, "loss": 0.047, "step": 7984 }, { "epoch": 1.9689349112426036, "grad_norm": 0.3671875, "learning_rate": 0.00015188100452220255, "loss": 0.0531, "step": 7986 }, { "epoch": 1.9694280078895463, "grad_norm": 0.439453125, "learning_rate": 0.00015185157218373883, "loss": 0.0612, "step": 7988 }, { "epoch": 1.969921104536489, "grad_norm": 0.40234375, "learning_rate": 0.0001518221337005743, "loss": 0.0502, "step": 7990 }, { "epoch": 1.970414201183432, "grad_norm": 0.35546875, "learning_rate": 0.00015179268907619754, "loss": 0.0491, "step": 7992 }, { "epoch": 1.9709072978303748, "grad_norm": 0.44140625, "learning_rate": 0.000151763238314098, "loss": 0.0486, "step": 7994 }, { "epoch": 1.9714003944773175, "grad_norm": 0.470703125, "learning_rate": 0.00015173378141776568, "loss": 0.0542, "step": 7996 }, { "epoch": 1.9718934911242605, "grad_norm": 0.380859375, "learning_rate": 0.00015170431839069143, "loss": 0.0484, "step": 7998 }, { "epoch": 1.972386587771203, "grad_norm": 0.375, "learning_rate": 0.00015167484923636677, "loss": 0.054, "step": 8000 }, { "epoch": 1.972879684418146, "grad_norm": 0.3671875, "learning_rate": 0.00015164537395828396, "loss": 0.0456, "step": 8002 }, { "epoch": 1.9733727810650887, "grad_norm": 0.3984375, "learning_rate": 0.00015161589255993597, "loss": 0.0554, "step": 8004 }, { "epoch": 1.9738658777120315, "grad_norm": 0.376953125, "learning_rate": 0.00015158640504481658, "loss": 0.0566, "step": 8006 }, { "epoch": 1.9743589743589745, "grad_norm": 0.3515625, "learning_rate": 0.00015155691141642013, "loss": 0.0578, "step": 8008 }, { "epoch": 1.9748520710059172, "grad_norm": 0.376953125, "learning_rate": 0.0001515274116782418, "loss": 0.0505, "step": 8010 }, { "epoch": 1.97534516765286, "grad_norm": 0.310546875, "learning_rate": 0.0001514979058337775, "loss": 0.0516, "step": 8012 }, { "epoch": 1.975838264299803, "grad_norm": 0.330078125, "learning_rate": 0.00015146839388652388, "loss": 0.0531, "step": 8014 }, { "epoch": 1.9763313609467454, "grad_norm": 0.30859375, "learning_rate": 0.0001514388758399782, "loss": 0.0517, "step": 8016 }, { "epoch": 1.9768244575936884, "grad_norm": 0.48828125, "learning_rate": 0.00015140935169763851, "loss": 0.0477, "step": 8018 }, { "epoch": 1.9773175542406312, "grad_norm": 0.447265625, "learning_rate": 0.00015137982146300365, "loss": 0.0521, "step": 8020 }, { "epoch": 1.977810650887574, "grad_norm": 0.361328125, "learning_rate": 0.00015135028513957302, "loss": 0.0488, "step": 8022 }, { "epoch": 1.9783037475345169, "grad_norm": 0.33203125, "learning_rate": 0.00015132074273084694, "loss": 0.0481, "step": 8024 }, { "epoch": 1.9787968441814594, "grad_norm": 0.4921875, "learning_rate": 0.00015129119424032628, "loss": 0.0567, "step": 8026 }, { "epoch": 1.9792899408284024, "grad_norm": 0.36328125, "learning_rate": 0.00015126163967151275, "loss": 0.0529, "step": 8028 }, { "epoch": 1.9797830374753451, "grad_norm": 0.294921875, "learning_rate": 0.0001512320790279087, "loss": 0.0536, "step": 8030 }, { "epoch": 1.9802761341222879, "grad_norm": 0.34375, "learning_rate": 0.00015120251231301728, "loss": 0.0536, "step": 8032 }, { "epoch": 1.9807692307692308, "grad_norm": 0.36328125, "learning_rate": 0.00015117293953034223, "loss": 0.0569, "step": 8034 }, { "epoch": 1.9812623274161736, "grad_norm": 0.349609375, "learning_rate": 0.00015114336068338817, "loss": 0.0506, "step": 8036 }, { "epoch": 1.9817554240631163, "grad_norm": 0.423828125, "learning_rate": 0.00015111377577566027, "loss": 0.0506, "step": 8038 }, { "epoch": 1.9822485207100593, "grad_norm": 0.404296875, "learning_rate": 0.0001510841848106646, "loss": 0.0545, "step": 8040 }, { "epoch": 1.9827416173570018, "grad_norm": 0.439453125, "learning_rate": 0.00015105458779190778, "loss": 0.0497, "step": 8042 }, { "epoch": 1.9832347140039448, "grad_norm": 0.33984375, "learning_rate": 0.00015102498472289728, "loss": 0.057, "step": 8044 }, { "epoch": 1.9837278106508875, "grad_norm": 0.56640625, "learning_rate": 0.0001509953756071412, "loss": 0.0515, "step": 8046 }, { "epoch": 1.9842209072978303, "grad_norm": 0.48828125, "learning_rate": 0.00015096576044814838, "loss": 0.0593, "step": 8048 }, { "epoch": 1.9847140039447733, "grad_norm": 0.3828125, "learning_rate": 0.00015093613924942837, "loss": 0.0551, "step": 8050 }, { "epoch": 1.985207100591716, "grad_norm": 0.4765625, "learning_rate": 0.0001509065120144915, "loss": 0.0538, "step": 8052 }, { "epoch": 1.9857001972386588, "grad_norm": 0.498046875, "learning_rate": 0.00015087687874684873, "loss": 0.0523, "step": 8054 }, { "epoch": 1.9861932938856017, "grad_norm": 0.458984375, "learning_rate": 0.00015084723945001175, "loss": 0.0475, "step": 8056 }, { "epoch": 1.9866863905325443, "grad_norm": 0.427734375, "learning_rate": 0.000150817594127493, "loss": 0.0478, "step": 8058 }, { "epoch": 1.9871794871794872, "grad_norm": 0.361328125, "learning_rate": 0.00015078794278280565, "loss": 0.0543, "step": 8060 }, { "epoch": 1.98767258382643, "grad_norm": 0.416015625, "learning_rate": 0.00015075828541946348, "loss": 0.0481, "step": 8062 }, { "epoch": 1.9881656804733727, "grad_norm": 0.3828125, "learning_rate": 0.0001507286220409811, "loss": 0.0521, "step": 8064 }, { "epoch": 1.9886587771203157, "grad_norm": 0.349609375, "learning_rate": 0.0001506989526508738, "loss": 0.0552, "step": 8066 }, { "epoch": 1.9891518737672582, "grad_norm": 0.435546875, "learning_rate": 0.0001506692772526575, "loss": 0.0509, "step": 8068 }, { "epoch": 1.9896449704142012, "grad_norm": 0.392578125, "learning_rate": 0.000150639595849849, "loss": 0.055, "step": 8070 }, { "epoch": 1.990138067061144, "grad_norm": 0.357421875, "learning_rate": 0.0001506099084459656, "loss": 0.0515, "step": 8072 }, { "epoch": 1.9906311637080867, "grad_norm": 0.404296875, "learning_rate": 0.00015058021504452552, "loss": 0.0537, "step": 8074 }, { "epoch": 1.9911242603550297, "grad_norm": 0.30078125, "learning_rate": 0.00015055051564904755, "loss": 0.0539, "step": 8076 }, { "epoch": 1.9916173570019724, "grad_norm": 0.34375, "learning_rate": 0.00015052081026305122, "loss": 0.0519, "step": 8078 }, { "epoch": 1.9921104536489151, "grad_norm": 0.44140625, "learning_rate": 0.00015049109889005678, "loss": 0.0487, "step": 8080 }, { "epoch": 1.9926035502958581, "grad_norm": 0.4140625, "learning_rate": 0.00015046138153358528, "loss": 0.0495, "step": 8082 }, { "epoch": 1.9930966469428006, "grad_norm": 0.310546875, "learning_rate": 0.00015043165819715829, "loss": 0.0525, "step": 8084 }, { "epoch": 1.9935897435897436, "grad_norm": 0.5234375, "learning_rate": 0.0001504019288842982, "loss": 0.056, "step": 8086 }, { "epoch": 1.9940828402366864, "grad_norm": 0.3984375, "learning_rate": 0.00015037219359852818, "loss": 0.0561, "step": 8088 }, { "epoch": 1.994575936883629, "grad_norm": 0.486328125, "learning_rate": 0.00015034245234337197, "loss": 0.0471, "step": 8090 }, { "epoch": 1.995069033530572, "grad_norm": 0.4453125, "learning_rate": 0.00015031270512235405, "loss": 0.0588, "step": 8092 }, { "epoch": 1.9955621301775148, "grad_norm": 0.37890625, "learning_rate": 0.00015028295193899967, "loss": 0.0509, "step": 8094 }, { "epoch": 1.9960552268244576, "grad_norm": 0.412109375, "learning_rate": 0.00015025319279683474, "loss": 0.0495, "step": 8096 }, { "epoch": 1.9965483234714005, "grad_norm": 0.345703125, "learning_rate": 0.00015022342769938592, "loss": 0.0525, "step": 8098 }, { "epoch": 1.997041420118343, "grad_norm": 0.408203125, "learning_rate": 0.00015019365665018048, "loss": 0.054, "step": 8100 }, { "epoch": 1.997534516765286, "grad_norm": 0.345703125, "learning_rate": 0.0001501638796527465, "loss": 0.0524, "step": 8102 }, { "epoch": 1.9980276134122288, "grad_norm": 0.333984375, "learning_rate": 0.00015013409671061267, "loss": 0.0548, "step": 8104 }, { "epoch": 1.9985207100591715, "grad_norm": 0.4375, "learning_rate": 0.00015010430782730848, "loss": 0.0585, "step": 8106 }, { "epoch": 1.9990138067061145, "grad_norm": 0.2890625, "learning_rate": 0.00015007451300636405, "loss": 0.0493, "step": 8108 }, { "epoch": 1.9995069033530573, "grad_norm": 0.451171875, "learning_rate": 0.00015004471225131026, "loss": 0.0511, "step": 8110 }, { "epoch": 2.0, "grad_norm": 0.46484375, "learning_rate": 0.00015001490556567867, "loss": 0.0497, "step": 8112 }, { "epoch": 2.0, "eval_loss": 0.06181851029396057, "eval_runtime": 59.6015, "eval_samples_per_second": 267.258, "eval_steps_per_second": 2.097, "step": 8112 }, { "epoch": 2.000493096646943, "grad_norm": 0.37890625, "learning_rate": 0.00014998509295300155, "loss": 0.0378, "step": 8114 }, { "epoch": 2.0009861932938855, "grad_norm": 0.359375, "learning_rate": 0.0001499552744168118, "loss": 0.0324, "step": 8116 }, { "epoch": 2.0014792899408285, "grad_norm": 0.291015625, "learning_rate": 0.00014992544996064307, "loss": 0.0336, "step": 8118 }, { "epoch": 2.001972386587771, "grad_norm": 0.369140625, "learning_rate": 0.00014989561958802985, "loss": 0.0389, "step": 8120 }, { "epoch": 2.002465483234714, "grad_norm": 0.27734375, "learning_rate": 0.0001498657833025071, "loss": 0.035, "step": 8122 }, { "epoch": 2.002958579881657, "grad_norm": 0.341796875, "learning_rate": 0.0001498359411076106, "loss": 0.0316, "step": 8124 }, { "epoch": 2.0034516765285995, "grad_norm": 0.330078125, "learning_rate": 0.00014980609300687683, "loss": 0.0417, "step": 8126 }, { "epoch": 2.0039447731755424, "grad_norm": 0.5234375, "learning_rate": 0.00014977623900384299, "loss": 0.043, "step": 8128 }, { "epoch": 2.0044378698224854, "grad_norm": 0.294921875, "learning_rate": 0.00014974637910204687, "loss": 0.039, "step": 8130 }, { "epoch": 2.004930966469428, "grad_norm": 0.271484375, "learning_rate": 0.0001497165133050271, "loss": 0.0374, "step": 8132 }, { "epoch": 2.005424063116371, "grad_norm": 0.39453125, "learning_rate": 0.0001496866416163229, "loss": 0.0362, "step": 8134 }, { "epoch": 2.0059171597633134, "grad_norm": 0.251953125, "learning_rate": 0.00014965676403947428, "loss": 0.0391, "step": 8136 }, { "epoch": 2.0064102564102564, "grad_norm": 0.3046875, "learning_rate": 0.00014962688057802188, "loss": 0.0361, "step": 8138 }, { "epoch": 2.0069033530571994, "grad_norm": 0.423828125, "learning_rate": 0.00014959699123550703, "loss": 0.035, "step": 8140 }, { "epoch": 2.007396449704142, "grad_norm": 0.38671875, "learning_rate": 0.0001495670960154718, "loss": 0.0358, "step": 8142 }, { "epoch": 2.007889546351085, "grad_norm": 0.244140625, "learning_rate": 0.00014953719492145898, "loss": 0.0364, "step": 8144 }, { "epoch": 2.008382642998028, "grad_norm": 0.2578125, "learning_rate": 0.00014950728795701194, "loss": 0.0374, "step": 8146 }, { "epoch": 2.0088757396449703, "grad_norm": 0.365234375, "learning_rate": 0.00014947737512567487, "loss": 0.0358, "step": 8148 }, { "epoch": 2.0093688362919133, "grad_norm": 0.349609375, "learning_rate": 0.00014944745643099262, "loss": 0.0381, "step": 8150 }, { "epoch": 2.009861932938856, "grad_norm": 0.294921875, "learning_rate": 0.0001494175318765107, "loss": 0.0387, "step": 8152 }, { "epoch": 2.010355029585799, "grad_norm": 0.302734375, "learning_rate": 0.00014938760146577534, "loss": 0.0358, "step": 8154 }, { "epoch": 2.010848126232742, "grad_norm": 0.37890625, "learning_rate": 0.00014935766520233348, "loss": 0.0327, "step": 8156 }, { "epoch": 2.0113412228796843, "grad_norm": 0.373046875, "learning_rate": 0.00014932772308973266, "loss": 0.0398, "step": 8158 }, { "epoch": 2.0118343195266273, "grad_norm": 0.3359375, "learning_rate": 0.0001492977751315213, "loss": 0.0347, "step": 8160 }, { "epoch": 2.01232741617357, "grad_norm": 0.2431640625, "learning_rate": 0.0001492678213312483, "loss": 0.0337, "step": 8162 }, { "epoch": 2.0128205128205128, "grad_norm": 0.25390625, "learning_rate": 0.0001492378616924634, "loss": 0.036, "step": 8164 }, { "epoch": 2.0133136094674557, "grad_norm": 0.46875, "learning_rate": 0.000149207896218717, "loss": 0.0348, "step": 8166 }, { "epoch": 2.0138067061143983, "grad_norm": 0.28515625, "learning_rate": 0.00014917792491356017, "loss": 0.0355, "step": 8168 }, { "epoch": 2.0142998027613412, "grad_norm": 0.2490234375, "learning_rate": 0.00014914794778054467, "loss": 0.0337, "step": 8170 }, { "epoch": 2.014792899408284, "grad_norm": 0.27734375, "learning_rate": 0.0001491179648232229, "loss": 0.0357, "step": 8172 }, { "epoch": 2.0152859960552267, "grad_norm": 0.279296875, "learning_rate": 0.0001490879760451481, "loss": 0.0416, "step": 8174 }, { "epoch": 2.0157790927021697, "grad_norm": 0.265625, "learning_rate": 0.00014905798144987403, "loss": 0.0351, "step": 8176 }, { "epoch": 2.0162721893491122, "grad_norm": 0.2734375, "learning_rate": 0.00014902798104095527, "loss": 0.0654, "step": 8178 }, { "epoch": 2.016765285996055, "grad_norm": 0.31640625, "learning_rate": 0.00014899797482194704, "loss": 0.0301, "step": 8180 }, { "epoch": 2.017258382642998, "grad_norm": 0.3046875, "learning_rate": 0.00014896796279640522, "loss": 0.0332, "step": 8182 }, { "epoch": 2.0177514792899407, "grad_norm": 0.365234375, "learning_rate": 0.0001489379449678864, "loss": 0.0392, "step": 8184 }, { "epoch": 2.0182445759368837, "grad_norm": 0.2734375, "learning_rate": 0.00014890792133994787, "loss": 0.0356, "step": 8186 }, { "epoch": 2.0187376725838266, "grad_norm": 0.349609375, "learning_rate": 0.00014887789191614758, "loss": 0.0341, "step": 8188 }, { "epoch": 2.019230769230769, "grad_norm": 0.26171875, "learning_rate": 0.0001488478567000442, "loss": 0.0351, "step": 8190 }, { "epoch": 2.019723865877712, "grad_norm": 0.33984375, "learning_rate": 0.00014881781569519705, "loss": 0.0428, "step": 8192 }, { "epoch": 2.0202169625246547, "grad_norm": 0.498046875, "learning_rate": 0.0001487877689051662, "loss": 0.0348, "step": 8194 }, { "epoch": 2.0207100591715976, "grad_norm": 0.275390625, "learning_rate": 0.00014875771633351238, "loss": 0.0396, "step": 8196 }, { "epoch": 2.0212031558185406, "grad_norm": 0.255859375, "learning_rate": 0.0001487276579837969, "loss": 0.0322, "step": 8198 }, { "epoch": 2.021696252465483, "grad_norm": 0.4609375, "learning_rate": 0.0001486975938595819, "loss": 0.035, "step": 8200 }, { "epoch": 2.022189349112426, "grad_norm": 0.279296875, "learning_rate": 0.00014866752396443014, "loss": 0.0351, "step": 8202 }, { "epoch": 2.0226824457593686, "grad_norm": 0.29296875, "learning_rate": 0.00014863744830190503, "loss": 0.032, "step": 8204 }, { "epoch": 2.0231755424063116, "grad_norm": 0.44921875, "learning_rate": 0.00014860736687557076, "loss": 0.0421, "step": 8206 }, { "epoch": 2.0236686390532546, "grad_norm": 0.32421875, "learning_rate": 0.00014857727968899212, "loss": 0.0378, "step": 8208 }, { "epoch": 2.024161735700197, "grad_norm": 0.44140625, "learning_rate": 0.00014854718674573464, "loss": 0.0352, "step": 8210 }, { "epoch": 2.02465483234714, "grad_norm": 0.27734375, "learning_rate": 0.00014851708804936449, "loss": 0.0362, "step": 8212 }, { "epoch": 2.025147928994083, "grad_norm": 0.26171875, "learning_rate": 0.00014848698360344847, "loss": 0.0346, "step": 8214 }, { "epoch": 2.0256410256410255, "grad_norm": 0.328125, "learning_rate": 0.0001484568734115542, "loss": 0.0335, "step": 8216 }, { "epoch": 2.0261341222879685, "grad_norm": 0.318359375, "learning_rate": 0.0001484267574772499, "loss": 0.0368, "step": 8218 }, { "epoch": 2.026627218934911, "grad_norm": 0.40625, "learning_rate": 0.00014839663580410445, "loss": 0.0349, "step": 8220 }, { "epoch": 2.027120315581854, "grad_norm": 0.275390625, "learning_rate": 0.00014836650839568743, "loss": 0.0318, "step": 8222 }, { "epoch": 2.027613412228797, "grad_norm": 0.30078125, "learning_rate": 0.00014833637525556913, "loss": 0.0367, "step": 8224 }, { "epoch": 2.0281065088757395, "grad_norm": 0.341796875, "learning_rate": 0.00014830623638732048, "loss": 0.0344, "step": 8226 }, { "epoch": 2.0285996055226825, "grad_norm": 0.279296875, "learning_rate": 0.00014827609179451313, "loss": 0.041, "step": 8228 }, { "epoch": 2.0290927021696255, "grad_norm": 0.32421875, "learning_rate": 0.00014824594148071934, "loss": 0.031, "step": 8230 }, { "epoch": 2.029585798816568, "grad_norm": 0.28515625, "learning_rate": 0.00014821578544951212, "loss": 0.0333, "step": 8232 }, { "epoch": 2.030078895463511, "grad_norm": 0.271484375, "learning_rate": 0.00014818562370446511, "loss": 0.0339, "step": 8234 }, { "epoch": 2.0305719921104535, "grad_norm": 0.35546875, "learning_rate": 0.0001481554562491527, "loss": 0.0373, "step": 8236 }, { "epoch": 2.0310650887573964, "grad_norm": 0.28125, "learning_rate": 0.00014812528308714984, "loss": 0.0379, "step": 8238 }, { "epoch": 2.0315581854043394, "grad_norm": 0.294921875, "learning_rate": 0.00014809510422203223, "loss": 0.0314, "step": 8240 }, { "epoch": 2.032051282051282, "grad_norm": 0.353515625, "learning_rate": 0.00014806491965737624, "loss": 0.0351, "step": 8242 }, { "epoch": 2.032544378698225, "grad_norm": 0.353515625, "learning_rate": 0.0001480347293967589, "loss": 0.0367, "step": 8244 }, { "epoch": 2.033037475345168, "grad_norm": 0.365234375, "learning_rate": 0.00014800453344375794, "loss": 0.0337, "step": 8246 }, { "epoch": 2.0335305719921104, "grad_norm": 0.322265625, "learning_rate": 0.00014797433180195176, "loss": 0.0353, "step": 8248 }, { "epoch": 2.0340236686390534, "grad_norm": 0.3359375, "learning_rate": 0.0001479441244749194, "loss": 0.0332, "step": 8250 }, { "epoch": 2.034516765285996, "grad_norm": 0.326171875, "learning_rate": 0.0001479139114662406, "loss": 0.0368, "step": 8252 }, { "epoch": 2.035009861932939, "grad_norm": 0.271484375, "learning_rate": 0.00014788369277949576, "loss": 0.035, "step": 8254 }, { "epoch": 2.035502958579882, "grad_norm": 0.31640625, "learning_rate": 0.000147853468418266, "loss": 0.032, "step": 8256 }, { "epoch": 2.0359960552268244, "grad_norm": 0.2578125, "learning_rate": 0.00014782323838613304, "loss": 0.0345, "step": 8258 }, { "epoch": 2.0364891518737673, "grad_norm": 0.271484375, "learning_rate": 0.00014779300268667932, "loss": 0.0333, "step": 8260 }, { "epoch": 2.03698224852071, "grad_norm": 0.2578125, "learning_rate": 0.00014776276132348793, "loss": 0.0367, "step": 8262 }, { "epoch": 2.037475345167653, "grad_norm": 0.248046875, "learning_rate": 0.00014773251430014268, "loss": 0.032, "step": 8264 }, { "epoch": 2.037968441814596, "grad_norm": 0.255859375, "learning_rate": 0.00014770226162022797, "loss": 0.0293, "step": 8266 }, { "epoch": 2.0384615384615383, "grad_norm": 0.271484375, "learning_rate": 0.00014767200328732894, "loss": 0.0335, "step": 8268 }, { "epoch": 2.0389546351084813, "grad_norm": 0.302734375, "learning_rate": 0.00014764173930503132, "loss": 0.0318, "step": 8270 }, { "epoch": 2.0394477317554243, "grad_norm": 0.296875, "learning_rate": 0.00014761146967692162, "loss": 0.0295, "step": 8272 }, { "epoch": 2.039940828402367, "grad_norm": 0.322265625, "learning_rate": 0.0001475811944065869, "loss": 0.0364, "step": 8274 }, { "epoch": 2.0404339250493098, "grad_norm": 0.251953125, "learning_rate": 0.00014755091349761502, "loss": 0.032, "step": 8276 }, { "epoch": 2.0409270216962523, "grad_norm": 0.294921875, "learning_rate": 0.0001475206269535944, "loss": 0.0306, "step": 8278 }, { "epoch": 2.0414201183431953, "grad_norm": 0.3046875, "learning_rate": 0.00014749033477811412, "loss": 0.0375, "step": 8280 }, { "epoch": 2.0419132149901382, "grad_norm": 0.287109375, "learning_rate": 0.00014746003697476404, "loss": 0.0317, "step": 8282 }, { "epoch": 2.0424063116370808, "grad_norm": 0.345703125, "learning_rate": 0.00014742973354713463, "loss": 0.0333, "step": 8284 }, { "epoch": 2.0428994082840237, "grad_norm": 0.302734375, "learning_rate": 0.00014739942449881695, "loss": 0.0311, "step": 8286 }, { "epoch": 2.0433925049309662, "grad_norm": 0.2734375, "learning_rate": 0.00014736910983340282, "loss": 0.0369, "step": 8288 }, { "epoch": 2.043885601577909, "grad_norm": 0.32421875, "learning_rate": 0.0001473387895544847, "loss": 0.0339, "step": 8290 }, { "epoch": 2.044378698224852, "grad_norm": 0.259765625, "learning_rate": 0.00014730846366565575, "loss": 0.0338, "step": 8292 }, { "epoch": 2.0448717948717947, "grad_norm": 0.287109375, "learning_rate": 0.00014727813217050965, "loss": 0.0343, "step": 8294 }, { "epoch": 2.0453648915187377, "grad_norm": 0.27734375, "learning_rate": 0.00014724779507264098, "loss": 0.0346, "step": 8296 }, { "epoch": 2.0458579881656807, "grad_norm": 0.306640625, "learning_rate": 0.00014721745237564475, "loss": 0.0353, "step": 8298 }, { "epoch": 2.046351084812623, "grad_norm": 0.23828125, "learning_rate": 0.0001471871040831168, "loss": 0.0341, "step": 8300 }, { "epoch": 2.046844181459566, "grad_norm": 0.337890625, "learning_rate": 0.0001471567501986536, "loss": 0.035, "step": 8302 }, { "epoch": 2.0473372781065087, "grad_norm": 0.314453125, "learning_rate": 0.00014712639072585215, "loss": 0.0326, "step": 8304 }, { "epoch": 2.0478303747534516, "grad_norm": 0.30078125, "learning_rate": 0.00014709602566831032, "loss": 0.0309, "step": 8306 }, { "epoch": 2.0483234714003946, "grad_norm": 0.27734375, "learning_rate": 0.00014706565502962648, "loss": 0.0363, "step": 8308 }, { "epoch": 2.048816568047337, "grad_norm": 0.26171875, "learning_rate": 0.00014703527881339973, "loss": 0.0298, "step": 8310 }, { "epoch": 2.04930966469428, "grad_norm": 0.3203125, "learning_rate": 0.00014700489702322984, "loss": 0.0329, "step": 8312 }, { "epoch": 2.049802761341223, "grad_norm": 0.37890625, "learning_rate": 0.00014697450966271721, "loss": 0.0342, "step": 8314 }, { "epoch": 2.0502958579881656, "grad_norm": 0.349609375, "learning_rate": 0.0001469441167354629, "loss": 0.0327, "step": 8316 }, { "epoch": 2.0507889546351086, "grad_norm": 0.291015625, "learning_rate": 0.0001469137182450687, "loss": 0.0342, "step": 8318 }, { "epoch": 2.051282051282051, "grad_norm": 0.275390625, "learning_rate": 0.00014688331419513692, "loss": 0.034, "step": 8320 }, { "epoch": 2.051775147928994, "grad_norm": 0.27734375, "learning_rate": 0.00014685290458927065, "loss": 0.0329, "step": 8322 }, { "epoch": 2.052268244575937, "grad_norm": 0.369140625, "learning_rate": 0.0001468224894310736, "loss": 0.0337, "step": 8324 }, { "epoch": 2.0527613412228796, "grad_norm": 0.3515625, "learning_rate": 0.00014679206872415012, "loss": 0.0369, "step": 8326 }, { "epoch": 2.0532544378698225, "grad_norm": 0.322265625, "learning_rate": 0.00014676164247210525, "loss": 0.0337, "step": 8328 }, { "epoch": 2.0537475345167655, "grad_norm": 0.44140625, "learning_rate": 0.00014673121067854464, "loss": 0.0311, "step": 8330 }, { "epoch": 2.054240631163708, "grad_norm": 0.34765625, "learning_rate": 0.00014670077334707468, "loss": 0.033, "step": 8332 }, { "epoch": 2.054733727810651, "grad_norm": 0.376953125, "learning_rate": 0.00014667033048130232, "loss": 0.0346, "step": 8334 }, { "epoch": 2.0552268244575935, "grad_norm": 0.41796875, "learning_rate": 0.00014663988208483524, "loss": 0.0306, "step": 8336 }, { "epoch": 2.0557199211045365, "grad_norm": 0.275390625, "learning_rate": 0.0001466094281612817, "loss": 0.0332, "step": 8338 }, { "epoch": 2.0562130177514795, "grad_norm": 0.263671875, "learning_rate": 0.0001465789687142507, "loss": 0.0363, "step": 8340 }, { "epoch": 2.056706114398422, "grad_norm": 0.23046875, "learning_rate": 0.00014654850374735186, "loss": 0.034, "step": 8342 }, { "epoch": 2.057199211045365, "grad_norm": 0.259765625, "learning_rate": 0.00014651803326419543, "loss": 0.0351, "step": 8344 }, { "epoch": 2.0576923076923075, "grad_norm": 0.263671875, "learning_rate": 0.00014648755726839232, "loss": 0.0333, "step": 8346 }, { "epoch": 2.0581854043392505, "grad_norm": 0.228515625, "learning_rate": 0.00014645707576355413, "loss": 0.0312, "step": 8348 }, { "epoch": 2.0586785009861934, "grad_norm": 0.2158203125, "learning_rate": 0.0001464265887532931, "loss": 0.0313, "step": 8350 }, { "epoch": 2.059171597633136, "grad_norm": 0.232421875, "learning_rate": 0.00014639609624122205, "loss": 0.0305, "step": 8352 }, { "epoch": 2.059664694280079, "grad_norm": 0.224609375, "learning_rate": 0.00014636559823095457, "loss": 0.0327, "step": 8354 }, { "epoch": 2.060157790927022, "grad_norm": 0.283203125, "learning_rate": 0.00014633509472610482, "loss": 0.0339, "step": 8356 }, { "epoch": 2.0606508875739644, "grad_norm": 0.248046875, "learning_rate": 0.00014630458573028764, "loss": 0.0289, "step": 8358 }, { "epoch": 2.0611439842209074, "grad_norm": 0.31640625, "learning_rate": 0.00014627407124711853, "loss": 0.0342, "step": 8360 }, { "epoch": 2.06163708086785, "grad_norm": 0.28125, "learning_rate": 0.00014624355128021362, "loss": 0.0339, "step": 8362 }, { "epoch": 2.062130177514793, "grad_norm": 0.3359375, "learning_rate": 0.0001462130258331897, "loss": 0.0366, "step": 8364 }, { "epoch": 2.062623274161736, "grad_norm": 0.279296875, "learning_rate": 0.00014618249490966418, "loss": 0.0336, "step": 8366 }, { "epoch": 2.0631163708086784, "grad_norm": 0.248046875, "learning_rate": 0.0001461519585132552, "loss": 0.0309, "step": 8368 }, { "epoch": 2.0636094674556213, "grad_norm": 0.330078125, "learning_rate": 0.00014612141664758138, "loss": 0.0319, "step": 8370 }, { "epoch": 2.064102564102564, "grad_norm": 0.28515625, "learning_rate": 0.00014609086931626224, "loss": 0.0345, "step": 8372 }, { "epoch": 2.064595660749507, "grad_norm": 0.294921875, "learning_rate": 0.0001460603165229177, "loss": 0.03, "step": 8374 }, { "epoch": 2.06508875739645, "grad_norm": 0.404296875, "learning_rate": 0.00014602975827116853, "loss": 0.0357, "step": 8376 }, { "epoch": 2.0655818540433923, "grad_norm": 0.26953125, "learning_rate": 0.000145999194564636, "loss": 0.0339, "step": 8378 }, { "epoch": 2.0660749506903353, "grad_norm": 0.275390625, "learning_rate": 0.00014596862540694206, "loss": 0.0287, "step": 8380 }, { "epoch": 2.0665680473372783, "grad_norm": 0.291015625, "learning_rate": 0.00014593805080170938, "loss": 0.0323, "step": 8382 }, { "epoch": 2.067061143984221, "grad_norm": 0.294921875, "learning_rate": 0.00014590747075256122, "loss": 0.0356, "step": 8384 }, { "epoch": 2.0675542406311638, "grad_norm": 0.2353515625, "learning_rate": 0.00014587688526312143, "loss": 0.0299, "step": 8386 }, { "epoch": 2.0680473372781063, "grad_norm": 0.283203125, "learning_rate": 0.00014584629433701455, "loss": 0.0333, "step": 8388 }, { "epoch": 2.0685404339250493, "grad_norm": 0.2373046875, "learning_rate": 0.0001458156979778659, "loss": 0.0365, "step": 8390 }, { "epoch": 2.0690335305719922, "grad_norm": 0.244140625, "learning_rate": 0.00014578509618930118, "loss": 0.0309, "step": 8392 }, { "epoch": 2.0695266272189348, "grad_norm": 0.244140625, "learning_rate": 0.00014575448897494693, "loss": 0.0314, "step": 8394 }, { "epoch": 2.0700197238658777, "grad_norm": 0.255859375, "learning_rate": 0.0001457238763384303, "loss": 0.0299, "step": 8396 }, { "epoch": 2.0705128205128207, "grad_norm": 0.2470703125, "learning_rate": 0.00014569325828337902, "loss": 0.0316, "step": 8398 }, { "epoch": 2.0710059171597632, "grad_norm": 0.26953125, "learning_rate": 0.00014566263481342154, "loss": 0.0321, "step": 8400 }, { "epoch": 2.071499013806706, "grad_norm": 0.2734375, "learning_rate": 0.00014563200593218687, "loss": 0.0353, "step": 8402 }, { "epoch": 2.0719921104536487, "grad_norm": 0.251953125, "learning_rate": 0.00014560137164330473, "loss": 0.0328, "step": 8404 }, { "epoch": 2.0724852071005917, "grad_norm": 0.255859375, "learning_rate": 0.00014557073195040543, "loss": 0.0329, "step": 8406 }, { "epoch": 2.0729783037475347, "grad_norm": 0.326171875, "learning_rate": 0.00014554008685711999, "loss": 0.0358, "step": 8408 }, { "epoch": 2.073471400394477, "grad_norm": 0.2353515625, "learning_rate": 0.00014550943636707994, "loss": 0.0311, "step": 8410 }, { "epoch": 2.07396449704142, "grad_norm": 0.248046875, "learning_rate": 0.0001454787804839176, "loss": 0.0324, "step": 8412 }, { "epoch": 2.074457593688363, "grad_norm": 0.25, "learning_rate": 0.00014544811921126587, "loss": 0.0312, "step": 8414 }, { "epoch": 2.0749506903353057, "grad_norm": 0.2578125, "learning_rate": 0.0001454174525527583, "loss": 0.0338, "step": 8416 }, { "epoch": 2.0754437869822486, "grad_norm": 0.330078125, "learning_rate": 0.00014538678051202898, "loss": 0.0289, "step": 8418 }, { "epoch": 2.075936883629191, "grad_norm": 0.34765625, "learning_rate": 0.00014535610309271277, "loss": 0.0338, "step": 8420 }, { "epoch": 2.076429980276134, "grad_norm": 0.306640625, "learning_rate": 0.0001453254202984451, "loss": 0.0341, "step": 8422 }, { "epoch": 2.076923076923077, "grad_norm": 0.328125, "learning_rate": 0.00014529473213286206, "loss": 0.0315, "step": 8424 }, { "epoch": 2.0774161735700196, "grad_norm": 0.228515625, "learning_rate": 0.0001452640385996004, "loss": 0.0354, "step": 8426 }, { "epoch": 2.0779092702169626, "grad_norm": 0.357421875, "learning_rate": 0.0001452333397022974, "loss": 0.0313, "step": 8428 }, { "epoch": 2.078402366863905, "grad_norm": 0.267578125, "learning_rate": 0.0001452026354445911, "loss": 0.0337, "step": 8430 }, { "epoch": 2.078895463510848, "grad_norm": 0.279296875, "learning_rate": 0.00014517192583012013, "loss": 0.0325, "step": 8432 }, { "epoch": 2.079388560157791, "grad_norm": 0.25390625, "learning_rate": 0.00014514121086252376, "loss": 0.0325, "step": 8434 }, { "epoch": 2.0798816568047336, "grad_norm": 0.2470703125, "learning_rate": 0.00014511049054544186, "loss": 0.032, "step": 8436 }, { "epoch": 2.0803747534516766, "grad_norm": 0.25, "learning_rate": 0.00014507976488251496, "loss": 0.032, "step": 8438 }, { "epoch": 2.0808678500986195, "grad_norm": 0.251953125, "learning_rate": 0.00014504903387738423, "loss": 0.0298, "step": 8440 }, { "epoch": 2.081360946745562, "grad_norm": 0.283203125, "learning_rate": 0.00014501829753369144, "loss": 0.0314, "step": 8442 }, { "epoch": 2.081854043392505, "grad_norm": 0.26171875, "learning_rate": 0.00014498755585507907, "loss": 0.0339, "step": 8444 }, { "epoch": 2.0823471400394475, "grad_norm": 0.291015625, "learning_rate": 0.00014495680884519017, "loss": 0.0318, "step": 8446 }, { "epoch": 2.0828402366863905, "grad_norm": 0.28515625, "learning_rate": 0.00014492605650766834, "loss": 0.0316, "step": 8448 }, { "epoch": 2.0833333333333335, "grad_norm": 0.26171875, "learning_rate": 0.00014489529884615808, "loss": 0.0314, "step": 8450 }, { "epoch": 2.083826429980276, "grad_norm": 0.26171875, "learning_rate": 0.00014486453586430418, "loss": 0.0352, "step": 8452 }, { "epoch": 2.084319526627219, "grad_norm": 0.263671875, "learning_rate": 0.00014483376756575228, "loss": 0.0322, "step": 8454 }, { "epoch": 2.084812623274162, "grad_norm": 0.25390625, "learning_rate": 0.00014480299395414862, "loss": 0.0353, "step": 8456 }, { "epoch": 2.0853057199211045, "grad_norm": 0.2734375, "learning_rate": 0.00014477221503314005, "loss": 0.0348, "step": 8458 }, { "epoch": 2.0857988165680474, "grad_norm": 0.34375, "learning_rate": 0.00014474143080637402, "loss": 0.0335, "step": 8460 }, { "epoch": 2.08629191321499, "grad_norm": 0.255859375, "learning_rate": 0.0001447106412774986, "loss": 0.0342, "step": 8462 }, { "epoch": 2.086785009861933, "grad_norm": 0.328125, "learning_rate": 0.00014467984645016258, "loss": 0.0313, "step": 8464 }, { "epoch": 2.087278106508876, "grad_norm": 0.23828125, "learning_rate": 0.0001446490463280153, "loss": 0.0306, "step": 8466 }, { "epoch": 2.0877712031558184, "grad_norm": 0.23828125, "learning_rate": 0.00014461824091470673, "loss": 0.0332, "step": 8468 }, { "epoch": 2.0882642998027614, "grad_norm": 0.240234375, "learning_rate": 0.0001445874302138875, "loss": 0.0281, "step": 8470 }, { "epoch": 2.088757396449704, "grad_norm": 0.26171875, "learning_rate": 0.00014455661422920887, "loss": 0.0347, "step": 8472 }, { "epoch": 2.089250493096647, "grad_norm": 0.423828125, "learning_rate": 0.00014452579296432265, "loss": 0.0354, "step": 8474 }, { "epoch": 2.08974358974359, "grad_norm": 0.3671875, "learning_rate": 0.00014449496642288137, "loss": 0.0309, "step": 8476 }, { "epoch": 2.0902366863905324, "grad_norm": 0.25, "learning_rate": 0.00014446413460853812, "loss": 0.0304, "step": 8478 }, { "epoch": 2.0907297830374754, "grad_norm": 0.296875, "learning_rate": 0.00014443329752494667, "loss": 0.033, "step": 8480 }, { "epoch": 2.0912228796844183, "grad_norm": 0.28515625, "learning_rate": 0.0001444024551757614, "loss": 0.0321, "step": 8482 }, { "epoch": 2.091715976331361, "grad_norm": 0.35546875, "learning_rate": 0.0001443716075646373, "loss": 0.0288, "step": 8484 }, { "epoch": 2.092209072978304, "grad_norm": 0.390625, "learning_rate": 0.00014434075469522994, "loss": 0.0299, "step": 8486 }, { "epoch": 2.0927021696252464, "grad_norm": 0.302734375, "learning_rate": 0.00014430989657119558, "loss": 0.0302, "step": 8488 }, { "epoch": 2.0931952662721893, "grad_norm": 0.302734375, "learning_rate": 0.00014427903319619108, "loss": 0.0314, "step": 8490 }, { "epoch": 2.0936883629191323, "grad_norm": 0.2890625, "learning_rate": 0.0001442481645738739, "loss": 0.0322, "step": 8492 }, { "epoch": 2.094181459566075, "grad_norm": 0.349609375, "learning_rate": 0.0001442172907079022, "loss": 0.0335, "step": 8494 }, { "epoch": 2.094674556213018, "grad_norm": 0.384765625, "learning_rate": 0.00014418641160193467, "loss": 0.035, "step": 8496 }, { "epoch": 2.0951676528599608, "grad_norm": 0.2890625, "learning_rate": 0.00014415552725963065, "loss": 0.0309, "step": 8498 }, { "epoch": 2.0956607495069033, "grad_norm": 0.2490234375, "learning_rate": 0.00014412463768465018, "loss": 0.0308, "step": 8500 }, { "epoch": 2.0961538461538463, "grad_norm": 0.369140625, "learning_rate": 0.0001440937428806537, "loss": 0.0347, "step": 8502 }, { "epoch": 2.096646942800789, "grad_norm": 0.287109375, "learning_rate": 0.0001440628428513025, "loss": 0.0417, "step": 8504 }, { "epoch": 2.0971400394477318, "grad_norm": 0.404296875, "learning_rate": 0.00014403193760025845, "loss": 0.0313, "step": 8506 }, { "epoch": 2.0976331360946747, "grad_norm": 0.30859375, "learning_rate": 0.00014400102713118393, "loss": 0.0321, "step": 8508 }, { "epoch": 2.0981262327416172, "grad_norm": 0.326171875, "learning_rate": 0.000143970111447742, "loss": 0.0371, "step": 8510 }, { "epoch": 2.09861932938856, "grad_norm": 0.26171875, "learning_rate": 0.00014393919055359644, "loss": 0.0382, "step": 8512 }, { "epoch": 2.0991124260355027, "grad_norm": 0.31640625, "learning_rate": 0.0001439082644524114, "loss": 0.0305, "step": 8514 }, { "epoch": 2.0996055226824457, "grad_norm": 0.306640625, "learning_rate": 0.00014387733314785193, "loss": 0.0294, "step": 8516 }, { "epoch": 2.1000986193293887, "grad_norm": 0.3046875, "learning_rate": 0.00014384639664358345, "loss": 0.0363, "step": 8518 }, { "epoch": 2.100591715976331, "grad_norm": 0.32421875, "learning_rate": 0.0001438154549432722, "loss": 0.0334, "step": 8520 }, { "epoch": 2.101084812623274, "grad_norm": 0.30859375, "learning_rate": 0.00014378450805058488, "loss": 0.0302, "step": 8522 }, { "epoch": 2.101577909270217, "grad_norm": 0.345703125, "learning_rate": 0.0001437535559691889, "loss": 0.0297, "step": 8524 }, { "epoch": 2.1020710059171597, "grad_norm": 0.271484375, "learning_rate": 0.00014372259870275227, "loss": 0.0325, "step": 8526 }, { "epoch": 2.1025641025641026, "grad_norm": 0.263671875, "learning_rate": 0.00014369163625494355, "loss": 0.0382, "step": 8528 }, { "epoch": 2.103057199211045, "grad_norm": 0.267578125, "learning_rate": 0.000143660668629432, "loss": 0.0319, "step": 8530 }, { "epoch": 2.103550295857988, "grad_norm": 0.2392578125, "learning_rate": 0.00014362969582988746, "loss": 0.0308, "step": 8532 }, { "epoch": 2.104043392504931, "grad_norm": 0.353515625, "learning_rate": 0.00014359871785998035, "loss": 0.0425, "step": 8534 }, { "epoch": 2.1045364891518736, "grad_norm": 0.341796875, "learning_rate": 0.00014356773472338173, "loss": 0.0419, "step": 8536 }, { "epoch": 2.1050295857988166, "grad_norm": 0.2412109375, "learning_rate": 0.00014353674642376334, "loss": 0.0312, "step": 8538 }, { "epoch": 2.1055226824457596, "grad_norm": 0.248046875, "learning_rate": 0.0001435057529647974, "loss": 0.028, "step": 8540 }, { "epoch": 2.106015779092702, "grad_norm": 0.419921875, "learning_rate": 0.00014347475435015685, "loss": 0.0394, "step": 8542 }, { "epoch": 2.106508875739645, "grad_norm": 0.359375, "learning_rate": 0.00014344375058351514, "loss": 0.0409, "step": 8544 }, { "epoch": 2.1070019723865876, "grad_norm": 0.296875, "learning_rate": 0.00014341274166854644, "loss": 0.0337, "step": 8546 }, { "epoch": 2.1074950690335306, "grad_norm": 0.2734375, "learning_rate": 0.00014338172760892546, "loss": 0.0309, "step": 8548 }, { "epoch": 2.1079881656804735, "grad_norm": 0.30078125, "learning_rate": 0.00014335070840832756, "loss": 0.0413, "step": 8550 }, { "epoch": 2.108481262327416, "grad_norm": 0.333984375, "learning_rate": 0.00014331968407042868, "loss": 0.0404, "step": 8552 }, { "epoch": 2.108974358974359, "grad_norm": 0.330078125, "learning_rate": 0.00014328865459890537, "loss": 0.0336, "step": 8554 }, { "epoch": 2.109467455621302, "grad_norm": 0.35546875, "learning_rate": 0.00014325761999743478, "loss": 0.0353, "step": 8556 }, { "epoch": 2.1099605522682445, "grad_norm": 0.29296875, "learning_rate": 0.00014322658026969472, "loss": 0.0424, "step": 8558 }, { "epoch": 2.1104536489151875, "grad_norm": 0.431640625, "learning_rate": 0.00014319553541936354, "loss": 0.0496, "step": 8560 }, { "epoch": 2.11094674556213, "grad_norm": 0.333984375, "learning_rate": 0.00014316448545012022, "loss": 0.0349, "step": 8562 }, { "epoch": 2.111439842209073, "grad_norm": 0.357421875, "learning_rate": 0.0001431334303656444, "loss": 0.0352, "step": 8564 }, { "epoch": 2.111932938856016, "grad_norm": 0.310546875, "learning_rate": 0.00014310237016961627, "loss": 0.0425, "step": 8566 }, { "epoch": 2.1124260355029585, "grad_norm": 0.44140625, "learning_rate": 0.0001430713048657166, "loss": 0.045, "step": 8568 }, { "epoch": 2.1129191321499015, "grad_norm": 0.30859375, "learning_rate": 0.00014304023445762686, "loss": 0.0325, "step": 8570 }, { "epoch": 2.113412228796844, "grad_norm": 0.28515625, "learning_rate": 0.00014300915894902895, "loss": 0.03, "step": 8572 }, { "epoch": 2.113905325443787, "grad_norm": 0.33203125, "learning_rate": 0.00014297807834360565, "loss": 0.0454, "step": 8574 }, { "epoch": 2.11439842209073, "grad_norm": 0.345703125, "learning_rate": 0.00014294699264504006, "loss": 0.0431, "step": 8576 }, { "epoch": 2.1148915187376724, "grad_norm": 0.404296875, "learning_rate": 0.00014291590185701607, "loss": 0.0329, "step": 8578 }, { "epoch": 2.1153846153846154, "grad_norm": 0.24609375, "learning_rate": 0.0001428848059832181, "loss": 0.0335, "step": 8580 }, { "epoch": 2.1158777120315584, "grad_norm": 0.361328125, "learning_rate": 0.00014285370502733116, "loss": 0.0422, "step": 8582 }, { "epoch": 2.116370808678501, "grad_norm": 0.44921875, "learning_rate": 0.00014282259899304098, "loss": 0.0468, "step": 8584 }, { "epoch": 2.116863905325444, "grad_norm": 0.369140625, "learning_rate": 0.00014279148788403368, "loss": 0.0294, "step": 8586 }, { "epoch": 2.1173570019723864, "grad_norm": 0.279296875, "learning_rate": 0.00014276037170399614, "loss": 0.0418, "step": 8588 }, { "epoch": 2.1178500986193294, "grad_norm": 2.796875, "learning_rate": 0.00014272925045661584, "loss": 0.047, "step": 8590 }, { "epoch": 2.1183431952662723, "grad_norm": 0.376953125, "learning_rate": 0.00014269812414558075, "loss": 0.0417, "step": 8592 }, { "epoch": 2.118836291913215, "grad_norm": 0.57421875, "learning_rate": 0.00014266699277457956, "loss": 0.0435, "step": 8594 }, { "epoch": 2.119329388560158, "grad_norm": 0.423828125, "learning_rate": 0.00014263585634730153, "loss": 0.0352, "step": 8596 }, { "epoch": 2.1198224852071004, "grad_norm": 0.2890625, "learning_rate": 0.0001426047148674365, "loss": 0.0445, "step": 8598 }, { "epoch": 2.1203155818540433, "grad_norm": 0.330078125, "learning_rate": 0.00014257356833867485, "loss": 0.0418, "step": 8600 }, { "epoch": 2.1208086785009863, "grad_norm": 0.32421875, "learning_rate": 0.00014254241676470763, "loss": 0.0351, "step": 8602 }, { "epoch": 2.121301775147929, "grad_norm": 0.333984375, "learning_rate": 0.00014251126014922652, "loss": 0.0375, "step": 8604 }, { "epoch": 2.121794871794872, "grad_norm": 0.31640625, "learning_rate": 0.00014248009849592372, "loss": 0.0469, "step": 8606 }, { "epoch": 2.1222879684418148, "grad_norm": 0.48828125, "learning_rate": 0.00014244893180849206, "loss": 0.0452, "step": 8608 }, { "epoch": 2.1227810650887573, "grad_norm": 0.298828125, "learning_rate": 0.00014241776009062503, "loss": 0.0407, "step": 8610 }, { "epoch": 2.1232741617357003, "grad_norm": 0.33984375, "learning_rate": 0.00014238658334601655, "loss": 0.0381, "step": 8612 }, { "epoch": 2.123767258382643, "grad_norm": 0.46875, "learning_rate": 0.00014235540157836132, "loss": 0.0459, "step": 8614 }, { "epoch": 2.1242603550295858, "grad_norm": 0.333984375, "learning_rate": 0.00014232421479135448, "loss": 0.0445, "step": 8616 }, { "epoch": 2.1247534516765287, "grad_norm": 0.302734375, "learning_rate": 0.00014229302298869188, "loss": 0.0451, "step": 8618 }, { "epoch": 2.1252465483234713, "grad_norm": 0.3671875, "learning_rate": 0.00014226182617406996, "loss": 0.0404, "step": 8620 }, { "epoch": 2.1257396449704142, "grad_norm": 0.310546875, "learning_rate": 0.00014223062435118563, "loss": 0.0436, "step": 8622 }, { "epoch": 2.126232741617357, "grad_norm": 0.408203125, "learning_rate": 0.00014219941752373658, "loss": 0.0462, "step": 8624 }, { "epoch": 2.1267258382642997, "grad_norm": 0.34765625, "learning_rate": 0.0001421682056954209, "loss": 0.0451, "step": 8626 }, { "epoch": 2.1272189349112427, "grad_norm": 0.27734375, "learning_rate": 0.00014213698886993744, "loss": 0.0483, "step": 8628 }, { "epoch": 2.1277120315581852, "grad_norm": 0.384765625, "learning_rate": 0.00014210576705098554, "loss": 0.0488, "step": 8630 }, { "epoch": 2.128205128205128, "grad_norm": 0.39453125, "learning_rate": 0.00014207454024226513, "loss": 0.0529, "step": 8632 }, { "epoch": 2.128698224852071, "grad_norm": 0.3125, "learning_rate": 0.0001420433084474768, "loss": 0.0448, "step": 8634 }, { "epoch": 2.1291913214990137, "grad_norm": 0.36328125, "learning_rate": 0.00014201207167032168, "loss": 0.0425, "step": 8636 }, { "epoch": 2.1296844181459567, "grad_norm": 0.400390625, "learning_rate": 0.0001419808299145015, "loss": 0.0443, "step": 8638 }, { "epoch": 2.1301775147928996, "grad_norm": 0.333984375, "learning_rate": 0.0001419495831837186, "loss": 0.0466, "step": 8640 }, { "epoch": 2.130670611439842, "grad_norm": 0.353515625, "learning_rate": 0.00014191833148167584, "loss": 0.0417, "step": 8642 }, { "epoch": 2.131163708086785, "grad_norm": 0.333984375, "learning_rate": 0.00014188707481207677, "loss": 0.0381, "step": 8644 }, { "epoch": 2.1316568047337277, "grad_norm": 0.337890625, "learning_rate": 0.00014185581317862546, "loss": 0.0454, "step": 8646 }, { "epoch": 2.1321499013806706, "grad_norm": 0.4140625, "learning_rate": 0.00014182454658502662, "loss": 0.0491, "step": 8648 }, { "epoch": 2.1326429980276136, "grad_norm": 0.427734375, "learning_rate": 0.00014179327503498548, "loss": 0.0435, "step": 8650 }, { "epoch": 2.133136094674556, "grad_norm": 0.283203125, "learning_rate": 0.00014176199853220794, "loss": 0.0434, "step": 8652 }, { "epoch": 2.133629191321499, "grad_norm": 0.421875, "learning_rate": 0.00014173071708040036, "loss": 0.0487, "step": 8654 }, { "epoch": 2.1341222879684416, "grad_norm": 0.431640625, "learning_rate": 0.0001416994306832698, "loss": 0.0487, "step": 8656 }, { "epoch": 2.1346153846153846, "grad_norm": 0.392578125, "learning_rate": 0.0001416681393445239, "loss": 0.0548, "step": 8658 }, { "epoch": 2.1351084812623276, "grad_norm": 0.39453125, "learning_rate": 0.00014163684306787083, "loss": 0.0469, "step": 8660 }, { "epoch": 2.13560157790927, "grad_norm": 0.384765625, "learning_rate": 0.00014160554185701943, "loss": 0.048, "step": 8662 }, { "epoch": 2.136094674556213, "grad_norm": 0.4453125, "learning_rate": 0.000141574235715679, "loss": 0.0456, "step": 8664 }, { "epoch": 2.136587771203156, "grad_norm": 0.3671875, "learning_rate": 0.0001415429246475595, "loss": 0.0456, "step": 8666 }, { "epoch": 2.1370808678500985, "grad_norm": 0.5703125, "learning_rate": 0.0001415116086563715, "loss": 0.0443, "step": 8668 }, { "epoch": 2.1375739644970415, "grad_norm": 0.345703125, "learning_rate": 0.00014148028774582612, "loss": 0.0523, "step": 8670 }, { "epoch": 2.138067061143984, "grad_norm": 0.388671875, "learning_rate": 0.000141448961919635, "loss": 0.0485, "step": 8672 }, { "epoch": 2.138560157790927, "grad_norm": 0.318359375, "learning_rate": 0.0001414176311815105, "loss": 0.043, "step": 8674 }, { "epoch": 2.13905325443787, "grad_norm": 0.34375, "learning_rate": 0.00014138629553516548, "loss": 0.0444, "step": 8676 }, { "epoch": 2.1395463510848125, "grad_norm": 0.515625, "learning_rate": 0.00014135495498431334, "loss": 0.0471, "step": 8678 }, { "epoch": 2.1400394477317555, "grad_norm": 0.41796875, "learning_rate": 0.00014132360953266816, "loss": 0.0544, "step": 8680 }, { "epoch": 2.140532544378698, "grad_norm": 0.33984375, "learning_rate": 0.00014129225918394452, "loss": 0.0468, "step": 8682 }, { "epoch": 2.141025641025641, "grad_norm": 0.375, "learning_rate": 0.00014126090394185758, "loss": 0.0498, "step": 8684 }, { "epoch": 2.141518737672584, "grad_norm": 0.416015625, "learning_rate": 0.0001412295438101232, "loss": 0.0592, "step": 8686 }, { "epoch": 2.1420118343195265, "grad_norm": 0.453125, "learning_rate": 0.00014119817879245766, "loss": 0.0503, "step": 8688 }, { "epoch": 2.1425049309664694, "grad_norm": 0.4140625, "learning_rate": 0.0001411668088925779, "loss": 0.0491, "step": 8690 }, { "epoch": 2.1429980276134124, "grad_norm": 0.3125, "learning_rate": 0.00014113543411420148, "loss": 0.0517, "step": 8692 }, { "epoch": 2.143491124260355, "grad_norm": 0.40234375, "learning_rate": 0.00014110405446104643, "loss": 0.0523, "step": 8694 }, { "epoch": 2.143984220907298, "grad_norm": 0.462890625, "learning_rate": 0.00014107266993683139, "loss": 0.0537, "step": 8696 }, { "epoch": 2.1444773175542404, "grad_norm": 0.361328125, "learning_rate": 0.0001410412805452757, "loss": 0.0475, "step": 8698 }, { "epoch": 2.1449704142011834, "grad_norm": 0.3828125, "learning_rate": 0.00014100988629009908, "loss": 0.0438, "step": 8700 }, { "epoch": 2.1454635108481264, "grad_norm": 0.45703125, "learning_rate": 0.00014097848717502197, "loss": 0.0523, "step": 8702 }, { "epoch": 2.145956607495069, "grad_norm": 0.46484375, "learning_rate": 0.00014094708320376535, "loss": 0.0487, "step": 8704 }, { "epoch": 2.146449704142012, "grad_norm": 0.369140625, "learning_rate": 0.00014091567438005077, "loss": 0.0489, "step": 8706 }, { "epoch": 2.146942800788955, "grad_norm": 0.349609375, "learning_rate": 0.0001408842607076003, "loss": 0.0474, "step": 8708 }, { "epoch": 2.1474358974358974, "grad_norm": 0.48046875, "learning_rate": 0.00014085284219013668, "loss": 0.0523, "step": 8710 }, { "epoch": 2.1479289940828403, "grad_norm": 0.3125, "learning_rate": 0.00014082141883138316, "loss": 0.0521, "step": 8712 }, { "epoch": 2.148422090729783, "grad_norm": 0.484375, "learning_rate": 0.0001407899906350636, "loss": 0.0547, "step": 8714 }, { "epoch": 2.148915187376726, "grad_norm": 0.34375, "learning_rate": 0.0001407585576049024, "loss": 0.0502, "step": 8716 }, { "epoch": 2.149408284023669, "grad_norm": 0.421875, "learning_rate": 0.00014072711974462456, "loss": 0.0522, "step": 8718 }, { "epoch": 2.1499013806706113, "grad_norm": 0.50390625, "learning_rate": 0.00014069567705795566, "loss": 0.0561, "step": 8720 }, { "epoch": 2.1503944773175543, "grad_norm": 0.7109375, "learning_rate": 0.0001406642295486218, "loss": 0.0517, "step": 8722 }, { "epoch": 2.1508875739644973, "grad_norm": 0.3828125, "learning_rate": 0.00014063277722034968, "loss": 0.0483, "step": 8724 }, { "epoch": 2.15138067061144, "grad_norm": 0.494140625, "learning_rate": 0.00014060132007686659, "loss": 0.053, "step": 8726 }, { "epoch": 2.1518737672583828, "grad_norm": 0.494140625, "learning_rate": 0.0001405698581219004, "loss": 0.0485, "step": 8728 }, { "epoch": 2.1523668639053253, "grad_norm": 0.373046875, "learning_rate": 0.00014053839135917952, "loss": 0.0516, "step": 8730 }, { "epoch": 2.1528599605522682, "grad_norm": 0.462890625, "learning_rate": 0.0001405069197924329, "loss": 0.0485, "step": 8732 }, { "epoch": 2.153353057199211, "grad_norm": 0.373046875, "learning_rate": 0.00014047544342539017, "loss": 0.0529, "step": 8734 }, { "epoch": 2.1538461538461537, "grad_norm": 0.41796875, "learning_rate": 0.00014044396226178135, "loss": 0.0553, "step": 8736 }, { "epoch": 2.1543392504930967, "grad_norm": 0.333984375, "learning_rate": 0.00014041247630533724, "loss": 0.0474, "step": 8738 }, { "epoch": 2.1548323471400392, "grad_norm": 0.4140625, "learning_rate": 0.000140380985559789, "loss": 0.0506, "step": 8740 }, { "epoch": 2.155325443786982, "grad_norm": 0.4765625, "learning_rate": 0.00014034949002886854, "loss": 0.0506, "step": 8742 }, { "epoch": 2.155818540433925, "grad_norm": 0.455078125, "learning_rate": 0.00014031798971630822, "loss": 0.057, "step": 8744 }, { "epoch": 2.1563116370808677, "grad_norm": 0.3046875, "learning_rate": 0.00014028648462584108, "loss": 0.0452, "step": 8746 }, { "epoch": 2.1568047337278107, "grad_norm": 0.416015625, "learning_rate": 0.0001402549747612005, "loss": 0.0509, "step": 8748 }, { "epoch": 2.1572978303747536, "grad_norm": 0.5546875, "learning_rate": 0.0001402234601261207, "loss": 0.0484, "step": 8750 }, { "epoch": 2.157790927021696, "grad_norm": 0.4140625, "learning_rate": 0.00014019194072433625, "loss": 0.0535, "step": 8752 }, { "epoch": 2.158284023668639, "grad_norm": 0.30859375, "learning_rate": 0.00014016041655958244, "loss": 0.0519, "step": 8754 }, { "epoch": 2.1587771203155817, "grad_norm": 0.5703125, "learning_rate": 0.00014012888763559507, "loss": 0.0539, "step": 8756 }, { "epoch": 2.1592702169625246, "grad_norm": 0.34765625, "learning_rate": 0.00014009735395611043, "loss": 0.0581, "step": 8758 }, { "epoch": 2.1597633136094676, "grad_norm": 0.44921875, "learning_rate": 0.00014006581552486547, "loss": 0.0562, "step": 8760 }, { "epoch": 2.16025641025641, "grad_norm": 0.3828125, "learning_rate": 0.0001400342723455977, "loss": 0.0488, "step": 8762 }, { "epoch": 2.160749506903353, "grad_norm": 0.373046875, "learning_rate": 0.00014000272442204513, "loss": 0.0532, "step": 8764 }, { "epoch": 2.1612426035502956, "grad_norm": 0.439453125, "learning_rate": 0.00013997117175794637, "loss": 0.0554, "step": 8766 }, { "epoch": 2.1617357001972386, "grad_norm": 0.380859375, "learning_rate": 0.00013993961435704056, "loss": 0.0637, "step": 8768 }, { "epoch": 2.1622287968441816, "grad_norm": 0.341796875, "learning_rate": 0.00013990805222306745, "loss": 0.0505, "step": 8770 }, { "epoch": 2.162721893491124, "grad_norm": 0.390625, "learning_rate": 0.00013987648535976737, "loss": 0.048, "step": 8772 }, { "epoch": 2.163214990138067, "grad_norm": 0.416015625, "learning_rate": 0.00013984491377088114, "loss": 0.0532, "step": 8774 }, { "epoch": 2.16370808678501, "grad_norm": 0.36328125, "learning_rate": 0.0001398133374601501, "loss": 0.0571, "step": 8776 }, { "epoch": 2.1642011834319526, "grad_norm": 0.419921875, "learning_rate": 0.00013978175643131633, "loss": 0.0506, "step": 8778 }, { "epoch": 2.1646942800788955, "grad_norm": 0.43359375, "learning_rate": 0.0001397501706881223, "loss": 0.0469, "step": 8780 }, { "epoch": 2.1651873767258385, "grad_norm": 0.470703125, "learning_rate": 0.0001397185802343111, "loss": 0.0514, "step": 8782 }, { "epoch": 2.165680473372781, "grad_norm": 0.400390625, "learning_rate": 0.0001396869850736264, "loss": 0.0523, "step": 8784 }, { "epoch": 2.166173570019724, "grad_norm": 0.416015625, "learning_rate": 0.00013965538520981235, "loss": 0.0481, "step": 8786 }, { "epoch": 2.1666666666666665, "grad_norm": 0.44140625, "learning_rate": 0.0001396237806466138, "loss": 0.0528, "step": 8788 }, { "epoch": 2.1671597633136095, "grad_norm": 0.376953125, "learning_rate": 0.00013959217138777597, "loss": 0.0556, "step": 8790 }, { "epoch": 2.1676528599605525, "grad_norm": 0.375, "learning_rate": 0.0001395605574370448, "loss": 0.0546, "step": 8792 }, { "epoch": 2.168145956607495, "grad_norm": 0.359375, "learning_rate": 0.00013952893879816668, "loss": 0.0463, "step": 8794 }, { "epoch": 2.168639053254438, "grad_norm": 0.396484375, "learning_rate": 0.00013949731547488863, "loss": 0.0515, "step": 8796 }, { "epoch": 2.1691321499013805, "grad_norm": 0.41015625, "learning_rate": 0.00013946568747095817, "loss": 0.0549, "step": 8798 }, { "epoch": 2.1696252465483234, "grad_norm": 0.458984375, "learning_rate": 0.00013943405479012336, "loss": 0.0572, "step": 8800 }, { "epoch": 2.1701183431952664, "grad_norm": 0.349609375, "learning_rate": 0.00013940241743613295, "loss": 0.0526, "step": 8802 }, { "epoch": 2.170611439842209, "grad_norm": 0.46875, "learning_rate": 0.00013937077541273602, "loss": 0.0564, "step": 8804 }, { "epoch": 2.171104536489152, "grad_norm": 0.33203125, "learning_rate": 0.0001393391287236824, "loss": 0.053, "step": 8806 }, { "epoch": 2.171597633136095, "grad_norm": 0.58984375, "learning_rate": 0.00013930747737272235, "loss": 0.0577, "step": 8808 }, { "epoch": 2.1720907297830374, "grad_norm": 0.4296875, "learning_rate": 0.0001392758213636068, "loss": 0.0569, "step": 8810 }, { "epoch": 2.1725838264299804, "grad_norm": 0.34765625, "learning_rate": 0.00013924416070008717, "loss": 0.0484, "step": 8812 }, { "epoch": 2.173076923076923, "grad_norm": 0.404296875, "learning_rate": 0.00013921249538591533, "loss": 0.06, "step": 8814 }, { "epoch": 2.173570019723866, "grad_norm": 0.45703125, "learning_rate": 0.0001391808254248439, "loss": 0.0606, "step": 8816 }, { "epoch": 2.174063116370809, "grad_norm": 0.486328125, "learning_rate": 0.00013914915082062588, "loss": 0.0539, "step": 8818 }, { "epoch": 2.1745562130177514, "grad_norm": 0.4765625, "learning_rate": 0.0001391174715770149, "loss": 0.0479, "step": 8820 }, { "epoch": 2.1750493096646943, "grad_norm": 0.37109375, "learning_rate": 0.00013908578769776512, "loss": 0.0632, "step": 8822 }, { "epoch": 2.175542406311637, "grad_norm": 0.494140625, "learning_rate": 0.0001390540991866313, "loss": 0.0554, "step": 8824 }, { "epoch": 2.17603550295858, "grad_norm": 0.546875, "learning_rate": 0.00013902240604736862, "loss": 0.0579, "step": 8826 }, { "epoch": 2.176528599605523, "grad_norm": 0.328125, "learning_rate": 0.000138990708283733, "loss": 0.0552, "step": 8828 }, { "epoch": 2.1770216962524653, "grad_norm": 0.361328125, "learning_rate": 0.00013895900589948077, "loss": 0.0577, "step": 8830 }, { "epoch": 2.1775147928994083, "grad_norm": 0.462890625, "learning_rate": 0.0001389272988983688, "loss": 0.0582, "step": 8832 }, { "epoch": 2.1780078895463513, "grad_norm": 0.3828125, "learning_rate": 0.00013889558728415454, "loss": 0.0503, "step": 8834 }, { "epoch": 2.178500986193294, "grad_norm": 0.427734375, "learning_rate": 0.00013886387106059605, "loss": 0.0486, "step": 8836 }, { "epoch": 2.1789940828402368, "grad_norm": 0.546875, "learning_rate": 0.00013883215023145185, "loss": 0.0513, "step": 8838 }, { "epoch": 2.1794871794871793, "grad_norm": 0.359375, "learning_rate": 0.00013880042480048102, "loss": 0.0559, "step": 8840 }, { "epoch": 2.1799802761341223, "grad_norm": 0.7265625, "learning_rate": 0.00013876869477144323, "loss": 0.0509, "step": 8842 }, { "epoch": 2.1804733727810652, "grad_norm": 0.4453125, "learning_rate": 0.00013873696014809865, "loss": 0.0533, "step": 8844 }, { "epoch": 2.1809664694280078, "grad_norm": 0.390625, "learning_rate": 0.00013870522093420803, "loss": 0.0526, "step": 8846 }, { "epoch": 2.1814595660749507, "grad_norm": 0.5234375, "learning_rate": 0.0001386734771335326, "loss": 0.054, "step": 8848 }, { "epoch": 2.1819526627218937, "grad_norm": 0.421875, "learning_rate": 0.00013864172874983424, "loss": 0.0503, "step": 8850 }, { "epoch": 2.1824457593688362, "grad_norm": 0.4375, "learning_rate": 0.00013860997578687522, "loss": 0.0502, "step": 8852 }, { "epoch": 2.182938856015779, "grad_norm": 0.44921875, "learning_rate": 0.00013857821824841854, "loss": 0.0565, "step": 8854 }, { "epoch": 2.1834319526627217, "grad_norm": 0.5, "learning_rate": 0.0001385464561382276, "loss": 0.0546, "step": 8856 }, { "epoch": 2.1839250493096647, "grad_norm": 0.48046875, "learning_rate": 0.00013851468946006639, "loss": 0.0498, "step": 8858 }, { "epoch": 2.1844181459566077, "grad_norm": 0.349609375, "learning_rate": 0.00013848291821769943, "loss": 0.0544, "step": 8860 }, { "epoch": 2.18491124260355, "grad_norm": 0.380859375, "learning_rate": 0.00013845114241489183, "loss": 0.06, "step": 8862 }, { "epoch": 2.185404339250493, "grad_norm": 0.337890625, "learning_rate": 0.00013841936205540915, "loss": 0.0537, "step": 8864 }, { "epoch": 2.185897435897436, "grad_norm": 0.443359375, "learning_rate": 0.00013838757714301756, "loss": 0.0564, "step": 8866 }, { "epoch": 2.1863905325443787, "grad_norm": 0.359375, "learning_rate": 0.0001383557876814837, "loss": 0.0517, "step": 8868 }, { "epoch": 2.1868836291913216, "grad_norm": 0.427734375, "learning_rate": 0.00013832399367457492, "loss": 0.0551, "step": 8870 }, { "epoch": 2.187376725838264, "grad_norm": 0.400390625, "learning_rate": 0.0001382921951260589, "loss": 0.0592, "step": 8872 }, { "epoch": 2.187869822485207, "grad_norm": 0.375, "learning_rate": 0.00013826039203970394, "loss": 0.055, "step": 8874 }, { "epoch": 2.18836291913215, "grad_norm": 0.380859375, "learning_rate": 0.00013822858441927893, "loss": 0.0524, "step": 8876 }, { "epoch": 2.1888560157790926, "grad_norm": 0.57421875, "learning_rate": 0.00013819677226855324, "loss": 0.0577, "step": 8878 }, { "epoch": 2.1893491124260356, "grad_norm": 0.48046875, "learning_rate": 0.0001381649555912967, "loss": 0.0613, "step": 8880 }, { "epoch": 2.189842209072978, "grad_norm": 0.466796875, "learning_rate": 0.0001381331343912799, "loss": 0.0634, "step": 8882 }, { "epoch": 2.190335305719921, "grad_norm": 0.455078125, "learning_rate": 0.00013810130867227373, "loss": 0.0543, "step": 8884 }, { "epoch": 2.190828402366864, "grad_norm": 0.703125, "learning_rate": 0.0001380694784380498, "loss": 0.0596, "step": 8886 }, { "epoch": 2.1913214990138066, "grad_norm": 0.3984375, "learning_rate": 0.00013803764369238008, "loss": 0.0575, "step": 8888 }, { "epoch": 2.1918145956607495, "grad_norm": 0.46484375, "learning_rate": 0.00013800580443903724, "loss": 0.0549, "step": 8890 }, { "epoch": 2.1923076923076925, "grad_norm": 0.453125, "learning_rate": 0.00013797396068179434, "loss": 0.0584, "step": 8892 }, { "epoch": 2.192800788954635, "grad_norm": 0.41796875, "learning_rate": 0.0001379421124244251, "loss": 0.0553, "step": 8894 }, { "epoch": 2.193293885601578, "grad_norm": 0.470703125, "learning_rate": 0.0001379102596707037, "loss": 0.0548, "step": 8896 }, { "epoch": 2.1937869822485205, "grad_norm": 0.44921875, "learning_rate": 0.00013787840242440486, "loss": 0.0564, "step": 8898 }, { "epoch": 2.1942800788954635, "grad_norm": 0.400390625, "learning_rate": 0.00013784654068930386, "loss": 0.0549, "step": 8900 }, { "epoch": 2.1947731755424065, "grad_norm": 0.373046875, "learning_rate": 0.00013781467446917644, "loss": 0.0588, "step": 8902 }, { "epoch": 2.195266272189349, "grad_norm": 0.447265625, "learning_rate": 0.00013778280376779898, "loss": 0.0596, "step": 8904 }, { "epoch": 2.195759368836292, "grad_norm": 0.376953125, "learning_rate": 0.00013775092858894837, "loss": 0.0564, "step": 8906 }, { "epoch": 2.1962524654832345, "grad_norm": 0.369140625, "learning_rate": 0.00013771904893640187, "loss": 0.0612, "step": 8908 }, { "epoch": 2.1967455621301775, "grad_norm": 0.353515625, "learning_rate": 0.00013768716481393756, "loss": 0.0622, "step": 8910 }, { "epoch": 2.1972386587771204, "grad_norm": 0.40234375, "learning_rate": 0.0001376552762253337, "loss": 0.0624, "step": 8912 }, { "epoch": 2.197731755424063, "grad_norm": 0.50390625, "learning_rate": 0.00013762338317436948, "loss": 0.051, "step": 8914 }, { "epoch": 2.198224852071006, "grad_norm": 0.54296875, "learning_rate": 0.00013759148566482423, "loss": 0.0473, "step": 8916 }, { "epoch": 2.198717948717949, "grad_norm": 0.474609375, "learning_rate": 0.00013755958370047802, "loss": 0.0544, "step": 8918 }, { "epoch": 2.1992110453648914, "grad_norm": 0.373046875, "learning_rate": 0.00013752767728511146, "loss": 0.0545, "step": 8920 }, { "epoch": 2.1997041420118344, "grad_norm": 0.330078125, "learning_rate": 0.00013749576642250563, "loss": 0.0524, "step": 8922 }, { "epoch": 2.200197238658777, "grad_norm": 0.4375, "learning_rate": 0.0001374638511164421, "loss": 0.056, "step": 8924 }, { "epoch": 2.20069033530572, "grad_norm": 0.37109375, "learning_rate": 0.0001374319313707031, "loss": 0.0607, "step": 8926 }, { "epoch": 2.201183431952663, "grad_norm": 0.36328125, "learning_rate": 0.0001374000071890712, "loss": 0.051, "step": 8928 }, { "epoch": 2.2016765285996054, "grad_norm": 0.3125, "learning_rate": 0.00013736807857532964, "loss": 0.0522, "step": 8930 }, { "epoch": 2.2021696252465484, "grad_norm": 0.37890625, "learning_rate": 0.00013733614553326212, "loss": 0.0593, "step": 8932 }, { "epoch": 2.2026627218934913, "grad_norm": 0.4453125, "learning_rate": 0.0001373042080666529, "loss": 0.0631, "step": 8934 }, { "epoch": 2.203155818540434, "grad_norm": 0.46875, "learning_rate": 0.00013727226617928672, "loss": 0.0566, "step": 8936 }, { "epoch": 2.203648915187377, "grad_norm": 0.4609375, "learning_rate": 0.0001372403198749489, "loss": 0.051, "step": 8938 }, { "epoch": 2.2041420118343193, "grad_norm": 0.490234375, "learning_rate": 0.00013720836915742532, "loss": 0.0477, "step": 8940 }, { "epoch": 2.2046351084812623, "grad_norm": 0.361328125, "learning_rate": 0.0001371764140305022, "loss": 0.0559, "step": 8942 }, { "epoch": 2.2051282051282053, "grad_norm": 0.36328125, "learning_rate": 0.00013714445449796646, "loss": 0.0552, "step": 8944 }, { "epoch": 2.205621301775148, "grad_norm": 0.51953125, "learning_rate": 0.00013711249056360547, "loss": 0.0552, "step": 8946 }, { "epoch": 2.206114398422091, "grad_norm": 0.40625, "learning_rate": 0.00013708052223120713, "loss": 0.059, "step": 8948 }, { "epoch": 2.2066074950690338, "grad_norm": 0.50390625, "learning_rate": 0.00013704854950455988, "loss": 0.0573, "step": 8950 }, { "epoch": 2.2071005917159763, "grad_norm": 0.349609375, "learning_rate": 0.00013701657238745265, "loss": 0.0595, "step": 8952 }, { "epoch": 2.2075936883629192, "grad_norm": 0.447265625, "learning_rate": 0.00013698459088367496, "loss": 0.0557, "step": 8954 }, { "epoch": 2.2080867850098618, "grad_norm": 0.52734375, "learning_rate": 0.0001369526049970167, "loss": 0.054, "step": 8956 }, { "epoch": 2.2085798816568047, "grad_norm": 0.640625, "learning_rate": 0.00013692061473126845, "loss": 0.0564, "step": 8958 }, { "epoch": 2.2090729783037477, "grad_norm": 0.4609375, "learning_rate": 0.00013688862009022122, "loss": 0.0624, "step": 8960 }, { "epoch": 2.2095660749506902, "grad_norm": 0.578125, "learning_rate": 0.00013685662107766657, "loss": 0.0593, "step": 8962 }, { "epoch": 2.210059171597633, "grad_norm": 0.7578125, "learning_rate": 0.0001368246176973965, "loss": 0.0606, "step": 8964 }, { "epoch": 2.2105522682445757, "grad_norm": 0.41015625, "learning_rate": 0.00013679260995320367, "loss": 0.0643, "step": 8966 }, { "epoch": 2.2110453648915187, "grad_norm": 0.79296875, "learning_rate": 0.00013676059784888112, "loss": 0.0636, "step": 8968 }, { "epoch": 2.2115384615384617, "grad_norm": 0.400390625, "learning_rate": 0.00013672858138822247, "loss": 0.0539, "step": 8970 }, { "epoch": 2.212031558185404, "grad_norm": 0.515625, "learning_rate": 0.00013669656057502187, "loss": 0.0619, "step": 8972 }, { "epoch": 2.212524654832347, "grad_norm": 0.462890625, "learning_rate": 0.00013666453541307395, "loss": 0.0586, "step": 8974 }, { "epoch": 2.21301775147929, "grad_norm": 0.50390625, "learning_rate": 0.00013663250590617386, "loss": 0.0641, "step": 8976 }, { "epoch": 2.2135108481262327, "grad_norm": 0.39453125, "learning_rate": 0.00013660047205811735, "loss": 0.057, "step": 8978 }, { "epoch": 2.2140039447731756, "grad_norm": 0.48046875, "learning_rate": 0.0001365684338727005, "loss": 0.0565, "step": 8980 }, { "epoch": 2.214497041420118, "grad_norm": 0.5546875, "learning_rate": 0.00013653639135372008, "loss": 0.0547, "step": 8982 }, { "epoch": 2.214990138067061, "grad_norm": 0.337890625, "learning_rate": 0.00013650434450497326, "loss": 0.0595, "step": 8984 }, { "epoch": 2.215483234714004, "grad_norm": 0.515625, "learning_rate": 0.00013647229333025785, "loss": 0.0558, "step": 8986 }, { "epoch": 2.2159763313609466, "grad_norm": 0.4296875, "learning_rate": 0.00013644023783337202, "loss": 0.0615, "step": 8988 }, { "epoch": 2.2164694280078896, "grad_norm": 0.515625, "learning_rate": 0.00013640817801811454, "loss": 0.0576, "step": 8990 }, { "epoch": 2.216962524654832, "grad_norm": 0.419921875, "learning_rate": 0.0001363761138882847, "loss": 0.0565, "step": 8992 }, { "epoch": 2.217455621301775, "grad_norm": 0.380859375, "learning_rate": 0.00013634404544768226, "loss": 0.0554, "step": 8994 }, { "epoch": 2.217948717948718, "grad_norm": 0.400390625, "learning_rate": 0.00013631197270010757, "loss": 0.0579, "step": 8996 }, { "epoch": 2.2184418145956606, "grad_norm": 0.35546875, "learning_rate": 0.0001362798956493613, "loss": 0.058, "step": 8998 }, { "epoch": 2.2189349112426036, "grad_norm": 0.333984375, "learning_rate": 0.00013624781429924485, "loss": 0.0601, "step": 9000 }, { "epoch": 2.2194280078895465, "grad_norm": 0.42578125, "learning_rate": 0.00013621572865356004, "loss": 0.0605, "step": 9002 }, { "epoch": 2.219921104536489, "grad_norm": 0.453125, "learning_rate": 0.00013618363871610917, "loss": 0.0558, "step": 9004 }, { "epoch": 2.220414201183432, "grad_norm": 0.625, "learning_rate": 0.0001361515444906951, "loss": 0.0549, "step": 9006 }, { "epoch": 2.2209072978303745, "grad_norm": 0.435546875, "learning_rate": 0.00013611944598112118, "loss": 0.0567, "step": 9008 }, { "epoch": 2.2214003944773175, "grad_norm": 0.349609375, "learning_rate": 0.00013608734319119122, "loss": 0.0591, "step": 9010 }, { "epoch": 2.2218934911242605, "grad_norm": 0.4375, "learning_rate": 0.0001360552361247096, "loss": 0.0554, "step": 9012 }, { "epoch": 2.222386587771203, "grad_norm": 0.3359375, "learning_rate": 0.0001360231247854812, "loss": 0.0555, "step": 9014 }, { "epoch": 2.222879684418146, "grad_norm": 0.45703125, "learning_rate": 0.00013599100917731138, "loss": 0.0582, "step": 9016 }, { "epoch": 2.223372781065089, "grad_norm": 0.33984375, "learning_rate": 0.00013595888930400605, "loss": 0.0579, "step": 9018 }, { "epoch": 2.2238658777120315, "grad_norm": 0.333984375, "learning_rate": 0.00013592676516937152, "loss": 0.0548, "step": 9020 }, { "epoch": 2.2243589743589745, "grad_norm": 0.376953125, "learning_rate": 0.0001358946367772148, "loss": 0.0603, "step": 9022 }, { "epoch": 2.224852071005917, "grad_norm": 0.34765625, "learning_rate": 0.00013586250413134316, "loss": 0.0501, "step": 9024 }, { "epoch": 2.22534516765286, "grad_norm": 0.419921875, "learning_rate": 0.00013583036723556455, "loss": 0.0584, "step": 9026 }, { "epoch": 2.225838264299803, "grad_norm": 0.40234375, "learning_rate": 0.0001357982260936874, "loss": 0.0583, "step": 9028 }, { "epoch": 2.2263313609467454, "grad_norm": 0.38671875, "learning_rate": 0.00013576608070952058, "loss": 0.0582, "step": 9030 }, { "epoch": 2.2268244575936884, "grad_norm": 0.337890625, "learning_rate": 0.00013573393108687348, "loss": 0.0563, "step": 9032 }, { "epoch": 2.2273175542406314, "grad_norm": 0.390625, "learning_rate": 0.00013570177722955603, "loss": 0.0615, "step": 9034 }, { "epoch": 2.227810650887574, "grad_norm": 0.421875, "learning_rate": 0.00013566961914137867, "loss": 0.059, "step": 9036 }, { "epoch": 2.228303747534517, "grad_norm": 0.365234375, "learning_rate": 0.00013563745682615227, "loss": 0.0599, "step": 9038 }, { "epoch": 2.2287968441814594, "grad_norm": 0.44921875, "learning_rate": 0.00013560529028768827, "loss": 0.0574, "step": 9040 }, { "epoch": 2.2292899408284024, "grad_norm": 0.5, "learning_rate": 0.0001355731195297986, "loss": 0.0588, "step": 9042 }, { "epoch": 2.2297830374753453, "grad_norm": 0.369140625, "learning_rate": 0.0001355409445562956, "loss": 0.057, "step": 9044 }, { "epoch": 2.230276134122288, "grad_norm": 0.5703125, "learning_rate": 0.00013550876537099228, "loss": 0.0629, "step": 9046 }, { "epoch": 2.230769230769231, "grad_norm": 0.431640625, "learning_rate": 0.000135476581977702, "loss": 0.0595, "step": 9048 }, { "epoch": 2.2312623274161734, "grad_norm": 0.39453125, "learning_rate": 0.00013544439438023867, "loss": 0.0574, "step": 9050 }, { "epoch": 2.2317554240631163, "grad_norm": 0.482421875, "learning_rate": 0.00013541220258241673, "loss": 0.0588, "step": 9052 }, { "epoch": 2.2322485207100593, "grad_norm": 0.404296875, "learning_rate": 0.00013538000658805104, "loss": 0.0548, "step": 9054 }, { "epoch": 2.232741617357002, "grad_norm": 0.369140625, "learning_rate": 0.000135347806400957, "loss": 0.0597, "step": 9056 }, { "epoch": 2.233234714003945, "grad_norm": 0.419921875, "learning_rate": 0.0001353156020249506, "loss": 0.0584, "step": 9058 }, { "epoch": 2.2337278106508878, "grad_norm": 0.37890625, "learning_rate": 0.00013528339346384819, "loss": 0.0534, "step": 9060 }, { "epoch": 2.2342209072978303, "grad_norm": 0.365234375, "learning_rate": 0.00013525118072146662, "loss": 0.0563, "step": 9062 }, { "epoch": 2.2347140039447733, "grad_norm": 0.34375, "learning_rate": 0.00013521896380162335, "loss": 0.0559, "step": 9064 }, { "epoch": 2.235207100591716, "grad_norm": 0.37890625, "learning_rate": 0.00013518674270813623, "loss": 0.056, "step": 9066 }, { "epoch": 2.2357001972386588, "grad_norm": 0.419921875, "learning_rate": 0.0001351545174448236, "loss": 0.0585, "step": 9068 }, { "epoch": 2.2361932938856017, "grad_norm": 0.458984375, "learning_rate": 0.0001351222880155044, "loss": 0.0546, "step": 9070 }, { "epoch": 2.2366863905325443, "grad_norm": 0.37890625, "learning_rate": 0.00013509005442399798, "loss": 0.0622, "step": 9072 }, { "epoch": 2.2371794871794872, "grad_norm": 0.5546875, "learning_rate": 0.0001350578166741242, "loss": 0.0595, "step": 9074 }, { "epoch": 2.2376725838264298, "grad_norm": 0.32421875, "learning_rate": 0.0001350255747697034, "loss": 0.0589, "step": 9076 }, { "epoch": 2.2381656804733727, "grad_norm": 0.470703125, "learning_rate": 0.00013499332871455644, "loss": 0.0575, "step": 9078 }, { "epoch": 2.2386587771203157, "grad_norm": 0.38671875, "learning_rate": 0.00013496107851250461, "loss": 0.0625, "step": 9080 }, { "epoch": 2.239151873767258, "grad_norm": 0.416015625, "learning_rate": 0.0001349288241673698, "loss": 0.0561, "step": 9082 }, { "epoch": 2.239644970414201, "grad_norm": 0.37109375, "learning_rate": 0.00013489656568297433, "loss": 0.0579, "step": 9084 }, { "epoch": 2.240138067061144, "grad_norm": 0.5, "learning_rate": 0.00013486430306314096, "loss": 0.0588, "step": 9086 }, { "epoch": 2.2406311637080867, "grad_norm": 0.388671875, "learning_rate": 0.000134832036311693, "loss": 0.0571, "step": 9088 }, { "epoch": 2.2411242603550297, "grad_norm": 0.404296875, "learning_rate": 0.00013479976543245433, "loss": 0.0581, "step": 9090 }, { "epoch": 2.2416173570019726, "grad_norm": 0.54296875, "learning_rate": 0.00013476749042924912, "loss": 0.0577, "step": 9092 }, { "epoch": 2.242110453648915, "grad_norm": 0.380859375, "learning_rate": 0.0001347352113059022, "loss": 0.0571, "step": 9094 }, { "epoch": 2.242603550295858, "grad_norm": 0.357421875, "learning_rate": 0.00013470292806623877, "loss": 0.0604, "step": 9096 }, { "epoch": 2.2430966469428006, "grad_norm": 0.50390625, "learning_rate": 0.0001346706407140846, "loss": 0.055, "step": 9098 }, { "epoch": 2.2435897435897436, "grad_norm": 0.4609375, "learning_rate": 0.00013463834925326595, "loss": 0.0567, "step": 9100 }, { "epoch": 2.2440828402366866, "grad_norm": 0.38671875, "learning_rate": 0.0001346060536876095, "loss": 0.0599, "step": 9102 }, { "epoch": 2.244575936883629, "grad_norm": 0.404296875, "learning_rate": 0.0001345737540209425, "loss": 0.0558, "step": 9104 }, { "epoch": 2.245069033530572, "grad_norm": 0.5078125, "learning_rate": 0.0001345414502570926, "loss": 0.0583, "step": 9106 }, { "epoch": 2.2455621301775146, "grad_norm": 0.59765625, "learning_rate": 0.000134509142399888, "loss": 0.0535, "step": 9108 }, { "epoch": 2.2460552268244576, "grad_norm": 0.359375, "learning_rate": 0.00013447683045315735, "loss": 0.0601, "step": 9110 }, { "epoch": 2.2465483234714005, "grad_norm": 0.58984375, "learning_rate": 0.00013444451442072978, "loss": 0.0529, "step": 9112 }, { "epoch": 2.247041420118343, "grad_norm": 0.376953125, "learning_rate": 0.00013441219430643494, "loss": 0.0575, "step": 9114 }, { "epoch": 2.247534516765286, "grad_norm": 0.412109375, "learning_rate": 0.00013437987011410296, "loss": 0.066, "step": 9116 }, { "epoch": 2.248027613412229, "grad_norm": 0.40234375, "learning_rate": 0.00013434754184756442, "loss": 0.0561, "step": 9118 }, { "epoch": 2.2485207100591715, "grad_norm": 0.41796875, "learning_rate": 0.0001343152095106504, "loss": 0.0627, "step": 9120 }, { "epoch": 2.2490138067061145, "grad_norm": 0.37890625, "learning_rate": 0.00013428287310719242, "loss": 0.0578, "step": 9122 }, { "epoch": 2.249506903353057, "grad_norm": 0.416015625, "learning_rate": 0.0001342505326410226, "loss": 0.064, "step": 9124 }, { "epoch": 2.25, "grad_norm": 0.330078125, "learning_rate": 0.00013421818811597345, "loss": 0.0594, "step": 9126 }, { "epoch": 2.250493096646943, "grad_norm": 0.490234375, "learning_rate": 0.0001341858395358779, "loss": 0.0577, "step": 9128 }, { "epoch": 2.2509861932938855, "grad_norm": 0.4921875, "learning_rate": 0.00013415348690456953, "loss": 0.0546, "step": 9130 }, { "epoch": 2.2514792899408285, "grad_norm": 0.404296875, "learning_rate": 0.0001341211302258823, "loss": 0.0554, "step": 9132 }, { "epoch": 2.251972386587771, "grad_norm": 0.462890625, "learning_rate": 0.0001340887695036506, "loss": 0.0627, "step": 9134 }, { "epoch": 2.252465483234714, "grad_norm": 0.37890625, "learning_rate": 0.0001340564047417094, "loss": 0.0567, "step": 9136 }, { "epoch": 2.252958579881657, "grad_norm": 0.357421875, "learning_rate": 0.00013402403594389405, "loss": 0.0562, "step": 9138 }, { "epoch": 2.2534516765285995, "grad_norm": 0.384765625, "learning_rate": 0.0001339916631140405, "loss": 0.0529, "step": 9140 }, { "epoch": 2.2539447731755424, "grad_norm": 0.4140625, "learning_rate": 0.00013395928625598513, "loss": 0.0598, "step": 9142 }, { "epoch": 2.2544378698224854, "grad_norm": 0.37890625, "learning_rate": 0.0001339269053735647, "loss": 0.0591, "step": 9144 }, { "epoch": 2.254930966469428, "grad_norm": 0.376953125, "learning_rate": 0.00013389452047061652, "loss": 0.0589, "step": 9146 }, { "epoch": 2.255424063116371, "grad_norm": 0.46484375, "learning_rate": 0.0001338621315509785, "loss": 0.0625, "step": 9148 }, { "epoch": 2.2559171597633134, "grad_norm": 0.41015625, "learning_rate": 0.00013382973861848874, "loss": 0.0572, "step": 9150 }, { "epoch": 2.2564102564102564, "grad_norm": 0.37890625, "learning_rate": 0.00013379734167698614, "loss": 0.0652, "step": 9152 }, { "epoch": 2.2569033530571994, "grad_norm": 0.578125, "learning_rate": 0.00013376494073030984, "loss": 0.0606, "step": 9154 }, { "epoch": 2.257396449704142, "grad_norm": 0.4140625, "learning_rate": 0.0001337325357822995, "loss": 0.0614, "step": 9156 }, { "epoch": 2.257889546351085, "grad_norm": 0.376953125, "learning_rate": 0.0001337001268367954, "loss": 0.0649, "step": 9158 }, { "epoch": 2.2583826429980274, "grad_norm": 0.431640625, "learning_rate": 0.00013366771389763805, "loss": 0.0616, "step": 9160 }, { "epoch": 2.2588757396449703, "grad_norm": 0.44921875, "learning_rate": 0.0001336352969686687, "loss": 0.0559, "step": 9162 }, { "epoch": 2.2593688362919133, "grad_norm": 0.447265625, "learning_rate": 0.0001336028760537288, "loss": 0.0638, "step": 9164 }, { "epoch": 2.259861932938856, "grad_norm": 0.404296875, "learning_rate": 0.0001335704511566605, "loss": 0.0561, "step": 9166 }, { "epoch": 2.260355029585799, "grad_norm": 0.416015625, "learning_rate": 0.0001335380222813063, "loss": 0.0616, "step": 9168 }, { "epoch": 2.260848126232742, "grad_norm": 0.36328125, "learning_rate": 0.00013350558943150918, "loss": 0.0635, "step": 9170 }, { "epoch": 2.2613412228796843, "grad_norm": 0.462890625, "learning_rate": 0.00013347315261111269, "loss": 0.0545, "step": 9172 }, { "epoch": 2.2618343195266273, "grad_norm": 0.453125, "learning_rate": 0.0001334407118239607, "loss": 0.0605, "step": 9174 }, { "epoch": 2.2623274161735702, "grad_norm": 0.326171875, "learning_rate": 0.00013340826707389767, "loss": 0.0596, "step": 9176 }, { "epoch": 2.2628205128205128, "grad_norm": 0.5546875, "learning_rate": 0.00013337581836476842, "loss": 0.0568, "step": 9178 }, { "epoch": 2.2633136094674557, "grad_norm": 0.427734375, "learning_rate": 0.00013334336570041837, "loss": 0.055, "step": 9180 }, { "epoch": 2.2638067061143983, "grad_norm": 0.400390625, "learning_rate": 0.0001333109090846933, "loss": 0.0607, "step": 9182 }, { "epoch": 2.2642998027613412, "grad_norm": 0.56640625, "learning_rate": 0.00013327844852143955, "loss": 0.0606, "step": 9184 }, { "epoch": 2.264792899408284, "grad_norm": 0.365234375, "learning_rate": 0.00013324598401450382, "loss": 0.0612, "step": 9186 }, { "epoch": 2.2652859960552267, "grad_norm": 0.4609375, "learning_rate": 0.00013321351556773342, "loss": 0.0571, "step": 9188 }, { "epoch": 2.2657790927021697, "grad_norm": 0.412109375, "learning_rate": 0.00013318104318497592, "loss": 0.0638, "step": 9190 }, { "epoch": 2.2662721893491122, "grad_norm": 0.4296875, "learning_rate": 0.00013314856687007958, "loss": 0.0591, "step": 9192 }, { "epoch": 2.266765285996055, "grad_norm": 0.42578125, "learning_rate": 0.00013311608662689296, "loss": 0.0582, "step": 9194 }, { "epoch": 2.267258382642998, "grad_norm": 0.54296875, "learning_rate": 0.00013308360245926521, "loss": 0.0626, "step": 9196 }, { "epoch": 2.2677514792899407, "grad_norm": 0.49609375, "learning_rate": 0.00013305111437104583, "loss": 0.0603, "step": 9198 }, { "epoch": 2.2682445759368837, "grad_norm": 0.55078125, "learning_rate": 0.00013301862236608488, "loss": 0.0691, "step": 9200 }, { "epoch": 2.2687376725838266, "grad_norm": 0.427734375, "learning_rate": 0.00013298612644823283, "loss": 0.0587, "step": 9202 }, { "epoch": 2.269230769230769, "grad_norm": 0.4375, "learning_rate": 0.0001329536266213406, "loss": 0.0578, "step": 9204 }, { "epoch": 2.269723865877712, "grad_norm": 0.404296875, "learning_rate": 0.00013292112288925962, "loss": 0.0623, "step": 9206 }, { "epoch": 2.2702169625246547, "grad_norm": 0.435546875, "learning_rate": 0.00013288861525584183, "loss": 0.0538, "step": 9208 }, { "epoch": 2.2707100591715976, "grad_norm": 0.4453125, "learning_rate": 0.00013285610372493943, "loss": 0.062, "step": 9210 }, { "epoch": 2.2712031558185406, "grad_norm": 0.515625, "learning_rate": 0.00013282358830040534, "loss": 0.0579, "step": 9212 }, { "epoch": 2.271696252465483, "grad_norm": 0.451171875, "learning_rate": 0.00013279106898609274, "loss": 0.0524, "step": 9214 }, { "epoch": 2.272189349112426, "grad_norm": 0.404296875, "learning_rate": 0.0001327585457858554, "loss": 0.0638, "step": 9216 }, { "epoch": 2.2726824457593686, "grad_norm": 0.396484375, "learning_rate": 0.0001327260187035475, "loss": 0.0584, "step": 9218 }, { "epoch": 2.2731755424063116, "grad_norm": 0.388671875, "learning_rate": 0.00013269348774302365, "loss": 0.0625, "step": 9220 }, { "epoch": 2.2736686390532546, "grad_norm": 0.45703125, "learning_rate": 0.00013266095290813895, "loss": 0.0606, "step": 9222 }, { "epoch": 2.274161735700197, "grad_norm": 0.404296875, "learning_rate": 0.000132628414202749, "loss": 0.0611, "step": 9224 }, { "epoch": 2.27465483234714, "grad_norm": 0.369140625, "learning_rate": 0.0001325958716307098, "loss": 0.0528, "step": 9226 }, { "epoch": 2.275147928994083, "grad_norm": 0.37890625, "learning_rate": 0.0001325633251958778, "loss": 0.0563, "step": 9228 }, { "epoch": 2.2756410256410255, "grad_norm": 0.451171875, "learning_rate": 0.00013253077490211, "loss": 0.0607, "step": 9230 }, { "epoch": 2.2761341222879685, "grad_norm": 0.44140625, "learning_rate": 0.00013249822075326367, "loss": 0.0565, "step": 9232 }, { "epoch": 2.2766272189349115, "grad_norm": 0.43359375, "learning_rate": 0.0001324656627531968, "loss": 0.0595, "step": 9234 }, { "epoch": 2.277120315581854, "grad_norm": 0.427734375, "learning_rate": 0.00013243310090576757, "loss": 0.0574, "step": 9236 }, { "epoch": 2.277613412228797, "grad_norm": 0.380859375, "learning_rate": 0.00013240053521483482, "loss": 0.0588, "step": 9238 }, { "epoch": 2.2781065088757395, "grad_norm": 0.408203125, "learning_rate": 0.00013236796568425775, "loss": 0.0605, "step": 9240 }, { "epoch": 2.2785996055226825, "grad_norm": 0.376953125, "learning_rate": 0.00013233539231789605, "loss": 0.0507, "step": 9242 }, { "epoch": 2.279092702169625, "grad_norm": 0.33984375, "learning_rate": 0.0001323028151196098, "loss": 0.0527, "step": 9244 }, { "epoch": 2.279585798816568, "grad_norm": 0.392578125, "learning_rate": 0.0001322702340932596, "loss": 0.054, "step": 9246 }, { "epoch": 2.280078895463511, "grad_norm": 0.375, "learning_rate": 0.0001322376492427065, "loss": 0.0573, "step": 9248 }, { "epoch": 2.2805719921104535, "grad_norm": 0.45703125, "learning_rate": 0.00013220506057181196, "loss": 0.0578, "step": 9250 }, { "epoch": 2.2810650887573964, "grad_norm": 0.400390625, "learning_rate": 0.00013217246808443792, "loss": 0.0588, "step": 9252 }, { "epoch": 2.2815581854043394, "grad_norm": 0.4921875, "learning_rate": 0.0001321398717844468, "loss": 0.0562, "step": 9254 }, { "epoch": 2.282051282051282, "grad_norm": 0.384765625, "learning_rate": 0.00013210727167570143, "loss": 0.0592, "step": 9256 }, { "epoch": 2.282544378698225, "grad_norm": 0.458984375, "learning_rate": 0.0001320746677620651, "loss": 0.0674, "step": 9258 }, { "epoch": 2.283037475345168, "grad_norm": 0.40234375, "learning_rate": 0.0001320420600474016, "loss": 0.0601, "step": 9260 }, { "epoch": 2.2835305719921104, "grad_norm": 0.490234375, "learning_rate": 0.000132009448535575, "loss": 0.0594, "step": 9262 }, { "epoch": 2.2840236686390534, "grad_norm": 0.40234375, "learning_rate": 0.00013197683323045006, "loss": 0.0581, "step": 9264 }, { "epoch": 2.284516765285996, "grad_norm": 0.5, "learning_rate": 0.00013194421413589188, "loss": 0.0612, "step": 9266 }, { "epoch": 2.285009861932939, "grad_norm": 0.34765625, "learning_rate": 0.00013191159125576594, "loss": 0.0564, "step": 9268 }, { "epoch": 2.285502958579882, "grad_norm": 0.388671875, "learning_rate": 0.0001318789645939383, "loss": 0.0597, "step": 9270 }, { "epoch": 2.2859960552268244, "grad_norm": 0.380859375, "learning_rate": 0.00013184633415427532, "loss": 0.0544, "step": 9272 }, { "epoch": 2.2864891518737673, "grad_norm": 0.44140625, "learning_rate": 0.00013181369994064398, "loss": 0.0609, "step": 9274 }, { "epoch": 2.28698224852071, "grad_norm": 0.48046875, "learning_rate": 0.00013178106195691153, "loss": 0.0618, "step": 9276 }, { "epoch": 2.287475345167653, "grad_norm": 0.44921875, "learning_rate": 0.00013174842020694582, "loss": 0.0583, "step": 9278 }, { "epoch": 2.287968441814596, "grad_norm": 0.451171875, "learning_rate": 0.00013171577469461504, "loss": 0.0624, "step": 9280 }, { "epoch": 2.2884615384615383, "grad_norm": 0.466796875, "learning_rate": 0.0001316831254237879, "loss": 0.0574, "step": 9282 }, { "epoch": 2.2889546351084813, "grad_norm": 0.404296875, "learning_rate": 0.00013165047239833348, "loss": 0.0564, "step": 9284 }, { "epoch": 2.2894477317554243, "grad_norm": 0.455078125, "learning_rate": 0.00013161781562212138, "loss": 0.0548, "step": 9286 }, { "epoch": 2.289940828402367, "grad_norm": 0.447265625, "learning_rate": 0.0001315851550990216, "loss": 0.0563, "step": 9288 }, { "epoch": 2.2904339250493098, "grad_norm": 0.392578125, "learning_rate": 0.00013155249083290457, "loss": 0.0547, "step": 9290 }, { "epoch": 2.2909270216962523, "grad_norm": 0.38671875, "learning_rate": 0.00013151982282764125, "loss": 0.0603, "step": 9292 }, { "epoch": 2.2914201183431953, "grad_norm": 0.412109375, "learning_rate": 0.00013148715108710294, "loss": 0.0574, "step": 9294 }, { "epoch": 2.2919132149901382, "grad_norm": 0.44921875, "learning_rate": 0.00013145447561516138, "loss": 0.0569, "step": 9296 }, { "epoch": 2.2924063116370808, "grad_norm": 0.47265625, "learning_rate": 0.00013142179641568888, "loss": 0.0571, "step": 9298 }, { "epoch": 2.2928994082840237, "grad_norm": 0.4140625, "learning_rate": 0.00013138911349255806, "loss": 0.0558, "step": 9300 }, { "epoch": 2.2933925049309662, "grad_norm": 0.578125, "learning_rate": 0.000131356426849642, "loss": 0.0592, "step": 9302 }, { "epoch": 2.293885601577909, "grad_norm": 0.5546875, "learning_rate": 0.0001313237364908143, "loss": 0.0549, "step": 9304 }, { "epoch": 2.294378698224852, "grad_norm": 0.5625, "learning_rate": 0.00013129104241994894, "loss": 0.06, "step": 9306 }, { "epoch": 2.2948717948717947, "grad_norm": 0.51171875, "learning_rate": 0.00013125834464092039, "loss": 0.0572, "step": 9308 }, { "epoch": 2.2953648915187377, "grad_norm": 0.384765625, "learning_rate": 0.00013122564315760343, "loss": 0.0593, "step": 9310 }, { "epoch": 2.2958579881656807, "grad_norm": 0.6015625, "learning_rate": 0.00013119293797387343, "loss": 0.0581, "step": 9312 }, { "epoch": 2.296351084812623, "grad_norm": 0.50390625, "learning_rate": 0.00013116022909360613, "loss": 0.0607, "step": 9314 }, { "epoch": 2.296844181459566, "grad_norm": 0.333984375, "learning_rate": 0.0001311275165206777, "loss": 0.0581, "step": 9316 }, { "epoch": 2.297337278106509, "grad_norm": 0.337890625, "learning_rate": 0.00013109480025896476, "loss": 0.0535, "step": 9318 }, { "epoch": 2.2978303747534516, "grad_norm": 0.333984375, "learning_rate": 0.00013106208031234435, "loss": 0.0603, "step": 9320 }, { "epoch": 2.2983234714003946, "grad_norm": 0.455078125, "learning_rate": 0.00013102935668469402, "loss": 0.0576, "step": 9322 }, { "epoch": 2.298816568047337, "grad_norm": 0.375, "learning_rate": 0.0001309966293798917, "loss": 0.0559, "step": 9324 }, { "epoch": 2.29930966469428, "grad_norm": 0.55078125, "learning_rate": 0.00013096389840181572, "loss": 0.0573, "step": 9326 }, { "epoch": 2.2998027613412226, "grad_norm": 0.44140625, "learning_rate": 0.00013093116375434492, "loss": 0.058, "step": 9328 }, { "epoch": 2.3002958579881656, "grad_norm": 0.41796875, "learning_rate": 0.00013089842544135847, "loss": 0.0542, "step": 9330 }, { "epoch": 2.3007889546351086, "grad_norm": 0.34765625, "learning_rate": 0.00013086568346673612, "loss": 0.0574, "step": 9332 }, { "epoch": 2.301282051282051, "grad_norm": 0.392578125, "learning_rate": 0.00013083293783435794, "loss": 0.0604, "step": 9334 }, { "epoch": 2.301775147928994, "grad_norm": 0.421875, "learning_rate": 0.00013080018854810446, "loss": 0.0565, "step": 9336 }, { "epoch": 2.302268244575937, "grad_norm": 0.396484375, "learning_rate": 0.0001307674356118567, "loss": 0.0537, "step": 9338 }, { "epoch": 2.3027613412228796, "grad_norm": 0.34375, "learning_rate": 0.00013073467902949604, "loss": 0.0617, "step": 9340 }, { "epoch": 2.3032544378698225, "grad_norm": 0.392578125, "learning_rate": 0.00013070191880490433, "loss": 0.0592, "step": 9342 }, { "epoch": 2.3037475345167655, "grad_norm": 0.47265625, "learning_rate": 0.00013066915494196378, "loss": 0.0666, "step": 9344 }, { "epoch": 2.304240631163708, "grad_norm": 0.390625, "learning_rate": 0.00013063638744455712, "loss": 0.0615, "step": 9346 }, { "epoch": 2.304733727810651, "grad_norm": 0.416015625, "learning_rate": 0.00013060361631656755, "loss": 0.0576, "step": 9348 }, { "epoch": 2.3052268244575935, "grad_norm": 0.404296875, "learning_rate": 0.00013057084156187853, "loss": 0.0593, "step": 9350 }, { "epoch": 2.3057199211045365, "grad_norm": 0.37890625, "learning_rate": 0.00013053806318437413, "loss": 0.0612, "step": 9352 }, { "epoch": 2.3062130177514795, "grad_norm": 0.53125, "learning_rate": 0.00013050528118793874, "loss": 0.0578, "step": 9354 }, { "epoch": 2.306706114398422, "grad_norm": 0.5078125, "learning_rate": 0.0001304724955764572, "loss": 0.054, "step": 9356 }, { "epoch": 2.307199211045365, "grad_norm": 0.404296875, "learning_rate": 0.0001304397063538148, "loss": 0.0569, "step": 9358 }, { "epoch": 2.3076923076923075, "grad_norm": 0.32421875, "learning_rate": 0.00013040691352389727, "loss": 0.0555, "step": 9360 }, { "epoch": 2.3081854043392505, "grad_norm": 0.39453125, "learning_rate": 0.0001303741170905907, "loss": 0.0616, "step": 9362 }, { "epoch": 2.3086785009861934, "grad_norm": 0.34765625, "learning_rate": 0.00013034131705778163, "loss": 0.0549, "step": 9364 }, { "epoch": 2.309171597633136, "grad_norm": 0.412109375, "learning_rate": 0.00013030851342935712, "loss": 0.0544, "step": 9366 }, { "epoch": 2.309664694280079, "grad_norm": 0.46484375, "learning_rate": 0.00013027570620920454, "loss": 0.0608, "step": 9368 }, { "epoch": 2.310157790927022, "grad_norm": 0.3515625, "learning_rate": 0.00013024289540121173, "loss": 0.0622, "step": 9370 }, { "epoch": 2.3106508875739644, "grad_norm": 0.376953125, "learning_rate": 0.00013021008100926696, "loss": 0.0579, "step": 9372 }, { "epoch": 2.3111439842209074, "grad_norm": 0.5078125, "learning_rate": 0.00013017726303725895, "loss": 0.0611, "step": 9374 }, { "epoch": 2.31163708086785, "grad_norm": 0.42578125, "learning_rate": 0.00013014444148907678, "loss": 0.0612, "step": 9376 }, { "epoch": 2.312130177514793, "grad_norm": 0.4296875, "learning_rate": 0.00013011161636860993, "loss": 0.0665, "step": 9378 }, { "epoch": 2.312623274161736, "grad_norm": 0.4453125, "learning_rate": 0.0001300787876797485, "loss": 0.0554, "step": 9380 }, { "epoch": 2.3131163708086784, "grad_norm": 0.416015625, "learning_rate": 0.00013004595542638272, "loss": 0.058, "step": 9382 }, { "epoch": 2.3136094674556213, "grad_norm": 0.427734375, "learning_rate": 0.00013001311961240348, "loss": 0.052, "step": 9384 }, { "epoch": 2.314102564102564, "grad_norm": 0.416015625, "learning_rate": 0.00012998028024170202, "loss": 0.0556, "step": 9386 }, { "epoch": 2.314595660749507, "grad_norm": 0.353515625, "learning_rate": 0.00012994743731816995, "loss": 0.0604, "step": 9388 }, { "epoch": 2.31508875739645, "grad_norm": 0.365234375, "learning_rate": 0.00012991459084569934, "loss": 0.0581, "step": 9390 }, { "epoch": 2.3155818540433923, "grad_norm": 0.373046875, "learning_rate": 0.00012988174082818272, "loss": 0.0554, "step": 9392 }, { "epoch": 2.3160749506903353, "grad_norm": 0.375, "learning_rate": 0.00012984888726951297, "loss": 0.0546, "step": 9394 }, { "epoch": 2.3165680473372783, "grad_norm": 0.3671875, "learning_rate": 0.0001298160301735834, "loss": 0.0572, "step": 9396 }, { "epoch": 2.317061143984221, "grad_norm": 0.373046875, "learning_rate": 0.0001297831695442878, "loss": 0.0539, "step": 9398 }, { "epoch": 2.3175542406311638, "grad_norm": 0.42578125, "learning_rate": 0.00012975030538552032, "loss": 0.0537, "step": 9400 }, { "epoch": 2.3180473372781067, "grad_norm": 0.75, "learning_rate": 0.00012971743770117552, "loss": 0.0587, "step": 9402 }, { "epoch": 2.3185404339250493, "grad_norm": 0.37109375, "learning_rate": 0.00012968456649514845, "loss": 0.0549, "step": 9404 }, { "epoch": 2.3190335305719922, "grad_norm": 0.40625, "learning_rate": 0.00012965169177133455, "loss": 0.0572, "step": 9406 }, { "epoch": 2.3195266272189348, "grad_norm": 0.5234375, "learning_rate": 0.0001296188135336296, "loss": 0.0574, "step": 9408 }, { "epoch": 2.3200197238658777, "grad_norm": 0.5234375, "learning_rate": 0.00012958593178592987, "loss": 0.0615, "step": 9410 }, { "epoch": 2.3205128205128207, "grad_norm": 0.4453125, "learning_rate": 0.00012955304653213201, "loss": 0.0576, "step": 9412 }, { "epoch": 2.3210059171597632, "grad_norm": 0.40625, "learning_rate": 0.00012952015777613316, "loss": 0.055, "step": 9414 }, { "epoch": 2.321499013806706, "grad_norm": 0.412109375, "learning_rate": 0.0001294872655218308, "loss": 0.0561, "step": 9416 }, { "epoch": 2.3219921104536487, "grad_norm": 0.353515625, "learning_rate": 0.0001294543697731228, "loss": 0.055, "step": 9418 }, { "epoch": 2.3224852071005917, "grad_norm": 0.51953125, "learning_rate": 0.00012942147053390758, "loss": 0.0579, "step": 9420 }, { "epoch": 2.3229783037475347, "grad_norm": 0.478515625, "learning_rate": 0.0001293885678080838, "loss": 0.0595, "step": 9422 }, { "epoch": 2.323471400394477, "grad_norm": 0.37890625, "learning_rate": 0.0001293556615995507, "loss": 0.0569, "step": 9424 }, { "epoch": 2.32396449704142, "grad_norm": 0.416015625, "learning_rate": 0.00012932275191220776, "loss": 0.0546, "step": 9426 }, { "epoch": 2.324457593688363, "grad_norm": 0.390625, "learning_rate": 0.000129289838749955, "loss": 0.0507, "step": 9428 }, { "epoch": 2.3249506903353057, "grad_norm": 0.451171875, "learning_rate": 0.0001292569221166928, "loss": 0.0589, "step": 9430 }, { "epoch": 2.3254437869822486, "grad_norm": 0.36328125, "learning_rate": 0.000129224002016322, "loss": 0.0547, "step": 9432 }, { "epoch": 2.325936883629191, "grad_norm": 0.37109375, "learning_rate": 0.00012919107845274378, "loss": 0.0567, "step": 9434 }, { "epoch": 2.326429980276134, "grad_norm": 0.50390625, "learning_rate": 0.00012915815142985982, "loss": 0.0606, "step": 9436 }, { "epoch": 2.326923076923077, "grad_norm": 0.5078125, "learning_rate": 0.00012912522095157207, "loss": 0.0536, "step": 9438 }, { "epoch": 2.3274161735700196, "grad_norm": 0.34765625, "learning_rate": 0.00012909228702178305, "loss": 0.0529, "step": 9440 }, { "epoch": 2.3279092702169626, "grad_norm": 0.41015625, "learning_rate": 0.0001290593496443956, "loss": 0.0572, "step": 9442 }, { "epoch": 2.328402366863905, "grad_norm": 0.392578125, "learning_rate": 0.00012902640882331295, "loss": 0.0599, "step": 9444 }, { "epoch": 2.328895463510848, "grad_norm": 0.357421875, "learning_rate": 0.00012899346456243877, "loss": 0.0572, "step": 9446 }, { "epoch": 2.329388560157791, "grad_norm": 0.44921875, "learning_rate": 0.0001289605168656772, "loss": 0.053, "step": 9448 }, { "epoch": 2.3298816568047336, "grad_norm": 0.416015625, "learning_rate": 0.00012892756573693265, "loss": 0.0585, "step": 9450 }, { "epoch": 2.3303747534516766, "grad_norm": 0.3671875, "learning_rate": 0.00012889461118011008, "loss": 0.0537, "step": 9452 }, { "epoch": 2.3308678500986195, "grad_norm": 0.365234375, "learning_rate": 0.00012886165319911475, "loss": 0.0577, "step": 9454 }, { "epoch": 2.331360946745562, "grad_norm": 0.486328125, "learning_rate": 0.00012882869179785237, "loss": 0.0515, "step": 9456 }, { "epoch": 2.331854043392505, "grad_norm": 0.33203125, "learning_rate": 0.00012879572698022907, "loss": 0.061, "step": 9458 }, { "epoch": 2.3323471400394475, "grad_norm": 0.458984375, "learning_rate": 0.00012876275875015132, "loss": 0.0578, "step": 9460 }, { "epoch": 2.3328402366863905, "grad_norm": 0.41796875, "learning_rate": 0.00012872978711152608, "loss": 0.0556, "step": 9462 }, { "epoch": 2.3333333333333335, "grad_norm": 0.45703125, "learning_rate": 0.00012869681206826067, "loss": 0.0588, "step": 9464 }, { "epoch": 2.333826429980276, "grad_norm": 0.42578125, "learning_rate": 0.0001286638336242628, "loss": 0.0516, "step": 9466 }, { "epoch": 2.334319526627219, "grad_norm": 0.5390625, "learning_rate": 0.0001286308517834406, "loss": 0.0491, "step": 9468 }, { "epoch": 2.3348126232741615, "grad_norm": 0.380859375, "learning_rate": 0.00012859786654970262, "loss": 0.0535, "step": 9470 }, { "epoch": 2.3353057199211045, "grad_norm": 0.373046875, "learning_rate": 0.00012856487792695776, "loss": 0.0543, "step": 9472 }, { "epoch": 2.3357988165680474, "grad_norm": 0.5390625, "learning_rate": 0.0001285318859191154, "loss": 0.0509, "step": 9474 }, { "epoch": 2.33629191321499, "grad_norm": 0.42578125, "learning_rate": 0.00012849889053008525, "loss": 0.0564, "step": 9476 }, { "epoch": 2.336785009861933, "grad_norm": 0.33203125, "learning_rate": 0.00012846589176377748, "loss": 0.0534, "step": 9478 }, { "epoch": 2.337278106508876, "grad_norm": 0.390625, "learning_rate": 0.0001284328896241026, "loss": 0.057, "step": 9480 }, { "epoch": 2.3377712031558184, "grad_norm": 0.431640625, "learning_rate": 0.00012839988411497152, "loss": 0.0547, "step": 9482 }, { "epoch": 2.3382642998027614, "grad_norm": 0.337890625, "learning_rate": 0.00012836687524029561, "loss": 0.0589, "step": 9484 }, { "epoch": 2.3387573964497044, "grad_norm": 0.41015625, "learning_rate": 0.0001283338630039866, "loss": 0.0536, "step": 9486 }, { "epoch": 2.339250493096647, "grad_norm": 0.43359375, "learning_rate": 0.00012830084740995666, "loss": 0.0528, "step": 9488 }, { "epoch": 2.33974358974359, "grad_norm": 0.375, "learning_rate": 0.0001282678284621183, "loss": 0.056, "step": 9490 }, { "epoch": 2.3402366863905324, "grad_norm": 0.30078125, "learning_rate": 0.00012823480616438443, "loss": 0.0529, "step": 9492 }, { "epoch": 2.3407297830374754, "grad_norm": 0.3671875, "learning_rate": 0.00012820178052066832, "loss": 0.0526, "step": 9494 }, { "epoch": 2.3412228796844183, "grad_norm": 0.447265625, "learning_rate": 0.00012816875153488381, "loss": 0.0584, "step": 9496 }, { "epoch": 2.341715976331361, "grad_norm": 0.40234375, "learning_rate": 0.00012813571921094498, "loss": 0.0522, "step": 9498 }, { "epoch": 2.342209072978304, "grad_norm": 0.47265625, "learning_rate": 0.0001281026835527663, "loss": 0.0567, "step": 9500 }, { "epoch": 2.3427021696252464, "grad_norm": 0.369140625, "learning_rate": 0.00012806964456426275, "loss": 0.0511, "step": 9502 }, { "epoch": 2.3431952662721893, "grad_norm": 0.37890625, "learning_rate": 0.00012803660224934958, "loss": 0.0543, "step": 9504 }, { "epoch": 2.3436883629191323, "grad_norm": 0.423828125, "learning_rate": 0.00012800355661194253, "loss": 0.0627, "step": 9506 }, { "epoch": 2.344181459566075, "grad_norm": 0.353515625, "learning_rate": 0.00012797050765595767, "loss": 0.0577, "step": 9508 }, { "epoch": 2.344674556213018, "grad_norm": 0.474609375, "learning_rate": 0.00012793745538531144, "loss": 0.0588, "step": 9510 }, { "epoch": 2.3451676528599608, "grad_norm": 0.41015625, "learning_rate": 0.0001279043998039208, "loss": 0.0572, "step": 9512 }, { "epoch": 2.3456607495069033, "grad_norm": 0.349609375, "learning_rate": 0.00012787134091570297, "loss": 0.0578, "step": 9514 }, { "epoch": 2.3461538461538463, "grad_norm": 0.375, "learning_rate": 0.00012783827872457564, "loss": 0.0589, "step": 9516 }, { "epoch": 2.346646942800789, "grad_norm": 0.4296875, "learning_rate": 0.00012780521323445685, "loss": 0.0579, "step": 9518 }, { "epoch": 2.3471400394477318, "grad_norm": 0.419921875, "learning_rate": 0.00012777214444926503, "loss": 0.0548, "step": 9520 }, { "epoch": 2.3476331360946747, "grad_norm": 0.3828125, "learning_rate": 0.00012773907237291908, "loss": 0.0572, "step": 9522 }, { "epoch": 2.3481262327416172, "grad_norm": 0.369140625, "learning_rate": 0.0001277059970093381, "loss": 0.0499, "step": 9524 }, { "epoch": 2.34861932938856, "grad_norm": 0.376953125, "learning_rate": 0.00012767291836244185, "loss": 0.0545, "step": 9526 }, { "epoch": 2.3491124260355027, "grad_norm": 0.515625, "learning_rate": 0.00012763983643615022, "loss": 0.052, "step": 9528 }, { "epoch": 2.3496055226824457, "grad_norm": 0.30078125, "learning_rate": 0.00012760675123438367, "loss": 0.0505, "step": 9530 }, { "epoch": 2.3500986193293887, "grad_norm": 0.36328125, "learning_rate": 0.00012757366276106298, "loss": 0.0512, "step": 9532 }, { "epoch": 2.350591715976331, "grad_norm": 0.3515625, "learning_rate": 0.0001275405710201093, "loss": 0.054, "step": 9534 }, { "epoch": 2.351084812623274, "grad_norm": 0.333984375, "learning_rate": 0.00012750747601544417, "loss": 0.0561, "step": 9536 }, { "epoch": 2.351577909270217, "grad_norm": 0.34375, "learning_rate": 0.0001274743777509896, "loss": 0.0558, "step": 9538 }, { "epoch": 2.3520710059171597, "grad_norm": 0.369140625, "learning_rate": 0.00012744127623066782, "loss": 0.0529, "step": 9540 }, { "epoch": 2.3525641025641026, "grad_norm": 0.380859375, "learning_rate": 0.0001274081714584016, "loss": 0.0534, "step": 9542 }, { "epoch": 2.3530571992110456, "grad_norm": 0.474609375, "learning_rate": 0.00012737506343811402, "loss": 0.0554, "step": 9544 }, { "epoch": 2.353550295857988, "grad_norm": 0.333984375, "learning_rate": 0.0001273419521737286, "loss": 0.0543, "step": 9546 }, { "epoch": 2.354043392504931, "grad_norm": 0.451171875, "learning_rate": 0.00012730883766916919, "loss": 0.0516, "step": 9548 }, { "epoch": 2.3545364891518736, "grad_norm": 0.32421875, "learning_rate": 0.00012727571992836003, "loss": 0.0509, "step": 9550 }, { "epoch": 2.3550295857988166, "grad_norm": 0.3984375, "learning_rate": 0.00012724259895522576, "loss": 0.0526, "step": 9552 }, { "epoch": 2.355522682445759, "grad_norm": 0.498046875, "learning_rate": 0.00012720947475369142, "loss": 0.0515, "step": 9554 }, { "epoch": 2.356015779092702, "grad_norm": 0.451171875, "learning_rate": 0.00012717634732768243, "loss": 0.0535, "step": 9556 }, { "epoch": 2.356508875739645, "grad_norm": 0.296875, "learning_rate": 0.00012714321668112454, "loss": 0.0509, "step": 9558 }, { "epoch": 2.3570019723865876, "grad_norm": 0.384765625, "learning_rate": 0.00012711008281794393, "loss": 0.0541, "step": 9560 }, { "epoch": 2.3574950690335306, "grad_norm": 0.45703125, "learning_rate": 0.00012707694574206713, "loss": 0.0604, "step": 9562 }, { "epoch": 2.3579881656804735, "grad_norm": 0.32421875, "learning_rate": 0.00012704380545742108, "loss": 0.0491, "step": 9564 }, { "epoch": 2.358481262327416, "grad_norm": 0.35546875, "learning_rate": 0.00012701066196793307, "loss": 0.0558, "step": 9566 }, { "epoch": 2.358974358974359, "grad_norm": 0.37109375, "learning_rate": 0.00012697751527753084, "loss": 0.0542, "step": 9568 }, { "epoch": 2.359467455621302, "grad_norm": 0.333984375, "learning_rate": 0.00012694436539014241, "loss": 0.0468, "step": 9570 }, { "epoch": 2.3599605522682445, "grad_norm": 0.349609375, "learning_rate": 0.00012691121230969628, "loss": 0.0481, "step": 9572 }, { "epoch": 2.3604536489151875, "grad_norm": 0.3828125, "learning_rate": 0.00012687805604012122, "loss": 0.0572, "step": 9574 }, { "epoch": 2.36094674556213, "grad_norm": 0.41796875, "learning_rate": 0.00012684489658534642, "loss": 0.0571, "step": 9576 }, { "epoch": 2.361439842209073, "grad_norm": 0.498046875, "learning_rate": 0.0001268117339493015, "loss": 0.0547, "step": 9578 }, { "epoch": 2.361932938856016, "grad_norm": 0.4296875, "learning_rate": 0.00012677856813591643, "loss": 0.0459, "step": 9580 }, { "epoch": 2.3624260355029585, "grad_norm": 0.5078125, "learning_rate": 0.0001267453991491215, "loss": 0.0554, "step": 9582 }, { "epoch": 2.3629191321499015, "grad_norm": 0.390625, "learning_rate": 0.00012671222699284744, "loss": 0.0514, "step": 9584 }, { "epoch": 2.363412228796844, "grad_norm": 0.396484375, "learning_rate": 0.00012667905167102538, "loss": 0.053, "step": 9586 }, { "epoch": 2.363905325443787, "grad_norm": 0.34765625, "learning_rate": 0.00012664587318758673, "loss": 0.0537, "step": 9588 }, { "epoch": 2.36439842209073, "grad_norm": 0.423828125, "learning_rate": 0.00012661269154646334, "loss": 0.055, "step": 9590 }, { "epoch": 2.3648915187376724, "grad_norm": 0.40234375, "learning_rate": 0.00012657950675158737, "loss": 0.052, "step": 9592 }, { "epoch": 2.3653846153846154, "grad_norm": 0.416015625, "learning_rate": 0.00012654631880689148, "loss": 0.058, "step": 9594 }, { "epoch": 2.3658777120315584, "grad_norm": 0.46875, "learning_rate": 0.00012651312771630858, "loss": 0.055, "step": 9596 }, { "epoch": 2.366370808678501, "grad_norm": 0.37890625, "learning_rate": 0.00012647993348377203, "loss": 0.0575, "step": 9598 }, { "epoch": 2.366863905325444, "grad_norm": 0.59765625, "learning_rate": 0.00012644673611321552, "loss": 0.0536, "step": 9600 }, { "epoch": 2.3673570019723864, "grad_norm": 0.353515625, "learning_rate": 0.00012641353560857312, "loss": 0.0522, "step": 9602 }, { "epoch": 2.3678500986193294, "grad_norm": 0.349609375, "learning_rate": 0.0001263803319737793, "loss": 0.0481, "step": 9604 }, { "epoch": 2.3683431952662723, "grad_norm": 0.357421875, "learning_rate": 0.00012634712521276883, "loss": 0.0501, "step": 9606 }, { "epoch": 2.368836291913215, "grad_norm": 0.369140625, "learning_rate": 0.00012631391532947687, "loss": 0.0497, "step": 9608 }, { "epoch": 2.369329388560158, "grad_norm": 0.45703125, "learning_rate": 0.00012628070232783908, "loss": 0.0499, "step": 9610 }, { "epoch": 2.3698224852071004, "grad_norm": 0.373046875, "learning_rate": 0.00012624748621179133, "loss": 0.0518, "step": 9612 }, { "epoch": 2.3703155818540433, "grad_norm": 0.439453125, "learning_rate": 0.0001262142669852699, "loss": 0.0503, "step": 9614 }, { "epoch": 2.3708086785009863, "grad_norm": 0.330078125, "learning_rate": 0.00012618104465221148, "loss": 0.052, "step": 9616 }, { "epoch": 2.371301775147929, "grad_norm": 0.427734375, "learning_rate": 0.00012614781921655308, "loss": 0.0552, "step": 9618 }, { "epoch": 2.371794871794872, "grad_norm": 0.38671875, "learning_rate": 0.00012611459068223214, "loss": 0.0541, "step": 9620 }, { "epoch": 2.3722879684418148, "grad_norm": 0.39453125, "learning_rate": 0.00012608135905318635, "loss": 0.0518, "step": 9622 }, { "epoch": 2.3727810650887573, "grad_norm": 0.478515625, "learning_rate": 0.00012604812433335394, "loss": 0.0552, "step": 9624 }, { "epoch": 2.3732741617357003, "grad_norm": 0.439453125, "learning_rate": 0.00012601488652667334, "loss": 0.0568, "step": 9626 }, { "epoch": 2.3737672583826432, "grad_norm": 0.359375, "learning_rate": 0.00012598164563708345, "loss": 0.0546, "step": 9628 }, { "epoch": 2.3742603550295858, "grad_norm": 0.326171875, "learning_rate": 0.0001259484016685235, "loss": 0.0537, "step": 9630 }, { "epoch": 2.3747534516765287, "grad_norm": 0.404296875, "learning_rate": 0.00012591515462493305, "loss": 0.0476, "step": 9632 }, { "epoch": 2.3752465483234713, "grad_norm": 0.380859375, "learning_rate": 0.00012588190451025207, "loss": 0.0547, "step": 9634 }, { "epoch": 2.3757396449704142, "grad_norm": 0.31640625, "learning_rate": 0.00012584865132842093, "loss": 0.0539, "step": 9636 }, { "epoch": 2.3762327416173568, "grad_norm": 0.357421875, "learning_rate": 0.0001258153950833803, "loss": 0.0561, "step": 9638 }, { "epoch": 2.3767258382642997, "grad_norm": 0.3671875, "learning_rate": 0.00012578213577907118, "loss": 0.0493, "step": 9640 }, { "epoch": 2.3772189349112427, "grad_norm": 0.388671875, "learning_rate": 0.00012574887341943507, "loss": 0.0489, "step": 9642 }, { "epoch": 2.3777120315581852, "grad_norm": 0.2890625, "learning_rate": 0.00012571560800841365, "loss": 0.045, "step": 9644 }, { "epoch": 2.378205128205128, "grad_norm": 0.33203125, "learning_rate": 0.00012568233954994912, "loss": 0.0499, "step": 9646 }, { "epoch": 2.378698224852071, "grad_norm": 0.412109375, "learning_rate": 0.00012564906804798394, "loss": 0.0519, "step": 9648 }, { "epoch": 2.3791913214990137, "grad_norm": 0.40234375, "learning_rate": 0.00012561579350646101, "loss": 0.0565, "step": 9650 }, { "epoch": 2.3796844181459567, "grad_norm": 0.349609375, "learning_rate": 0.00012558251592932352, "loss": 0.0502, "step": 9652 }, { "epoch": 2.3801775147928996, "grad_norm": 0.52734375, "learning_rate": 0.00012554923532051508, "loss": 0.0542, "step": 9654 }, { "epoch": 2.380670611439842, "grad_norm": 0.345703125, "learning_rate": 0.00012551595168397962, "loss": 0.0545, "step": 9656 }, { "epoch": 2.381163708086785, "grad_norm": 0.451171875, "learning_rate": 0.00012548266502366139, "loss": 0.0523, "step": 9658 }, { "epoch": 2.3816568047337277, "grad_norm": 0.361328125, "learning_rate": 0.00012544937534350508, "loss": 0.0508, "step": 9660 }, { "epoch": 2.3821499013806706, "grad_norm": 0.298828125, "learning_rate": 0.0001254160826474557, "loss": 0.0503, "step": 9662 }, { "epoch": 2.3826429980276136, "grad_norm": 0.41796875, "learning_rate": 0.0001253827869394586, "loss": 0.0537, "step": 9664 }, { "epoch": 2.383136094674556, "grad_norm": 0.4609375, "learning_rate": 0.00012534948822345957, "loss": 0.0481, "step": 9666 }, { "epoch": 2.383629191321499, "grad_norm": 0.3515625, "learning_rate": 0.00012531618650340464, "loss": 0.0492, "step": 9668 }, { "epoch": 2.3841222879684416, "grad_norm": 0.416015625, "learning_rate": 0.00012528288178324028, "loss": 0.0528, "step": 9670 }, { "epoch": 2.3846153846153846, "grad_norm": 0.37109375, "learning_rate": 0.00012524957406691325, "loss": 0.0566, "step": 9672 }, { "epoch": 2.3851084812623276, "grad_norm": 0.3828125, "learning_rate": 0.00012521626335837072, "loss": 0.052, "step": 9674 }, { "epoch": 2.38560157790927, "grad_norm": 0.3203125, "learning_rate": 0.00012518294966156016, "loss": 0.0503, "step": 9676 }, { "epoch": 2.386094674556213, "grad_norm": 0.365234375, "learning_rate": 0.00012514963298042948, "loss": 0.0452, "step": 9678 }, { "epoch": 2.386587771203156, "grad_norm": 0.36328125, "learning_rate": 0.0001251163133189269, "loss": 0.0467, "step": 9680 }, { "epoch": 2.3870808678500985, "grad_norm": 0.37890625, "learning_rate": 0.00012508299068100093, "loss": 0.0537, "step": 9682 }, { "epoch": 2.3875739644970415, "grad_norm": 0.30078125, "learning_rate": 0.00012504966507060052, "loss": 0.0481, "step": 9684 }, { "epoch": 2.388067061143984, "grad_norm": 0.470703125, "learning_rate": 0.00012501633649167495, "loss": 0.0534, "step": 9686 }, { "epoch": 2.388560157790927, "grad_norm": 0.400390625, "learning_rate": 0.00012498300494817385, "loss": 0.0578, "step": 9688 }, { "epoch": 2.38905325443787, "grad_norm": 0.439453125, "learning_rate": 0.00012494967044404716, "loss": 0.0525, "step": 9690 }, { "epoch": 2.3895463510848125, "grad_norm": 0.41015625, "learning_rate": 0.0001249163329832452, "loss": 0.0501, "step": 9692 }, { "epoch": 2.3900394477317555, "grad_norm": 0.42578125, "learning_rate": 0.0001248829925697187, "loss": 0.0496, "step": 9694 }, { "epoch": 2.390532544378698, "grad_norm": 0.357421875, "learning_rate": 0.00012484964920741863, "loss": 0.0558, "step": 9696 }, { "epoch": 2.391025641025641, "grad_norm": 0.369140625, "learning_rate": 0.0001248163029002964, "loss": 0.0508, "step": 9698 }, { "epoch": 2.391518737672584, "grad_norm": 0.345703125, "learning_rate": 0.0001247829536523037, "loss": 0.0509, "step": 9700 }, { "epoch": 2.3920118343195265, "grad_norm": 0.423828125, "learning_rate": 0.00012474960146739264, "loss": 0.0494, "step": 9702 }, { "epoch": 2.3925049309664694, "grad_norm": 0.40234375, "learning_rate": 0.00012471624634951563, "loss": 0.0484, "step": 9704 }, { "epoch": 2.3929980276134124, "grad_norm": 0.4296875, "learning_rate": 0.00012468288830262542, "loss": 0.0501, "step": 9706 }, { "epoch": 2.393491124260355, "grad_norm": 0.375, "learning_rate": 0.00012464952733067512, "loss": 0.0529, "step": 9708 }, { "epoch": 2.393984220907298, "grad_norm": 0.353515625, "learning_rate": 0.00012461616343761823, "loss": 0.0555, "step": 9710 }, { "epoch": 2.394477317554241, "grad_norm": 0.35546875, "learning_rate": 0.00012458279662740852, "loss": 0.0531, "step": 9712 }, { "epoch": 2.3949704142011834, "grad_norm": 0.4609375, "learning_rate": 0.00012454942690400016, "loss": 0.0485, "step": 9714 }, { "epoch": 2.3954635108481264, "grad_norm": 0.34375, "learning_rate": 0.00012451605427134761, "loss": 0.046, "step": 9716 }, { "epoch": 2.395956607495069, "grad_norm": 0.341796875, "learning_rate": 0.0001244826787334058, "loss": 0.0533, "step": 9718 }, { "epoch": 2.396449704142012, "grad_norm": 0.375, "learning_rate": 0.00012444930029412987, "loss": 0.052, "step": 9720 }, { "epoch": 2.3969428007889544, "grad_norm": 0.388671875, "learning_rate": 0.0001244159189574753, "loss": 0.0524, "step": 9722 }, { "epoch": 2.3974358974358974, "grad_norm": 0.314453125, "learning_rate": 0.00012438253472739805, "loss": 0.0511, "step": 9724 }, { "epoch": 2.3979289940828403, "grad_norm": 0.328125, "learning_rate": 0.0001243491476078543, "loss": 0.0434, "step": 9726 }, { "epoch": 2.398422090729783, "grad_norm": 0.369140625, "learning_rate": 0.0001243157576028006, "loss": 0.0586, "step": 9728 }, { "epoch": 2.398915187376726, "grad_norm": 0.45703125, "learning_rate": 0.00012428236471619385, "loss": 0.052, "step": 9730 }, { "epoch": 2.399408284023669, "grad_norm": 0.3671875, "learning_rate": 0.00012424896895199132, "loss": 0.0503, "step": 9732 }, { "epoch": 2.3999013806706113, "grad_norm": 0.34375, "learning_rate": 0.00012421557031415056, "loss": 0.0487, "step": 9734 }, { "epoch": 2.4003944773175543, "grad_norm": 0.515625, "learning_rate": 0.00012418216880662956, "loss": 0.0478, "step": 9736 }, { "epoch": 2.4008875739644973, "grad_norm": 0.4765625, "learning_rate": 0.0001241487644333865, "loss": 0.0515, "step": 9738 }, { "epoch": 2.40138067061144, "grad_norm": 0.369140625, "learning_rate": 0.00012411535719838006, "loss": 0.0551, "step": 9740 }, { "epoch": 2.4018737672583828, "grad_norm": 0.453125, "learning_rate": 0.00012408194710556914, "loss": 0.0555, "step": 9742 }, { "epoch": 2.4023668639053253, "grad_norm": 0.62890625, "learning_rate": 0.00012404853415891299, "loss": 0.0564, "step": 9744 }, { "epoch": 2.4028599605522682, "grad_norm": 0.31640625, "learning_rate": 0.0001240151183623713, "loss": 0.0494, "step": 9746 }, { "epoch": 2.403353057199211, "grad_norm": 0.455078125, "learning_rate": 0.000123981699719904, "loss": 0.0496, "step": 9748 }, { "epoch": 2.4038461538461537, "grad_norm": 0.423828125, "learning_rate": 0.00012394827823547142, "loss": 0.0505, "step": 9750 }, { "epoch": 2.4043392504930967, "grad_norm": 0.35546875, "learning_rate": 0.0001239148539130341, "loss": 0.0468, "step": 9752 }, { "epoch": 2.4048323471400392, "grad_norm": 0.36328125, "learning_rate": 0.0001238814267565531, "loss": 0.0475, "step": 9754 }, { "epoch": 2.405325443786982, "grad_norm": 0.5625, "learning_rate": 0.00012384799676998972, "loss": 0.0472, "step": 9756 }, { "epoch": 2.405818540433925, "grad_norm": 0.4609375, "learning_rate": 0.00012381456395730553, "loss": 0.048, "step": 9758 }, { "epoch": 2.4063116370808677, "grad_norm": 0.404296875, "learning_rate": 0.00012378112832246255, "loss": 0.0515, "step": 9760 }, { "epoch": 2.4068047337278107, "grad_norm": 0.455078125, "learning_rate": 0.0001237476898694231, "loss": 0.0522, "step": 9762 }, { "epoch": 2.4072978303747536, "grad_norm": 0.490234375, "learning_rate": 0.00012371424860214977, "loss": 0.0486, "step": 9764 }, { "epoch": 2.407790927021696, "grad_norm": 0.3515625, "learning_rate": 0.00012368080452460564, "loss": 0.049, "step": 9766 }, { "epoch": 2.408284023668639, "grad_norm": 0.482421875, "learning_rate": 0.00012364735764075393, "loss": 0.0468, "step": 9768 }, { "epoch": 2.4087771203155817, "grad_norm": 0.4609375, "learning_rate": 0.0001236139079545583, "loss": 0.0493, "step": 9770 }, { "epoch": 2.4092702169625246, "grad_norm": 0.291015625, "learning_rate": 0.00012358045546998275, "loss": 0.0521, "step": 9772 }, { "epoch": 2.4097633136094676, "grad_norm": 0.3046875, "learning_rate": 0.00012354700019099157, "loss": 0.046, "step": 9774 }, { "epoch": 2.41025641025641, "grad_norm": 0.376953125, "learning_rate": 0.0001235135421215494, "loss": 0.0517, "step": 9776 }, { "epoch": 2.410749506903353, "grad_norm": 0.349609375, "learning_rate": 0.0001234800812656212, "loss": 0.0498, "step": 9778 }, { "epoch": 2.4112426035502956, "grad_norm": 0.349609375, "learning_rate": 0.00012344661762717226, "loss": 0.0423, "step": 9780 }, { "epoch": 2.4117357001972386, "grad_norm": 0.349609375, "learning_rate": 0.00012341315121016823, "loss": 0.0457, "step": 9782 }, { "epoch": 2.4122287968441816, "grad_norm": 0.30859375, "learning_rate": 0.00012337968201857506, "loss": 0.0488, "step": 9784 }, { "epoch": 2.412721893491124, "grad_norm": 0.388671875, "learning_rate": 0.00012334621005635906, "loss": 0.0519, "step": 9786 }, { "epoch": 2.413214990138067, "grad_norm": 0.3125, "learning_rate": 0.0001233127353274868, "loss": 0.0536, "step": 9788 }, { "epoch": 2.41370808678501, "grad_norm": 0.39453125, "learning_rate": 0.00012327925783592523, "loss": 0.0496, "step": 9790 }, { "epoch": 2.4142011834319526, "grad_norm": 0.5625, "learning_rate": 0.00012324577758564164, "loss": 0.0491, "step": 9792 }, { "epoch": 2.4146942800788955, "grad_norm": 0.3671875, "learning_rate": 0.00012321229458060363, "loss": 0.0565, "step": 9794 }, { "epoch": 2.4151873767258385, "grad_norm": 0.353515625, "learning_rate": 0.0001231788088247791, "loss": 0.054, "step": 9796 }, { "epoch": 2.415680473372781, "grad_norm": 0.6171875, "learning_rate": 0.00012314532032213632, "loss": 0.0488, "step": 9798 }, { "epoch": 2.416173570019724, "grad_norm": 0.33984375, "learning_rate": 0.0001231118290766438, "loss": 0.0523, "step": 9800 }, { "epoch": 2.4166666666666665, "grad_norm": 0.431640625, "learning_rate": 0.00012307833509227057, "loss": 0.0506, "step": 9802 }, { "epoch": 2.4171597633136095, "grad_norm": 0.37109375, "learning_rate": 0.00012304483837298578, "loss": 0.0504, "step": 9804 }, { "epoch": 2.4176528599605525, "grad_norm": 0.43359375, "learning_rate": 0.00012301133892275894, "loss": 0.0462, "step": 9806 }, { "epoch": 2.418145956607495, "grad_norm": 0.369140625, "learning_rate": 0.00012297783674555996, "loss": 0.0518, "step": 9808 }, { "epoch": 2.418639053254438, "grad_norm": 0.376953125, "learning_rate": 0.00012294433184535905, "loss": 0.0445, "step": 9810 }, { "epoch": 2.4191321499013805, "grad_norm": 0.328125, "learning_rate": 0.0001229108242261267, "loss": 0.0442, "step": 9812 }, { "epoch": 2.4196252465483234, "grad_norm": 0.345703125, "learning_rate": 0.00012287731389183375, "loss": 0.0464, "step": 9814 }, { "epoch": 2.4201183431952664, "grad_norm": 0.4140625, "learning_rate": 0.00012284380084645139, "loss": 0.0419, "step": 9816 }, { "epoch": 2.420611439842209, "grad_norm": 0.32421875, "learning_rate": 0.00012281028509395108, "loss": 0.0505, "step": 9818 }, { "epoch": 2.421104536489152, "grad_norm": 0.361328125, "learning_rate": 0.00012277676663830463, "loss": 0.0487, "step": 9820 }, { "epoch": 2.421597633136095, "grad_norm": 0.38671875, "learning_rate": 0.00012274324548348418, "loss": 0.051, "step": 9822 }, { "epoch": 2.4220907297830374, "grad_norm": 0.435546875, "learning_rate": 0.00012270972163346214, "loss": 0.0509, "step": 9824 }, { "epoch": 2.4225838264299804, "grad_norm": 0.365234375, "learning_rate": 0.0001226761950922113, "loss": 0.0474, "step": 9826 }, { "epoch": 2.423076923076923, "grad_norm": 0.443359375, "learning_rate": 0.00012264266586370473, "loss": 0.0521, "step": 9828 }, { "epoch": 2.423570019723866, "grad_norm": 0.435546875, "learning_rate": 0.00012260913395191583, "loss": 0.0506, "step": 9830 }, { "epoch": 2.424063116370809, "grad_norm": 0.376953125, "learning_rate": 0.00012257559936081834, "loss": 0.0507, "step": 9832 }, { "epoch": 2.4245562130177514, "grad_norm": 0.373046875, "learning_rate": 0.00012254206209438625, "loss": 0.05, "step": 9834 }, { "epoch": 2.4250493096646943, "grad_norm": 0.3359375, "learning_rate": 0.000122508522156594, "loss": 0.0458, "step": 9836 }, { "epoch": 2.425542406311637, "grad_norm": 0.326171875, "learning_rate": 0.00012247497955141613, "loss": 0.052, "step": 9838 }, { "epoch": 2.42603550295858, "grad_norm": 0.30859375, "learning_rate": 0.00012244143428282776, "loss": 0.0535, "step": 9840 }, { "epoch": 2.426528599605523, "grad_norm": 0.482421875, "learning_rate": 0.00012240788635480408, "loss": 0.0499, "step": 9842 }, { "epoch": 2.4270216962524653, "grad_norm": 0.326171875, "learning_rate": 0.00012237433577132079, "loss": 0.0508, "step": 9844 }, { "epoch": 2.4275147928994083, "grad_norm": 0.314453125, "learning_rate": 0.00012234078253635377, "loss": 0.0465, "step": 9846 }, { "epoch": 2.4280078895463513, "grad_norm": 0.380859375, "learning_rate": 0.0001223072266538793, "loss": 0.0489, "step": 9848 }, { "epoch": 2.428500986193294, "grad_norm": 0.40234375, "learning_rate": 0.0001222736681278739, "loss": 0.0469, "step": 9850 }, { "epoch": 2.4289940828402368, "grad_norm": 0.435546875, "learning_rate": 0.00012224010696231452, "loss": 0.0475, "step": 9852 }, { "epoch": 2.4294871794871793, "grad_norm": 0.4296875, "learning_rate": 0.00012220654316117825, "loss": 0.0496, "step": 9854 }, { "epoch": 2.4299802761341223, "grad_norm": 0.400390625, "learning_rate": 0.00012217297672844263, "loss": 0.0479, "step": 9856 }, { "epoch": 2.4304733727810652, "grad_norm": 0.279296875, "learning_rate": 0.00012213940766808546, "loss": 0.0444, "step": 9858 }, { "epoch": 2.4309664694280078, "grad_norm": 0.380859375, "learning_rate": 0.0001221058359840849, "loss": 0.0472, "step": 9860 }, { "epoch": 2.4314595660749507, "grad_norm": 0.443359375, "learning_rate": 0.00012207226168041934, "loss": 0.0485, "step": 9862 }, { "epoch": 2.4319526627218933, "grad_norm": 0.291015625, "learning_rate": 0.00012203868476106755, "loss": 0.0424, "step": 9864 }, { "epoch": 2.4324457593688362, "grad_norm": 0.322265625, "learning_rate": 0.00012200510523000854, "loss": 0.0514, "step": 9866 }, { "epoch": 2.432938856015779, "grad_norm": 0.3671875, "learning_rate": 0.00012197152309122173, "loss": 0.0473, "step": 9868 }, { "epoch": 2.4334319526627217, "grad_norm": 0.318359375, "learning_rate": 0.00012193793834868674, "loss": 0.0507, "step": 9870 }, { "epoch": 2.4339250493096647, "grad_norm": 0.302734375, "learning_rate": 0.00012190435100638355, "loss": 0.0467, "step": 9872 }, { "epoch": 2.4344181459566077, "grad_norm": 0.36328125, "learning_rate": 0.00012187076106829249, "loss": 0.0536, "step": 9874 }, { "epoch": 2.43491124260355, "grad_norm": 0.34765625, "learning_rate": 0.00012183716853839416, "loss": 0.0529, "step": 9876 }, { "epoch": 2.435404339250493, "grad_norm": 0.3984375, "learning_rate": 0.00012180357342066939, "loss": 0.0478, "step": 9878 }, { "epoch": 2.435897435897436, "grad_norm": 0.48828125, "learning_rate": 0.00012176997571909945, "loss": 0.0515, "step": 9880 }, { "epoch": 2.4363905325443787, "grad_norm": 0.341796875, "learning_rate": 0.00012173637543766583, "loss": 0.0521, "step": 9882 }, { "epoch": 2.4368836291913216, "grad_norm": 0.40234375, "learning_rate": 0.00012170277258035038, "loss": 0.0501, "step": 9884 }, { "epoch": 2.437376725838264, "grad_norm": 0.408203125, "learning_rate": 0.00012166916715113521, "loss": 0.0453, "step": 9886 }, { "epoch": 2.437869822485207, "grad_norm": 0.4609375, "learning_rate": 0.00012163555915400273, "loss": 0.0436, "step": 9888 }, { "epoch": 2.43836291913215, "grad_norm": 0.34765625, "learning_rate": 0.00012160194859293572, "loss": 0.0475, "step": 9890 }, { "epoch": 2.4388560157790926, "grad_norm": 0.380859375, "learning_rate": 0.00012156833547191717, "loss": 0.049, "step": 9892 }, { "epoch": 2.4393491124260356, "grad_norm": 0.49609375, "learning_rate": 0.00012153471979493043, "loss": 0.0478, "step": 9894 }, { "epoch": 2.439842209072978, "grad_norm": 0.33203125, "learning_rate": 0.00012150110156595918, "loss": 0.0474, "step": 9896 }, { "epoch": 2.440335305719921, "grad_norm": 0.41796875, "learning_rate": 0.00012146748078898733, "loss": 0.0489, "step": 9898 }, { "epoch": 2.440828402366864, "grad_norm": 0.392578125, "learning_rate": 0.00012143385746799915, "loss": 0.0473, "step": 9900 }, { "epoch": 2.4413214990138066, "grad_norm": 0.4140625, "learning_rate": 0.00012140023160697922, "loss": 0.0456, "step": 9902 }, { "epoch": 2.4418145956607495, "grad_norm": 0.451171875, "learning_rate": 0.00012136660320991236, "loss": 0.0473, "step": 9904 }, { "epoch": 2.4423076923076925, "grad_norm": 0.341796875, "learning_rate": 0.00012133297228078369, "loss": 0.0492, "step": 9906 }, { "epoch": 2.442800788954635, "grad_norm": 0.349609375, "learning_rate": 0.00012129933882357871, "loss": 0.044, "step": 9908 }, { "epoch": 2.443293885601578, "grad_norm": 0.28515625, "learning_rate": 0.00012126570284228314, "loss": 0.0473, "step": 9910 }, { "epoch": 2.4437869822485205, "grad_norm": 0.390625, "learning_rate": 0.00012123206434088308, "loss": 0.0437, "step": 9912 }, { "epoch": 2.4442800788954635, "grad_norm": 0.4140625, "learning_rate": 0.00012119842332336482, "loss": 0.0489, "step": 9914 }, { "epoch": 2.4447731755424065, "grad_norm": 0.33203125, "learning_rate": 0.00012116477979371508, "loss": 0.0453, "step": 9916 }, { "epoch": 2.445266272189349, "grad_norm": 0.46875, "learning_rate": 0.00012113113375592076, "loss": 0.0497, "step": 9918 }, { "epoch": 2.445759368836292, "grad_norm": 0.3515625, "learning_rate": 0.00012109748521396908, "loss": 0.0475, "step": 9920 }, { "epoch": 2.4462524654832345, "grad_norm": 0.33203125, "learning_rate": 0.00012106383417184763, "loss": 0.0473, "step": 9922 }, { "epoch": 2.4467455621301775, "grad_norm": 0.3359375, "learning_rate": 0.00012103018063354421, "loss": 0.0432, "step": 9924 }, { "epoch": 2.4472386587771204, "grad_norm": 0.392578125, "learning_rate": 0.00012099652460304698, "loss": 0.0541, "step": 9926 }, { "epoch": 2.447731755424063, "grad_norm": 0.427734375, "learning_rate": 0.00012096286608434432, "loss": 0.0447, "step": 9928 }, { "epoch": 2.448224852071006, "grad_norm": 0.400390625, "learning_rate": 0.00012092920508142505, "loss": 0.0449, "step": 9930 }, { "epoch": 2.448717948717949, "grad_norm": 0.458984375, "learning_rate": 0.00012089554159827807, "loss": 0.0438, "step": 9932 }, { "epoch": 2.4492110453648914, "grad_norm": 0.66015625, "learning_rate": 0.0001208618756388928, "loss": 0.0472, "step": 9934 }, { "epoch": 2.4497041420118344, "grad_norm": 0.296875, "learning_rate": 0.00012082820720725876, "loss": 0.0474, "step": 9936 }, { "epoch": 2.4501972386587774, "grad_norm": 0.50390625, "learning_rate": 0.00012079453630736586, "loss": 0.0477, "step": 9938 }, { "epoch": 2.45069033530572, "grad_norm": 0.76171875, "learning_rate": 0.0001207608629432043, "loss": 0.0531, "step": 9940 }, { "epoch": 2.451183431952663, "grad_norm": 0.30859375, "learning_rate": 0.00012072718711876459, "loss": 0.0427, "step": 9942 }, { "epoch": 2.4516765285996054, "grad_norm": 0.423828125, "learning_rate": 0.00012069350883803749, "loss": 0.0483, "step": 9944 }, { "epoch": 2.4521696252465484, "grad_norm": 0.671875, "learning_rate": 0.00012065982810501404, "loss": 0.0532, "step": 9946 }, { "epoch": 2.452662721893491, "grad_norm": 0.3203125, "learning_rate": 0.00012062614492368561, "loss": 0.051, "step": 9948 }, { "epoch": 2.453155818540434, "grad_norm": 0.3046875, "learning_rate": 0.00012059245929804384, "loss": 0.0441, "step": 9950 }, { "epoch": 2.453648915187377, "grad_norm": 0.4921875, "learning_rate": 0.00012055877123208067, "loss": 0.0478, "step": 9952 }, { "epoch": 2.4541420118343193, "grad_norm": 0.36328125, "learning_rate": 0.00012052508072978832, "loss": 0.0487, "step": 9954 }, { "epoch": 2.4546351084812623, "grad_norm": 0.310546875, "learning_rate": 0.00012049138779515932, "loss": 0.0519, "step": 9956 }, { "epoch": 2.4551282051282053, "grad_norm": 0.357421875, "learning_rate": 0.00012045769243218647, "loss": 0.0468, "step": 9958 }, { "epoch": 2.455621301775148, "grad_norm": 0.36328125, "learning_rate": 0.00012042399464486282, "loss": 0.0467, "step": 9960 }, { "epoch": 2.456114398422091, "grad_norm": 0.3046875, "learning_rate": 0.00012039029443718177, "loss": 0.0446, "step": 9962 }, { "epoch": 2.4566074950690338, "grad_norm": 0.404296875, "learning_rate": 0.00012035659181313699, "loss": 0.0486, "step": 9964 }, { "epoch": 2.4571005917159763, "grad_norm": 0.45703125, "learning_rate": 0.00012032288677672244, "loss": 0.0489, "step": 9966 }, { "epoch": 2.4575936883629192, "grad_norm": 0.302734375, "learning_rate": 0.00012028917933193233, "loss": 0.0496, "step": 9968 }, { "epoch": 2.4580867850098618, "grad_norm": 0.326171875, "learning_rate": 0.00012025546948276116, "loss": 0.0498, "step": 9970 }, { "epoch": 2.4585798816568047, "grad_norm": 0.283203125, "learning_rate": 0.00012022175723320381, "loss": 0.0456, "step": 9972 }, { "epoch": 2.4590729783037477, "grad_norm": 0.33984375, "learning_rate": 0.00012018804258725532, "loss": 0.0425, "step": 9974 }, { "epoch": 2.4595660749506902, "grad_norm": 0.376953125, "learning_rate": 0.00012015432554891105, "loss": 0.047, "step": 9976 }, { "epoch": 2.460059171597633, "grad_norm": 0.37890625, "learning_rate": 0.00012012060612216664, "loss": 0.0457, "step": 9978 }, { "epoch": 2.4605522682445757, "grad_norm": 0.3359375, "learning_rate": 0.0001200868843110181, "loss": 0.0472, "step": 9980 }, { "epoch": 2.4610453648915187, "grad_norm": 0.34765625, "learning_rate": 0.00012005316011946162, "loss": 0.0452, "step": 9982 }, { "epoch": 2.4615384615384617, "grad_norm": 0.35546875, "learning_rate": 0.00012001943355149371, "loss": 0.0457, "step": 9984 }, { "epoch": 2.462031558185404, "grad_norm": 0.3515625, "learning_rate": 0.00011998570461111115, "loss": 0.048, "step": 9986 }, { "epoch": 2.462524654832347, "grad_norm": 0.32421875, "learning_rate": 0.00011995197330231101, "loss": 0.0495, "step": 9988 }, { "epoch": 2.46301775147929, "grad_norm": 0.390625, "learning_rate": 0.00011991823962909062, "loss": 0.045, "step": 9990 }, { "epoch": 2.4635108481262327, "grad_norm": 0.41015625, "learning_rate": 0.00011988450359544763, "loss": 0.0561, "step": 9992 }, { "epoch": 2.4640039447731756, "grad_norm": 0.412109375, "learning_rate": 0.00011985076520537995, "loss": 0.0471, "step": 9994 }, { "epoch": 2.464497041420118, "grad_norm": 0.400390625, "learning_rate": 0.00011981702446288575, "loss": 0.0506, "step": 9996 }, { "epoch": 2.464990138067061, "grad_norm": 0.451171875, "learning_rate": 0.0001197832813719635, "loss": 0.0489, "step": 9998 }, { "epoch": 2.465483234714004, "grad_norm": 0.46484375, "learning_rate": 0.00011974953593661198, "loss": 0.0502, "step": 10000 }, { "epoch": 2.4659763313609466, "grad_norm": 0.375, "learning_rate": 0.00011971578816083018, "loss": 0.0475, "step": 10002 }, { "epoch": 2.4664694280078896, "grad_norm": 0.345703125, "learning_rate": 0.00011968203804861737, "loss": 0.0492, "step": 10004 }, { "epoch": 2.466962524654832, "grad_norm": 0.390625, "learning_rate": 0.00011964828560397319, "loss": 0.0476, "step": 10006 }, { "epoch": 2.467455621301775, "grad_norm": 0.396484375, "learning_rate": 0.00011961453083089744, "loss": 0.0494, "step": 10008 }, { "epoch": 2.467948717948718, "grad_norm": 0.375, "learning_rate": 0.0001195807737333903, "loss": 0.0511, "step": 10010 }, { "epoch": 2.4684418145956606, "grad_norm": 0.40234375, "learning_rate": 0.00011954701431545214, "loss": 0.0525, "step": 10012 }, { "epoch": 2.4689349112426036, "grad_norm": 0.3359375, "learning_rate": 0.00011951325258108363, "loss": 0.0463, "step": 10014 }, { "epoch": 2.4694280078895465, "grad_norm": 0.322265625, "learning_rate": 0.00011947948853428576, "loss": 0.0439, "step": 10016 }, { "epoch": 2.469921104536489, "grad_norm": 0.328125, "learning_rate": 0.00011944572217905976, "loss": 0.049, "step": 10018 }, { "epoch": 2.470414201183432, "grad_norm": 0.369140625, "learning_rate": 0.00011941195351940708, "loss": 0.048, "step": 10020 }, { "epoch": 2.470907297830375, "grad_norm": 0.318359375, "learning_rate": 0.0001193781825593295, "loss": 0.0463, "step": 10022 }, { "epoch": 2.4714003944773175, "grad_norm": 0.314453125, "learning_rate": 0.00011934440930282913, "loss": 0.0472, "step": 10024 }, { "epoch": 2.4718934911242605, "grad_norm": 0.3046875, "learning_rate": 0.00011931063375390825, "loss": 0.0447, "step": 10026 }, { "epoch": 2.472386587771203, "grad_norm": 0.33984375, "learning_rate": 0.00011927685591656947, "loss": 0.0458, "step": 10028 }, { "epoch": 2.472879684418146, "grad_norm": 0.392578125, "learning_rate": 0.0001192430757948156, "loss": 0.05, "step": 10030 }, { "epoch": 2.4733727810650885, "grad_norm": 0.310546875, "learning_rate": 0.00011920929339264982, "loss": 0.0448, "step": 10032 }, { "epoch": 2.4738658777120315, "grad_norm": 0.412109375, "learning_rate": 0.00011917550871407554, "loss": 0.0456, "step": 10034 }, { "epoch": 2.4743589743589745, "grad_norm": 0.419921875, "learning_rate": 0.0001191417217630964, "loss": 0.0452, "step": 10036 }, { "epoch": 2.474852071005917, "grad_norm": 0.341796875, "learning_rate": 0.00011910793254371634, "loss": 0.049, "step": 10038 }, { "epoch": 2.47534516765286, "grad_norm": 0.337890625, "learning_rate": 0.00011907414105993961, "loss": 0.0486, "step": 10040 }, { "epoch": 2.475838264299803, "grad_norm": 0.38671875, "learning_rate": 0.00011904034731577068, "loss": 0.0438, "step": 10042 }, { "epoch": 2.4763313609467454, "grad_norm": 0.482421875, "learning_rate": 0.00011900655131521427, "loss": 0.0461, "step": 10044 }, { "epoch": 2.4768244575936884, "grad_norm": 0.42578125, "learning_rate": 0.0001189727530622754, "loss": 0.0475, "step": 10046 }, { "epoch": 2.4773175542406314, "grad_norm": 0.6484375, "learning_rate": 0.00011893895256095937, "loss": 0.0476, "step": 10048 }, { "epoch": 2.477810650887574, "grad_norm": 0.3515625, "learning_rate": 0.00011890514981527175, "loss": 0.0447, "step": 10050 }, { "epoch": 2.478303747534517, "grad_norm": 0.294921875, "learning_rate": 0.00011887134482921829, "loss": 0.0496, "step": 10052 }, { "epoch": 2.4787968441814594, "grad_norm": 0.546875, "learning_rate": 0.00011883753760680509, "loss": 0.0464, "step": 10054 }, { "epoch": 2.4792899408284024, "grad_norm": 0.384765625, "learning_rate": 0.00011880372815203855, "loss": 0.0436, "step": 10056 }, { "epoch": 2.4797830374753453, "grad_norm": 0.33203125, "learning_rate": 0.00011876991646892522, "loss": 0.0466, "step": 10058 }, { "epoch": 2.480276134122288, "grad_norm": 0.369140625, "learning_rate": 0.00011873610256147197, "loss": 0.0483, "step": 10060 }, { "epoch": 2.480769230769231, "grad_norm": 0.4375, "learning_rate": 0.00011870228643368598, "loss": 0.048, "step": 10062 }, { "epoch": 2.4812623274161734, "grad_norm": 0.56640625, "learning_rate": 0.0001186684680895746, "loss": 0.0498, "step": 10064 }, { "epoch": 2.4817554240631163, "grad_norm": 0.37890625, "learning_rate": 0.00011863464753314556, "loss": 0.0471, "step": 10066 }, { "epoch": 2.4822485207100593, "grad_norm": 0.4375, "learning_rate": 0.00011860082476840672, "loss": 0.046, "step": 10068 }, { "epoch": 2.482741617357002, "grad_norm": 0.39453125, "learning_rate": 0.00011856699979936632, "loss": 0.0425, "step": 10070 }, { "epoch": 2.483234714003945, "grad_norm": 0.318359375, "learning_rate": 0.00011853317263003274, "loss": 0.0485, "step": 10072 }, { "epoch": 2.4837278106508878, "grad_norm": 0.33984375, "learning_rate": 0.00011849934326441474, "loss": 0.0455, "step": 10074 }, { "epoch": 2.4842209072978303, "grad_norm": 0.447265625, "learning_rate": 0.00011846551170652127, "loss": 0.0456, "step": 10076 }, { "epoch": 2.4847140039447733, "grad_norm": 0.439453125, "learning_rate": 0.00011843167796036156, "loss": 0.047, "step": 10078 }, { "epoch": 2.485207100591716, "grad_norm": 0.384765625, "learning_rate": 0.00011839784202994513, "loss": 0.0467, "step": 10080 }, { "epoch": 2.4857001972386588, "grad_norm": 0.310546875, "learning_rate": 0.00011836400391928168, "loss": 0.0466, "step": 10082 }, { "epoch": 2.4861932938856017, "grad_norm": 0.4453125, "learning_rate": 0.00011833016363238127, "loss": 0.0413, "step": 10084 }, { "epoch": 2.4866863905325443, "grad_norm": 0.3203125, "learning_rate": 0.00011829632117325408, "loss": 0.0466, "step": 10086 }, { "epoch": 2.4871794871794872, "grad_norm": 0.33203125, "learning_rate": 0.0001182624765459107, "loss": 0.0474, "step": 10088 }, { "epoch": 2.4876725838264298, "grad_norm": 0.365234375, "learning_rate": 0.00011822862975436187, "loss": 0.0483, "step": 10090 }, { "epoch": 2.4881656804733727, "grad_norm": 0.38671875, "learning_rate": 0.00011819478080261863, "loss": 0.0447, "step": 10092 }, { "epoch": 2.4886587771203157, "grad_norm": 0.359375, "learning_rate": 0.0001181609296946923, "loss": 0.0478, "step": 10094 }, { "epoch": 2.489151873767258, "grad_norm": 0.37890625, "learning_rate": 0.00011812707643459441, "loss": 0.0461, "step": 10096 }, { "epoch": 2.489644970414201, "grad_norm": 0.318359375, "learning_rate": 0.00011809322102633673, "loss": 0.0518, "step": 10098 }, { "epoch": 2.490138067061144, "grad_norm": 0.46875, "learning_rate": 0.00011805936347393139, "loss": 0.0438, "step": 10100 }, { "epoch": 2.4906311637080867, "grad_norm": 0.376953125, "learning_rate": 0.0001180255037813906, "loss": 0.0462, "step": 10102 }, { "epoch": 2.4911242603550297, "grad_norm": 0.37890625, "learning_rate": 0.00011799164195272697, "loss": 0.0416, "step": 10104 }, { "epoch": 2.4916173570019726, "grad_norm": 0.40625, "learning_rate": 0.00011795777799195331, "loss": 0.0446, "step": 10106 }, { "epoch": 2.492110453648915, "grad_norm": 0.38671875, "learning_rate": 0.00011792391190308272, "loss": 0.0452, "step": 10108 }, { "epoch": 2.492603550295858, "grad_norm": 0.400390625, "learning_rate": 0.00011789004369012848, "loss": 0.046, "step": 10110 }, { "epoch": 2.4930966469428006, "grad_norm": 0.333984375, "learning_rate": 0.00011785617335710419, "loss": 0.0442, "step": 10112 }, { "epoch": 2.4935897435897436, "grad_norm": 0.337890625, "learning_rate": 0.00011782230090802365, "loss": 0.0479, "step": 10114 }, { "epoch": 2.4940828402366866, "grad_norm": 0.3203125, "learning_rate": 0.00011778842634690097, "loss": 0.038, "step": 10116 }, { "epoch": 2.494575936883629, "grad_norm": 0.318359375, "learning_rate": 0.00011775454967775042, "loss": 0.0451, "step": 10118 }, { "epoch": 2.495069033530572, "grad_norm": 0.322265625, "learning_rate": 0.00011772067090458661, "loss": 0.0455, "step": 10120 }, { "epoch": 2.4955621301775146, "grad_norm": 0.333984375, "learning_rate": 0.00011768679003142435, "loss": 0.0534, "step": 10122 }, { "epoch": 2.4960552268244576, "grad_norm": 0.330078125, "learning_rate": 0.00011765290706227877, "loss": 0.046, "step": 10124 }, { "epoch": 2.4965483234714005, "grad_norm": 0.384765625, "learning_rate": 0.00011761902200116508, "loss": 0.0508, "step": 10126 }, { "epoch": 2.497041420118343, "grad_norm": 0.41015625, "learning_rate": 0.00011758513485209892, "loss": 0.0466, "step": 10128 }, { "epoch": 2.497534516765286, "grad_norm": 0.296875, "learning_rate": 0.00011755124561909612, "loss": 0.0455, "step": 10130 }, { "epoch": 2.498027613412229, "grad_norm": 0.390625, "learning_rate": 0.00011751735430617272, "loss": 0.0477, "step": 10132 }, { "epoch": 2.4985207100591715, "grad_norm": 0.3828125, "learning_rate": 0.00011748346091734504, "loss": 0.0475, "step": 10134 }, { "epoch": 2.4990138067061145, "grad_norm": 0.34375, "learning_rate": 0.00011744956545662959, "loss": 0.0482, "step": 10136 }, { "epoch": 2.499506903353057, "grad_norm": 0.396484375, "learning_rate": 0.00011741566792804324, "loss": 0.056, "step": 10138 }, { "epoch": 2.5, "grad_norm": 0.486328125, "learning_rate": 0.00011738176833560299, "loss": 0.055, "step": 10140 }, { "epoch": 2.5004930966469425, "grad_norm": 0.35546875, "learning_rate": 0.00011734786668332614, "loss": 0.0466, "step": 10142 }, { "epoch": 2.5009861932938855, "grad_norm": 0.287109375, "learning_rate": 0.00011731396297523023, "loss": 0.0449, "step": 10144 }, { "epoch": 2.5014792899408285, "grad_norm": 0.365234375, "learning_rate": 0.00011728005721533305, "loss": 0.0478, "step": 10146 }, { "epoch": 2.501972386587771, "grad_norm": 0.462890625, "learning_rate": 0.0001172461494076526, "loss": 0.0497, "step": 10148 }, { "epoch": 2.502465483234714, "grad_norm": 0.458984375, "learning_rate": 0.00011721223955620718, "loss": 0.0463, "step": 10150 }, { "epoch": 2.502958579881657, "grad_norm": 0.361328125, "learning_rate": 0.00011717832766501526, "loss": 0.0463, "step": 10152 }, { "epoch": 2.5034516765285995, "grad_norm": 0.326171875, "learning_rate": 0.00011714441373809557, "loss": 0.0513, "step": 10154 }, { "epoch": 2.5039447731755424, "grad_norm": 0.41015625, "learning_rate": 0.00011711049777946716, "loss": 0.0443, "step": 10156 }, { "epoch": 2.5044378698224854, "grad_norm": 0.3046875, "learning_rate": 0.00011707657979314922, "loss": 0.0466, "step": 10158 }, { "epoch": 2.504930966469428, "grad_norm": 0.357421875, "learning_rate": 0.00011704265978316122, "loss": 0.0456, "step": 10160 }, { "epoch": 2.505424063116371, "grad_norm": 0.392578125, "learning_rate": 0.0001170087377535229, "loss": 0.0448, "step": 10162 }, { "epoch": 2.505917159763314, "grad_norm": 0.3046875, "learning_rate": 0.0001169748137082542, "loss": 0.049, "step": 10164 }, { "epoch": 2.5064102564102564, "grad_norm": 0.412109375, "learning_rate": 0.0001169408876513753, "loss": 0.0472, "step": 10166 }, { "epoch": 2.5069033530571994, "grad_norm": 0.384765625, "learning_rate": 0.00011690695958690661, "loss": 0.0483, "step": 10168 }, { "epoch": 2.507396449704142, "grad_norm": 0.28515625, "learning_rate": 0.0001168730295188688, "loss": 0.0487, "step": 10170 }, { "epoch": 2.507889546351085, "grad_norm": 0.3203125, "learning_rate": 0.00011683909745128281, "loss": 0.0435, "step": 10172 }, { "epoch": 2.5083826429980274, "grad_norm": 0.34375, "learning_rate": 0.00011680516338816974, "loss": 0.0448, "step": 10174 }, { "epoch": 2.5088757396449703, "grad_norm": 0.404296875, "learning_rate": 0.00011677122733355097, "loss": 0.0462, "step": 10176 }, { "epoch": 2.5093688362919133, "grad_norm": 0.337890625, "learning_rate": 0.00011673728929144815, "loss": 0.046, "step": 10178 }, { "epoch": 2.509861932938856, "grad_norm": 0.3125, "learning_rate": 0.00011670334926588309, "loss": 0.0397, "step": 10180 }, { "epoch": 2.510355029585799, "grad_norm": 0.345703125, "learning_rate": 0.00011666940726087791, "loss": 0.0448, "step": 10182 }, { "epoch": 2.510848126232742, "grad_norm": 0.419921875, "learning_rate": 0.00011663546328045488, "loss": 0.0484, "step": 10184 }, { "epoch": 2.5113412228796843, "grad_norm": 0.419921875, "learning_rate": 0.00011660151732863653, "loss": 0.0458, "step": 10186 }, { "epoch": 2.5118343195266273, "grad_norm": 0.44921875, "learning_rate": 0.00011656756940944573, "loss": 0.0475, "step": 10188 }, { "epoch": 2.5123274161735702, "grad_norm": 0.2890625, "learning_rate": 0.00011653361952690542, "loss": 0.0462, "step": 10190 }, { "epoch": 2.5128205128205128, "grad_norm": 0.310546875, "learning_rate": 0.00011649966768503893, "loss": 0.0469, "step": 10192 }, { "epoch": 2.5133136094674557, "grad_norm": 0.5078125, "learning_rate": 0.00011646571388786967, "loss": 0.0508, "step": 10194 }, { "epoch": 2.5138067061143983, "grad_norm": 0.50390625, "learning_rate": 0.00011643175813942137, "loss": 0.0474, "step": 10196 }, { "epoch": 2.5142998027613412, "grad_norm": 0.34765625, "learning_rate": 0.00011639780044371803, "loss": 0.0438, "step": 10198 }, { "epoch": 2.5147928994082838, "grad_norm": 0.4140625, "learning_rate": 0.00011636384080478376, "loss": 0.0468, "step": 10200 }, { "epoch": 2.5152859960552267, "grad_norm": 0.314453125, "learning_rate": 0.00011632987922664295, "loss": 0.0449, "step": 10202 }, { "epoch": 2.5157790927021697, "grad_norm": 0.375, "learning_rate": 0.00011629591571332033, "loss": 0.0444, "step": 10204 }, { "epoch": 2.5162721893491122, "grad_norm": 0.322265625, "learning_rate": 0.0001162619502688407, "loss": 0.0461, "step": 10206 }, { "epoch": 2.516765285996055, "grad_norm": 0.365234375, "learning_rate": 0.00011622798289722915, "loss": 0.0471, "step": 10208 }, { "epoch": 2.517258382642998, "grad_norm": 0.50390625, "learning_rate": 0.00011619401360251104, "loss": 0.0448, "step": 10210 }, { "epoch": 2.5177514792899407, "grad_norm": 0.34765625, "learning_rate": 0.0001161600423887119, "loss": 0.0455, "step": 10212 }, { "epoch": 2.5182445759368837, "grad_norm": 0.322265625, "learning_rate": 0.0001161260692598575, "loss": 0.0489, "step": 10214 }, { "epoch": 2.5187376725838266, "grad_norm": 0.380859375, "learning_rate": 0.00011609209421997386, "loss": 0.0436, "step": 10216 }, { "epoch": 2.519230769230769, "grad_norm": 0.333984375, "learning_rate": 0.00011605811727308722, "loss": 0.0478, "step": 10218 }, { "epoch": 2.519723865877712, "grad_norm": 0.365234375, "learning_rate": 0.00011602413842322404, "loss": 0.049, "step": 10220 }, { "epoch": 2.520216962524655, "grad_norm": 0.40625, "learning_rate": 0.00011599015767441097, "loss": 0.0469, "step": 10222 }, { "epoch": 2.5207100591715976, "grad_norm": 0.353515625, "learning_rate": 0.00011595617503067493, "loss": 0.046, "step": 10224 }, { "epoch": 2.5212031558185406, "grad_norm": 0.333984375, "learning_rate": 0.00011592219049604305, "loss": 0.0437, "step": 10226 }, { "epoch": 2.521696252465483, "grad_norm": 0.31640625, "learning_rate": 0.00011588820407454273, "loss": 0.0434, "step": 10228 }, { "epoch": 2.522189349112426, "grad_norm": 0.330078125, "learning_rate": 0.00011585421577020148, "loss": 0.0435, "step": 10230 }, { "epoch": 2.5226824457593686, "grad_norm": 0.279296875, "learning_rate": 0.0001158202255870472, "loss": 0.0465, "step": 10232 }, { "epoch": 2.5231755424063116, "grad_norm": 0.423828125, "learning_rate": 0.00011578623352910783, "loss": 0.0431, "step": 10234 }, { "epoch": 2.5236686390532546, "grad_norm": 0.353515625, "learning_rate": 0.00011575223960041164, "loss": 0.0476, "step": 10236 }, { "epoch": 2.524161735700197, "grad_norm": 0.36328125, "learning_rate": 0.00011571824380498711, "loss": 0.0492, "step": 10238 }, { "epoch": 2.52465483234714, "grad_norm": 0.498046875, "learning_rate": 0.00011568424614686293, "loss": 0.0473, "step": 10240 }, { "epoch": 2.525147928994083, "grad_norm": 0.283203125, "learning_rate": 0.000115650246630068, "loss": 0.0443, "step": 10242 }, { "epoch": 2.5256410256410255, "grad_norm": 0.341796875, "learning_rate": 0.00011561624525863148, "loss": 0.0474, "step": 10244 }, { "epoch": 2.5261341222879685, "grad_norm": 0.380859375, "learning_rate": 0.00011558224203658272, "loss": 0.0457, "step": 10246 }, { "epoch": 2.5266272189349115, "grad_norm": 0.373046875, "learning_rate": 0.00011554823696795127, "loss": 0.0477, "step": 10248 }, { "epoch": 2.527120315581854, "grad_norm": 0.376953125, "learning_rate": 0.00011551423005676691, "loss": 0.0486, "step": 10250 }, { "epoch": 2.527613412228797, "grad_norm": 0.34765625, "learning_rate": 0.00011548022130705969, "loss": 0.0427, "step": 10252 }, { "epoch": 2.5281065088757395, "grad_norm": 0.353515625, "learning_rate": 0.0001154462107228598, "loss": 0.0433, "step": 10254 }, { "epoch": 2.5285996055226825, "grad_norm": 0.3359375, "learning_rate": 0.00011541219830819769, "loss": 0.0461, "step": 10256 }, { "epoch": 2.529092702169625, "grad_norm": 0.357421875, "learning_rate": 0.00011537818406710405, "loss": 0.0465, "step": 10258 }, { "epoch": 2.529585798816568, "grad_norm": 0.302734375, "learning_rate": 0.00011534416800360975, "loss": 0.041, "step": 10260 }, { "epoch": 2.530078895463511, "grad_norm": 0.353515625, "learning_rate": 0.00011531015012174584, "loss": 0.0506, "step": 10262 }, { "epoch": 2.5305719921104535, "grad_norm": 0.353515625, "learning_rate": 0.00011527613042554368, "loss": 0.0486, "step": 10264 }, { "epoch": 2.5310650887573964, "grad_norm": 0.302734375, "learning_rate": 0.00011524210891903478, "loss": 0.0459, "step": 10266 }, { "epoch": 2.5315581854043394, "grad_norm": 0.3828125, "learning_rate": 0.00011520808560625085, "loss": 0.0508, "step": 10268 }, { "epoch": 2.532051282051282, "grad_norm": 0.38671875, "learning_rate": 0.00011517406049122385, "loss": 0.0462, "step": 10270 }, { "epoch": 2.532544378698225, "grad_norm": 0.306640625, "learning_rate": 0.00011514003357798597, "loss": 0.0457, "step": 10272 }, { "epoch": 2.533037475345168, "grad_norm": 0.35546875, "learning_rate": 0.00011510600487056961, "loss": 0.0451, "step": 10274 }, { "epoch": 2.5335305719921104, "grad_norm": 0.423828125, "learning_rate": 0.00011507197437300732, "loss": 0.044, "step": 10276 }, { "epoch": 2.5340236686390534, "grad_norm": 0.337890625, "learning_rate": 0.0001150379420893319, "loss": 0.0445, "step": 10278 }, { "epoch": 2.534516765285996, "grad_norm": 0.30078125, "learning_rate": 0.00011500390802357638, "loss": 0.0423, "step": 10280 }, { "epoch": 2.535009861932939, "grad_norm": 0.263671875, "learning_rate": 0.00011496987217977402, "loss": 0.0398, "step": 10282 }, { "epoch": 2.5355029585798814, "grad_norm": 0.3125, "learning_rate": 0.0001149358345619582, "loss": 0.0488, "step": 10284 }, { "epoch": 2.5359960552268244, "grad_norm": 0.4140625, "learning_rate": 0.0001149017951741626, "loss": 0.0464, "step": 10286 }, { "epoch": 2.5364891518737673, "grad_norm": 0.48828125, "learning_rate": 0.00011486775402042109, "loss": 0.0456, "step": 10288 }, { "epoch": 2.53698224852071, "grad_norm": 0.32421875, "learning_rate": 0.00011483371110476768, "loss": 0.0425, "step": 10290 }, { "epoch": 2.537475345167653, "grad_norm": 0.373046875, "learning_rate": 0.0001147996664312367, "loss": 0.0431, "step": 10292 }, { "epoch": 2.537968441814596, "grad_norm": 0.330078125, "learning_rate": 0.00011476562000386261, "loss": 0.0443, "step": 10294 }, { "epoch": 2.5384615384615383, "grad_norm": 0.345703125, "learning_rate": 0.0001147315718266801, "loss": 0.0438, "step": 10296 }, { "epoch": 2.5389546351084813, "grad_norm": 0.58203125, "learning_rate": 0.00011469752190372411, "loss": 0.0466, "step": 10298 }, { "epoch": 2.5394477317554243, "grad_norm": 0.37890625, "learning_rate": 0.0001146634702390297, "loss": 0.0486, "step": 10300 }, { "epoch": 2.539940828402367, "grad_norm": 0.435546875, "learning_rate": 0.0001146294168366322, "loss": 0.0474, "step": 10302 }, { "epoch": 2.5404339250493098, "grad_norm": 0.396484375, "learning_rate": 0.00011459536170056713, "loss": 0.0441, "step": 10304 }, { "epoch": 2.5409270216962527, "grad_norm": 0.373046875, "learning_rate": 0.0001145613048348702, "loss": 0.0493, "step": 10306 }, { "epoch": 2.5414201183431953, "grad_norm": 0.353515625, "learning_rate": 0.00011452724624357734, "loss": 0.0451, "step": 10308 }, { "epoch": 2.5419132149901382, "grad_norm": 0.330078125, "learning_rate": 0.00011449318593072466, "loss": 0.0456, "step": 10310 }, { "epoch": 2.5424063116370808, "grad_norm": 0.294921875, "learning_rate": 0.00011445912390034857, "loss": 0.0464, "step": 10312 }, { "epoch": 2.5428994082840237, "grad_norm": 0.306640625, "learning_rate": 0.00011442506015648558, "loss": 0.0484, "step": 10314 }, { "epoch": 2.5433925049309662, "grad_norm": 0.31640625, "learning_rate": 0.0001143909947031724, "loss": 0.0448, "step": 10316 }, { "epoch": 2.543885601577909, "grad_norm": 0.337890625, "learning_rate": 0.00011435692754444602, "loss": 0.0428, "step": 10318 }, { "epoch": 2.544378698224852, "grad_norm": 0.337890625, "learning_rate": 0.00011432285868434354, "loss": 0.0421, "step": 10320 }, { "epoch": 2.5448717948717947, "grad_norm": 0.302734375, "learning_rate": 0.00011428878812690234, "loss": 0.0425, "step": 10322 }, { "epoch": 2.5453648915187377, "grad_norm": 0.3203125, "learning_rate": 0.00011425471587615997, "loss": 0.0472, "step": 10324 }, { "epoch": 2.5458579881656807, "grad_norm": 0.326171875, "learning_rate": 0.00011422064193615418, "loss": 0.0404, "step": 10326 }, { "epoch": 2.546351084812623, "grad_norm": 0.345703125, "learning_rate": 0.00011418656631092295, "loss": 0.0486, "step": 10328 }, { "epoch": 2.546844181459566, "grad_norm": 0.32421875, "learning_rate": 0.00011415248900450437, "loss": 0.0471, "step": 10330 }, { "epoch": 2.547337278106509, "grad_norm": 0.333984375, "learning_rate": 0.00011411841002093686, "loss": 0.0512, "step": 10332 }, { "epoch": 2.5478303747534516, "grad_norm": 0.380859375, "learning_rate": 0.0001140843293642589, "loss": 0.0439, "step": 10334 }, { "epoch": 2.5483234714003946, "grad_norm": 0.3671875, "learning_rate": 0.00011405024703850929, "loss": 0.0492, "step": 10336 }, { "epoch": 2.548816568047337, "grad_norm": 0.419921875, "learning_rate": 0.00011401616304772696, "loss": 0.046, "step": 10338 }, { "epoch": 2.54930966469428, "grad_norm": 0.3203125, "learning_rate": 0.00011398207739595104, "loss": 0.0436, "step": 10340 }, { "epoch": 2.5498027613412226, "grad_norm": 0.345703125, "learning_rate": 0.00011394799008722088, "loss": 0.0456, "step": 10342 }, { "epoch": 2.5502958579881656, "grad_norm": 0.353515625, "learning_rate": 0.00011391390112557605, "loss": 0.0498, "step": 10344 }, { "epoch": 2.5507889546351086, "grad_norm": 0.349609375, "learning_rate": 0.0001138798105150562, "loss": 0.0441, "step": 10346 }, { "epoch": 2.551282051282051, "grad_norm": 0.298828125, "learning_rate": 0.00011384571825970138, "loss": 0.042, "step": 10348 }, { "epoch": 2.551775147928994, "grad_norm": 0.296875, "learning_rate": 0.0001138116243635516, "loss": 0.0448, "step": 10350 }, { "epoch": 2.552268244575937, "grad_norm": 0.361328125, "learning_rate": 0.0001137775288306472, "loss": 0.0464, "step": 10352 }, { "epoch": 2.5527613412228796, "grad_norm": 0.49609375, "learning_rate": 0.00011374343166502873, "loss": 0.0446, "step": 10354 }, { "epoch": 2.5532544378698225, "grad_norm": 0.337890625, "learning_rate": 0.00011370933287073687, "loss": 0.0497, "step": 10356 }, { "epoch": 2.5537475345167655, "grad_norm": 0.31640625, "learning_rate": 0.00011367523245181256, "loss": 0.0425, "step": 10358 }, { "epoch": 2.554240631163708, "grad_norm": 0.28515625, "learning_rate": 0.0001136411304122968, "loss": 0.0433, "step": 10360 }, { "epoch": 2.554733727810651, "grad_norm": 0.294921875, "learning_rate": 0.00011360702675623094, "loss": 0.0432, "step": 10362 }, { "epoch": 2.5552268244575935, "grad_norm": 0.33203125, "learning_rate": 0.00011357292148765646, "loss": 0.0468, "step": 10364 }, { "epoch": 2.5557199211045365, "grad_norm": 0.294921875, "learning_rate": 0.00011353881461061498, "loss": 0.0454, "step": 10366 }, { "epoch": 2.556213017751479, "grad_norm": 0.28515625, "learning_rate": 0.00011350470612914836, "loss": 0.0449, "step": 10368 }, { "epoch": 2.556706114398422, "grad_norm": 0.392578125, "learning_rate": 0.00011347059604729869, "loss": 0.0471, "step": 10370 }, { "epoch": 2.557199211045365, "grad_norm": 0.34375, "learning_rate": 0.0001134364843691082, "loss": 0.045, "step": 10372 }, { "epoch": 2.5576923076923075, "grad_norm": 0.310546875, "learning_rate": 0.00011340237109861924, "loss": 0.0455, "step": 10374 }, { "epoch": 2.5581854043392505, "grad_norm": 0.306640625, "learning_rate": 0.00011336825623987448, "loss": 0.0474, "step": 10376 }, { "epoch": 2.5586785009861934, "grad_norm": 0.30859375, "learning_rate": 0.00011333413979691674, "loss": 0.0438, "step": 10378 }, { "epoch": 2.559171597633136, "grad_norm": 0.330078125, "learning_rate": 0.000113300021773789, "loss": 0.0484, "step": 10380 }, { "epoch": 2.559664694280079, "grad_norm": 0.314453125, "learning_rate": 0.00011326590217453439, "loss": 0.0418, "step": 10382 }, { "epoch": 2.560157790927022, "grad_norm": 0.4296875, "learning_rate": 0.00011323178100319632, "loss": 0.0433, "step": 10384 }, { "epoch": 2.5606508875739644, "grad_norm": 0.330078125, "learning_rate": 0.00011319765826381834, "loss": 0.0484, "step": 10386 }, { "epoch": 2.5611439842209074, "grad_norm": 0.361328125, "learning_rate": 0.00011316353396044415, "loss": 0.0464, "step": 10388 }, { "epoch": 2.5616370808678504, "grad_norm": 0.310546875, "learning_rate": 0.0001131294080971177, "loss": 0.0428, "step": 10390 }, { "epoch": 2.562130177514793, "grad_norm": 0.419921875, "learning_rate": 0.00011309528067788306, "loss": 0.0426, "step": 10392 }, { "epoch": 2.562623274161736, "grad_norm": 0.294921875, "learning_rate": 0.00011306115170678455, "loss": 0.0471, "step": 10394 }, { "epoch": 2.5631163708086784, "grad_norm": 0.4140625, "learning_rate": 0.00011302702118786667, "loss": 0.0485, "step": 10396 }, { "epoch": 2.5636094674556213, "grad_norm": 0.375, "learning_rate": 0.00011299288912517403, "loss": 0.038, "step": 10398 }, { "epoch": 2.564102564102564, "grad_norm": 0.322265625, "learning_rate": 0.0001129587555227515, "loss": 0.0432, "step": 10400 }, { "epoch": 2.564595660749507, "grad_norm": 0.359375, "learning_rate": 0.00011292462038464407, "loss": 0.0438, "step": 10402 }, { "epoch": 2.56508875739645, "grad_norm": 0.28125, "learning_rate": 0.00011289048371489694, "loss": 0.0459, "step": 10404 }, { "epoch": 2.5655818540433923, "grad_norm": 0.396484375, "learning_rate": 0.00011285634551755554, "loss": 0.0424, "step": 10406 }, { "epoch": 2.5660749506903353, "grad_norm": 0.310546875, "learning_rate": 0.00011282220579666542, "loss": 0.044, "step": 10408 }, { "epoch": 2.5665680473372783, "grad_norm": 0.34765625, "learning_rate": 0.0001127880645562723, "loss": 0.044, "step": 10410 }, { "epoch": 2.567061143984221, "grad_norm": 0.3515625, "learning_rate": 0.00011275392180042216, "loss": 0.0446, "step": 10412 }, { "epoch": 2.5675542406311638, "grad_norm": 0.275390625, "learning_rate": 0.0001127197775331611, "loss": 0.0447, "step": 10414 }, { "epoch": 2.5680473372781067, "grad_norm": 0.345703125, "learning_rate": 0.00011268563175853535, "loss": 0.0455, "step": 10416 }, { "epoch": 2.5685404339250493, "grad_norm": 0.296875, "learning_rate": 0.00011265148448059139, "loss": 0.0456, "step": 10418 }, { "epoch": 2.5690335305719922, "grad_norm": 0.3984375, "learning_rate": 0.00011261733570337591, "loss": 0.0429, "step": 10420 }, { "epoch": 2.5695266272189348, "grad_norm": 0.3359375, "learning_rate": 0.00011258318543093569, "loss": 0.0434, "step": 10422 }, { "epoch": 2.5700197238658777, "grad_norm": 0.326171875, "learning_rate": 0.00011254903366731775, "loss": 0.0474, "step": 10424 }, { "epoch": 2.5705128205128203, "grad_norm": 0.43359375, "learning_rate": 0.00011251488041656928, "loss": 0.0476, "step": 10426 }, { "epoch": 2.5710059171597632, "grad_norm": 0.53515625, "learning_rate": 0.00011248072568273755, "loss": 0.0516, "step": 10428 }, { "epoch": 2.571499013806706, "grad_norm": 0.326171875, "learning_rate": 0.0001124465694698702, "loss": 0.0516, "step": 10430 }, { "epoch": 2.5719921104536487, "grad_norm": 0.455078125, "learning_rate": 0.00011241241178201485, "loss": 0.0433, "step": 10432 }, { "epoch": 2.5724852071005917, "grad_norm": 0.390625, "learning_rate": 0.0001123782526232194, "loss": 0.0452, "step": 10434 }, { "epoch": 2.5729783037475347, "grad_norm": 0.365234375, "learning_rate": 0.00011234409199753189, "loss": 0.0454, "step": 10436 }, { "epoch": 2.573471400394477, "grad_norm": 0.33984375, "learning_rate": 0.00011230992990900055, "loss": 0.0473, "step": 10438 }, { "epoch": 2.57396449704142, "grad_norm": 0.38671875, "learning_rate": 0.00011227576636167385, "loss": 0.0447, "step": 10440 }, { "epoch": 2.574457593688363, "grad_norm": 0.337890625, "learning_rate": 0.00011224160135960024, "loss": 0.047, "step": 10442 }, { "epoch": 2.5749506903353057, "grad_norm": 0.357421875, "learning_rate": 0.00011220743490682852, "loss": 0.0473, "step": 10444 }, { "epoch": 2.5754437869822486, "grad_norm": 0.3046875, "learning_rate": 0.00011217326700740765, "loss": 0.0419, "step": 10446 }, { "epoch": 2.5759368836291916, "grad_norm": 0.326171875, "learning_rate": 0.00011213909766538663, "loss": 0.0458, "step": 10448 }, { "epoch": 2.576429980276134, "grad_norm": 0.34765625, "learning_rate": 0.00011210492688481477, "loss": 0.0465, "step": 10450 }, { "epoch": 2.5769230769230766, "grad_norm": 0.365234375, "learning_rate": 0.00011207075466974148, "loss": 0.0488, "step": 10452 }, { "epoch": 2.5774161735700196, "grad_norm": 0.322265625, "learning_rate": 0.00011203658102421642, "loss": 0.0419, "step": 10454 }, { "epoch": 2.5779092702169626, "grad_norm": 0.427734375, "learning_rate": 0.00011200240595228923, "loss": 0.0486, "step": 10456 }, { "epoch": 2.578402366863905, "grad_norm": 0.330078125, "learning_rate": 0.00011196822945800995, "loss": 0.0503, "step": 10458 }, { "epoch": 2.578895463510848, "grad_norm": 0.296875, "learning_rate": 0.00011193405154542867, "loss": 0.0483, "step": 10460 }, { "epoch": 2.579388560157791, "grad_norm": 0.34375, "learning_rate": 0.00011189987221859566, "loss": 0.0494, "step": 10462 }, { "epoch": 2.5798816568047336, "grad_norm": 0.36328125, "learning_rate": 0.00011186569148156134, "loss": 0.0434, "step": 10464 }, { "epoch": 2.5803747534516766, "grad_norm": 0.310546875, "learning_rate": 0.00011183150933837632, "loss": 0.0482, "step": 10466 }, { "epoch": 2.5808678500986195, "grad_norm": 0.326171875, "learning_rate": 0.00011179732579309141, "loss": 0.0448, "step": 10468 }, { "epoch": 2.581360946745562, "grad_norm": 0.357421875, "learning_rate": 0.0001117631408497575, "loss": 0.0478, "step": 10470 }, { "epoch": 2.581854043392505, "grad_norm": 0.310546875, "learning_rate": 0.00011172895451242571, "loss": 0.0431, "step": 10472 }, { "epoch": 2.582347140039448, "grad_norm": 0.330078125, "learning_rate": 0.00011169476678514734, "loss": 0.0456, "step": 10474 }, { "epoch": 2.5828402366863905, "grad_norm": 0.296875, "learning_rate": 0.00011166057767197383, "loss": 0.0438, "step": 10476 }, { "epoch": 2.5833333333333335, "grad_norm": 0.32421875, "learning_rate": 0.00011162638717695672, "loss": 0.0416, "step": 10478 }, { "epoch": 2.583826429980276, "grad_norm": 0.474609375, "learning_rate": 0.00011159219530414787, "loss": 0.0427, "step": 10480 }, { "epoch": 2.584319526627219, "grad_norm": 0.361328125, "learning_rate": 0.00011155800205759915, "loss": 0.047, "step": 10482 }, { "epoch": 2.5848126232741615, "grad_norm": 0.4921875, "learning_rate": 0.00011152380744136261, "loss": 0.0494, "step": 10484 }, { "epoch": 2.5853057199211045, "grad_norm": 0.359375, "learning_rate": 0.00011148961145949055, "loss": 0.0406, "step": 10486 }, { "epoch": 2.5857988165680474, "grad_norm": 0.33984375, "learning_rate": 0.00011145541411603539, "loss": 0.0459, "step": 10488 }, { "epoch": 2.58629191321499, "grad_norm": 0.44140625, "learning_rate": 0.0001114212154150497, "loss": 0.0511, "step": 10490 }, { "epoch": 2.586785009861933, "grad_norm": 0.466796875, "learning_rate": 0.00011138701536058624, "loss": 0.05, "step": 10492 }, { "epoch": 2.587278106508876, "grad_norm": 0.34375, "learning_rate": 0.00011135281395669787, "loss": 0.0422, "step": 10494 }, { "epoch": 2.5877712031558184, "grad_norm": 0.298828125, "learning_rate": 0.00011131861120743767, "loss": 0.0459, "step": 10496 }, { "epoch": 2.5882642998027614, "grad_norm": 0.353515625, "learning_rate": 0.00011128440711685883, "loss": 0.047, "step": 10498 }, { "epoch": 2.5887573964497044, "grad_norm": 0.328125, "learning_rate": 0.00011125020168901474, "loss": 0.0416, "step": 10500 }, { "epoch": 2.589250493096647, "grad_norm": 0.36328125, "learning_rate": 0.00011121599492795896, "loss": 0.0494, "step": 10502 }, { "epoch": 2.58974358974359, "grad_norm": 0.30859375, "learning_rate": 0.00011118178683774511, "loss": 0.0471, "step": 10504 }, { "epoch": 2.5902366863905324, "grad_norm": 0.314453125, "learning_rate": 0.00011114757742242712, "loss": 0.0475, "step": 10506 }, { "epoch": 2.5907297830374754, "grad_norm": 0.345703125, "learning_rate": 0.00011111336668605899, "loss": 0.0441, "step": 10508 }, { "epoch": 2.591222879684418, "grad_norm": 0.2734375, "learning_rate": 0.00011107915463269483, "loss": 0.04, "step": 10510 }, { "epoch": 2.591715976331361, "grad_norm": 0.3125, "learning_rate": 0.000111044941266389, "loss": 0.0448, "step": 10512 }, { "epoch": 2.592209072978304, "grad_norm": 0.376953125, "learning_rate": 0.00011101072659119595, "loss": 0.0478, "step": 10514 }, { "epoch": 2.5927021696252464, "grad_norm": 0.283203125, "learning_rate": 0.00011097651061117033, "loss": 0.0442, "step": 10516 }, { "epoch": 2.5931952662721893, "grad_norm": 0.3828125, "learning_rate": 0.0001109422933303669, "loss": 0.0466, "step": 10518 }, { "epoch": 2.5936883629191323, "grad_norm": 0.400390625, "learning_rate": 0.00011090807475284064, "loss": 0.0482, "step": 10520 }, { "epoch": 2.594181459566075, "grad_norm": 0.30078125, "learning_rate": 0.00011087385488264663, "loss": 0.0461, "step": 10522 }, { "epoch": 2.594674556213018, "grad_norm": 0.419921875, "learning_rate": 0.00011083963372384007, "loss": 0.0435, "step": 10524 }, { "epoch": 2.5951676528599608, "grad_norm": 0.3359375, "learning_rate": 0.00011080541128047641, "loss": 0.0484, "step": 10526 }, { "epoch": 2.5956607495069033, "grad_norm": 0.44140625, "learning_rate": 0.0001107711875566112, "loss": 0.0478, "step": 10528 }, { "epoch": 2.5961538461538463, "grad_norm": 0.283203125, "learning_rate": 0.0001107369625563001, "loss": 0.044, "step": 10530 }, { "epoch": 2.5966469428007892, "grad_norm": 0.318359375, "learning_rate": 0.000110702736283599, "loss": 0.048, "step": 10532 }, { "epoch": 2.5971400394477318, "grad_norm": 0.287109375, "learning_rate": 0.00011066850874256387, "loss": 0.0447, "step": 10534 }, { "epoch": 2.5976331360946747, "grad_norm": 0.3515625, "learning_rate": 0.00011063427993725092, "loss": 0.0454, "step": 10536 }, { "epoch": 2.5981262327416172, "grad_norm": 0.291015625, "learning_rate": 0.00011060004987171642, "loss": 0.0408, "step": 10538 }, { "epoch": 2.59861932938856, "grad_norm": 0.388671875, "learning_rate": 0.00011056581855001683, "loss": 0.0433, "step": 10540 }, { "epoch": 2.5991124260355027, "grad_norm": 0.318359375, "learning_rate": 0.00011053158597620872, "loss": 0.0435, "step": 10542 }, { "epoch": 2.5996055226824457, "grad_norm": 0.375, "learning_rate": 0.00011049735215434889, "loss": 0.0417, "step": 10544 }, { "epoch": 2.6000986193293887, "grad_norm": 0.462890625, "learning_rate": 0.00011046311708849424, "loss": 0.0456, "step": 10546 }, { "epoch": 2.600591715976331, "grad_norm": 0.34765625, "learning_rate": 0.00011042888078270177, "loss": 0.0458, "step": 10548 }, { "epoch": 2.601084812623274, "grad_norm": 0.3125, "learning_rate": 0.00011039464324102873, "loss": 0.0461, "step": 10550 }, { "epoch": 2.601577909270217, "grad_norm": 0.333984375, "learning_rate": 0.00011036040446753239, "loss": 0.0451, "step": 10552 }, { "epoch": 2.6020710059171597, "grad_norm": 0.30859375, "learning_rate": 0.00011032616446627029, "loss": 0.0477, "step": 10554 }, { "epoch": 2.6025641025641026, "grad_norm": 0.314453125, "learning_rate": 0.00011029192324130004, "loss": 0.0466, "step": 10556 }, { "epoch": 2.6030571992110456, "grad_norm": 0.341796875, "learning_rate": 0.00011025768079667942, "loss": 0.05, "step": 10558 }, { "epoch": 2.603550295857988, "grad_norm": 0.353515625, "learning_rate": 0.00011022343713646637, "loss": 0.0457, "step": 10560 }, { "epoch": 2.604043392504931, "grad_norm": 0.27734375, "learning_rate": 0.00011018919226471896, "loss": 0.0449, "step": 10562 }, { "epoch": 2.6045364891518736, "grad_norm": 0.306640625, "learning_rate": 0.00011015494618549533, "loss": 0.0438, "step": 10564 }, { "epoch": 2.6050295857988166, "grad_norm": 0.330078125, "learning_rate": 0.00011012069890285392, "loss": 0.0459, "step": 10566 }, { "epoch": 2.605522682445759, "grad_norm": 0.32421875, "learning_rate": 0.00011008645042085317, "loss": 0.0459, "step": 10568 }, { "epoch": 2.606015779092702, "grad_norm": 0.306640625, "learning_rate": 0.00011005220074355171, "loss": 0.0482, "step": 10570 }, { "epoch": 2.606508875739645, "grad_norm": 0.33984375, "learning_rate": 0.00011001794987500835, "loss": 0.0443, "step": 10572 }, { "epoch": 2.6070019723865876, "grad_norm": 0.3046875, "learning_rate": 0.000109983697819282, "loss": 0.0433, "step": 10574 }, { "epoch": 2.6074950690335306, "grad_norm": 0.337890625, "learning_rate": 0.00010994944458043173, "loss": 0.0443, "step": 10576 }, { "epoch": 2.6079881656804735, "grad_norm": 0.30078125, "learning_rate": 0.0001099151901625167, "loss": 0.0444, "step": 10578 }, { "epoch": 2.608481262327416, "grad_norm": 0.322265625, "learning_rate": 0.00010988093456959631, "loss": 0.0454, "step": 10580 }, { "epoch": 2.608974358974359, "grad_norm": 0.287109375, "learning_rate": 0.00010984667780572999, "loss": 0.0424, "step": 10582 }, { "epoch": 2.609467455621302, "grad_norm": 0.34765625, "learning_rate": 0.00010981241987497734, "loss": 0.0463, "step": 10584 }, { "epoch": 2.6099605522682445, "grad_norm": 0.412109375, "learning_rate": 0.00010977816078139817, "loss": 0.0461, "step": 10586 }, { "epoch": 2.6104536489151875, "grad_norm": 0.33203125, "learning_rate": 0.00010974390052905237, "loss": 0.0486, "step": 10588 }, { "epoch": 2.61094674556213, "grad_norm": 0.27734375, "learning_rate": 0.00010970963912199997, "loss": 0.0398, "step": 10590 }, { "epoch": 2.611439842209073, "grad_norm": 0.345703125, "learning_rate": 0.00010967537656430109, "loss": 0.0438, "step": 10592 }, { "epoch": 2.6119329388560155, "grad_norm": 0.302734375, "learning_rate": 0.00010964111286001608, "loss": 0.0458, "step": 10594 }, { "epoch": 2.6124260355029585, "grad_norm": 0.330078125, "learning_rate": 0.00010960684801320536, "loss": 0.0474, "step": 10596 }, { "epoch": 2.6129191321499015, "grad_norm": 0.318359375, "learning_rate": 0.0001095725820279295, "loss": 0.0441, "step": 10598 }, { "epoch": 2.613412228796844, "grad_norm": 0.357421875, "learning_rate": 0.00010953831490824924, "loss": 0.0451, "step": 10600 }, { "epoch": 2.613905325443787, "grad_norm": 0.341796875, "learning_rate": 0.0001095040466582254, "loss": 0.0421, "step": 10602 }, { "epoch": 2.61439842209073, "grad_norm": 0.421875, "learning_rate": 0.00010946977728191896, "loss": 0.0473, "step": 10604 }, { "epoch": 2.6148915187376724, "grad_norm": 0.30078125, "learning_rate": 0.00010943550678339108, "loss": 0.0441, "step": 10606 }, { "epoch": 2.6153846153846154, "grad_norm": 0.341796875, "learning_rate": 0.00010940123516670293, "loss": 0.0466, "step": 10608 }, { "epoch": 2.6158777120315584, "grad_norm": 0.27734375, "learning_rate": 0.00010936696243591591, "loss": 0.0418, "step": 10610 }, { "epoch": 2.616370808678501, "grad_norm": 0.337890625, "learning_rate": 0.00010933268859509158, "loss": 0.0467, "step": 10612 }, { "epoch": 2.616863905325444, "grad_norm": 0.34375, "learning_rate": 0.00010929841364829152, "loss": 0.0422, "step": 10614 }, { "epoch": 2.617357001972387, "grad_norm": 0.3671875, "learning_rate": 0.00010926413759957754, "loss": 0.0422, "step": 10616 }, { "epoch": 2.6178500986193294, "grad_norm": 0.365234375, "learning_rate": 0.00010922986045301152, "loss": 0.0442, "step": 10618 }, { "epoch": 2.6183431952662723, "grad_norm": 0.330078125, "learning_rate": 0.00010919558221265552, "loss": 0.0475, "step": 10620 }, { "epoch": 2.618836291913215, "grad_norm": 0.314453125, "learning_rate": 0.00010916130288257166, "loss": 0.0441, "step": 10622 }, { "epoch": 2.619329388560158, "grad_norm": 0.333984375, "learning_rate": 0.00010912702246682227, "loss": 0.0393, "step": 10624 }, { "epoch": 2.6198224852071004, "grad_norm": 0.34765625, "learning_rate": 0.00010909274096946976, "loss": 0.0425, "step": 10626 }, { "epoch": 2.6203155818540433, "grad_norm": 0.3984375, "learning_rate": 0.00010905845839457669, "loss": 0.0436, "step": 10628 }, { "epoch": 2.6208086785009863, "grad_norm": 0.392578125, "learning_rate": 0.0001090241747462057, "loss": 0.0505, "step": 10630 }, { "epoch": 2.621301775147929, "grad_norm": 0.63671875, "learning_rate": 0.00010898989002841964, "loss": 0.043, "step": 10632 }, { "epoch": 2.621794871794872, "grad_norm": 0.36328125, "learning_rate": 0.00010895560424528142, "loss": 0.0503, "step": 10634 }, { "epoch": 2.6222879684418148, "grad_norm": 0.328125, "learning_rate": 0.00010892131740085408, "loss": 0.045, "step": 10636 }, { "epoch": 2.6227810650887573, "grad_norm": 0.3984375, "learning_rate": 0.00010888702949920081, "loss": 0.0448, "step": 10638 }, { "epoch": 2.6232741617357003, "grad_norm": 0.314453125, "learning_rate": 0.00010885274054438495, "loss": 0.0381, "step": 10640 }, { "epoch": 2.6237672583826432, "grad_norm": 0.30859375, "learning_rate": 0.00010881845054046988, "loss": 0.0436, "step": 10642 }, { "epoch": 2.6242603550295858, "grad_norm": 0.36328125, "learning_rate": 0.00010878415949151922, "loss": 0.0457, "step": 10644 }, { "epoch": 2.6247534516765287, "grad_norm": 0.349609375, "learning_rate": 0.00010874986740159658, "loss": 0.0418, "step": 10646 }, { "epoch": 2.6252465483234713, "grad_norm": 0.33203125, "learning_rate": 0.00010871557427476583, "loss": 0.0396, "step": 10648 }, { "epoch": 2.6257396449704142, "grad_norm": 0.3046875, "learning_rate": 0.00010868128011509087, "loss": 0.0463, "step": 10650 }, { "epoch": 2.6262327416173568, "grad_norm": 0.423828125, "learning_rate": 0.00010864698492663572, "loss": 0.043, "step": 10652 }, { "epoch": 2.6267258382642997, "grad_norm": 0.27734375, "learning_rate": 0.00010861268871346458, "loss": 0.0458, "step": 10654 }, { "epoch": 2.6272189349112427, "grad_norm": 0.3671875, "learning_rate": 0.00010857839147964173, "loss": 0.0448, "step": 10656 }, { "epoch": 2.6277120315581852, "grad_norm": 0.330078125, "learning_rate": 0.00010854409322923161, "loss": 0.0427, "step": 10658 }, { "epoch": 2.628205128205128, "grad_norm": 0.41796875, "learning_rate": 0.00010850979396629877, "loss": 0.0446, "step": 10660 }, { "epoch": 2.628698224852071, "grad_norm": 0.3125, "learning_rate": 0.00010847549369490781, "loss": 0.0486, "step": 10662 }, { "epoch": 2.6291913214990137, "grad_norm": 0.328125, "learning_rate": 0.00010844119241912352, "loss": 0.0482, "step": 10664 }, { "epoch": 2.6296844181459567, "grad_norm": 0.337890625, "learning_rate": 0.00010840689014301078, "loss": 0.046, "step": 10666 }, { "epoch": 2.6301775147928996, "grad_norm": 0.29296875, "learning_rate": 0.00010837258687063463, "loss": 0.044, "step": 10668 }, { "epoch": 2.630670611439842, "grad_norm": 0.27734375, "learning_rate": 0.00010833828260606021, "loss": 0.0418, "step": 10670 }, { "epoch": 2.631163708086785, "grad_norm": 0.314453125, "learning_rate": 0.00010830397735335272, "loss": 0.0432, "step": 10672 }, { "epoch": 2.6316568047337277, "grad_norm": 0.32421875, "learning_rate": 0.00010826967111657757, "loss": 0.0459, "step": 10674 }, { "epoch": 2.6321499013806706, "grad_norm": 0.29296875, "learning_rate": 0.0001082353638998002, "loss": 0.0445, "step": 10676 }, { "epoch": 2.632642998027613, "grad_norm": 0.3203125, "learning_rate": 0.00010820105570708627, "loss": 0.0455, "step": 10678 }, { "epoch": 2.633136094674556, "grad_norm": 0.328125, "learning_rate": 0.00010816674654250141, "loss": 0.0461, "step": 10680 }, { "epoch": 2.633629191321499, "grad_norm": 0.365234375, "learning_rate": 0.0001081324364101115, "loss": 0.0395, "step": 10682 }, { "epoch": 2.6341222879684416, "grad_norm": 0.416015625, "learning_rate": 0.00010809812531398247, "loss": 0.0442, "step": 10684 }, { "epoch": 2.6346153846153846, "grad_norm": 0.36328125, "learning_rate": 0.00010806381325818036, "loss": 0.0463, "step": 10686 }, { "epoch": 2.6351084812623276, "grad_norm": 0.380859375, "learning_rate": 0.00010802950024677138, "loss": 0.043, "step": 10688 }, { "epoch": 2.63560157790927, "grad_norm": 0.37109375, "learning_rate": 0.0001079951862838218, "loss": 0.0443, "step": 10690 }, { "epoch": 2.636094674556213, "grad_norm": 0.427734375, "learning_rate": 0.00010796087137339796, "loss": 0.046, "step": 10692 }, { "epoch": 2.636587771203156, "grad_norm": 0.31640625, "learning_rate": 0.00010792655551956645, "loss": 0.0409, "step": 10694 }, { "epoch": 2.6370808678500985, "grad_norm": 0.306640625, "learning_rate": 0.00010789223872639384, "loss": 0.0451, "step": 10696 }, { "epoch": 2.6375739644970415, "grad_norm": 0.40234375, "learning_rate": 0.00010785792099794689, "loss": 0.0481, "step": 10698 }, { "epoch": 2.6380670611439845, "grad_norm": 0.359375, "learning_rate": 0.0001078236023382924, "loss": 0.0449, "step": 10700 }, { "epoch": 2.638560157790927, "grad_norm": 0.330078125, "learning_rate": 0.0001077892827514974, "loss": 0.0455, "step": 10702 }, { "epoch": 2.63905325443787, "grad_norm": 0.439453125, "learning_rate": 0.00010775496224162886, "loss": 0.0469, "step": 10704 }, { "epoch": 2.6395463510848125, "grad_norm": 0.333984375, "learning_rate": 0.00010772064081275402, "loss": 0.0437, "step": 10706 }, { "epoch": 2.6400394477317555, "grad_norm": 0.4140625, "learning_rate": 0.00010768631846894013, "loss": 0.0403, "step": 10708 }, { "epoch": 2.640532544378698, "grad_norm": 0.33203125, "learning_rate": 0.0001076519952142546, "loss": 0.0448, "step": 10710 }, { "epoch": 2.641025641025641, "grad_norm": 0.283203125, "learning_rate": 0.0001076176710527649, "loss": 0.0393, "step": 10712 }, { "epoch": 2.641518737672584, "grad_norm": 0.333984375, "learning_rate": 0.00010758334598853863, "loss": 0.0458, "step": 10714 }, { "epoch": 2.6420118343195265, "grad_norm": 0.314453125, "learning_rate": 0.00010754902002564355, "loss": 0.0454, "step": 10716 }, { "epoch": 2.6425049309664694, "grad_norm": 0.275390625, "learning_rate": 0.00010751469316814743, "loss": 0.0429, "step": 10718 }, { "epoch": 2.6429980276134124, "grad_norm": 0.322265625, "learning_rate": 0.00010748036542011823, "loss": 0.0445, "step": 10720 }, { "epoch": 2.643491124260355, "grad_norm": 0.3359375, "learning_rate": 0.00010744603678562395, "loss": 0.0412, "step": 10722 }, { "epoch": 2.643984220907298, "grad_norm": 0.341796875, "learning_rate": 0.00010741170726873274, "loss": 0.0439, "step": 10724 }, { "epoch": 2.644477317554241, "grad_norm": 0.33984375, "learning_rate": 0.00010737737687351284, "loss": 0.044, "step": 10726 }, { "epoch": 2.6449704142011834, "grad_norm": 0.431640625, "learning_rate": 0.00010734304560403259, "loss": 0.043, "step": 10728 }, { "epoch": 2.6454635108481264, "grad_norm": 0.28515625, "learning_rate": 0.00010730871346436045, "loss": 0.0391, "step": 10730 }, { "epoch": 2.645956607495069, "grad_norm": 0.421875, "learning_rate": 0.00010727438045856492, "loss": 0.0405, "step": 10732 }, { "epoch": 2.646449704142012, "grad_norm": 0.26953125, "learning_rate": 0.00010724004659071472, "loss": 0.0426, "step": 10734 }, { "epoch": 2.6469428007889544, "grad_norm": 0.318359375, "learning_rate": 0.00010720571186487858, "loss": 0.0471, "step": 10736 }, { "epoch": 2.6474358974358974, "grad_norm": 0.4453125, "learning_rate": 0.00010717137628512533, "loss": 0.0429, "step": 10738 }, { "epoch": 2.6479289940828403, "grad_norm": 0.3046875, "learning_rate": 0.00010713703985552395, "loss": 0.0437, "step": 10740 }, { "epoch": 2.648422090729783, "grad_norm": 0.376953125, "learning_rate": 0.00010710270258014354, "loss": 0.044, "step": 10742 }, { "epoch": 2.648915187376726, "grad_norm": 0.353515625, "learning_rate": 0.0001070683644630532, "loss": 0.0381, "step": 10744 }, { "epoch": 2.649408284023669, "grad_norm": 0.359375, "learning_rate": 0.0001070340255083222, "loss": 0.0449, "step": 10746 }, { "epoch": 2.6499013806706113, "grad_norm": 0.466796875, "learning_rate": 0.00010699968572001988, "loss": 0.0469, "step": 10748 }, { "epoch": 2.6503944773175543, "grad_norm": 0.267578125, "learning_rate": 0.00010696534510221574, "loss": 0.0414, "step": 10750 }, { "epoch": 2.6508875739644973, "grad_norm": 0.341796875, "learning_rate": 0.00010693100365897933, "loss": 0.0423, "step": 10752 }, { "epoch": 2.65138067061144, "grad_norm": 0.30859375, "learning_rate": 0.00010689666139438026, "loss": 0.0472, "step": 10754 }, { "epoch": 2.6518737672583828, "grad_norm": 0.328125, "learning_rate": 0.00010686231831248834, "loss": 0.0443, "step": 10756 }, { "epoch": 2.6523668639053253, "grad_norm": 0.3359375, "learning_rate": 0.00010682797441737335, "loss": 0.0463, "step": 10758 }, { "epoch": 2.6528599605522682, "grad_norm": 0.30078125, "learning_rate": 0.00010679362971310531, "loss": 0.0512, "step": 10760 }, { "epoch": 2.6533530571992108, "grad_norm": 0.353515625, "learning_rate": 0.00010675928420375421, "loss": 0.0452, "step": 10762 }, { "epoch": 2.6538461538461537, "grad_norm": 0.458984375, "learning_rate": 0.00010672493789339015, "loss": 0.0452, "step": 10764 }, { "epoch": 2.6543392504930967, "grad_norm": 0.33203125, "learning_rate": 0.00010669059078608344, "loss": 0.043, "step": 10766 }, { "epoch": 2.6548323471400392, "grad_norm": 0.30859375, "learning_rate": 0.00010665624288590437, "loss": 0.0463, "step": 10768 }, { "epoch": 2.655325443786982, "grad_norm": 0.318359375, "learning_rate": 0.00010662189419692334, "loss": 0.0455, "step": 10770 }, { "epoch": 2.655818540433925, "grad_norm": 0.322265625, "learning_rate": 0.00010658754472321089, "loss": 0.0447, "step": 10772 }, { "epoch": 2.6563116370808677, "grad_norm": 0.287109375, "learning_rate": 0.00010655319446883758, "loss": 0.0435, "step": 10774 }, { "epoch": 2.6568047337278107, "grad_norm": 0.3984375, "learning_rate": 0.00010651884343787418, "loss": 0.0479, "step": 10776 }, { "epoch": 2.6572978303747536, "grad_norm": 0.310546875, "learning_rate": 0.0001064844916343914, "loss": 0.0438, "step": 10778 }, { "epoch": 2.657790927021696, "grad_norm": 0.333984375, "learning_rate": 0.00010645013906246017, "loss": 0.0427, "step": 10780 }, { "epoch": 2.658284023668639, "grad_norm": 0.3046875, "learning_rate": 0.00010641578572615145, "loss": 0.0419, "step": 10782 }, { "epoch": 2.658777120315582, "grad_norm": 0.32421875, "learning_rate": 0.00010638143162953631, "loss": 0.0404, "step": 10784 }, { "epoch": 2.6592702169625246, "grad_norm": 0.333984375, "learning_rate": 0.00010634707677668587, "loss": 0.0437, "step": 10786 }, { "epoch": 2.6597633136094676, "grad_norm": 0.32421875, "learning_rate": 0.00010631272117167139, "loss": 0.0438, "step": 10788 }, { "epoch": 2.66025641025641, "grad_norm": 0.318359375, "learning_rate": 0.00010627836481856421, "loss": 0.0406, "step": 10790 }, { "epoch": 2.660749506903353, "grad_norm": 0.33203125, "learning_rate": 0.00010624400772143579, "loss": 0.0444, "step": 10792 }, { "epoch": 2.6612426035502956, "grad_norm": 0.341796875, "learning_rate": 0.00010620964988435755, "loss": 0.0432, "step": 10794 }, { "epoch": 2.6617357001972386, "grad_norm": 0.375, "learning_rate": 0.00010617529131140114, "loss": 0.0399, "step": 10796 }, { "epoch": 2.6622287968441816, "grad_norm": 0.2890625, "learning_rate": 0.00010614093200663824, "loss": 0.0436, "step": 10798 }, { "epoch": 2.662721893491124, "grad_norm": 0.330078125, "learning_rate": 0.00010610657197414058, "loss": 0.0431, "step": 10800 }, { "epoch": 2.663214990138067, "grad_norm": 0.3515625, "learning_rate": 0.00010607221121798006, "loss": 0.0452, "step": 10802 }, { "epoch": 2.66370808678501, "grad_norm": 0.4921875, "learning_rate": 0.00010603784974222861, "loss": 0.0486, "step": 10804 }, { "epoch": 2.6642011834319526, "grad_norm": 0.375, "learning_rate": 0.00010600348755095826, "loss": 0.047, "step": 10806 }, { "epoch": 2.6646942800788955, "grad_norm": 0.42578125, "learning_rate": 0.0001059691246482411, "loss": 0.0462, "step": 10808 }, { "epoch": 2.6651873767258385, "grad_norm": 0.34375, "learning_rate": 0.00010593476103814936, "loss": 0.0476, "step": 10810 }, { "epoch": 2.665680473372781, "grad_norm": 0.306640625, "learning_rate": 0.00010590039672475534, "loss": 0.0469, "step": 10812 }, { "epoch": 2.666173570019724, "grad_norm": 0.46875, "learning_rate": 0.0001058660317121313, "loss": 0.0552, "step": 10814 }, { "epoch": 2.6666666666666665, "grad_norm": 0.341796875, "learning_rate": 0.00010583166600434977, "loss": 0.0422, "step": 10816 }, { "epoch": 2.6671597633136095, "grad_norm": 0.30859375, "learning_rate": 0.00010579729960548326, "loss": 0.0463, "step": 10818 }, { "epoch": 2.667652859960552, "grad_norm": 0.38671875, "learning_rate": 0.00010576293251960436, "loss": 0.0412, "step": 10820 }, { "epoch": 2.668145956607495, "grad_norm": 0.4765625, "learning_rate": 0.00010572856475078578, "loss": 0.0476, "step": 10822 }, { "epoch": 2.668639053254438, "grad_norm": 0.296875, "learning_rate": 0.00010569419630310036, "loss": 0.0403, "step": 10824 }, { "epoch": 2.6691321499013805, "grad_norm": 0.3359375, "learning_rate": 0.00010565982718062085, "loss": 0.0426, "step": 10826 }, { "epoch": 2.6696252465483234, "grad_norm": 0.32421875, "learning_rate": 0.00010562545738742021, "loss": 0.0422, "step": 10828 }, { "epoch": 2.6701183431952664, "grad_norm": 0.294921875, "learning_rate": 0.00010559108692757142, "loss": 0.0432, "step": 10830 }, { "epoch": 2.670611439842209, "grad_norm": 0.291015625, "learning_rate": 0.00010555671580514765, "loss": 0.0402, "step": 10832 }, { "epoch": 2.671104536489152, "grad_norm": 0.384765625, "learning_rate": 0.00010552234402422203, "loss": 0.0494, "step": 10834 }, { "epoch": 2.671597633136095, "grad_norm": 0.3125, "learning_rate": 0.0001054879715888678, "loss": 0.0479, "step": 10836 }, { "epoch": 2.6720907297830374, "grad_norm": 0.2890625, "learning_rate": 0.00010545359850315835, "loss": 0.0425, "step": 10838 }, { "epoch": 2.6725838264299804, "grad_norm": 0.341796875, "learning_rate": 0.00010541922477116697, "loss": 0.0466, "step": 10840 }, { "epoch": 2.6730769230769234, "grad_norm": 0.37109375, "learning_rate": 0.00010538485039696722, "loss": 0.0446, "step": 10842 }, { "epoch": 2.673570019723866, "grad_norm": 0.31640625, "learning_rate": 0.00010535047538463263, "loss": 0.0416, "step": 10844 }, { "epoch": 2.6740631163708084, "grad_norm": 0.298828125, "learning_rate": 0.00010531609973823681, "loss": 0.0471, "step": 10846 }, { "epoch": 2.6745562130177514, "grad_norm": 0.3515625, "learning_rate": 0.00010528172346185352, "loss": 0.0471, "step": 10848 }, { "epoch": 2.6750493096646943, "grad_norm": 0.390625, "learning_rate": 0.0001052473465595565, "loss": 0.0464, "step": 10850 }, { "epoch": 2.675542406311637, "grad_norm": 0.326171875, "learning_rate": 0.00010521296903541962, "loss": 0.0416, "step": 10852 }, { "epoch": 2.67603550295858, "grad_norm": 0.34765625, "learning_rate": 0.0001051785908935168, "loss": 0.0442, "step": 10854 }, { "epoch": 2.676528599605523, "grad_norm": 0.31640625, "learning_rate": 0.00010514421213792205, "loss": 0.0391, "step": 10856 }, { "epoch": 2.6770216962524653, "grad_norm": 0.3984375, "learning_rate": 0.00010510983277270945, "loss": 0.0443, "step": 10858 }, { "epoch": 2.6775147928994083, "grad_norm": 0.357421875, "learning_rate": 0.0001050754528019531, "loss": 0.0446, "step": 10860 }, { "epoch": 2.6780078895463513, "grad_norm": 0.365234375, "learning_rate": 0.00010504107222972727, "loss": 0.0452, "step": 10862 }, { "epoch": 2.678500986193294, "grad_norm": 0.31640625, "learning_rate": 0.00010500669106010625, "loss": 0.0456, "step": 10864 }, { "epoch": 2.6789940828402368, "grad_norm": 0.37890625, "learning_rate": 0.00010497230929716439, "loss": 0.0453, "step": 10866 }, { "epoch": 2.6794871794871797, "grad_norm": 0.41015625, "learning_rate": 0.0001049379269449761, "loss": 0.0421, "step": 10868 }, { "epoch": 2.6799802761341223, "grad_norm": 0.34765625, "learning_rate": 0.0001049035440076159, "loss": 0.0465, "step": 10870 }, { "epoch": 2.6804733727810652, "grad_norm": 0.30078125, "learning_rate": 0.00010486916048915836, "loss": 0.0515, "step": 10872 }, { "epoch": 2.6809664694280078, "grad_norm": 0.2890625, "learning_rate": 0.00010483477639367811, "loss": 0.0409, "step": 10874 }, { "epoch": 2.6814595660749507, "grad_norm": 0.44921875, "learning_rate": 0.00010480039172524987, "loss": 0.0468, "step": 10876 }, { "epoch": 2.6819526627218933, "grad_norm": 0.349609375, "learning_rate": 0.00010476600648794841, "loss": 0.0425, "step": 10878 }, { "epoch": 2.6824457593688362, "grad_norm": 0.412109375, "learning_rate": 0.00010473162068584857, "loss": 0.0447, "step": 10880 }, { "epoch": 2.682938856015779, "grad_norm": 0.322265625, "learning_rate": 0.00010469723432302527, "loss": 0.043, "step": 10882 }, { "epoch": 2.6834319526627217, "grad_norm": 0.375, "learning_rate": 0.00010466284740355345, "loss": 0.0426, "step": 10884 }, { "epoch": 2.6839250493096647, "grad_norm": 0.306640625, "learning_rate": 0.0001046284599315082, "loss": 0.0391, "step": 10886 }, { "epoch": 2.6844181459566077, "grad_norm": 0.435546875, "learning_rate": 0.00010459407191096459, "loss": 0.0449, "step": 10888 }, { "epoch": 2.68491124260355, "grad_norm": 0.353515625, "learning_rate": 0.00010455968334599782, "loss": 0.0401, "step": 10890 }, { "epoch": 2.685404339250493, "grad_norm": 0.3515625, "learning_rate": 0.00010452529424068314, "loss": 0.0429, "step": 10892 }, { "epoch": 2.685897435897436, "grad_norm": 0.326171875, "learning_rate": 0.00010449090459909578, "loss": 0.0432, "step": 10894 }, { "epoch": 2.6863905325443787, "grad_norm": 0.361328125, "learning_rate": 0.00010445651442531119, "loss": 0.0377, "step": 10896 }, { "epoch": 2.6868836291913216, "grad_norm": 0.341796875, "learning_rate": 0.00010442212372340474, "loss": 0.0421, "step": 10898 }, { "epoch": 2.687376725838264, "grad_norm": 0.365234375, "learning_rate": 0.00010438773249745192, "loss": 0.0433, "step": 10900 }, { "epoch": 2.687869822485207, "grad_norm": 0.330078125, "learning_rate": 0.00010435334075152832, "loss": 0.0424, "step": 10902 }, { "epoch": 2.6883629191321496, "grad_norm": 0.345703125, "learning_rate": 0.00010431894848970953, "loss": 0.0417, "step": 10904 }, { "epoch": 2.6888560157790926, "grad_norm": 0.353515625, "learning_rate": 0.00010428455571607123, "loss": 0.0432, "step": 10906 }, { "epoch": 2.6893491124260356, "grad_norm": 0.29296875, "learning_rate": 0.0001042501624346892, "loss": 0.0395, "step": 10908 }, { "epoch": 2.689842209072978, "grad_norm": 0.380859375, "learning_rate": 0.00010421576864963915, "loss": 0.0455, "step": 10910 }, { "epoch": 2.690335305719921, "grad_norm": 0.451171875, "learning_rate": 0.00010418137436499698, "loss": 0.0364, "step": 10912 }, { "epoch": 2.690828402366864, "grad_norm": 0.3359375, "learning_rate": 0.0001041469795848386, "loss": 0.0398, "step": 10914 }, { "epoch": 2.6913214990138066, "grad_norm": 0.271484375, "learning_rate": 0.00010411258431323999, "loss": 0.0403, "step": 10916 }, { "epoch": 2.6918145956607495, "grad_norm": 0.3828125, "learning_rate": 0.00010407818855427716, "loss": 0.0414, "step": 10918 }, { "epoch": 2.6923076923076925, "grad_norm": 0.328125, "learning_rate": 0.00010404379231202623, "loss": 0.0415, "step": 10920 }, { "epoch": 2.692800788954635, "grad_norm": 0.326171875, "learning_rate": 0.00010400939559056336, "loss": 0.0389, "step": 10922 }, { "epoch": 2.693293885601578, "grad_norm": 0.455078125, "learning_rate": 0.0001039749983939647, "loss": 0.048, "step": 10924 }, { "epoch": 2.693786982248521, "grad_norm": 0.357421875, "learning_rate": 0.00010394060072630653, "loss": 0.0437, "step": 10926 }, { "epoch": 2.6942800788954635, "grad_norm": 0.30078125, "learning_rate": 0.0001039062025916652, "loss": 0.0427, "step": 10928 }, { "epoch": 2.6947731755424065, "grad_norm": 0.33984375, "learning_rate": 0.000103871803994117, "loss": 0.0474, "step": 10930 }, { "epoch": 2.695266272189349, "grad_norm": 0.28515625, "learning_rate": 0.00010383740493773845, "loss": 0.0442, "step": 10932 }, { "epoch": 2.695759368836292, "grad_norm": 0.375, "learning_rate": 0.00010380300542660596, "loss": 0.0448, "step": 10934 }, { "epoch": 2.6962524654832345, "grad_norm": 0.357421875, "learning_rate": 0.00010376860546479614, "loss": 0.0438, "step": 10936 }, { "epoch": 2.6967455621301775, "grad_norm": 0.330078125, "learning_rate": 0.00010373420505638552, "loss": 0.0462, "step": 10938 }, { "epoch": 2.6972386587771204, "grad_norm": 0.40625, "learning_rate": 0.00010369980420545074, "loss": 0.0428, "step": 10940 }, { "epoch": 2.697731755424063, "grad_norm": 0.357421875, "learning_rate": 0.0001036654029160685, "loss": 0.0421, "step": 10942 }, { "epoch": 2.698224852071006, "grad_norm": 0.306640625, "learning_rate": 0.00010363100119231554, "loss": 0.0441, "step": 10944 }, { "epoch": 2.698717948717949, "grad_norm": 0.41796875, "learning_rate": 0.00010359659903826869, "loss": 0.043, "step": 10946 }, { "epoch": 2.6992110453648914, "grad_norm": 0.318359375, "learning_rate": 0.00010356219645800475, "loss": 0.0422, "step": 10948 }, { "epoch": 2.6997041420118344, "grad_norm": 0.435546875, "learning_rate": 0.00010352779345560069, "loss": 0.0438, "step": 10950 }, { "epoch": 2.7001972386587774, "grad_norm": 0.32421875, "learning_rate": 0.00010349339003513336, "loss": 0.0441, "step": 10952 }, { "epoch": 2.70069033530572, "grad_norm": 0.314453125, "learning_rate": 0.00010345898620067979, "loss": 0.0483, "step": 10954 }, { "epoch": 2.701183431952663, "grad_norm": 0.330078125, "learning_rate": 0.00010342458195631708, "loss": 0.046, "step": 10956 }, { "epoch": 2.7016765285996054, "grad_norm": 0.3671875, "learning_rate": 0.00010339017730612231, "loss": 0.0423, "step": 10958 }, { "epoch": 2.7021696252465484, "grad_norm": 0.349609375, "learning_rate": 0.00010335577225417255, "loss": 0.0418, "step": 10960 }, { "epoch": 2.702662721893491, "grad_norm": 0.3515625, "learning_rate": 0.00010332136680454505, "loss": 0.0436, "step": 10962 }, { "epoch": 2.703155818540434, "grad_norm": 0.322265625, "learning_rate": 0.00010328696096131707, "loss": 0.042, "step": 10964 }, { "epoch": 2.703648915187377, "grad_norm": 0.28515625, "learning_rate": 0.00010325255472856585, "loss": 0.0418, "step": 10966 }, { "epoch": 2.7041420118343193, "grad_norm": 0.357421875, "learning_rate": 0.00010321814811036872, "loss": 0.0469, "step": 10968 }, { "epoch": 2.7046351084812623, "grad_norm": 0.32421875, "learning_rate": 0.00010318374111080306, "loss": 0.0435, "step": 10970 }, { "epoch": 2.7051282051282053, "grad_norm": 0.318359375, "learning_rate": 0.00010314933373394633, "loss": 0.0399, "step": 10972 }, { "epoch": 2.705621301775148, "grad_norm": 0.296875, "learning_rate": 0.00010311492598387598, "loss": 0.0379, "step": 10974 }, { "epoch": 2.706114398422091, "grad_norm": 0.41796875, "learning_rate": 0.0001030805178646695, "loss": 0.039, "step": 10976 }, { "epoch": 2.7066074950690338, "grad_norm": 0.392578125, "learning_rate": 0.00010304610938040446, "loss": 0.0429, "step": 10978 }, { "epoch": 2.7071005917159763, "grad_norm": 0.28125, "learning_rate": 0.00010301170053515843, "loss": 0.0414, "step": 10980 }, { "epoch": 2.7075936883629192, "grad_norm": 0.349609375, "learning_rate": 0.00010297729133300907, "loss": 0.0389, "step": 10982 }, { "epoch": 2.7080867850098618, "grad_norm": 0.31640625, "learning_rate": 0.00010294288177803406, "loss": 0.0391, "step": 10984 }, { "epoch": 2.7085798816568047, "grad_norm": 0.42578125, "learning_rate": 0.00010290847187431113, "loss": 0.0451, "step": 10986 }, { "epoch": 2.7090729783037473, "grad_norm": 0.37890625, "learning_rate": 0.00010287406162591806, "loss": 0.0491, "step": 10988 }, { "epoch": 2.7095660749506902, "grad_norm": 0.33984375, "learning_rate": 0.00010283965103693265, "loss": 0.0439, "step": 10990 }, { "epoch": 2.710059171597633, "grad_norm": 0.345703125, "learning_rate": 0.00010280524011143273, "loss": 0.0437, "step": 10992 }, { "epoch": 2.7105522682445757, "grad_norm": 0.306640625, "learning_rate": 0.00010277082885349621, "loss": 0.0436, "step": 10994 }, { "epoch": 2.7110453648915187, "grad_norm": 0.310546875, "learning_rate": 0.00010273641726720096, "loss": 0.0475, "step": 10996 }, { "epoch": 2.7115384615384617, "grad_norm": 0.33203125, "learning_rate": 0.000102702005356625, "loss": 0.0412, "step": 10998 }, { "epoch": 2.712031558185404, "grad_norm": 0.40625, "learning_rate": 0.0001026675931258463, "loss": 0.0416, "step": 11000 }, { "epoch": 2.712524654832347, "grad_norm": 0.2890625, "learning_rate": 0.00010263318057894296, "loss": 0.0394, "step": 11002 }, { "epoch": 2.71301775147929, "grad_norm": 0.333984375, "learning_rate": 0.00010259876771999302, "loss": 0.045, "step": 11004 }, { "epoch": 2.7135108481262327, "grad_norm": 0.396484375, "learning_rate": 0.00010256435455307457, "loss": 0.0444, "step": 11006 }, { "epoch": 2.7140039447731756, "grad_norm": 0.296875, "learning_rate": 0.0001025299410822658, "loss": 0.0463, "step": 11008 }, { "epoch": 2.7144970414201186, "grad_norm": 0.40234375, "learning_rate": 0.00010249552731164488, "loss": 0.0455, "step": 11010 }, { "epoch": 2.714990138067061, "grad_norm": 0.46484375, "learning_rate": 0.00010246111324529004, "loss": 0.0454, "step": 11012 }, { "epoch": 2.715483234714004, "grad_norm": 0.3125, "learning_rate": 0.00010242669888727954, "loss": 0.0442, "step": 11014 }, { "epoch": 2.7159763313609466, "grad_norm": 0.337890625, "learning_rate": 0.00010239228424169167, "loss": 0.0438, "step": 11016 }, { "epoch": 2.7164694280078896, "grad_norm": 0.328125, "learning_rate": 0.00010235786931260478, "loss": 0.0424, "step": 11018 }, { "epoch": 2.716962524654832, "grad_norm": 0.283203125, "learning_rate": 0.00010232345410409718, "loss": 0.0352, "step": 11020 }, { "epoch": 2.717455621301775, "grad_norm": 0.294921875, "learning_rate": 0.00010228903862024732, "loss": 0.0434, "step": 11022 }, { "epoch": 2.717948717948718, "grad_norm": 0.3046875, "learning_rate": 0.0001022546228651336, "loss": 0.0453, "step": 11024 }, { "epoch": 2.7184418145956606, "grad_norm": 0.3515625, "learning_rate": 0.0001022202068428345, "loss": 0.0447, "step": 11026 }, { "epoch": 2.7189349112426036, "grad_norm": 0.357421875, "learning_rate": 0.00010218579055742845, "loss": 0.0446, "step": 11028 }, { "epoch": 2.7194280078895465, "grad_norm": 0.3046875, "learning_rate": 0.00010215137401299404, "loss": 0.0422, "step": 11030 }, { "epoch": 2.719921104536489, "grad_norm": 0.333984375, "learning_rate": 0.0001021169572136098, "loss": 0.0439, "step": 11032 }, { "epoch": 2.720414201183432, "grad_norm": 0.328125, "learning_rate": 0.0001020825401633543, "loss": 0.0489, "step": 11034 }, { "epoch": 2.720907297830375, "grad_norm": 0.3359375, "learning_rate": 0.00010204812286630616, "loss": 0.0435, "step": 11036 }, { "epoch": 2.7214003944773175, "grad_norm": 0.322265625, "learning_rate": 0.00010201370532654404, "loss": 0.0431, "step": 11038 }, { "epoch": 2.7218934911242605, "grad_norm": 0.322265625, "learning_rate": 0.00010197928754814661, "loss": 0.0436, "step": 11040 }, { "epoch": 2.722386587771203, "grad_norm": 0.302734375, "learning_rate": 0.0001019448695351925, "loss": 0.0429, "step": 11042 }, { "epoch": 2.722879684418146, "grad_norm": 0.310546875, "learning_rate": 0.00010191045129176053, "loss": 0.0402, "step": 11044 }, { "epoch": 2.7233727810650885, "grad_norm": 0.3203125, "learning_rate": 0.00010187603282192943, "loss": 0.0442, "step": 11046 }, { "epoch": 2.7238658777120315, "grad_norm": 0.36328125, "learning_rate": 0.00010184161412977795, "loss": 0.0411, "step": 11048 }, { "epoch": 2.7243589743589745, "grad_norm": 0.31640625, "learning_rate": 0.00010180719521938491, "loss": 0.0408, "step": 11050 }, { "epoch": 2.724852071005917, "grad_norm": 0.33984375, "learning_rate": 0.00010177277609482914, "loss": 0.0443, "step": 11052 }, { "epoch": 2.72534516765286, "grad_norm": 0.32421875, "learning_rate": 0.0001017383567601895, "loss": 0.0448, "step": 11054 }, { "epoch": 2.725838264299803, "grad_norm": 0.37890625, "learning_rate": 0.0001017039372195449, "loss": 0.0466, "step": 11056 }, { "epoch": 2.7263313609467454, "grad_norm": 0.28515625, "learning_rate": 0.00010166951747697421, "loss": 0.0453, "step": 11058 }, { "epoch": 2.7268244575936884, "grad_norm": 0.337890625, "learning_rate": 0.0001016350975365564, "loss": 0.0434, "step": 11060 }, { "epoch": 2.7273175542406314, "grad_norm": 0.30859375, "learning_rate": 0.00010160067740237038, "loss": 0.0391, "step": 11062 }, { "epoch": 2.727810650887574, "grad_norm": 0.32421875, "learning_rate": 0.00010156625707849517, "loss": 0.0428, "step": 11064 }, { "epoch": 2.728303747534517, "grad_norm": 0.375, "learning_rate": 0.00010153183656900973, "loss": 0.0436, "step": 11066 }, { "epoch": 2.7287968441814594, "grad_norm": 0.400390625, "learning_rate": 0.0001014974158779931, "loss": 0.0443, "step": 11068 }, { "epoch": 2.7292899408284024, "grad_norm": 0.330078125, "learning_rate": 0.00010146299500952434, "loss": 0.0416, "step": 11070 }, { "epoch": 2.729783037475345, "grad_norm": 0.3515625, "learning_rate": 0.00010142857396768251, "loss": 0.0414, "step": 11072 }, { "epoch": 2.730276134122288, "grad_norm": 0.30859375, "learning_rate": 0.00010139415275654671, "loss": 0.044, "step": 11074 }, { "epoch": 2.730769230769231, "grad_norm": 0.296875, "learning_rate": 0.00010135973138019599, "loss": 0.0429, "step": 11076 }, { "epoch": 2.7312623274161734, "grad_norm": 0.3359375, "learning_rate": 0.0001013253098427095, "loss": 0.0416, "step": 11078 }, { "epoch": 2.7317554240631163, "grad_norm": 0.3984375, "learning_rate": 0.00010129088814816641, "loss": 0.0409, "step": 11080 }, { "epoch": 2.7322485207100593, "grad_norm": 0.31640625, "learning_rate": 0.00010125646630064586, "loss": 0.0421, "step": 11082 }, { "epoch": 2.732741617357002, "grad_norm": 0.322265625, "learning_rate": 0.00010122204430422704, "loss": 0.0443, "step": 11084 }, { "epoch": 2.733234714003945, "grad_norm": 0.2734375, "learning_rate": 0.00010118762216298916, "loss": 0.0403, "step": 11086 }, { "epoch": 2.7337278106508878, "grad_norm": 0.328125, "learning_rate": 0.00010115319988101141, "loss": 0.0439, "step": 11088 }, { "epoch": 2.7342209072978303, "grad_norm": 0.330078125, "learning_rate": 0.00010111877746237306, "loss": 0.0424, "step": 11090 }, { "epoch": 2.7347140039447733, "grad_norm": 0.302734375, "learning_rate": 0.0001010843549111533, "loss": 0.0428, "step": 11092 }, { "epoch": 2.7352071005917162, "grad_norm": 0.404296875, "learning_rate": 0.00010104993223143144, "loss": 0.0471, "step": 11094 }, { "epoch": 2.7357001972386588, "grad_norm": 0.326171875, "learning_rate": 0.00010101550942728675, "loss": 0.0425, "step": 11096 }, { "epoch": 2.7361932938856017, "grad_norm": 0.3125, "learning_rate": 0.00010098108650279851, "loss": 0.0414, "step": 11098 }, { "epoch": 2.7366863905325443, "grad_norm": 0.392578125, "learning_rate": 0.00010094666346204609, "loss": 0.0463, "step": 11100 }, { "epoch": 2.7371794871794872, "grad_norm": 0.279296875, "learning_rate": 0.00010091224030910872, "loss": 0.0383, "step": 11102 }, { "epoch": 2.7376725838264298, "grad_norm": 0.255859375, "learning_rate": 0.00010087781704806578, "loss": 0.0397, "step": 11104 }, { "epoch": 2.7381656804733727, "grad_norm": 0.3203125, "learning_rate": 0.00010084339368299667, "loss": 0.0429, "step": 11106 }, { "epoch": 2.7386587771203157, "grad_norm": 0.236328125, "learning_rate": 0.00010080897021798064, "loss": 0.0371, "step": 11108 }, { "epoch": 2.739151873767258, "grad_norm": 0.314453125, "learning_rate": 0.00010077454665709713, "loss": 0.0426, "step": 11110 }, { "epoch": 2.739644970414201, "grad_norm": 0.29296875, "learning_rate": 0.00010074012300442552, "loss": 0.0452, "step": 11112 }, { "epoch": 2.740138067061144, "grad_norm": 0.34375, "learning_rate": 0.00010070569926404523, "loss": 0.0433, "step": 11114 }, { "epoch": 2.7406311637080867, "grad_norm": 0.33984375, "learning_rate": 0.00010067127544003563, "loss": 0.0414, "step": 11116 }, { "epoch": 2.7411242603550297, "grad_norm": 0.375, "learning_rate": 0.0001006368515364761, "loss": 0.0456, "step": 11118 }, { "epoch": 2.7416173570019726, "grad_norm": 0.29296875, "learning_rate": 0.00010060242755744614, "loss": 0.0452, "step": 11120 }, { "epoch": 2.742110453648915, "grad_norm": 0.318359375, "learning_rate": 0.00010056800350702517, "loss": 0.0472, "step": 11122 }, { "epoch": 2.742603550295858, "grad_norm": 0.326171875, "learning_rate": 0.00010053357938929257, "loss": 0.0386, "step": 11124 }, { "epoch": 2.7430966469428006, "grad_norm": 0.294921875, "learning_rate": 0.00010049915520832787, "loss": 0.0382, "step": 11126 }, { "epoch": 2.7435897435897436, "grad_norm": 0.322265625, "learning_rate": 0.00010046473096821047, "loss": 0.0437, "step": 11128 }, { "epoch": 2.744082840236686, "grad_norm": 0.337890625, "learning_rate": 0.00010043030667301986, "loss": 0.0435, "step": 11130 }, { "epoch": 2.744575936883629, "grad_norm": 0.291015625, "learning_rate": 0.0001003958823268355, "loss": 0.0447, "step": 11132 }, { "epoch": 2.745069033530572, "grad_norm": 0.322265625, "learning_rate": 0.00010036145793373688, "loss": 0.0383, "step": 11134 }, { "epoch": 2.7455621301775146, "grad_norm": 0.3125, "learning_rate": 0.00010032703349780348, "loss": 0.0397, "step": 11136 }, { "epoch": 2.7460552268244576, "grad_norm": 0.373046875, "learning_rate": 0.00010029260902311477, "loss": 0.0464, "step": 11138 }, { "epoch": 2.7465483234714005, "grad_norm": 0.251953125, "learning_rate": 0.00010025818451375028, "loss": 0.0367, "step": 11140 }, { "epoch": 2.747041420118343, "grad_norm": 0.357421875, "learning_rate": 0.00010022375997378947, "loss": 0.041, "step": 11142 }, { "epoch": 2.747534516765286, "grad_norm": 0.388671875, "learning_rate": 0.00010018933540731188, "loss": 0.042, "step": 11144 }, { "epoch": 2.748027613412229, "grad_norm": 0.29296875, "learning_rate": 0.00010015491081839696, "loss": 0.0444, "step": 11146 }, { "epoch": 2.7485207100591715, "grad_norm": 0.375, "learning_rate": 0.00010012048621112424, "loss": 0.045, "step": 11148 }, { "epoch": 2.7490138067061145, "grad_norm": 0.31640625, "learning_rate": 0.00010008606158957323, "loss": 0.0404, "step": 11150 }, { "epoch": 2.7495069033530575, "grad_norm": 0.30859375, "learning_rate": 0.00010005163695782345, "loss": 0.0462, "step": 11152 }, { "epoch": 2.75, "grad_norm": 0.328125, "learning_rate": 0.00010001721231995443, "loss": 0.0438, "step": 11154 }, { "epoch": 2.7504930966469425, "grad_norm": 0.337890625, "learning_rate": 9.998278768004561e-05, "loss": 0.043, "step": 11156 }, { "epoch": 2.7509861932938855, "grad_norm": 0.287109375, "learning_rate": 9.994836304217657e-05, "loss": 0.0448, "step": 11158 }, { "epoch": 2.7514792899408285, "grad_norm": 0.359375, "learning_rate": 9.991393841042676e-05, "loss": 0.0417, "step": 11160 }, { "epoch": 2.751972386587771, "grad_norm": 0.314453125, "learning_rate": 9.987951378887578e-05, "loss": 0.0465, "step": 11162 }, { "epoch": 2.752465483234714, "grad_norm": 0.318359375, "learning_rate": 9.984508918160308e-05, "loss": 0.0425, "step": 11164 }, { "epoch": 2.752958579881657, "grad_norm": 0.302734375, "learning_rate": 9.981066459268814e-05, "loss": 0.0451, "step": 11166 }, { "epoch": 2.7534516765285995, "grad_norm": 0.341796875, "learning_rate": 9.977624002621054e-05, "loss": 0.04, "step": 11168 }, { "epoch": 2.7539447731755424, "grad_norm": 0.330078125, "learning_rate": 9.974181548624973e-05, "loss": 0.0425, "step": 11170 }, { "epoch": 2.7544378698224854, "grad_norm": 0.341796875, "learning_rate": 9.970739097688525e-05, "loss": 0.0415, "step": 11172 }, { "epoch": 2.754930966469428, "grad_norm": 0.400390625, "learning_rate": 9.967296650219652e-05, "loss": 0.042, "step": 11174 }, { "epoch": 2.755424063116371, "grad_norm": 0.423828125, "learning_rate": 9.963854206626314e-05, "loss": 0.0447, "step": 11176 }, { "epoch": 2.755917159763314, "grad_norm": 0.3359375, "learning_rate": 9.960411767316448e-05, "loss": 0.0442, "step": 11178 }, { "epoch": 2.7564102564102564, "grad_norm": 0.455078125, "learning_rate": 9.956969332698015e-05, "loss": 0.0433, "step": 11180 }, { "epoch": 2.7569033530571994, "grad_norm": 0.40234375, "learning_rate": 9.953526903178956e-05, "loss": 0.0367, "step": 11182 }, { "epoch": 2.757396449704142, "grad_norm": 0.32421875, "learning_rate": 9.950084479167217e-05, "loss": 0.0434, "step": 11184 }, { "epoch": 2.757889546351085, "grad_norm": 0.41015625, "learning_rate": 9.946642061070746e-05, "loss": 0.0423, "step": 11186 }, { "epoch": 2.7583826429980274, "grad_norm": 0.306640625, "learning_rate": 9.943199649297485e-05, "loss": 0.0421, "step": 11188 }, { "epoch": 2.7588757396449703, "grad_norm": 0.31640625, "learning_rate": 9.939757244255387e-05, "loss": 0.0376, "step": 11190 }, { "epoch": 2.7593688362919133, "grad_norm": 0.40234375, "learning_rate": 9.93631484635239e-05, "loss": 0.0408, "step": 11192 }, { "epoch": 2.759861932938856, "grad_norm": 0.34765625, "learning_rate": 9.93287245599644e-05, "loss": 0.0436, "step": 11194 }, { "epoch": 2.760355029585799, "grad_norm": 0.32421875, "learning_rate": 9.929430073595478e-05, "loss": 0.04, "step": 11196 }, { "epoch": 2.760848126232742, "grad_norm": 0.3828125, "learning_rate": 9.92598769955745e-05, "loss": 0.0416, "step": 11198 }, { "epoch": 2.7613412228796843, "grad_norm": 0.306640625, "learning_rate": 9.92254533429029e-05, "loss": 0.0416, "step": 11200 }, { "epoch": 2.7618343195266273, "grad_norm": 0.296875, "learning_rate": 9.919102978201939e-05, "loss": 0.0395, "step": 11202 }, { "epoch": 2.7623274161735702, "grad_norm": 0.35546875, "learning_rate": 9.915660631700338e-05, "loss": 0.0415, "step": 11204 }, { "epoch": 2.7628205128205128, "grad_norm": 0.322265625, "learning_rate": 9.91221829519342e-05, "loss": 0.0421, "step": 11206 }, { "epoch": 2.7633136094674557, "grad_norm": 0.33203125, "learning_rate": 9.90877596908913e-05, "loss": 0.0424, "step": 11208 }, { "epoch": 2.7638067061143983, "grad_norm": 0.333984375, "learning_rate": 9.905333653795393e-05, "loss": 0.0409, "step": 11210 }, { "epoch": 2.7642998027613412, "grad_norm": 0.625, "learning_rate": 9.90189134972015e-05, "loss": 0.0446, "step": 11212 }, { "epoch": 2.7647928994082838, "grad_norm": 0.421875, "learning_rate": 9.898449057271328e-05, "loss": 0.0443, "step": 11214 }, { "epoch": 2.7652859960552267, "grad_norm": 0.314453125, "learning_rate": 9.895006776856857e-05, "loss": 0.0367, "step": 11216 }, { "epoch": 2.7657790927021697, "grad_norm": 0.333984375, "learning_rate": 9.891564508884673e-05, "loss": 0.0451, "step": 11218 }, { "epoch": 2.7662721893491122, "grad_norm": 0.42578125, "learning_rate": 9.888122253762696e-05, "loss": 0.0449, "step": 11220 }, { "epoch": 2.766765285996055, "grad_norm": 0.318359375, "learning_rate": 9.88468001189886e-05, "loss": 0.0377, "step": 11222 }, { "epoch": 2.767258382642998, "grad_norm": 0.357421875, "learning_rate": 9.881237783701085e-05, "loss": 0.0438, "step": 11224 }, { "epoch": 2.7677514792899407, "grad_norm": 0.318359375, "learning_rate": 9.877795569577299e-05, "loss": 0.0433, "step": 11226 }, { "epoch": 2.7682445759368837, "grad_norm": 0.330078125, "learning_rate": 9.874353369935415e-05, "loss": 0.0413, "step": 11228 }, { "epoch": 2.7687376725838266, "grad_norm": 0.3203125, "learning_rate": 9.870911185183361e-05, "loss": 0.042, "step": 11230 }, { "epoch": 2.769230769230769, "grad_norm": 0.365234375, "learning_rate": 9.867469015729054e-05, "loss": 0.0365, "step": 11232 }, { "epoch": 2.769723865877712, "grad_norm": 0.294921875, "learning_rate": 9.864026861980404e-05, "loss": 0.0448, "step": 11234 }, { "epoch": 2.770216962524655, "grad_norm": 0.31640625, "learning_rate": 9.860584724345334e-05, "loss": 0.0399, "step": 11236 }, { "epoch": 2.7707100591715976, "grad_norm": 0.29296875, "learning_rate": 9.85714260323175e-05, "loss": 0.0402, "step": 11238 }, { "epoch": 2.7712031558185406, "grad_norm": 0.3203125, "learning_rate": 9.853700499047569e-05, "loss": 0.0414, "step": 11240 }, { "epoch": 2.771696252465483, "grad_norm": 0.4296875, "learning_rate": 9.850258412200691e-05, "loss": 0.0438, "step": 11242 }, { "epoch": 2.772189349112426, "grad_norm": 0.328125, "learning_rate": 9.84681634309903e-05, "loss": 0.0444, "step": 11244 }, { "epoch": 2.7726824457593686, "grad_norm": 0.298828125, "learning_rate": 9.843374292150488e-05, "loss": 0.0412, "step": 11246 }, { "epoch": 2.7731755424063116, "grad_norm": 0.294921875, "learning_rate": 9.839932259762963e-05, "loss": 0.0417, "step": 11248 }, { "epoch": 2.7736686390532546, "grad_norm": 0.474609375, "learning_rate": 9.836490246344364e-05, "loss": 0.0409, "step": 11250 }, { "epoch": 2.774161735700197, "grad_norm": 0.41015625, "learning_rate": 9.83304825230258e-05, "loss": 0.0489, "step": 11252 }, { "epoch": 2.77465483234714, "grad_norm": 0.330078125, "learning_rate": 9.829606278045513e-05, "loss": 0.048, "step": 11254 }, { "epoch": 2.775147928994083, "grad_norm": 0.3828125, "learning_rate": 9.82616432398105e-05, "loss": 0.0434, "step": 11256 }, { "epoch": 2.7756410256410255, "grad_norm": 0.365234375, "learning_rate": 9.822722390517089e-05, "loss": 0.0452, "step": 11258 }, { "epoch": 2.7761341222879685, "grad_norm": 0.33984375, "learning_rate": 9.81928047806151e-05, "loss": 0.0408, "step": 11260 }, { "epoch": 2.7766272189349115, "grad_norm": 0.337890625, "learning_rate": 9.815838587022206e-05, "loss": 0.0418, "step": 11262 }, { "epoch": 2.777120315581854, "grad_norm": 0.404296875, "learning_rate": 9.81239671780706e-05, "loss": 0.0417, "step": 11264 }, { "epoch": 2.777613412228797, "grad_norm": 0.40625, "learning_rate": 9.808954870823949e-05, "loss": 0.0421, "step": 11266 }, { "epoch": 2.7781065088757395, "grad_norm": 0.302734375, "learning_rate": 9.805513046480752e-05, "loss": 0.0451, "step": 11268 }, { "epoch": 2.7785996055226825, "grad_norm": 0.357421875, "learning_rate": 9.802071245185343e-05, "loss": 0.043, "step": 11270 }, { "epoch": 2.779092702169625, "grad_norm": 0.333984375, "learning_rate": 9.798629467345599e-05, "loss": 0.041, "step": 11272 }, { "epoch": 2.779585798816568, "grad_norm": 0.4375, "learning_rate": 9.795187713369384e-05, "loss": 0.0446, "step": 11274 }, { "epoch": 2.780078895463511, "grad_norm": 0.33984375, "learning_rate": 9.791745983664572e-05, "loss": 0.0404, "step": 11276 }, { "epoch": 2.7805719921104535, "grad_norm": 0.29296875, "learning_rate": 9.788304278639022e-05, "loss": 0.0407, "step": 11278 }, { "epoch": 2.7810650887573964, "grad_norm": 0.306640625, "learning_rate": 9.7848625987006e-05, "loss": 0.0414, "step": 11280 }, { "epoch": 2.7815581854043394, "grad_norm": 0.306640625, "learning_rate": 9.781420944257159e-05, "loss": 0.0414, "step": 11282 }, { "epoch": 2.782051282051282, "grad_norm": 0.30078125, "learning_rate": 9.777979315716553e-05, "loss": 0.045, "step": 11284 }, { "epoch": 2.782544378698225, "grad_norm": 0.439453125, "learning_rate": 9.774537713486643e-05, "loss": 0.0447, "step": 11286 }, { "epoch": 2.783037475345168, "grad_norm": 0.330078125, "learning_rate": 9.771096137975269e-05, "loss": 0.0391, "step": 11288 }, { "epoch": 2.7835305719921104, "grad_norm": 0.4375, "learning_rate": 9.767654589590284e-05, "loss": 0.0462, "step": 11290 }, { "epoch": 2.7840236686390534, "grad_norm": 0.435546875, "learning_rate": 9.764213068739526e-05, "loss": 0.0386, "step": 11292 }, { "epoch": 2.784516765285996, "grad_norm": 0.453125, "learning_rate": 9.760771575830837e-05, "loss": 0.0414, "step": 11294 }, { "epoch": 2.785009861932939, "grad_norm": 0.37890625, "learning_rate": 9.75733011127205e-05, "loss": 0.0463, "step": 11296 }, { "epoch": 2.7855029585798814, "grad_norm": 0.294921875, "learning_rate": 9.753888675470999e-05, "loss": 0.0384, "step": 11298 }, { "epoch": 2.7859960552268244, "grad_norm": 0.302734375, "learning_rate": 9.750447268835517e-05, "loss": 0.0389, "step": 11300 }, { "epoch": 2.7864891518737673, "grad_norm": 0.302734375, "learning_rate": 9.747005891773422e-05, "loss": 0.0397, "step": 11302 }, { "epoch": 2.78698224852071, "grad_norm": 0.294921875, "learning_rate": 9.743564544692546e-05, "loss": 0.0429, "step": 11304 }, { "epoch": 2.787475345167653, "grad_norm": 0.3046875, "learning_rate": 9.740123228000702e-05, "loss": 0.0421, "step": 11306 }, { "epoch": 2.787968441814596, "grad_norm": 0.349609375, "learning_rate": 9.736681942105708e-05, "loss": 0.0441, "step": 11308 }, { "epoch": 2.7884615384615383, "grad_norm": 0.318359375, "learning_rate": 9.73324068741537e-05, "loss": 0.0463, "step": 11310 }, { "epoch": 2.7889546351084813, "grad_norm": 0.314453125, "learning_rate": 9.729799464337503e-05, "loss": 0.0401, "step": 11312 }, { "epoch": 2.7894477317554243, "grad_norm": 0.28125, "learning_rate": 9.726358273279909e-05, "loss": 0.0442, "step": 11314 }, { "epoch": 2.789940828402367, "grad_norm": 0.337890625, "learning_rate": 9.722917114650383e-05, "loss": 0.0444, "step": 11316 }, { "epoch": 2.7904339250493098, "grad_norm": 0.3828125, "learning_rate": 9.71947598885673e-05, "loss": 0.0408, "step": 11318 }, { "epoch": 2.7909270216962527, "grad_norm": 0.388671875, "learning_rate": 9.716034896306737e-05, "loss": 0.0449, "step": 11320 }, { "epoch": 2.7914201183431953, "grad_norm": 0.38671875, "learning_rate": 9.712593837408196e-05, "loss": 0.0421, "step": 11322 }, { "epoch": 2.7919132149901382, "grad_norm": 0.314453125, "learning_rate": 9.709152812568886e-05, "loss": 0.0428, "step": 11324 }, { "epoch": 2.7924063116370808, "grad_norm": 0.32421875, "learning_rate": 9.705711822196597e-05, "loss": 0.0426, "step": 11326 }, { "epoch": 2.7928994082840237, "grad_norm": 0.341796875, "learning_rate": 9.702270866699094e-05, "loss": 0.04, "step": 11328 }, { "epoch": 2.7933925049309662, "grad_norm": 0.275390625, "learning_rate": 9.69882994648416e-05, "loss": 0.0402, "step": 11330 }, { "epoch": 2.793885601577909, "grad_norm": 0.3359375, "learning_rate": 9.695389061959559e-05, "loss": 0.0413, "step": 11332 }, { "epoch": 2.794378698224852, "grad_norm": 0.326171875, "learning_rate": 9.691948213533053e-05, "loss": 0.0391, "step": 11334 }, { "epoch": 2.7948717948717947, "grad_norm": 0.34375, "learning_rate": 9.688507401612406e-05, "loss": 0.0423, "step": 11336 }, { "epoch": 2.7953648915187377, "grad_norm": 0.302734375, "learning_rate": 9.685066626605366e-05, "loss": 0.0428, "step": 11338 }, { "epoch": 2.7958579881656807, "grad_norm": 0.341796875, "learning_rate": 9.681625888919695e-05, "loss": 0.0421, "step": 11340 }, { "epoch": 2.796351084812623, "grad_norm": 0.380859375, "learning_rate": 9.678185188963129e-05, "loss": 0.0429, "step": 11342 }, { "epoch": 2.796844181459566, "grad_norm": 0.32421875, "learning_rate": 9.674744527143417e-05, "loss": 0.0437, "step": 11344 }, { "epoch": 2.797337278106509, "grad_norm": 0.357421875, "learning_rate": 9.671303903868297e-05, "loss": 0.0407, "step": 11346 }, { "epoch": 2.7978303747534516, "grad_norm": 0.36328125, "learning_rate": 9.667863319545496e-05, "loss": 0.0421, "step": 11348 }, { "epoch": 2.7983234714003946, "grad_norm": 0.326171875, "learning_rate": 9.664422774582749e-05, "loss": 0.0431, "step": 11350 }, { "epoch": 2.798816568047337, "grad_norm": 0.3125, "learning_rate": 9.660982269387773e-05, "loss": 0.039, "step": 11352 }, { "epoch": 2.79930966469428, "grad_norm": 0.39453125, "learning_rate": 9.657541804368294e-05, "loss": 0.0429, "step": 11354 }, { "epoch": 2.7998027613412226, "grad_norm": 0.345703125, "learning_rate": 9.654101379932019e-05, "loss": 0.0392, "step": 11356 }, { "epoch": 2.8002958579881656, "grad_norm": 0.306640625, "learning_rate": 9.650660996486666e-05, "loss": 0.0437, "step": 11358 }, { "epoch": 2.8007889546351086, "grad_norm": 0.404296875, "learning_rate": 9.647220654439935e-05, "loss": 0.0451, "step": 11360 }, { "epoch": 2.801282051282051, "grad_norm": 0.357421875, "learning_rate": 9.643780354199527e-05, "loss": 0.0452, "step": 11362 }, { "epoch": 2.801775147928994, "grad_norm": 0.302734375, "learning_rate": 9.640340096173135e-05, "loss": 0.0415, "step": 11364 }, { "epoch": 2.802268244575937, "grad_norm": 0.302734375, "learning_rate": 9.636899880768446e-05, "loss": 0.0403, "step": 11366 }, { "epoch": 2.8027613412228796, "grad_norm": 0.310546875, "learning_rate": 9.633459708393154e-05, "loss": 0.0405, "step": 11368 }, { "epoch": 2.8032544378698225, "grad_norm": 0.306640625, "learning_rate": 9.630019579454927e-05, "loss": 0.046, "step": 11370 }, { "epoch": 2.8037475345167655, "grad_norm": 0.3828125, "learning_rate": 9.62657949436145e-05, "loss": 0.0405, "step": 11372 }, { "epoch": 2.804240631163708, "grad_norm": 0.298828125, "learning_rate": 9.623139453520387e-05, "loss": 0.0439, "step": 11374 }, { "epoch": 2.804733727810651, "grad_norm": 0.298828125, "learning_rate": 9.619699457339405e-05, "loss": 0.0387, "step": 11376 }, { "epoch": 2.8052268244575935, "grad_norm": 0.322265625, "learning_rate": 9.616259506226157e-05, "loss": 0.043, "step": 11378 }, { "epoch": 2.8057199211045365, "grad_norm": 0.345703125, "learning_rate": 9.6128196005883e-05, "loss": 0.0463, "step": 11380 }, { "epoch": 2.806213017751479, "grad_norm": 0.349609375, "learning_rate": 9.609379740833487e-05, "loss": 0.0464, "step": 11382 }, { "epoch": 2.806706114398422, "grad_norm": 0.365234375, "learning_rate": 9.605939927369349e-05, "loss": 0.0448, "step": 11384 }, { "epoch": 2.807199211045365, "grad_norm": 0.32421875, "learning_rate": 9.602500160603534e-05, "loss": 0.0417, "step": 11386 }, { "epoch": 2.8076923076923075, "grad_norm": 0.3671875, "learning_rate": 9.599060440943668e-05, "loss": 0.0431, "step": 11388 }, { "epoch": 2.8081854043392505, "grad_norm": 0.48046875, "learning_rate": 9.59562076879738e-05, "loss": 0.0387, "step": 11390 }, { "epoch": 2.8086785009861934, "grad_norm": 0.31640625, "learning_rate": 9.592181144572285e-05, "loss": 0.0403, "step": 11392 }, { "epoch": 2.809171597633136, "grad_norm": 0.296875, "learning_rate": 9.588741568676004e-05, "loss": 0.0425, "step": 11394 }, { "epoch": 2.809664694280079, "grad_norm": 0.353515625, "learning_rate": 9.585302041516144e-05, "loss": 0.0439, "step": 11396 }, { "epoch": 2.810157790927022, "grad_norm": 0.3125, "learning_rate": 9.581862563500304e-05, "loss": 0.0407, "step": 11398 }, { "epoch": 2.8106508875739644, "grad_norm": 0.33984375, "learning_rate": 9.578423135036088e-05, "loss": 0.0425, "step": 11400 }, { "epoch": 2.8111439842209074, "grad_norm": 0.353515625, "learning_rate": 9.574983756531084e-05, "loss": 0.0406, "step": 11402 }, { "epoch": 2.8116370808678504, "grad_norm": 0.41796875, "learning_rate": 9.57154442839288e-05, "loss": 0.041, "step": 11404 }, { "epoch": 2.812130177514793, "grad_norm": 0.3359375, "learning_rate": 9.568105151029047e-05, "loss": 0.0404, "step": 11406 }, { "epoch": 2.812623274161736, "grad_norm": 0.302734375, "learning_rate": 9.564665924847169e-05, "loss": 0.0414, "step": 11408 }, { "epoch": 2.8131163708086784, "grad_norm": 0.357421875, "learning_rate": 9.561226750254807e-05, "loss": 0.0408, "step": 11410 }, { "epoch": 2.8136094674556213, "grad_norm": 0.30078125, "learning_rate": 9.557787627659528e-05, "loss": 0.041, "step": 11412 }, { "epoch": 2.814102564102564, "grad_norm": 0.3828125, "learning_rate": 9.554348557468885e-05, "loss": 0.0422, "step": 11414 }, { "epoch": 2.814595660749507, "grad_norm": 0.494140625, "learning_rate": 9.550909540090424e-05, "loss": 0.0439, "step": 11416 }, { "epoch": 2.81508875739645, "grad_norm": 0.34765625, "learning_rate": 9.54747057593169e-05, "loss": 0.0434, "step": 11418 }, { "epoch": 2.8155818540433923, "grad_norm": 0.314453125, "learning_rate": 9.544031665400218e-05, "loss": 0.0403, "step": 11420 }, { "epoch": 2.8160749506903353, "grad_norm": 0.328125, "learning_rate": 9.540592808903543e-05, "loss": 0.0462, "step": 11422 }, { "epoch": 2.8165680473372783, "grad_norm": 0.400390625, "learning_rate": 9.53715400684918e-05, "loss": 0.0424, "step": 11424 }, { "epoch": 2.817061143984221, "grad_norm": 0.3046875, "learning_rate": 9.533715259644657e-05, "loss": 0.0407, "step": 11426 }, { "epoch": 2.8175542406311638, "grad_norm": 0.349609375, "learning_rate": 9.530276567697478e-05, "loss": 0.0429, "step": 11428 }, { "epoch": 2.8180473372781067, "grad_norm": 0.4296875, "learning_rate": 9.526837931415146e-05, "loss": 0.0439, "step": 11430 }, { "epoch": 2.8185404339250493, "grad_norm": 0.5078125, "learning_rate": 9.523399351205161e-05, "loss": 0.0423, "step": 11432 }, { "epoch": 2.8190335305719922, "grad_norm": 0.322265625, "learning_rate": 9.519960827475014e-05, "loss": 0.0449, "step": 11434 }, { "epoch": 2.8195266272189348, "grad_norm": 0.3125, "learning_rate": 9.516522360632192e-05, "loss": 0.0399, "step": 11436 }, { "epoch": 2.8200197238658777, "grad_norm": 0.52734375, "learning_rate": 9.513083951084165e-05, "loss": 0.0424, "step": 11438 }, { "epoch": 2.8205128205128203, "grad_norm": 0.359375, "learning_rate": 9.509645599238412e-05, "loss": 0.0402, "step": 11440 }, { "epoch": 2.8210059171597632, "grad_norm": 0.3359375, "learning_rate": 9.506207305502391e-05, "loss": 0.0454, "step": 11442 }, { "epoch": 2.821499013806706, "grad_norm": 0.35546875, "learning_rate": 9.502769070283562e-05, "loss": 0.0373, "step": 11444 }, { "epoch": 2.8219921104536487, "grad_norm": 0.5, "learning_rate": 9.499330893989377e-05, "loss": 0.045, "step": 11446 }, { "epoch": 2.8224852071005917, "grad_norm": 0.59375, "learning_rate": 9.495892777027274e-05, "loss": 0.045, "step": 11448 }, { "epoch": 2.8229783037475347, "grad_norm": 0.31640625, "learning_rate": 9.492454719804693e-05, "loss": 0.0414, "step": 11450 }, { "epoch": 2.823471400394477, "grad_norm": 0.349609375, "learning_rate": 9.489016722729058e-05, "loss": 0.0436, "step": 11452 }, { "epoch": 2.82396449704142, "grad_norm": 0.296875, "learning_rate": 9.485578786207799e-05, "loss": 0.0426, "step": 11454 }, { "epoch": 2.824457593688363, "grad_norm": 0.40234375, "learning_rate": 9.482140910648322e-05, "loss": 0.046, "step": 11456 }, { "epoch": 2.8249506903353057, "grad_norm": 0.32421875, "learning_rate": 9.478703096458039e-05, "loss": 0.0459, "step": 11458 }, { "epoch": 2.8254437869822486, "grad_norm": 0.37890625, "learning_rate": 9.475265344044351e-05, "loss": 0.0415, "step": 11460 }, { "epoch": 2.8259368836291916, "grad_norm": 0.294921875, "learning_rate": 9.47182765381465e-05, "loss": 0.0425, "step": 11462 }, { "epoch": 2.826429980276134, "grad_norm": 0.326171875, "learning_rate": 9.468390026176321e-05, "loss": 0.0432, "step": 11464 }, { "epoch": 2.8269230769230766, "grad_norm": 0.302734375, "learning_rate": 9.464952461536739e-05, "loss": 0.0407, "step": 11466 }, { "epoch": 2.8274161735700196, "grad_norm": 0.291015625, "learning_rate": 9.461514960303282e-05, "loss": 0.0401, "step": 11468 }, { "epoch": 2.8279092702169626, "grad_norm": 0.322265625, "learning_rate": 9.458077522883305e-05, "loss": 0.0399, "step": 11470 }, { "epoch": 2.828402366863905, "grad_norm": 0.27734375, "learning_rate": 9.454640149684169e-05, "loss": 0.0394, "step": 11472 }, { "epoch": 2.828895463510848, "grad_norm": 0.41796875, "learning_rate": 9.451202841113219e-05, "loss": 0.042, "step": 11474 }, { "epoch": 2.829388560157791, "grad_norm": 0.30859375, "learning_rate": 9.447765597577799e-05, "loss": 0.0414, "step": 11476 }, { "epoch": 2.8298816568047336, "grad_norm": 0.396484375, "learning_rate": 9.444328419485239e-05, "loss": 0.0399, "step": 11478 }, { "epoch": 2.8303747534516766, "grad_norm": 0.322265625, "learning_rate": 9.440891307242859e-05, "loss": 0.0416, "step": 11480 }, { "epoch": 2.8308678500986195, "grad_norm": 0.3828125, "learning_rate": 9.437454261257986e-05, "loss": 0.0419, "step": 11482 }, { "epoch": 2.831360946745562, "grad_norm": 0.302734375, "learning_rate": 9.434017281937918e-05, "loss": 0.0442, "step": 11484 }, { "epoch": 2.831854043392505, "grad_norm": 0.30078125, "learning_rate": 9.430580369689968e-05, "loss": 0.0398, "step": 11486 }, { "epoch": 2.832347140039448, "grad_norm": 0.4609375, "learning_rate": 9.42714352492142e-05, "loss": 0.0462, "step": 11488 }, { "epoch": 2.8328402366863905, "grad_norm": 0.3125, "learning_rate": 9.423706748039566e-05, "loss": 0.0378, "step": 11490 }, { "epoch": 2.8333333333333335, "grad_norm": 0.287109375, "learning_rate": 9.420270039451675e-05, "loss": 0.0432, "step": 11492 }, { "epoch": 2.833826429980276, "grad_norm": 0.361328125, "learning_rate": 9.416833399565027e-05, "loss": 0.0441, "step": 11494 }, { "epoch": 2.834319526627219, "grad_norm": 0.365234375, "learning_rate": 9.413396828786875e-05, "loss": 0.041, "step": 11496 }, { "epoch": 2.8348126232741615, "grad_norm": 0.3828125, "learning_rate": 9.40996032752447e-05, "loss": 0.0424, "step": 11498 }, { "epoch": 2.8353057199211045, "grad_norm": 0.318359375, "learning_rate": 9.406523896185066e-05, "loss": 0.0451, "step": 11500 }, { "epoch": 2.8357988165680474, "grad_norm": 0.3515625, "learning_rate": 9.40308753517589e-05, "loss": 0.042, "step": 11502 }, { "epoch": 2.83629191321499, "grad_norm": 0.326171875, "learning_rate": 9.399651244904178e-05, "loss": 0.0443, "step": 11504 }, { "epoch": 2.836785009861933, "grad_norm": 0.40234375, "learning_rate": 9.396215025777139e-05, "loss": 0.0453, "step": 11506 }, { "epoch": 2.837278106508876, "grad_norm": 0.279296875, "learning_rate": 9.392778878201995e-05, "loss": 0.0424, "step": 11508 }, { "epoch": 2.8377712031558184, "grad_norm": 0.376953125, "learning_rate": 9.389342802585946e-05, "loss": 0.0411, "step": 11510 }, { "epoch": 2.8382642998027614, "grad_norm": 0.353515625, "learning_rate": 9.385906799336179e-05, "loss": 0.046, "step": 11512 }, { "epoch": 2.8387573964497044, "grad_norm": 0.322265625, "learning_rate": 9.38247086885989e-05, "loss": 0.0425, "step": 11514 }, { "epoch": 2.839250493096647, "grad_norm": 0.337890625, "learning_rate": 9.379035011564247e-05, "loss": 0.0432, "step": 11516 }, { "epoch": 2.83974358974359, "grad_norm": 0.337890625, "learning_rate": 9.375599227856426e-05, "loss": 0.043, "step": 11518 }, { "epoch": 2.8402366863905324, "grad_norm": 0.4453125, "learning_rate": 9.372163518143578e-05, "loss": 0.0408, "step": 11520 }, { "epoch": 2.8407297830374754, "grad_norm": 0.3515625, "learning_rate": 9.368727882832863e-05, "loss": 0.0409, "step": 11522 }, { "epoch": 2.841222879684418, "grad_norm": 0.337890625, "learning_rate": 9.365292322331413e-05, "loss": 0.0455, "step": 11524 }, { "epoch": 2.841715976331361, "grad_norm": 0.3671875, "learning_rate": 9.361856837046373e-05, "loss": 0.0411, "step": 11526 }, { "epoch": 2.842209072978304, "grad_norm": 0.33203125, "learning_rate": 9.358421427384859e-05, "loss": 0.0422, "step": 11528 }, { "epoch": 2.8427021696252464, "grad_norm": 0.357421875, "learning_rate": 9.354986093753986e-05, "loss": 0.045, "step": 11530 }, { "epoch": 2.8431952662721893, "grad_norm": 0.435546875, "learning_rate": 9.351550836560864e-05, "loss": 0.0423, "step": 11532 }, { "epoch": 2.8436883629191323, "grad_norm": 0.3515625, "learning_rate": 9.348115656212585e-05, "loss": 0.0424, "step": 11534 }, { "epoch": 2.844181459566075, "grad_norm": 0.333984375, "learning_rate": 9.344680553116244e-05, "loss": 0.0395, "step": 11536 }, { "epoch": 2.844674556213018, "grad_norm": 0.294921875, "learning_rate": 9.341245527678912e-05, "loss": 0.0423, "step": 11538 }, { "epoch": 2.8451676528599608, "grad_norm": 0.310546875, "learning_rate": 9.337810580307668e-05, "loss": 0.0409, "step": 11540 }, { "epoch": 2.8456607495069033, "grad_norm": 0.34765625, "learning_rate": 9.334375711409564e-05, "loss": 0.0427, "step": 11542 }, { "epoch": 2.8461538461538463, "grad_norm": 0.28125, "learning_rate": 9.330940921391657e-05, "loss": 0.0386, "step": 11544 }, { "epoch": 2.8466469428007892, "grad_norm": 0.318359375, "learning_rate": 9.327506210660988e-05, "loss": 0.0438, "step": 11546 }, { "epoch": 2.8471400394477318, "grad_norm": 0.296875, "learning_rate": 9.324071579624583e-05, "loss": 0.0405, "step": 11548 }, { "epoch": 2.8476331360946747, "grad_norm": 0.34765625, "learning_rate": 9.320637028689474e-05, "loss": 0.0392, "step": 11550 }, { "epoch": 2.8481262327416172, "grad_norm": 0.359375, "learning_rate": 9.317202558262664e-05, "loss": 0.0421, "step": 11552 }, { "epoch": 2.84861932938856, "grad_norm": 0.380859375, "learning_rate": 9.31376816875117e-05, "loss": 0.038, "step": 11554 }, { "epoch": 2.8491124260355027, "grad_norm": 0.3125, "learning_rate": 9.310333860561975e-05, "loss": 0.042, "step": 11556 }, { "epoch": 2.8496055226824457, "grad_norm": 0.35546875, "learning_rate": 9.306899634102071e-05, "loss": 0.0393, "step": 11558 }, { "epoch": 2.8500986193293887, "grad_norm": 0.3203125, "learning_rate": 9.30346548977843e-05, "loss": 0.0397, "step": 11560 }, { "epoch": 2.850591715976331, "grad_norm": 0.2890625, "learning_rate": 9.300031427998014e-05, "loss": 0.0386, "step": 11562 }, { "epoch": 2.851084812623274, "grad_norm": 0.3515625, "learning_rate": 9.296597449167785e-05, "loss": 0.0418, "step": 11564 }, { "epoch": 2.851577909270217, "grad_norm": 0.357421875, "learning_rate": 9.293163553694682e-05, "loss": 0.0438, "step": 11566 }, { "epoch": 2.8520710059171597, "grad_norm": 0.39453125, "learning_rate": 9.289729741985649e-05, "loss": 0.0416, "step": 11568 }, { "epoch": 2.8525641025641026, "grad_norm": 0.333984375, "learning_rate": 9.286296014447604e-05, "loss": 0.0436, "step": 11570 }, { "epoch": 2.8530571992110456, "grad_norm": 0.263671875, "learning_rate": 9.28286237148747e-05, "loss": 0.0377, "step": 11572 }, { "epoch": 2.853550295857988, "grad_norm": 0.29296875, "learning_rate": 9.279428813512143e-05, "loss": 0.0415, "step": 11574 }, { "epoch": 2.854043392504931, "grad_norm": 0.33984375, "learning_rate": 9.275995340928529e-05, "loss": 0.0445, "step": 11576 }, { "epoch": 2.8545364891518736, "grad_norm": 0.443359375, "learning_rate": 9.27256195414351e-05, "loss": 0.0461, "step": 11578 }, { "epoch": 2.8550295857988166, "grad_norm": 0.302734375, "learning_rate": 9.269128653563957e-05, "loss": 0.0434, "step": 11580 }, { "epoch": 2.855522682445759, "grad_norm": 0.421875, "learning_rate": 9.265695439596744e-05, "loss": 0.0454, "step": 11582 }, { "epoch": 2.856015779092702, "grad_norm": 0.3359375, "learning_rate": 9.262262312648717e-05, "loss": 0.0468, "step": 11584 }, { "epoch": 2.856508875739645, "grad_norm": 0.328125, "learning_rate": 9.258829273126729e-05, "loss": 0.0411, "step": 11586 }, { "epoch": 2.8570019723865876, "grad_norm": 0.298828125, "learning_rate": 9.255396321437606e-05, "loss": 0.0412, "step": 11588 }, { "epoch": 2.8574950690335306, "grad_norm": 0.373046875, "learning_rate": 9.251963457988179e-05, "loss": 0.0462, "step": 11590 }, { "epoch": 2.8579881656804735, "grad_norm": 0.302734375, "learning_rate": 9.248530683185256e-05, "loss": 0.0434, "step": 11592 }, { "epoch": 2.858481262327416, "grad_norm": 0.47265625, "learning_rate": 9.245097997435646e-05, "loss": 0.0388, "step": 11594 }, { "epoch": 2.858974358974359, "grad_norm": 0.36328125, "learning_rate": 9.241665401146139e-05, "loss": 0.0385, "step": 11596 }, { "epoch": 2.859467455621302, "grad_norm": 0.275390625, "learning_rate": 9.238232894723513e-05, "loss": 0.039, "step": 11598 }, { "epoch": 2.8599605522682445, "grad_norm": 0.333984375, "learning_rate": 9.234800478574544e-05, "loss": 0.0404, "step": 11600 }, { "epoch": 2.8604536489151875, "grad_norm": 0.455078125, "learning_rate": 9.231368153105988e-05, "loss": 0.0421, "step": 11602 }, { "epoch": 2.86094674556213, "grad_norm": 0.30859375, "learning_rate": 9.2279359187246e-05, "loss": 0.0426, "step": 11604 }, { "epoch": 2.861439842209073, "grad_norm": 0.421875, "learning_rate": 9.224503775837112e-05, "loss": 0.0382, "step": 11606 }, { "epoch": 2.8619329388560155, "grad_norm": 0.345703125, "learning_rate": 9.221071724850262e-05, "loss": 0.041, "step": 11608 }, { "epoch": 2.8624260355029585, "grad_norm": 0.392578125, "learning_rate": 9.21763976617076e-05, "loss": 0.0395, "step": 11610 }, { "epoch": 2.8629191321499015, "grad_norm": 0.37109375, "learning_rate": 9.214207900205314e-05, "loss": 0.0422, "step": 11612 }, { "epoch": 2.863412228796844, "grad_norm": 0.30859375, "learning_rate": 9.21077612736062e-05, "loss": 0.0377, "step": 11614 }, { "epoch": 2.863905325443787, "grad_norm": 0.31640625, "learning_rate": 9.207344448043356e-05, "loss": 0.0397, "step": 11616 }, { "epoch": 2.86439842209073, "grad_norm": 0.546875, "learning_rate": 9.203912862660205e-05, "loss": 0.046, "step": 11618 }, { "epoch": 2.8648915187376724, "grad_norm": 0.33203125, "learning_rate": 9.200481371617821e-05, "loss": 0.0401, "step": 11620 }, { "epoch": 2.8653846153846154, "grad_norm": 0.3046875, "learning_rate": 9.197049975322863e-05, "loss": 0.0424, "step": 11622 }, { "epoch": 2.8658777120315584, "grad_norm": 0.333984375, "learning_rate": 9.193618674181963e-05, "loss": 0.0431, "step": 11624 }, { "epoch": 2.866370808678501, "grad_norm": 0.31640625, "learning_rate": 9.190187468601756e-05, "loss": 0.0401, "step": 11626 }, { "epoch": 2.866863905325444, "grad_norm": 0.375, "learning_rate": 9.186756358988855e-05, "loss": 0.0422, "step": 11628 }, { "epoch": 2.867357001972387, "grad_norm": 0.361328125, "learning_rate": 9.18332534574986e-05, "loss": 0.0394, "step": 11630 }, { "epoch": 2.8678500986193294, "grad_norm": 0.41015625, "learning_rate": 9.179894429291376e-05, "loss": 0.0415, "step": 11632 }, { "epoch": 2.8683431952662723, "grad_norm": 0.357421875, "learning_rate": 9.176463610019979e-05, "loss": 0.0432, "step": 11634 }, { "epoch": 2.868836291913215, "grad_norm": 0.34375, "learning_rate": 9.173032888342244e-05, "loss": 0.0346, "step": 11636 }, { "epoch": 2.869329388560158, "grad_norm": 0.341796875, "learning_rate": 9.169602264664728e-05, "loss": 0.0466, "step": 11638 }, { "epoch": 2.8698224852071004, "grad_norm": 0.3515625, "learning_rate": 9.166171739393983e-05, "loss": 0.0419, "step": 11640 }, { "epoch": 2.8703155818540433, "grad_norm": 0.283203125, "learning_rate": 9.16274131293654e-05, "loss": 0.0361, "step": 11642 }, { "epoch": 2.8708086785009863, "grad_norm": 0.333984375, "learning_rate": 9.159310985698923e-05, "loss": 0.0427, "step": 11644 }, { "epoch": 2.871301775147929, "grad_norm": 0.4453125, "learning_rate": 9.155880758087653e-05, "loss": 0.0431, "step": 11646 }, { "epoch": 2.871794871794872, "grad_norm": 0.3828125, "learning_rate": 9.152450630509221e-05, "loss": 0.038, "step": 11648 }, { "epoch": 2.8722879684418148, "grad_norm": 0.3984375, "learning_rate": 9.149020603370127e-05, "loss": 0.0373, "step": 11650 }, { "epoch": 2.8727810650887573, "grad_norm": 0.30078125, "learning_rate": 9.145590677076839e-05, "loss": 0.0388, "step": 11652 }, { "epoch": 2.8732741617357003, "grad_norm": 0.306640625, "learning_rate": 9.142160852035828e-05, "loss": 0.0412, "step": 11654 }, { "epoch": 2.8737672583826432, "grad_norm": 0.39453125, "learning_rate": 9.138731128653543e-05, "loss": 0.0434, "step": 11656 }, { "epoch": 2.8742603550295858, "grad_norm": 0.404296875, "learning_rate": 9.135301507336431e-05, "loss": 0.0412, "step": 11658 }, { "epoch": 2.8747534516765287, "grad_norm": 0.326171875, "learning_rate": 9.13187198849092e-05, "loss": 0.0396, "step": 11660 }, { "epoch": 2.8752465483234713, "grad_norm": 0.349609375, "learning_rate": 9.128442572523417e-05, "loss": 0.0435, "step": 11662 }, { "epoch": 2.8757396449704142, "grad_norm": 0.30859375, "learning_rate": 9.125013259840343e-05, "loss": 0.0452, "step": 11664 }, { "epoch": 2.8762327416173568, "grad_norm": 0.431640625, "learning_rate": 9.121584050848081e-05, "loss": 0.0397, "step": 11666 }, { "epoch": 2.8767258382642997, "grad_norm": 0.392578125, "learning_rate": 9.118154945953014e-05, "loss": 0.0399, "step": 11668 }, { "epoch": 2.8772189349112427, "grad_norm": 0.3515625, "learning_rate": 9.114725945561506e-05, "loss": 0.0428, "step": 11670 }, { "epoch": 2.8777120315581852, "grad_norm": 0.3515625, "learning_rate": 9.111297050079921e-05, "loss": 0.0398, "step": 11672 }, { "epoch": 2.878205128205128, "grad_norm": 0.31640625, "learning_rate": 9.107868259914592e-05, "loss": 0.0446, "step": 11674 }, { "epoch": 2.878698224852071, "grad_norm": 0.3984375, "learning_rate": 9.10443957547186e-05, "loss": 0.0438, "step": 11676 }, { "epoch": 2.8791913214990137, "grad_norm": 0.318359375, "learning_rate": 9.10101099715804e-05, "loss": 0.0403, "step": 11678 }, { "epoch": 2.8796844181459567, "grad_norm": 0.333984375, "learning_rate": 9.097582525379431e-05, "loss": 0.0416, "step": 11680 }, { "epoch": 2.8801775147928996, "grad_norm": 0.3203125, "learning_rate": 9.094154160542335e-05, "loss": 0.0405, "step": 11682 }, { "epoch": 2.880670611439842, "grad_norm": 0.41015625, "learning_rate": 9.090725903053025e-05, "loss": 0.0408, "step": 11684 }, { "epoch": 2.881163708086785, "grad_norm": 0.3046875, "learning_rate": 9.087297753317776e-05, "loss": 0.0402, "step": 11686 }, { "epoch": 2.8816568047337277, "grad_norm": 0.39453125, "learning_rate": 9.083869711742834e-05, "loss": 0.0376, "step": 11688 }, { "epoch": 2.8821499013806706, "grad_norm": 0.33203125, "learning_rate": 9.080441778734451e-05, "loss": 0.0404, "step": 11690 }, { "epoch": 2.882642998027613, "grad_norm": 0.455078125, "learning_rate": 9.077013954698851e-05, "loss": 0.0387, "step": 11692 }, { "epoch": 2.883136094674556, "grad_norm": 0.322265625, "learning_rate": 9.07358624004225e-05, "loss": 0.046, "step": 11694 }, { "epoch": 2.883629191321499, "grad_norm": 0.361328125, "learning_rate": 9.070158635170852e-05, "loss": 0.0403, "step": 11696 }, { "epoch": 2.8841222879684416, "grad_norm": 0.341796875, "learning_rate": 9.066731140490844e-05, "loss": 0.0371, "step": 11698 }, { "epoch": 2.8846153846153846, "grad_norm": 0.37109375, "learning_rate": 9.063303756408411e-05, "loss": 0.0416, "step": 11700 }, { "epoch": 2.8851084812623276, "grad_norm": 0.365234375, "learning_rate": 9.059876483329708e-05, "loss": 0.0458, "step": 11702 }, { "epoch": 2.88560157790927, "grad_norm": 0.390625, "learning_rate": 9.056449321660895e-05, "loss": 0.0406, "step": 11704 }, { "epoch": 2.886094674556213, "grad_norm": 0.296875, "learning_rate": 9.053022271808104e-05, "loss": 0.0393, "step": 11706 }, { "epoch": 2.886587771203156, "grad_norm": 0.361328125, "learning_rate": 9.049595334177463e-05, "loss": 0.0369, "step": 11708 }, { "epoch": 2.8870808678500985, "grad_norm": 0.349609375, "learning_rate": 9.04616850917508e-05, "loss": 0.0452, "step": 11710 }, { "epoch": 2.8875739644970415, "grad_norm": 0.3828125, "learning_rate": 9.042741797207052e-05, "loss": 0.0435, "step": 11712 }, { "epoch": 2.8880670611439845, "grad_norm": 0.33984375, "learning_rate": 9.039315198679469e-05, "loss": 0.0387, "step": 11714 }, { "epoch": 2.888560157790927, "grad_norm": 0.337890625, "learning_rate": 9.035888713998394e-05, "loss": 0.0431, "step": 11716 }, { "epoch": 2.88905325443787, "grad_norm": 0.380859375, "learning_rate": 9.032462343569894e-05, "loss": 0.0393, "step": 11718 }, { "epoch": 2.8895463510848125, "grad_norm": 0.310546875, "learning_rate": 9.029036087800005e-05, "loss": 0.0414, "step": 11720 }, { "epoch": 2.8900394477317555, "grad_norm": 0.3828125, "learning_rate": 9.025609947094766e-05, "loss": 0.0436, "step": 11722 }, { "epoch": 2.890532544378698, "grad_norm": 0.29296875, "learning_rate": 9.022183921860181e-05, "loss": 0.0436, "step": 11724 }, { "epoch": 2.891025641025641, "grad_norm": 0.330078125, "learning_rate": 9.018758012502267e-05, "loss": 0.0405, "step": 11726 }, { "epoch": 2.891518737672584, "grad_norm": 0.30078125, "learning_rate": 9.015332219427006e-05, "loss": 0.0411, "step": 11728 }, { "epoch": 2.8920118343195265, "grad_norm": 0.51171875, "learning_rate": 9.01190654304037e-05, "loss": 0.0439, "step": 11730 }, { "epoch": 2.8925049309664694, "grad_norm": 0.306640625, "learning_rate": 9.008480983748331e-05, "loss": 0.041, "step": 11732 }, { "epoch": 2.8929980276134124, "grad_norm": 0.296875, "learning_rate": 9.00505554195683e-05, "loss": 0.0408, "step": 11734 }, { "epoch": 2.893491124260355, "grad_norm": 0.31640625, "learning_rate": 9.001630218071803e-05, "loss": 0.0378, "step": 11736 }, { "epoch": 2.893984220907298, "grad_norm": 0.291015625, "learning_rate": 8.998205012499165e-05, "loss": 0.0413, "step": 11738 }, { "epoch": 2.894477317554241, "grad_norm": 0.32421875, "learning_rate": 8.994779925644831e-05, "loss": 0.0445, "step": 11740 }, { "epoch": 2.8949704142011834, "grad_norm": 0.28125, "learning_rate": 8.991354957914687e-05, "loss": 0.038, "step": 11742 }, { "epoch": 2.8954635108481264, "grad_norm": 0.41796875, "learning_rate": 8.98793010971461e-05, "loss": 0.0419, "step": 11744 }, { "epoch": 2.895956607495069, "grad_norm": 0.314453125, "learning_rate": 8.984505381450468e-05, "loss": 0.0411, "step": 11746 }, { "epoch": 2.896449704142012, "grad_norm": 0.359375, "learning_rate": 8.981080773528107e-05, "loss": 0.0431, "step": 11748 }, { "epoch": 2.8969428007889544, "grad_norm": 0.353515625, "learning_rate": 8.977656286353366e-05, "loss": 0.0394, "step": 11750 }, { "epoch": 2.8974358974358974, "grad_norm": 0.35546875, "learning_rate": 8.974231920332056e-05, "loss": 0.038, "step": 11752 }, { "epoch": 2.8979289940828403, "grad_norm": 0.35546875, "learning_rate": 8.970807675869997e-05, "loss": 0.0414, "step": 11754 }, { "epoch": 2.898422090729783, "grad_norm": 0.330078125, "learning_rate": 8.967383553372971e-05, "loss": 0.0416, "step": 11756 }, { "epoch": 2.898915187376726, "grad_norm": 0.322265625, "learning_rate": 8.963959553246763e-05, "loss": 0.039, "step": 11758 }, { "epoch": 2.899408284023669, "grad_norm": 0.283203125, "learning_rate": 8.960535675897133e-05, "loss": 0.044, "step": 11760 }, { "epoch": 2.8999013806706113, "grad_norm": 0.458984375, "learning_rate": 8.957111921729826e-05, "loss": 0.0421, "step": 11762 }, { "epoch": 2.9003944773175543, "grad_norm": 0.38671875, "learning_rate": 8.953688291150582e-05, "loss": 0.0449, "step": 11764 }, { "epoch": 2.9008875739644973, "grad_norm": 0.294921875, "learning_rate": 8.950264784565112e-05, "loss": 0.0363, "step": 11766 }, { "epoch": 2.90138067061144, "grad_norm": 0.28515625, "learning_rate": 8.94684140237913e-05, "loss": 0.0393, "step": 11768 }, { "epoch": 2.9018737672583828, "grad_norm": 0.484375, "learning_rate": 8.943418144998318e-05, "loss": 0.0397, "step": 11770 }, { "epoch": 2.9023668639053253, "grad_norm": 0.322265625, "learning_rate": 8.93999501282836e-05, "loss": 0.0341, "step": 11772 }, { "epoch": 2.9028599605522682, "grad_norm": 0.3125, "learning_rate": 8.936572006274911e-05, "loss": 0.0418, "step": 11774 }, { "epoch": 2.9033530571992108, "grad_norm": 0.33984375, "learning_rate": 8.933149125743615e-05, "loss": 0.0416, "step": 11776 }, { "epoch": 2.9038461538461537, "grad_norm": 0.482421875, "learning_rate": 8.929726371640106e-05, "loss": 0.0421, "step": 11778 }, { "epoch": 2.9043392504930967, "grad_norm": 0.39453125, "learning_rate": 8.926303744369993e-05, "loss": 0.0496, "step": 11780 }, { "epoch": 2.9048323471400392, "grad_norm": 0.306640625, "learning_rate": 8.922881244338886e-05, "loss": 0.039, "step": 11782 }, { "epoch": 2.905325443786982, "grad_norm": 0.296875, "learning_rate": 8.91945887195236e-05, "loss": 0.0414, "step": 11784 }, { "epoch": 2.905818540433925, "grad_norm": 0.298828125, "learning_rate": 8.916036627615995e-05, "loss": 0.0417, "step": 11786 }, { "epoch": 2.9063116370808677, "grad_norm": 0.330078125, "learning_rate": 8.91261451173534e-05, "loss": 0.041, "step": 11788 }, { "epoch": 2.9068047337278107, "grad_norm": 0.318359375, "learning_rate": 8.90919252471594e-05, "loss": 0.042, "step": 11790 }, { "epoch": 2.9072978303747536, "grad_norm": 0.32421875, "learning_rate": 8.905770666963314e-05, "loss": 0.043, "step": 11792 }, { "epoch": 2.907790927021696, "grad_norm": 0.302734375, "learning_rate": 8.902348938882969e-05, "loss": 0.0409, "step": 11794 }, { "epoch": 2.908284023668639, "grad_norm": 0.3203125, "learning_rate": 8.898927340880409e-05, "loss": 0.0375, "step": 11796 }, { "epoch": 2.908777120315582, "grad_norm": 0.291015625, "learning_rate": 8.895505873361102e-05, "loss": 0.0412, "step": 11798 }, { "epoch": 2.9092702169625246, "grad_norm": 0.30859375, "learning_rate": 8.892084536730519e-05, "loss": 0.0397, "step": 11800 }, { "epoch": 2.9097633136094676, "grad_norm": 0.318359375, "learning_rate": 8.888663331394103e-05, "loss": 0.0431, "step": 11802 }, { "epoch": 2.91025641025641, "grad_norm": 0.3046875, "learning_rate": 8.885242257757289e-05, "loss": 0.0398, "step": 11804 }, { "epoch": 2.910749506903353, "grad_norm": 0.2734375, "learning_rate": 8.881821316225489e-05, "loss": 0.0417, "step": 11806 }, { "epoch": 2.9112426035502956, "grad_norm": 0.2734375, "learning_rate": 8.878400507204108e-05, "loss": 0.041, "step": 11808 }, { "epoch": 2.9117357001972386, "grad_norm": 0.28125, "learning_rate": 8.87497983109853e-05, "loss": 0.0347, "step": 11810 }, { "epoch": 2.9122287968441816, "grad_norm": 0.30859375, "learning_rate": 8.871559288314119e-05, "loss": 0.0425, "step": 11812 }, { "epoch": 2.912721893491124, "grad_norm": 0.32421875, "learning_rate": 8.868138879256235e-05, "loss": 0.0377, "step": 11814 }, { "epoch": 2.913214990138067, "grad_norm": 0.330078125, "learning_rate": 8.864718604330215e-05, "loss": 0.0411, "step": 11816 }, { "epoch": 2.91370808678501, "grad_norm": 0.310546875, "learning_rate": 8.861298463941378e-05, "loss": 0.039, "step": 11818 }, { "epoch": 2.9142011834319526, "grad_norm": 0.333984375, "learning_rate": 8.857878458495028e-05, "loss": 0.0408, "step": 11820 }, { "epoch": 2.9146942800788955, "grad_norm": 0.3359375, "learning_rate": 8.854458588396462e-05, "loss": 0.0415, "step": 11822 }, { "epoch": 2.9151873767258385, "grad_norm": 0.287109375, "learning_rate": 8.851038854050948e-05, "loss": 0.0373, "step": 11824 }, { "epoch": 2.915680473372781, "grad_norm": 0.34375, "learning_rate": 8.847619255863742e-05, "loss": 0.0392, "step": 11826 }, { "epoch": 2.916173570019724, "grad_norm": 0.328125, "learning_rate": 8.84419979424009e-05, "loss": 0.0412, "step": 11828 }, { "epoch": 2.9166666666666665, "grad_norm": 0.34765625, "learning_rate": 8.840780469585216e-05, "loss": 0.0383, "step": 11830 }, { "epoch": 2.9171597633136095, "grad_norm": 0.283203125, "learning_rate": 8.837361282304329e-05, "loss": 0.0402, "step": 11832 }, { "epoch": 2.917652859960552, "grad_norm": 0.3203125, "learning_rate": 8.833942232802618e-05, "loss": 0.0415, "step": 11834 }, { "epoch": 2.918145956607495, "grad_norm": 0.349609375, "learning_rate": 8.830523321485268e-05, "loss": 0.0384, "step": 11836 }, { "epoch": 2.918639053254438, "grad_norm": 0.345703125, "learning_rate": 8.827104548757427e-05, "loss": 0.0381, "step": 11838 }, { "epoch": 2.9191321499013805, "grad_norm": 0.279296875, "learning_rate": 8.823685915024253e-05, "loss": 0.0419, "step": 11840 }, { "epoch": 2.9196252465483234, "grad_norm": 0.3203125, "learning_rate": 8.820267420690864e-05, "loss": 0.0432, "step": 11842 }, { "epoch": 2.9201183431952664, "grad_norm": 0.296875, "learning_rate": 8.81684906616237e-05, "loss": 0.0405, "step": 11844 }, { "epoch": 2.920611439842209, "grad_norm": 0.28515625, "learning_rate": 8.813430851843871e-05, "loss": 0.0398, "step": 11846 }, { "epoch": 2.921104536489152, "grad_norm": 0.33984375, "learning_rate": 8.810012778140437e-05, "loss": 0.0395, "step": 11848 }, { "epoch": 2.921597633136095, "grad_norm": 0.287109375, "learning_rate": 8.806594845457134e-05, "loss": 0.0417, "step": 11850 }, { "epoch": 2.9220907297830374, "grad_norm": 0.283203125, "learning_rate": 8.803177054199003e-05, "loss": 0.0391, "step": 11852 }, { "epoch": 2.9225838264299804, "grad_norm": 0.33203125, "learning_rate": 8.799759404771078e-05, "loss": 0.0418, "step": 11854 }, { "epoch": 2.9230769230769234, "grad_norm": 0.3125, "learning_rate": 8.796341897578362e-05, "loss": 0.0372, "step": 11856 }, { "epoch": 2.923570019723866, "grad_norm": 0.353515625, "learning_rate": 8.792924533025854e-05, "loss": 0.0418, "step": 11858 }, { "epoch": 2.9240631163708084, "grad_norm": 0.349609375, "learning_rate": 8.789507311518528e-05, "loss": 0.0435, "step": 11860 }, { "epoch": 2.9245562130177514, "grad_norm": 0.314453125, "learning_rate": 8.786090233461339e-05, "loss": 0.0464, "step": 11862 }, { "epoch": 2.9250493096646943, "grad_norm": 0.2734375, "learning_rate": 8.78267329925924e-05, "loss": 0.0376, "step": 11864 }, { "epoch": 2.925542406311637, "grad_norm": 0.365234375, "learning_rate": 8.779256509317148e-05, "loss": 0.0413, "step": 11866 }, { "epoch": 2.92603550295858, "grad_norm": 0.255859375, "learning_rate": 8.775839864039978e-05, "loss": 0.0399, "step": 11868 }, { "epoch": 2.926528599605523, "grad_norm": 0.392578125, "learning_rate": 8.772423363832618e-05, "loss": 0.0438, "step": 11870 }, { "epoch": 2.9270216962524653, "grad_norm": 0.298828125, "learning_rate": 8.769007009099946e-05, "loss": 0.0426, "step": 11872 }, { "epoch": 2.9275147928994083, "grad_norm": 0.27734375, "learning_rate": 8.765590800246816e-05, "loss": 0.0383, "step": 11874 }, { "epoch": 2.9280078895463513, "grad_norm": 0.39453125, "learning_rate": 8.762174737678063e-05, "loss": 0.0434, "step": 11876 }, { "epoch": 2.928500986193294, "grad_norm": 0.365234375, "learning_rate": 8.75875882179852e-05, "loss": 0.0412, "step": 11878 }, { "epoch": 2.9289940828402368, "grad_norm": 0.33984375, "learning_rate": 8.755343053012982e-05, "loss": 0.0434, "step": 11880 }, { "epoch": 2.9294871794871797, "grad_norm": 0.33203125, "learning_rate": 8.751927431726245e-05, "loss": 0.0383, "step": 11882 }, { "epoch": 2.9299802761341223, "grad_norm": 0.3046875, "learning_rate": 8.748511958343076e-05, "loss": 0.0394, "step": 11884 }, { "epoch": 2.9304733727810652, "grad_norm": 0.4453125, "learning_rate": 8.745096633268227e-05, "loss": 0.0421, "step": 11886 }, { "epoch": 2.9309664694280078, "grad_norm": 0.333984375, "learning_rate": 8.74168145690643e-05, "loss": 0.0397, "step": 11888 }, { "epoch": 2.9314595660749507, "grad_norm": 0.44140625, "learning_rate": 8.738266429662411e-05, "loss": 0.0472, "step": 11890 }, { "epoch": 2.9319526627218933, "grad_norm": 0.30859375, "learning_rate": 8.734851551940863e-05, "loss": 0.0422, "step": 11892 }, { "epoch": 2.9324457593688362, "grad_norm": 0.5, "learning_rate": 8.731436824146468e-05, "loss": 0.04, "step": 11894 }, { "epoch": 2.932938856015779, "grad_norm": 0.52734375, "learning_rate": 8.728022246683894e-05, "loss": 0.0415, "step": 11896 }, { "epoch": 2.9334319526627217, "grad_norm": 0.314453125, "learning_rate": 8.724607819957785e-05, "loss": 0.04, "step": 11898 }, { "epoch": 2.9339250493096647, "grad_norm": 0.30859375, "learning_rate": 8.721193544372772e-05, "loss": 0.0422, "step": 11900 }, { "epoch": 2.9344181459566077, "grad_norm": 0.302734375, "learning_rate": 8.717779420333459e-05, "loss": 0.0417, "step": 11902 }, { "epoch": 2.93491124260355, "grad_norm": 0.302734375, "learning_rate": 8.714365448244448e-05, "loss": 0.045, "step": 11904 }, { "epoch": 2.935404339250493, "grad_norm": 0.47265625, "learning_rate": 8.71095162851031e-05, "loss": 0.0411, "step": 11906 }, { "epoch": 2.935897435897436, "grad_norm": 0.306640625, "learning_rate": 8.707537961535597e-05, "loss": 0.0385, "step": 11908 }, { "epoch": 2.9363905325443787, "grad_norm": 0.3828125, "learning_rate": 8.704124447724855e-05, "loss": 0.0445, "step": 11910 }, { "epoch": 2.9368836291913216, "grad_norm": 0.306640625, "learning_rate": 8.7007110874826e-05, "loss": 0.0397, "step": 11912 }, { "epoch": 2.937376725838264, "grad_norm": 0.287109375, "learning_rate": 8.697297881213338e-05, "loss": 0.0416, "step": 11914 }, { "epoch": 2.937869822485207, "grad_norm": 0.267578125, "learning_rate": 8.693884829321544e-05, "loss": 0.0366, "step": 11916 }, { "epoch": 2.9383629191321496, "grad_norm": 0.294921875, "learning_rate": 8.690471932211697e-05, "loss": 0.0385, "step": 11918 }, { "epoch": 2.9388560157790926, "grad_norm": 0.3359375, "learning_rate": 8.687059190288231e-05, "loss": 0.0442, "step": 11920 }, { "epoch": 2.9393491124260356, "grad_norm": 0.29296875, "learning_rate": 8.683646603955587e-05, "loss": 0.038, "step": 11922 }, { "epoch": 2.939842209072978, "grad_norm": 0.30078125, "learning_rate": 8.68023417361817e-05, "loss": 0.0407, "step": 11924 }, { "epoch": 2.940335305719921, "grad_norm": 0.365234375, "learning_rate": 8.676821899680369e-05, "loss": 0.0426, "step": 11926 }, { "epoch": 2.940828402366864, "grad_norm": 0.330078125, "learning_rate": 8.673409782546564e-05, "loss": 0.0407, "step": 11928 }, { "epoch": 2.9413214990138066, "grad_norm": 0.365234375, "learning_rate": 8.669997822621102e-05, "loss": 0.0408, "step": 11930 }, { "epoch": 2.9418145956607495, "grad_norm": 0.3046875, "learning_rate": 8.666586020308328e-05, "loss": 0.0382, "step": 11932 }, { "epoch": 2.9423076923076925, "grad_norm": 0.34375, "learning_rate": 8.66317437601255e-05, "loss": 0.0424, "step": 11934 }, { "epoch": 2.942800788954635, "grad_norm": 0.392578125, "learning_rate": 8.659762890138079e-05, "loss": 0.0422, "step": 11936 }, { "epoch": 2.943293885601578, "grad_norm": 0.345703125, "learning_rate": 8.656351563089184e-05, "loss": 0.0406, "step": 11938 }, { "epoch": 2.943786982248521, "grad_norm": 0.30859375, "learning_rate": 8.652940395270134e-05, "loss": 0.0431, "step": 11940 }, { "epoch": 2.9442800788954635, "grad_norm": 0.34765625, "learning_rate": 8.649529387085168e-05, "loss": 0.0399, "step": 11942 }, { "epoch": 2.9447731755424065, "grad_norm": 0.361328125, "learning_rate": 8.646118538938506e-05, "loss": 0.044, "step": 11944 }, { "epoch": 2.945266272189349, "grad_norm": 0.30078125, "learning_rate": 8.642707851234359e-05, "loss": 0.0392, "step": 11946 }, { "epoch": 2.945759368836292, "grad_norm": 0.341796875, "learning_rate": 8.639297324376906e-05, "loss": 0.043, "step": 11948 }, { "epoch": 2.9462524654832345, "grad_norm": 0.294921875, "learning_rate": 8.635886958770323e-05, "loss": 0.0394, "step": 11950 }, { "epoch": 2.9467455621301775, "grad_norm": 0.294921875, "learning_rate": 8.632476754818748e-05, "loss": 0.0407, "step": 11952 }, { "epoch": 2.9472386587771204, "grad_norm": 0.30078125, "learning_rate": 8.629066712926314e-05, "loss": 0.0403, "step": 11954 }, { "epoch": 2.947731755424063, "grad_norm": 0.302734375, "learning_rate": 8.625656833497128e-05, "loss": 0.0386, "step": 11956 }, { "epoch": 2.948224852071006, "grad_norm": 0.439453125, "learning_rate": 8.622247116935282e-05, "loss": 0.0478, "step": 11958 }, { "epoch": 2.948717948717949, "grad_norm": 0.322265625, "learning_rate": 8.618837563644844e-05, "loss": 0.0394, "step": 11960 }, { "epoch": 2.9492110453648914, "grad_norm": 0.4296875, "learning_rate": 8.615428174029864e-05, "loss": 0.042, "step": 11962 }, { "epoch": 2.9497041420118344, "grad_norm": 0.294921875, "learning_rate": 8.61201894849438e-05, "loss": 0.0409, "step": 11964 }, { "epoch": 2.9501972386587774, "grad_norm": 0.3125, "learning_rate": 8.608609887442398e-05, "loss": 0.0417, "step": 11966 }, { "epoch": 2.95069033530572, "grad_norm": 0.306640625, "learning_rate": 8.605200991277916e-05, "loss": 0.0441, "step": 11968 }, { "epoch": 2.951183431952663, "grad_norm": 0.37109375, "learning_rate": 8.601792260404897e-05, "loss": 0.0444, "step": 11970 }, { "epoch": 2.9516765285996054, "grad_norm": 0.40234375, "learning_rate": 8.598383695227308e-05, "loss": 0.0435, "step": 11972 }, { "epoch": 2.9521696252465484, "grad_norm": 0.29296875, "learning_rate": 8.594975296149076e-05, "loss": 0.0412, "step": 11974 }, { "epoch": 2.952662721893491, "grad_norm": 0.36328125, "learning_rate": 8.591567063574112e-05, "loss": 0.0455, "step": 11976 }, { "epoch": 2.953155818540434, "grad_norm": 0.439453125, "learning_rate": 8.588158997906319e-05, "loss": 0.0405, "step": 11978 }, { "epoch": 2.953648915187377, "grad_norm": 0.43359375, "learning_rate": 8.584751099549565e-05, "loss": 0.0436, "step": 11980 }, { "epoch": 2.9541420118343193, "grad_norm": 0.287109375, "learning_rate": 8.581343368907708e-05, "loss": 0.0364, "step": 11982 }, { "epoch": 2.9546351084812623, "grad_norm": 0.455078125, "learning_rate": 8.577935806384583e-05, "loss": 0.0397, "step": 11984 }, { "epoch": 2.9551282051282053, "grad_norm": 0.30859375, "learning_rate": 8.574528412384006e-05, "loss": 0.0414, "step": 11986 }, { "epoch": 2.955621301775148, "grad_norm": 0.373046875, "learning_rate": 8.571121187309766e-05, "loss": 0.0399, "step": 11988 }, { "epoch": 2.956114398422091, "grad_norm": 0.357421875, "learning_rate": 8.567714131565648e-05, "loss": 0.035, "step": 11990 }, { "epoch": 2.9566074950690338, "grad_norm": 0.36328125, "learning_rate": 8.564307245555403e-05, "loss": 0.041, "step": 11992 }, { "epoch": 2.9571005917159763, "grad_norm": 0.28515625, "learning_rate": 8.560900529682763e-05, "loss": 0.0431, "step": 11994 }, { "epoch": 2.9575936883629192, "grad_norm": 0.404296875, "learning_rate": 8.557493984351445e-05, "loss": 0.0387, "step": 11996 }, { "epoch": 2.9580867850098618, "grad_norm": 0.3671875, "learning_rate": 8.554087609965143e-05, "loss": 0.0423, "step": 11998 }, { "epoch": 2.9585798816568047, "grad_norm": 0.384765625, "learning_rate": 8.550681406927535e-05, "loss": 0.051, "step": 12000 }, { "epoch": 2.9590729783037473, "grad_norm": 0.3046875, "learning_rate": 8.547275375642267e-05, "loss": 0.0409, "step": 12002 }, { "epoch": 2.9595660749506902, "grad_norm": 0.287109375, "learning_rate": 8.543869516512984e-05, "loss": 0.0429, "step": 12004 }, { "epoch": 2.960059171597633, "grad_norm": 0.33984375, "learning_rate": 8.540463829943292e-05, "loss": 0.0457, "step": 12006 }, { "epoch": 2.9605522682445757, "grad_norm": 0.52734375, "learning_rate": 8.537058316336783e-05, "loss": 0.0438, "step": 12008 }, { "epoch": 2.9610453648915187, "grad_norm": 0.330078125, "learning_rate": 8.533652976097032e-05, "loss": 0.0426, "step": 12010 }, { "epoch": 2.9615384615384617, "grad_norm": 0.34765625, "learning_rate": 8.53024780962759e-05, "loss": 0.0442, "step": 12012 }, { "epoch": 2.962031558185404, "grad_norm": 0.3515625, "learning_rate": 8.526842817331992e-05, "loss": 0.0401, "step": 12014 }, { "epoch": 2.962524654832347, "grad_norm": 0.349609375, "learning_rate": 8.52343799961374e-05, "loss": 0.0436, "step": 12016 }, { "epoch": 2.96301775147929, "grad_norm": 0.36328125, "learning_rate": 8.520033356876332e-05, "loss": 0.0411, "step": 12018 }, { "epoch": 2.9635108481262327, "grad_norm": 0.30859375, "learning_rate": 8.516628889523234e-05, "loss": 0.0405, "step": 12020 }, { "epoch": 2.9640039447731756, "grad_norm": 0.4765625, "learning_rate": 8.513224597957893e-05, "loss": 0.0438, "step": 12022 }, { "epoch": 2.9644970414201186, "grad_norm": 0.298828125, "learning_rate": 8.509820482583742e-05, "loss": 0.0397, "step": 12024 }, { "epoch": 2.964990138067061, "grad_norm": 0.322265625, "learning_rate": 8.506416543804182e-05, "loss": 0.0406, "step": 12026 }, { "epoch": 2.965483234714004, "grad_norm": 0.361328125, "learning_rate": 8.503012782022602e-05, "loss": 0.0398, "step": 12028 }, { "epoch": 2.9659763313609466, "grad_norm": 0.30078125, "learning_rate": 8.499609197642361e-05, "loss": 0.0412, "step": 12030 }, { "epoch": 2.9664694280078896, "grad_norm": 0.28515625, "learning_rate": 8.496205791066811e-05, "loss": 0.0406, "step": 12032 }, { "epoch": 2.966962524654832, "grad_norm": 0.34765625, "learning_rate": 8.49280256269927e-05, "loss": 0.0378, "step": 12034 }, { "epoch": 2.967455621301775, "grad_norm": 0.3125, "learning_rate": 8.48939951294304e-05, "loss": 0.0399, "step": 12036 }, { "epoch": 2.967948717948718, "grad_norm": 0.291015625, "learning_rate": 8.485996642201404e-05, "loss": 0.0395, "step": 12038 }, { "epoch": 2.9684418145956606, "grad_norm": 0.330078125, "learning_rate": 8.482593950877616e-05, "loss": 0.0363, "step": 12040 }, { "epoch": 2.9689349112426036, "grad_norm": 0.3515625, "learning_rate": 8.47919143937492e-05, "loss": 0.0419, "step": 12042 }, { "epoch": 2.9694280078895465, "grad_norm": 0.33203125, "learning_rate": 8.475789108096526e-05, "loss": 0.0489, "step": 12044 }, { "epoch": 2.969921104536489, "grad_norm": 0.2890625, "learning_rate": 8.472386957445635e-05, "loss": 0.0393, "step": 12046 }, { "epoch": 2.970414201183432, "grad_norm": 0.294921875, "learning_rate": 8.468984987825417e-05, "loss": 0.038, "step": 12048 }, { "epoch": 2.970907297830375, "grad_norm": 0.28125, "learning_rate": 8.465583199639028e-05, "loss": 0.0366, "step": 12050 }, { "epoch": 2.9714003944773175, "grad_norm": 0.376953125, "learning_rate": 8.462181593289596e-05, "loss": 0.0417, "step": 12052 }, { "epoch": 2.9718934911242605, "grad_norm": 0.29296875, "learning_rate": 8.458780169180232e-05, "loss": 0.0372, "step": 12054 }, { "epoch": 2.972386587771203, "grad_norm": 0.34765625, "learning_rate": 8.455378927714024e-05, "loss": 0.0407, "step": 12056 }, { "epoch": 2.972879684418146, "grad_norm": 0.2890625, "learning_rate": 8.451977869294034e-05, "loss": 0.0349, "step": 12058 }, { "epoch": 2.9733727810650885, "grad_norm": 0.37890625, "learning_rate": 8.448576994323311e-05, "loss": 0.0419, "step": 12060 }, { "epoch": 2.9738658777120315, "grad_norm": 0.302734375, "learning_rate": 8.445176303204875e-05, "loss": 0.0419, "step": 12062 }, { "epoch": 2.9743589743589745, "grad_norm": 0.314453125, "learning_rate": 8.44177579634173e-05, "loss": 0.0434, "step": 12064 }, { "epoch": 2.974852071005917, "grad_norm": 0.294921875, "learning_rate": 8.438375474136853e-05, "loss": 0.0383, "step": 12066 }, { "epoch": 2.97534516765286, "grad_norm": 0.30078125, "learning_rate": 8.434975336993202e-05, "loss": 0.039, "step": 12068 }, { "epoch": 2.975838264299803, "grad_norm": 0.294921875, "learning_rate": 8.431575385313708e-05, "loss": 0.0419, "step": 12070 }, { "epoch": 2.9763313609467454, "grad_norm": 0.291015625, "learning_rate": 8.428175619501291e-05, "loss": 0.0407, "step": 12072 }, { "epoch": 2.9768244575936884, "grad_norm": 0.34765625, "learning_rate": 8.424776039958841e-05, "loss": 0.0374, "step": 12074 }, { "epoch": 2.9773175542406314, "grad_norm": 0.33984375, "learning_rate": 8.421376647089218e-05, "loss": 0.0394, "step": 12076 }, { "epoch": 2.977810650887574, "grad_norm": 0.28125, "learning_rate": 8.417977441295283e-05, "loss": 0.0372, "step": 12078 }, { "epoch": 2.978303747534517, "grad_norm": 0.29296875, "learning_rate": 8.41457842297985e-05, "loss": 0.0361, "step": 12080 }, { "epoch": 2.9787968441814594, "grad_norm": 0.35546875, "learning_rate": 8.41117959254573e-05, "loss": 0.0427, "step": 12082 }, { "epoch": 2.9792899408284024, "grad_norm": 0.330078125, "learning_rate": 8.407780950395694e-05, "loss": 0.0405, "step": 12084 }, { "epoch": 2.979783037475345, "grad_norm": 0.302734375, "learning_rate": 8.40438249693251e-05, "loss": 0.0414, "step": 12086 }, { "epoch": 2.980276134122288, "grad_norm": 0.310546875, "learning_rate": 8.400984232558908e-05, "loss": 0.0402, "step": 12088 }, { "epoch": 2.980769230769231, "grad_norm": 0.298828125, "learning_rate": 8.397586157677598e-05, "loss": 0.0435, "step": 12090 }, { "epoch": 2.9812623274161734, "grad_norm": 0.2890625, "learning_rate": 8.39418827269128e-05, "loss": 0.0385, "step": 12092 }, { "epoch": 2.9817554240631163, "grad_norm": 0.369140625, "learning_rate": 8.390790578002615e-05, "loss": 0.0387, "step": 12094 }, { "epoch": 2.9822485207100593, "grad_norm": 0.427734375, "learning_rate": 8.387393074014252e-05, "loss": 0.0432, "step": 12096 }, { "epoch": 2.982741617357002, "grad_norm": 0.341796875, "learning_rate": 8.383995761128811e-05, "loss": 0.039, "step": 12098 }, { "epoch": 2.983234714003945, "grad_norm": 0.3125, "learning_rate": 8.380598639748897e-05, "loss": 0.0434, "step": 12100 }, { "epoch": 2.9837278106508878, "grad_norm": 0.318359375, "learning_rate": 8.377201710277084e-05, "loss": 0.0413, "step": 12102 }, { "epoch": 2.9842209072978303, "grad_norm": 0.419921875, "learning_rate": 8.373804973115931e-05, "loss": 0.0463, "step": 12104 }, { "epoch": 2.9847140039447733, "grad_norm": 0.37890625, "learning_rate": 8.370408428667971e-05, "loss": 0.0431, "step": 12106 }, { "epoch": 2.9852071005917162, "grad_norm": 0.326171875, "learning_rate": 8.367012077335706e-05, "loss": 0.0415, "step": 12108 }, { "epoch": 2.9857001972386588, "grad_norm": 0.3125, "learning_rate": 8.36361591952163e-05, "loss": 0.0393, "step": 12110 }, { "epoch": 2.9861932938856017, "grad_norm": 0.3125, "learning_rate": 8.3602199556282e-05, "loss": 0.0371, "step": 12112 }, { "epoch": 2.9866863905325443, "grad_norm": 0.30859375, "learning_rate": 8.356824186057864e-05, "loss": 0.035, "step": 12114 }, { "epoch": 2.9871794871794872, "grad_norm": 0.3203125, "learning_rate": 8.353428611213033e-05, "loss": 0.0418, "step": 12116 }, { "epoch": 2.9876725838264298, "grad_norm": 0.357421875, "learning_rate": 8.35003323149611e-05, "loss": 0.0355, "step": 12118 }, { "epoch": 2.9881656804733727, "grad_norm": 0.380859375, "learning_rate": 8.346638047309458e-05, "loss": 0.0376, "step": 12120 }, { "epoch": 2.9886587771203157, "grad_norm": 0.306640625, "learning_rate": 8.34324305905543e-05, "loss": 0.0422, "step": 12122 }, { "epoch": 2.989151873767258, "grad_norm": 0.32421875, "learning_rate": 8.339848267136349e-05, "loss": 0.0393, "step": 12124 }, { "epoch": 2.989644970414201, "grad_norm": 0.38671875, "learning_rate": 8.336453671954516e-05, "loss": 0.0429, "step": 12126 }, { "epoch": 2.990138067061144, "grad_norm": 0.34765625, "learning_rate": 8.333059273912214e-05, "loss": 0.0395, "step": 12128 }, { "epoch": 2.9906311637080867, "grad_norm": 0.40625, "learning_rate": 8.32966507341169e-05, "loss": 0.0413, "step": 12130 }, { "epoch": 2.9911242603550297, "grad_norm": 0.3203125, "learning_rate": 8.326271070855185e-05, "loss": 0.0423, "step": 12132 }, { "epoch": 2.9916173570019726, "grad_norm": 0.392578125, "learning_rate": 8.322877266644902e-05, "loss": 0.0403, "step": 12134 }, { "epoch": 2.992110453648915, "grad_norm": 0.376953125, "learning_rate": 8.319483661183029e-05, "loss": 0.0385, "step": 12136 }, { "epoch": 2.992603550295858, "grad_norm": 0.408203125, "learning_rate": 8.316090254871725e-05, "loss": 0.0382, "step": 12138 }, { "epoch": 2.9930966469428006, "grad_norm": 0.30078125, "learning_rate": 8.312697048113121e-05, "loss": 0.041, "step": 12140 }, { "epoch": 2.9935897435897436, "grad_norm": 0.43359375, "learning_rate": 8.309304041309344e-05, "loss": 0.0433, "step": 12142 }, { "epoch": 2.994082840236686, "grad_norm": 0.4296875, "learning_rate": 8.305911234862473e-05, "loss": 0.0426, "step": 12144 }, { "epoch": 2.994575936883629, "grad_norm": 0.3515625, "learning_rate": 8.302518629174582e-05, "loss": 0.0365, "step": 12146 }, { "epoch": 2.995069033530572, "grad_norm": 0.40625, "learning_rate": 8.29912622464771e-05, "loss": 0.0449, "step": 12148 }, { "epoch": 2.9955621301775146, "grad_norm": 0.4609375, "learning_rate": 8.29573402168388e-05, "loss": 0.0391, "step": 12150 }, { "epoch": 2.9960552268244576, "grad_norm": 0.3046875, "learning_rate": 8.292342020685079e-05, "loss": 0.0385, "step": 12152 }, { "epoch": 2.9965483234714005, "grad_norm": 0.365234375, "learning_rate": 8.288950222053286e-05, "loss": 0.0405, "step": 12154 }, { "epoch": 2.997041420118343, "grad_norm": 0.384765625, "learning_rate": 8.285558626190447e-05, "loss": 0.0413, "step": 12156 }, { "epoch": 2.997534516765286, "grad_norm": 0.36328125, "learning_rate": 8.282167233498476e-05, "loss": 0.0402, "step": 12158 }, { "epoch": 2.998027613412229, "grad_norm": 0.32421875, "learning_rate": 8.278776044379286e-05, "loss": 0.0427, "step": 12160 }, { "epoch": 2.9985207100591715, "grad_norm": 0.431640625, "learning_rate": 8.275385059234741e-05, "loss": 0.0455, "step": 12162 }, { "epoch": 2.9990138067061145, "grad_norm": 0.2734375, "learning_rate": 8.271994278466698e-05, "loss": 0.0388, "step": 12164 }, { "epoch": 2.9995069033530575, "grad_norm": 0.384765625, "learning_rate": 8.268603702476977e-05, "loss": 0.0402, "step": 12166 }, { "epoch": 3.0, "grad_norm": 0.466796875, "learning_rate": 8.265213331667388e-05, "loss": 0.0356, "step": 12168 }, { "epoch": 3.0, "eval_loss": 0.0602399967610836, "eval_runtime": 73.8138, "eval_samples_per_second": 215.8, "eval_steps_per_second": 1.693, "step": 12168 }, { "epoch": 3.000493096646943, "grad_norm": 0.267578125, "learning_rate": 8.261823166439706e-05, "loss": 0.0281, "step": 12170 }, { "epoch": 3.0009861932938855, "grad_norm": 0.287109375, "learning_rate": 8.258433207195679e-05, "loss": 0.024, "step": 12172 }, { "epoch": 3.0014792899408285, "grad_norm": 0.255859375, "learning_rate": 8.255043454337043e-05, "loss": 0.0246, "step": 12174 }, { "epoch": 3.001972386587771, "grad_norm": 0.265625, "learning_rate": 8.2516539082655e-05, "loss": 0.0279, "step": 12176 }, { "epoch": 3.002465483234714, "grad_norm": 0.271484375, "learning_rate": 8.248264569382732e-05, "loss": 0.0258, "step": 12178 }, { "epoch": 3.002958579881657, "grad_norm": 0.2158203125, "learning_rate": 8.244875438090388e-05, "loss": 0.0239, "step": 12180 }, { "epoch": 3.0034516765285995, "grad_norm": 0.287109375, "learning_rate": 8.241486514790109e-05, "loss": 0.0305, "step": 12182 }, { "epoch": 3.0039447731755424, "grad_norm": 0.310546875, "learning_rate": 8.238097799883492e-05, "loss": 0.0325, "step": 12184 }, { "epoch": 3.0044378698224854, "grad_norm": 0.2734375, "learning_rate": 8.234709293772127e-05, "loss": 0.0293, "step": 12186 }, { "epoch": 3.004930966469428, "grad_norm": 0.26171875, "learning_rate": 8.231320996857567e-05, "loss": 0.0285, "step": 12188 }, { "epoch": 3.005424063116371, "grad_norm": 0.28125, "learning_rate": 8.227932909541341e-05, "loss": 0.0267, "step": 12190 }, { "epoch": 3.0059171597633134, "grad_norm": 0.283203125, "learning_rate": 8.224545032224963e-05, "loss": 0.0293, "step": 12192 }, { "epoch": 3.0064102564102564, "grad_norm": 0.263671875, "learning_rate": 8.221157365309907e-05, "loss": 0.0274, "step": 12194 }, { "epoch": 3.0069033530571994, "grad_norm": 0.31640625, "learning_rate": 8.217769909197636e-05, "loss": 0.0261, "step": 12196 }, { "epoch": 3.007396449704142, "grad_norm": 0.34765625, "learning_rate": 8.214382664289582e-05, "loss": 0.0262, "step": 12198 }, { "epoch": 3.007889546351085, "grad_norm": 0.26953125, "learning_rate": 8.210995630987152e-05, "loss": 0.0283, "step": 12200 }, { "epoch": 3.008382642998028, "grad_norm": 0.26171875, "learning_rate": 8.207608809691729e-05, "loss": 0.0278, "step": 12202 }, { "epoch": 3.0088757396449703, "grad_norm": 0.29296875, "learning_rate": 8.204222200804671e-05, "loss": 0.0273, "step": 12204 }, { "epoch": 3.0093688362919133, "grad_norm": 0.251953125, "learning_rate": 8.200835804727308e-05, "loss": 0.0274, "step": 12206 }, { "epoch": 3.009861932938856, "grad_norm": 0.3125, "learning_rate": 8.197449621860943e-05, "loss": 0.0287, "step": 12208 }, { "epoch": 3.010355029585799, "grad_norm": 0.30859375, "learning_rate": 8.194063652606867e-05, "loss": 0.027, "step": 12210 }, { "epoch": 3.010848126232742, "grad_norm": 0.283203125, "learning_rate": 8.190677897366325e-05, "loss": 0.024, "step": 12212 }, { "epoch": 3.0113412228796843, "grad_norm": 0.27734375, "learning_rate": 8.187292356540561e-05, "loss": 0.0302, "step": 12214 }, { "epoch": 3.0118343195266273, "grad_norm": 0.33984375, "learning_rate": 8.18390703053077e-05, "loss": 0.0258, "step": 12216 }, { "epoch": 3.01232741617357, "grad_norm": 0.2470703125, "learning_rate": 8.180521919738138e-05, "loss": 0.0251, "step": 12218 }, { "epoch": 3.0128205128205128, "grad_norm": 0.265625, "learning_rate": 8.177137024563818e-05, "loss": 0.0282, "step": 12220 }, { "epoch": 3.0133136094674557, "grad_norm": 0.2578125, "learning_rate": 8.173752345408933e-05, "loss": 0.0256, "step": 12222 }, { "epoch": 3.0138067061143983, "grad_norm": 0.244140625, "learning_rate": 8.170367882674595e-05, "loss": 0.0259, "step": 12224 }, { "epoch": 3.0142998027613412, "grad_norm": 0.29296875, "learning_rate": 8.166983636761875e-05, "loss": 0.0253, "step": 12226 }, { "epoch": 3.014792899408284, "grad_norm": 0.271484375, "learning_rate": 8.163599608071833e-05, "loss": 0.0267, "step": 12228 }, { "epoch": 3.0152859960552267, "grad_norm": 0.251953125, "learning_rate": 8.160215797005488e-05, "loss": 0.0309, "step": 12230 }, { "epoch": 3.0157790927021697, "grad_norm": 0.25, "learning_rate": 8.156832203963846e-05, "loss": 0.0256, "step": 12232 }, { "epoch": 3.0162721893491122, "grad_norm": 0.263671875, "learning_rate": 8.153448829347873e-05, "loss": 0.0353, "step": 12234 }, { "epoch": 3.016765285996055, "grad_norm": 0.2275390625, "learning_rate": 8.15006567355853e-05, "loss": 0.0211, "step": 12236 }, { "epoch": 3.017258382642998, "grad_norm": 0.294921875, "learning_rate": 8.14668273699673e-05, "loss": 0.0236, "step": 12238 }, { "epoch": 3.0177514792899407, "grad_norm": 0.259765625, "learning_rate": 8.143300020063372e-05, "loss": 0.0285, "step": 12240 }, { "epoch": 3.0182445759368837, "grad_norm": 0.2373046875, "learning_rate": 8.13991752315933e-05, "loss": 0.0272, "step": 12242 }, { "epoch": 3.0187376725838266, "grad_norm": 0.251953125, "learning_rate": 8.136535246685445e-05, "loss": 0.0254, "step": 12244 }, { "epoch": 3.019230769230769, "grad_norm": 0.30078125, "learning_rate": 8.133153191042541e-05, "loss": 0.0259, "step": 12246 }, { "epoch": 3.019723865877712, "grad_norm": 0.28515625, "learning_rate": 8.129771356631403e-05, "loss": 0.0313, "step": 12248 }, { "epoch": 3.0202169625246547, "grad_norm": 0.31640625, "learning_rate": 8.126389743852806e-05, "loss": 0.0265, "step": 12250 }, { "epoch": 3.0207100591715976, "grad_norm": 0.28515625, "learning_rate": 8.123008353107479e-05, "loss": 0.0304, "step": 12252 }, { "epoch": 3.0212031558185406, "grad_norm": 0.287109375, "learning_rate": 8.119627184796149e-05, "loss": 0.0236, "step": 12254 }, { "epoch": 3.021696252465483, "grad_norm": 0.32421875, "learning_rate": 8.116246239319492e-05, "loss": 0.0264, "step": 12256 }, { "epoch": 3.022189349112426, "grad_norm": 0.267578125, "learning_rate": 8.112865517078174e-05, "loss": 0.0271, "step": 12258 }, { "epoch": 3.0226824457593686, "grad_norm": 0.22265625, "learning_rate": 8.109485018472832e-05, "loss": 0.0241, "step": 12260 }, { "epoch": 3.0231755424063116, "grad_norm": 0.259765625, "learning_rate": 8.106104743904064e-05, "loss": 0.0321, "step": 12262 }, { "epoch": 3.0236686390532546, "grad_norm": 0.265625, "learning_rate": 8.102724693772462e-05, "loss": 0.0292, "step": 12264 }, { "epoch": 3.024161735700197, "grad_norm": 0.3125, "learning_rate": 8.099344868478574e-05, "loss": 0.0269, "step": 12266 }, { "epoch": 3.02465483234714, "grad_norm": 0.275390625, "learning_rate": 8.095965268422933e-05, "loss": 0.0276, "step": 12268 }, { "epoch": 3.025147928994083, "grad_norm": 0.2490234375, "learning_rate": 8.092585894006041e-05, "loss": 0.026, "step": 12270 }, { "epoch": 3.0256410256410255, "grad_norm": 0.279296875, "learning_rate": 8.089206745628367e-05, "loss": 0.025, "step": 12272 }, { "epoch": 3.0261341222879685, "grad_norm": 0.26953125, "learning_rate": 8.085827823690364e-05, "loss": 0.0279, "step": 12274 }, { "epoch": 3.026627218934911, "grad_norm": 0.279296875, "learning_rate": 8.082449128592447e-05, "loss": 0.0267, "step": 12276 }, { "epoch": 3.027120315581854, "grad_norm": 0.26953125, "learning_rate": 8.079070660735019e-05, "loss": 0.024, "step": 12278 }, { "epoch": 3.027613412228797, "grad_norm": 0.322265625, "learning_rate": 8.07569242051844e-05, "loss": 0.0282, "step": 12280 }, { "epoch": 3.0281065088757395, "grad_norm": 0.2314453125, "learning_rate": 8.072314408343056e-05, "loss": 0.0262, "step": 12282 }, { "epoch": 3.0285996055226825, "grad_norm": 0.251953125, "learning_rate": 8.068936624609176e-05, "loss": 0.0309, "step": 12284 }, { "epoch": 3.0290927021696255, "grad_norm": 0.2353515625, "learning_rate": 8.065559069717088e-05, "loss": 0.0224, "step": 12286 }, { "epoch": 3.029585798816568, "grad_norm": 0.275390625, "learning_rate": 8.062181744067052e-05, "loss": 0.0259, "step": 12288 }, { "epoch": 3.030078895463511, "grad_norm": 0.275390625, "learning_rate": 8.058804648059294e-05, "loss": 0.0261, "step": 12290 }, { "epoch": 3.0305719921104535, "grad_norm": 0.275390625, "learning_rate": 8.05542778209403e-05, "loss": 0.0289, "step": 12292 }, { "epoch": 3.0310650887573964, "grad_norm": 0.23828125, "learning_rate": 8.052051146571423e-05, "loss": 0.029, "step": 12294 }, { "epoch": 3.0315581854043394, "grad_norm": 0.25, "learning_rate": 8.048674741891638e-05, "loss": 0.024, "step": 12296 }, { "epoch": 3.032051282051282, "grad_norm": 0.263671875, "learning_rate": 8.045298568454788e-05, "loss": 0.0271, "step": 12298 }, { "epoch": 3.032544378698225, "grad_norm": 0.263671875, "learning_rate": 8.041922626660973e-05, "loss": 0.0279, "step": 12300 }, { "epoch": 3.033037475345168, "grad_norm": 0.2197265625, "learning_rate": 8.038546916910259e-05, "loss": 0.0247, "step": 12302 }, { "epoch": 3.0335305719921104, "grad_norm": 0.26171875, "learning_rate": 8.035171439602684e-05, "loss": 0.0269, "step": 12304 }, { "epoch": 3.0340236686390534, "grad_norm": 0.296875, "learning_rate": 8.031796195138266e-05, "loss": 0.0255, "step": 12306 }, { "epoch": 3.034516765285996, "grad_norm": 0.248046875, "learning_rate": 8.028421183916986e-05, "loss": 0.0281, "step": 12308 }, { "epoch": 3.035009861932939, "grad_norm": 0.27734375, "learning_rate": 8.025046406338806e-05, "loss": 0.0261, "step": 12310 }, { "epoch": 3.035502958579882, "grad_norm": 0.283203125, "learning_rate": 8.02167186280365e-05, "loss": 0.0244, "step": 12312 }, { "epoch": 3.0359960552268244, "grad_norm": 0.2353515625, "learning_rate": 8.018297553711429e-05, "loss": 0.0263, "step": 12314 }, { "epoch": 3.0364891518737673, "grad_norm": 0.287109375, "learning_rate": 8.014923479462006e-05, "loss": 0.0259, "step": 12316 }, { "epoch": 3.03698224852071, "grad_norm": 0.25, "learning_rate": 8.01154964045524e-05, "loss": 0.0283, "step": 12318 }, { "epoch": 3.037475345167653, "grad_norm": 0.232421875, "learning_rate": 8.008176037090942e-05, "loss": 0.0244, "step": 12320 }, { "epoch": 3.037968441814596, "grad_norm": 0.2412109375, "learning_rate": 8.0048026697689e-05, "loss": 0.0225, "step": 12322 }, { "epoch": 3.0384615384615383, "grad_norm": 0.279296875, "learning_rate": 8.001429538888887e-05, "loss": 0.0264, "step": 12324 }, { "epoch": 3.0389546351084813, "grad_norm": 0.255859375, "learning_rate": 7.998056644850631e-05, "loss": 0.024, "step": 12326 }, { "epoch": 3.0394477317554243, "grad_norm": 0.27734375, "learning_rate": 7.99468398805384e-05, "loss": 0.0217, "step": 12328 }, { "epoch": 3.039940828402367, "grad_norm": 0.291015625, "learning_rate": 7.99131156889819e-05, "loss": 0.0265, "step": 12330 }, { "epoch": 3.0404339250493098, "grad_norm": 0.2236328125, "learning_rate": 7.987939387783337e-05, "loss": 0.0241, "step": 12332 }, { "epoch": 3.0409270216962523, "grad_norm": 0.1865234375, "learning_rate": 7.984567445108896e-05, "loss": 0.023, "step": 12334 }, { "epoch": 3.0414201183431953, "grad_norm": 0.27734375, "learning_rate": 7.981195741274472e-05, "loss": 0.0295, "step": 12336 }, { "epoch": 3.0419132149901382, "grad_norm": 0.244140625, "learning_rate": 7.977824276679623e-05, "loss": 0.0236, "step": 12338 }, { "epoch": 3.0424063116370808, "grad_norm": 0.224609375, "learning_rate": 7.974453051723885e-05, "loss": 0.025, "step": 12340 }, { "epoch": 3.0428994082840237, "grad_norm": 0.2392578125, "learning_rate": 7.971082066806772e-05, "loss": 0.0236, "step": 12342 }, { "epoch": 3.0433925049309662, "grad_norm": 0.287109375, "learning_rate": 7.967711322327757e-05, "loss": 0.0285, "step": 12344 }, { "epoch": 3.043885601577909, "grad_norm": 0.310546875, "learning_rate": 7.964340818686304e-05, "loss": 0.0263, "step": 12346 }, { "epoch": 3.044378698224852, "grad_norm": 0.2353515625, "learning_rate": 7.960970556281823e-05, "loss": 0.026, "step": 12348 }, { "epoch": 3.0448717948717947, "grad_norm": 0.27734375, "learning_rate": 7.95760053551372e-05, "loss": 0.0257, "step": 12350 }, { "epoch": 3.0453648915187377, "grad_norm": 0.251953125, "learning_rate": 7.954230756781358e-05, "loss": 0.0256, "step": 12352 }, { "epoch": 3.0458579881656807, "grad_norm": 0.2373046875, "learning_rate": 7.950861220484071e-05, "loss": 0.027, "step": 12354 }, { "epoch": 3.046351084812623, "grad_norm": 0.255859375, "learning_rate": 7.947491927021172e-05, "loss": 0.0268, "step": 12356 }, { "epoch": 3.046844181459566, "grad_norm": 0.302734375, "learning_rate": 7.944122876791934e-05, "loss": 0.0262, "step": 12358 }, { "epoch": 3.0473372781065087, "grad_norm": 0.28125, "learning_rate": 7.94075407019562e-05, "loss": 0.0255, "step": 12360 }, { "epoch": 3.0478303747534516, "grad_norm": 0.25390625, "learning_rate": 7.93738550763144e-05, "loss": 0.0238, "step": 12362 }, { "epoch": 3.0483234714003946, "grad_norm": 0.27734375, "learning_rate": 7.9340171894986e-05, "loss": 0.0278, "step": 12364 }, { "epoch": 3.048816568047337, "grad_norm": 0.28125, "learning_rate": 7.930649116196253e-05, "loss": 0.0231, "step": 12366 }, { "epoch": 3.04930966469428, "grad_norm": 0.259765625, "learning_rate": 7.927281288123544e-05, "loss": 0.0253, "step": 12368 }, { "epoch": 3.049802761341223, "grad_norm": 0.3125, "learning_rate": 7.923913705679573e-05, "loss": 0.0271, "step": 12370 }, { "epoch": 3.0502958579881656, "grad_norm": 0.32421875, "learning_rate": 7.920546369263417e-05, "loss": 0.0247, "step": 12372 }, { "epoch": 3.0507889546351086, "grad_norm": 0.365234375, "learning_rate": 7.917179279274129e-05, "loss": 0.0262, "step": 12374 }, { "epoch": 3.051282051282051, "grad_norm": 0.291015625, "learning_rate": 7.913812436110723e-05, "loss": 0.0272, "step": 12376 }, { "epoch": 3.051775147928994, "grad_norm": 0.30859375, "learning_rate": 7.910445840172194e-05, "loss": 0.0254, "step": 12378 }, { "epoch": 3.052268244575937, "grad_norm": 0.27734375, "learning_rate": 7.907079491857498e-05, "loss": 0.026, "step": 12380 }, { "epoch": 3.0527613412228796, "grad_norm": 0.341796875, "learning_rate": 7.903713391565569e-05, "loss": 0.0285, "step": 12382 }, { "epoch": 3.0532544378698225, "grad_norm": 0.29296875, "learning_rate": 7.900347539695303e-05, "loss": 0.026, "step": 12384 }, { "epoch": 3.0537475345167655, "grad_norm": 0.27734375, "learning_rate": 7.896981936645581e-05, "loss": 0.0236, "step": 12386 }, { "epoch": 3.054240631163708, "grad_norm": 0.2578125, "learning_rate": 7.893616582815242e-05, "loss": 0.0249, "step": 12388 }, { "epoch": 3.054733727810651, "grad_norm": 0.25, "learning_rate": 7.890251478603094e-05, "loss": 0.0258, "step": 12390 }, { "epoch": 3.0552268244575935, "grad_norm": 0.287109375, "learning_rate": 7.886886624407929e-05, "loss": 0.0238, "step": 12392 }, { "epoch": 3.0557199211045365, "grad_norm": 0.234375, "learning_rate": 7.883522020628494e-05, "loss": 0.0259, "step": 12394 }, { "epoch": 3.0562130177514795, "grad_norm": 0.25, "learning_rate": 7.88015766766352e-05, "loss": 0.0279, "step": 12396 }, { "epoch": 3.056706114398422, "grad_norm": 0.2216796875, "learning_rate": 7.876793565911693e-05, "loss": 0.0264, "step": 12398 }, { "epoch": 3.057199211045365, "grad_norm": 0.2421875, "learning_rate": 7.873429715771687e-05, "loss": 0.0273, "step": 12400 }, { "epoch": 3.0576923076923075, "grad_norm": 0.26171875, "learning_rate": 7.870066117642134e-05, "loss": 0.0251, "step": 12402 }, { "epoch": 3.0581854043392505, "grad_norm": 0.2099609375, "learning_rate": 7.866702771921634e-05, "loss": 0.0245, "step": 12404 }, { "epoch": 3.0586785009861934, "grad_norm": 0.236328125, "learning_rate": 7.86333967900877e-05, "loss": 0.0245, "step": 12406 }, { "epoch": 3.059171597633136, "grad_norm": 0.22265625, "learning_rate": 7.859976839302081e-05, "loss": 0.0245, "step": 12408 }, { "epoch": 3.059664694280079, "grad_norm": 0.25, "learning_rate": 7.856614253200087e-05, "loss": 0.0258, "step": 12410 }, { "epoch": 3.060157790927022, "grad_norm": 0.283203125, "learning_rate": 7.853251921101268e-05, "loss": 0.0259, "step": 12412 }, { "epoch": 3.0606508875739644, "grad_norm": 0.240234375, "learning_rate": 7.849889843404086e-05, "loss": 0.0228, "step": 12414 }, { "epoch": 3.0611439842209074, "grad_norm": 0.271484375, "learning_rate": 7.846528020506957e-05, "loss": 0.0256, "step": 12416 }, { "epoch": 3.06163708086785, "grad_norm": 0.2451171875, "learning_rate": 7.843166452808287e-05, "loss": 0.0265, "step": 12418 }, { "epoch": 3.062130177514793, "grad_norm": 0.2412109375, "learning_rate": 7.839805140706433e-05, "loss": 0.0278, "step": 12420 }, { "epoch": 3.062623274161736, "grad_norm": 0.248046875, "learning_rate": 7.836444084599729e-05, "loss": 0.0257, "step": 12422 }, { "epoch": 3.0631163708086784, "grad_norm": 0.2119140625, "learning_rate": 7.833083284886483e-05, "loss": 0.0231, "step": 12424 }, { "epoch": 3.0636094674556213, "grad_norm": 0.255859375, "learning_rate": 7.829722741964963e-05, "loss": 0.0254, "step": 12426 }, { "epoch": 3.064102564102564, "grad_norm": 0.26953125, "learning_rate": 7.826362456233418e-05, "loss": 0.0279, "step": 12428 }, { "epoch": 3.064595660749507, "grad_norm": 0.279296875, "learning_rate": 7.823002428090055e-05, "loss": 0.0236, "step": 12430 }, { "epoch": 3.06508875739645, "grad_norm": 0.271484375, "learning_rate": 7.819642657933062e-05, "loss": 0.0278, "step": 12432 }, { "epoch": 3.0655818540433923, "grad_norm": 0.25, "learning_rate": 7.816283146160589e-05, "loss": 0.0267, "step": 12434 }, { "epoch": 3.0660749506903353, "grad_norm": 0.248046875, "learning_rate": 7.812923893170753e-05, "loss": 0.0235, "step": 12436 }, { "epoch": 3.0665680473372783, "grad_norm": 0.263671875, "learning_rate": 7.809564899361647e-05, "loss": 0.0257, "step": 12438 }, { "epoch": 3.067061143984221, "grad_norm": 0.28125, "learning_rate": 7.806206165131329e-05, "loss": 0.0274, "step": 12440 }, { "epoch": 3.0675542406311638, "grad_norm": 0.232421875, "learning_rate": 7.802847690877832e-05, "loss": 0.023, "step": 12442 }, { "epoch": 3.0680473372781063, "grad_norm": 0.265625, "learning_rate": 7.799489476999147e-05, "loss": 0.0251, "step": 12444 }, { "epoch": 3.0685404339250493, "grad_norm": 0.2470703125, "learning_rate": 7.796131523893249e-05, "loss": 0.0277, "step": 12446 }, { "epoch": 3.0690335305719922, "grad_norm": 0.232421875, "learning_rate": 7.792773831958067e-05, "loss": 0.0231, "step": 12448 }, { "epoch": 3.0695266272189348, "grad_norm": 0.2421875, "learning_rate": 7.789416401591513e-05, "loss": 0.0245, "step": 12450 }, { "epoch": 3.0700197238658777, "grad_norm": 0.255859375, "learning_rate": 7.786059233191457e-05, "loss": 0.023, "step": 12452 }, { "epoch": 3.0705128205128207, "grad_norm": 0.2431640625, "learning_rate": 7.782702327155738e-05, "loss": 0.0256, "step": 12454 }, { "epoch": 3.0710059171597632, "grad_norm": 0.275390625, "learning_rate": 7.77934568388218e-05, "loss": 0.0249, "step": 12456 }, { "epoch": 3.071499013806706, "grad_norm": 0.228515625, "learning_rate": 7.775989303768551e-05, "loss": 0.0278, "step": 12458 }, { "epoch": 3.0719921104536487, "grad_norm": 0.2314453125, "learning_rate": 7.77263318721261e-05, "loss": 0.025, "step": 12460 }, { "epoch": 3.0724852071005917, "grad_norm": 0.2431640625, "learning_rate": 7.769277334612072e-05, "loss": 0.0255, "step": 12462 }, { "epoch": 3.0729783037475347, "grad_norm": 0.30859375, "learning_rate": 7.765921746364626e-05, "loss": 0.0287, "step": 12464 }, { "epoch": 3.073471400394477, "grad_norm": 0.26171875, "learning_rate": 7.762566422867922e-05, "loss": 0.0239, "step": 12466 }, { "epoch": 3.07396449704142, "grad_norm": 0.2734375, "learning_rate": 7.759211364519594e-05, "loss": 0.025, "step": 12468 }, { "epoch": 3.074457593688363, "grad_norm": 0.234375, "learning_rate": 7.75585657171723e-05, "loss": 0.0239, "step": 12470 }, { "epoch": 3.0749506903353057, "grad_norm": 0.3203125, "learning_rate": 7.752502044858388e-05, "loss": 0.0261, "step": 12472 }, { "epoch": 3.0754437869822486, "grad_norm": 0.34765625, "learning_rate": 7.749147784340606e-05, "loss": 0.0224, "step": 12474 }, { "epoch": 3.075936883629191, "grad_norm": 0.279296875, "learning_rate": 7.745793790561377e-05, "loss": 0.0258, "step": 12476 }, { "epoch": 3.076429980276134, "grad_norm": 0.2333984375, "learning_rate": 7.742440063918171e-05, "loss": 0.0259, "step": 12478 }, { "epoch": 3.076923076923077, "grad_norm": 0.2578125, "learning_rate": 7.739086604808417e-05, "loss": 0.024, "step": 12480 }, { "epoch": 3.0774161735700196, "grad_norm": 0.26953125, "learning_rate": 7.73573341362953e-05, "loss": 0.0273, "step": 12482 }, { "epoch": 3.0779092702169626, "grad_norm": 0.298828125, "learning_rate": 7.732380490778874e-05, "loss": 0.0242, "step": 12484 }, { "epoch": 3.078402366863905, "grad_norm": 0.255859375, "learning_rate": 7.729027836653787e-05, "loss": 0.0258, "step": 12486 }, { "epoch": 3.078895463510848, "grad_norm": 0.30078125, "learning_rate": 7.725675451651586e-05, "loss": 0.0253, "step": 12488 }, { "epoch": 3.079388560157791, "grad_norm": 0.28125, "learning_rate": 7.72232333616954e-05, "loss": 0.0254, "step": 12490 }, { "epoch": 3.0798816568047336, "grad_norm": 0.3125, "learning_rate": 7.718971490604896e-05, "loss": 0.0243, "step": 12492 }, { "epoch": 3.0803747534516766, "grad_norm": 0.3046875, "learning_rate": 7.715619915354862e-05, "loss": 0.0245, "step": 12494 }, { "epoch": 3.0808678500986195, "grad_norm": 0.2275390625, "learning_rate": 7.712268610816628e-05, "loss": 0.0229, "step": 12496 }, { "epoch": 3.081360946745562, "grad_norm": 0.259765625, "learning_rate": 7.708917577387331e-05, "loss": 0.0235, "step": 12498 }, { "epoch": 3.081854043392505, "grad_norm": 0.306640625, "learning_rate": 7.705566815464098e-05, "loss": 0.0257, "step": 12500 }, { "epoch": 3.0823471400394475, "grad_norm": 0.2431640625, "learning_rate": 7.702216325444007e-05, "loss": 0.0238, "step": 12502 }, { "epoch": 3.0828402366863905, "grad_norm": 0.2333984375, "learning_rate": 7.69886610772411e-05, "loss": 0.0224, "step": 12504 }, { "epoch": 3.0833333333333335, "grad_norm": 0.259765625, "learning_rate": 7.695516162701429e-05, "loss": 0.024, "step": 12506 }, { "epoch": 3.083826429980276, "grad_norm": 0.2333984375, "learning_rate": 7.692166490772942e-05, "loss": 0.0276, "step": 12508 }, { "epoch": 3.084319526627219, "grad_norm": 0.2470703125, "learning_rate": 7.688817092335618e-05, "loss": 0.0242, "step": 12510 }, { "epoch": 3.084812623274162, "grad_norm": 0.263671875, "learning_rate": 7.68546796778637e-05, "loss": 0.0287, "step": 12512 }, { "epoch": 3.0853057199211045, "grad_norm": 0.279296875, "learning_rate": 7.682119117522092e-05, "loss": 0.0263, "step": 12514 }, { "epoch": 3.0857988165680474, "grad_norm": 0.2734375, "learning_rate": 7.678770541939638e-05, "loss": 0.0271, "step": 12516 }, { "epoch": 3.08629191321499, "grad_norm": 0.251953125, "learning_rate": 7.675422241435839e-05, "loss": 0.0265, "step": 12518 }, { "epoch": 3.086785009861933, "grad_norm": 0.279296875, "learning_rate": 7.672074216407478e-05, "loss": 0.0242, "step": 12520 }, { "epoch": 3.087278106508876, "grad_norm": 0.224609375, "learning_rate": 7.668726467251323e-05, "loss": 0.0246, "step": 12522 }, { "epoch": 3.0877712031558184, "grad_norm": 0.220703125, "learning_rate": 7.665378994364099e-05, "loss": 0.0255, "step": 12524 }, { "epoch": 3.0882642998027614, "grad_norm": 0.232421875, "learning_rate": 7.662031798142493e-05, "loss": 0.0215, "step": 12526 }, { "epoch": 3.088757396449704, "grad_norm": 0.25, "learning_rate": 7.658684878983178e-05, "loss": 0.0267, "step": 12528 }, { "epoch": 3.089250493096647, "grad_norm": 0.302734375, "learning_rate": 7.655338237282775e-05, "loss": 0.026, "step": 12530 }, { "epoch": 3.08974358974359, "grad_norm": 0.302734375, "learning_rate": 7.651991873437883e-05, "loss": 0.0227, "step": 12532 }, { "epoch": 3.0902366863905324, "grad_norm": 0.2060546875, "learning_rate": 7.648645787845064e-05, "loss": 0.0231, "step": 12534 }, { "epoch": 3.0907297830374754, "grad_norm": 0.2451171875, "learning_rate": 7.645299980900846e-05, "loss": 0.0252, "step": 12536 }, { "epoch": 3.0912228796844183, "grad_norm": 0.2353515625, "learning_rate": 7.641954453001728e-05, "loss": 0.0251, "step": 12538 }, { "epoch": 3.091715976331361, "grad_norm": 0.2265625, "learning_rate": 7.638609204544169e-05, "loss": 0.0218, "step": 12540 }, { "epoch": 3.092209072978304, "grad_norm": 0.302734375, "learning_rate": 7.63526423592461e-05, "loss": 0.0223, "step": 12542 }, { "epoch": 3.0927021696252464, "grad_norm": 0.2177734375, "learning_rate": 7.631919547539437e-05, "loss": 0.0224, "step": 12544 }, { "epoch": 3.0931952662721893, "grad_norm": 0.2734375, "learning_rate": 7.628575139785024e-05, "loss": 0.0237, "step": 12546 }, { "epoch": 3.0936883629191323, "grad_norm": 0.259765625, "learning_rate": 7.625231013057691e-05, "loss": 0.0252, "step": 12548 }, { "epoch": 3.094181459566075, "grad_norm": 0.275390625, "learning_rate": 7.621887167753748e-05, "loss": 0.0258, "step": 12550 }, { "epoch": 3.094674556213018, "grad_norm": 0.29296875, "learning_rate": 7.618543604269452e-05, "loss": 0.0266, "step": 12552 }, { "epoch": 3.0951676528599608, "grad_norm": 0.259765625, "learning_rate": 7.615200323001032e-05, "loss": 0.0244, "step": 12554 }, { "epoch": 3.0956607495069033, "grad_norm": 0.20703125, "learning_rate": 7.611857324344692e-05, "loss": 0.0235, "step": 12556 }, { "epoch": 3.0961538461538463, "grad_norm": 0.27734375, "learning_rate": 7.60851460869659e-05, "loss": 0.0263, "step": 12558 }, { "epoch": 3.096646942800789, "grad_norm": 0.279296875, "learning_rate": 7.605172176452862e-05, "loss": 0.0309, "step": 12560 }, { "epoch": 3.0971400394477318, "grad_norm": 0.296875, "learning_rate": 7.6018300280096e-05, "loss": 0.0242, "step": 12562 }, { "epoch": 3.0976331360946747, "grad_norm": 0.26171875, "learning_rate": 7.598488163762872e-05, "loss": 0.0252, "step": 12564 }, { "epoch": 3.0981262327416172, "grad_norm": 0.333984375, "learning_rate": 7.595146584108704e-05, "loss": 0.0283, "step": 12566 }, { "epoch": 3.09861932938856, "grad_norm": 0.32421875, "learning_rate": 7.59180528944309e-05, "loss": 0.0294, "step": 12568 }, { "epoch": 3.0991124260355027, "grad_norm": 0.19140625, "learning_rate": 7.588464280161998e-05, "loss": 0.0235, "step": 12570 }, { "epoch": 3.0996055226824457, "grad_norm": 0.3046875, "learning_rate": 7.585123556661351e-05, "loss": 0.0224, "step": 12572 }, { "epoch": 3.1000986193293887, "grad_norm": 0.373046875, "learning_rate": 7.581783119337048e-05, "loss": 0.0281, "step": 12574 }, { "epoch": 3.100591715976331, "grad_norm": 0.263671875, "learning_rate": 7.578442968584943e-05, "loss": 0.0263, "step": 12576 }, { "epoch": 3.101084812623274, "grad_norm": 0.2734375, "learning_rate": 7.57510310480087e-05, "loss": 0.0236, "step": 12578 }, { "epoch": 3.101577909270217, "grad_norm": 0.2578125, "learning_rate": 7.571763528380615e-05, "loss": 0.0235, "step": 12580 }, { "epoch": 3.1020710059171597, "grad_norm": 0.279296875, "learning_rate": 7.568424239719943e-05, "loss": 0.025, "step": 12582 }, { "epoch": 3.1025641025641026, "grad_norm": 0.2734375, "learning_rate": 7.565085239214574e-05, "loss": 0.0286, "step": 12584 }, { "epoch": 3.103057199211045, "grad_norm": 0.234375, "learning_rate": 7.561746527260197e-05, "loss": 0.024, "step": 12586 }, { "epoch": 3.103550295857988, "grad_norm": 0.21875, "learning_rate": 7.55840810425247e-05, "loss": 0.0238, "step": 12588 }, { "epoch": 3.104043392504931, "grad_norm": 0.33984375, "learning_rate": 7.555069970587017e-05, "loss": 0.0329, "step": 12590 }, { "epoch": 3.1045364891518736, "grad_norm": 0.314453125, "learning_rate": 7.551732126659422e-05, "loss": 0.0311, "step": 12592 }, { "epoch": 3.1050295857988166, "grad_norm": 0.267578125, "learning_rate": 7.548394572865237e-05, "loss": 0.0234, "step": 12594 }, { "epoch": 3.1055226824457596, "grad_norm": 0.318359375, "learning_rate": 7.545057309599986e-05, "loss": 0.0212, "step": 12596 }, { "epoch": 3.106015779092702, "grad_norm": 0.330078125, "learning_rate": 7.54172033725915e-05, "loss": 0.0318, "step": 12598 }, { "epoch": 3.106508875739645, "grad_norm": 0.341796875, "learning_rate": 7.538383656238179e-05, "loss": 0.0311, "step": 12600 }, { "epoch": 3.1070019723865876, "grad_norm": 0.298828125, "learning_rate": 7.53504726693249e-05, "loss": 0.0274, "step": 12602 }, { "epoch": 3.1074950690335306, "grad_norm": 0.28125, "learning_rate": 7.531711169737461e-05, "loss": 0.024, "step": 12604 }, { "epoch": 3.1079881656804735, "grad_norm": 0.27734375, "learning_rate": 7.528375365048442e-05, "loss": 0.0329, "step": 12606 }, { "epoch": 3.108481262327416, "grad_norm": 0.33984375, "learning_rate": 7.525039853260736e-05, "loss": 0.0309, "step": 12608 }, { "epoch": 3.108974358974359, "grad_norm": 0.25, "learning_rate": 7.52170463476963e-05, "loss": 0.025, "step": 12610 }, { "epoch": 3.109467455621302, "grad_norm": 0.294921875, "learning_rate": 7.518369709970362e-05, "loss": 0.0261, "step": 12612 }, { "epoch": 3.1099605522682445, "grad_norm": 0.31640625, "learning_rate": 7.515035079258138e-05, "loss": 0.0324, "step": 12614 }, { "epoch": 3.1104536489151875, "grad_norm": 0.283203125, "learning_rate": 7.511700743028134e-05, "loss": 0.0365, "step": 12616 }, { "epoch": 3.11094674556213, "grad_norm": 0.259765625, "learning_rate": 7.508366701675482e-05, "loss": 0.027, "step": 12618 }, { "epoch": 3.111439842209073, "grad_norm": 0.306640625, "learning_rate": 7.505032955595289e-05, "loss": 0.0274, "step": 12620 }, { "epoch": 3.111932938856016, "grad_norm": 0.296875, "learning_rate": 7.501699505182617e-05, "loss": 0.0329, "step": 12622 }, { "epoch": 3.1124260355029585, "grad_norm": 0.322265625, "learning_rate": 7.498366350832506e-05, "loss": 0.0343, "step": 12624 }, { "epoch": 3.1129191321499015, "grad_norm": 0.271484375, "learning_rate": 7.495033492939949e-05, "loss": 0.0247, "step": 12626 }, { "epoch": 3.113412228796844, "grad_norm": 0.2734375, "learning_rate": 7.491700931899909e-05, "loss": 0.022, "step": 12628 }, { "epoch": 3.113905325443787, "grad_norm": 0.345703125, "learning_rate": 7.488368668107312e-05, "loss": 0.0359, "step": 12630 }, { "epoch": 3.11439842209073, "grad_norm": 0.3515625, "learning_rate": 7.485036701957053e-05, "loss": 0.033, "step": 12632 }, { "epoch": 3.1148915187376724, "grad_norm": 0.2734375, "learning_rate": 7.481705033843989e-05, "loss": 0.0249, "step": 12634 }, { "epoch": 3.1153846153846154, "grad_norm": 0.265625, "learning_rate": 7.478373664162932e-05, "loss": 0.0256, "step": 12636 }, { "epoch": 3.1158777120315584, "grad_norm": 0.322265625, "learning_rate": 7.47504259330868e-05, "loss": 0.0328, "step": 12638 }, { "epoch": 3.116370808678501, "grad_norm": 0.333984375, "learning_rate": 7.471711821675973e-05, "loss": 0.0368, "step": 12640 }, { "epoch": 3.116863905325444, "grad_norm": 0.287109375, "learning_rate": 7.468381349659537e-05, "loss": 0.0217, "step": 12642 }, { "epoch": 3.1173570019723864, "grad_norm": 0.265625, "learning_rate": 7.465051177654043e-05, "loss": 0.0306, "step": 12644 }, { "epoch": 3.1178500986193294, "grad_norm": 0.306640625, "learning_rate": 7.46172130605414e-05, "loss": 0.0382, "step": 12646 }, { "epoch": 3.1183431952662723, "grad_norm": 0.3203125, "learning_rate": 7.458391735254431e-05, "loss": 0.0332, "step": 12648 }, { "epoch": 3.118836291913215, "grad_norm": 0.3359375, "learning_rate": 7.455062465649494e-05, "loss": 0.0321, "step": 12650 }, { "epoch": 3.119329388560158, "grad_norm": 0.244140625, "learning_rate": 7.451733497633865e-05, "loss": 0.0269, "step": 12652 }, { "epoch": 3.1198224852071004, "grad_norm": 0.302734375, "learning_rate": 7.44840483160204e-05, "loss": 0.0345, "step": 12654 }, { "epoch": 3.1203155818540433, "grad_norm": 0.3203125, "learning_rate": 7.445076467948493e-05, "loss": 0.0324, "step": 12656 }, { "epoch": 3.1208086785009863, "grad_norm": 0.240234375, "learning_rate": 7.441748407067648e-05, "loss": 0.0267, "step": 12658 }, { "epoch": 3.121301775147929, "grad_norm": 0.291015625, "learning_rate": 7.438420649353901e-05, "loss": 0.029, "step": 12660 }, { "epoch": 3.121794871794872, "grad_norm": 0.30859375, "learning_rate": 7.435093195201605e-05, "loss": 0.0364, "step": 12662 }, { "epoch": 3.1222879684418148, "grad_norm": 0.32421875, "learning_rate": 7.43176604500509e-05, "loss": 0.035, "step": 12664 }, { "epoch": 3.1227810650887573, "grad_norm": 0.298828125, "learning_rate": 7.428439199158639e-05, "loss": 0.0311, "step": 12666 }, { "epoch": 3.1232741617357003, "grad_norm": 0.326171875, "learning_rate": 7.425112658056496e-05, "loss": 0.0287, "step": 12668 }, { "epoch": 3.123767258382643, "grad_norm": 0.310546875, "learning_rate": 7.421786422092884e-05, "loss": 0.0353, "step": 12670 }, { "epoch": 3.1242603550295858, "grad_norm": 0.33203125, "learning_rate": 7.418460491661973e-05, "loss": 0.0354, "step": 12672 }, { "epoch": 3.1247534516765287, "grad_norm": 0.275390625, "learning_rate": 7.415134867157909e-05, "loss": 0.0343, "step": 12674 }, { "epoch": 3.1252465483234713, "grad_norm": 0.369140625, "learning_rate": 7.411809548974792e-05, "loss": 0.0305, "step": 12676 }, { "epoch": 3.1257396449704142, "grad_norm": 0.28515625, "learning_rate": 7.408484537506698e-05, "loss": 0.0323, "step": 12678 }, { "epoch": 3.126232741617357, "grad_norm": 0.33203125, "learning_rate": 7.405159833147651e-05, "loss": 0.0353, "step": 12680 }, { "epoch": 3.1267258382642997, "grad_norm": 0.3203125, "learning_rate": 7.401835436291656e-05, "loss": 0.0334, "step": 12682 }, { "epoch": 3.1272189349112427, "grad_norm": 0.30078125, "learning_rate": 7.398511347332669e-05, "loss": 0.0378, "step": 12684 }, { "epoch": 3.1277120315581852, "grad_norm": 0.314453125, "learning_rate": 7.395187566664607e-05, "loss": 0.038, "step": 12686 }, { "epoch": 3.128205128205128, "grad_norm": 0.3203125, "learning_rate": 7.391864094681367e-05, "loss": 0.0391, "step": 12688 }, { "epoch": 3.128698224852071, "grad_norm": 0.302734375, "learning_rate": 7.388540931776789e-05, "loss": 0.0336, "step": 12690 }, { "epoch": 3.1291913214990137, "grad_norm": 0.296875, "learning_rate": 7.385218078344694e-05, "loss": 0.0325, "step": 12692 }, { "epoch": 3.1296844181459567, "grad_norm": 0.33203125, "learning_rate": 7.381895534778853e-05, "loss": 0.0342, "step": 12694 }, { "epoch": 3.1301775147928996, "grad_norm": 0.357421875, "learning_rate": 7.378573301473012e-05, "loss": 0.0361, "step": 12696 }, { "epoch": 3.130670611439842, "grad_norm": 0.31640625, "learning_rate": 7.375251378820868e-05, "loss": 0.0316, "step": 12698 }, { "epoch": 3.131163708086785, "grad_norm": 0.298828125, "learning_rate": 7.371929767216094e-05, "loss": 0.0289, "step": 12700 }, { "epoch": 3.1316568047337277, "grad_norm": 0.330078125, "learning_rate": 7.368608467052316e-05, "loss": 0.0343, "step": 12702 }, { "epoch": 3.1321499013806706, "grad_norm": 0.294921875, "learning_rate": 7.365287478723122e-05, "loss": 0.0367, "step": 12704 }, { "epoch": 3.1326429980276136, "grad_norm": 0.333984375, "learning_rate": 7.361966802622075e-05, "loss": 0.0327, "step": 12706 }, { "epoch": 3.133136094674556, "grad_norm": 0.302734375, "learning_rate": 7.358646439142688e-05, "loss": 0.0334, "step": 12708 }, { "epoch": 3.133629191321499, "grad_norm": 0.30859375, "learning_rate": 7.35532638867845e-05, "loss": 0.0373, "step": 12710 }, { "epoch": 3.1341222879684416, "grad_norm": 0.5390625, "learning_rate": 7.352006651622798e-05, "loss": 0.0365, "step": 12712 }, { "epoch": 3.1346153846153846, "grad_norm": 0.392578125, "learning_rate": 7.348687228369144e-05, "loss": 0.0423, "step": 12714 }, { "epoch": 3.1351084812623276, "grad_norm": 0.337890625, "learning_rate": 7.345368119310856e-05, "loss": 0.0368, "step": 12716 }, { "epoch": 3.13560157790927, "grad_norm": 0.3125, "learning_rate": 7.342049324841265e-05, "loss": 0.0357, "step": 12718 }, { "epoch": 3.136094674556213, "grad_norm": 0.365234375, "learning_rate": 7.338730845353671e-05, "loss": 0.0345, "step": 12720 }, { "epoch": 3.136587771203156, "grad_norm": 0.3125, "learning_rate": 7.335412681241329e-05, "loss": 0.0342, "step": 12722 }, { "epoch": 3.1370808678500985, "grad_norm": 0.34765625, "learning_rate": 7.332094832897464e-05, "loss": 0.0336, "step": 12724 }, { "epoch": 3.1375739644970415, "grad_norm": 0.33203125, "learning_rate": 7.328777300715256e-05, "loss": 0.0388, "step": 12726 }, { "epoch": 3.138067061143984, "grad_norm": 0.37890625, "learning_rate": 7.325460085087852e-05, "loss": 0.0353, "step": 12728 }, { "epoch": 3.138560157790927, "grad_norm": 0.330078125, "learning_rate": 7.322143186408358e-05, "loss": 0.0318, "step": 12730 }, { "epoch": 3.13905325443787, "grad_norm": 0.296875, "learning_rate": 7.318826605069851e-05, "loss": 0.0335, "step": 12732 }, { "epoch": 3.1395463510848125, "grad_norm": 0.34765625, "learning_rate": 7.315510341465363e-05, "loss": 0.0344, "step": 12734 }, { "epoch": 3.1400394477317555, "grad_norm": 0.416015625, "learning_rate": 7.312194395987882e-05, "loss": 0.0422, "step": 12736 }, { "epoch": 3.140532544378698, "grad_norm": 0.32421875, "learning_rate": 7.308878769030376e-05, "loss": 0.0363, "step": 12738 }, { "epoch": 3.141025641025641, "grad_norm": 0.431640625, "learning_rate": 7.30556346098576e-05, "loss": 0.0387, "step": 12740 }, { "epoch": 3.141518737672584, "grad_norm": 0.4609375, "learning_rate": 7.302248472246919e-05, "loss": 0.0455, "step": 12742 }, { "epoch": 3.1420118343195265, "grad_norm": 0.328125, "learning_rate": 7.298933803206691e-05, "loss": 0.0384, "step": 12744 }, { "epoch": 3.1425049309664694, "grad_norm": 0.3046875, "learning_rate": 7.295619454257894e-05, "loss": 0.0384, "step": 12746 }, { "epoch": 3.1429980276134124, "grad_norm": 0.478515625, "learning_rate": 7.292305425793291e-05, "loss": 0.0388, "step": 12748 }, { "epoch": 3.143491124260355, "grad_norm": 0.44921875, "learning_rate": 7.288991718205609e-05, "loss": 0.0406, "step": 12750 }, { "epoch": 3.143984220907298, "grad_norm": 0.32421875, "learning_rate": 7.285678331887547e-05, "loss": 0.0396, "step": 12752 }, { "epoch": 3.1444773175542404, "grad_norm": 0.435546875, "learning_rate": 7.282365267231756e-05, "loss": 0.0353, "step": 12754 }, { "epoch": 3.1449704142011834, "grad_norm": 0.3203125, "learning_rate": 7.279052524630858e-05, "loss": 0.0346, "step": 12756 }, { "epoch": 3.1454635108481264, "grad_norm": 0.34765625, "learning_rate": 7.275740104477422e-05, "loss": 0.04, "step": 12758 }, { "epoch": 3.145956607495069, "grad_norm": 0.47265625, "learning_rate": 7.272428007163998e-05, "loss": 0.0377, "step": 12760 }, { "epoch": 3.146449704142012, "grad_norm": 0.365234375, "learning_rate": 7.269116233083082e-05, "loss": 0.0375, "step": 12762 }, { "epoch": 3.146942800788955, "grad_norm": 0.2890625, "learning_rate": 7.265804782627141e-05, "loss": 0.0374, "step": 12764 }, { "epoch": 3.1474358974358974, "grad_norm": 0.37890625, "learning_rate": 7.2624936561886e-05, "loss": 0.0417, "step": 12766 }, { "epoch": 3.1479289940828403, "grad_norm": 0.291015625, "learning_rate": 7.259182854159845e-05, "loss": 0.0403, "step": 12768 }, { "epoch": 3.148422090729783, "grad_norm": 0.357421875, "learning_rate": 7.255872376933223e-05, "loss": 0.0426, "step": 12770 }, { "epoch": 3.148915187376726, "grad_norm": 0.32421875, "learning_rate": 7.252562224901044e-05, "loss": 0.0378, "step": 12772 }, { "epoch": 3.149408284023669, "grad_norm": 0.359375, "learning_rate": 7.249252398455584e-05, "loss": 0.0399, "step": 12774 }, { "epoch": 3.1499013806706113, "grad_norm": 0.380859375, "learning_rate": 7.245942897989069e-05, "loss": 0.0424, "step": 12776 }, { "epoch": 3.1503944773175543, "grad_norm": 0.390625, "learning_rate": 7.242633723893701e-05, "loss": 0.0391, "step": 12778 }, { "epoch": 3.1508875739644973, "grad_norm": 0.294921875, "learning_rate": 7.239324876561631e-05, "loss": 0.0361, "step": 12780 }, { "epoch": 3.15138067061144, "grad_norm": 0.341796875, "learning_rate": 7.236016356384978e-05, "loss": 0.0402, "step": 12782 }, { "epoch": 3.1518737672583828, "grad_norm": 0.3046875, "learning_rate": 7.23270816375582e-05, "loss": 0.0358, "step": 12784 }, { "epoch": 3.1523668639053253, "grad_norm": 0.30859375, "learning_rate": 7.22940029906619e-05, "loss": 0.038, "step": 12786 }, { "epoch": 3.1528599605522682, "grad_norm": 0.396484375, "learning_rate": 7.226092762708099e-05, "loss": 0.0382, "step": 12788 }, { "epoch": 3.153353057199211, "grad_norm": 0.384765625, "learning_rate": 7.222785555073496e-05, "loss": 0.0404, "step": 12790 }, { "epoch": 3.1538461538461537, "grad_norm": 0.328125, "learning_rate": 7.219478676554318e-05, "loss": 0.041, "step": 12792 }, { "epoch": 3.1543392504930967, "grad_norm": 0.29296875, "learning_rate": 7.216172127542437e-05, "loss": 0.0356, "step": 12794 }, { "epoch": 3.1548323471400392, "grad_norm": 0.357421875, "learning_rate": 7.212865908429705e-05, "loss": 0.0391, "step": 12796 }, { "epoch": 3.155325443786982, "grad_norm": 0.306640625, "learning_rate": 7.209560019607924e-05, "loss": 0.0374, "step": 12798 }, { "epoch": 3.155818540433925, "grad_norm": 0.359375, "learning_rate": 7.206254461468858e-05, "loss": 0.0426, "step": 12800 }, { "epoch": 3.1563116370808677, "grad_norm": 0.349609375, "learning_rate": 7.202949234404239e-05, "loss": 0.0349, "step": 12802 }, { "epoch": 3.1568047337278107, "grad_norm": 0.462890625, "learning_rate": 7.199644338805749e-05, "loss": 0.0379, "step": 12804 }, { "epoch": 3.1572978303747536, "grad_norm": 0.345703125, "learning_rate": 7.196339775065042e-05, "loss": 0.0371, "step": 12806 }, { "epoch": 3.157790927021696, "grad_norm": 0.359375, "learning_rate": 7.193035543573725e-05, "loss": 0.0404, "step": 12808 }, { "epoch": 3.158284023668639, "grad_norm": 0.322265625, "learning_rate": 7.189731644723371e-05, "loss": 0.0391, "step": 12810 }, { "epoch": 3.1587771203155817, "grad_norm": 0.359375, "learning_rate": 7.186428078905503e-05, "loss": 0.0406, "step": 12812 }, { "epoch": 3.1592702169625246, "grad_norm": 0.53515625, "learning_rate": 7.18312484651162e-05, "loss": 0.0458, "step": 12814 }, { "epoch": 3.1597633136094676, "grad_norm": 0.412109375, "learning_rate": 7.17982194793317e-05, "loss": 0.0441, "step": 12816 }, { "epoch": 3.16025641025641, "grad_norm": 0.345703125, "learning_rate": 7.176519383561562e-05, "loss": 0.0365, "step": 12818 }, { "epoch": 3.160749506903353, "grad_norm": 0.42578125, "learning_rate": 7.173217153788174e-05, "loss": 0.0408, "step": 12820 }, { "epoch": 3.1612426035502956, "grad_norm": 0.36328125, "learning_rate": 7.169915259004336e-05, "loss": 0.0429, "step": 12822 }, { "epoch": 3.1617357001972386, "grad_norm": 0.412109375, "learning_rate": 7.166613699601341e-05, "loss": 0.0503, "step": 12824 }, { "epoch": 3.1622287968441816, "grad_norm": 0.359375, "learning_rate": 7.163312475970438e-05, "loss": 0.0383, "step": 12826 }, { "epoch": 3.162721893491124, "grad_norm": 0.41015625, "learning_rate": 7.160011588502851e-05, "loss": 0.0365, "step": 12828 }, { "epoch": 3.163214990138067, "grad_norm": 0.361328125, "learning_rate": 7.156711037589741e-05, "loss": 0.0412, "step": 12830 }, { "epoch": 3.16370808678501, "grad_norm": 0.373046875, "learning_rate": 7.153410823622253e-05, "loss": 0.0433, "step": 12832 }, { "epoch": 3.1642011834319526, "grad_norm": 0.302734375, "learning_rate": 7.150110946991476e-05, "loss": 0.0388, "step": 12834 }, { "epoch": 3.1646942800788955, "grad_norm": 0.369140625, "learning_rate": 7.14681140808846e-05, "loss": 0.0354, "step": 12836 }, { "epoch": 3.1651873767258385, "grad_norm": 0.34765625, "learning_rate": 7.143512207304226e-05, "loss": 0.0389, "step": 12838 }, { "epoch": 3.165680473372781, "grad_norm": 0.31640625, "learning_rate": 7.140213345029741e-05, "loss": 0.0403, "step": 12840 }, { "epoch": 3.166173570019724, "grad_norm": 0.35546875, "learning_rate": 7.136914821655943e-05, "loss": 0.0369, "step": 12842 }, { "epoch": 3.1666666666666665, "grad_norm": 0.345703125, "learning_rate": 7.133616637573721e-05, "loss": 0.0404, "step": 12844 }, { "epoch": 3.1671597633136095, "grad_norm": 0.306640625, "learning_rate": 7.130318793173936e-05, "loss": 0.0423, "step": 12846 }, { "epoch": 3.1676528599605525, "grad_norm": 0.353515625, "learning_rate": 7.127021288847395e-05, "loss": 0.0424, "step": 12848 }, { "epoch": 3.168145956607495, "grad_norm": 0.322265625, "learning_rate": 7.12372412498487e-05, "loss": 0.0347, "step": 12850 }, { "epoch": 3.168639053254438, "grad_norm": 0.333984375, "learning_rate": 7.120427301977097e-05, "loss": 0.039, "step": 12852 }, { "epoch": 3.1691321499013805, "grad_norm": 0.388671875, "learning_rate": 7.117130820214764e-05, "loss": 0.0422, "step": 12854 }, { "epoch": 3.1696252465483234, "grad_norm": 0.361328125, "learning_rate": 7.113834680088527e-05, "loss": 0.0429, "step": 12856 }, { "epoch": 3.1701183431952664, "grad_norm": 0.357421875, "learning_rate": 7.110538881988992e-05, "loss": 0.041, "step": 12858 }, { "epoch": 3.170611439842209, "grad_norm": 0.390625, "learning_rate": 7.107243426306736e-05, "loss": 0.0414, "step": 12860 }, { "epoch": 3.171104536489152, "grad_norm": 0.349609375, "learning_rate": 7.103948313432283e-05, "loss": 0.0404, "step": 12862 }, { "epoch": 3.171597633136095, "grad_norm": 0.41015625, "learning_rate": 7.100653543756125e-05, "loss": 0.0425, "step": 12864 }, { "epoch": 3.1720907297830374, "grad_norm": 0.326171875, "learning_rate": 7.097359117668711e-05, "loss": 0.0432, "step": 12866 }, { "epoch": 3.1725838264299804, "grad_norm": 0.294921875, "learning_rate": 7.094065035560443e-05, "loss": 0.0362, "step": 12868 }, { "epoch": 3.173076923076923, "grad_norm": 0.349609375, "learning_rate": 7.090771297821698e-05, "loss": 0.045, "step": 12870 }, { "epoch": 3.173570019723866, "grad_norm": 0.388671875, "learning_rate": 7.087477904842793e-05, "loss": 0.0472, "step": 12872 }, { "epoch": 3.174063116370809, "grad_norm": 0.322265625, "learning_rate": 7.084184857014023e-05, "loss": 0.0398, "step": 12874 }, { "epoch": 3.1745562130177514, "grad_norm": 0.310546875, "learning_rate": 7.080892154725622e-05, "loss": 0.037, "step": 12876 }, { "epoch": 3.1750493096646943, "grad_norm": 0.35546875, "learning_rate": 7.077599798367804e-05, "loss": 0.0481, "step": 12878 }, { "epoch": 3.175542406311637, "grad_norm": 0.353515625, "learning_rate": 7.074307788330725e-05, "loss": 0.0427, "step": 12880 }, { "epoch": 3.17603550295858, "grad_norm": 0.369140625, "learning_rate": 7.071016125004503e-05, "loss": 0.0442, "step": 12882 }, { "epoch": 3.176528599605523, "grad_norm": 0.38671875, "learning_rate": 7.067724808779231e-05, "loss": 0.0429, "step": 12884 }, { "epoch": 3.1770216962524653, "grad_norm": 0.345703125, "learning_rate": 7.064433840044933e-05, "loss": 0.0456, "step": 12886 }, { "epoch": 3.1775147928994083, "grad_norm": 0.357421875, "learning_rate": 7.061143219191621e-05, "loss": 0.0447, "step": 12888 }, { "epoch": 3.1780078895463513, "grad_norm": 0.357421875, "learning_rate": 7.057852946609244e-05, "loss": 0.0401, "step": 12890 }, { "epoch": 3.178500986193294, "grad_norm": 0.3359375, "learning_rate": 7.054563022687722e-05, "loss": 0.0375, "step": 12892 }, { "epoch": 3.1789940828402368, "grad_norm": 0.326171875, "learning_rate": 7.051273447816922e-05, "loss": 0.0393, "step": 12894 }, { "epoch": 3.1794871794871793, "grad_norm": 0.359375, "learning_rate": 7.047984222386686e-05, "loss": 0.0431, "step": 12896 }, { "epoch": 3.1799802761341223, "grad_norm": 0.32421875, "learning_rate": 7.044695346786802e-05, "loss": 0.0395, "step": 12898 }, { "epoch": 3.1804733727810652, "grad_norm": 0.337890625, "learning_rate": 7.041406821407016e-05, "loss": 0.0409, "step": 12900 }, { "epoch": 3.1809664694280078, "grad_norm": 0.353515625, "learning_rate": 7.038118646637044e-05, "loss": 0.0402, "step": 12902 }, { "epoch": 3.1814595660749507, "grad_norm": 0.3515625, "learning_rate": 7.034830822866549e-05, "loss": 0.0411, "step": 12904 }, { "epoch": 3.1819526627218937, "grad_norm": 0.32421875, "learning_rate": 7.031543350485157e-05, "loss": 0.0375, "step": 12906 }, { "epoch": 3.1824457593688362, "grad_norm": 0.357421875, "learning_rate": 7.028256229882448e-05, "loss": 0.0378, "step": 12908 }, { "epoch": 3.182938856015779, "grad_norm": 0.330078125, "learning_rate": 7.024969461447972e-05, "loss": 0.0413, "step": 12910 }, { "epoch": 3.1834319526627217, "grad_norm": 0.310546875, "learning_rate": 7.02168304557122e-05, "loss": 0.0403, "step": 12912 }, { "epoch": 3.1839250493096647, "grad_norm": 0.3359375, "learning_rate": 7.018396982641662e-05, "loss": 0.0381, "step": 12914 }, { "epoch": 3.1844181459566077, "grad_norm": 0.373046875, "learning_rate": 7.015111273048708e-05, "loss": 0.0403, "step": 12916 }, { "epoch": 3.18491124260355, "grad_norm": 0.392578125, "learning_rate": 7.011825917181732e-05, "loss": 0.0457, "step": 12918 }, { "epoch": 3.185404339250493, "grad_norm": 0.423828125, "learning_rate": 7.00854091543007e-05, "loss": 0.0406, "step": 12920 }, { "epoch": 3.185897435897436, "grad_norm": 0.359375, "learning_rate": 7.005256268183007e-05, "loss": 0.0412, "step": 12922 }, { "epoch": 3.1863905325443787, "grad_norm": 0.326171875, "learning_rate": 7.001971975829801e-05, "loss": 0.0392, "step": 12924 }, { "epoch": 3.1868836291913216, "grad_norm": 0.3203125, "learning_rate": 6.998688038759651e-05, "loss": 0.0426, "step": 12926 }, { "epoch": 3.187376725838264, "grad_norm": 0.3671875, "learning_rate": 6.99540445736173e-05, "loss": 0.0457, "step": 12928 }, { "epoch": 3.187869822485207, "grad_norm": 0.30859375, "learning_rate": 6.992121232025157e-05, "loss": 0.0408, "step": 12930 }, { "epoch": 3.18836291913215, "grad_norm": 0.3125, "learning_rate": 6.988838363139009e-05, "loss": 0.0396, "step": 12932 }, { "epoch": 3.1888560157790926, "grad_norm": 0.376953125, "learning_rate": 6.985555851092329e-05, "loss": 0.0425, "step": 12934 }, { "epoch": 3.1893491124260356, "grad_norm": 0.37890625, "learning_rate": 6.982273696274106e-05, "loss": 0.0455, "step": 12936 }, { "epoch": 3.189842209072978, "grad_norm": 0.3671875, "learning_rate": 6.978991899073306e-05, "loss": 0.0478, "step": 12938 }, { "epoch": 3.190335305719921, "grad_norm": 0.373046875, "learning_rate": 6.975710459878825e-05, "loss": 0.0403, "step": 12940 }, { "epoch": 3.190828402366864, "grad_norm": 0.3671875, "learning_rate": 6.972429379079548e-05, "loss": 0.0432, "step": 12942 }, { "epoch": 3.1913214990138066, "grad_norm": 0.375, "learning_rate": 6.969148657064289e-05, "loss": 0.0433, "step": 12944 }, { "epoch": 3.1918145956607495, "grad_norm": 0.349609375, "learning_rate": 6.965868294221839e-05, "loss": 0.042, "step": 12946 }, { "epoch": 3.1923076923076925, "grad_norm": 0.322265625, "learning_rate": 6.962588290940937e-05, "loss": 0.0452, "step": 12948 }, { "epoch": 3.192800788954635, "grad_norm": 0.357421875, "learning_rate": 6.959308647610276e-05, "loss": 0.0414, "step": 12950 }, { "epoch": 3.193293885601578, "grad_norm": 0.345703125, "learning_rate": 6.956029364618521e-05, "loss": 0.0416, "step": 12952 }, { "epoch": 3.1937869822485205, "grad_norm": 0.369140625, "learning_rate": 6.952750442354278e-05, "loss": 0.0409, "step": 12954 }, { "epoch": 3.1942800788954635, "grad_norm": 0.345703125, "learning_rate": 6.949471881206126e-05, "loss": 0.0416, "step": 12956 }, { "epoch": 3.1947731755424065, "grad_norm": 0.3125, "learning_rate": 6.946193681562588e-05, "loss": 0.0452, "step": 12958 }, { "epoch": 3.195266272189349, "grad_norm": 0.39453125, "learning_rate": 6.942915843812148e-05, "loss": 0.0447, "step": 12960 }, { "epoch": 3.195759368836292, "grad_norm": 0.349609375, "learning_rate": 6.939638368343246e-05, "loss": 0.0429, "step": 12962 }, { "epoch": 3.1962524654832345, "grad_norm": 0.353515625, "learning_rate": 6.936361255544289e-05, "loss": 0.047, "step": 12964 }, { "epoch": 3.1967455621301775, "grad_norm": 0.34375, "learning_rate": 6.933084505803626e-05, "loss": 0.0494, "step": 12966 }, { "epoch": 3.1972386587771204, "grad_norm": 0.390625, "learning_rate": 6.92980811950957e-05, "loss": 0.0499, "step": 12968 }, { "epoch": 3.197731755424063, "grad_norm": 0.337890625, "learning_rate": 6.926532097050398e-05, "loss": 0.0396, "step": 12970 }, { "epoch": 3.198224852071006, "grad_norm": 0.396484375, "learning_rate": 6.923256438814331e-05, "loss": 0.0363, "step": 12972 }, { "epoch": 3.198717948717949, "grad_norm": 0.359375, "learning_rate": 6.919981145189556e-05, "loss": 0.0423, "step": 12974 }, { "epoch": 3.1992110453648914, "grad_norm": 0.361328125, "learning_rate": 6.916706216564206e-05, "loss": 0.0431, "step": 12976 }, { "epoch": 3.1997041420118344, "grad_norm": 0.353515625, "learning_rate": 6.91343165332639e-05, "loss": 0.0409, "step": 12978 }, { "epoch": 3.200197238658777, "grad_norm": 0.34375, "learning_rate": 6.910157455864156e-05, "loss": 0.0433, "step": 12980 }, { "epoch": 3.20069033530572, "grad_norm": 0.345703125, "learning_rate": 6.906883624565512e-05, "loss": 0.0462, "step": 12982 }, { "epoch": 3.201183431952663, "grad_norm": 0.330078125, "learning_rate": 6.90361015981843e-05, "loss": 0.0394, "step": 12984 }, { "epoch": 3.2016765285996054, "grad_norm": 0.326171875, "learning_rate": 6.900337062010831e-05, "loss": 0.0409, "step": 12986 }, { "epoch": 3.2021696252465484, "grad_norm": 0.380859375, "learning_rate": 6.8970643315306e-05, "loss": 0.0457, "step": 12988 }, { "epoch": 3.2026627218934913, "grad_norm": 0.33984375, "learning_rate": 6.893791968765564e-05, "loss": 0.0485, "step": 12990 }, { "epoch": 3.203155818540434, "grad_norm": 0.412109375, "learning_rate": 6.890519974103528e-05, "loss": 0.0426, "step": 12992 }, { "epoch": 3.203648915187377, "grad_norm": 0.328125, "learning_rate": 6.887248347932232e-05, "loss": 0.0383, "step": 12994 }, { "epoch": 3.2041420118343193, "grad_norm": 0.416015625, "learning_rate": 6.883977090639389e-05, "loss": 0.0373, "step": 12996 }, { "epoch": 3.2046351084812623, "grad_norm": 0.32421875, "learning_rate": 6.880706202612659e-05, "loss": 0.0429, "step": 12998 }, { "epoch": 3.2051282051282053, "grad_norm": 0.34765625, "learning_rate": 6.877435684239659e-05, "loss": 0.0418, "step": 13000 }, { "epoch": 3.205621301775148, "grad_norm": 0.44921875, "learning_rate": 6.874165535907967e-05, "loss": 0.0408, "step": 13002 }, { "epoch": 3.206114398422091, "grad_norm": 0.361328125, "learning_rate": 6.870895758005105e-05, "loss": 0.0433, "step": 13004 }, { "epoch": 3.2066074950690338, "grad_norm": 0.3515625, "learning_rate": 6.867626350918571e-05, "loss": 0.0439, "step": 13006 }, { "epoch": 3.2071005917159763, "grad_norm": 0.3515625, "learning_rate": 6.864357315035801e-05, "loss": 0.0463, "step": 13008 }, { "epoch": 3.2075936883629192, "grad_norm": 0.5703125, "learning_rate": 6.861088650744198e-05, "loss": 0.0428, "step": 13010 }, { "epoch": 3.2080867850098618, "grad_norm": 0.330078125, "learning_rate": 6.857820358431117e-05, "loss": 0.0419, "step": 13012 }, { "epoch": 3.2085798816568047, "grad_norm": 0.376953125, "learning_rate": 6.854552438483865e-05, "loss": 0.0438, "step": 13014 }, { "epoch": 3.2090729783037477, "grad_norm": 0.51953125, "learning_rate": 6.851284891289713e-05, "loss": 0.0469, "step": 13016 }, { "epoch": 3.2095660749506902, "grad_norm": 0.5546875, "learning_rate": 6.848017717235877e-05, "loss": 0.0456, "step": 13018 }, { "epoch": 3.210059171597633, "grad_norm": 0.337890625, "learning_rate": 6.844750916709544e-05, "loss": 0.0455, "step": 13020 }, { "epoch": 3.2105522682445757, "grad_norm": 0.380859375, "learning_rate": 6.84148449009784e-05, "loss": 0.0481, "step": 13022 }, { "epoch": 3.2110453648915187, "grad_norm": 0.333984375, "learning_rate": 6.838218437787863e-05, "loss": 0.048, "step": 13024 }, { "epoch": 3.2115384615384617, "grad_norm": 0.328125, "learning_rate": 6.834952760166653e-05, "loss": 0.0412, "step": 13026 }, { "epoch": 3.212031558185404, "grad_norm": 0.375, "learning_rate": 6.831687457621214e-05, "loss": 0.0482, "step": 13028 }, { "epoch": 3.212524654832347, "grad_norm": 0.31640625, "learning_rate": 6.828422530538501e-05, "loss": 0.0442, "step": 13030 }, { "epoch": 3.21301775147929, "grad_norm": 0.3359375, "learning_rate": 6.82515797930542e-05, "loss": 0.0486, "step": 13032 }, { "epoch": 3.2135108481262327, "grad_norm": 0.384765625, "learning_rate": 6.82189380430885e-05, "loss": 0.042, "step": 13034 }, { "epoch": 3.2140039447731756, "grad_norm": 0.3359375, "learning_rate": 6.818630005935605e-05, "loss": 0.0427, "step": 13036 }, { "epoch": 3.214497041420118, "grad_norm": 0.390625, "learning_rate": 6.81536658457247e-05, "loss": 0.0419, "step": 13038 }, { "epoch": 3.214990138067061, "grad_norm": 0.3515625, "learning_rate": 6.812103540606174e-05, "loss": 0.0453, "step": 13040 }, { "epoch": 3.215483234714004, "grad_norm": 0.345703125, "learning_rate": 6.808840874423407e-05, "loss": 0.0414, "step": 13042 }, { "epoch": 3.2159763313609466, "grad_norm": 0.36328125, "learning_rate": 6.805578586410812e-05, "loss": 0.0468, "step": 13044 }, { "epoch": 3.2164694280078896, "grad_norm": 0.390625, "learning_rate": 6.802316676954993e-05, "loss": 0.0453, "step": 13046 }, { "epoch": 3.216962524654832, "grad_norm": 0.3515625, "learning_rate": 6.799055146442503e-05, "loss": 0.0432, "step": 13048 }, { "epoch": 3.217455621301775, "grad_norm": 0.384765625, "learning_rate": 6.795793995259845e-05, "loss": 0.0429, "step": 13050 }, { "epoch": 3.217948717948718, "grad_norm": 0.36328125, "learning_rate": 6.792533223793492e-05, "loss": 0.0441, "step": 13052 }, { "epoch": 3.2184418145956606, "grad_norm": 0.310546875, "learning_rate": 6.789272832429858e-05, "loss": 0.0438, "step": 13054 }, { "epoch": 3.2189349112426036, "grad_norm": 0.314453125, "learning_rate": 6.786012821555324e-05, "loss": 0.0447, "step": 13056 }, { "epoch": 3.2194280078895465, "grad_norm": 0.4375, "learning_rate": 6.782753191556208e-05, "loss": 0.0452, "step": 13058 }, { "epoch": 3.219921104536489, "grad_norm": 0.380859375, "learning_rate": 6.779493942818807e-05, "loss": 0.0423, "step": 13060 }, { "epoch": 3.220414201183432, "grad_norm": 0.41796875, "learning_rate": 6.776235075729356e-05, "loss": 0.0411, "step": 13062 }, { "epoch": 3.2209072978303745, "grad_norm": 0.41796875, "learning_rate": 6.772976590674042e-05, "loss": 0.0429, "step": 13064 }, { "epoch": 3.2214003944773175, "grad_norm": 0.34375, "learning_rate": 6.769718488039023e-05, "loss": 0.0446, "step": 13066 }, { "epoch": 3.2218934911242605, "grad_norm": 0.314453125, "learning_rate": 6.766460768210399e-05, "loss": 0.0424, "step": 13068 }, { "epoch": 3.222386587771203, "grad_norm": 0.314453125, "learning_rate": 6.763203431574227e-05, "loss": 0.0428, "step": 13070 }, { "epoch": 3.222879684418146, "grad_norm": 0.3359375, "learning_rate": 6.759946478516519e-05, "loss": 0.0439, "step": 13072 }, { "epoch": 3.223372781065089, "grad_norm": 0.33984375, "learning_rate": 6.756689909423247e-05, "loss": 0.0436, "step": 13074 }, { "epoch": 3.2238658777120315, "grad_norm": 0.318359375, "learning_rate": 6.753433724680323e-05, "loss": 0.042, "step": 13076 }, { "epoch": 3.2243589743589745, "grad_norm": 0.31640625, "learning_rate": 6.750177924673634e-05, "loss": 0.0454, "step": 13078 }, { "epoch": 3.224852071005917, "grad_norm": 0.31640625, "learning_rate": 6.746922509789006e-05, "loss": 0.0391, "step": 13080 }, { "epoch": 3.22534516765286, "grad_norm": 0.341796875, "learning_rate": 6.743667480412222e-05, "loss": 0.0433, "step": 13082 }, { "epoch": 3.225838264299803, "grad_norm": 0.39453125, "learning_rate": 6.740412836929025e-05, "loss": 0.0431, "step": 13084 }, { "epoch": 3.2263313609467454, "grad_norm": 0.345703125, "learning_rate": 6.7371585797251e-05, "loss": 0.0449, "step": 13086 }, { "epoch": 3.2268244575936884, "grad_norm": 0.318359375, "learning_rate": 6.733904709186105e-05, "loss": 0.0421, "step": 13088 }, { "epoch": 3.2273175542406314, "grad_norm": 0.3515625, "learning_rate": 6.730651225697635e-05, "loss": 0.0472, "step": 13090 }, { "epoch": 3.227810650887574, "grad_norm": 0.359375, "learning_rate": 6.727398129645252e-05, "loss": 0.0457, "step": 13092 }, { "epoch": 3.228303747534517, "grad_norm": 0.345703125, "learning_rate": 6.724145421414461e-05, "loss": 0.047, "step": 13094 }, { "epoch": 3.2287968441814594, "grad_norm": 0.328125, "learning_rate": 6.720893101390728e-05, "loss": 0.044, "step": 13096 }, { "epoch": 3.2292899408284024, "grad_norm": 0.37109375, "learning_rate": 6.71764116995947e-05, "loss": 0.0444, "step": 13098 }, { "epoch": 3.2297830374753453, "grad_norm": 0.330078125, "learning_rate": 6.714389627506059e-05, "loss": 0.0428, "step": 13100 }, { "epoch": 3.230276134122288, "grad_norm": 0.37890625, "learning_rate": 6.711138474415823e-05, "loss": 0.0486, "step": 13102 }, { "epoch": 3.230769230769231, "grad_norm": 0.376953125, "learning_rate": 6.707887711074037e-05, "loss": 0.0457, "step": 13104 }, { "epoch": 3.2312623274161734, "grad_norm": 0.326171875, "learning_rate": 6.704637337865942e-05, "loss": 0.0449, "step": 13106 }, { "epoch": 3.2317554240631163, "grad_norm": 0.369140625, "learning_rate": 6.701387355176721e-05, "loss": 0.0448, "step": 13108 }, { "epoch": 3.2322485207100593, "grad_norm": 0.3515625, "learning_rate": 6.698137763391516e-05, "loss": 0.0414, "step": 13110 }, { "epoch": 3.232741617357002, "grad_norm": 0.341796875, "learning_rate": 6.694888562895419e-05, "loss": 0.0451, "step": 13112 }, { "epoch": 3.233234714003945, "grad_norm": 0.365234375, "learning_rate": 6.691639754073482e-05, "loss": 0.0432, "step": 13114 }, { "epoch": 3.2337278106508878, "grad_norm": 0.345703125, "learning_rate": 6.688391337310706e-05, "loss": 0.0402, "step": 13116 }, { "epoch": 3.2342209072978303, "grad_norm": 0.33984375, "learning_rate": 6.685143312992044e-05, "loss": 0.0412, "step": 13118 }, { "epoch": 3.2347140039447733, "grad_norm": 0.314453125, "learning_rate": 6.68189568150241e-05, "loss": 0.0432, "step": 13120 }, { "epoch": 3.235207100591716, "grad_norm": 0.302734375, "learning_rate": 6.678648443226662e-05, "loss": 0.0418, "step": 13122 }, { "epoch": 3.2357001972386588, "grad_norm": 0.34375, "learning_rate": 6.675401598549619e-05, "loss": 0.0436, "step": 13124 }, { "epoch": 3.2361932938856017, "grad_norm": 0.333984375, "learning_rate": 6.672155147856045e-05, "loss": 0.0417, "step": 13126 }, { "epoch": 3.2366863905325443, "grad_norm": 0.353515625, "learning_rate": 6.668909091530671e-05, "loss": 0.0467, "step": 13128 }, { "epoch": 3.2371794871794872, "grad_norm": 0.400390625, "learning_rate": 6.665663429958167e-05, "loss": 0.0444, "step": 13130 }, { "epoch": 3.2376725838264298, "grad_norm": 0.3125, "learning_rate": 6.66241816352316e-05, "loss": 0.0453, "step": 13132 }, { "epoch": 3.2381656804733727, "grad_norm": 0.3984375, "learning_rate": 6.659173292610239e-05, "loss": 0.0425, "step": 13134 }, { "epoch": 3.2386587771203157, "grad_norm": 0.390625, "learning_rate": 6.655928817603933e-05, "loss": 0.0464, "step": 13136 }, { "epoch": 3.239151873767258, "grad_norm": 0.27734375, "learning_rate": 6.652684738888734e-05, "loss": 0.0409, "step": 13138 }, { "epoch": 3.239644970414201, "grad_norm": 0.345703125, "learning_rate": 6.649441056849082e-05, "loss": 0.043, "step": 13140 }, { "epoch": 3.240138067061144, "grad_norm": 0.36328125, "learning_rate": 6.646197771869374e-05, "loss": 0.0428, "step": 13142 }, { "epoch": 3.2406311637080867, "grad_norm": 0.341796875, "learning_rate": 6.642954884333955e-05, "loss": 0.0435, "step": 13144 }, { "epoch": 3.2411242603550297, "grad_norm": 0.359375, "learning_rate": 6.639712394627122e-05, "loss": 0.0452, "step": 13146 }, { "epoch": 3.2416173570019726, "grad_norm": 0.369140625, "learning_rate": 6.636470303133136e-05, "loss": 0.0426, "step": 13148 }, { "epoch": 3.242110453648915, "grad_norm": 0.341796875, "learning_rate": 6.633228610236195e-05, "loss": 0.0426, "step": 13150 }, { "epoch": 3.242603550295858, "grad_norm": 0.35546875, "learning_rate": 6.629987316320463e-05, "loss": 0.0445, "step": 13152 }, { "epoch": 3.2430966469428006, "grad_norm": 0.3984375, "learning_rate": 6.626746421770047e-05, "loss": 0.0417, "step": 13154 }, { "epoch": 3.2435897435897436, "grad_norm": 0.306640625, "learning_rate": 6.62350592696902e-05, "loss": 0.0424, "step": 13156 }, { "epoch": 3.2440828402366866, "grad_norm": 0.40625, "learning_rate": 6.620265832301386e-05, "loss": 0.044, "step": 13158 }, { "epoch": 3.244575936883629, "grad_norm": 0.35546875, "learning_rate": 6.617026138151125e-05, "loss": 0.0419, "step": 13160 }, { "epoch": 3.245069033530572, "grad_norm": 0.34375, "learning_rate": 6.613786844902155e-05, "loss": 0.0417, "step": 13162 }, { "epoch": 3.2455621301775146, "grad_norm": 0.376953125, "learning_rate": 6.610547952938349e-05, "loss": 0.0405, "step": 13164 }, { "epoch": 3.2460552268244576, "grad_norm": 0.35546875, "learning_rate": 6.607309462643534e-05, "loss": 0.0449, "step": 13166 }, { "epoch": 3.2465483234714005, "grad_norm": 0.380859375, "learning_rate": 6.60407137440149e-05, "loss": 0.0387, "step": 13168 }, { "epoch": 3.247041420118343, "grad_norm": 0.37109375, "learning_rate": 6.600833688595951e-05, "loss": 0.0437, "step": 13170 }, { "epoch": 3.247534516765286, "grad_norm": 0.4296875, "learning_rate": 6.597596405610593e-05, "loss": 0.048, "step": 13172 }, { "epoch": 3.248027613412229, "grad_norm": 0.396484375, "learning_rate": 6.594359525829063e-05, "loss": 0.0434, "step": 13174 }, { "epoch": 3.2485207100591715, "grad_norm": 0.408203125, "learning_rate": 6.591123049634942e-05, "loss": 0.0471, "step": 13176 }, { "epoch": 3.2490138067061145, "grad_norm": 0.375, "learning_rate": 6.587886977411773e-05, "loss": 0.0427, "step": 13178 }, { "epoch": 3.249506903353057, "grad_norm": 0.349609375, "learning_rate": 6.584651309543049e-05, "loss": 0.0492, "step": 13180 }, { "epoch": 3.25, "grad_norm": 0.32421875, "learning_rate": 6.58141604641221e-05, "loss": 0.046, "step": 13182 }, { "epoch": 3.250493096646943, "grad_norm": 0.353515625, "learning_rate": 6.578181188402661e-05, "loss": 0.0442, "step": 13184 }, { "epoch": 3.2509861932938855, "grad_norm": 0.41796875, "learning_rate": 6.57494673589774e-05, "loss": 0.0417, "step": 13186 }, { "epoch": 3.2514792899408285, "grad_norm": 0.361328125, "learning_rate": 6.57171268928076e-05, "loss": 0.0435, "step": 13188 }, { "epoch": 3.251972386587771, "grad_norm": 0.388671875, "learning_rate": 6.568479048934964e-05, "loss": 0.0467, "step": 13190 }, { "epoch": 3.252465483234714, "grad_norm": 0.349609375, "learning_rate": 6.565245815243559e-05, "loss": 0.0444, "step": 13192 }, { "epoch": 3.252958579881657, "grad_norm": 0.359375, "learning_rate": 6.562012988589707e-05, "loss": 0.0427, "step": 13194 }, { "epoch": 3.2534516765285995, "grad_norm": 0.431640625, "learning_rate": 6.558780569356507e-05, "loss": 0.0411, "step": 13196 }, { "epoch": 3.2539447731755424, "grad_norm": 0.337890625, "learning_rate": 6.555548557927025e-05, "loss": 0.0448, "step": 13198 }, { "epoch": 3.2544378698224854, "grad_norm": 0.33984375, "learning_rate": 6.552316954684267e-05, "loss": 0.0442, "step": 13200 }, { "epoch": 3.254930966469428, "grad_norm": 0.330078125, "learning_rate": 6.549085760011202e-05, "loss": 0.0453, "step": 13202 }, { "epoch": 3.255424063116371, "grad_norm": 0.37109375, "learning_rate": 6.545854974290741e-05, "loss": 0.0463, "step": 13204 }, { "epoch": 3.2559171597633134, "grad_norm": 0.376953125, "learning_rate": 6.542624597905751e-05, "loss": 0.0444, "step": 13206 }, { "epoch": 3.2564102564102564, "grad_norm": 0.3515625, "learning_rate": 6.539394631239048e-05, "loss": 0.0499, "step": 13208 }, { "epoch": 3.2569033530571994, "grad_norm": 0.373046875, "learning_rate": 6.536165074673409e-05, "loss": 0.0439, "step": 13210 }, { "epoch": 3.257396449704142, "grad_norm": 0.357421875, "learning_rate": 6.532935928591544e-05, "loss": 0.0484, "step": 13212 }, { "epoch": 3.257889546351085, "grad_norm": 0.33203125, "learning_rate": 6.529707193376127e-05, "loss": 0.0487, "step": 13214 }, { "epoch": 3.2583826429980274, "grad_norm": 0.408203125, "learning_rate": 6.526478869409785e-05, "loss": 0.0479, "step": 13216 }, { "epoch": 3.2588757396449703, "grad_norm": 0.423828125, "learning_rate": 6.523250957075089e-05, "loss": 0.042, "step": 13218 }, { "epoch": 3.2593688362919133, "grad_norm": 0.390625, "learning_rate": 6.52002345675457e-05, "loss": 0.0494, "step": 13220 }, { "epoch": 3.259861932938856, "grad_norm": 0.36328125, "learning_rate": 6.516796368830698e-05, "loss": 0.0417, "step": 13222 }, { "epoch": 3.260355029585799, "grad_norm": 0.390625, "learning_rate": 6.513569693685907e-05, "loss": 0.0472, "step": 13224 }, { "epoch": 3.260848126232742, "grad_norm": 0.423828125, "learning_rate": 6.510343431702568e-05, "loss": 0.0485, "step": 13226 }, { "epoch": 3.2613412228796843, "grad_norm": 0.396484375, "learning_rate": 6.507117583263021e-05, "loss": 0.043, "step": 13228 }, { "epoch": 3.2618343195266273, "grad_norm": 0.376953125, "learning_rate": 6.503892148749542e-05, "loss": 0.0475, "step": 13230 }, { "epoch": 3.2623274161735702, "grad_norm": 0.330078125, "learning_rate": 6.50066712854436e-05, "loss": 0.0462, "step": 13232 }, { "epoch": 3.2628205128205128, "grad_norm": 0.43359375, "learning_rate": 6.497442523029663e-05, "loss": 0.0449, "step": 13234 }, { "epoch": 3.2633136094674557, "grad_norm": 0.328125, "learning_rate": 6.494218332587583e-05, "loss": 0.0411, "step": 13236 }, { "epoch": 3.2638067061143983, "grad_norm": 0.357421875, "learning_rate": 6.490994557600204e-05, "loss": 0.0477, "step": 13238 }, { "epoch": 3.2642998027613412, "grad_norm": 0.33984375, "learning_rate": 6.48777119844956e-05, "loss": 0.0452, "step": 13240 }, { "epoch": 3.264792899408284, "grad_norm": 0.380859375, "learning_rate": 6.484548255517642e-05, "loss": 0.046, "step": 13242 }, { "epoch": 3.2652859960552267, "grad_norm": 0.34765625, "learning_rate": 6.481325729186383e-05, "loss": 0.0421, "step": 13244 }, { "epoch": 3.2657790927021697, "grad_norm": 0.462890625, "learning_rate": 6.478103619837665e-05, "loss": 0.0468, "step": 13246 }, { "epoch": 3.2662721893491122, "grad_norm": 0.443359375, "learning_rate": 6.474881927853339e-05, "loss": 0.044, "step": 13248 }, { "epoch": 3.266765285996055, "grad_norm": 0.443359375, "learning_rate": 6.471660653615184e-05, "loss": 0.0443, "step": 13250 }, { "epoch": 3.267258382642998, "grad_norm": 0.3515625, "learning_rate": 6.468439797504942e-05, "loss": 0.048, "step": 13252 }, { "epoch": 3.2677514792899407, "grad_norm": 0.376953125, "learning_rate": 6.465219359904298e-05, "loss": 0.046, "step": 13254 }, { "epoch": 3.2682445759368837, "grad_norm": 0.326171875, "learning_rate": 6.4619993411949e-05, "loss": 0.0531, "step": 13256 }, { "epoch": 3.2687376725838266, "grad_norm": 0.373046875, "learning_rate": 6.458779741758328e-05, "loss": 0.0445, "step": 13258 }, { "epoch": 3.269230769230769, "grad_norm": 0.380859375, "learning_rate": 6.455560561976135e-05, "loss": 0.0448, "step": 13260 }, { "epoch": 3.269723865877712, "grad_norm": 0.4453125, "learning_rate": 6.452341802229803e-05, "loss": 0.0486, "step": 13262 }, { "epoch": 3.2702169625246547, "grad_norm": 0.365234375, "learning_rate": 6.449123462900774e-05, "loss": 0.0405, "step": 13264 }, { "epoch": 3.2707100591715976, "grad_norm": 0.37890625, "learning_rate": 6.445905544370441e-05, "loss": 0.046, "step": 13266 }, { "epoch": 3.2712031558185406, "grad_norm": 0.43359375, "learning_rate": 6.442688047020142e-05, "loss": 0.0442, "step": 13268 }, { "epoch": 3.271696252465483, "grad_norm": 0.380859375, "learning_rate": 6.439470971231174e-05, "loss": 0.0396, "step": 13270 }, { "epoch": 3.272189349112426, "grad_norm": 0.3359375, "learning_rate": 6.436254317384772e-05, "loss": 0.0471, "step": 13272 }, { "epoch": 3.2726824457593686, "grad_norm": 0.345703125, "learning_rate": 6.433038085862134e-05, "loss": 0.0437, "step": 13274 }, { "epoch": 3.2731755424063116, "grad_norm": 0.41796875, "learning_rate": 6.429822277044399e-05, "loss": 0.0462, "step": 13276 }, { "epoch": 3.2736686390532546, "grad_norm": 0.302734375, "learning_rate": 6.426606891312655e-05, "loss": 0.0453, "step": 13278 }, { "epoch": 3.274161735700197, "grad_norm": 0.373046875, "learning_rate": 6.423391929047948e-05, "loss": 0.0465, "step": 13280 }, { "epoch": 3.27465483234714, "grad_norm": 0.357421875, "learning_rate": 6.420177390631263e-05, "loss": 0.0405, "step": 13282 }, { "epoch": 3.275147928994083, "grad_norm": 0.326171875, "learning_rate": 6.416963276443547e-05, "loss": 0.0428, "step": 13284 }, { "epoch": 3.2756410256410255, "grad_norm": 0.349609375, "learning_rate": 6.413749586865684e-05, "loss": 0.0446, "step": 13286 }, { "epoch": 3.2761341222879685, "grad_norm": 0.375, "learning_rate": 6.410536322278523e-05, "loss": 0.0426, "step": 13288 }, { "epoch": 3.2766272189349115, "grad_norm": 0.37109375, "learning_rate": 6.407323483062848e-05, "loss": 0.0444, "step": 13290 }, { "epoch": 3.277120315581854, "grad_norm": 0.3515625, "learning_rate": 6.404111069599399e-05, "loss": 0.042, "step": 13292 }, { "epoch": 3.277613412228797, "grad_norm": 0.341796875, "learning_rate": 6.400899082268865e-05, "loss": 0.0443, "step": 13294 }, { "epoch": 3.2781065088757395, "grad_norm": 0.404296875, "learning_rate": 6.397687521451882e-05, "loss": 0.0466, "step": 13296 }, { "epoch": 3.2785996055226825, "grad_norm": 0.32421875, "learning_rate": 6.394476387529044e-05, "loss": 0.0384, "step": 13298 }, { "epoch": 3.279092702169625, "grad_norm": 0.30859375, "learning_rate": 6.39126568088088e-05, "loss": 0.0387, "step": 13300 }, { "epoch": 3.279585798816568, "grad_norm": 0.3359375, "learning_rate": 6.388055401887886e-05, "loss": 0.0396, "step": 13302 }, { "epoch": 3.280078895463511, "grad_norm": 0.35546875, "learning_rate": 6.384845550930492e-05, "loss": 0.044, "step": 13304 }, { "epoch": 3.2805719921104535, "grad_norm": 0.376953125, "learning_rate": 6.381636128389085e-05, "loss": 0.0418, "step": 13306 }, { "epoch": 3.2810650887573964, "grad_norm": 0.365234375, "learning_rate": 6.378427134643996e-05, "loss": 0.0434, "step": 13308 }, { "epoch": 3.2815581854043394, "grad_norm": 0.5234375, "learning_rate": 6.375218570075516e-05, "loss": 0.0418, "step": 13310 }, { "epoch": 3.282051282051282, "grad_norm": 0.400390625, "learning_rate": 6.372010435063875e-05, "loss": 0.0435, "step": 13312 }, { "epoch": 3.282544378698225, "grad_norm": 0.392578125, "learning_rate": 6.368802729989247e-05, "loss": 0.0523, "step": 13314 }, { "epoch": 3.283037475345168, "grad_norm": 0.359375, "learning_rate": 6.365595455231774e-05, "loss": 0.0449, "step": 13316 }, { "epoch": 3.2835305719921104, "grad_norm": 0.36328125, "learning_rate": 6.36238861117153e-05, "loss": 0.043, "step": 13318 }, { "epoch": 3.2840236686390534, "grad_norm": 0.421875, "learning_rate": 6.359182198188549e-05, "loss": 0.0445, "step": 13320 }, { "epoch": 3.284516765285996, "grad_norm": 0.4453125, "learning_rate": 6.355976216662799e-05, "loss": 0.0462, "step": 13322 }, { "epoch": 3.285009861932939, "grad_norm": 0.3203125, "learning_rate": 6.352770666974218e-05, "loss": 0.0433, "step": 13324 }, { "epoch": 3.285502958579882, "grad_norm": 0.44921875, "learning_rate": 6.349565549502676e-05, "loss": 0.0439, "step": 13326 }, { "epoch": 3.2859960552268244, "grad_norm": 0.392578125, "learning_rate": 6.346360864627994e-05, "loss": 0.0415, "step": 13328 }, { "epoch": 3.2864891518737673, "grad_norm": 0.396484375, "learning_rate": 6.343156612729952e-05, "loss": 0.0439, "step": 13330 }, { "epoch": 3.28698224852071, "grad_norm": 0.4453125, "learning_rate": 6.339952794188268e-05, "loss": 0.0483, "step": 13332 }, { "epoch": 3.287475345167653, "grad_norm": 0.46875, "learning_rate": 6.336749409382615e-05, "loss": 0.0447, "step": 13334 }, { "epoch": 3.287968441814596, "grad_norm": 0.419921875, "learning_rate": 6.333546458692605e-05, "loss": 0.0471, "step": 13336 }, { "epoch": 3.2884615384615383, "grad_norm": 0.376953125, "learning_rate": 6.330343942497815e-05, "loss": 0.0442, "step": 13338 }, { "epoch": 3.2889546351084813, "grad_norm": 0.447265625, "learning_rate": 6.327141861177752e-05, "loss": 0.0428, "step": 13340 }, { "epoch": 3.2894477317554243, "grad_norm": 0.58203125, "learning_rate": 6.323940215111889e-05, "loss": 0.0414, "step": 13342 }, { "epoch": 3.289940828402367, "grad_norm": 0.390625, "learning_rate": 6.320739004679637e-05, "loss": 0.0405, "step": 13344 }, { "epoch": 3.2904339250493098, "grad_norm": 0.310546875, "learning_rate": 6.317538230260352e-05, "loss": 0.0409, "step": 13346 }, { "epoch": 3.2909270216962523, "grad_norm": 0.345703125, "learning_rate": 6.314337892233348e-05, "loss": 0.0456, "step": 13348 }, { "epoch": 3.2914201183431953, "grad_norm": 0.439453125, "learning_rate": 6.311137990977878e-05, "loss": 0.0425, "step": 13350 }, { "epoch": 3.2919132149901382, "grad_norm": 0.349609375, "learning_rate": 6.307938526873157e-05, "loss": 0.0422, "step": 13352 }, { "epoch": 3.2924063116370808, "grad_norm": 0.345703125, "learning_rate": 6.30473950029833e-05, "loss": 0.0413, "step": 13354 }, { "epoch": 3.2928994082840237, "grad_norm": 0.31640625, "learning_rate": 6.301540911632509e-05, "loss": 0.0415, "step": 13356 }, { "epoch": 3.2933925049309662, "grad_norm": 0.376953125, "learning_rate": 6.298342761254736e-05, "loss": 0.0455, "step": 13358 }, { "epoch": 3.293885601577909, "grad_norm": 0.373046875, "learning_rate": 6.295145049544016e-05, "loss": 0.0416, "step": 13360 }, { "epoch": 3.294378698224852, "grad_norm": 0.345703125, "learning_rate": 6.291947776879292e-05, "loss": 0.045, "step": 13362 }, { "epoch": 3.2948717948717947, "grad_norm": 0.318359375, "learning_rate": 6.288750943639457e-05, "loss": 0.0421, "step": 13364 }, { "epoch": 3.2953648915187377, "grad_norm": 0.37890625, "learning_rate": 6.285554550203358e-05, "loss": 0.0442, "step": 13366 }, { "epoch": 3.2958579881656807, "grad_norm": 0.353515625, "learning_rate": 6.282358596949781e-05, "loss": 0.0431, "step": 13368 }, { "epoch": 3.296351084812623, "grad_norm": 0.390625, "learning_rate": 6.279163084257471e-05, "loss": 0.0455, "step": 13370 }, { "epoch": 3.296844181459566, "grad_norm": 0.33203125, "learning_rate": 6.275968012505107e-05, "loss": 0.0448, "step": 13372 }, { "epoch": 3.297337278106509, "grad_norm": 0.3359375, "learning_rate": 6.272773382071329e-05, "loss": 0.0412, "step": 13374 }, { "epoch": 3.2978303747534516, "grad_norm": 0.37109375, "learning_rate": 6.269579193334716e-05, "loss": 0.0463, "step": 13376 }, { "epoch": 3.2983234714003946, "grad_norm": 0.349609375, "learning_rate": 6.26638544667379e-05, "loss": 0.0448, "step": 13378 }, { "epoch": 3.298816568047337, "grad_norm": 0.38671875, "learning_rate": 6.26319214246704e-05, "loss": 0.0435, "step": 13380 }, { "epoch": 3.29930966469428, "grad_norm": 0.357421875, "learning_rate": 6.25999928109288e-05, "loss": 0.0427, "step": 13382 }, { "epoch": 3.2998027613412226, "grad_norm": 0.37890625, "learning_rate": 6.256806862929694e-05, "loss": 0.0446, "step": 13384 }, { "epoch": 3.3002958579881656, "grad_norm": 0.330078125, "learning_rate": 6.253614888355789e-05, "loss": 0.0409, "step": 13386 }, { "epoch": 3.3007889546351086, "grad_norm": 0.330078125, "learning_rate": 6.250423357749439e-05, "loss": 0.0445, "step": 13388 }, { "epoch": 3.301282051282051, "grad_norm": 0.361328125, "learning_rate": 6.247232271488852e-05, "loss": 0.0459, "step": 13390 }, { "epoch": 3.301775147928994, "grad_norm": 0.34375, "learning_rate": 6.244041629952199e-05, "loss": 0.043, "step": 13392 }, { "epoch": 3.302268244575937, "grad_norm": 0.341796875, "learning_rate": 6.240851433517583e-05, "loss": 0.0408, "step": 13394 }, { "epoch": 3.3027613412228796, "grad_norm": 0.30859375, "learning_rate": 6.237661682563056e-05, "loss": 0.0464, "step": 13396 }, { "epoch": 3.3032544378698225, "grad_norm": 0.32421875, "learning_rate": 6.23447237746663e-05, "loss": 0.0443, "step": 13398 }, { "epoch": 3.3037475345167655, "grad_norm": 0.388671875, "learning_rate": 6.231283518606248e-05, "loss": 0.0501, "step": 13400 }, { "epoch": 3.304240631163708, "grad_norm": 0.349609375, "learning_rate": 6.228095106359815e-05, "loss": 0.0456, "step": 13402 }, { "epoch": 3.304733727810651, "grad_norm": 0.376953125, "learning_rate": 6.224907141105165e-05, "loss": 0.0445, "step": 13404 }, { "epoch": 3.3052268244575935, "grad_norm": 0.369140625, "learning_rate": 6.221719623220102e-05, "loss": 0.0443, "step": 13406 }, { "epoch": 3.3057199211045365, "grad_norm": 0.349609375, "learning_rate": 6.218532553082359e-05, "loss": 0.0471, "step": 13408 }, { "epoch": 3.3062130177514795, "grad_norm": 0.37890625, "learning_rate": 6.215345931069617e-05, "loss": 0.043, "step": 13410 }, { "epoch": 3.306706114398422, "grad_norm": 0.376953125, "learning_rate": 6.212159757559516e-05, "loss": 0.0403, "step": 13412 }, { "epoch": 3.307199211045365, "grad_norm": 0.375, "learning_rate": 6.208974032929633e-05, "loss": 0.0436, "step": 13414 }, { "epoch": 3.3076923076923075, "grad_norm": 0.318359375, "learning_rate": 6.205788757557493e-05, "loss": 0.0427, "step": 13416 }, { "epoch": 3.3081854043392505, "grad_norm": 0.30078125, "learning_rate": 6.202603931820567e-05, "loss": 0.0467, "step": 13418 }, { "epoch": 3.3086785009861934, "grad_norm": 0.314453125, "learning_rate": 6.19941955609628e-05, "loss": 0.041, "step": 13420 }, { "epoch": 3.309171597633136, "grad_norm": 0.353515625, "learning_rate": 6.196235630761991e-05, "loss": 0.0393, "step": 13422 }, { "epoch": 3.309664694280079, "grad_norm": 0.421875, "learning_rate": 6.193052156195023e-05, "loss": 0.0459, "step": 13424 }, { "epoch": 3.310157790927022, "grad_norm": 0.41015625, "learning_rate": 6.189869132772628e-05, "loss": 0.0477, "step": 13426 }, { "epoch": 3.3106508875739644, "grad_norm": 0.345703125, "learning_rate": 6.186686560872013e-05, "loss": 0.0439, "step": 13428 }, { "epoch": 3.3111439842209074, "grad_norm": 0.373046875, "learning_rate": 6.183504440870331e-05, "loss": 0.0447, "step": 13430 }, { "epoch": 3.31163708086785, "grad_norm": 0.3984375, "learning_rate": 6.18032277314468e-05, "loss": 0.0447, "step": 13432 }, { "epoch": 3.312130177514793, "grad_norm": 0.326171875, "learning_rate": 6.177141558072111e-05, "loss": 0.0491, "step": 13434 }, { "epoch": 3.312623274161736, "grad_norm": 0.375, "learning_rate": 6.173960796029605e-05, "loss": 0.0404, "step": 13436 }, { "epoch": 3.3131163708086784, "grad_norm": 0.314453125, "learning_rate": 6.170780487394112e-05, "loss": 0.0434, "step": 13438 }, { "epoch": 3.3136094674556213, "grad_norm": 0.328125, "learning_rate": 6.167600632542508e-05, "loss": 0.0395, "step": 13440 }, { "epoch": 3.314102564102564, "grad_norm": 0.412109375, "learning_rate": 6.164421231851629e-05, "loss": 0.041, "step": 13442 }, { "epoch": 3.314595660749507, "grad_norm": 0.322265625, "learning_rate": 6.16124228569825e-05, "loss": 0.0462, "step": 13444 }, { "epoch": 3.31508875739645, "grad_norm": 0.3515625, "learning_rate": 6.158063794459087e-05, "loss": 0.0451, "step": 13446 }, { "epoch": 3.3155818540433923, "grad_norm": 0.421875, "learning_rate": 6.154885758510822e-05, "loss": 0.0425, "step": 13448 }, { "epoch": 3.3160749506903353, "grad_norm": 0.3359375, "learning_rate": 6.151708178230056e-05, "loss": 0.0416, "step": 13450 }, { "epoch": 3.3165680473372783, "grad_norm": 0.3515625, "learning_rate": 6.148531053993362e-05, "loss": 0.0444, "step": 13452 }, { "epoch": 3.317061143984221, "grad_norm": 0.37109375, "learning_rate": 6.145354386177241e-05, "loss": 0.0411, "step": 13454 }, { "epoch": 3.3175542406311638, "grad_norm": 0.34375, "learning_rate": 6.142178175158149e-05, "loss": 0.0409, "step": 13456 }, { "epoch": 3.3180473372781067, "grad_norm": 0.486328125, "learning_rate": 6.139002421312481e-05, "loss": 0.0437, "step": 13458 }, { "epoch": 3.3185404339250493, "grad_norm": 0.328125, "learning_rate": 6.13582712501658e-05, "loss": 0.0428, "step": 13460 }, { "epoch": 3.3190335305719922, "grad_norm": 0.33984375, "learning_rate": 6.132652286646742e-05, "loss": 0.0437, "step": 13462 }, { "epoch": 3.3195266272189348, "grad_norm": 0.35546875, "learning_rate": 6.129477906579198e-05, "loss": 0.0437, "step": 13464 }, { "epoch": 3.3200197238658777, "grad_norm": 0.3984375, "learning_rate": 6.126303985190136e-05, "loss": 0.0434, "step": 13466 }, { "epoch": 3.3205128205128207, "grad_norm": 0.328125, "learning_rate": 6.123130522855678e-05, "loss": 0.0434, "step": 13468 }, { "epoch": 3.3210059171597632, "grad_norm": 0.3359375, "learning_rate": 6.1199575199519e-05, "loss": 0.0412, "step": 13470 }, { "epoch": 3.321499013806706, "grad_norm": 0.3671875, "learning_rate": 6.116784976854816e-05, "loss": 0.0424, "step": 13472 }, { "epoch": 3.3219921104536487, "grad_norm": 0.35546875, "learning_rate": 6.113612893940397e-05, "loss": 0.0416, "step": 13474 }, { "epoch": 3.3224852071005917, "grad_norm": 0.3828125, "learning_rate": 6.11044127158455e-05, "loss": 0.044, "step": 13476 }, { "epoch": 3.3229783037475347, "grad_norm": 0.384765625, "learning_rate": 6.107270110163123e-05, "loss": 0.0461, "step": 13478 }, { "epoch": 3.323471400394477, "grad_norm": 0.349609375, "learning_rate": 6.104099410051927e-05, "loss": 0.0427, "step": 13480 }, { "epoch": 3.32396449704142, "grad_norm": 0.4296875, "learning_rate": 6.100929171626701e-05, "loss": 0.0407, "step": 13482 }, { "epoch": 3.324457593688363, "grad_norm": 0.310546875, "learning_rate": 6.09775939526314e-05, "loss": 0.0394, "step": 13484 }, { "epoch": 3.3249506903353057, "grad_norm": 0.365234375, "learning_rate": 6.094590081336873e-05, "loss": 0.0452, "step": 13486 }, { "epoch": 3.3254437869822486, "grad_norm": 0.38671875, "learning_rate": 6.091421230223491e-05, "loss": 0.0422, "step": 13488 }, { "epoch": 3.325936883629191, "grad_norm": 0.337890625, "learning_rate": 6.088252842298512e-05, "loss": 0.0447, "step": 13490 }, { "epoch": 3.326429980276134, "grad_norm": 0.609375, "learning_rate": 6.085084917937416e-05, "loss": 0.0468, "step": 13492 }, { "epoch": 3.326923076923077, "grad_norm": 0.310546875, "learning_rate": 6.081917457515614e-05, "loss": 0.0407, "step": 13494 }, { "epoch": 3.3274161735700196, "grad_norm": 0.3828125, "learning_rate": 6.078750461408468e-05, "loss": 0.0404, "step": 13496 }, { "epoch": 3.3279092702169626, "grad_norm": 0.404296875, "learning_rate": 6.0755839299912864e-05, "loss": 0.0437, "step": 13498 }, { "epoch": 3.328402366863905, "grad_norm": 0.33984375, "learning_rate": 6.0724178636393184e-05, "loss": 0.0456, "step": 13500 }, { "epoch": 3.328895463510848, "grad_norm": 0.32421875, "learning_rate": 6.069252262727765e-05, "loss": 0.0429, "step": 13502 }, { "epoch": 3.329388560157791, "grad_norm": 0.396484375, "learning_rate": 6.066087127631761e-05, "loss": 0.0392, "step": 13504 }, { "epoch": 3.3298816568047336, "grad_norm": 0.3984375, "learning_rate": 6.0629224587264013e-05, "loss": 0.045, "step": 13506 }, { "epoch": 3.3303747534516766, "grad_norm": 0.326171875, "learning_rate": 6.059758256386712e-05, "loss": 0.0413, "step": 13508 }, { "epoch": 3.3308678500986195, "grad_norm": 0.337890625, "learning_rate": 6.0565945209876664e-05, "loss": 0.0444, "step": 13510 }, { "epoch": 3.331360946745562, "grad_norm": 0.392578125, "learning_rate": 6.053431252904189e-05, "loss": 0.0393, "step": 13512 }, { "epoch": 3.331854043392505, "grad_norm": 0.33203125, "learning_rate": 6.050268452511138e-05, "loss": 0.0482, "step": 13514 }, { "epoch": 3.3323471400394475, "grad_norm": 0.3828125, "learning_rate": 6.0471061201833346e-05, "loss": 0.0441, "step": 13516 }, { "epoch": 3.3328402366863905, "grad_norm": 0.3203125, "learning_rate": 6.04394425629552e-05, "loss": 0.0422, "step": 13518 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3828125, "learning_rate": 6.040782861222403e-05, "loss": 0.0456, "step": 13520 }, { "epoch": 3.333826429980276, "grad_norm": 0.380859375, "learning_rate": 6.037621935338621e-05, "loss": 0.0369, "step": 13522 }, { "epoch": 3.334319526627219, "grad_norm": 0.322265625, "learning_rate": 6.034461479018765e-05, "loss": 0.0374, "step": 13524 }, { "epoch": 3.3348126232741615, "grad_norm": 0.314453125, "learning_rate": 6.031301492637364e-05, "loss": 0.0401, "step": 13526 }, { "epoch": 3.3353057199211045, "grad_norm": 0.40234375, "learning_rate": 6.0281419765688904e-05, "loss": 0.0416, "step": 13528 }, { "epoch": 3.3357988165680474, "grad_norm": 0.310546875, "learning_rate": 6.024982931187773e-05, "loss": 0.0387, "step": 13530 }, { "epoch": 3.33629191321499, "grad_norm": 0.333984375, "learning_rate": 6.021824356868366e-05, "loss": 0.0431, "step": 13532 }, { "epoch": 3.336785009861933, "grad_norm": 0.318359375, "learning_rate": 6.01866625398499e-05, "loss": 0.04, "step": 13534 }, { "epoch": 3.337278106508876, "grad_norm": 0.37890625, "learning_rate": 6.015508622911889e-05, "loss": 0.0422, "step": 13536 }, { "epoch": 3.3377712031558184, "grad_norm": 0.353515625, "learning_rate": 6.012351464023266e-05, "loss": 0.0408, "step": 13538 }, { "epoch": 3.3382642998027614, "grad_norm": 0.302734375, "learning_rate": 6.009194777693258e-05, "loss": 0.0456, "step": 13540 }, { "epoch": 3.3387573964497044, "grad_norm": 0.349609375, "learning_rate": 6.006038564295946e-05, "loss": 0.0419, "step": 13542 }, { "epoch": 3.339250493096647, "grad_norm": 0.34765625, "learning_rate": 6.002882824205368e-05, "loss": 0.04, "step": 13544 }, { "epoch": 3.33974358974359, "grad_norm": 0.341796875, "learning_rate": 5.9997275577954894e-05, "loss": 0.0436, "step": 13546 }, { "epoch": 3.3402366863905324, "grad_norm": 0.314453125, "learning_rate": 5.9965727654402315e-05, "loss": 0.0414, "step": 13548 }, { "epoch": 3.3407297830374754, "grad_norm": 0.333984375, "learning_rate": 5.993418447513452e-05, "loss": 0.0402, "step": 13550 }, { "epoch": 3.3412228796844183, "grad_norm": 0.427734375, "learning_rate": 5.99026460438896e-05, "loss": 0.0453, "step": 13552 }, { "epoch": 3.341715976331361, "grad_norm": 0.34375, "learning_rate": 5.987111236440495e-05, "loss": 0.0395, "step": 13554 }, { "epoch": 3.342209072978304, "grad_norm": 0.453125, "learning_rate": 5.983958344041757e-05, "loss": 0.0436, "step": 13556 }, { "epoch": 3.3427021696252464, "grad_norm": 0.33203125, "learning_rate": 5.980805927566378e-05, "loss": 0.0374, "step": 13558 }, { "epoch": 3.3431952662721893, "grad_norm": 0.400390625, "learning_rate": 5.977653987387933e-05, "loss": 0.0406, "step": 13560 }, { "epoch": 3.3436883629191323, "grad_norm": 0.37890625, "learning_rate": 5.9745025238799526e-05, "loss": 0.0476, "step": 13562 }, { "epoch": 3.344181459566075, "grad_norm": 0.345703125, "learning_rate": 5.971351537415897e-05, "loss": 0.044, "step": 13564 }, { "epoch": 3.344674556213018, "grad_norm": 0.36328125, "learning_rate": 5.9682010283691794e-05, "loss": 0.0445, "step": 13566 }, { "epoch": 3.3451676528599608, "grad_norm": 0.3515625, "learning_rate": 5.9650509971131465e-05, "loss": 0.0444, "step": 13568 }, { "epoch": 3.3456607495069033, "grad_norm": 0.443359375, "learning_rate": 5.961901444021101e-05, "loss": 0.0451, "step": 13570 }, { "epoch": 3.3461538461538463, "grad_norm": 0.380859375, "learning_rate": 5.958752369466278e-05, "loss": 0.0475, "step": 13572 }, { "epoch": 3.346646942800789, "grad_norm": 0.349609375, "learning_rate": 5.955603773821866e-05, "loss": 0.0452, "step": 13574 }, { "epoch": 3.3471400394477318, "grad_norm": 0.3359375, "learning_rate": 5.952455657460988e-05, "loss": 0.0418, "step": 13576 }, { "epoch": 3.3476331360946747, "grad_norm": 0.328125, "learning_rate": 5.949308020756712e-05, "loss": 0.0428, "step": 13578 }, { "epoch": 3.3481262327416172, "grad_norm": 0.375, "learning_rate": 5.9461608640820535e-05, "loss": 0.0387, "step": 13580 }, { "epoch": 3.34861932938856, "grad_norm": 0.396484375, "learning_rate": 5.943014187809961e-05, "loss": 0.0417, "step": 13582 }, { "epoch": 3.3491124260355027, "grad_norm": 0.365234375, "learning_rate": 5.939867992313344e-05, "loss": 0.0402, "step": 13584 }, { "epoch": 3.3496055226824457, "grad_norm": 0.2890625, "learning_rate": 5.9367222779650334e-05, "loss": 0.0388, "step": 13586 }, { "epoch": 3.3500986193293887, "grad_norm": 0.318359375, "learning_rate": 5.933577045137823e-05, "loss": 0.0394, "step": 13588 }, { "epoch": 3.350591715976331, "grad_norm": 0.3671875, "learning_rate": 5.9304322942044376e-05, "loss": 0.0413, "step": 13590 }, { "epoch": 3.351084812623274, "grad_norm": 0.3515625, "learning_rate": 5.927288025537545e-05, "loss": 0.0429, "step": 13592 }, { "epoch": 3.351577909270217, "grad_norm": 0.357421875, "learning_rate": 5.924144239509764e-05, "loss": 0.0426, "step": 13594 }, { "epoch": 3.3520710059171597, "grad_norm": 0.3203125, "learning_rate": 5.9210009364936416e-05, "loss": 0.0405, "step": 13596 }, { "epoch": 3.3525641025641026, "grad_norm": 0.345703125, "learning_rate": 5.917858116861687e-05, "loss": 0.0414, "step": 13598 }, { "epoch": 3.3530571992110456, "grad_norm": 0.41015625, "learning_rate": 5.914715780986333e-05, "loss": 0.0422, "step": 13600 }, { "epoch": 3.353550295857988, "grad_norm": 0.31640625, "learning_rate": 5.911573929239971e-05, "loss": 0.0416, "step": 13602 }, { "epoch": 3.354043392504931, "grad_norm": 0.28125, "learning_rate": 5.9084325619949256e-05, "loss": 0.0395, "step": 13604 }, { "epoch": 3.3545364891518736, "grad_norm": 0.376953125, "learning_rate": 5.9052916796234656e-05, "loss": 0.0395, "step": 13606 }, { "epoch": 3.3550295857988166, "grad_norm": 0.32421875, "learning_rate": 5.9021512824978056e-05, "loss": 0.0398, "step": 13608 }, { "epoch": 3.355522682445759, "grad_norm": 0.37109375, "learning_rate": 5.899011370990093e-05, "loss": 0.039, "step": 13610 }, { "epoch": 3.356015779092702, "grad_norm": 0.380859375, "learning_rate": 5.8958719454724346e-05, "loss": 0.0417, "step": 13612 }, { "epoch": 3.356508875739645, "grad_norm": 0.30078125, "learning_rate": 5.892733006316861e-05, "loss": 0.0405, "step": 13614 }, { "epoch": 3.3570019723865876, "grad_norm": 0.3515625, "learning_rate": 5.889594553895361e-05, "loss": 0.0419, "step": 13616 }, { "epoch": 3.3574950690335306, "grad_norm": 0.46484375, "learning_rate": 5.886456588579855e-05, "loss": 0.0459, "step": 13618 }, { "epoch": 3.3579881656804735, "grad_norm": 0.31640625, "learning_rate": 5.883319110742213e-05, "loss": 0.0372, "step": 13620 }, { "epoch": 3.358481262327416, "grad_norm": 0.314453125, "learning_rate": 5.8801821207542365e-05, "loss": 0.0427, "step": 13622 }, { "epoch": 3.358974358974359, "grad_norm": 0.365234375, "learning_rate": 5.8770456189876844e-05, "loss": 0.042, "step": 13624 }, { "epoch": 3.359467455621302, "grad_norm": 0.3515625, "learning_rate": 5.873909605814245e-05, "loss": 0.0355, "step": 13626 }, { "epoch": 3.3599605522682445, "grad_norm": 0.30078125, "learning_rate": 5.8707740816055526e-05, "loss": 0.0358, "step": 13628 }, { "epoch": 3.3604536489151875, "grad_norm": 0.333984375, "learning_rate": 5.867639046733189e-05, "loss": 0.0445, "step": 13630 }, { "epoch": 3.36094674556213, "grad_norm": 0.3359375, "learning_rate": 5.8645045015686686e-05, "loss": 0.0445, "step": 13632 }, { "epoch": 3.361439842209073, "grad_norm": 0.4375, "learning_rate": 5.861370446483456e-05, "loss": 0.0419, "step": 13634 }, { "epoch": 3.361932938856016, "grad_norm": 0.2890625, "learning_rate": 5.8582368818489484e-05, "loss": 0.0351, "step": 13636 }, { "epoch": 3.3624260355029585, "grad_norm": 0.431640625, "learning_rate": 5.855103808036501e-05, "loss": 0.0421, "step": 13638 }, { "epoch": 3.3629191321499015, "grad_norm": 0.294921875, "learning_rate": 5.851971225417393e-05, "loss": 0.0387, "step": 13640 }, { "epoch": 3.363412228796844, "grad_norm": 0.353515625, "learning_rate": 5.84883913436285e-05, "loss": 0.042, "step": 13642 }, { "epoch": 3.363905325443787, "grad_norm": 0.34375, "learning_rate": 5.8457075352440514e-05, "loss": 0.0418, "step": 13644 }, { "epoch": 3.36439842209073, "grad_norm": 0.357421875, "learning_rate": 5.842576428432103e-05, "loss": 0.0417, "step": 13646 }, { "epoch": 3.3648915187376724, "grad_norm": 0.3359375, "learning_rate": 5.8394458142980615e-05, "loss": 0.0415, "step": 13648 }, { "epoch": 3.3653846153846154, "grad_norm": 0.396484375, "learning_rate": 5.836315693212916e-05, "loss": 0.0451, "step": 13650 }, { "epoch": 3.3658777120315584, "grad_norm": 0.337890625, "learning_rate": 5.83318606554761e-05, "loss": 0.044, "step": 13652 }, { "epoch": 3.366370808678501, "grad_norm": 0.31640625, "learning_rate": 5.83005693167302e-05, "loss": 0.0452, "step": 13654 }, { "epoch": 3.366863905325444, "grad_norm": 0.32421875, "learning_rate": 5.82692829195997e-05, "loss": 0.042, "step": 13656 }, { "epoch": 3.3673570019723864, "grad_norm": 0.341796875, "learning_rate": 5.823800146779212e-05, "loss": 0.0407, "step": 13658 }, { "epoch": 3.3678500986193294, "grad_norm": 0.302734375, "learning_rate": 5.820672496501454e-05, "loss": 0.0372, "step": 13660 }, { "epoch": 3.3683431952662723, "grad_norm": 0.359375, "learning_rate": 5.817545341497339e-05, "loss": 0.0389, "step": 13662 }, { "epoch": 3.368836291913215, "grad_norm": 0.310546875, "learning_rate": 5.814418682137454e-05, "loss": 0.0389, "step": 13664 }, { "epoch": 3.369329388560158, "grad_norm": 0.41796875, "learning_rate": 5.811292518792324e-05, "loss": 0.039, "step": 13666 }, { "epoch": 3.3698224852071004, "grad_norm": 0.423828125, "learning_rate": 5.8081668518324176e-05, "loss": 0.0394, "step": 13668 }, { "epoch": 3.3703155818540433, "grad_norm": 0.330078125, "learning_rate": 5.805041681628146e-05, "loss": 0.0384, "step": 13670 }, { "epoch": 3.3708086785009863, "grad_norm": 0.330078125, "learning_rate": 5.801917008549853e-05, "loss": 0.0416, "step": 13672 }, { "epoch": 3.371301775147929, "grad_norm": 0.35546875, "learning_rate": 5.798792832967834e-05, "loss": 0.044, "step": 13674 }, { "epoch": 3.371794871794872, "grad_norm": 0.31640625, "learning_rate": 5.795669155252321e-05, "loss": 0.0419, "step": 13676 }, { "epoch": 3.3722879684418148, "grad_norm": 0.302734375, "learning_rate": 5.792545975773489e-05, "loss": 0.0374, "step": 13678 }, { "epoch": 3.3727810650887573, "grad_norm": 0.4765625, "learning_rate": 5.789423294901448e-05, "loss": 0.0422, "step": 13680 }, { "epoch": 3.3732741617357003, "grad_norm": 0.380859375, "learning_rate": 5.786301113006256e-05, "loss": 0.0449, "step": 13682 }, { "epoch": 3.3737672583826432, "grad_norm": 0.357421875, "learning_rate": 5.7831794304579124e-05, "loss": 0.0426, "step": 13684 }, { "epoch": 3.3742603550295858, "grad_norm": 0.373046875, "learning_rate": 5.780058247626341e-05, "loss": 0.0419, "step": 13686 }, { "epoch": 3.3747534516765287, "grad_norm": 0.310546875, "learning_rate": 5.776937564881437e-05, "loss": 0.0369, "step": 13688 }, { "epoch": 3.3752465483234713, "grad_norm": 0.310546875, "learning_rate": 5.773817382593008e-05, "loss": 0.0429, "step": 13690 }, { "epoch": 3.3757396449704142, "grad_norm": 0.318359375, "learning_rate": 5.7706977011308126e-05, "loss": 0.0424, "step": 13692 }, { "epoch": 3.3762327416173568, "grad_norm": 0.3125, "learning_rate": 5.767578520864554e-05, "loss": 0.0435, "step": 13694 }, { "epoch": 3.3767258382642997, "grad_norm": 0.291015625, "learning_rate": 5.764459842163872e-05, "loss": 0.0379, "step": 13696 }, { "epoch": 3.3772189349112427, "grad_norm": 0.337890625, "learning_rate": 5.7613416653983497e-05, "loss": 0.0372, "step": 13698 }, { "epoch": 3.3777120315581852, "grad_norm": 0.283203125, "learning_rate": 5.758223990937499e-05, "loss": 0.0354, "step": 13700 }, { "epoch": 3.378205128205128, "grad_norm": 0.298828125, "learning_rate": 5.7551068191507954e-05, "loss": 0.0374, "step": 13702 }, { "epoch": 3.378698224852071, "grad_norm": 0.337890625, "learning_rate": 5.751990150407627e-05, "loss": 0.0401, "step": 13704 }, { "epoch": 3.3791913214990137, "grad_norm": 0.349609375, "learning_rate": 5.7488739850773496e-05, "loss": 0.0443, "step": 13706 }, { "epoch": 3.3796844181459567, "grad_norm": 0.349609375, "learning_rate": 5.745758323529239e-05, "loss": 0.0395, "step": 13708 }, { "epoch": 3.3801775147928996, "grad_norm": 0.443359375, "learning_rate": 5.742643166132519e-05, "loss": 0.0421, "step": 13710 }, { "epoch": 3.380670611439842, "grad_norm": 0.3203125, "learning_rate": 5.739528513256357e-05, "loss": 0.0433, "step": 13712 }, { "epoch": 3.381163708086785, "grad_norm": 0.34375, "learning_rate": 5.736414365269846e-05, "loss": 0.0401, "step": 13714 }, { "epoch": 3.3816568047337277, "grad_norm": 0.318359375, "learning_rate": 5.733300722542045e-05, "loss": 0.0383, "step": 13716 }, { "epoch": 3.3821499013806706, "grad_norm": 0.314453125, "learning_rate": 5.730187585441924e-05, "loss": 0.0392, "step": 13718 }, { "epoch": 3.3826429980276136, "grad_norm": 0.341796875, "learning_rate": 5.727074954338421e-05, "loss": 0.0428, "step": 13720 }, { "epoch": 3.383136094674556, "grad_norm": 0.33984375, "learning_rate": 5.723962829600388e-05, "loss": 0.0375, "step": 13722 }, { "epoch": 3.383629191321499, "grad_norm": 0.349609375, "learning_rate": 5.720851211596634e-05, "loss": 0.0379, "step": 13724 }, { "epoch": 3.3841222879684416, "grad_norm": 0.37109375, "learning_rate": 5.7177401006959075e-05, "loss": 0.041, "step": 13726 }, { "epoch": 3.3846153846153846, "grad_norm": 0.400390625, "learning_rate": 5.714629497266881e-05, "loss": 0.0447, "step": 13728 }, { "epoch": 3.3851084812623276, "grad_norm": 0.416015625, "learning_rate": 5.7115194016781934e-05, "loss": 0.0403, "step": 13730 }, { "epoch": 3.38560157790927, "grad_norm": 0.416015625, "learning_rate": 5.7084098142983924e-05, "loss": 0.039, "step": 13732 }, { "epoch": 3.386094674556213, "grad_norm": 0.291015625, "learning_rate": 5.7053007354959975e-05, "loss": 0.0339, "step": 13734 }, { "epoch": 3.386587771203156, "grad_norm": 0.326171875, "learning_rate": 5.7021921656394394e-05, "loss": 0.0355, "step": 13736 }, { "epoch": 3.3870808678500985, "grad_norm": 0.333984375, "learning_rate": 5.699084105097106e-05, "loss": 0.041, "step": 13738 }, { "epoch": 3.3875739644970415, "grad_norm": 0.349609375, "learning_rate": 5.6959765542373235e-05, "loss": 0.0368, "step": 13740 }, { "epoch": 3.388067061143984, "grad_norm": 0.375, "learning_rate": 5.692869513428341e-05, "loss": 0.0406, "step": 13742 }, { "epoch": 3.388560157790927, "grad_norm": 0.361328125, "learning_rate": 5.689762983038378e-05, "loss": 0.0442, "step": 13744 }, { "epoch": 3.38905325443787, "grad_norm": 0.353515625, "learning_rate": 5.686656963435558e-05, "loss": 0.0404, "step": 13746 }, { "epoch": 3.3895463510848125, "grad_norm": 0.3125, "learning_rate": 5.683551454987979e-05, "loss": 0.0403, "step": 13748 }, { "epoch": 3.3900394477317555, "grad_norm": 0.306640625, "learning_rate": 5.680446458063649e-05, "loss": 0.038, "step": 13750 }, { "epoch": 3.390532544378698, "grad_norm": 0.349609375, "learning_rate": 5.67734197303053e-05, "loss": 0.043, "step": 13752 }, { "epoch": 3.391025641025641, "grad_norm": 0.306640625, "learning_rate": 5.6742380002565235e-05, "loss": 0.0397, "step": 13754 }, { "epoch": 3.391518737672584, "grad_norm": 0.333984375, "learning_rate": 5.671134540109464e-05, "loss": 0.04, "step": 13756 }, { "epoch": 3.3920118343195265, "grad_norm": 0.326171875, "learning_rate": 5.668031592957136e-05, "loss": 0.04, "step": 13758 }, { "epoch": 3.3925049309664694, "grad_norm": 0.326171875, "learning_rate": 5.6649291591672424e-05, "loss": 0.0379, "step": 13760 }, { "epoch": 3.3929980276134124, "grad_norm": 0.3671875, "learning_rate": 5.661827239107456e-05, "loss": 0.0394, "step": 13762 }, { "epoch": 3.393491124260355, "grad_norm": 0.337890625, "learning_rate": 5.658725833145357e-05, "loss": 0.0412, "step": 13764 }, { "epoch": 3.393984220907298, "grad_norm": 0.2890625, "learning_rate": 5.6556249416484873e-05, "loss": 0.0424, "step": 13766 }, { "epoch": 3.394477317554241, "grad_norm": 0.3046875, "learning_rate": 5.6525245649843184e-05, "loss": 0.0412, "step": 13768 }, { "epoch": 3.3949704142011834, "grad_norm": 0.328125, "learning_rate": 5.649424703520261e-05, "loss": 0.0375, "step": 13770 }, { "epoch": 3.3954635108481264, "grad_norm": 0.29296875, "learning_rate": 5.6463253576236694e-05, "loss": 0.0365, "step": 13772 }, { "epoch": 3.395956607495069, "grad_norm": 0.322265625, "learning_rate": 5.643226527661824e-05, "loss": 0.0405, "step": 13774 }, { "epoch": 3.396449704142012, "grad_norm": 0.294921875, "learning_rate": 5.6401282140019685e-05, "loss": 0.0404, "step": 13776 }, { "epoch": 3.3969428007889544, "grad_norm": 0.322265625, "learning_rate": 5.637030417011258e-05, "loss": 0.0409, "step": 13778 }, { "epoch": 3.3974358974358974, "grad_norm": 0.291015625, "learning_rate": 5.633933137056801e-05, "loss": 0.0401, "step": 13780 }, { "epoch": 3.3979289940828403, "grad_norm": 0.35546875, "learning_rate": 5.630836374505647e-05, "loss": 0.0348, "step": 13782 }, { "epoch": 3.398422090729783, "grad_norm": 0.39453125, "learning_rate": 5.627740129724775e-05, "loss": 0.0449, "step": 13784 }, { "epoch": 3.398915187376726, "grad_norm": 0.28515625, "learning_rate": 5.62464440308111e-05, "loss": 0.0401, "step": 13786 }, { "epoch": 3.399408284023669, "grad_norm": 0.34765625, "learning_rate": 5.621549194941511e-05, "loss": 0.0387, "step": 13788 }, { "epoch": 3.3999013806706113, "grad_norm": 0.306640625, "learning_rate": 5.6184545056727836e-05, "loss": 0.0385, "step": 13790 }, { "epoch": 3.4003944773175543, "grad_norm": 0.349609375, "learning_rate": 5.6153603356416575e-05, "loss": 0.037, "step": 13792 }, { "epoch": 3.4008875739644973, "grad_norm": 0.3203125, "learning_rate": 5.612266685214811e-05, "loss": 0.0413, "step": 13794 }, { "epoch": 3.40138067061144, "grad_norm": 0.341796875, "learning_rate": 5.60917355475886e-05, "loss": 0.0437, "step": 13796 }, { "epoch": 3.4018737672583828, "grad_norm": 0.3203125, "learning_rate": 5.60608094464036e-05, "loss": 0.0433, "step": 13798 }, { "epoch": 3.4023668639053253, "grad_norm": 0.375, "learning_rate": 5.602988855225798e-05, "loss": 0.0438, "step": 13800 }, { "epoch": 3.4028599605522682, "grad_norm": 0.3046875, "learning_rate": 5.599897286881608e-05, "loss": 0.0396, "step": 13802 }, { "epoch": 3.403353057199211, "grad_norm": 0.37890625, "learning_rate": 5.5968062399741593e-05, "loss": 0.0395, "step": 13804 }, { "epoch": 3.4038461538461537, "grad_norm": 0.298828125, "learning_rate": 5.5937157148697514e-05, "loss": 0.0397, "step": 13806 }, { "epoch": 3.4043392504930967, "grad_norm": 0.337890625, "learning_rate": 5.590625711934634e-05, "loss": 0.0359, "step": 13808 }, { "epoch": 3.4048323471400392, "grad_norm": 0.30078125, "learning_rate": 5.587536231534989e-05, "loss": 0.0376, "step": 13810 }, { "epoch": 3.405325443786982, "grad_norm": 0.30859375, "learning_rate": 5.5844472740369355e-05, "loss": 0.0363, "step": 13812 }, { "epoch": 3.405818540433925, "grad_norm": 0.2890625, "learning_rate": 5.581358839806534e-05, "loss": 0.0372, "step": 13814 }, { "epoch": 3.4063116370808677, "grad_norm": 0.345703125, "learning_rate": 5.57827092920978e-05, "loss": 0.0405, "step": 13816 }, { "epoch": 3.4068047337278107, "grad_norm": 0.3515625, "learning_rate": 5.575183542612609e-05, "loss": 0.0392, "step": 13818 }, { "epoch": 3.4072978303747536, "grad_norm": 0.314453125, "learning_rate": 5.572096680380896e-05, "loss": 0.0366, "step": 13820 }, { "epoch": 3.407790927021696, "grad_norm": 0.3359375, "learning_rate": 5.5690103428804465e-05, "loss": 0.0379, "step": 13822 }, { "epoch": 3.408284023668639, "grad_norm": 0.326171875, "learning_rate": 5.565924530477009e-05, "loss": 0.0368, "step": 13824 }, { "epoch": 3.4087771203155817, "grad_norm": 0.333984375, "learning_rate": 5.562839243536273e-05, "loss": 0.0391, "step": 13826 }, { "epoch": 3.4092702169625246, "grad_norm": 0.30859375, "learning_rate": 5.5597544824238604e-05, "loss": 0.0405, "step": 13828 }, { "epoch": 3.4097633136094676, "grad_norm": 0.330078125, "learning_rate": 5.556670247505331e-05, "loss": 0.0361, "step": 13830 }, { "epoch": 3.41025641025641, "grad_norm": 0.322265625, "learning_rate": 5.553586539146187e-05, "loss": 0.0405, "step": 13832 }, { "epoch": 3.410749506903353, "grad_norm": 0.330078125, "learning_rate": 5.5505033577118684e-05, "loss": 0.0392, "step": 13834 }, { "epoch": 3.4112426035502956, "grad_norm": 0.3125, "learning_rate": 5.5474207035677364e-05, "loss": 0.0326, "step": 13836 }, { "epoch": 3.4117357001972386, "grad_norm": 0.3125, "learning_rate": 5.544338577079118e-05, "loss": 0.0349, "step": 13838 }, { "epoch": 3.4122287968441816, "grad_norm": 0.291015625, "learning_rate": 5.541256978611251e-05, "loss": 0.0384, "step": 13840 }, { "epoch": 3.412721893491124, "grad_norm": 0.3671875, "learning_rate": 5.5381759085293284e-05, "loss": 0.0388, "step": 13842 }, { "epoch": 3.413214990138067, "grad_norm": 0.384765625, "learning_rate": 5.535095367198471e-05, "loss": 0.0414, "step": 13844 }, { "epoch": 3.41370808678501, "grad_norm": 0.310546875, "learning_rate": 5.5320153549837415e-05, "loss": 0.0385, "step": 13846 }, { "epoch": 3.4142011834319526, "grad_norm": 0.376953125, "learning_rate": 5.528935872250143e-05, "loss": 0.0384, "step": 13848 }, { "epoch": 3.4146942800788955, "grad_norm": 0.37890625, "learning_rate": 5.5258569193626e-05, "loss": 0.0455, "step": 13850 }, { "epoch": 3.4151873767258385, "grad_norm": 0.380859375, "learning_rate": 5.5227784966859984e-05, "loss": 0.043, "step": 13852 }, { "epoch": 3.415680473372781, "grad_norm": 0.310546875, "learning_rate": 5.519700604585139e-05, "loss": 0.0379, "step": 13854 }, { "epoch": 3.416173570019724, "grad_norm": 0.28125, "learning_rate": 5.5166232434247725e-05, "loss": 0.0423, "step": 13856 }, { "epoch": 3.4166666666666665, "grad_norm": 0.3515625, "learning_rate": 5.5135464135695846e-05, "loss": 0.0397, "step": 13858 }, { "epoch": 3.4171597633136095, "grad_norm": 0.310546875, "learning_rate": 5.5104701153841956e-05, "loss": 0.0392, "step": 13860 }, { "epoch": 3.4176528599605525, "grad_norm": 0.388671875, "learning_rate": 5.507394349233168e-05, "loss": 0.0362, "step": 13862 }, { "epoch": 3.418145956607495, "grad_norm": 0.345703125, "learning_rate": 5.504319115480985e-05, "loss": 0.0403, "step": 13864 }, { "epoch": 3.418639053254438, "grad_norm": 0.330078125, "learning_rate": 5.501244414492096e-05, "loss": 0.0353, "step": 13866 }, { "epoch": 3.4191321499013805, "grad_norm": 0.306640625, "learning_rate": 5.498170246630854e-05, "loss": 0.0341, "step": 13868 }, { "epoch": 3.4196252465483234, "grad_norm": 0.314453125, "learning_rate": 5.495096612261581e-05, "loss": 0.0361, "step": 13870 }, { "epoch": 3.4201183431952664, "grad_norm": 0.341796875, "learning_rate": 5.492023511748506e-05, "loss": 0.0331, "step": 13872 }, { "epoch": 3.420611439842209, "grad_norm": 0.291015625, "learning_rate": 5.488950945455815e-05, "loss": 0.0386, "step": 13874 }, { "epoch": 3.421104536489152, "grad_norm": 0.35546875, "learning_rate": 5.485878913747629e-05, "loss": 0.0362, "step": 13876 }, { "epoch": 3.421597633136095, "grad_norm": 0.328125, "learning_rate": 5.482807416987985e-05, "loss": 0.0394, "step": 13878 }, { "epoch": 3.4220907297830374, "grad_norm": 0.3203125, "learning_rate": 5.479736455540892e-05, "loss": 0.0404, "step": 13880 }, { "epoch": 3.4225838264299804, "grad_norm": 0.271484375, "learning_rate": 5.4766660297702586e-05, "loss": 0.0371, "step": 13882 }, { "epoch": 3.423076923076923, "grad_norm": 0.31640625, "learning_rate": 5.473596140039965e-05, "loss": 0.0402, "step": 13884 }, { "epoch": 3.423570019723866, "grad_norm": 0.39453125, "learning_rate": 5.4705267867137955e-05, "loss": 0.0394, "step": 13886 }, { "epoch": 3.424063116370809, "grad_norm": 0.337890625, "learning_rate": 5.467457970155492e-05, "loss": 0.0401, "step": 13888 }, { "epoch": 3.4245562130177514, "grad_norm": 0.427734375, "learning_rate": 5.464389690728728e-05, "loss": 0.0378, "step": 13890 }, { "epoch": 3.4250493096646943, "grad_norm": 0.251953125, "learning_rate": 5.461321948797103e-05, "loss": 0.0366, "step": 13892 }, { "epoch": 3.425542406311637, "grad_norm": 0.33984375, "learning_rate": 5.458254744724176e-05, "loss": 0.0406, "step": 13894 }, { "epoch": 3.42603550295858, "grad_norm": 0.341796875, "learning_rate": 5.45518807887341e-05, "loss": 0.0425, "step": 13896 }, { "epoch": 3.426528599605523, "grad_norm": 0.298828125, "learning_rate": 5.4521219516082404e-05, "loss": 0.0376, "step": 13898 }, { "epoch": 3.4270216962524653, "grad_norm": 0.296875, "learning_rate": 5.4490563632920076e-05, "loss": 0.0394, "step": 13900 }, { "epoch": 3.4275147928994083, "grad_norm": 0.318359375, "learning_rate": 5.445991314288006e-05, "loss": 0.0365, "step": 13902 }, { "epoch": 3.4280078895463513, "grad_norm": 0.30078125, "learning_rate": 5.442926804959463e-05, "loss": 0.0389, "step": 13904 }, { "epoch": 3.428500986193294, "grad_norm": 0.29296875, "learning_rate": 5.439862835669529e-05, "loss": 0.0362, "step": 13906 }, { "epoch": 3.4289940828402368, "grad_norm": 0.337890625, "learning_rate": 5.436799406781317e-05, "loss": 0.0366, "step": 13908 }, { "epoch": 3.4294871794871793, "grad_norm": 0.29296875, "learning_rate": 5.433736518657846e-05, "loss": 0.0387, "step": 13910 }, { "epoch": 3.4299802761341223, "grad_norm": 0.314453125, "learning_rate": 5.4306741716621e-05, "loss": 0.0369, "step": 13912 }, { "epoch": 3.4304733727810652, "grad_norm": 0.30859375, "learning_rate": 5.427612366156971e-05, "loss": 0.0349, "step": 13914 }, { "epoch": 3.4309664694280078, "grad_norm": 0.330078125, "learning_rate": 5.424551102505308e-05, "loss": 0.0373, "step": 13916 }, { "epoch": 3.4314595660749507, "grad_norm": 0.384765625, "learning_rate": 5.4214903810698835e-05, "loss": 0.0392, "step": 13918 }, { "epoch": 3.4319526627218933, "grad_norm": 0.279296875, "learning_rate": 5.418430202213414e-05, "loss": 0.0337, "step": 13920 }, { "epoch": 3.4324457593688362, "grad_norm": 0.37109375, "learning_rate": 5.415370566298548e-05, "loss": 0.0402, "step": 13922 }, { "epoch": 3.432938856015779, "grad_norm": 0.296875, "learning_rate": 5.412311473687859e-05, "loss": 0.0362, "step": 13924 }, { "epoch": 3.4334319526627217, "grad_norm": 0.328125, "learning_rate": 5.409252924743884e-05, "loss": 0.0394, "step": 13926 }, { "epoch": 3.4339250493096647, "grad_norm": 0.36328125, "learning_rate": 5.406194919829064e-05, "loss": 0.0365, "step": 13928 }, { "epoch": 3.4344181459566077, "grad_norm": 0.388671875, "learning_rate": 5.4031374593057946e-05, "loss": 0.0412, "step": 13930 }, { "epoch": 3.43491124260355, "grad_norm": 0.328125, "learning_rate": 5.400080543536402e-05, "loss": 0.0417, "step": 13932 }, { "epoch": 3.435404339250493, "grad_norm": 0.310546875, "learning_rate": 5.397024172883147e-05, "loss": 0.0375, "step": 13934 }, { "epoch": 3.435897435897436, "grad_norm": 0.341796875, "learning_rate": 5.3939683477082324e-05, "loss": 0.0404, "step": 13936 }, { "epoch": 3.4363905325443787, "grad_norm": 0.41015625, "learning_rate": 5.390913068373777e-05, "loss": 0.0414, "step": 13938 }, { "epoch": 3.4368836291913216, "grad_norm": 0.490234375, "learning_rate": 5.3878583352418645e-05, "loss": 0.0395, "step": 13940 }, { "epoch": 3.437376725838264, "grad_norm": 0.30078125, "learning_rate": 5.384804148674487e-05, "loss": 0.0353, "step": 13942 }, { "epoch": 3.437869822485207, "grad_norm": 0.390625, "learning_rate": 5.381750509033585e-05, "loss": 0.0339, "step": 13944 }, { "epoch": 3.43836291913215, "grad_norm": 0.3046875, "learning_rate": 5.378697416681033e-05, "loss": 0.0382, "step": 13946 }, { "epoch": 3.4388560157790926, "grad_norm": 0.314453125, "learning_rate": 5.3756448719786403e-05, "loss": 0.04, "step": 13948 }, { "epoch": 3.4393491124260356, "grad_norm": 0.333984375, "learning_rate": 5.372592875288147e-05, "loss": 0.0371, "step": 13950 }, { "epoch": 3.439842209072978, "grad_norm": 0.326171875, "learning_rate": 5.3695414269712366e-05, "loss": 0.037, "step": 13952 }, { "epoch": 3.440335305719921, "grad_norm": 0.357421875, "learning_rate": 5.366490527389523e-05, "loss": 0.0373, "step": 13954 }, { "epoch": 3.440828402366864, "grad_norm": 0.345703125, "learning_rate": 5.363440176904547e-05, "loss": 0.0377, "step": 13956 }, { "epoch": 3.4413214990138066, "grad_norm": 0.369140625, "learning_rate": 5.360390375877799e-05, "loss": 0.0349, "step": 13958 }, { "epoch": 3.4418145956607495, "grad_norm": 0.328125, "learning_rate": 5.357341124670694e-05, "loss": 0.0366, "step": 13960 }, { "epoch": 3.4423076923076925, "grad_norm": 0.349609375, "learning_rate": 5.354292423644589e-05, "loss": 0.039, "step": 13962 }, { "epoch": 3.442800788954635, "grad_norm": 0.34765625, "learning_rate": 5.3512442731607694e-05, "loss": 0.035, "step": 13964 }, { "epoch": 3.443293885601578, "grad_norm": 0.3046875, "learning_rate": 5.3481966735804586e-05, "loss": 0.0364, "step": 13966 }, { "epoch": 3.4437869822485205, "grad_norm": 0.30078125, "learning_rate": 5.345149625264814e-05, "loss": 0.0354, "step": 13968 }, { "epoch": 3.4442800788954635, "grad_norm": 0.353515625, "learning_rate": 5.342103128574932e-05, "loss": 0.039, "step": 13970 }, { "epoch": 3.4447731755424065, "grad_norm": 0.31640625, "learning_rate": 5.339057183871832e-05, "loss": 0.0356, "step": 13972 }, { "epoch": 3.445266272189349, "grad_norm": 0.349609375, "learning_rate": 5.33601179151648e-05, "loss": 0.0395, "step": 13974 }, { "epoch": 3.445759368836292, "grad_norm": 0.3125, "learning_rate": 5.33296695186977e-05, "loss": 0.0382, "step": 13976 }, { "epoch": 3.4462524654832345, "grad_norm": 0.30078125, "learning_rate": 5.3299226652925336e-05, "loss": 0.037, "step": 13978 }, { "epoch": 3.4467455621301775, "grad_norm": 0.396484375, "learning_rate": 5.326878932145536e-05, "loss": 0.0344, "step": 13980 }, { "epoch": 3.4472386587771204, "grad_norm": 0.34375, "learning_rate": 5.323835752789477e-05, "loss": 0.0428, "step": 13982 }, { "epoch": 3.447731755424063, "grad_norm": 0.2734375, "learning_rate": 5.3207931275849934e-05, "loss": 0.0361, "step": 13984 }, { "epoch": 3.448224852071006, "grad_norm": 0.298828125, "learning_rate": 5.3177510568926434e-05, "loss": 0.0351, "step": 13986 }, { "epoch": 3.448717948717949, "grad_norm": 0.28125, "learning_rate": 5.314709541072938e-05, "loss": 0.0342, "step": 13988 }, { "epoch": 3.4492110453648914, "grad_norm": 0.373046875, "learning_rate": 5.311668580486311e-05, "loss": 0.036, "step": 13990 }, { "epoch": 3.4497041420118344, "grad_norm": 0.35546875, "learning_rate": 5.308628175493132e-05, "loss": 0.0383, "step": 13992 }, { "epoch": 3.4501972386587774, "grad_norm": 0.337890625, "learning_rate": 5.3055883264537085e-05, "loss": 0.0371, "step": 13994 }, { "epoch": 3.45069033530572, "grad_norm": 0.34375, "learning_rate": 5.302549033728279e-05, "loss": 0.0405, "step": 13996 }, { "epoch": 3.451183431952663, "grad_norm": 0.3046875, "learning_rate": 5.299510297677015e-05, "loss": 0.0329, "step": 13998 }, { "epoch": 3.4516765285996054, "grad_norm": 0.412109375, "learning_rate": 5.296472118660025e-05, "loss": 0.0372, "step": 14000 }, { "epoch": 3.4521696252465484, "grad_norm": 0.318359375, "learning_rate": 5.293434497037355e-05, "loss": 0.0406, "step": 14002 }, { "epoch": 3.452662721893491, "grad_norm": 0.30859375, "learning_rate": 5.290397433168972e-05, "loss": 0.0396, "step": 14004 }, { "epoch": 3.453155818540434, "grad_norm": 0.361328125, "learning_rate": 5.2873609274147864e-05, "loss": 0.0344, "step": 14006 }, { "epoch": 3.453648915187377, "grad_norm": 0.310546875, "learning_rate": 5.284324980134644e-05, "loss": 0.038, "step": 14008 }, { "epoch": 3.4541420118343193, "grad_norm": 0.2890625, "learning_rate": 5.28128959168832e-05, "loss": 0.0383, "step": 14010 }, { "epoch": 3.4546351084812623, "grad_norm": 0.443359375, "learning_rate": 5.278254762435525e-05, "loss": 0.0425, "step": 14012 }, { "epoch": 3.4551282051282053, "grad_norm": 0.408203125, "learning_rate": 5.275220492735904e-05, "loss": 0.0382, "step": 14014 }, { "epoch": 3.455621301775148, "grad_norm": 0.32421875, "learning_rate": 5.272186782949038e-05, "loss": 0.0368, "step": 14016 }, { "epoch": 3.456114398422091, "grad_norm": 0.322265625, "learning_rate": 5.2691536334344274e-05, "loss": 0.0343, "step": 14018 }, { "epoch": 3.4566074950690338, "grad_norm": 0.41796875, "learning_rate": 5.266121044551533e-05, "loss": 0.0375, "step": 14020 }, { "epoch": 3.4571005917159763, "grad_norm": 0.4296875, "learning_rate": 5.2630890166597215e-05, "loss": 0.0387, "step": 14022 }, { "epoch": 3.4575936883629192, "grad_norm": 0.3046875, "learning_rate": 5.260057550118307e-05, "loss": 0.039, "step": 14024 }, { "epoch": 3.4580867850098618, "grad_norm": 0.279296875, "learning_rate": 5.25702664528654e-05, "loss": 0.0391, "step": 14026 }, { "epoch": 3.4585798816568047, "grad_norm": 0.298828125, "learning_rate": 5.253996302523596e-05, "loss": 0.0365, "step": 14028 }, { "epoch": 3.4590729783037477, "grad_norm": 0.27734375, "learning_rate": 5.250966522188592e-05, "loss": 0.0328, "step": 14030 }, { "epoch": 3.4595660749506902, "grad_norm": 0.3828125, "learning_rate": 5.2479373046405625e-05, "loss": 0.0364, "step": 14032 }, { "epoch": 3.460059171597633, "grad_norm": 0.314453125, "learning_rate": 5.244908650238503e-05, "loss": 0.0348, "step": 14034 }, { "epoch": 3.4605522682445757, "grad_norm": 0.302734375, "learning_rate": 5.241880559341312e-05, "loss": 0.0379, "step": 14036 }, { "epoch": 3.4610453648915187, "grad_norm": 0.28515625, "learning_rate": 5.238853032307842e-05, "loss": 0.0354, "step": 14038 }, { "epoch": 3.4615384615384617, "grad_norm": 0.3203125, "learning_rate": 5.23582606949687e-05, "loss": 0.0348, "step": 14040 }, { "epoch": 3.462031558185404, "grad_norm": 0.3046875, "learning_rate": 5.232799671267109e-05, "loss": 0.037, "step": 14042 }, { "epoch": 3.462524654832347, "grad_norm": 0.31640625, "learning_rate": 5.229773837977208e-05, "loss": 0.0393, "step": 14044 }, { "epoch": 3.46301775147929, "grad_norm": 0.361328125, "learning_rate": 5.226748569985731e-05, "loss": 0.0353, "step": 14046 }, { "epoch": 3.4635108481262327, "grad_norm": 0.3125, "learning_rate": 5.223723867651208e-05, "loss": 0.0437, "step": 14048 }, { "epoch": 3.4640039447731756, "grad_norm": 0.302734375, "learning_rate": 5.220699731332071e-05, "loss": 0.0375, "step": 14050 }, { "epoch": 3.464497041420118, "grad_norm": 0.3515625, "learning_rate": 5.2176761613866974e-05, "loss": 0.0391, "step": 14052 }, { "epoch": 3.464990138067061, "grad_norm": 0.341796875, "learning_rate": 5.214653158173401e-05, "loss": 0.0371, "step": 14054 }, { "epoch": 3.465483234714004, "grad_norm": 0.306640625, "learning_rate": 5.2116307220504245e-05, "loss": 0.0392, "step": 14056 }, { "epoch": 3.4659763313609466, "grad_norm": 0.29296875, "learning_rate": 5.208608853375945e-05, "loss": 0.0367, "step": 14058 }, { "epoch": 3.4664694280078896, "grad_norm": 0.337890625, "learning_rate": 5.205587552508061e-05, "loss": 0.0377, "step": 14060 }, { "epoch": 3.466962524654832, "grad_norm": 0.322265625, "learning_rate": 5.202566819804828e-05, "loss": 0.038, "step": 14062 }, { "epoch": 3.467455621301775, "grad_norm": 0.318359375, "learning_rate": 5.199546655624208e-05, "loss": 0.0393, "step": 14064 }, { "epoch": 3.467948717948718, "grad_norm": 0.349609375, "learning_rate": 5.1965270603241124e-05, "loss": 0.0404, "step": 14066 }, { "epoch": 3.4684418145956606, "grad_norm": 0.345703125, "learning_rate": 5.1935080342623786e-05, "loss": 0.0411, "step": 14068 }, { "epoch": 3.4689349112426036, "grad_norm": 0.294921875, "learning_rate": 5.1904895777967796e-05, "loss": 0.0355, "step": 14070 }, { "epoch": 3.4694280078895465, "grad_norm": 0.31640625, "learning_rate": 5.1874716912850216e-05, "loss": 0.0342, "step": 14072 }, { "epoch": 3.469921104536489, "grad_norm": 0.337890625, "learning_rate": 5.184454375084731e-05, "loss": 0.0393, "step": 14074 }, { "epoch": 3.470414201183432, "grad_norm": 0.349609375, "learning_rate": 5.1814376295534895e-05, "loss": 0.0385, "step": 14076 }, { "epoch": 3.470907297830375, "grad_norm": 0.29296875, "learning_rate": 5.178421455048786e-05, "loss": 0.0369, "step": 14078 }, { "epoch": 3.4714003944773175, "grad_norm": 0.291015625, "learning_rate": 5.175405851928068e-05, "loss": 0.0366, "step": 14080 }, { "epoch": 3.4718934911242605, "grad_norm": 0.2734375, "learning_rate": 5.1723908205486896e-05, "loss": 0.0347, "step": 14082 }, { "epoch": 3.472386587771203, "grad_norm": 0.35546875, "learning_rate": 5.1693763612679526e-05, "loss": 0.0357, "step": 14084 }, { "epoch": 3.472879684418146, "grad_norm": 0.2890625, "learning_rate": 5.1663624744430915e-05, "loss": 0.0394, "step": 14086 }, { "epoch": 3.4733727810650885, "grad_norm": 0.302734375, "learning_rate": 5.163349160431257e-05, "loss": 0.0347, "step": 14088 }, { "epoch": 3.4738658777120315, "grad_norm": 0.33203125, "learning_rate": 5.1603364195895596e-05, "loss": 0.0361, "step": 14090 }, { "epoch": 3.4743589743589745, "grad_norm": 0.287109375, "learning_rate": 5.15732425227501e-05, "loss": 0.0352, "step": 14092 }, { "epoch": 3.474852071005917, "grad_norm": 0.31640625, "learning_rate": 5.1543126588445825e-05, "loss": 0.0399, "step": 14094 }, { "epoch": 3.47534516765286, "grad_norm": 0.392578125, "learning_rate": 5.151301639655155e-05, "loss": 0.038, "step": 14096 }, { "epoch": 3.475838264299803, "grad_norm": 0.384765625, "learning_rate": 5.148291195063555e-05, "loss": 0.0349, "step": 14098 }, { "epoch": 3.4763313609467454, "grad_norm": 0.33984375, "learning_rate": 5.1452813254265364e-05, "loss": 0.0365, "step": 14100 }, { "epoch": 3.4768244575936884, "grad_norm": 0.3671875, "learning_rate": 5.142272031100788e-05, "loss": 0.0377, "step": 14102 }, { "epoch": 3.4773175542406314, "grad_norm": 0.322265625, "learning_rate": 5.139263312442928e-05, "loss": 0.0357, "step": 14104 }, { "epoch": 3.477810650887574, "grad_norm": 0.271484375, "learning_rate": 5.1362551698094964e-05, "loss": 0.0357, "step": 14106 }, { "epoch": 3.478303747534517, "grad_norm": 0.318359375, "learning_rate": 5.133247603556992e-05, "loss": 0.0394, "step": 14108 }, { "epoch": 3.4787968441814594, "grad_norm": 0.341796875, "learning_rate": 5.130240614041815e-05, "loss": 0.0358, "step": 14110 }, { "epoch": 3.4792899408284024, "grad_norm": 0.33203125, "learning_rate": 5.1272342016203126e-05, "loss": 0.0343, "step": 14112 }, { "epoch": 3.4797830374753453, "grad_norm": 0.376953125, "learning_rate": 5.124228366648765e-05, "loss": 0.0382, "step": 14114 }, { "epoch": 3.480276134122288, "grad_norm": 0.361328125, "learning_rate": 5.1212231094833784e-05, "loss": 0.039, "step": 14116 }, { "epoch": 3.480769230769231, "grad_norm": 0.30859375, "learning_rate": 5.118218430480297e-05, "loss": 0.0391, "step": 14118 }, { "epoch": 3.4812623274161734, "grad_norm": 0.36328125, "learning_rate": 5.11521432999558e-05, "loss": 0.0396, "step": 14120 }, { "epoch": 3.4817554240631163, "grad_norm": 0.291015625, "learning_rate": 5.112210808385246e-05, "loss": 0.0371, "step": 14122 }, { "epoch": 3.4822485207100593, "grad_norm": 0.2734375, "learning_rate": 5.109207866005217e-05, "loss": 0.0371, "step": 14124 }, { "epoch": 3.482741617357002, "grad_norm": 0.2890625, "learning_rate": 5.106205503211363e-05, "loss": 0.0344, "step": 14126 }, { "epoch": 3.483234714003945, "grad_norm": 0.33984375, "learning_rate": 5.1032037203594794e-05, "loss": 0.0385, "step": 14128 }, { "epoch": 3.4837278106508878, "grad_norm": 0.29296875, "learning_rate": 5.100202517805297e-05, "loss": 0.0355, "step": 14130 }, { "epoch": 3.4842209072978303, "grad_norm": 0.31640625, "learning_rate": 5.097201895904471e-05, "loss": 0.036, "step": 14132 }, { "epoch": 3.4847140039447733, "grad_norm": 0.341796875, "learning_rate": 5.094201855012596e-05, "loss": 0.0376, "step": 14134 }, { "epoch": 3.485207100591716, "grad_norm": 0.279296875, "learning_rate": 5.0912023954851953e-05, "loss": 0.0366, "step": 14136 }, { "epoch": 3.4857001972386588, "grad_norm": 0.296875, "learning_rate": 5.088203517677712e-05, "loss": 0.0373, "step": 14138 }, { "epoch": 3.4861932938856017, "grad_norm": 0.294921875, "learning_rate": 5.085205221945537e-05, "loss": 0.0318, "step": 14140 }, { "epoch": 3.4866863905325443, "grad_norm": 0.33984375, "learning_rate": 5.082207508643985e-05, "loss": 0.0374, "step": 14142 }, { "epoch": 3.4871794871794872, "grad_norm": 0.287109375, "learning_rate": 5.0792103781282997e-05, "loss": 0.0366, "step": 14144 }, { "epoch": 3.4876725838264298, "grad_norm": 0.36328125, "learning_rate": 5.0762138307536586e-05, "loss": 0.038, "step": 14146 }, { "epoch": 3.4881656804733727, "grad_norm": 0.28125, "learning_rate": 5.07321786687517e-05, "loss": 0.0351, "step": 14148 }, { "epoch": 3.4886587771203157, "grad_norm": 0.326171875, "learning_rate": 5.0702224868478706e-05, "loss": 0.0378, "step": 14150 }, { "epoch": 3.489151873767258, "grad_norm": 0.345703125, "learning_rate": 5.0672276910267366e-05, "loss": 0.0366, "step": 14152 }, { "epoch": 3.489644970414201, "grad_norm": 0.322265625, "learning_rate": 5.064233479766657e-05, "loss": 0.042, "step": 14154 }, { "epoch": 3.490138067061144, "grad_norm": 0.302734375, "learning_rate": 5.061239853422468e-05, "loss": 0.0338, "step": 14156 }, { "epoch": 3.4906311637080867, "grad_norm": 0.30078125, "learning_rate": 5.058246812348931e-05, "loss": 0.0358, "step": 14158 }, { "epoch": 3.4911242603550297, "grad_norm": 0.283203125, "learning_rate": 5.0552543569007384e-05, "loss": 0.0329, "step": 14160 }, { "epoch": 3.4916173570019726, "grad_norm": 0.296875, "learning_rate": 5.052262487432513e-05, "loss": 0.0351, "step": 14162 }, { "epoch": 3.492110453648915, "grad_norm": 0.3203125, "learning_rate": 5.0492712042988065e-05, "loss": 0.0348, "step": 14164 }, { "epoch": 3.492603550295858, "grad_norm": 0.33984375, "learning_rate": 5.046280507854106e-05, "loss": 0.0359, "step": 14166 }, { "epoch": 3.4930966469428006, "grad_norm": 0.302734375, "learning_rate": 5.0432903984528225e-05, "loss": 0.0349, "step": 14168 }, { "epoch": 3.4935897435897436, "grad_norm": 0.310546875, "learning_rate": 5.0403008764492995e-05, "loss": 0.0372, "step": 14170 }, { "epoch": 3.4940828402366866, "grad_norm": 0.29296875, "learning_rate": 5.037311942197815e-05, "loss": 0.0294, "step": 14172 }, { "epoch": 3.494575936883629, "grad_norm": 0.294921875, "learning_rate": 5.0343235960525725e-05, "loss": 0.0352, "step": 14174 }, { "epoch": 3.495069033530572, "grad_norm": 0.287109375, "learning_rate": 5.03133583836771e-05, "loss": 0.0356, "step": 14176 }, { "epoch": 3.4955621301775146, "grad_norm": 0.31640625, "learning_rate": 5.028348669497292e-05, "loss": 0.0419, "step": 14178 }, { "epoch": 3.4960552268244576, "grad_norm": 0.306640625, "learning_rate": 5.025362089795317e-05, "loss": 0.0373, "step": 14180 }, { "epoch": 3.4965483234714005, "grad_norm": 0.3359375, "learning_rate": 5.0223760996157024e-05, "loss": 0.0397, "step": 14182 }, { "epoch": 3.497041420118343, "grad_norm": 0.400390625, "learning_rate": 5.01939069931232e-05, "loss": 0.0364, "step": 14184 }, { "epoch": 3.497534516765286, "grad_norm": 0.3046875, "learning_rate": 5.016405889238942e-05, "loss": 0.0375, "step": 14186 }, { "epoch": 3.498027613412229, "grad_norm": 0.333984375, "learning_rate": 5.013421669749293e-05, "loss": 0.0376, "step": 14188 }, { "epoch": 3.4985207100591715, "grad_norm": 0.330078125, "learning_rate": 5.010438041197017e-05, "loss": 0.0369, "step": 14190 }, { "epoch": 3.4990138067061145, "grad_norm": 0.34375, "learning_rate": 5.0074550039356905e-05, "loss": 0.0383, "step": 14192 }, { "epoch": 3.499506903353057, "grad_norm": 0.376953125, "learning_rate": 5.004472558318827e-05, "loss": 0.0449, "step": 14194 }, { "epoch": 3.5, "grad_norm": 0.33203125, "learning_rate": 5.001490704699847e-05, "loss": 0.0441, "step": 14196 }, { "epoch": 3.5004930966469425, "grad_norm": 0.29296875, "learning_rate": 4.998509443432134e-05, "loss": 0.037, "step": 14198 }, { "epoch": 3.5009861932938855, "grad_norm": 0.265625, "learning_rate": 4.9955287748689736e-05, "loss": 0.036, "step": 14200 }, { "epoch": 3.5014792899408285, "grad_norm": 0.36328125, "learning_rate": 4.992548699363595e-05, "loss": 0.0381, "step": 14202 }, { "epoch": 3.501972386587771, "grad_norm": 0.310546875, "learning_rate": 4.9895692172691534e-05, "loss": 0.0412, "step": 14204 }, { "epoch": 3.502465483234714, "grad_norm": 0.302734375, "learning_rate": 4.986590328938734e-05, "loss": 0.0361, "step": 14206 }, { "epoch": 3.502958579881657, "grad_norm": 0.29296875, "learning_rate": 4.9836120347253554e-05, "loss": 0.0367, "step": 14208 }, { "epoch": 3.5034516765285995, "grad_norm": 0.322265625, "learning_rate": 4.980634334981952e-05, "loss": 0.0412, "step": 14210 }, { "epoch": 3.5039447731755424, "grad_norm": 0.322265625, "learning_rate": 4.9776572300614125e-05, "loss": 0.0351, "step": 14212 }, { "epoch": 3.5044378698224854, "grad_norm": 0.376953125, "learning_rate": 4.9746807203165225e-05, "loss": 0.0364, "step": 14214 }, { "epoch": 3.504930966469428, "grad_norm": 0.27734375, "learning_rate": 4.971704806100035e-05, "loss": 0.036, "step": 14216 }, { "epoch": 3.505424063116371, "grad_norm": 0.345703125, "learning_rate": 4.968729487764596e-05, "loss": 0.0355, "step": 14218 }, { "epoch": 3.505917159763314, "grad_norm": 0.279296875, "learning_rate": 4.965754765662806e-05, "loss": 0.038, "step": 14220 }, { "epoch": 3.5064102564102564, "grad_norm": 0.380859375, "learning_rate": 4.962780640147187e-05, "loss": 0.0372, "step": 14222 }, { "epoch": 3.5069033530571994, "grad_norm": 0.3828125, "learning_rate": 4.959807111570178e-05, "loss": 0.0377, "step": 14224 }, { "epoch": 3.507396449704142, "grad_norm": 0.263671875, "learning_rate": 4.956834180284176e-05, "loss": 0.0383, "step": 14226 }, { "epoch": 3.507889546351085, "grad_norm": 0.310546875, "learning_rate": 4.9538618466414724e-05, "loss": 0.0346, "step": 14228 }, { "epoch": 3.5083826429980274, "grad_norm": 0.3359375, "learning_rate": 4.9508901109943215e-05, "loss": 0.0361, "step": 14230 }, { "epoch": 3.5088757396449703, "grad_norm": 0.345703125, "learning_rate": 4.947918973694881e-05, "loss": 0.0367, "step": 14232 }, { "epoch": 3.5093688362919133, "grad_norm": 0.3046875, "learning_rate": 4.944948435095248e-05, "loss": 0.0359, "step": 14234 }, { "epoch": 3.509861932938856, "grad_norm": 0.33984375, "learning_rate": 4.9419784955474524e-05, "loss": 0.0314, "step": 14236 }, { "epoch": 3.510355029585799, "grad_norm": 0.296875, "learning_rate": 4.9390091554034393e-05, "loss": 0.0352, "step": 14238 }, { "epoch": 3.510848126232742, "grad_norm": 0.388671875, "learning_rate": 4.936040415015105e-05, "loss": 0.0385, "step": 14240 }, { "epoch": 3.5113412228796843, "grad_norm": 0.3203125, "learning_rate": 4.9330722747342474e-05, "loss": 0.0362, "step": 14242 }, { "epoch": 3.5118343195266273, "grad_norm": 0.353515625, "learning_rate": 4.930104734912623e-05, "loss": 0.0381, "step": 14244 }, { "epoch": 3.5123274161735702, "grad_norm": 0.314453125, "learning_rate": 4.92713779590189e-05, "loss": 0.0361, "step": 14246 }, { "epoch": 3.5128205128205128, "grad_norm": 0.318359375, "learning_rate": 4.9241714580536526e-05, "loss": 0.037, "step": 14248 }, { "epoch": 3.5133136094674557, "grad_norm": 0.478515625, "learning_rate": 4.9212057217194405e-05, "loss": 0.0399, "step": 14250 }, { "epoch": 3.5138067061143983, "grad_norm": 0.3828125, "learning_rate": 4.918240587250699e-05, "loss": 0.0357, "step": 14252 }, { "epoch": 3.5142998027613412, "grad_norm": 0.275390625, "learning_rate": 4.915276054998828e-05, "loss": 0.0346, "step": 14254 }, { "epoch": 3.5147928994082838, "grad_norm": 0.337890625, "learning_rate": 4.9123121253151275e-05, "loss": 0.0364, "step": 14256 }, { "epoch": 3.5152859960552267, "grad_norm": 0.279296875, "learning_rate": 4.909348798550852e-05, "loss": 0.0346, "step": 14258 }, { "epoch": 3.5157790927021697, "grad_norm": 0.296875, "learning_rate": 4.906386075057164e-05, "loss": 0.0349, "step": 14260 }, { "epoch": 3.5162721893491122, "grad_norm": 0.30859375, "learning_rate": 4.903423955185165e-05, "loss": 0.0358, "step": 14262 }, { "epoch": 3.516765285996055, "grad_norm": 0.3046875, "learning_rate": 4.9004624392858835e-05, "loss": 0.0352, "step": 14264 }, { "epoch": 3.517258382642998, "grad_norm": 0.37109375, "learning_rate": 4.897501527710274e-05, "loss": 0.0351, "step": 14266 }, { "epoch": 3.5177514792899407, "grad_norm": 0.326171875, "learning_rate": 4.8945412208092265e-05, "loss": 0.0351, "step": 14268 }, { "epoch": 3.5182445759368837, "grad_norm": 0.29296875, "learning_rate": 4.891581518933541e-05, "loss": 0.0389, "step": 14270 }, { "epoch": 3.5187376725838266, "grad_norm": 0.291015625, "learning_rate": 4.888622422433976e-05, "loss": 0.0347, "step": 14272 }, { "epoch": 3.519230769230769, "grad_norm": 0.341796875, "learning_rate": 4.8856639316611885e-05, "loss": 0.0376, "step": 14274 }, { "epoch": 3.519723865877712, "grad_norm": 0.322265625, "learning_rate": 4.882706046965779e-05, "loss": 0.0384, "step": 14276 }, { "epoch": 3.520216962524655, "grad_norm": 0.29296875, "learning_rate": 4.879748768698276e-05, "loss": 0.0373, "step": 14278 }, { "epoch": 3.5207100591715976, "grad_norm": 0.30078125, "learning_rate": 4.8767920972091296e-05, "loss": 0.0355, "step": 14280 }, { "epoch": 3.5212031558185406, "grad_norm": 0.31640625, "learning_rate": 4.8738360328487256e-05, "loss": 0.0355, "step": 14282 }, { "epoch": 3.521696252465483, "grad_norm": 0.33203125, "learning_rate": 4.8708805759673714e-05, "loss": 0.0344, "step": 14284 }, { "epoch": 3.522189349112426, "grad_norm": 0.310546875, "learning_rate": 4.86792572691531e-05, "loss": 0.0346, "step": 14286 }, { "epoch": 3.5226824457593686, "grad_norm": 0.33203125, "learning_rate": 4.864971486042701e-05, "loss": 0.0374, "step": 14288 }, { "epoch": 3.5231755424063116, "grad_norm": 0.29296875, "learning_rate": 4.862017853699641e-05, "loss": 0.0345, "step": 14290 }, { "epoch": 3.5236686390532546, "grad_norm": 0.359375, "learning_rate": 4.8590648302361516e-05, "loss": 0.0384, "step": 14292 }, { "epoch": 3.524161735700197, "grad_norm": 0.349609375, "learning_rate": 4.8561124160021834e-05, "loss": 0.0397, "step": 14294 }, { "epoch": 3.52465483234714, "grad_norm": 0.365234375, "learning_rate": 4.8531606113476134e-05, "loss": 0.0363, "step": 14296 }, { "epoch": 3.525147928994083, "grad_norm": 0.279296875, "learning_rate": 4.850209416622248e-05, "loss": 0.0349, "step": 14298 }, { "epoch": 3.5256410256410255, "grad_norm": 0.328125, "learning_rate": 4.8472588321758226e-05, "loss": 0.037, "step": 14300 }, { "epoch": 3.5261341222879685, "grad_norm": 0.3984375, "learning_rate": 4.844308858357992e-05, "loss": 0.0366, "step": 14302 }, { "epoch": 3.5266272189349115, "grad_norm": 0.333984375, "learning_rate": 4.841359495518346e-05, "loss": 0.0376, "step": 14304 }, { "epoch": 3.527120315581854, "grad_norm": 0.3359375, "learning_rate": 4.838410744006403e-05, "loss": 0.039, "step": 14306 }, { "epoch": 3.527613412228797, "grad_norm": 0.26171875, "learning_rate": 4.8354626041716045e-05, "loss": 0.0333, "step": 14308 }, { "epoch": 3.5281065088757395, "grad_norm": 0.34375, "learning_rate": 4.832515076363323e-05, "loss": 0.034, "step": 14310 }, { "epoch": 3.5285996055226825, "grad_norm": 0.3359375, "learning_rate": 4.829568160930858e-05, "loss": 0.0369, "step": 14312 }, { "epoch": 3.529092702169625, "grad_norm": 0.294921875, "learning_rate": 4.826621858223431e-05, "loss": 0.0366, "step": 14314 }, { "epoch": 3.529585798816568, "grad_norm": 0.291015625, "learning_rate": 4.8236761685902045e-05, "loss": 0.0333, "step": 14316 }, { "epoch": 3.530078895463511, "grad_norm": 0.3515625, "learning_rate": 4.820731092380247e-05, "loss": 0.0413, "step": 14318 }, { "epoch": 3.5305719921104535, "grad_norm": 0.3203125, "learning_rate": 4.8177866299425746e-05, "loss": 0.0385, "step": 14320 }, { "epoch": 3.5310650887573964, "grad_norm": 0.275390625, "learning_rate": 4.814842781626119e-05, "loss": 0.037, "step": 14322 }, { "epoch": 3.5315581854043394, "grad_norm": 0.34765625, "learning_rate": 4.811899547779743e-05, "loss": 0.0402, "step": 14324 }, { "epoch": 3.532051282051282, "grad_norm": 0.30859375, "learning_rate": 4.8089569287522396e-05, "loss": 0.0377, "step": 14326 }, { "epoch": 3.532544378698225, "grad_norm": 0.294921875, "learning_rate": 4.8060149248923215e-05, "loss": 0.0372, "step": 14328 }, { "epoch": 3.533037475345168, "grad_norm": 0.29296875, "learning_rate": 4.803073536548639e-05, "loss": 0.0354, "step": 14330 }, { "epoch": 3.5335305719921104, "grad_norm": 0.3515625, "learning_rate": 4.800132764069755e-05, "loss": 0.0344, "step": 14332 }, { "epoch": 3.5340236686390534, "grad_norm": 0.314453125, "learning_rate": 4.79719260780417e-05, "loss": 0.0358, "step": 14334 }, { "epoch": 3.534516765285996, "grad_norm": 0.328125, "learning_rate": 4.7942530681003084e-05, "loss": 0.0337, "step": 14336 }, { "epoch": 3.535009861932939, "grad_norm": 0.2734375, "learning_rate": 4.791314145306526e-05, "loss": 0.0309, "step": 14338 }, { "epoch": 3.5355029585798814, "grad_norm": 0.3046875, "learning_rate": 4.788375839771097e-05, "loss": 0.0393, "step": 14340 }, { "epoch": 3.5359960552268244, "grad_norm": 0.291015625, "learning_rate": 4.7854381518422296e-05, "loss": 0.0375, "step": 14342 }, { "epoch": 3.5364891518737673, "grad_norm": 0.376953125, "learning_rate": 4.782501081868059e-05, "loss": 0.0364, "step": 14344 }, { "epoch": 3.53698224852071, "grad_norm": 0.3359375, "learning_rate": 4.7795646301966344e-05, "loss": 0.034, "step": 14346 }, { "epoch": 3.537475345167653, "grad_norm": 0.30859375, "learning_rate": 4.7766287971759546e-05, "loss": 0.0346, "step": 14348 }, { "epoch": 3.537968441814596, "grad_norm": 0.279296875, "learning_rate": 4.7736935831539233e-05, "loss": 0.0353, "step": 14350 }, { "epoch": 3.5384615384615383, "grad_norm": 0.2890625, "learning_rate": 4.7707589884783824e-05, "loss": 0.0345, "step": 14352 }, { "epoch": 3.5389546351084813, "grad_norm": 0.41796875, "learning_rate": 4.767825013497097e-05, "loss": 0.0366, "step": 14354 }, { "epoch": 3.5394477317554243, "grad_norm": 0.33984375, "learning_rate": 4.764891658557761e-05, "loss": 0.0391, "step": 14356 }, { "epoch": 3.539940828402367, "grad_norm": 0.3515625, "learning_rate": 4.761958924007995e-05, "loss": 0.0365, "step": 14358 }, { "epoch": 3.5404339250493098, "grad_norm": 0.322265625, "learning_rate": 4.759026810195335e-05, "loss": 0.035, "step": 14360 }, { "epoch": 3.5409270216962527, "grad_norm": 0.30859375, "learning_rate": 4.756095317467269e-05, "loss": 0.0385, "step": 14362 }, { "epoch": 3.5414201183431953, "grad_norm": 0.322265625, "learning_rate": 4.7531644461711755e-05, "loss": 0.0353, "step": 14364 }, { "epoch": 3.5419132149901382, "grad_norm": 0.30078125, "learning_rate": 4.7502341966544e-05, "loss": 0.0367, "step": 14366 }, { "epoch": 3.5424063116370808, "grad_norm": 0.337890625, "learning_rate": 4.747304569264178e-05, "loss": 0.0368, "step": 14368 }, { "epoch": 3.5428994082840237, "grad_norm": 0.328125, "learning_rate": 4.7443755643476915e-05, "loss": 0.0394, "step": 14370 }, { "epoch": 3.5433925049309662, "grad_norm": 0.32421875, "learning_rate": 4.741447182252049e-05, "loss": 0.0369, "step": 14372 }, { "epoch": 3.543885601577909, "grad_norm": 0.275390625, "learning_rate": 4.738519423324267e-05, "loss": 0.0338, "step": 14374 }, { "epoch": 3.544378698224852, "grad_norm": 0.3359375, "learning_rate": 4.735592287911318e-05, "loss": 0.0344, "step": 14376 }, { "epoch": 3.5448717948717947, "grad_norm": 0.271484375, "learning_rate": 4.732665776360068e-05, "loss": 0.0339, "step": 14378 }, { "epoch": 3.5453648915187377, "grad_norm": 0.302734375, "learning_rate": 4.729739889017339e-05, "loss": 0.0378, "step": 14380 }, { "epoch": 3.5458579881656807, "grad_norm": 0.2890625, "learning_rate": 4.726814626229856e-05, "loss": 0.0324, "step": 14382 }, { "epoch": 3.546351084812623, "grad_norm": 0.345703125, "learning_rate": 4.723889988344281e-05, "loss": 0.0371, "step": 14384 }, { "epoch": 3.546844181459566, "grad_norm": 0.29296875, "learning_rate": 4.720965975707204e-05, "loss": 0.0368, "step": 14386 }, { "epoch": 3.547337278106509, "grad_norm": 0.2890625, "learning_rate": 4.718042588665127e-05, "loss": 0.0403, "step": 14388 }, { "epoch": 3.5478303747534516, "grad_norm": 0.3359375, "learning_rate": 4.7151198275645004e-05, "loss": 0.0347, "step": 14390 }, { "epoch": 3.5483234714003946, "grad_norm": 0.359375, "learning_rate": 4.712197692751673e-05, "loss": 0.0395, "step": 14392 }, { "epoch": 3.548816568047337, "grad_norm": 0.3125, "learning_rate": 4.70927618457295e-05, "loss": 0.0359, "step": 14394 }, { "epoch": 3.54930966469428, "grad_norm": 0.271484375, "learning_rate": 4.7063553033745365e-05, "loss": 0.0349, "step": 14396 }, { "epoch": 3.5498027613412226, "grad_norm": 0.306640625, "learning_rate": 4.703435049502575e-05, "loss": 0.0358, "step": 14398 }, { "epoch": 3.5502958579881656, "grad_norm": 0.357421875, "learning_rate": 4.700515423303134e-05, "loss": 0.0393, "step": 14400 }, { "epoch": 3.5507889546351086, "grad_norm": 0.328125, "learning_rate": 4.697596425122199e-05, "loss": 0.0349, "step": 14402 }, { "epoch": 3.551282051282051, "grad_norm": 0.294921875, "learning_rate": 4.694678055305699e-05, "loss": 0.0344, "step": 14404 }, { "epoch": 3.551775147928994, "grad_norm": 0.3203125, "learning_rate": 4.6917603141994626e-05, "loss": 0.0358, "step": 14406 }, { "epoch": 3.552268244575937, "grad_norm": 0.408203125, "learning_rate": 4.688843202149275e-05, "loss": 0.0363, "step": 14408 }, { "epoch": 3.5527613412228796, "grad_norm": 0.2734375, "learning_rate": 4.685926719500817e-05, "loss": 0.0356, "step": 14410 }, { "epoch": 3.5532544378698225, "grad_norm": 0.2890625, "learning_rate": 4.683010866599713e-05, "loss": 0.0402, "step": 14412 }, { "epoch": 3.5537475345167655, "grad_norm": 0.3515625, "learning_rate": 4.6800956437915066e-05, "loss": 0.0337, "step": 14414 }, { "epoch": 3.554240631163708, "grad_norm": 0.26953125, "learning_rate": 4.6771810514216706e-05, "loss": 0.034, "step": 14416 }, { "epoch": 3.554733727810651, "grad_norm": 0.3125, "learning_rate": 4.674267089835601e-05, "loss": 0.0345, "step": 14418 }, { "epoch": 3.5552268244575935, "grad_norm": 0.353515625, "learning_rate": 4.67135375937861e-05, "loss": 0.0364, "step": 14420 }, { "epoch": 3.5557199211045365, "grad_norm": 0.28515625, "learning_rate": 4.6684410603959574e-05, "loss": 0.0362, "step": 14422 }, { "epoch": 3.556213017751479, "grad_norm": 0.279296875, "learning_rate": 4.665528993232803e-05, "loss": 0.0356, "step": 14424 }, { "epoch": 3.556706114398422, "grad_norm": 0.35546875, "learning_rate": 4.662617558234246e-05, "loss": 0.0365, "step": 14426 }, { "epoch": 3.557199211045365, "grad_norm": 0.29296875, "learning_rate": 4.659706755745309e-05, "loss": 0.0349, "step": 14428 }, { "epoch": 3.5576923076923075, "grad_norm": 0.306640625, "learning_rate": 4.656796586110938e-05, "loss": 0.0374, "step": 14430 }, { "epoch": 3.5581854043392505, "grad_norm": 0.294921875, "learning_rate": 4.653887049676007e-05, "loss": 0.0376, "step": 14432 }, { "epoch": 3.5586785009861934, "grad_norm": 0.296875, "learning_rate": 4.650978146785302e-05, "loss": 0.0346, "step": 14434 }, { "epoch": 3.559171597633136, "grad_norm": 0.3359375, "learning_rate": 4.6480698777835605e-05, "loss": 0.0379, "step": 14436 }, { "epoch": 3.559664694280079, "grad_norm": 0.361328125, "learning_rate": 4.645162243015415e-05, "loss": 0.0335, "step": 14438 }, { "epoch": 3.560157790927022, "grad_norm": 0.35546875, "learning_rate": 4.642255242825442e-05, "loss": 0.0347, "step": 14440 }, { "epoch": 3.5606508875739644, "grad_norm": 0.326171875, "learning_rate": 4.6393488775581364e-05, "loss": 0.0375, "step": 14442 }, { "epoch": 3.5611439842209074, "grad_norm": 0.328125, "learning_rate": 4.636443147557918e-05, "loss": 0.0369, "step": 14444 }, { "epoch": 3.5616370808678504, "grad_norm": 0.302734375, "learning_rate": 4.633538053169134e-05, "loss": 0.0339, "step": 14446 }, { "epoch": 3.562130177514793, "grad_norm": 0.30078125, "learning_rate": 4.630633594736053e-05, "loss": 0.0338, "step": 14448 }, { "epoch": 3.562623274161736, "grad_norm": 0.33203125, "learning_rate": 4.627729772602874e-05, "loss": 0.037, "step": 14450 }, { "epoch": 3.5631163708086784, "grad_norm": 0.361328125, "learning_rate": 4.6248265871137065e-05, "loss": 0.0374, "step": 14452 }, { "epoch": 3.5636094674556213, "grad_norm": 0.32421875, "learning_rate": 4.621924038612601e-05, "loss": 0.0305, "step": 14454 }, { "epoch": 3.564102564102564, "grad_norm": 0.330078125, "learning_rate": 4.619022127443523e-05, "loss": 0.0351, "step": 14456 }, { "epoch": 3.564595660749507, "grad_norm": 0.333984375, "learning_rate": 4.616120853950367e-05, "loss": 0.0356, "step": 14458 }, { "epoch": 3.56508875739645, "grad_norm": 0.314453125, "learning_rate": 4.613220218476949e-05, "loss": 0.0386, "step": 14460 }, { "epoch": 3.5655818540433923, "grad_norm": 0.369140625, "learning_rate": 4.610320221367013e-05, "loss": 0.0348, "step": 14462 }, { "epoch": 3.5660749506903353, "grad_norm": 0.33203125, "learning_rate": 4.607420862964225e-05, "loss": 0.0355, "step": 14464 }, { "epoch": 3.5665680473372783, "grad_norm": 0.27734375, "learning_rate": 4.604522143612169e-05, "loss": 0.0358, "step": 14466 }, { "epoch": 3.567061143984221, "grad_norm": 0.3046875, "learning_rate": 4.601624063654364e-05, "loss": 0.0357, "step": 14468 }, { "epoch": 3.5675542406311638, "grad_norm": 0.3359375, "learning_rate": 4.5987266234342486e-05, "loss": 0.0355, "step": 14470 }, { "epoch": 3.5680473372781067, "grad_norm": 0.3125, "learning_rate": 4.5958298232951846e-05, "loss": 0.0367, "step": 14472 }, { "epoch": 3.5685404339250493, "grad_norm": 0.28125, "learning_rate": 4.59293366358046e-05, "loss": 0.0369, "step": 14474 }, { "epoch": 3.5690335305719922, "grad_norm": 0.35546875, "learning_rate": 4.590038144633285e-05, "loss": 0.034, "step": 14476 }, { "epoch": 3.5695266272189348, "grad_norm": 0.32421875, "learning_rate": 4.587143266796795e-05, "loss": 0.0337, "step": 14478 }, { "epoch": 3.5700197238658777, "grad_norm": 0.359375, "learning_rate": 4.584249030414054e-05, "loss": 0.0377, "step": 14480 }, { "epoch": 3.5705128205128203, "grad_norm": 0.28515625, "learning_rate": 4.5813554358280366e-05, "loss": 0.0368, "step": 14482 }, { "epoch": 3.5710059171597632, "grad_norm": 0.42578125, "learning_rate": 4.578462483381654e-05, "loss": 0.0419, "step": 14484 }, { "epoch": 3.571499013806706, "grad_norm": 0.318359375, "learning_rate": 4.5755701734177356e-05, "loss": 0.0407, "step": 14486 }, { "epoch": 3.5719921104536487, "grad_norm": 0.314453125, "learning_rate": 4.57267850627904e-05, "loss": 0.0338, "step": 14488 }, { "epoch": 3.5724852071005917, "grad_norm": 0.3984375, "learning_rate": 4.5697874823082424e-05, "loss": 0.0357, "step": 14490 }, { "epoch": 3.5729783037475347, "grad_norm": 0.33984375, "learning_rate": 4.566897101847947e-05, "loss": 0.0368, "step": 14492 }, { "epoch": 3.573471400394477, "grad_norm": 0.349609375, "learning_rate": 4.5640073652406836e-05, "loss": 0.0378, "step": 14494 }, { "epoch": 3.57396449704142, "grad_norm": 0.330078125, "learning_rate": 4.5611182728288895e-05, "loss": 0.0341, "step": 14496 }, { "epoch": 3.574457593688363, "grad_norm": 0.28125, "learning_rate": 4.558229824954955e-05, "loss": 0.0372, "step": 14498 }, { "epoch": 3.5749506903353057, "grad_norm": 0.33203125, "learning_rate": 4.555342021961165e-05, "loss": 0.0371, "step": 14500 }, { "epoch": 3.5754437869822486, "grad_norm": 0.310546875, "learning_rate": 4.552454864189746e-05, "loss": 0.0344, "step": 14502 }, { "epoch": 3.5759368836291916, "grad_norm": 0.2890625, "learning_rate": 4.5495683519828395e-05, "loss": 0.0362, "step": 14504 }, { "epoch": 3.576429980276134, "grad_norm": 0.328125, "learning_rate": 4.546682485682515e-05, "loss": 0.0367, "step": 14506 }, { "epoch": 3.5769230769230766, "grad_norm": 0.291015625, "learning_rate": 4.543797265630767e-05, "loss": 0.0393, "step": 14508 }, { "epoch": 3.5774161735700196, "grad_norm": 0.330078125, "learning_rate": 4.540912692169499e-05, "loss": 0.0336, "step": 14510 }, { "epoch": 3.5779092702169626, "grad_norm": 0.369140625, "learning_rate": 4.538028765640564e-05, "loss": 0.0389, "step": 14512 }, { "epoch": 3.578402366863905, "grad_norm": 0.3984375, "learning_rate": 4.5351454863857124e-05, "loss": 0.0397, "step": 14514 }, { "epoch": 3.578895463510848, "grad_norm": 0.32421875, "learning_rate": 4.532262854746633e-05, "loss": 0.0385, "step": 14516 }, { "epoch": 3.579388560157791, "grad_norm": 0.361328125, "learning_rate": 4.529380871064933e-05, "loss": 0.0403, "step": 14518 }, { "epoch": 3.5798816568047336, "grad_norm": 0.326171875, "learning_rate": 4.526499535682144e-05, "loss": 0.0354, "step": 14520 }, { "epoch": 3.5803747534516766, "grad_norm": 0.310546875, "learning_rate": 4.5236188489397225e-05, "loss": 0.0391, "step": 14522 }, { "epoch": 3.5808678500986195, "grad_norm": 0.287109375, "learning_rate": 4.520738811179037e-05, "loss": 0.0361, "step": 14524 }, { "epoch": 3.581360946745562, "grad_norm": 0.3125, "learning_rate": 4.5178594227414026e-05, "loss": 0.0389, "step": 14526 }, { "epoch": 3.581854043392505, "grad_norm": 0.341796875, "learning_rate": 4.514980683968028e-05, "loss": 0.0348, "step": 14528 }, { "epoch": 3.582347140039448, "grad_norm": 0.287109375, "learning_rate": 4.512102595200073e-05, "loss": 0.0346, "step": 14530 }, { "epoch": 3.5828402366863905, "grad_norm": 0.27734375, "learning_rate": 4.509225156778597e-05, "loss": 0.0357, "step": 14532 }, { "epoch": 3.5833333333333335, "grad_norm": 0.302734375, "learning_rate": 4.506348369044597e-05, "loss": 0.0329, "step": 14534 }, { "epoch": 3.583826429980276, "grad_norm": 0.380859375, "learning_rate": 4.5034722323389886e-05, "loss": 0.0348, "step": 14536 }, { "epoch": 3.584319526627219, "grad_norm": 0.396484375, "learning_rate": 4.500596747002609e-05, "loss": 0.0379, "step": 14538 }, { "epoch": 3.5848126232741615, "grad_norm": 0.3203125, "learning_rate": 4.497721913376223e-05, "loss": 0.0401, "step": 14540 }, { "epoch": 3.5853057199211045, "grad_norm": 0.275390625, "learning_rate": 4.494847731800503e-05, "loss": 0.0332, "step": 14542 }, { "epoch": 3.5857988165680474, "grad_norm": 0.37890625, "learning_rate": 4.491974202616072e-05, "loss": 0.0378, "step": 14544 }, { "epoch": 3.58629191321499, "grad_norm": 0.33203125, "learning_rate": 4.4891013261634474e-05, "loss": 0.0403, "step": 14546 }, { "epoch": 3.586785009861933, "grad_norm": 0.326171875, "learning_rate": 4.486229102783084e-05, "loss": 0.04, "step": 14548 }, { "epoch": 3.587278106508876, "grad_norm": 0.30859375, "learning_rate": 4.483357532815356e-05, "loss": 0.0327, "step": 14550 }, { "epoch": 3.5877712031558184, "grad_norm": 0.294921875, "learning_rate": 4.480486616600563e-05, "loss": 0.0357, "step": 14552 }, { "epoch": 3.5882642998027614, "grad_norm": 0.291015625, "learning_rate": 4.477616354478926e-05, "loss": 0.0361, "step": 14554 }, { "epoch": 3.5887573964497044, "grad_norm": 0.341796875, "learning_rate": 4.4747467467905765e-05, "loss": 0.033, "step": 14556 }, { "epoch": 3.589250493096647, "grad_norm": 0.30859375, "learning_rate": 4.471877793875594e-05, "loss": 0.0392, "step": 14558 }, { "epoch": 3.58974358974359, "grad_norm": 0.28125, "learning_rate": 4.469009496073955e-05, "loss": 0.0377, "step": 14560 }, { "epoch": 3.5902366863905324, "grad_norm": 0.2890625, "learning_rate": 4.466141853725571e-05, "loss": 0.038, "step": 14562 }, { "epoch": 3.5907297830374754, "grad_norm": 0.328125, "learning_rate": 4.463274867170274e-05, "loss": 0.0344, "step": 14564 }, { "epoch": 3.591222879684418, "grad_norm": 0.251953125, "learning_rate": 4.460408536747819e-05, "loss": 0.0313, "step": 14566 }, { "epoch": 3.591715976331361, "grad_norm": 0.3046875, "learning_rate": 4.457542862797885e-05, "loss": 0.0348, "step": 14568 }, { "epoch": 3.592209072978304, "grad_norm": 0.3359375, "learning_rate": 4.45467784566006e-05, "loss": 0.0378, "step": 14570 }, { "epoch": 3.5927021696252464, "grad_norm": 0.29296875, "learning_rate": 4.451813485673878e-05, "loss": 0.0352, "step": 14572 }, { "epoch": 3.5931952662721893, "grad_norm": 0.30859375, "learning_rate": 4.4489497831787716e-05, "loss": 0.0362, "step": 14574 }, { "epoch": 3.5936883629191323, "grad_norm": 0.390625, "learning_rate": 4.44608673851411e-05, "loss": 0.0387, "step": 14576 }, { "epoch": 3.594181459566075, "grad_norm": 0.3125, "learning_rate": 4.443224352019179e-05, "loss": 0.0362, "step": 14578 }, { "epoch": 3.594674556213018, "grad_norm": 0.3359375, "learning_rate": 4.440362624033185e-05, "loss": 0.0344, "step": 14580 }, { "epoch": 3.5951676528599608, "grad_norm": 0.32421875, "learning_rate": 4.437501554895267e-05, "loss": 0.0383, "step": 14582 }, { "epoch": 3.5956607495069033, "grad_norm": 0.365234375, "learning_rate": 4.4346411449444636e-05, "loss": 0.0382, "step": 14584 }, { "epoch": 3.5961538461538463, "grad_norm": 0.265625, "learning_rate": 4.431781394519765e-05, "loss": 0.0355, "step": 14586 }, { "epoch": 3.5966469428007892, "grad_norm": 0.328125, "learning_rate": 4.428922303960056e-05, "loss": 0.038, "step": 14588 }, { "epoch": 3.5971400394477318, "grad_norm": 0.287109375, "learning_rate": 4.426063873604159e-05, "loss": 0.0363, "step": 14590 }, { "epoch": 3.5976331360946747, "grad_norm": 0.345703125, "learning_rate": 4.4232061037908146e-05, "loss": 0.0363, "step": 14592 }, { "epoch": 3.5981262327416172, "grad_norm": 0.306640625, "learning_rate": 4.420348994858683e-05, "loss": 0.0327, "step": 14594 }, { "epoch": 3.59861932938856, "grad_norm": 0.326171875, "learning_rate": 4.4174925471463524e-05, "loss": 0.0343, "step": 14596 }, { "epoch": 3.5991124260355027, "grad_norm": 0.3203125, "learning_rate": 4.414636760992317e-05, "loss": 0.0346, "step": 14598 }, { "epoch": 3.5996055226824457, "grad_norm": 0.37109375, "learning_rate": 4.411781636735016e-05, "loss": 0.032, "step": 14600 }, { "epoch": 3.6000986193293887, "grad_norm": 0.330078125, "learning_rate": 4.408927174712789e-05, "loss": 0.0361, "step": 14602 }, { "epoch": 3.600591715976331, "grad_norm": 0.32421875, "learning_rate": 4.4060733752639074e-05, "loss": 0.036, "step": 14604 }, { "epoch": 3.601084812623274, "grad_norm": 0.28515625, "learning_rate": 4.403220238726564e-05, "loss": 0.0358, "step": 14606 }, { "epoch": 3.601577909270217, "grad_norm": 0.384765625, "learning_rate": 4.400367765438871e-05, "loss": 0.0349, "step": 14608 }, { "epoch": 3.6020710059171597, "grad_norm": 0.314453125, "learning_rate": 4.3975159557388615e-05, "loss": 0.0377, "step": 14610 }, { "epoch": 3.6025641025641026, "grad_norm": 0.302734375, "learning_rate": 4.394664809964492e-05, "loss": 0.0365, "step": 14612 }, { "epoch": 3.6030571992110456, "grad_norm": 0.314453125, "learning_rate": 4.391814328453642e-05, "loss": 0.0396, "step": 14614 }, { "epoch": 3.603550295857988, "grad_norm": 0.294921875, "learning_rate": 4.388964511544099e-05, "loss": 0.0361, "step": 14616 }, { "epoch": 3.604043392504931, "grad_norm": 0.302734375, "learning_rate": 4.386115359573596e-05, "loss": 0.0352, "step": 14618 }, { "epoch": 3.6045364891518736, "grad_norm": 0.30078125, "learning_rate": 4.383266872879764e-05, "loss": 0.035, "step": 14620 }, { "epoch": 3.6050295857988166, "grad_norm": 0.291015625, "learning_rate": 4.380419051800167e-05, "loss": 0.037, "step": 14622 }, { "epoch": 3.605522682445759, "grad_norm": 0.373046875, "learning_rate": 4.377571896672289e-05, "loss": 0.0363, "step": 14624 }, { "epoch": 3.606015779092702, "grad_norm": 0.29296875, "learning_rate": 4.374725407833532e-05, "loss": 0.0383, "step": 14626 }, { "epoch": 3.606508875739645, "grad_norm": 0.3203125, "learning_rate": 4.371879585621222e-05, "loss": 0.0348, "step": 14628 }, { "epoch": 3.6070019723865876, "grad_norm": 0.3359375, "learning_rate": 4.3690344303726036e-05, "loss": 0.0341, "step": 14630 }, { "epoch": 3.6074950690335306, "grad_norm": 0.359375, "learning_rate": 4.36618994242485e-05, "loss": 0.0354, "step": 14632 }, { "epoch": 3.6079881656804735, "grad_norm": 0.30078125, "learning_rate": 4.3633461221150384e-05, "loss": 0.0357, "step": 14634 }, { "epoch": 3.608481262327416, "grad_norm": 0.33203125, "learning_rate": 4.360502969780182e-05, "loss": 0.0369, "step": 14636 }, { "epoch": 3.608974358974359, "grad_norm": 0.310546875, "learning_rate": 4.3576604857572114e-05, "loss": 0.0339, "step": 14638 }, { "epoch": 3.609467455621302, "grad_norm": 0.306640625, "learning_rate": 4.354818670382976e-05, "loss": 0.0368, "step": 14640 }, { "epoch": 3.6099605522682445, "grad_norm": 0.32421875, "learning_rate": 4.351977523994247e-05, "loss": 0.0373, "step": 14642 }, { "epoch": 3.6104536489151875, "grad_norm": 0.3203125, "learning_rate": 4.349137046927715e-05, "loss": 0.0386, "step": 14644 }, { "epoch": 3.61094674556213, "grad_norm": 0.2890625, "learning_rate": 4.3462972395199976e-05, "loss": 0.0316, "step": 14646 }, { "epoch": 3.611439842209073, "grad_norm": 0.330078125, "learning_rate": 4.3434581021076206e-05, "loss": 0.0343, "step": 14648 }, { "epoch": 3.6119329388560155, "grad_norm": 0.2734375, "learning_rate": 4.340619635027039e-05, "loss": 0.0359, "step": 14650 }, { "epoch": 3.6124260355029585, "grad_norm": 0.314453125, "learning_rate": 4.3377818386146294e-05, "loss": 0.0377, "step": 14652 }, { "epoch": 3.6129191321499015, "grad_norm": 0.373046875, "learning_rate": 4.334944713206687e-05, "loss": 0.0357, "step": 14654 }, { "epoch": 3.613412228796844, "grad_norm": 0.318359375, "learning_rate": 4.3321082591394235e-05, "loss": 0.0358, "step": 14656 }, { "epoch": 3.613905325443787, "grad_norm": 0.294921875, "learning_rate": 4.3292724767489776e-05, "loss": 0.034, "step": 14658 }, { "epoch": 3.61439842209073, "grad_norm": 0.416015625, "learning_rate": 4.326437366371405e-05, "loss": 0.0377, "step": 14660 }, { "epoch": 3.6148915187376724, "grad_norm": 0.2578125, "learning_rate": 4.323602928342685e-05, "loss": 0.0349, "step": 14662 }, { "epoch": 3.6153846153846154, "grad_norm": 0.318359375, "learning_rate": 4.320769162998707e-05, "loss": 0.037, "step": 14664 }, { "epoch": 3.6158777120315584, "grad_norm": 0.2890625, "learning_rate": 4.317936070675291e-05, "loss": 0.0337, "step": 14666 }, { "epoch": 3.616370808678501, "grad_norm": 0.345703125, "learning_rate": 4.3151036517081765e-05, "loss": 0.0371, "step": 14668 }, { "epoch": 3.616863905325444, "grad_norm": 0.30859375, "learning_rate": 4.31227190643302e-05, "loss": 0.0338, "step": 14670 }, { "epoch": 3.617357001972387, "grad_norm": 0.330078125, "learning_rate": 4.309440835185399e-05, "loss": 0.0339, "step": 14672 }, { "epoch": 3.6178500986193294, "grad_norm": 0.29296875, "learning_rate": 4.30661043830081e-05, "loss": 0.034, "step": 14674 }, { "epoch": 3.6183431952662723, "grad_norm": 0.33203125, "learning_rate": 4.303780716114675e-05, "loss": 0.0381, "step": 14676 }, { "epoch": 3.618836291913215, "grad_norm": 0.30078125, "learning_rate": 4.300951668962323e-05, "loss": 0.0353, "step": 14678 }, { "epoch": 3.619329388560158, "grad_norm": 0.3046875, "learning_rate": 4.298123297179024e-05, "loss": 0.0313, "step": 14680 }, { "epoch": 3.6198224852071004, "grad_norm": 0.36328125, "learning_rate": 4.2952956010999455e-05, "loss": 0.0334, "step": 14682 }, { "epoch": 3.6203155818540433, "grad_norm": 0.365234375, "learning_rate": 4.29246858106019e-05, "loss": 0.0344, "step": 14684 }, { "epoch": 3.6208086785009863, "grad_norm": 0.333984375, "learning_rate": 4.2896422373947745e-05, "loss": 0.0406, "step": 14686 }, { "epoch": 3.621301775147929, "grad_norm": 0.373046875, "learning_rate": 4.2868165704386366e-05, "loss": 0.0338, "step": 14688 }, { "epoch": 3.621794871794872, "grad_norm": 0.333984375, "learning_rate": 4.283991580526636e-05, "loss": 0.0405, "step": 14690 }, { "epoch": 3.6222879684418148, "grad_norm": 0.3046875, "learning_rate": 4.281167267993541e-05, "loss": 0.0358, "step": 14692 }, { "epoch": 3.6227810650887573, "grad_norm": 0.33203125, "learning_rate": 4.278343633174061e-05, "loss": 0.0367, "step": 14694 }, { "epoch": 3.6232741617357003, "grad_norm": 0.361328125, "learning_rate": 4.275520676402803e-05, "loss": 0.0305, "step": 14696 }, { "epoch": 3.6237672583826432, "grad_norm": 0.337890625, "learning_rate": 4.272698398014306e-05, "loss": 0.0346, "step": 14698 }, { "epoch": 3.6242603550295858, "grad_norm": 0.322265625, "learning_rate": 4.2698767983430265e-05, "loss": 0.0361, "step": 14700 }, { "epoch": 3.6247534516765287, "grad_norm": 0.328125, "learning_rate": 4.267055877723338e-05, "loss": 0.034, "step": 14702 }, { "epoch": 3.6252465483234713, "grad_norm": 0.298828125, "learning_rate": 4.264235636489542e-05, "loss": 0.031, "step": 14704 }, { "epoch": 3.6257396449704142, "grad_norm": 0.296875, "learning_rate": 4.261416074975839e-05, "loss": 0.0364, "step": 14706 }, { "epoch": 3.6262327416173568, "grad_norm": 0.353515625, "learning_rate": 4.2585971935163794e-05, "loss": 0.034, "step": 14708 }, { "epoch": 3.6267258382642997, "grad_norm": 0.275390625, "learning_rate": 4.255778992445201e-05, "loss": 0.0367, "step": 14710 }, { "epoch": 3.6272189349112427, "grad_norm": 0.337890625, "learning_rate": 4.252961472096289e-05, "loss": 0.0355, "step": 14712 }, { "epoch": 3.6277120315581852, "grad_norm": 0.2890625, "learning_rate": 4.2501446328035286e-05, "loss": 0.035, "step": 14714 }, { "epoch": 3.628205128205128, "grad_norm": 0.318359375, "learning_rate": 4.247328474900732e-05, "loss": 0.0361, "step": 14716 }, { "epoch": 3.628698224852071, "grad_norm": 0.3125, "learning_rate": 4.244512998721633e-05, "loss": 0.0396, "step": 14718 }, { "epoch": 3.6291913214990137, "grad_norm": 0.30859375, "learning_rate": 4.241698204599871e-05, "loss": 0.0387, "step": 14720 }, { "epoch": 3.6296844181459567, "grad_norm": 0.306640625, "learning_rate": 4.238884092869029e-05, "loss": 0.0364, "step": 14722 }, { "epoch": 3.6301775147928996, "grad_norm": 0.291015625, "learning_rate": 4.23607066386258e-05, "loss": 0.0347, "step": 14724 }, { "epoch": 3.630670611439842, "grad_norm": 0.283203125, "learning_rate": 4.233257917913948e-05, "loss": 0.0332, "step": 14726 }, { "epoch": 3.631163708086785, "grad_norm": 0.328125, "learning_rate": 4.230445855356445e-05, "loss": 0.0348, "step": 14728 }, { "epoch": 3.6316568047337277, "grad_norm": 0.3046875, "learning_rate": 4.227634476523322e-05, "loss": 0.0366, "step": 14730 }, { "epoch": 3.6321499013806706, "grad_norm": 0.279296875, "learning_rate": 4.2248237817477455e-05, "loss": 0.0357, "step": 14732 }, { "epoch": 3.632642998027613, "grad_norm": 0.337890625, "learning_rate": 4.222013771362788e-05, "loss": 0.0362, "step": 14734 }, { "epoch": 3.633136094674556, "grad_norm": 0.291015625, "learning_rate": 4.219204445701467e-05, "loss": 0.0368, "step": 14736 }, { "epoch": 3.633629191321499, "grad_norm": 0.29296875, "learning_rate": 4.216395805096687e-05, "loss": 0.0316, "step": 14738 }, { "epoch": 3.6341222879684416, "grad_norm": 0.310546875, "learning_rate": 4.213587849881303e-05, "loss": 0.0355, "step": 14740 }, { "epoch": 3.6346153846153846, "grad_norm": 0.314453125, "learning_rate": 4.210780580388062e-05, "loss": 0.0372, "step": 14742 }, { "epoch": 3.6351084812623276, "grad_norm": 0.345703125, "learning_rate": 4.207973996949646e-05, "loss": 0.0335, "step": 14744 }, { "epoch": 3.63560157790927, "grad_norm": 0.306640625, "learning_rate": 4.205168099898652e-05, "loss": 0.0362, "step": 14746 }, { "epoch": 3.636094674556213, "grad_norm": 0.31640625, "learning_rate": 4.202362889567586e-05, "loss": 0.0374, "step": 14748 }, { "epoch": 3.636587771203156, "grad_norm": 0.2734375, "learning_rate": 4.199558366288895e-05, "loss": 0.0322, "step": 14750 }, { "epoch": 3.6370808678500985, "grad_norm": 0.279296875, "learning_rate": 4.196754530394915e-05, "loss": 0.0373, "step": 14752 }, { "epoch": 3.6375739644970415, "grad_norm": 0.302734375, "learning_rate": 4.193951382217932e-05, "loss": 0.0383, "step": 14754 }, { "epoch": 3.6380670611439845, "grad_norm": 0.40234375, "learning_rate": 4.1911489220901236e-05, "loss": 0.0369, "step": 14756 }, { "epoch": 3.638560157790927, "grad_norm": 0.30078125, "learning_rate": 4.1883471503436e-05, "loss": 0.0355, "step": 14758 }, { "epoch": 3.63905325443787, "grad_norm": 0.46484375, "learning_rate": 4.185546067310388e-05, "loss": 0.0383, "step": 14760 }, { "epoch": 3.6395463510848125, "grad_norm": 0.384765625, "learning_rate": 4.1827456733224305e-05, "loss": 0.0359, "step": 14762 }, { "epoch": 3.6400394477317555, "grad_norm": 0.296875, "learning_rate": 4.179945968711594e-05, "loss": 0.0325, "step": 14764 }, { "epoch": 3.640532544378698, "grad_norm": 0.296875, "learning_rate": 4.1771469538096464e-05, "loss": 0.0362, "step": 14766 }, { "epoch": 3.641025641025641, "grad_norm": 0.30078125, "learning_rate": 4.1743486289483037e-05, "loss": 0.0322, "step": 14768 }, { "epoch": 3.641518737672584, "grad_norm": 0.298828125, "learning_rate": 4.171550994459171e-05, "loss": 0.0374, "step": 14770 }, { "epoch": 3.6420118343195265, "grad_norm": 0.2890625, "learning_rate": 4.1687540506737874e-05, "loss": 0.0374, "step": 14772 }, { "epoch": 3.6425049309664694, "grad_norm": 0.322265625, "learning_rate": 4.1659577979236075e-05, "loss": 0.0349, "step": 14774 }, { "epoch": 3.6429980276134124, "grad_norm": 0.279296875, "learning_rate": 4.16316223654e-05, "loss": 0.0365, "step": 14776 }, { "epoch": 3.643491124260355, "grad_norm": 0.302734375, "learning_rate": 4.160367366854261e-05, "loss": 0.0334, "step": 14778 }, { "epoch": 3.643984220907298, "grad_norm": 0.28125, "learning_rate": 4.157573189197586e-05, "loss": 0.0351, "step": 14780 }, { "epoch": 3.644477317554241, "grad_norm": 0.31640625, "learning_rate": 4.154779703901114e-05, "loss": 0.0345, "step": 14782 }, { "epoch": 3.6449704142011834, "grad_norm": 0.27734375, "learning_rate": 4.15198691129588e-05, "loss": 0.0349, "step": 14784 }, { "epoch": 3.6454635108481264, "grad_norm": 0.29296875, "learning_rate": 4.1491948117128474e-05, "loss": 0.0315, "step": 14786 }, { "epoch": 3.645956607495069, "grad_norm": 0.3515625, "learning_rate": 4.146403405482897e-05, "loss": 0.0328, "step": 14788 }, { "epoch": 3.646449704142012, "grad_norm": 0.271484375, "learning_rate": 4.1436126929368255e-05, "loss": 0.0337, "step": 14790 }, { "epoch": 3.6469428007889544, "grad_norm": 0.359375, "learning_rate": 4.140822674405347e-05, "loss": 0.0384, "step": 14792 }, { "epoch": 3.6474358974358974, "grad_norm": 0.32421875, "learning_rate": 4.138033350219096e-05, "loss": 0.0338, "step": 14794 }, { "epoch": 3.6479289940828403, "grad_norm": 0.296875, "learning_rate": 4.1352447207086256e-05, "loss": 0.0353, "step": 14796 }, { "epoch": 3.648422090729783, "grad_norm": 0.376953125, "learning_rate": 4.132456786204395e-05, "loss": 0.0352, "step": 14798 }, { "epoch": 3.648915187376726, "grad_norm": 0.330078125, "learning_rate": 4.1296695470367974e-05, "loss": 0.0308, "step": 14800 }, { "epoch": 3.649408284023669, "grad_norm": 0.3046875, "learning_rate": 4.126883003536134e-05, "loss": 0.0369, "step": 14802 }, { "epoch": 3.6499013806706113, "grad_norm": 0.34765625, "learning_rate": 4.124097156032625e-05, "loss": 0.038, "step": 14804 }, { "epoch": 3.6503944773175543, "grad_norm": 0.27734375, "learning_rate": 4.12131200485641e-05, "loss": 0.0336, "step": 14806 }, { "epoch": 3.6508875739644973, "grad_norm": 0.298828125, "learning_rate": 4.118527550337546e-05, "loss": 0.0338, "step": 14808 }, { "epoch": 3.65138067061144, "grad_norm": 0.279296875, "learning_rate": 4.115743792806004e-05, "loss": 0.0377, "step": 14810 }, { "epoch": 3.6518737672583828, "grad_norm": 0.29296875, "learning_rate": 4.1129607325916797e-05, "loss": 0.035, "step": 14812 }, { "epoch": 3.6523668639053253, "grad_norm": 0.3125, "learning_rate": 4.1101783700243744e-05, "loss": 0.0372, "step": 14814 }, { "epoch": 3.6528599605522682, "grad_norm": 0.306640625, "learning_rate": 4.107396705433817e-05, "loss": 0.0408, "step": 14816 }, { "epoch": 3.6533530571992108, "grad_norm": 0.3125, "learning_rate": 4.1046157391496496e-05, "loss": 0.0358, "step": 14818 }, { "epoch": 3.6538461538461537, "grad_norm": 0.359375, "learning_rate": 4.101835471501433e-05, "loss": 0.037, "step": 14820 }, { "epoch": 3.6543392504930967, "grad_norm": 0.3046875, "learning_rate": 4.099055902818644e-05, "loss": 0.0345, "step": 14822 }, { "epoch": 3.6548323471400392, "grad_norm": 0.294921875, "learning_rate": 4.096277033430678e-05, "loss": 0.0381, "step": 14824 }, { "epoch": 3.655325443786982, "grad_norm": 0.3046875, "learning_rate": 4.093498863666848e-05, "loss": 0.036, "step": 14826 }, { "epoch": 3.655818540433925, "grad_norm": 0.357421875, "learning_rate": 4.090721393856377e-05, "loss": 0.0369, "step": 14828 }, { "epoch": 3.6563116370808677, "grad_norm": 0.30859375, "learning_rate": 4.0879446243284145e-05, "loss": 0.0349, "step": 14830 }, { "epoch": 3.6568047337278107, "grad_norm": 0.353515625, "learning_rate": 4.0851685554120226e-05, "loss": 0.0387, "step": 14832 }, { "epoch": 3.6572978303747536, "grad_norm": 0.2734375, "learning_rate": 4.082393187436181e-05, "loss": 0.0353, "step": 14834 }, { "epoch": 3.657790927021696, "grad_norm": 0.31640625, "learning_rate": 4.079618520729786e-05, "loss": 0.0344, "step": 14836 }, { "epoch": 3.658284023668639, "grad_norm": 0.341796875, "learning_rate": 4.076844555621653e-05, "loss": 0.0347, "step": 14838 }, { "epoch": 3.658777120315582, "grad_norm": 0.298828125, "learning_rate": 4.0740712924405125e-05, "loss": 0.0329, "step": 14840 }, { "epoch": 3.6592702169625246, "grad_norm": 0.328125, "learning_rate": 4.071298731515004e-05, "loss": 0.0354, "step": 14842 }, { "epoch": 3.6597633136094676, "grad_norm": 0.30859375, "learning_rate": 4.068526873173704e-05, "loss": 0.0343, "step": 14844 }, { "epoch": 3.66025641025641, "grad_norm": 0.27734375, "learning_rate": 4.065755717745083e-05, "loss": 0.0316, "step": 14846 }, { "epoch": 3.660749506903353, "grad_norm": 0.306640625, "learning_rate": 4.062985265557543e-05, "loss": 0.035, "step": 14848 }, { "epoch": 3.6612426035502956, "grad_norm": 0.296875, "learning_rate": 4.0602155169393964e-05, "loss": 0.0349, "step": 14850 }, { "epoch": 3.6617357001972386, "grad_norm": 0.296875, "learning_rate": 4.0574464722188745e-05, "loss": 0.0321, "step": 14852 }, { "epoch": 3.6622287968441816, "grad_norm": 0.29296875, "learning_rate": 4.054678131724128e-05, "loss": 0.0348, "step": 14854 }, { "epoch": 3.662721893491124, "grad_norm": 0.296875, "learning_rate": 4.05191049578321e-05, "loss": 0.0348, "step": 14856 }, { "epoch": 3.663214990138067, "grad_norm": 0.30078125, "learning_rate": 4.049143564724116e-05, "loss": 0.0363, "step": 14858 }, { "epoch": 3.66370808678501, "grad_norm": 0.359375, "learning_rate": 4.0463773388747306e-05, "loss": 0.039, "step": 14860 }, { "epoch": 3.6642011834319526, "grad_norm": 0.37109375, "learning_rate": 4.0436118185628714e-05, "loss": 0.0378, "step": 14862 }, { "epoch": 3.6646942800788955, "grad_norm": 0.357421875, "learning_rate": 4.040847004116268e-05, "loss": 0.0365, "step": 14864 }, { "epoch": 3.6651873767258385, "grad_norm": 0.33984375, "learning_rate": 4.038082895862567e-05, "loss": 0.0379, "step": 14866 }, { "epoch": 3.665680473372781, "grad_norm": 0.30078125, "learning_rate": 4.0353194941293324e-05, "loss": 0.0386, "step": 14868 }, { "epoch": 3.666173570019724, "grad_norm": 0.419921875, "learning_rate": 4.032556799244034e-05, "loss": 0.0434, "step": 14870 }, { "epoch": 3.6666666666666665, "grad_norm": 0.3046875, "learning_rate": 4.02979481153408e-05, "loss": 0.0331, "step": 14872 }, { "epoch": 3.6671597633136095, "grad_norm": 0.291015625, "learning_rate": 4.027033531326767e-05, "loss": 0.0372, "step": 14874 }, { "epoch": 3.667652859960552, "grad_norm": 0.3671875, "learning_rate": 4.024272958949336e-05, "loss": 0.0332, "step": 14876 }, { "epoch": 3.668145956607495, "grad_norm": 0.359375, "learning_rate": 4.0215130947289206e-05, "loss": 0.0384, "step": 14878 }, { "epoch": 3.668639053254438, "grad_norm": 0.28515625, "learning_rate": 4.018753938992582e-05, "loss": 0.0324, "step": 14880 }, { "epoch": 3.6691321499013805, "grad_norm": 0.298828125, "learning_rate": 4.0159954920673005e-05, "loss": 0.034, "step": 14882 }, { "epoch": 3.6696252465483234, "grad_norm": 0.271484375, "learning_rate": 4.013237754279957e-05, "loss": 0.034, "step": 14884 }, { "epoch": 3.6701183431952664, "grad_norm": 0.392578125, "learning_rate": 4.0104807259573716e-05, "loss": 0.0352, "step": 14886 }, { "epoch": 3.670611439842209, "grad_norm": 0.33984375, "learning_rate": 4.007724407426255e-05, "loss": 0.032, "step": 14888 }, { "epoch": 3.671104536489152, "grad_norm": 0.373046875, "learning_rate": 4.00496879901326e-05, "loss": 0.0386, "step": 14890 }, { "epoch": 3.671597633136095, "grad_norm": 0.294921875, "learning_rate": 4.002213901044931e-05, "loss": 0.0386, "step": 14892 }, { "epoch": 3.6720907297830374, "grad_norm": 0.34375, "learning_rate": 3.999459713847742e-05, "loss": 0.0337, "step": 14894 }, { "epoch": 3.6725838264299804, "grad_norm": 0.341796875, "learning_rate": 3.9967062377480824e-05, "loss": 0.0368, "step": 14896 }, { "epoch": 3.6730769230769234, "grad_norm": 0.3125, "learning_rate": 3.993953473072245e-05, "loss": 0.0354, "step": 14898 }, { "epoch": 3.673570019723866, "grad_norm": 0.310546875, "learning_rate": 3.991201420146461e-05, "loss": 0.0329, "step": 14900 }, { "epoch": 3.6740631163708084, "grad_norm": 0.291015625, "learning_rate": 3.9884500792968515e-05, "loss": 0.0372, "step": 14902 }, { "epoch": 3.6745562130177514, "grad_norm": 0.322265625, "learning_rate": 3.985699450849478e-05, "loss": 0.0368, "step": 14904 }, { "epoch": 3.6750493096646943, "grad_norm": 0.31640625, "learning_rate": 3.982949535130297e-05, "loss": 0.0364, "step": 14906 }, { "epoch": 3.675542406311637, "grad_norm": 0.287109375, "learning_rate": 3.980200332465189e-05, "loss": 0.0327, "step": 14908 }, { "epoch": 3.67603550295858, "grad_norm": 0.3046875, "learning_rate": 3.977451843179957e-05, "loss": 0.0354, "step": 14910 }, { "epoch": 3.676528599605523, "grad_norm": 0.28125, "learning_rate": 3.9747040676003e-05, "loss": 0.0313, "step": 14912 }, { "epoch": 3.6770216962524653, "grad_norm": 0.31640625, "learning_rate": 3.971957006051858e-05, "loss": 0.0356, "step": 14914 }, { "epoch": 3.6775147928994083, "grad_norm": 0.294921875, "learning_rate": 3.9692106588601605e-05, "loss": 0.0345, "step": 14916 }, { "epoch": 3.6780078895463513, "grad_norm": 0.330078125, "learning_rate": 3.966465026350679e-05, "loss": 0.0369, "step": 14918 }, { "epoch": 3.678500986193294, "grad_norm": 0.263671875, "learning_rate": 3.963720108848775e-05, "loss": 0.0362, "step": 14920 }, { "epoch": 3.6789940828402368, "grad_norm": 0.353515625, "learning_rate": 3.96097590667974e-05, "loss": 0.0359, "step": 14922 }, { "epoch": 3.6794871794871797, "grad_norm": 0.33984375, "learning_rate": 3.958232420168778e-05, "loss": 0.0335, "step": 14924 }, { "epoch": 3.6799802761341223, "grad_norm": 0.31640625, "learning_rate": 3.955489649641007e-05, "loss": 0.0376, "step": 14926 }, { "epoch": 3.6804733727810652, "grad_norm": 0.30078125, "learning_rate": 3.952747595421464e-05, "loss": 0.0408, "step": 14928 }, { "epoch": 3.6809664694280078, "grad_norm": 0.2734375, "learning_rate": 3.9500062578350885e-05, "loss": 0.0334, "step": 14930 }, { "epoch": 3.6814595660749507, "grad_norm": 0.369140625, "learning_rate": 3.947265637206756e-05, "loss": 0.0368, "step": 14932 }, { "epoch": 3.6819526627218933, "grad_norm": 0.322265625, "learning_rate": 3.9445257338612373e-05, "loss": 0.0337, "step": 14934 }, { "epoch": 3.6824457593688362, "grad_norm": 0.322265625, "learning_rate": 3.9417865481232274e-05, "loss": 0.0359, "step": 14936 }, { "epoch": 3.682938856015779, "grad_norm": 0.302734375, "learning_rate": 3.939048080317337e-05, "loss": 0.0342, "step": 14938 }, { "epoch": 3.6834319526627217, "grad_norm": 0.33984375, "learning_rate": 3.9363103307680894e-05, "loss": 0.0344, "step": 14940 }, { "epoch": 3.6839250493096647, "grad_norm": 0.310546875, "learning_rate": 3.933573299799922e-05, "loss": 0.0317, "step": 14942 }, { "epoch": 3.6844181459566077, "grad_norm": 0.296875, "learning_rate": 3.93083698773719e-05, "loss": 0.0366, "step": 14944 }, { "epoch": 3.68491124260355, "grad_norm": 0.3203125, "learning_rate": 3.928101394904165e-05, "loss": 0.0319, "step": 14946 }, { "epoch": 3.685404339250493, "grad_norm": 0.306640625, "learning_rate": 3.92536652162502e-05, "loss": 0.0344, "step": 14948 }, { "epoch": 3.685897435897436, "grad_norm": 0.3203125, "learning_rate": 3.9226323682238605e-05, "loss": 0.0342, "step": 14950 }, { "epoch": 3.6863905325443787, "grad_norm": 0.298828125, "learning_rate": 3.9198989350246963e-05, "loss": 0.0304, "step": 14952 }, { "epoch": 3.6868836291913216, "grad_norm": 0.27734375, "learning_rate": 3.917166222351455e-05, "loss": 0.0325, "step": 14954 }, { "epoch": 3.687376725838264, "grad_norm": 0.294921875, "learning_rate": 3.9144342305279794e-05, "loss": 0.0348, "step": 14956 }, { "epoch": 3.687869822485207, "grad_norm": 0.3203125, "learning_rate": 3.911702959878024e-05, "loss": 0.0341, "step": 14958 }, { "epoch": 3.6883629191321496, "grad_norm": 0.3203125, "learning_rate": 3.9089724107252645e-05, "loss": 0.0338, "step": 14960 }, { "epoch": 3.6888560157790926, "grad_norm": 0.314453125, "learning_rate": 3.906242583393279e-05, "loss": 0.0355, "step": 14962 }, { "epoch": 3.6893491124260356, "grad_norm": 0.298828125, "learning_rate": 3.90351347820557e-05, "loss": 0.0315, "step": 14964 }, { "epoch": 3.689842209072978, "grad_norm": 0.314453125, "learning_rate": 3.900785095485553e-05, "loss": 0.0374, "step": 14966 }, { "epoch": 3.690335305719921, "grad_norm": 0.3125, "learning_rate": 3.898057435556556e-05, "loss": 0.029, "step": 14968 }, { "epoch": 3.690828402366864, "grad_norm": 0.294921875, "learning_rate": 3.8953304987418205e-05, "loss": 0.0317, "step": 14970 }, { "epoch": 3.6913214990138066, "grad_norm": 0.26953125, "learning_rate": 3.8926042853645064e-05, "loss": 0.0322, "step": 14972 }, { "epoch": 3.6918145956607495, "grad_norm": 0.27734375, "learning_rate": 3.889878795747683e-05, "loss": 0.0326, "step": 14974 }, { "epoch": 3.6923076923076925, "grad_norm": 0.349609375, "learning_rate": 3.8871540302143406e-05, "loss": 0.0338, "step": 14976 }, { "epoch": 3.692800788954635, "grad_norm": 0.30859375, "learning_rate": 3.884429989087371e-05, "loss": 0.0316, "step": 14978 }, { "epoch": 3.693293885601578, "grad_norm": 0.380859375, "learning_rate": 3.8817066726895945e-05, "loss": 0.0379, "step": 14980 }, { "epoch": 3.693786982248521, "grad_norm": 0.27734375, "learning_rate": 3.878984081343736e-05, "loss": 0.0335, "step": 14982 }, { "epoch": 3.6942800788954635, "grad_norm": 0.291015625, "learning_rate": 3.876262215372439e-05, "loss": 0.0344, "step": 14984 }, { "epoch": 3.6947731755424065, "grad_norm": 0.29296875, "learning_rate": 3.873541075098261e-05, "loss": 0.037, "step": 14986 }, { "epoch": 3.695266272189349, "grad_norm": 0.302734375, "learning_rate": 3.8708206608436715e-05, "loss": 0.0355, "step": 14988 }, { "epoch": 3.695759368836292, "grad_norm": 0.357421875, "learning_rate": 3.868100972931058e-05, "loss": 0.0364, "step": 14990 }, { "epoch": 3.6962524654832345, "grad_norm": 0.330078125, "learning_rate": 3.865382011682711e-05, "loss": 0.0357, "step": 14992 }, { "epoch": 3.6967455621301775, "grad_norm": 0.3203125, "learning_rate": 3.862663777420847e-05, "loss": 0.0376, "step": 14994 }, { "epoch": 3.6972386587771204, "grad_norm": 0.34765625, "learning_rate": 3.8599462704675926e-05, "loss": 0.0348, "step": 14996 }, { "epoch": 3.697731755424063, "grad_norm": 0.27734375, "learning_rate": 3.857229491144987e-05, "loss": 0.034, "step": 14998 }, { "epoch": 3.698224852071006, "grad_norm": 0.31640625, "learning_rate": 3.854513439774984e-05, "loss": 0.035, "step": 15000 }, { "epoch": 3.698717948717949, "grad_norm": 0.302734375, "learning_rate": 3.851798116679449e-05, "loss": 0.0351, "step": 15002 }, { "epoch": 3.6992110453648914, "grad_norm": 0.29296875, "learning_rate": 3.849083522180168e-05, "loss": 0.0336, "step": 15004 }, { "epoch": 3.6997041420118344, "grad_norm": 0.4140625, "learning_rate": 3.846369656598825e-05, "loss": 0.0355, "step": 15006 }, { "epoch": 3.7001972386587774, "grad_norm": 0.30859375, "learning_rate": 3.843656520257043e-05, "loss": 0.0358, "step": 15008 }, { "epoch": 3.70069033530572, "grad_norm": 0.326171875, "learning_rate": 3.840944113476331e-05, "loss": 0.0393, "step": 15010 }, { "epoch": 3.701183431952663, "grad_norm": 0.330078125, "learning_rate": 3.838232436578131e-05, "loss": 0.0373, "step": 15012 }, { "epoch": 3.7016765285996054, "grad_norm": 0.369140625, "learning_rate": 3.8355214898837896e-05, "loss": 0.0347, "step": 15014 }, { "epoch": 3.7021696252465484, "grad_norm": 0.318359375, "learning_rate": 3.832811273714569e-05, "loss": 0.0333, "step": 15016 }, { "epoch": 3.702662721893491, "grad_norm": 0.37890625, "learning_rate": 3.83010178839165e-05, "loss": 0.0354, "step": 15018 }, { "epoch": 3.703155818540434, "grad_norm": 0.359375, "learning_rate": 3.82739303423611e-05, "loss": 0.0344, "step": 15020 }, { "epoch": 3.703648915187377, "grad_norm": 0.271484375, "learning_rate": 3.824685011568966e-05, "loss": 0.0346, "step": 15022 }, { "epoch": 3.7041420118343193, "grad_norm": 0.34375, "learning_rate": 3.82197772071112e-05, "loss": 0.0387, "step": 15024 }, { "epoch": 3.7046351084812623, "grad_norm": 0.29296875, "learning_rate": 3.819271161983414e-05, "loss": 0.0355, "step": 15026 }, { "epoch": 3.7051282051282053, "grad_norm": 0.296875, "learning_rate": 3.816565335706581e-05, "loss": 0.0323, "step": 15028 }, { "epoch": 3.705621301775148, "grad_norm": 0.314453125, "learning_rate": 3.8138602422012793e-05, "loss": 0.03, "step": 15030 }, { "epoch": 3.706114398422091, "grad_norm": 0.3671875, "learning_rate": 3.811155881788082e-05, "loss": 0.0319, "step": 15032 }, { "epoch": 3.7066074950690338, "grad_norm": 0.32421875, "learning_rate": 3.808452254787459e-05, "loss": 0.035, "step": 15034 }, { "epoch": 3.7071005917159763, "grad_norm": 0.26171875, "learning_rate": 3.80574936151982e-05, "loss": 0.0332, "step": 15036 }, { "epoch": 3.7075936883629192, "grad_norm": 0.265625, "learning_rate": 3.8030472023054595e-05, "loss": 0.0302, "step": 15038 }, { "epoch": 3.7080867850098618, "grad_norm": 0.236328125, "learning_rate": 3.800345777464612e-05, "loss": 0.0309, "step": 15040 }, { "epoch": 3.7085798816568047, "grad_norm": 0.30078125, "learning_rate": 3.7976450873174005e-05, "loss": 0.0352, "step": 15042 }, { "epoch": 3.7090729783037473, "grad_norm": 0.357421875, "learning_rate": 3.794945132183876e-05, "loss": 0.0388, "step": 15044 }, { "epoch": 3.7095660749506902, "grad_norm": 0.28125, "learning_rate": 3.792245912384e-05, "loss": 0.0344, "step": 15046 }, { "epoch": 3.710059171597633, "grad_norm": 0.33203125, "learning_rate": 3.7895474282376375e-05, "loss": 0.0346, "step": 15048 }, { "epoch": 3.7105522682445757, "grad_norm": 0.30859375, "learning_rate": 3.786849680064586e-05, "loss": 0.0346, "step": 15050 }, { "epoch": 3.7110453648915187, "grad_norm": 0.306640625, "learning_rate": 3.7841526681845305e-05, "loss": 0.0375, "step": 15052 }, { "epoch": 3.7115384615384617, "grad_norm": 0.296875, "learning_rate": 3.781456392917095e-05, "loss": 0.0335, "step": 15054 }, { "epoch": 3.712031558185404, "grad_norm": 0.322265625, "learning_rate": 3.778760854581794e-05, "loss": 0.0329, "step": 15056 }, { "epoch": 3.712524654832347, "grad_norm": 0.279296875, "learning_rate": 3.7760660534980666e-05, "loss": 0.031, "step": 15058 }, { "epoch": 3.71301775147929, "grad_norm": 0.3125, "learning_rate": 3.773371989985265e-05, "loss": 0.0349, "step": 15060 }, { "epoch": 3.7135108481262327, "grad_norm": 0.3359375, "learning_rate": 3.770678664362641e-05, "loss": 0.0353, "step": 15062 }, { "epoch": 3.7140039447731756, "grad_norm": 0.3203125, "learning_rate": 3.7679860769493825e-05, "loss": 0.0372, "step": 15064 }, { "epoch": 3.7144970414201186, "grad_norm": 0.341796875, "learning_rate": 3.765294228064562e-05, "loss": 0.0365, "step": 15066 }, { "epoch": 3.714990138067061, "grad_norm": 0.43359375, "learning_rate": 3.762603118027193e-05, "loss": 0.0372, "step": 15068 }, { "epoch": 3.715483234714004, "grad_norm": 0.287109375, "learning_rate": 3.7599127471561745e-05, "loss": 0.0348, "step": 15070 }, { "epoch": 3.7159763313609466, "grad_norm": 0.294921875, "learning_rate": 3.757223115770336e-05, "loss": 0.0349, "step": 15072 }, { "epoch": 3.7164694280078896, "grad_norm": 0.341796875, "learning_rate": 3.754534224188415e-05, "loss": 0.0336, "step": 15074 }, { "epoch": 3.716962524654832, "grad_norm": 0.2734375, "learning_rate": 3.751846072729057e-05, "loss": 0.0281, "step": 15076 }, { "epoch": 3.717455621301775, "grad_norm": 0.314453125, "learning_rate": 3.74915866171083e-05, "loss": 0.0357, "step": 15078 }, { "epoch": 3.717948717948718, "grad_norm": 0.298828125, "learning_rate": 3.746471991452193e-05, "loss": 0.0374, "step": 15080 }, { "epoch": 3.7184418145956606, "grad_norm": 0.337890625, "learning_rate": 3.743786062271548e-05, "loss": 0.0364, "step": 15082 }, { "epoch": 3.7189349112426036, "grad_norm": 0.318359375, "learning_rate": 3.741100874487181e-05, "loss": 0.0356, "step": 15084 }, { "epoch": 3.7194280078895465, "grad_norm": 0.298828125, "learning_rate": 3.7384164284173064e-05, "loss": 0.034, "step": 15086 }, { "epoch": 3.719921104536489, "grad_norm": 0.298828125, "learning_rate": 3.7357327243800446e-05, "loss": 0.0349, "step": 15088 }, { "epoch": 3.720414201183432, "grad_norm": 0.32421875, "learning_rate": 3.7330497626934305e-05, "loss": 0.0398, "step": 15090 }, { "epoch": 3.720907297830375, "grad_norm": 0.349609375, "learning_rate": 3.7303675436754126e-05, "loss": 0.0353, "step": 15092 }, { "epoch": 3.7214003944773175, "grad_norm": 0.326171875, "learning_rate": 3.72768606764384e-05, "loss": 0.0346, "step": 15094 }, { "epoch": 3.7218934911242605, "grad_norm": 0.291015625, "learning_rate": 3.725005334916495e-05, "loss": 0.0342, "step": 15096 }, { "epoch": 3.722386587771203, "grad_norm": 0.294921875, "learning_rate": 3.722325345811048e-05, "loss": 0.0333, "step": 15098 }, { "epoch": 3.722879684418146, "grad_norm": 0.326171875, "learning_rate": 3.719646100645099e-05, "loss": 0.0331, "step": 15100 }, { "epoch": 3.7233727810650885, "grad_norm": 0.337890625, "learning_rate": 3.716967599736152e-05, "loss": 0.0348, "step": 15102 }, { "epoch": 3.7238658777120315, "grad_norm": 0.318359375, "learning_rate": 3.714289843401625e-05, "loss": 0.0327, "step": 15104 }, { "epoch": 3.7243589743589745, "grad_norm": 0.287109375, "learning_rate": 3.711612831958845e-05, "loss": 0.0327, "step": 15106 }, { "epoch": 3.724852071005917, "grad_norm": 0.330078125, "learning_rate": 3.7089365657250555e-05, "loss": 0.0349, "step": 15108 }, { "epoch": 3.72534516765286, "grad_norm": 0.279296875, "learning_rate": 3.7062610450174105e-05, "loss": 0.0357, "step": 15110 }, { "epoch": 3.725838264299803, "grad_norm": 0.296875, "learning_rate": 3.703586270152968e-05, "loss": 0.037, "step": 15112 }, { "epoch": 3.7263313609467454, "grad_norm": 0.263671875, "learning_rate": 3.700912241448707e-05, "loss": 0.0368, "step": 15114 }, { "epoch": 3.7268244575936884, "grad_norm": 0.34765625, "learning_rate": 3.698238959221515e-05, "loss": 0.0349, "step": 15116 }, { "epoch": 3.7273175542406314, "grad_norm": 0.283203125, "learning_rate": 3.6955664237881916e-05, "loss": 0.0304, "step": 15118 }, { "epoch": 3.727810650887574, "grad_norm": 0.28125, "learning_rate": 3.692894635465446e-05, "loss": 0.0339, "step": 15120 }, { "epoch": 3.728303747534517, "grad_norm": 0.298828125, "learning_rate": 3.690223594569901e-05, "loss": 0.0354, "step": 15122 }, { "epoch": 3.7287968441814594, "grad_norm": 0.349609375, "learning_rate": 3.687553301418092e-05, "loss": 0.0354, "step": 15124 }, { "epoch": 3.7292899408284024, "grad_norm": 0.294921875, "learning_rate": 3.684883756326457e-05, "loss": 0.0335, "step": 15126 }, { "epoch": 3.729783037475345, "grad_norm": 0.34375, "learning_rate": 3.6822149596113574e-05, "loss": 0.0344, "step": 15128 }, { "epoch": 3.730276134122288, "grad_norm": 0.29296875, "learning_rate": 3.679546911589058e-05, "loss": 0.036, "step": 15130 }, { "epoch": 3.730769230769231, "grad_norm": 0.330078125, "learning_rate": 3.6768796125757384e-05, "loss": 0.035, "step": 15132 }, { "epoch": 3.7312623274161734, "grad_norm": 0.29296875, "learning_rate": 3.674213062887489e-05, "loss": 0.0333, "step": 15134 }, { "epoch": 3.7317554240631163, "grad_norm": 0.296875, "learning_rate": 3.67154726284031e-05, "loss": 0.0319, "step": 15136 }, { "epoch": 3.7322485207100593, "grad_norm": 0.275390625, "learning_rate": 3.668882212750115e-05, "loss": 0.0338, "step": 15138 }, { "epoch": 3.732741617357002, "grad_norm": 0.294921875, "learning_rate": 3.666217912932729e-05, "loss": 0.0353, "step": 15140 }, { "epoch": 3.733234714003945, "grad_norm": 0.259765625, "learning_rate": 3.663554363703879e-05, "loss": 0.0332, "step": 15142 }, { "epoch": 3.7337278106508878, "grad_norm": 0.3203125, "learning_rate": 3.6608915653792174e-05, "loss": 0.0355, "step": 15144 }, { "epoch": 3.7342209072978303, "grad_norm": 0.32421875, "learning_rate": 3.6582295182742964e-05, "loss": 0.0334, "step": 15146 }, { "epoch": 3.7347140039447733, "grad_norm": 0.322265625, "learning_rate": 3.6555682227045864e-05, "loss": 0.0344, "step": 15148 }, { "epoch": 3.7352071005917162, "grad_norm": 0.33203125, "learning_rate": 3.652907678985464e-05, "loss": 0.0379, "step": 15150 }, { "epoch": 3.7357001972386588, "grad_norm": 0.3203125, "learning_rate": 3.65024788743222e-05, "loss": 0.0342, "step": 15152 }, { "epoch": 3.7361932938856017, "grad_norm": 0.283203125, "learning_rate": 3.647588848360054e-05, "loss": 0.0334, "step": 15154 }, { "epoch": 3.7366863905325443, "grad_norm": 0.32421875, "learning_rate": 3.644930562084076e-05, "loss": 0.0366, "step": 15156 }, { "epoch": 3.7371794871794872, "grad_norm": 0.26953125, "learning_rate": 3.6422730289193116e-05, "loss": 0.0313, "step": 15158 }, { "epoch": 3.7376725838264298, "grad_norm": 0.275390625, "learning_rate": 3.6396162491806884e-05, "loss": 0.0313, "step": 15160 }, { "epoch": 3.7381656804733727, "grad_norm": 0.298828125, "learning_rate": 3.63696022318305e-05, "loss": 0.0349, "step": 15162 }, { "epoch": 3.7386587771203157, "grad_norm": 0.2421875, "learning_rate": 3.634304951241153e-05, "loss": 0.0304, "step": 15164 }, { "epoch": 3.739151873767258, "grad_norm": 0.296875, "learning_rate": 3.63165043366966e-05, "loss": 0.0343, "step": 15166 }, { "epoch": 3.739644970414201, "grad_norm": 0.2890625, "learning_rate": 3.628996670783148e-05, "loss": 0.0375, "step": 15168 }, { "epoch": 3.740138067061144, "grad_norm": 0.322265625, "learning_rate": 3.6263436628961e-05, "loss": 0.035, "step": 15170 }, { "epoch": 3.7406311637080867, "grad_norm": 0.287109375, "learning_rate": 3.623691410322918e-05, "loss": 0.0339, "step": 15172 }, { "epoch": 3.7411242603550297, "grad_norm": 0.3828125, "learning_rate": 3.6210399133779014e-05, "loss": 0.0361, "step": 15174 }, { "epoch": 3.7416173570019726, "grad_norm": 0.291015625, "learning_rate": 3.61838917237527e-05, "loss": 0.0355, "step": 15176 }, { "epoch": 3.742110453648915, "grad_norm": 0.337890625, "learning_rate": 3.6157391876291535e-05, "loss": 0.0379, "step": 15178 }, { "epoch": 3.742603550295858, "grad_norm": 0.32421875, "learning_rate": 3.613089959453587e-05, "loss": 0.0314, "step": 15180 }, { "epoch": 3.7430966469428006, "grad_norm": 0.26953125, "learning_rate": 3.6104414881625215e-05, "loss": 0.0301, "step": 15182 }, { "epoch": 3.7435897435897436, "grad_norm": 0.279296875, "learning_rate": 3.607793774069813e-05, "loss": 0.0345, "step": 15184 }, { "epoch": 3.744082840236686, "grad_norm": 0.31640625, "learning_rate": 3.605146817489236e-05, "loss": 0.0335, "step": 15186 }, { "epoch": 3.744575936883629, "grad_norm": 0.318359375, "learning_rate": 3.602500618734459e-05, "loss": 0.036, "step": 15188 }, { "epoch": 3.745069033530572, "grad_norm": 0.302734375, "learning_rate": 3.599855178119084e-05, "loss": 0.0307, "step": 15190 }, { "epoch": 3.7455621301775146, "grad_norm": 0.279296875, "learning_rate": 3.5972104959566e-05, "loss": 0.0324, "step": 15192 }, { "epoch": 3.7460552268244576, "grad_norm": 0.31640625, "learning_rate": 3.5945665725604216e-05, "loss": 0.0368, "step": 15194 }, { "epoch": 3.7465483234714005, "grad_norm": 0.255859375, "learning_rate": 3.591923408243868e-05, "loss": 0.0298, "step": 15196 }, { "epoch": 3.747041420118343, "grad_norm": 0.302734375, "learning_rate": 3.589281003320168e-05, "loss": 0.0325, "step": 15198 }, { "epoch": 3.747534516765286, "grad_norm": 0.3515625, "learning_rate": 3.586639358102466e-05, "loss": 0.0342, "step": 15200 }, { "epoch": 3.748027613412229, "grad_norm": 0.275390625, "learning_rate": 3.583998472903801e-05, "loss": 0.0367, "step": 15202 }, { "epoch": 3.7485207100591715, "grad_norm": 0.32421875, "learning_rate": 3.581358348037146e-05, "loss": 0.0363, "step": 15204 }, { "epoch": 3.7490138067061145, "grad_norm": 0.33984375, "learning_rate": 3.57871898381536e-05, "loss": 0.0332, "step": 15206 }, { "epoch": 3.7495069033530575, "grad_norm": 0.30078125, "learning_rate": 3.5760803805512266e-05, "loss": 0.0388, "step": 15208 }, { "epoch": 3.75, "grad_norm": 0.29296875, "learning_rate": 3.573442538557434e-05, "loss": 0.0346, "step": 15210 }, { "epoch": 3.7504930966469425, "grad_norm": 0.322265625, "learning_rate": 3.570805458146582e-05, "loss": 0.034, "step": 15212 }, { "epoch": 3.7509861932938855, "grad_norm": 0.275390625, "learning_rate": 3.568169139631183e-05, "loss": 0.037, "step": 15214 }, { "epoch": 3.7514792899408285, "grad_norm": 0.29296875, "learning_rate": 3.565533583323646e-05, "loss": 0.0336, "step": 15216 }, { "epoch": 3.751972386587771, "grad_norm": 0.27734375, "learning_rate": 3.56289878953631e-05, "loss": 0.0371, "step": 15218 }, { "epoch": 3.752465483234714, "grad_norm": 0.32421875, "learning_rate": 3.560264758581402e-05, "loss": 0.0352, "step": 15220 }, { "epoch": 3.752958579881657, "grad_norm": 0.29296875, "learning_rate": 3.557631490771081e-05, "loss": 0.0362, "step": 15222 }, { "epoch": 3.7534516765285995, "grad_norm": 0.31640625, "learning_rate": 3.5549989864173946e-05, "loss": 0.033, "step": 15224 }, { "epoch": 3.7539447731755424, "grad_norm": 0.38671875, "learning_rate": 3.5523672458323134e-05, "loss": 0.0348, "step": 15226 }, { "epoch": 3.7544378698224854, "grad_norm": 0.2578125, "learning_rate": 3.549736269327715e-05, "loss": 0.0332, "step": 15228 }, { "epoch": 3.754930966469428, "grad_norm": 0.310546875, "learning_rate": 3.547106057215376e-05, "loss": 0.0336, "step": 15230 }, { "epoch": 3.755424063116371, "grad_norm": 0.28515625, "learning_rate": 3.5444766098070034e-05, "loss": 0.0361, "step": 15232 }, { "epoch": 3.755917159763314, "grad_norm": 0.279296875, "learning_rate": 3.541847927414189e-05, "loss": 0.0351, "step": 15234 }, { "epoch": 3.7564102564102564, "grad_norm": 0.353515625, "learning_rate": 3.539220010348461e-05, "loss": 0.0349, "step": 15236 }, { "epoch": 3.7569033530571994, "grad_norm": 0.291015625, "learning_rate": 3.5365928589212296e-05, "loss": 0.0287, "step": 15238 }, { "epoch": 3.757396449704142, "grad_norm": 0.298828125, "learning_rate": 3.533966473443832e-05, "loss": 0.034, "step": 15240 }, { "epoch": 3.757889546351085, "grad_norm": 0.369140625, "learning_rate": 3.531340854227513e-05, "loss": 0.0338, "step": 15242 }, { "epoch": 3.7583826429980274, "grad_norm": 0.32421875, "learning_rate": 3.528716001583411e-05, "loss": 0.0343, "step": 15244 }, { "epoch": 3.7588757396449703, "grad_norm": 0.263671875, "learning_rate": 3.526091915822601e-05, "loss": 0.0313, "step": 15246 }, { "epoch": 3.7593688362919133, "grad_norm": 0.3671875, "learning_rate": 3.5234685972560386e-05, "loss": 0.0343, "step": 15248 }, { "epoch": 3.759861932938856, "grad_norm": 0.361328125, "learning_rate": 3.520846046194614e-05, "loss": 0.035, "step": 15250 }, { "epoch": 3.760355029585799, "grad_norm": 0.326171875, "learning_rate": 3.5182242629491046e-05, "loss": 0.032, "step": 15252 }, { "epoch": 3.760848126232742, "grad_norm": 0.32421875, "learning_rate": 3.515603247830209e-05, "loss": 0.033, "step": 15254 }, { "epoch": 3.7613412228796843, "grad_norm": 0.34765625, "learning_rate": 3.5129830011485366e-05, "loss": 0.0335, "step": 15256 }, { "epoch": 3.7618343195266273, "grad_norm": 0.287109375, "learning_rate": 3.5103635232145916e-05, "loss": 0.0321, "step": 15258 }, { "epoch": 3.7623274161735702, "grad_norm": 0.3125, "learning_rate": 3.5077448143388084e-05, "loss": 0.0334, "step": 15260 }, { "epoch": 3.7628205128205128, "grad_norm": 0.287109375, "learning_rate": 3.505126874831506e-05, "loss": 0.0329, "step": 15262 }, { "epoch": 3.7633136094674557, "grad_norm": 0.29296875, "learning_rate": 3.502509705002939e-05, "loss": 0.0342, "step": 15264 }, { "epoch": 3.7638067061143983, "grad_norm": 0.296875, "learning_rate": 3.499893305163246e-05, "loss": 0.0323, "step": 15266 }, { "epoch": 3.7642998027613412, "grad_norm": 0.376953125, "learning_rate": 3.497277675622487e-05, "loss": 0.036, "step": 15268 }, { "epoch": 3.7647928994082838, "grad_norm": 0.353515625, "learning_rate": 3.49466281669063e-05, "loss": 0.0357, "step": 15270 }, { "epoch": 3.7652859960552267, "grad_norm": 0.3046875, "learning_rate": 3.4920487286775517e-05, "loss": 0.0292, "step": 15272 }, { "epoch": 3.7657790927021697, "grad_norm": 0.3359375, "learning_rate": 3.4894354118930375e-05, "loss": 0.0366, "step": 15274 }, { "epoch": 3.7662721893491122, "grad_norm": 0.302734375, "learning_rate": 3.4868228666467704e-05, "loss": 0.0368, "step": 15276 }, { "epoch": 3.766765285996055, "grad_norm": 0.251953125, "learning_rate": 3.484211093248366e-05, "loss": 0.0312, "step": 15278 }, { "epoch": 3.767258382642998, "grad_norm": 0.328125, "learning_rate": 3.4816000920073235e-05, "loss": 0.0363, "step": 15280 }, { "epoch": 3.7677514792899407, "grad_norm": 0.328125, "learning_rate": 3.478989863233063e-05, "loss": 0.0351, "step": 15282 }, { "epoch": 3.7682445759368837, "grad_norm": 0.267578125, "learning_rate": 3.476380407234913e-05, "loss": 0.0336, "step": 15284 }, { "epoch": 3.7687376725838266, "grad_norm": 0.271484375, "learning_rate": 3.473771724322108e-05, "loss": 0.0345, "step": 15286 }, { "epoch": 3.769230769230769, "grad_norm": 0.2890625, "learning_rate": 3.4711638148037905e-05, "loss": 0.0298, "step": 15288 }, { "epoch": 3.769723865877712, "grad_norm": 0.306640625, "learning_rate": 3.4685566789890145e-05, "loss": 0.0364, "step": 15290 }, { "epoch": 3.770216962524655, "grad_norm": 0.330078125, "learning_rate": 3.4659503171867414e-05, "loss": 0.0324, "step": 15292 }, { "epoch": 3.7707100591715976, "grad_norm": 0.306640625, "learning_rate": 3.463344729705835e-05, "loss": 0.0324, "step": 15294 }, { "epoch": 3.7712031558185406, "grad_norm": 0.29296875, "learning_rate": 3.4607399168550734e-05, "loss": 0.0336, "step": 15296 }, { "epoch": 3.771696252465483, "grad_norm": 0.357421875, "learning_rate": 3.458135878943142e-05, "loss": 0.0352, "step": 15298 }, { "epoch": 3.772189349112426, "grad_norm": 0.2890625, "learning_rate": 3.455532616278634e-05, "loss": 0.0364, "step": 15300 }, { "epoch": 3.7726824457593686, "grad_norm": 0.275390625, "learning_rate": 3.452930129170051e-05, "loss": 0.0335, "step": 15302 }, { "epoch": 3.7731755424063116, "grad_norm": 0.28515625, "learning_rate": 3.4503284179258025e-05, "loss": 0.0334, "step": 15304 }, { "epoch": 3.7736686390532546, "grad_norm": 0.35546875, "learning_rate": 3.4477274828542074e-05, "loss": 0.0322, "step": 15306 }, { "epoch": 3.774161735700197, "grad_norm": 0.36328125, "learning_rate": 3.445127324263485e-05, "loss": 0.0391, "step": 15308 }, { "epoch": 3.77465483234714, "grad_norm": 0.361328125, "learning_rate": 3.442527942461774e-05, "loss": 0.0386, "step": 15310 }, { "epoch": 3.775147928994083, "grad_norm": 0.28515625, "learning_rate": 3.439929337757113e-05, "loss": 0.035, "step": 15312 }, { "epoch": 3.7756410256410255, "grad_norm": 0.306640625, "learning_rate": 3.4373315104574524e-05, "loss": 0.0357, "step": 15314 }, { "epoch": 3.7761341222879685, "grad_norm": 0.337890625, "learning_rate": 3.434734460870649e-05, "loss": 0.0318, "step": 15316 }, { "epoch": 3.7766272189349115, "grad_norm": 0.318359375, "learning_rate": 3.432138189304468e-05, "loss": 0.033, "step": 15318 }, { "epoch": 3.777120315581854, "grad_norm": 0.396484375, "learning_rate": 3.4295426960665814e-05, "loss": 0.0335, "step": 15320 }, { "epoch": 3.777613412228797, "grad_norm": 0.33984375, "learning_rate": 3.4269479814645745e-05, "loss": 0.0332, "step": 15322 }, { "epoch": 3.7781065088757395, "grad_norm": 0.3046875, "learning_rate": 3.4243540458059264e-05, "loss": 0.0364, "step": 15324 }, { "epoch": 3.7785996055226825, "grad_norm": 0.302734375, "learning_rate": 3.421760889398038e-05, "loss": 0.0347, "step": 15326 }, { "epoch": 3.779092702169625, "grad_norm": 0.294921875, "learning_rate": 3.419168512548212e-05, "loss": 0.0326, "step": 15328 }, { "epoch": 3.779585798816568, "grad_norm": 0.3203125, "learning_rate": 3.4165769155636604e-05, "loss": 0.036, "step": 15330 }, { "epoch": 3.780078895463511, "grad_norm": 0.318359375, "learning_rate": 3.4139860987515014e-05, "loss": 0.0328, "step": 15332 }, { "epoch": 3.7805719921104535, "grad_norm": 0.2734375, "learning_rate": 3.4113960624187615e-05, "loss": 0.0326, "step": 15334 }, { "epoch": 3.7810650887573964, "grad_norm": 0.31640625, "learning_rate": 3.408806806872376e-05, "loss": 0.0334, "step": 15336 }, { "epoch": 3.7815581854043394, "grad_norm": 0.2890625, "learning_rate": 3.40621833241918e-05, "loss": 0.0334, "step": 15338 }, { "epoch": 3.782051282051282, "grad_norm": 0.296875, "learning_rate": 3.403630639365932e-05, "loss": 0.0365, "step": 15340 }, { "epoch": 3.782544378698225, "grad_norm": 0.443359375, "learning_rate": 3.40104372801928e-05, "loss": 0.0364, "step": 15342 }, { "epoch": 3.783037475345168, "grad_norm": 0.3203125, "learning_rate": 3.3984575986857905e-05, "loss": 0.0317, "step": 15344 }, { "epoch": 3.7835305719921104, "grad_norm": 0.314453125, "learning_rate": 3.395872251671932e-05, "loss": 0.0372, "step": 15346 }, { "epoch": 3.7840236686390534, "grad_norm": 0.337890625, "learning_rate": 3.393287687284086e-05, "loss": 0.0311, "step": 15348 }, { "epoch": 3.784516765285996, "grad_norm": 0.3203125, "learning_rate": 3.3907039058285387e-05, "loss": 0.0333, "step": 15350 }, { "epoch": 3.785009861932939, "grad_norm": 0.330078125, "learning_rate": 3.388120907611474e-05, "loss": 0.0387, "step": 15352 }, { "epoch": 3.7855029585798814, "grad_norm": 0.2734375, "learning_rate": 3.3855386929390045e-05, "loss": 0.0304, "step": 15354 }, { "epoch": 3.7859960552268244, "grad_norm": 0.28125, "learning_rate": 3.382957262117127e-05, "loss": 0.0319, "step": 15356 }, { "epoch": 3.7864891518737673, "grad_norm": 0.28125, "learning_rate": 3.3803766154517577e-05, "loss": 0.0322, "step": 15358 }, { "epoch": 3.78698224852071, "grad_norm": 0.283203125, "learning_rate": 3.37779675324872e-05, "loss": 0.0345, "step": 15360 }, { "epoch": 3.787475345167653, "grad_norm": 0.287109375, "learning_rate": 3.37521767581374e-05, "loss": 0.0346, "step": 15362 }, { "epoch": 3.787968441814596, "grad_norm": 0.32421875, "learning_rate": 3.372639383452456e-05, "loss": 0.0351, "step": 15364 }, { "epoch": 3.7884615384615383, "grad_norm": 0.30078125, "learning_rate": 3.370061876470403e-05, "loss": 0.0371, "step": 15366 }, { "epoch": 3.7889546351084813, "grad_norm": 0.3359375, "learning_rate": 3.367485155173039e-05, "loss": 0.0326, "step": 15368 }, { "epoch": 3.7894477317554243, "grad_norm": 0.27734375, "learning_rate": 3.36490921986571e-05, "loss": 0.0368, "step": 15370 }, { "epoch": 3.789940828402367, "grad_norm": 0.298828125, "learning_rate": 3.36233407085369e-05, "loss": 0.0352, "step": 15372 }, { "epoch": 3.7904339250493098, "grad_norm": 0.32421875, "learning_rate": 3.359759708442139e-05, "loss": 0.032, "step": 15374 }, { "epoch": 3.7909270216962527, "grad_norm": 0.34375, "learning_rate": 3.3571861329361385e-05, "loss": 0.0365, "step": 15376 }, { "epoch": 3.7914201183431953, "grad_norm": 0.283203125, "learning_rate": 3.354613344640672e-05, "loss": 0.0326, "step": 15378 }, { "epoch": 3.7919132149901382, "grad_norm": 0.306640625, "learning_rate": 3.352041343860621e-05, "loss": 0.0345, "step": 15380 }, { "epoch": 3.7924063116370808, "grad_norm": 0.3046875, "learning_rate": 3.3494701309007945e-05, "loss": 0.0346, "step": 15382 }, { "epoch": 3.7928994082840237, "grad_norm": 0.30078125, "learning_rate": 3.3468997060658834e-05, "loss": 0.0315, "step": 15384 }, { "epoch": 3.7933925049309662, "grad_norm": 0.296875, "learning_rate": 3.34433006966051e-05, "loss": 0.0327, "step": 15386 }, { "epoch": 3.793885601577909, "grad_norm": 0.36328125, "learning_rate": 3.341761221989179e-05, "loss": 0.0339, "step": 15388 }, { "epoch": 3.794378698224852, "grad_norm": 0.333984375, "learning_rate": 3.339193163356319e-05, "loss": 0.0324, "step": 15390 }, { "epoch": 3.7948717948717947, "grad_norm": 0.29296875, "learning_rate": 3.336625894066262e-05, "loss": 0.0336, "step": 15392 }, { "epoch": 3.7953648915187377, "grad_norm": 0.306640625, "learning_rate": 3.334059414423233e-05, "loss": 0.0345, "step": 15394 }, { "epoch": 3.7958579881656807, "grad_norm": 0.30859375, "learning_rate": 3.331493724731387e-05, "loss": 0.0335, "step": 15396 }, { "epoch": 3.796351084812623, "grad_norm": 0.3359375, "learning_rate": 3.328928825294761e-05, "loss": 0.0337, "step": 15398 }, { "epoch": 3.796844181459566, "grad_norm": 0.30078125, "learning_rate": 3.3263647164173227e-05, "loss": 0.0353, "step": 15400 }, { "epoch": 3.797337278106509, "grad_norm": 0.30078125, "learning_rate": 3.323801398402923e-05, "loss": 0.0323, "step": 15402 }, { "epoch": 3.7978303747534516, "grad_norm": 0.314453125, "learning_rate": 3.321238871555332e-05, "loss": 0.0327, "step": 15404 }, { "epoch": 3.7983234714003946, "grad_norm": 0.298828125, "learning_rate": 3.318677136178228e-05, "loss": 0.0339, "step": 15406 }, { "epoch": 3.798816568047337, "grad_norm": 0.265625, "learning_rate": 3.31611619257518e-05, "loss": 0.0295, "step": 15408 }, { "epoch": 3.79930966469428, "grad_norm": 0.365234375, "learning_rate": 3.313556041049689e-05, "loss": 0.0339, "step": 15410 }, { "epoch": 3.7998027613412226, "grad_norm": 0.267578125, "learning_rate": 3.310996681905132e-05, "loss": 0.031, "step": 15412 }, { "epoch": 3.8002958579881656, "grad_norm": 0.32421875, "learning_rate": 3.308438115444821e-05, "loss": 0.0351, "step": 15414 }, { "epoch": 3.8007889546351086, "grad_norm": 0.39453125, "learning_rate": 3.305880341971951e-05, "loss": 0.0358, "step": 15416 }, { "epoch": 3.801282051282051, "grad_norm": 0.337890625, "learning_rate": 3.3033233617896364e-05, "loss": 0.0361, "step": 15418 }, { "epoch": 3.801775147928994, "grad_norm": 0.296875, "learning_rate": 3.300767175200893e-05, "loss": 0.0334, "step": 15420 }, { "epoch": 3.802268244575937, "grad_norm": 0.296875, "learning_rate": 3.298211782508642e-05, "loss": 0.0327, "step": 15422 }, { "epoch": 3.8027613412228796, "grad_norm": 0.294921875, "learning_rate": 3.295657184015717e-05, "loss": 0.0332, "step": 15424 }, { "epoch": 3.8032544378698225, "grad_norm": 0.28125, "learning_rate": 3.29310338002484e-05, "loss": 0.0376, "step": 15426 }, { "epoch": 3.8037475345167655, "grad_norm": 0.31640625, "learning_rate": 3.290550370838668e-05, "loss": 0.0326, "step": 15428 }, { "epoch": 3.804240631163708, "grad_norm": 0.2890625, "learning_rate": 3.287998156759732e-05, "loss": 0.0361, "step": 15430 }, { "epoch": 3.804733727810651, "grad_norm": 0.291015625, "learning_rate": 3.285446738090491e-05, "loss": 0.032, "step": 15432 }, { "epoch": 3.8052268244575935, "grad_norm": 0.28125, "learning_rate": 3.282896115133301e-05, "loss": 0.0338, "step": 15434 }, { "epoch": 3.8057199211045365, "grad_norm": 0.345703125, "learning_rate": 3.280346288190423e-05, "loss": 0.0386, "step": 15436 }, { "epoch": 3.806213017751479, "grad_norm": 0.361328125, "learning_rate": 3.2777972575640325e-05, "loss": 0.0375, "step": 15438 }, { "epoch": 3.806706114398422, "grad_norm": 0.310546875, "learning_rate": 3.275249023556192e-05, "loss": 0.0369, "step": 15440 }, { "epoch": 3.807199211045365, "grad_norm": 0.314453125, "learning_rate": 3.272701586468895e-05, "loss": 0.0337, "step": 15442 }, { "epoch": 3.8076923076923075, "grad_norm": 0.34765625, "learning_rate": 3.270154946604016e-05, "loss": 0.0355, "step": 15444 }, { "epoch": 3.8081854043392505, "grad_norm": 0.375, "learning_rate": 3.267609104263352e-05, "loss": 0.0312, "step": 15446 }, { "epoch": 3.8086785009861934, "grad_norm": 0.271484375, "learning_rate": 3.2650640597485973e-05, "loss": 0.0324, "step": 15448 }, { "epoch": 3.809171597633136, "grad_norm": 0.322265625, "learning_rate": 3.2625198133613554e-05, "loss": 0.0346, "step": 15450 }, { "epoch": 3.809664694280079, "grad_norm": 0.349609375, "learning_rate": 3.2599763654031325e-05, "loss": 0.0363, "step": 15452 }, { "epoch": 3.810157790927022, "grad_norm": 0.29296875, "learning_rate": 3.257433716175341e-05, "loss": 0.0339, "step": 15454 }, { "epoch": 3.8106508875739644, "grad_norm": 0.3046875, "learning_rate": 3.254891865979304e-05, "loss": 0.0346, "step": 15456 }, { "epoch": 3.8111439842209074, "grad_norm": 0.3203125, "learning_rate": 3.252350815116238e-05, "loss": 0.0325, "step": 15458 }, { "epoch": 3.8116370808678504, "grad_norm": 0.390625, "learning_rate": 3.2498105638872724e-05, "loss": 0.0337, "step": 15460 }, { "epoch": 3.812130177514793, "grad_norm": 0.326171875, "learning_rate": 3.247271112593444e-05, "loss": 0.0327, "step": 15462 }, { "epoch": 3.812623274161736, "grad_norm": 0.2890625, "learning_rate": 3.244732461535692e-05, "loss": 0.0346, "step": 15464 }, { "epoch": 3.8131163708086784, "grad_norm": 0.328125, "learning_rate": 3.242194611014859e-05, "loss": 0.033, "step": 15466 }, { "epoch": 3.8136094674556213, "grad_norm": 0.29296875, "learning_rate": 3.239657561331696e-05, "loss": 0.0328, "step": 15468 }, { "epoch": 3.814102564102564, "grad_norm": 0.34375, "learning_rate": 3.237121312786857e-05, "loss": 0.0348, "step": 15470 }, { "epoch": 3.814595660749507, "grad_norm": 0.435546875, "learning_rate": 3.234585865680905e-05, "loss": 0.0352, "step": 15472 }, { "epoch": 3.81508875739645, "grad_norm": 0.3359375, "learning_rate": 3.232051220314297e-05, "loss": 0.0357, "step": 15474 }, { "epoch": 3.8155818540433923, "grad_norm": 0.267578125, "learning_rate": 3.229517376987407e-05, "loss": 0.0334, "step": 15476 }, { "epoch": 3.8160749506903353, "grad_norm": 0.291015625, "learning_rate": 3.2269843360005094e-05, "loss": 0.0368, "step": 15478 }, { "epoch": 3.8165680473372783, "grad_norm": 0.298828125, "learning_rate": 3.224452097653784e-05, "loss": 0.0338, "step": 15480 }, { "epoch": 3.817061143984221, "grad_norm": 0.271484375, "learning_rate": 3.2219206622473144e-05, "loss": 0.0339, "step": 15482 }, { "epoch": 3.8175542406311638, "grad_norm": 0.3203125, "learning_rate": 3.219390030081091e-05, "loss": 0.035, "step": 15484 }, { "epoch": 3.8180473372781067, "grad_norm": 0.330078125, "learning_rate": 3.2168602014550096e-05, "loss": 0.0356, "step": 15486 }, { "epoch": 3.8185404339250493, "grad_norm": 0.439453125, "learning_rate": 3.214331176668863e-05, "loss": 0.0342, "step": 15488 }, { "epoch": 3.8190335305719922, "grad_norm": 0.32421875, "learning_rate": 3.211802956022359e-05, "loss": 0.0365, "step": 15490 }, { "epoch": 3.8195266272189348, "grad_norm": 0.2890625, "learning_rate": 3.209275539815105e-05, "loss": 0.032, "step": 15492 }, { "epoch": 3.8200197238658777, "grad_norm": 0.3828125, "learning_rate": 3.206748928346614e-05, "loss": 0.0329, "step": 15494 }, { "epoch": 3.8205128205128203, "grad_norm": 0.30078125, "learning_rate": 3.204223121916304e-05, "loss": 0.0325, "step": 15496 }, { "epoch": 3.8210059171597632, "grad_norm": 0.296875, "learning_rate": 3.201698120823496e-05, "loss": 0.0362, "step": 15498 }, { "epoch": 3.821499013806706, "grad_norm": 0.29296875, "learning_rate": 3.199173925367421e-05, "loss": 0.0297, "step": 15500 }, { "epoch": 3.8219921104536487, "grad_norm": 0.291015625, "learning_rate": 3.196650535847201e-05, "loss": 0.036, "step": 15502 }, { "epoch": 3.8224852071005917, "grad_norm": 0.35546875, "learning_rate": 3.1941279525618836e-05, "loss": 0.0358, "step": 15504 }, { "epoch": 3.8229783037475347, "grad_norm": 0.2890625, "learning_rate": 3.1916061758104e-05, "loss": 0.0334, "step": 15506 }, { "epoch": 3.823471400394477, "grad_norm": 0.2890625, "learning_rate": 3.189085205891598e-05, "loss": 0.0349, "step": 15508 }, { "epoch": 3.82396449704142, "grad_norm": 0.306640625, "learning_rate": 3.186565043104226e-05, "loss": 0.0356, "step": 15510 }, { "epoch": 3.824457593688363, "grad_norm": 0.30859375, "learning_rate": 3.184045687746937e-05, "loss": 0.0378, "step": 15512 }, { "epoch": 3.8249506903353057, "grad_norm": 0.32421875, "learning_rate": 3.181527140118293e-05, "loss": 0.037, "step": 15514 }, { "epoch": 3.8254437869822486, "grad_norm": 0.37109375, "learning_rate": 3.179009400516747e-05, "loss": 0.0339, "step": 15516 }, { "epoch": 3.8259368836291916, "grad_norm": 0.298828125, "learning_rate": 3.176492469240676e-05, "loss": 0.0345, "step": 15518 }, { "epoch": 3.826429980276134, "grad_norm": 0.2890625, "learning_rate": 3.173976346588342e-05, "loss": 0.0347, "step": 15520 }, { "epoch": 3.8269230769230766, "grad_norm": 0.314453125, "learning_rate": 3.171461032857923e-05, "loss": 0.0329, "step": 15522 }, { "epoch": 3.8274161735700196, "grad_norm": 0.310546875, "learning_rate": 3.1689465283474964e-05, "loss": 0.0326, "step": 15524 }, { "epoch": 3.8279092702169626, "grad_norm": 0.31640625, "learning_rate": 3.1664328333550466e-05, "loss": 0.032, "step": 15526 }, { "epoch": 3.828402366863905, "grad_norm": 0.2734375, "learning_rate": 3.163919948178462e-05, "loss": 0.0324, "step": 15528 }, { "epoch": 3.828895463510848, "grad_norm": 0.361328125, "learning_rate": 3.161407873115526e-05, "loss": 0.0334, "step": 15530 }, { "epoch": 3.829388560157791, "grad_norm": 0.279296875, "learning_rate": 3.158896608463945e-05, "loss": 0.0332, "step": 15532 }, { "epoch": 3.8298816568047336, "grad_norm": 0.302734375, "learning_rate": 3.1563861545213056e-05, "loss": 0.0323, "step": 15534 }, { "epoch": 3.8303747534516766, "grad_norm": 0.294921875, "learning_rate": 3.153876511585122e-05, "loss": 0.0328, "step": 15536 }, { "epoch": 3.8308678500986195, "grad_norm": 0.32421875, "learning_rate": 3.1513676799527935e-05, "loss": 0.0329, "step": 15538 }, { "epoch": 3.831360946745562, "grad_norm": 0.27734375, "learning_rate": 3.1488596599216326e-05, "loss": 0.0358, "step": 15540 }, { "epoch": 3.831854043392505, "grad_norm": 0.296875, "learning_rate": 3.1463524517888575e-05, "loss": 0.0325, "step": 15542 }, { "epoch": 3.832347140039448, "grad_norm": 0.3515625, "learning_rate": 3.1438460558515756e-05, "loss": 0.0361, "step": 15544 }, { "epoch": 3.8328402366863905, "grad_norm": 0.306640625, "learning_rate": 3.141340472406823e-05, "loss": 0.0309, "step": 15546 }, { "epoch": 3.8333333333333335, "grad_norm": 0.310546875, "learning_rate": 3.138835701751511e-05, "loss": 0.0364, "step": 15548 }, { "epoch": 3.833826429980276, "grad_norm": 0.302734375, "learning_rate": 3.1363317441824825e-05, "loss": 0.037, "step": 15550 }, { "epoch": 3.834319526627219, "grad_norm": 0.318359375, "learning_rate": 3.133828599996462e-05, "loss": 0.0338, "step": 15552 }, { "epoch": 3.8348126232741615, "grad_norm": 0.318359375, "learning_rate": 3.131326269490088e-05, "loss": 0.0346, "step": 15554 }, { "epoch": 3.8353057199211045, "grad_norm": 0.2890625, "learning_rate": 3.1288247529599034e-05, "loss": 0.0365, "step": 15556 }, { "epoch": 3.8357988165680474, "grad_norm": 0.328125, "learning_rate": 3.126324050702343e-05, "loss": 0.0334, "step": 15558 }, { "epoch": 3.83629191321499, "grad_norm": 0.3203125, "learning_rate": 3.1238241630137666e-05, "loss": 0.0368, "step": 15560 }, { "epoch": 3.836785009861933, "grad_norm": 0.3359375, "learning_rate": 3.1213250901904126e-05, "loss": 0.0359, "step": 15562 }, { "epoch": 3.837278106508876, "grad_norm": 0.271484375, "learning_rate": 3.118826832528448e-05, "loss": 0.0352, "step": 15564 }, { "epoch": 3.8377712031558184, "grad_norm": 0.361328125, "learning_rate": 3.116329390323919e-05, "loss": 0.0343, "step": 15566 }, { "epoch": 3.8382642998027614, "grad_norm": 0.328125, "learning_rate": 3.11383276387279e-05, "loss": 0.039, "step": 15568 }, { "epoch": 3.8387573964497044, "grad_norm": 0.330078125, "learning_rate": 3.11133695347093e-05, "loss": 0.0349, "step": 15570 }, { "epoch": 3.839250493096647, "grad_norm": 0.31640625, "learning_rate": 3.108841959414097e-05, "loss": 0.0347, "step": 15572 }, { "epoch": 3.83974358974359, "grad_norm": 0.361328125, "learning_rate": 3.106347781997973e-05, "loss": 0.0343, "step": 15574 }, { "epoch": 3.8402366863905324, "grad_norm": 0.3359375, "learning_rate": 3.103854421518119e-05, "loss": 0.0338, "step": 15576 }, { "epoch": 3.8407297830374754, "grad_norm": 0.31640625, "learning_rate": 3.101361878270026e-05, "loss": 0.0333, "step": 15578 }, { "epoch": 3.841222879684418, "grad_norm": 0.32421875, "learning_rate": 3.0988701525490634e-05, "loss": 0.0383, "step": 15580 }, { "epoch": 3.841715976331361, "grad_norm": 0.2890625, "learning_rate": 3.09637924465052e-05, "loss": 0.0335, "step": 15582 }, { "epoch": 3.842209072978304, "grad_norm": 0.328125, "learning_rate": 3.0938891548695805e-05, "loss": 0.0351, "step": 15584 }, { "epoch": 3.8427021696252464, "grad_norm": 0.33203125, "learning_rate": 3.091399883501335e-05, "loss": 0.0372, "step": 15586 }, { "epoch": 3.8431952662721893, "grad_norm": 0.369140625, "learning_rate": 3.0889114308407786e-05, "loss": 0.0351, "step": 15588 }, { "epoch": 3.8436883629191323, "grad_norm": 0.298828125, "learning_rate": 3.0864237971827995e-05, "loss": 0.0353, "step": 15590 }, { "epoch": 3.844181459566075, "grad_norm": 0.298828125, "learning_rate": 3.0839369828222057e-05, "loss": 0.0318, "step": 15592 }, { "epoch": 3.844674556213018, "grad_norm": 0.27734375, "learning_rate": 3.08145098805369e-05, "loss": 0.0348, "step": 15594 }, { "epoch": 3.8451676528599608, "grad_norm": 0.275390625, "learning_rate": 3.0789658131718624e-05, "loss": 0.0337, "step": 15596 }, { "epoch": 3.8456607495069033, "grad_norm": 0.30078125, "learning_rate": 3.0764814584712263e-05, "loss": 0.0344, "step": 15598 }, { "epoch": 3.8461538461538463, "grad_norm": 0.28125, "learning_rate": 3.0739979242461944e-05, "loss": 0.0311, "step": 15600 }, { "epoch": 3.8466469428007892, "grad_norm": 0.34375, "learning_rate": 3.071515210791078e-05, "loss": 0.0364, "step": 15602 }, { "epoch": 3.8471400394477318, "grad_norm": 0.291015625, "learning_rate": 3.069033318400093e-05, "loss": 0.0333, "step": 15604 }, { "epoch": 3.8476331360946747, "grad_norm": 0.302734375, "learning_rate": 3.0665522473673604e-05, "loss": 0.0323, "step": 15606 }, { "epoch": 3.8481262327416172, "grad_norm": 0.34765625, "learning_rate": 3.064071997986895e-05, "loss": 0.0347, "step": 15608 }, { "epoch": 3.84861932938856, "grad_norm": 0.353515625, "learning_rate": 3.061592570552623e-05, "loss": 0.0309, "step": 15610 }, { "epoch": 3.8491124260355027, "grad_norm": 0.3203125, "learning_rate": 3.05911396535837e-05, "loss": 0.0336, "step": 15612 }, { "epoch": 3.8496055226824457, "grad_norm": 0.2890625, "learning_rate": 3.056636182697865e-05, "loss": 0.0318, "step": 15614 }, { "epoch": 3.8500986193293887, "grad_norm": 0.291015625, "learning_rate": 3.05415922286474e-05, "loss": 0.0316, "step": 15616 }, { "epoch": 3.850591715976331, "grad_norm": 0.25390625, "learning_rate": 3.051683086152527e-05, "loss": 0.0313, "step": 15618 }, { "epoch": 3.851084812623274, "grad_norm": 0.322265625, "learning_rate": 3.0492077728546664e-05, "loss": 0.034, "step": 15620 }, { "epoch": 3.851577909270217, "grad_norm": 0.326171875, "learning_rate": 3.046733283264489e-05, "loss": 0.0356, "step": 15622 }, { "epoch": 3.8520710059171597, "grad_norm": 0.31640625, "learning_rate": 3.0442596176752403e-05, "loss": 0.0344, "step": 15624 }, { "epoch": 3.8525641025641026, "grad_norm": 0.32421875, "learning_rate": 3.0417867763800632e-05, "loss": 0.0355, "step": 15626 }, { "epoch": 3.8530571992110456, "grad_norm": 0.265625, "learning_rate": 3.0393147596720018e-05, "loss": 0.0305, "step": 15628 }, { "epoch": 3.853550295857988, "grad_norm": 0.275390625, "learning_rate": 3.0368435678440056e-05, "loss": 0.0337, "step": 15630 }, { "epoch": 3.854043392504931, "grad_norm": 0.34375, "learning_rate": 3.0343732011889235e-05, "loss": 0.0357, "step": 15632 }, { "epoch": 3.8545364891518736, "grad_norm": 0.380859375, "learning_rate": 3.0319036599995077e-05, "loss": 0.0369, "step": 15634 }, { "epoch": 3.8550295857988166, "grad_norm": 0.306640625, "learning_rate": 3.0294349445684166e-05, "loss": 0.0355, "step": 15636 }, { "epoch": 3.855522682445759, "grad_norm": 0.314453125, "learning_rate": 3.0269670551882013e-05, "loss": 0.0371, "step": 15638 }, { "epoch": 3.856015779092702, "grad_norm": 0.271484375, "learning_rate": 3.0244999921513205e-05, "loss": 0.0383, "step": 15640 }, { "epoch": 3.856508875739645, "grad_norm": 0.361328125, "learning_rate": 3.0220337557501376e-05, "loss": 0.0342, "step": 15642 }, { "epoch": 3.8570019723865876, "grad_norm": 0.287109375, "learning_rate": 3.0195683462769152e-05, "loss": 0.0342, "step": 15644 }, { "epoch": 3.8574950690335306, "grad_norm": 0.35546875, "learning_rate": 3.0171037640238165e-05, "loss": 0.0368, "step": 15646 }, { "epoch": 3.8579881656804735, "grad_norm": 0.27734375, "learning_rate": 3.0146400092829095e-05, "loss": 0.0355, "step": 15648 }, { "epoch": 3.858481262327416, "grad_norm": 0.384765625, "learning_rate": 3.0121770823461658e-05, "loss": 0.0314, "step": 15650 }, { "epoch": 3.858974358974359, "grad_norm": 0.294921875, "learning_rate": 3.0097149835054506e-05, "loss": 0.0313, "step": 15652 }, { "epoch": 3.859467455621302, "grad_norm": 0.2578125, "learning_rate": 3.007253713052538e-05, "loss": 0.0314, "step": 15654 }, { "epoch": 3.8599605522682445, "grad_norm": 0.296875, "learning_rate": 3.0047932712791027e-05, "loss": 0.0326, "step": 15656 }, { "epoch": 3.8604536489151875, "grad_norm": 0.341796875, "learning_rate": 3.0023336584767203e-05, "loss": 0.0344, "step": 15658 }, { "epoch": 3.86094674556213, "grad_norm": 0.279296875, "learning_rate": 2.999874874936871e-05, "loss": 0.0336, "step": 15660 }, { "epoch": 3.861439842209073, "grad_norm": 0.3671875, "learning_rate": 2.9974169209509317e-05, "loss": 0.0319, "step": 15662 }, { "epoch": 3.8619329388560155, "grad_norm": 0.326171875, "learning_rate": 2.9949597968101884e-05, "loss": 0.0331, "step": 15664 }, { "epoch": 3.8624260355029585, "grad_norm": 0.33203125, "learning_rate": 2.9925035028058134e-05, "loss": 0.0317, "step": 15666 }, { "epoch": 3.8629191321499015, "grad_norm": 0.330078125, "learning_rate": 2.9900480392289044e-05, "loss": 0.0327, "step": 15668 }, { "epoch": 3.863412228796844, "grad_norm": 0.30078125, "learning_rate": 2.9875934063704382e-05, "loss": 0.0308, "step": 15670 }, { "epoch": 3.863905325443787, "grad_norm": 0.2578125, "learning_rate": 2.9851396045213066e-05, "loss": 0.0322, "step": 15672 }, { "epoch": 3.86439842209073, "grad_norm": 0.41796875, "learning_rate": 2.9826866339722958e-05, "loss": 0.0372, "step": 15674 }, { "epoch": 3.8648915187376724, "grad_norm": 0.302734375, "learning_rate": 2.9802344950141002e-05, "loss": 0.0325, "step": 15676 }, { "epoch": 3.8653846153846154, "grad_norm": 0.267578125, "learning_rate": 2.977783187937313e-05, "loss": 0.0341, "step": 15678 }, { "epoch": 3.8658777120315584, "grad_norm": 0.296875, "learning_rate": 2.9753327130324194e-05, "loss": 0.0353, "step": 15680 }, { "epoch": 3.866370808678501, "grad_norm": 0.3203125, "learning_rate": 2.972883070589826e-05, "loss": 0.0318, "step": 15682 }, { "epoch": 3.866863905325444, "grad_norm": 0.33984375, "learning_rate": 2.970434260899816e-05, "loss": 0.0346, "step": 15684 }, { "epoch": 3.867357001972387, "grad_norm": 0.30078125, "learning_rate": 2.9679862842526007e-05, "loss": 0.0319, "step": 15686 }, { "epoch": 3.8678500986193294, "grad_norm": 0.361328125, "learning_rate": 2.9655391409382705e-05, "loss": 0.0331, "step": 15688 }, { "epoch": 3.8683431952662723, "grad_norm": 0.306640625, "learning_rate": 2.9630928312468263e-05, "loss": 0.0348, "step": 15690 }, { "epoch": 3.868836291913215, "grad_norm": 0.283203125, "learning_rate": 2.9606473554681712e-05, "loss": 0.0274, "step": 15692 }, { "epoch": 3.869329388560158, "grad_norm": 0.345703125, "learning_rate": 2.9582027138921063e-05, "loss": 0.0379, "step": 15694 }, { "epoch": 3.8698224852071004, "grad_norm": 0.32421875, "learning_rate": 2.9557589068083403e-05, "loss": 0.0348, "step": 15696 }, { "epoch": 3.8703155818540433, "grad_norm": 0.265625, "learning_rate": 2.9533159345064677e-05, "loss": 0.029, "step": 15698 }, { "epoch": 3.8708086785009863, "grad_norm": 0.291015625, "learning_rate": 2.950873797276006e-05, "loss": 0.0338, "step": 15700 }, { "epoch": 3.871301775147929, "grad_norm": 0.39453125, "learning_rate": 2.9484324954063536e-05, "loss": 0.0347, "step": 15702 }, { "epoch": 3.871794871794872, "grad_norm": 0.306640625, "learning_rate": 2.9459920291868215e-05, "loss": 0.0297, "step": 15704 }, { "epoch": 3.8722879684418148, "grad_norm": 0.333984375, "learning_rate": 2.9435523989066182e-05, "loss": 0.0305, "step": 15706 }, { "epoch": 3.8727810650887573, "grad_norm": 0.271484375, "learning_rate": 2.941113604854855e-05, "loss": 0.0316, "step": 15708 }, { "epoch": 3.8732741617357003, "grad_norm": 0.298828125, "learning_rate": 2.9386756473205433e-05, "loss": 0.0335, "step": 15710 }, { "epoch": 3.8737672583826432, "grad_norm": 0.33984375, "learning_rate": 2.9362385265925873e-05, "loss": 0.0349, "step": 15712 }, { "epoch": 3.8742603550295858, "grad_norm": 0.337890625, "learning_rate": 2.93380224295981e-05, "loss": 0.0338, "step": 15714 }, { "epoch": 3.8747534516765287, "grad_norm": 0.267578125, "learning_rate": 2.931366796710917e-05, "loss": 0.0322, "step": 15716 }, { "epoch": 3.8752465483234713, "grad_norm": 0.30859375, "learning_rate": 2.9289321881345254e-05, "loss": 0.0355, "step": 15718 }, { "epoch": 3.8757396449704142, "grad_norm": 0.33203125, "learning_rate": 2.926498417519149e-05, "loss": 0.0369, "step": 15720 }, { "epoch": 3.8762327416173568, "grad_norm": 0.341796875, "learning_rate": 2.9240654851532034e-05, "loss": 0.0323, "step": 15722 }, { "epoch": 3.8767258382642997, "grad_norm": 0.34375, "learning_rate": 2.9216333913250084e-05, "loss": 0.0339, "step": 15724 }, { "epoch": 3.8772189349112427, "grad_norm": 0.32421875, "learning_rate": 2.9192021363227696e-05, "loss": 0.0349, "step": 15726 }, { "epoch": 3.8777120315581852, "grad_norm": 0.3125, "learning_rate": 2.916771720434619e-05, "loss": 0.0326, "step": 15728 }, { "epoch": 3.878205128205128, "grad_norm": 0.29296875, "learning_rate": 2.914342143948563e-05, "loss": 0.0361, "step": 15730 }, { "epoch": 3.878698224852071, "grad_norm": 0.400390625, "learning_rate": 2.9119134071525254e-05, "loss": 0.0361, "step": 15732 }, { "epoch": 3.8791913214990137, "grad_norm": 0.30859375, "learning_rate": 2.9094855103343232e-05, "loss": 0.0324, "step": 15734 }, { "epoch": 3.8796844181459567, "grad_norm": 0.3125, "learning_rate": 2.907058453781677e-05, "loss": 0.0341, "step": 15736 }, { "epoch": 3.8801775147928996, "grad_norm": 0.318359375, "learning_rate": 2.904632237782209e-05, "loss": 0.0335, "step": 15738 }, { "epoch": 3.880670611439842, "grad_norm": 0.388671875, "learning_rate": 2.9022068626234287e-05, "loss": 0.0325, "step": 15740 }, { "epoch": 3.881163708086785, "grad_norm": 0.296875, "learning_rate": 2.8997823285927716e-05, "loss": 0.0327, "step": 15742 }, { "epoch": 3.8816568047337277, "grad_norm": 0.31640625, "learning_rate": 2.8973586359775474e-05, "loss": 0.03, "step": 15744 }, { "epoch": 3.8821499013806706, "grad_norm": 0.283203125, "learning_rate": 2.8949357850649815e-05, "loss": 0.0322, "step": 15746 }, { "epoch": 3.882642998027613, "grad_norm": 0.36328125, "learning_rate": 2.892513776142195e-05, "loss": 0.0313, "step": 15748 }, { "epoch": 3.883136094674556, "grad_norm": 0.296875, "learning_rate": 2.8900926094962088e-05, "loss": 0.0369, "step": 15750 }, { "epoch": 3.883629191321499, "grad_norm": 0.306640625, "learning_rate": 2.887672285413947e-05, "loss": 0.0331, "step": 15752 }, { "epoch": 3.8841222879684416, "grad_norm": 0.3125, "learning_rate": 2.8852528041822247e-05, "loss": 0.0298, "step": 15754 }, { "epoch": 3.8846153846153846, "grad_norm": 0.36328125, "learning_rate": 2.8828341660877744e-05, "loss": 0.0342, "step": 15756 }, { "epoch": 3.8851084812623276, "grad_norm": 0.31640625, "learning_rate": 2.8804163714172062e-05, "loss": 0.0378, "step": 15758 }, { "epoch": 3.88560157790927, "grad_norm": 0.3046875, "learning_rate": 2.877999420457056e-05, "loss": 0.0331, "step": 15760 }, { "epoch": 3.886094674556213, "grad_norm": 0.291015625, "learning_rate": 2.8755833134937336e-05, "loss": 0.0325, "step": 15762 }, { "epoch": 3.886587771203156, "grad_norm": 0.30859375, "learning_rate": 2.8731680508135682e-05, "loss": 0.0301, "step": 15764 }, { "epoch": 3.8870808678500985, "grad_norm": 0.3359375, "learning_rate": 2.8707536327027784e-05, "loss": 0.0384, "step": 15766 }, { "epoch": 3.8875739644970415, "grad_norm": 0.3515625, "learning_rate": 2.8683400594474886e-05, "loss": 0.0357, "step": 15768 }, { "epoch": 3.8880670611439845, "grad_norm": 0.390625, "learning_rate": 2.865927331333723e-05, "loss": 0.0312, "step": 15770 }, { "epoch": 3.888560157790927, "grad_norm": 0.330078125, "learning_rate": 2.8635154486473936e-05, "loss": 0.0351, "step": 15772 }, { "epoch": 3.88905325443787, "grad_norm": 0.35546875, "learning_rate": 2.8611044116743347e-05, "loss": 0.0313, "step": 15774 }, { "epoch": 3.8895463510848125, "grad_norm": 0.296875, "learning_rate": 2.8586942207002588e-05, "loss": 0.0339, "step": 15776 }, { "epoch": 3.8900394477317555, "grad_norm": 0.3671875, "learning_rate": 2.8562848760107897e-05, "loss": 0.0353, "step": 15778 }, { "epoch": 3.890532544378698, "grad_norm": 0.291015625, "learning_rate": 2.853876377891448e-05, "loss": 0.0355, "step": 15780 }, { "epoch": 3.891025641025641, "grad_norm": 0.296875, "learning_rate": 2.851468726627654e-05, "loss": 0.0328, "step": 15782 }, { "epoch": 3.891518737672584, "grad_norm": 0.3046875, "learning_rate": 2.849061922504729e-05, "loss": 0.0344, "step": 15784 }, { "epoch": 3.8920118343195265, "grad_norm": 0.40625, "learning_rate": 2.8466559658078905e-05, "loss": 0.0371, "step": 15786 }, { "epoch": 3.8925049309664694, "grad_norm": 0.30078125, "learning_rate": 2.8442508568222636e-05, "loss": 0.0342, "step": 15788 }, { "epoch": 3.8929980276134124, "grad_norm": 0.302734375, "learning_rate": 2.8418465958328587e-05, "loss": 0.0346, "step": 15790 }, { "epoch": 3.893491124260355, "grad_norm": 0.31640625, "learning_rate": 2.8394431831245993e-05, "loss": 0.0318, "step": 15792 }, { "epoch": 3.893984220907298, "grad_norm": 0.267578125, "learning_rate": 2.837040618982302e-05, "loss": 0.0338, "step": 15794 }, { "epoch": 3.894477317554241, "grad_norm": 0.37890625, "learning_rate": 2.8346389036906828e-05, "loss": 0.0372, "step": 15796 }, { "epoch": 3.8949704142011834, "grad_norm": 0.306640625, "learning_rate": 2.8322380375343616e-05, "loss": 0.0315, "step": 15798 }, { "epoch": 3.8954635108481264, "grad_norm": 0.314453125, "learning_rate": 2.8298380207978513e-05, "loss": 0.0347, "step": 15800 }, { "epoch": 3.895956607495069, "grad_norm": 0.2734375, "learning_rate": 2.8274388537655727e-05, "loss": 0.0332, "step": 15802 }, { "epoch": 3.896449704142012, "grad_norm": 0.330078125, "learning_rate": 2.8250405367218324e-05, "loss": 0.0347, "step": 15804 }, { "epoch": 3.8969428007889544, "grad_norm": 0.2890625, "learning_rate": 2.822643069950849e-05, "loss": 0.0315, "step": 15806 }, { "epoch": 3.8974358974358974, "grad_norm": 0.3046875, "learning_rate": 2.8202464537367346e-05, "loss": 0.031, "step": 15808 }, { "epoch": 3.8979289940828403, "grad_norm": 0.333984375, "learning_rate": 2.8178506883635024e-05, "loss": 0.0336, "step": 15810 }, { "epoch": 3.898422090729783, "grad_norm": 0.27734375, "learning_rate": 2.8154557741150633e-05, "loss": 0.0335, "step": 15812 }, { "epoch": 3.898915187376726, "grad_norm": 0.32421875, "learning_rate": 2.8130617112752277e-05, "loss": 0.0313, "step": 15814 }, { "epoch": 3.899408284023669, "grad_norm": 0.330078125, "learning_rate": 2.8106685001277077e-05, "loss": 0.0353, "step": 15816 }, { "epoch": 3.8999013806706113, "grad_norm": 0.322265625, "learning_rate": 2.808276140956112e-05, "loss": 0.0339, "step": 15818 }, { "epoch": 3.9003944773175543, "grad_norm": 0.35546875, "learning_rate": 2.8058846340439447e-05, "loss": 0.037, "step": 15820 }, { "epoch": 3.9008875739644973, "grad_norm": 0.283203125, "learning_rate": 2.803493979674615e-05, "loss": 0.0291, "step": 15822 }, { "epoch": 3.90138067061144, "grad_norm": 0.265625, "learning_rate": 2.8011041781314285e-05, "loss": 0.0316, "step": 15824 }, { "epoch": 3.9018737672583828, "grad_norm": 0.376953125, "learning_rate": 2.7987152296975915e-05, "loss": 0.0325, "step": 15826 }, { "epoch": 3.9023668639053253, "grad_norm": 0.32421875, "learning_rate": 2.7963271346562058e-05, "loss": 0.0276, "step": 15828 }, { "epoch": 3.9028599605522682, "grad_norm": 0.30078125, "learning_rate": 2.793939893290275e-05, "loss": 0.0337, "step": 15830 }, { "epoch": 3.9033530571992108, "grad_norm": 0.3125, "learning_rate": 2.7915535058827024e-05, "loss": 0.0341, "step": 15832 }, { "epoch": 3.9038461538461537, "grad_norm": 0.359375, "learning_rate": 2.7891679727162845e-05, "loss": 0.0337, "step": 15834 }, { "epoch": 3.9043392504930967, "grad_norm": 0.361328125, "learning_rate": 2.7867832940737216e-05, "loss": 0.0403, "step": 15836 }, { "epoch": 3.9048323471400392, "grad_norm": 0.279296875, "learning_rate": 2.7843994702376108e-05, "loss": 0.0322, "step": 15838 }, { "epoch": 3.905325443786982, "grad_norm": 0.306640625, "learning_rate": 2.782016501490451e-05, "loss": 0.035, "step": 15840 }, { "epoch": 3.905818540433925, "grad_norm": 0.2890625, "learning_rate": 2.7796343881146347e-05, "loss": 0.0339, "step": 15842 }, { "epoch": 3.9063116370808677, "grad_norm": 0.30078125, "learning_rate": 2.7772531303924576e-05, "loss": 0.0339, "step": 15844 }, { "epoch": 3.9068047337278107, "grad_norm": 0.333984375, "learning_rate": 2.7748727286061137e-05, "loss": 0.0347, "step": 15846 }, { "epoch": 3.9072978303747536, "grad_norm": 0.30078125, "learning_rate": 2.7724931830376855e-05, "loss": 0.0353, "step": 15848 }, { "epoch": 3.907790927021696, "grad_norm": 0.314453125, "learning_rate": 2.7701144939691747e-05, "loss": 0.0344, "step": 15850 }, { "epoch": 3.908284023668639, "grad_norm": 0.306640625, "learning_rate": 2.7677366616824595e-05, "loss": 0.0306, "step": 15852 }, { "epoch": 3.908777120315582, "grad_norm": 0.287109375, "learning_rate": 2.765359686459329e-05, "loss": 0.0343, "step": 15854 }, { "epoch": 3.9092702169625246, "grad_norm": 0.296875, "learning_rate": 2.7629835685814697e-05, "loss": 0.0334, "step": 15856 }, { "epoch": 3.9097633136094676, "grad_norm": 0.357421875, "learning_rate": 2.7606083083304633e-05, "loss": 0.0367, "step": 15858 }, { "epoch": 3.91025641025641, "grad_norm": 0.314453125, "learning_rate": 2.758233905987796e-05, "loss": 0.0339, "step": 15860 }, { "epoch": 3.910749506903353, "grad_norm": 0.279296875, "learning_rate": 2.7558603618348365e-05, "loss": 0.0353, "step": 15862 }, { "epoch": 3.9112426035502956, "grad_norm": 0.263671875, "learning_rate": 2.7534876761528773e-05, "loss": 0.0329, "step": 15864 }, { "epoch": 3.9117357001972386, "grad_norm": 0.255859375, "learning_rate": 2.7511158492230805e-05, "loss": 0.0289, "step": 15866 }, { "epoch": 3.9122287968441816, "grad_norm": 0.2890625, "learning_rate": 2.7487448813265347e-05, "loss": 0.0345, "step": 15868 }, { "epoch": 3.912721893491124, "grad_norm": 0.30859375, "learning_rate": 2.7463747727442034e-05, "loss": 0.0313, "step": 15870 }, { "epoch": 3.913214990138067, "grad_norm": 0.283203125, "learning_rate": 2.7440055237569606e-05, "loss": 0.0329, "step": 15872 }, { "epoch": 3.91370808678501, "grad_norm": 0.275390625, "learning_rate": 2.7416371346455792e-05, "loss": 0.0321, "step": 15874 }, { "epoch": 3.9142011834319526, "grad_norm": 0.302734375, "learning_rate": 2.7392696056907162e-05, "loss": 0.0331, "step": 15876 }, { "epoch": 3.9146942800788955, "grad_norm": 0.2890625, "learning_rate": 2.7369029371729494e-05, "loss": 0.0338, "step": 15878 }, { "epoch": 3.9151873767258385, "grad_norm": 0.259765625, "learning_rate": 2.7345371293727307e-05, "loss": 0.0302, "step": 15880 }, { "epoch": 3.915680473372781, "grad_norm": 0.287109375, "learning_rate": 2.732172182570434e-05, "loss": 0.0323, "step": 15882 }, { "epoch": 3.916173570019724, "grad_norm": 0.310546875, "learning_rate": 2.729808097046308e-05, "loss": 0.0343, "step": 15884 }, { "epoch": 3.9166666666666665, "grad_norm": 0.306640625, "learning_rate": 2.7274448730805135e-05, "loss": 0.0307, "step": 15886 }, { "epoch": 3.9171597633136095, "grad_norm": 0.2734375, "learning_rate": 2.725082510953111e-05, "loss": 0.0328, "step": 15888 }, { "epoch": 3.917652859960552, "grad_norm": 0.28515625, "learning_rate": 2.7227210109440414e-05, "loss": 0.0348, "step": 15890 }, { "epoch": 3.918145956607495, "grad_norm": 0.26953125, "learning_rate": 2.7203603733331706e-05, "loss": 0.0316, "step": 15892 }, { "epoch": 3.918639053254438, "grad_norm": 0.26171875, "learning_rate": 2.7180005984002333e-05, "loss": 0.031, "step": 15894 }, { "epoch": 3.9191321499013805, "grad_norm": 0.3359375, "learning_rate": 2.715641686424888e-05, "loss": 0.0352, "step": 15896 }, { "epoch": 3.9196252465483234, "grad_norm": 0.31640625, "learning_rate": 2.713283637686671e-05, "loss": 0.0356, "step": 15898 }, { "epoch": 3.9201183431952664, "grad_norm": 0.28125, "learning_rate": 2.710926452465026e-05, "loss": 0.0327, "step": 15900 }, { "epoch": 3.920611439842209, "grad_norm": 0.279296875, "learning_rate": 2.7085701310392974e-05, "loss": 0.0313, "step": 15902 }, { "epoch": 3.921104536489152, "grad_norm": 0.369140625, "learning_rate": 2.7062146736887116e-05, "loss": 0.0322, "step": 15904 }, { "epoch": 3.921597633136095, "grad_norm": 0.287109375, "learning_rate": 2.7038600806924164e-05, "loss": 0.0345, "step": 15906 }, { "epoch": 3.9220907297830374, "grad_norm": 0.2890625, "learning_rate": 2.7015063523294326e-05, "loss": 0.0322, "step": 15908 }, { "epoch": 3.9225838264299804, "grad_norm": 0.330078125, "learning_rate": 2.6991534888787017e-05, "loss": 0.034, "step": 15910 }, { "epoch": 3.9230769230769234, "grad_norm": 0.322265625, "learning_rate": 2.6968014906190408e-05, "loss": 0.0302, "step": 15912 }, { "epoch": 3.923570019723866, "grad_norm": 0.30859375, "learning_rate": 2.6944503578291803e-05, "loss": 0.0334, "step": 15914 }, { "epoch": 3.9240631163708084, "grad_norm": 0.322265625, "learning_rate": 2.6921000907877413e-05, "loss": 0.0352, "step": 15916 }, { "epoch": 3.9245562130177514, "grad_norm": 0.33203125, "learning_rate": 2.6897506897732437e-05, "loss": 0.0381, "step": 15918 }, { "epoch": 3.9250493096646943, "grad_norm": 0.279296875, "learning_rate": 2.687402155064107e-05, "loss": 0.031, "step": 15920 }, { "epoch": 3.925542406311637, "grad_norm": 0.408203125, "learning_rate": 2.6850544869386374e-05, "loss": 0.0329, "step": 15922 }, { "epoch": 3.92603550295858, "grad_norm": 0.28515625, "learning_rate": 2.6827076856750587e-05, "loss": 0.0325, "step": 15924 }, { "epoch": 3.926528599605523, "grad_norm": 0.318359375, "learning_rate": 2.68036175155147e-05, "loss": 0.0354, "step": 15926 }, { "epoch": 3.9270216962524653, "grad_norm": 0.29296875, "learning_rate": 2.678016684845883e-05, "loss": 0.0343, "step": 15928 }, { "epoch": 3.9275147928994083, "grad_norm": 0.29296875, "learning_rate": 2.6756724858361982e-05, "loss": 0.031, "step": 15930 }, { "epoch": 3.9280078895463513, "grad_norm": 0.357421875, "learning_rate": 2.673329154800218e-05, "loss": 0.0348, "step": 15932 }, { "epoch": 3.928500986193294, "grad_norm": 0.314453125, "learning_rate": 2.6709866920156434e-05, "loss": 0.0337, "step": 15934 }, { "epoch": 3.9289940828402368, "grad_norm": 0.296875, "learning_rate": 2.6686450977600585e-05, "loss": 0.0354, "step": 15936 }, { "epoch": 3.9294871794871797, "grad_norm": 0.296875, "learning_rate": 2.6663043723109692e-05, "loss": 0.0305, "step": 15938 }, { "epoch": 3.9299802761341223, "grad_norm": 0.341796875, "learning_rate": 2.663964515945756e-05, "loss": 0.032, "step": 15940 }, { "epoch": 3.9304733727810652, "grad_norm": 0.271484375, "learning_rate": 2.6616255289417058e-05, "loss": 0.0334, "step": 15942 }, { "epoch": 3.9309664694280078, "grad_norm": 0.291015625, "learning_rate": 2.6592874115760025e-05, "loss": 0.0322, "step": 15944 }, { "epoch": 3.9314595660749507, "grad_norm": 0.353515625, "learning_rate": 2.6569501641257267e-05, "loss": 0.0382, "step": 15946 }, { "epoch": 3.9319526627218933, "grad_norm": 0.302734375, "learning_rate": 2.6546137868678543e-05, "loss": 0.0345, "step": 15948 }, { "epoch": 3.9324457593688362, "grad_norm": 0.294921875, "learning_rate": 2.65227828007926e-05, "loss": 0.033, "step": 15950 }, { "epoch": 3.932938856015779, "grad_norm": 0.2890625, "learning_rate": 2.6499436440367165e-05, "loss": 0.0341, "step": 15952 }, { "epoch": 3.9334319526627217, "grad_norm": 0.296875, "learning_rate": 2.6476098790168857e-05, "loss": 0.0328, "step": 15954 }, { "epoch": 3.9339250493096647, "grad_norm": 0.287109375, "learning_rate": 2.6452769852963344e-05, "loss": 0.034, "step": 15956 }, { "epoch": 3.9344181459566077, "grad_norm": 0.28125, "learning_rate": 2.6429449631515245e-05, "loss": 0.0342, "step": 15958 }, { "epoch": 3.93491124260355, "grad_norm": 0.298828125, "learning_rate": 2.640613812858813e-05, "loss": 0.0375, "step": 15960 }, { "epoch": 3.935404339250493, "grad_norm": 0.376953125, "learning_rate": 2.6382835346944546e-05, "loss": 0.0331, "step": 15962 }, { "epoch": 3.935897435897436, "grad_norm": 0.28515625, "learning_rate": 2.635954128934599e-05, "loss": 0.032, "step": 15964 }, { "epoch": 3.9363905325443787, "grad_norm": 0.37109375, "learning_rate": 2.633625595855298e-05, "loss": 0.0357, "step": 15966 }, { "epoch": 3.9368836291913216, "grad_norm": 0.2734375, "learning_rate": 2.6312979357324897e-05, "loss": 0.033, "step": 15968 }, { "epoch": 3.937376725838264, "grad_norm": 0.314453125, "learning_rate": 2.6289711488420177e-05, "loss": 0.0342, "step": 15970 }, { "epoch": 3.937869822485207, "grad_norm": 0.28125, "learning_rate": 2.6266452354596183e-05, "loss": 0.0297, "step": 15972 }, { "epoch": 3.9383629191321496, "grad_norm": 0.283203125, "learning_rate": 2.6243201958609266e-05, "loss": 0.0317, "step": 15974 }, { "epoch": 3.9388560157790926, "grad_norm": 0.3125, "learning_rate": 2.621996030321472e-05, "loss": 0.0374, "step": 15976 }, { "epoch": 3.9393491124260356, "grad_norm": 0.283203125, "learning_rate": 2.6196727391166818e-05, "loss": 0.0318, "step": 15978 }, { "epoch": 3.939842209072978, "grad_norm": 0.263671875, "learning_rate": 2.6173503225218777e-05, "loss": 0.033, "step": 15980 }, { "epoch": 3.940335305719921, "grad_norm": 0.302734375, "learning_rate": 2.6150287808122832e-05, "loss": 0.0342, "step": 15982 }, { "epoch": 3.940828402366864, "grad_norm": 0.3046875, "learning_rate": 2.6127081142630083e-05, "loss": 0.0333, "step": 15984 }, { "epoch": 3.9413214990138066, "grad_norm": 0.34765625, "learning_rate": 2.6103883231490668e-05, "loss": 0.0332, "step": 15986 }, { "epoch": 3.9418145956607495, "grad_norm": 0.2734375, "learning_rate": 2.608069407745366e-05, "loss": 0.0321, "step": 15988 }, { "epoch": 3.9423076923076925, "grad_norm": 0.296875, "learning_rate": 2.605751368326713e-05, "loss": 0.0339, "step": 15990 }, { "epoch": 3.942800788954635, "grad_norm": 0.318359375, "learning_rate": 2.6034342051678074e-05, "loss": 0.0356, "step": 15992 }, { "epoch": 3.943293885601578, "grad_norm": 0.33203125, "learning_rate": 2.6011179185432444e-05, "loss": 0.0331, "step": 15994 }, { "epoch": 3.943786982248521, "grad_norm": 0.30078125, "learning_rate": 2.5988025087275213e-05, "loss": 0.0353, "step": 15996 }, { "epoch": 3.9442800788954635, "grad_norm": 0.306640625, "learning_rate": 2.596487975995019e-05, "loss": 0.032, "step": 15998 }, { "epoch": 3.9447731755424065, "grad_norm": 0.3359375, "learning_rate": 2.5941743206200332e-05, "loss": 0.036, "step": 16000 }, { "epoch": 3.945266272189349, "grad_norm": 0.28125, "learning_rate": 2.5918615428767357e-05, "loss": 0.0323, "step": 16002 }, { "epoch": 3.945759368836292, "grad_norm": 0.298828125, "learning_rate": 2.5895496430392076e-05, "loss": 0.0352, "step": 16004 }, { "epoch": 3.9462524654832345, "grad_norm": 0.31640625, "learning_rate": 2.5872386213814204e-05, "loss": 0.0326, "step": 16006 }, { "epoch": 3.9467455621301775, "grad_norm": 0.283203125, "learning_rate": 2.5849284781772453e-05, "loss": 0.0334, "step": 16008 }, { "epoch": 3.9472386587771204, "grad_norm": 0.3125, "learning_rate": 2.582619213700448e-05, "loss": 0.0343, "step": 16010 }, { "epoch": 3.947731755424063, "grad_norm": 0.275390625, "learning_rate": 2.5803108282246823e-05, "loss": 0.0311, "step": 16012 }, { "epoch": 3.948224852071006, "grad_norm": 0.353515625, "learning_rate": 2.5780033220235143e-05, "loss": 0.0396, "step": 16014 }, { "epoch": 3.948717948717949, "grad_norm": 0.283203125, "learning_rate": 2.5756966953703897e-05, "loss": 0.0325, "step": 16016 }, { "epoch": 3.9492110453648914, "grad_norm": 0.328125, "learning_rate": 2.5733909485386588e-05, "loss": 0.0348, "step": 16018 }, { "epoch": 3.9497041420118344, "grad_norm": 0.28515625, "learning_rate": 2.5710860818015657e-05, "loss": 0.0336, "step": 16020 }, { "epoch": 3.9501972386587774, "grad_norm": 0.326171875, "learning_rate": 2.5687820954322484e-05, "loss": 0.0344, "step": 16022 }, { "epoch": 3.95069033530572, "grad_norm": 0.310546875, "learning_rate": 2.5664789897037478e-05, "loss": 0.0362, "step": 16024 }, { "epoch": 3.951183431952663, "grad_norm": 0.328125, "learning_rate": 2.5641767648889835e-05, "loss": 0.036, "step": 16026 }, { "epoch": 3.9516765285996054, "grad_norm": 0.3515625, "learning_rate": 2.5618754212607953e-05, "loss": 0.0359, "step": 16028 }, { "epoch": 3.9521696252465484, "grad_norm": 0.3203125, "learning_rate": 2.5595749590918926e-05, "loss": 0.0337, "step": 16030 }, { "epoch": 3.952662721893491, "grad_norm": 0.3359375, "learning_rate": 2.5572753786549054e-05, "loss": 0.0374, "step": 16032 }, { "epoch": 3.953155818540434, "grad_norm": 0.30078125, "learning_rate": 2.5549766802223374e-05, "loss": 0.0332, "step": 16034 }, { "epoch": 3.953648915187377, "grad_norm": 0.33203125, "learning_rate": 2.5526788640666012e-05, "loss": 0.0353, "step": 16036 }, { "epoch": 3.9541420118343193, "grad_norm": 0.291015625, "learning_rate": 2.5503819304600017e-05, "loss": 0.0294, "step": 16038 }, { "epoch": 3.9546351084812623, "grad_norm": 0.294921875, "learning_rate": 2.548085879674732e-05, "loss": 0.0323, "step": 16040 }, { "epoch": 3.9551282051282053, "grad_norm": 0.357421875, "learning_rate": 2.5457907119828973e-05, "loss": 0.0347, "step": 16042 }, { "epoch": 3.955621301775148, "grad_norm": 0.294921875, "learning_rate": 2.5434964276564766e-05, "loss": 0.0321, "step": 16044 }, { "epoch": 3.956114398422091, "grad_norm": 0.328125, "learning_rate": 2.541203026967367e-05, "loss": 0.0293, "step": 16046 }, { "epoch": 3.9566074950690338, "grad_norm": 0.384765625, "learning_rate": 2.53891051018734e-05, "loss": 0.0337, "step": 16048 }, { "epoch": 3.9571005917159763, "grad_norm": 0.306640625, "learning_rate": 2.536618877588075e-05, "loss": 0.036, "step": 16050 }, { "epoch": 3.9575936883629192, "grad_norm": 0.3359375, "learning_rate": 2.5343281294411457e-05, "loss": 0.0312, "step": 16052 }, { "epoch": 3.9580867850098618, "grad_norm": 0.29296875, "learning_rate": 2.5320382660180108e-05, "loss": 0.0344, "step": 16054 }, { "epoch": 3.9585798816568047, "grad_norm": 0.392578125, "learning_rate": 2.529749287590042e-05, "loss": 0.0417, "step": 16056 }, { "epoch": 3.9590729783037473, "grad_norm": 0.28515625, "learning_rate": 2.5274611944284866e-05, "loss": 0.0334, "step": 16058 }, { "epoch": 3.9595660749506902, "grad_norm": 0.29296875, "learning_rate": 2.5251739868045064e-05, "loss": 0.0349, "step": 16060 }, { "epoch": 3.960059171597633, "grad_norm": 0.298828125, "learning_rate": 2.5228876649891398e-05, "loss": 0.0376, "step": 16062 }, { "epoch": 3.9605522682445757, "grad_norm": 0.458984375, "learning_rate": 2.5206022292533317e-05, "loss": 0.0368, "step": 16064 }, { "epoch": 3.9610453648915187, "grad_norm": 0.333984375, "learning_rate": 2.518317679867922e-05, "loss": 0.0353, "step": 16066 }, { "epoch": 3.9615384615384617, "grad_norm": 0.333984375, "learning_rate": 2.5160340171036335e-05, "loss": 0.0362, "step": 16068 }, { "epoch": 3.962031558185404, "grad_norm": 0.3046875, "learning_rate": 2.5137512412311048e-05, "loss": 0.0339, "step": 16070 }, { "epoch": 3.962524654832347, "grad_norm": 0.302734375, "learning_rate": 2.5114693525208467e-05, "loss": 0.0374, "step": 16072 }, { "epoch": 3.96301775147929, "grad_norm": 0.3046875, "learning_rate": 2.509188351243287e-05, "loss": 0.0341, "step": 16074 }, { "epoch": 3.9635108481262327, "grad_norm": 0.34765625, "learning_rate": 2.506908237668727e-05, "loss": 0.0347, "step": 16076 }, { "epoch": 3.9640039447731756, "grad_norm": 0.384765625, "learning_rate": 2.5046290120673776e-05, "loss": 0.0356, "step": 16078 }, { "epoch": 3.9644970414201186, "grad_norm": 0.29296875, "learning_rate": 2.5023506747093395e-05, "loss": 0.0324, "step": 16080 }, { "epoch": 3.964990138067061, "grad_norm": 0.28125, "learning_rate": 2.5000732258646074e-05, "loss": 0.0334, "step": 16082 }, { "epoch": 3.965483234714004, "grad_norm": 0.27734375, "learning_rate": 2.497796665803075e-05, "loss": 0.0333, "step": 16084 }, { "epoch": 3.9659763313609466, "grad_norm": 0.318359375, "learning_rate": 2.495520994794519e-05, "loss": 0.0343, "step": 16086 }, { "epoch": 3.9664694280078896, "grad_norm": 0.2734375, "learning_rate": 2.493246213108631e-05, "loss": 0.0334, "step": 16088 }, { "epoch": 3.966962524654832, "grad_norm": 0.279296875, "learning_rate": 2.4909723210149748e-05, "loss": 0.0311, "step": 16090 }, { "epoch": 3.967455621301775, "grad_norm": 0.283203125, "learning_rate": 2.4886993187830256e-05, "loss": 0.0325, "step": 16092 }, { "epoch": 3.967948717948718, "grad_norm": 0.30859375, "learning_rate": 2.486427206682144e-05, "loss": 0.0317, "step": 16094 }, { "epoch": 3.9684418145956606, "grad_norm": 0.28125, "learning_rate": 2.4841559849815878e-05, "loss": 0.0299, "step": 16096 }, { "epoch": 3.9689349112426036, "grad_norm": 0.310546875, "learning_rate": 2.481885653950514e-05, "loss": 0.0347, "step": 16098 }, { "epoch": 3.9694280078895465, "grad_norm": 0.328125, "learning_rate": 2.4796162138579616e-05, "loss": 0.0398, "step": 16100 }, { "epoch": 3.969921104536489, "grad_norm": 0.28125, "learning_rate": 2.47734766497288e-05, "loss": 0.0325, "step": 16102 }, { "epoch": 3.970414201183432, "grad_norm": 0.30078125, "learning_rate": 2.4750800075641e-05, "loss": 0.0312, "step": 16104 }, { "epoch": 3.970907297830375, "grad_norm": 0.28515625, "learning_rate": 2.4728132419003515e-05, "loss": 0.0298, "step": 16106 }, { "epoch": 3.9714003944773175, "grad_norm": 0.333984375, "learning_rate": 2.470547368250261e-05, "loss": 0.0341, "step": 16108 }, { "epoch": 3.9718934911242605, "grad_norm": 0.294921875, "learning_rate": 2.468282386882347e-05, "loss": 0.0306, "step": 16110 }, { "epoch": 3.972386587771203, "grad_norm": 0.326171875, "learning_rate": 2.4660182980650205e-05, "loss": 0.033, "step": 16112 }, { "epoch": 3.972879684418146, "grad_norm": 0.26171875, "learning_rate": 2.4637551020665905e-05, "loss": 0.0295, "step": 16114 }, { "epoch": 3.9733727810650885, "grad_norm": 0.361328125, "learning_rate": 2.4614927991552615e-05, "loss": 0.0345, "step": 16116 }, { "epoch": 3.9738658777120315, "grad_norm": 0.25390625, "learning_rate": 2.4592313895991225e-05, "loss": 0.0335, "step": 16118 }, { "epoch": 3.9743589743589745, "grad_norm": 0.310546875, "learning_rate": 2.4569708736661655e-05, "loss": 0.0349, "step": 16120 }, { "epoch": 3.974852071005917, "grad_norm": 0.296875, "learning_rate": 2.4547112516242753e-05, "loss": 0.0313, "step": 16122 }, { "epoch": 3.97534516765286, "grad_norm": 0.28515625, "learning_rate": 2.4524525237412287e-05, "loss": 0.0312, "step": 16124 }, { "epoch": 3.975838264299803, "grad_norm": 0.275390625, "learning_rate": 2.4501946902846983e-05, "loss": 0.0343, "step": 16126 }, { "epoch": 3.9763313609467454, "grad_norm": 0.27734375, "learning_rate": 2.44793775152225e-05, "loss": 0.0333, "step": 16128 }, { "epoch": 3.9768244575936884, "grad_norm": 0.33203125, "learning_rate": 2.4456817077213436e-05, "loss": 0.0298, "step": 16130 }, { "epoch": 3.9773175542406314, "grad_norm": 0.302734375, "learning_rate": 2.4434265591493355e-05, "loss": 0.0321, "step": 16132 }, { "epoch": 3.977810650887574, "grad_norm": 0.28125, "learning_rate": 2.441172306073468e-05, "loss": 0.0309, "step": 16134 }, { "epoch": 3.978303747534517, "grad_norm": 0.271484375, "learning_rate": 2.4389189487608855e-05, "loss": 0.0297, "step": 16136 }, { "epoch": 3.9787968441814594, "grad_norm": 0.326171875, "learning_rate": 2.4366664874786226e-05, "loss": 0.0347, "step": 16138 }, { "epoch": 3.9792899408284024, "grad_norm": 0.2890625, "learning_rate": 2.4344149224936084e-05, "loss": 0.0325, "step": 16140 }, { "epoch": 3.979783037475345, "grad_norm": 0.294921875, "learning_rate": 2.4321642540726686e-05, "loss": 0.0334, "step": 16142 }, { "epoch": 3.980276134122288, "grad_norm": 0.2734375, "learning_rate": 2.429914482482517e-05, "loss": 0.0329, "step": 16144 }, { "epoch": 3.980769230769231, "grad_norm": 0.291015625, "learning_rate": 2.4276656079897688e-05, "loss": 0.0343, "step": 16146 }, { "epoch": 3.9812623274161734, "grad_norm": 0.26953125, "learning_rate": 2.425417630860921e-05, "loss": 0.0314, "step": 16148 }, { "epoch": 3.9817554240631163, "grad_norm": 0.3359375, "learning_rate": 2.4231705513623748e-05, "loss": 0.032, "step": 16150 }, { "epoch": 3.9822485207100593, "grad_norm": 0.357421875, "learning_rate": 2.4209243697604222e-05, "loss": 0.0363, "step": 16152 }, { "epoch": 3.982741617357002, "grad_norm": 0.296875, "learning_rate": 2.4186790863212472e-05, "loss": 0.032, "step": 16154 }, { "epoch": 3.983234714003945, "grad_norm": 0.310546875, "learning_rate": 2.4164347013109302e-05, "loss": 0.0356, "step": 16156 }, { "epoch": 3.9837278106508878, "grad_norm": 0.283203125, "learning_rate": 2.4141912149954427e-05, "loss": 0.0345, "step": 16158 }, { "epoch": 3.9842209072978303, "grad_norm": 0.34375, "learning_rate": 2.4119486276406512e-05, "loss": 0.0382, "step": 16160 }, { "epoch": 3.9847140039447733, "grad_norm": 0.337890625, "learning_rate": 2.4097069395123094e-05, "loss": 0.0355, "step": 16162 }, { "epoch": 3.9852071005917162, "grad_norm": 0.306640625, "learning_rate": 2.4074661508760787e-05, "loss": 0.0338, "step": 16164 }, { "epoch": 3.9857001972386588, "grad_norm": 0.3046875, "learning_rate": 2.4052262619974974e-05, "loss": 0.0323, "step": 16166 }, { "epoch": 3.9861932938856017, "grad_norm": 0.29296875, "learning_rate": 2.4029872731420087e-05, "loss": 0.0309, "step": 16168 }, { "epoch": 3.9866863905325443, "grad_norm": 0.328125, "learning_rate": 2.400749184574944e-05, "loss": 0.0277, "step": 16170 }, { "epoch": 3.9871794871794872, "grad_norm": 0.361328125, "learning_rate": 2.39851199656153e-05, "loss": 0.0345, "step": 16172 }, { "epoch": 3.9876725838264298, "grad_norm": 0.412109375, "learning_rate": 2.3962757093668887e-05, "loss": 0.0287, "step": 16174 }, { "epoch": 3.9881656804733727, "grad_norm": 0.27734375, "learning_rate": 2.3940403232560228e-05, "loss": 0.0299, "step": 16176 }, { "epoch": 3.9886587771203157, "grad_norm": 0.2890625, "learning_rate": 2.3918058384938513e-05, "loss": 0.0348, "step": 16178 }, { "epoch": 3.989151873767258, "grad_norm": 0.3125, "learning_rate": 2.3895722553451615e-05, "loss": 0.0312, "step": 16180 }, { "epoch": 3.989644970414201, "grad_norm": 0.29296875, "learning_rate": 2.3873395740746552e-05, "loss": 0.0346, "step": 16182 }, { "epoch": 3.990138067061144, "grad_norm": 0.291015625, "learning_rate": 2.3851077949469114e-05, "loss": 0.0317, "step": 16184 }, { "epoch": 3.9906311637080867, "grad_norm": 0.333984375, "learning_rate": 2.382876918226409e-05, "loss": 0.0342, "step": 16186 }, { "epoch": 3.9911242603550297, "grad_norm": 0.3125, "learning_rate": 2.3806469441775236e-05, "loss": 0.0338, "step": 16188 }, { "epoch": 3.9916173570019726, "grad_norm": 0.287109375, "learning_rate": 2.3784178730645103e-05, "loss": 0.0322, "step": 16190 }, { "epoch": 3.992110453648915, "grad_norm": 0.365234375, "learning_rate": 2.376189705151539e-05, "loss": 0.0312, "step": 16192 }, { "epoch": 3.992603550295858, "grad_norm": 0.31640625, "learning_rate": 2.373962440702647e-05, "loss": 0.0313, "step": 16194 }, { "epoch": 3.9930966469428006, "grad_norm": 0.302734375, "learning_rate": 2.37173607998179e-05, "loss": 0.0331, "step": 16196 }, { "epoch": 3.9935897435897436, "grad_norm": 0.314453125, "learning_rate": 2.369510623252795e-05, "loss": 0.036, "step": 16198 }, { "epoch": 3.994082840236686, "grad_norm": 0.310546875, "learning_rate": 2.3672860707793953e-05, "loss": 0.0351, "step": 16200 }, { "epoch": 3.994575936883629, "grad_norm": 0.310546875, "learning_rate": 2.365062422825215e-05, "loss": 0.0302, "step": 16202 }, { "epoch": 3.995069033530572, "grad_norm": 0.34375, "learning_rate": 2.3628396796537588e-05, "loss": 0.0362, "step": 16204 }, { "epoch": 3.9955621301775146, "grad_norm": 0.3828125, "learning_rate": 2.3606178415284473e-05, "loss": 0.0318, "step": 16206 }, { "epoch": 3.9960552268244576, "grad_norm": 0.298828125, "learning_rate": 2.358396908712569e-05, "loss": 0.0319, "step": 16208 }, { "epoch": 3.9965483234714005, "grad_norm": 0.306640625, "learning_rate": 2.3561768814693274e-05, "loss": 0.0336, "step": 16210 }, { "epoch": 3.997041420118343, "grad_norm": 0.322265625, "learning_rate": 2.3539577600618003e-05, "loss": 0.0332, "step": 16212 }, { "epoch": 3.997534516765286, "grad_norm": 0.291015625, "learning_rate": 2.3517395447529687e-05, "loss": 0.0331, "step": 16214 }, { "epoch": 3.998027613412229, "grad_norm": 0.3046875, "learning_rate": 2.3495222358057058e-05, "loss": 0.0355, "step": 16216 }, { "epoch": 3.9985207100591715, "grad_norm": 0.36328125, "learning_rate": 2.347305833482768e-05, "loss": 0.0373, "step": 16218 }, { "epoch": 3.9990138067061145, "grad_norm": 0.28125, "learning_rate": 2.3450903380468214e-05, "loss": 0.0326, "step": 16220 }, { "epoch": 3.9995069033530575, "grad_norm": 0.279296875, "learning_rate": 2.3428757497604027e-05, "loss": 0.0336, "step": 16222 }, { "epoch": 4.0, "grad_norm": 0.36328125, "learning_rate": 2.340662068885966e-05, "loss": 0.0275, "step": 16224 }, { "epoch": 4.0, "eval_loss": 0.06094944104552269, "eval_runtime": 68.5538, "eval_samples_per_second": 232.358, "eval_steps_per_second": 1.823, "step": 16224 }, { "epoch": 4.0004930966469425, "grad_norm": 0.265625, "learning_rate": 2.3384492956858363e-05, "loss": 0.0223, "step": 16226 }, { "epoch": 4.000986193293886, "grad_norm": 0.2333984375, "learning_rate": 2.336237430422241e-05, "loss": 0.0194, "step": 16228 }, { "epoch": 4.0014792899408285, "grad_norm": 0.21875, "learning_rate": 2.334026473357299e-05, "loss": 0.0194, "step": 16230 }, { "epoch": 4.001972386587771, "grad_norm": 0.2734375, "learning_rate": 2.3318164247530215e-05, "loss": 0.0216, "step": 16232 }, { "epoch": 4.002465483234714, "grad_norm": 0.279296875, "learning_rate": 2.3296072848713134e-05, "loss": 0.0211, "step": 16234 }, { "epoch": 4.002958579881657, "grad_norm": 0.234375, "learning_rate": 2.3273990539739633e-05, "loss": 0.0196, "step": 16236 }, { "epoch": 4.0034516765285995, "grad_norm": 0.259765625, "learning_rate": 2.3251917323226692e-05, "loss": 0.0246, "step": 16238 }, { "epoch": 4.003944773175542, "grad_norm": 0.29296875, "learning_rate": 2.3229853201790032e-05, "loss": 0.0261, "step": 16240 }, { "epoch": 4.004437869822485, "grad_norm": 0.240234375, "learning_rate": 2.3207798178044384e-05, "loss": 0.0227, "step": 16242 }, { "epoch": 4.004930966469428, "grad_norm": 0.26953125, "learning_rate": 2.318575225460341e-05, "loss": 0.0232, "step": 16244 }, { "epoch": 4.0054240631163704, "grad_norm": 0.30078125, "learning_rate": 2.316371543407967e-05, "loss": 0.0222, "step": 16246 }, { "epoch": 4.005917159763314, "grad_norm": 0.25390625, "learning_rate": 2.314168771908468e-05, "loss": 0.0242, "step": 16248 }, { "epoch": 4.006410256410256, "grad_norm": 0.2431640625, "learning_rate": 2.311966911222876e-05, "loss": 0.0217, "step": 16250 }, { "epoch": 4.006903353057199, "grad_norm": 0.232421875, "learning_rate": 2.309765961612135e-05, "loss": 0.0206, "step": 16252 }, { "epoch": 4.007396449704142, "grad_norm": 0.298828125, "learning_rate": 2.3075659233370616e-05, "loss": 0.0208, "step": 16254 }, { "epoch": 4.007889546351085, "grad_norm": 0.2216796875, "learning_rate": 2.305366796658375e-05, "loss": 0.0231, "step": 16256 }, { "epoch": 4.008382642998027, "grad_norm": 0.275390625, "learning_rate": 2.3031685818366842e-05, "loss": 0.0226, "step": 16258 }, { "epoch": 4.008875739644971, "grad_norm": 0.2412109375, "learning_rate": 2.3009712791324912e-05, "loss": 0.0229, "step": 16260 }, { "epoch": 4.009368836291913, "grad_norm": 0.236328125, "learning_rate": 2.2987748888061866e-05, "loss": 0.0218, "step": 16262 }, { "epoch": 4.009861932938856, "grad_norm": 0.2412109375, "learning_rate": 2.296579411118055e-05, "loss": 0.0223, "step": 16264 }, { "epoch": 4.010355029585799, "grad_norm": 0.265625, "learning_rate": 2.294384846328278e-05, "loss": 0.0212, "step": 16266 }, { "epoch": 4.010848126232742, "grad_norm": 0.26171875, "learning_rate": 2.2921911946969155e-05, "loss": 0.019, "step": 16268 }, { "epoch": 4.011341222879684, "grad_norm": 0.32421875, "learning_rate": 2.2899984564839317e-05, "loss": 0.0251, "step": 16270 }, { "epoch": 4.011834319526627, "grad_norm": 0.271484375, "learning_rate": 2.2878066319491786e-05, "loss": 0.0207, "step": 16272 }, { "epoch": 4.01232741617357, "grad_norm": 0.24609375, "learning_rate": 2.2856157213523977e-05, "loss": 0.0206, "step": 16274 }, { "epoch": 4.012820512820513, "grad_norm": 0.275390625, "learning_rate": 2.2834257249532266e-05, "loss": 0.0225, "step": 16276 }, { "epoch": 4.013313609467455, "grad_norm": 0.2275390625, "learning_rate": 2.2812366430111898e-05, "loss": 0.0209, "step": 16278 }, { "epoch": 4.013806706114399, "grad_norm": 0.240234375, "learning_rate": 2.27904847578571e-05, "loss": 0.0208, "step": 16280 }, { "epoch": 4.014299802761341, "grad_norm": 0.248046875, "learning_rate": 2.276861223536092e-05, "loss": 0.0201, "step": 16282 }, { "epoch": 4.014792899408284, "grad_norm": 0.2333984375, "learning_rate": 2.2746748865215395e-05, "loss": 0.0217, "step": 16284 }, { "epoch": 4.015285996055227, "grad_norm": 0.2431640625, "learning_rate": 2.272489465001145e-05, "loss": 0.0244, "step": 16286 }, { "epoch": 4.01577909270217, "grad_norm": 0.25, "learning_rate": 2.2703049592338942e-05, "loss": 0.0205, "step": 16288 }, { "epoch": 4.016272189349112, "grad_norm": 0.275390625, "learning_rate": 2.268121369478662e-05, "loss": 0.0281, "step": 16290 }, { "epoch": 4.016765285996056, "grad_norm": 0.2353515625, "learning_rate": 2.2659386959942175e-05, "loss": 0.0167, "step": 16292 }, { "epoch": 4.017258382642998, "grad_norm": 0.23828125, "learning_rate": 2.263756939039219e-05, "loss": 0.0194, "step": 16294 }, { "epoch": 4.017751479289941, "grad_norm": 0.24609375, "learning_rate": 2.2615760988722167e-05, "loss": 0.0225, "step": 16296 }, { "epoch": 4.018244575936883, "grad_norm": 0.236328125, "learning_rate": 2.2593961757516558e-05, "loss": 0.0221, "step": 16298 }, { "epoch": 4.018737672583827, "grad_norm": 0.287109375, "learning_rate": 2.2572171699358626e-05, "loss": 0.0206, "step": 16300 }, { "epoch": 4.019230769230769, "grad_norm": 0.2578125, "learning_rate": 2.2550390816830645e-05, "loss": 0.0215, "step": 16302 }, { "epoch": 4.019723865877712, "grad_norm": 0.2451171875, "learning_rate": 2.2528619112513784e-05, "loss": 0.0261, "step": 16304 }, { "epoch": 4.020216962524655, "grad_norm": 0.302734375, "learning_rate": 2.25068565889881e-05, "loss": 0.0216, "step": 16306 }, { "epoch": 4.020710059171598, "grad_norm": 0.267578125, "learning_rate": 2.2485103248832585e-05, "loss": 0.0251, "step": 16308 }, { "epoch": 4.02120315581854, "grad_norm": 0.25390625, "learning_rate": 2.2463359094625114e-05, "loss": 0.0191, "step": 16310 }, { "epoch": 4.021696252465484, "grad_norm": 0.28125, "learning_rate": 2.2441624128942495e-05, "loss": 0.0215, "step": 16312 }, { "epoch": 4.022189349112426, "grad_norm": 0.2490234375, "learning_rate": 2.241989835436048e-05, "loss": 0.0227, "step": 16314 }, { "epoch": 4.022682445759369, "grad_norm": 0.205078125, "learning_rate": 2.239818177345364e-05, "loss": 0.0195, "step": 16316 }, { "epoch": 4.023175542406312, "grad_norm": 0.265625, "learning_rate": 2.2376474388795533e-05, "loss": 0.0266, "step": 16318 }, { "epoch": 4.023668639053255, "grad_norm": 0.25390625, "learning_rate": 2.2354776202958594e-05, "loss": 0.0246, "step": 16320 }, { "epoch": 4.024161735700197, "grad_norm": 0.3046875, "learning_rate": 2.233308721851419e-05, "loss": 0.0215, "step": 16322 }, { "epoch": 4.02465483234714, "grad_norm": 0.25, "learning_rate": 2.2311407438032606e-05, "loss": 0.0232, "step": 16324 }, { "epoch": 4.025147928994083, "grad_norm": 0.255859375, "learning_rate": 2.2289736864082978e-05, "loss": 0.0215, "step": 16326 }, { "epoch": 4.0256410256410255, "grad_norm": 0.2373046875, "learning_rate": 2.2268075499233443e-05, "loss": 0.0207, "step": 16328 }, { "epoch": 4.026134122287968, "grad_norm": 0.255859375, "learning_rate": 2.2246423346050937e-05, "loss": 0.0233, "step": 16330 }, { "epoch": 4.0266272189349115, "grad_norm": 0.294921875, "learning_rate": 2.2224780407101388e-05, "loss": 0.0221, "step": 16332 }, { "epoch": 4.027120315581854, "grad_norm": 0.2431640625, "learning_rate": 2.2203146684949595e-05, "loss": 0.0199, "step": 16334 }, { "epoch": 4.0276134122287965, "grad_norm": 0.306640625, "learning_rate": 2.2181522182159288e-05, "loss": 0.0241, "step": 16336 }, { "epoch": 4.02810650887574, "grad_norm": 0.24609375, "learning_rate": 2.2159906901293082e-05, "loss": 0.0222, "step": 16338 }, { "epoch": 4.0285996055226825, "grad_norm": 0.25390625, "learning_rate": 2.2138300844912506e-05, "loss": 0.0254, "step": 16340 }, { "epoch": 4.029092702169625, "grad_norm": 0.2099609375, "learning_rate": 2.211670401557804e-05, "loss": 0.0184, "step": 16342 }, { "epoch": 4.029585798816568, "grad_norm": 0.26171875, "learning_rate": 2.2095116415848937e-05, "loss": 0.0216, "step": 16344 }, { "epoch": 4.030078895463511, "grad_norm": 0.2412109375, "learning_rate": 2.207353804828356e-05, "loss": 0.0213, "step": 16346 }, { "epoch": 4.0305719921104535, "grad_norm": 0.291015625, "learning_rate": 2.205196891543897e-05, "loss": 0.0237, "step": 16348 }, { "epoch": 4.031065088757397, "grad_norm": 0.244140625, "learning_rate": 2.2030409019871277e-05, "loss": 0.0246, "step": 16350 }, { "epoch": 4.031558185404339, "grad_norm": 0.244140625, "learning_rate": 2.2008858364135443e-05, "loss": 0.0203, "step": 16352 }, { "epoch": 4.032051282051282, "grad_norm": 0.22265625, "learning_rate": 2.198731695078534e-05, "loss": 0.023, "step": 16354 }, { "epoch": 4.0325443786982245, "grad_norm": 0.271484375, "learning_rate": 2.1965784782373778e-05, "loss": 0.0236, "step": 16356 }, { "epoch": 4.033037475345168, "grad_norm": 0.236328125, "learning_rate": 2.194426186145234e-05, "loss": 0.0201, "step": 16358 }, { "epoch": 4.03353057199211, "grad_norm": 0.24609375, "learning_rate": 2.192274819057174e-05, "loss": 0.0228, "step": 16360 }, { "epoch": 4.034023668639053, "grad_norm": 0.263671875, "learning_rate": 2.190124377228139e-05, "loss": 0.0217, "step": 16362 }, { "epoch": 4.034516765285996, "grad_norm": 0.23828125, "learning_rate": 2.187974860912968e-05, "loss": 0.024, "step": 16364 }, { "epoch": 4.035009861932939, "grad_norm": 0.28125, "learning_rate": 2.1858262703663944e-05, "loss": 0.0208, "step": 16366 }, { "epoch": 4.035502958579881, "grad_norm": 0.2578125, "learning_rate": 2.1836786058430358e-05, "loss": 0.02, "step": 16368 }, { "epoch": 4.035996055226825, "grad_norm": 0.2470703125, "learning_rate": 2.181531867597406e-05, "loss": 0.0227, "step": 16370 }, { "epoch": 4.036489151873767, "grad_norm": 0.25390625, "learning_rate": 2.1793860558838974e-05, "loss": 0.0222, "step": 16372 }, { "epoch": 4.03698224852071, "grad_norm": 0.287109375, "learning_rate": 2.1772411709568108e-05, "loss": 0.0237, "step": 16374 }, { "epoch": 4.037475345167653, "grad_norm": 0.2373046875, "learning_rate": 2.1750972130703163e-05, "loss": 0.0211, "step": 16376 }, { "epoch": 4.037968441814596, "grad_norm": 0.2265625, "learning_rate": 2.172954182478497e-05, "loss": 0.0187, "step": 16378 }, { "epoch": 4.038461538461538, "grad_norm": 0.255859375, "learning_rate": 2.170812079435305e-05, "loss": 0.0224, "step": 16380 }, { "epoch": 4.038954635108481, "grad_norm": 0.2451171875, "learning_rate": 2.168670904194594e-05, "loss": 0.0193, "step": 16382 }, { "epoch": 4.039447731755424, "grad_norm": 0.271484375, "learning_rate": 2.1665306570101086e-05, "loss": 0.0176, "step": 16384 }, { "epoch": 4.039940828402367, "grad_norm": 0.2451171875, "learning_rate": 2.1643913381354707e-05, "loss": 0.0215, "step": 16386 }, { "epoch": 4.040433925049309, "grad_norm": 0.208984375, "learning_rate": 2.1622529478242125e-05, "loss": 0.0195, "step": 16388 }, { "epoch": 4.040927021696253, "grad_norm": 0.193359375, "learning_rate": 2.1601154863297355e-05, "loss": 0.0193, "step": 16390 }, { "epoch": 4.041420118343195, "grad_norm": 0.2470703125, "learning_rate": 2.157978953905352e-05, "loss": 0.0243, "step": 16392 }, { "epoch": 4.041913214990138, "grad_norm": 0.2373046875, "learning_rate": 2.155843350804243e-05, "loss": 0.0197, "step": 16394 }, { "epoch": 4.042406311637081, "grad_norm": 0.2197265625, "learning_rate": 2.1537086772794934e-05, "loss": 0.0214, "step": 16396 }, { "epoch": 4.042899408284024, "grad_norm": 0.2451171875, "learning_rate": 2.1515749335840773e-05, "loss": 0.0195, "step": 16398 }, { "epoch": 4.043392504930966, "grad_norm": 0.26171875, "learning_rate": 2.1494421199708447e-05, "loss": 0.0232, "step": 16400 }, { "epoch": 4.04388560157791, "grad_norm": 0.2890625, "learning_rate": 2.147310236692559e-05, "loss": 0.0221, "step": 16402 }, { "epoch": 4.044378698224852, "grad_norm": 0.2236328125, "learning_rate": 2.145179284001848e-05, "loss": 0.0221, "step": 16404 }, { "epoch": 4.044871794871795, "grad_norm": 0.2451171875, "learning_rate": 2.143049262151253e-05, "loss": 0.0209, "step": 16406 }, { "epoch": 4.045364891518737, "grad_norm": 0.244140625, "learning_rate": 2.1409201713931858e-05, "loss": 0.0211, "step": 16408 }, { "epoch": 4.045857988165681, "grad_norm": 0.2421875, "learning_rate": 2.1387920119799577e-05, "loss": 0.0234, "step": 16410 }, { "epoch": 4.046351084812623, "grad_norm": 0.25390625, "learning_rate": 2.1366647841637698e-05, "loss": 0.023, "step": 16412 }, { "epoch": 4.046844181459566, "grad_norm": 0.25390625, "learning_rate": 2.1345384881967036e-05, "loss": 0.0218, "step": 16414 }, { "epoch": 4.047337278106509, "grad_norm": 0.275390625, "learning_rate": 2.1324131243307466e-05, "loss": 0.0213, "step": 16416 }, { "epoch": 4.047830374753452, "grad_norm": 0.21875, "learning_rate": 2.1302886928177556e-05, "loss": 0.0196, "step": 16418 }, { "epoch": 4.048323471400394, "grad_norm": 0.259765625, "learning_rate": 2.1281651939094992e-05, "loss": 0.0229, "step": 16420 }, { "epoch": 4.048816568047338, "grad_norm": 0.28125, "learning_rate": 2.126042627857615e-05, "loss": 0.0196, "step": 16422 }, { "epoch": 4.04930966469428, "grad_norm": 0.23046875, "learning_rate": 2.123920994913643e-05, "loss": 0.0212, "step": 16424 }, { "epoch": 4.049802761341223, "grad_norm": 0.251953125, "learning_rate": 2.121800295329006e-05, "loss": 0.0228, "step": 16426 }, { "epoch": 4.050295857988166, "grad_norm": 0.2412109375, "learning_rate": 2.1196805293550215e-05, "loss": 0.0203, "step": 16428 }, { "epoch": 4.050788954635109, "grad_norm": 0.26953125, "learning_rate": 2.1175616972428946e-05, "loss": 0.0215, "step": 16430 }, { "epoch": 4.051282051282051, "grad_norm": 0.27734375, "learning_rate": 2.1154437992437103e-05, "loss": 0.0229, "step": 16432 }, { "epoch": 4.0517751479289945, "grad_norm": 0.28125, "learning_rate": 2.1133268356084634e-05, "loss": 0.0208, "step": 16434 }, { "epoch": 4.052268244575937, "grad_norm": 0.2490234375, "learning_rate": 2.1112108065880175e-05, "loss": 0.0218, "step": 16436 }, { "epoch": 4.05276134122288, "grad_norm": 0.275390625, "learning_rate": 2.1090957124331366e-05, "loss": 0.0241, "step": 16438 }, { "epoch": 4.053254437869822, "grad_norm": 0.275390625, "learning_rate": 2.1069815533944715e-05, "loss": 0.0219, "step": 16440 }, { "epoch": 4.0537475345167655, "grad_norm": 0.2421875, "learning_rate": 2.1048683297225603e-05, "loss": 0.0202, "step": 16442 }, { "epoch": 4.054240631163708, "grad_norm": 0.2265625, "learning_rate": 2.1027560416678328e-05, "loss": 0.0202, "step": 16444 }, { "epoch": 4.054733727810651, "grad_norm": 0.251953125, "learning_rate": 2.1006446894806065e-05, "loss": 0.0217, "step": 16446 }, { "epoch": 4.055226824457594, "grad_norm": 0.259765625, "learning_rate": 2.0985342734110925e-05, "loss": 0.0199, "step": 16448 }, { "epoch": 4.0557199211045365, "grad_norm": 0.2216796875, "learning_rate": 2.0964247937093807e-05, "loss": 0.0224, "step": 16450 }, { "epoch": 4.056213017751479, "grad_norm": 0.248046875, "learning_rate": 2.0943162506254576e-05, "loss": 0.0241, "step": 16452 }, { "epoch": 4.056706114398422, "grad_norm": 0.212890625, "learning_rate": 2.0922086444091994e-05, "loss": 0.0218, "step": 16454 }, { "epoch": 4.057199211045365, "grad_norm": 0.23046875, "learning_rate": 2.090101975310368e-05, "loss": 0.0233, "step": 16456 }, { "epoch": 4.0576923076923075, "grad_norm": 0.267578125, "learning_rate": 2.087996243578616e-05, "loss": 0.0207, "step": 16458 }, { "epoch": 4.058185404339251, "grad_norm": 0.2158203125, "learning_rate": 2.085891449463484e-05, "loss": 0.0203, "step": 16460 }, { "epoch": 4.058678500986193, "grad_norm": 0.2333984375, "learning_rate": 2.0837875932144045e-05, "loss": 0.0209, "step": 16462 }, { "epoch": 4.059171597633136, "grad_norm": 0.2197265625, "learning_rate": 2.0816846750806907e-05, "loss": 0.021, "step": 16464 }, { "epoch": 4.0596646942800785, "grad_norm": 0.2177734375, "learning_rate": 2.079582695311554e-05, "loss": 0.0215, "step": 16466 }, { "epoch": 4.060157790927022, "grad_norm": 0.2578125, "learning_rate": 2.0774816541560893e-05, "loss": 0.0223, "step": 16468 }, { "epoch": 4.060650887573964, "grad_norm": 0.240234375, "learning_rate": 2.0753815518632824e-05, "loss": 0.0196, "step": 16470 }, { "epoch": 4.061143984220907, "grad_norm": 0.2578125, "learning_rate": 2.0732823886820062e-05, "loss": 0.0212, "step": 16472 }, { "epoch": 4.06163708086785, "grad_norm": 0.236328125, "learning_rate": 2.0711841648610254e-05, "loss": 0.0232, "step": 16474 }, { "epoch": 4.062130177514793, "grad_norm": 0.251953125, "learning_rate": 2.0690868806489882e-05, "loss": 0.0241, "step": 16476 }, { "epoch": 4.062623274161735, "grad_norm": 0.265625, "learning_rate": 2.066990536294441e-05, "loss": 0.0213, "step": 16478 }, { "epoch": 4.063116370808679, "grad_norm": 0.2177734375, "learning_rate": 2.0648951320458034e-05, "loss": 0.0197, "step": 16480 }, { "epoch": 4.063609467455621, "grad_norm": 0.23828125, "learning_rate": 2.0628006681513967e-05, "loss": 0.0215, "step": 16482 }, { "epoch": 4.064102564102564, "grad_norm": 0.27734375, "learning_rate": 2.0607071448594272e-05, "loss": 0.0242, "step": 16484 }, { "epoch": 4.064595660749507, "grad_norm": 0.248046875, "learning_rate": 2.0586145624179876e-05, "loss": 0.0201, "step": 16486 }, { "epoch": 4.06508875739645, "grad_norm": 0.2470703125, "learning_rate": 2.0565229210750613e-05, "loss": 0.0238, "step": 16488 }, { "epoch": 4.065581854043392, "grad_norm": 0.25390625, "learning_rate": 2.0544322210785205e-05, "loss": 0.0226, "step": 16490 }, { "epoch": 4.066074950690336, "grad_norm": 0.2431640625, "learning_rate": 2.0523424626761257e-05, "loss": 0.0203, "step": 16492 }, { "epoch": 4.066568047337278, "grad_norm": 0.2353515625, "learning_rate": 2.0502536461155207e-05, "loss": 0.0211, "step": 16494 }, { "epoch": 4.067061143984221, "grad_norm": 0.26171875, "learning_rate": 2.048165771644244e-05, "loss": 0.022, "step": 16496 }, { "epoch": 4.067554240631163, "grad_norm": 0.2353515625, "learning_rate": 2.0460788395097207e-05, "loss": 0.0198, "step": 16498 }, { "epoch": 4.068047337278107, "grad_norm": 0.2431640625, "learning_rate": 2.0439928499592643e-05, "loss": 0.0211, "step": 16500 }, { "epoch": 4.068540433925049, "grad_norm": 0.2451171875, "learning_rate": 2.0419078032400764e-05, "loss": 0.0242, "step": 16502 }, { "epoch": 4.069033530571992, "grad_norm": 0.23046875, "learning_rate": 2.039823699599246e-05, "loss": 0.0189, "step": 16504 }, { "epoch": 4.069526627218935, "grad_norm": 0.2412109375, "learning_rate": 2.0377405392837522e-05, "loss": 0.0215, "step": 16506 }, { "epoch": 4.070019723865878, "grad_norm": 0.232421875, "learning_rate": 2.035658322540457e-05, "loss": 0.0191, "step": 16508 }, { "epoch": 4.07051282051282, "grad_norm": 0.25, "learning_rate": 2.033577049616121e-05, "loss": 0.0221, "step": 16510 }, { "epoch": 4.071005917159764, "grad_norm": 0.255859375, "learning_rate": 2.031496720757382e-05, "loss": 0.021, "step": 16512 }, { "epoch": 4.071499013806706, "grad_norm": 0.2109375, "learning_rate": 2.0294173362107716e-05, "loss": 0.0232, "step": 16514 }, { "epoch": 4.071992110453649, "grad_norm": 0.23046875, "learning_rate": 2.0273388962227092e-05, "loss": 0.0207, "step": 16516 }, { "epoch": 4.072485207100592, "grad_norm": 0.24609375, "learning_rate": 2.025261401039501e-05, "loss": 0.0217, "step": 16518 }, { "epoch": 4.072978303747535, "grad_norm": 0.28515625, "learning_rate": 2.023184850907345e-05, "loss": 0.0247, "step": 16520 }, { "epoch": 4.073471400394477, "grad_norm": 0.2421875, "learning_rate": 2.0211092460723146e-05, "loss": 0.0202, "step": 16522 }, { "epoch": 4.07396449704142, "grad_norm": 0.240234375, "learning_rate": 2.019034586780394e-05, "loss": 0.0207, "step": 16524 }, { "epoch": 4.074457593688363, "grad_norm": 0.2421875, "learning_rate": 2.016960873277428e-05, "loss": 0.0202, "step": 16526 }, { "epoch": 4.074950690335306, "grad_norm": 0.263671875, "learning_rate": 2.0148881058091763e-05, "loss": 0.0214, "step": 16528 }, { "epoch": 4.075443786982248, "grad_norm": 0.283203125, "learning_rate": 2.012816284621264e-05, "loss": 0.0191, "step": 16530 }, { "epoch": 4.075936883629192, "grad_norm": 0.2373046875, "learning_rate": 2.010745409959217e-05, "loss": 0.0216, "step": 16532 }, { "epoch": 4.076429980276134, "grad_norm": 0.2265625, "learning_rate": 2.0086754820684483e-05, "loss": 0.022, "step": 16534 }, { "epoch": 4.076923076923077, "grad_norm": 0.251953125, "learning_rate": 2.006606501194247e-05, "loss": 0.0203, "step": 16536 }, { "epoch": 4.07741617357002, "grad_norm": 0.2275390625, "learning_rate": 2.00453846758181e-05, "loss": 0.0232, "step": 16538 }, { "epoch": 4.077909270216963, "grad_norm": 0.248046875, "learning_rate": 2.002471381476201e-05, "loss": 0.0202, "step": 16540 }, { "epoch": 4.078402366863905, "grad_norm": 0.2373046875, "learning_rate": 2.0004052431223897e-05, "loss": 0.0223, "step": 16542 }, { "epoch": 4.0788954635108485, "grad_norm": 0.24609375, "learning_rate": 1.9983400527652197e-05, "loss": 0.0216, "step": 16544 }, { "epoch": 4.079388560157791, "grad_norm": 0.265625, "learning_rate": 1.9962758106494284e-05, "loss": 0.0218, "step": 16546 }, { "epoch": 4.079881656804734, "grad_norm": 0.2412109375, "learning_rate": 1.9942125170196436e-05, "loss": 0.0202, "step": 16548 }, { "epoch": 4.080374753451676, "grad_norm": 0.271484375, "learning_rate": 1.9921501721203684e-05, "loss": 0.0207, "step": 16550 }, { "epoch": 4.0808678500986195, "grad_norm": 0.2216796875, "learning_rate": 1.9900887761960128e-05, "loss": 0.0196, "step": 16552 }, { "epoch": 4.081360946745562, "grad_norm": 0.251953125, "learning_rate": 1.988028329490854e-05, "loss": 0.0196, "step": 16554 }, { "epoch": 4.081854043392505, "grad_norm": 0.25390625, "learning_rate": 1.9859688322490765e-05, "loss": 0.0211, "step": 16556 }, { "epoch": 4.082347140039448, "grad_norm": 0.24609375, "learning_rate": 1.983910284714735e-05, "loss": 0.0203, "step": 16558 }, { "epoch": 4.0828402366863905, "grad_norm": 0.2275390625, "learning_rate": 1.98185268713178e-05, "loss": 0.0185, "step": 16560 }, { "epoch": 4.083333333333333, "grad_norm": 0.255859375, "learning_rate": 1.979796039744053e-05, "loss": 0.0203, "step": 16562 }, { "epoch": 4.0838264299802765, "grad_norm": 0.2216796875, "learning_rate": 1.977740342795268e-05, "loss": 0.0231, "step": 16564 }, { "epoch": 4.084319526627219, "grad_norm": 0.24609375, "learning_rate": 1.9756855965290487e-05, "loss": 0.0205, "step": 16566 }, { "epoch": 4.0848126232741615, "grad_norm": 0.2451171875, "learning_rate": 1.973631801188882e-05, "loss": 0.0247, "step": 16568 }, { "epoch": 4.085305719921105, "grad_norm": 0.265625, "learning_rate": 1.9715789570181674e-05, "loss": 0.0222, "step": 16570 }, { "epoch": 4.085798816568047, "grad_norm": 0.294921875, "learning_rate": 1.969527064260167e-05, "loss": 0.0234, "step": 16572 }, { "epoch": 4.08629191321499, "grad_norm": 0.2412109375, "learning_rate": 1.9674761231580475e-05, "loss": 0.0223, "step": 16574 }, { "epoch": 4.0867850098619325, "grad_norm": 0.28125, "learning_rate": 1.965426133954854e-05, "loss": 0.0204, "step": 16576 }, { "epoch": 4.087278106508876, "grad_norm": 0.224609375, "learning_rate": 1.963377096893524e-05, "loss": 0.0214, "step": 16578 }, { "epoch": 4.087771203155818, "grad_norm": 0.2177734375, "learning_rate": 1.9613290122168802e-05, "loss": 0.0226, "step": 16580 }, { "epoch": 4.088264299802761, "grad_norm": 0.23046875, "learning_rate": 1.959281880167626e-05, "loss": 0.0183, "step": 16582 }, { "epoch": 4.088757396449704, "grad_norm": 0.2431640625, "learning_rate": 1.957235700988368e-05, "loss": 0.0227, "step": 16584 }, { "epoch": 4.089250493096647, "grad_norm": 0.302734375, "learning_rate": 1.955190474921581e-05, "loss": 0.0219, "step": 16586 }, { "epoch": 4.089743589743589, "grad_norm": 0.291015625, "learning_rate": 1.9531462022096403e-05, "loss": 0.019, "step": 16588 }, { "epoch": 4.090236686390533, "grad_norm": 0.220703125, "learning_rate": 1.951102883094802e-05, "loss": 0.0199, "step": 16590 }, { "epoch": 4.090729783037475, "grad_norm": 0.234375, "learning_rate": 1.9490605178192113e-05, "loss": 0.022, "step": 16592 }, { "epoch": 4.091222879684418, "grad_norm": 0.208984375, "learning_rate": 1.9470191066249033e-05, "loss": 0.021, "step": 16594 }, { "epoch": 4.091715976331361, "grad_norm": 0.2333984375, "learning_rate": 1.944978649753787e-05, "loss": 0.0189, "step": 16596 }, { "epoch": 4.092209072978304, "grad_norm": 0.267578125, "learning_rate": 1.9429391474476797e-05, "loss": 0.0188, "step": 16598 }, { "epoch": 4.092702169625246, "grad_norm": 0.2255859375, "learning_rate": 1.9409005999482666e-05, "loss": 0.0197, "step": 16600 }, { "epoch": 4.09319526627219, "grad_norm": 0.28125, "learning_rate": 1.938863007497127e-05, "loss": 0.0202, "step": 16602 }, { "epoch": 4.093688362919132, "grad_norm": 0.2578125, "learning_rate": 1.9368263703357302e-05, "loss": 0.0216, "step": 16604 }, { "epoch": 4.094181459566075, "grad_norm": 0.25390625, "learning_rate": 1.934790688705427e-05, "loss": 0.0217, "step": 16606 }, { "epoch": 4.094674556213017, "grad_norm": 0.283203125, "learning_rate": 1.9327559628474566e-05, "loss": 0.0226, "step": 16608 }, { "epoch": 4.095167652859961, "grad_norm": 0.267578125, "learning_rate": 1.9307221930029472e-05, "loss": 0.0212, "step": 16610 }, { "epoch": 4.095660749506903, "grad_norm": 0.2080078125, "learning_rate": 1.9286893794129134e-05, "loss": 0.0199, "step": 16612 }, { "epoch": 4.096153846153846, "grad_norm": 0.263671875, "learning_rate": 1.9266575223182503e-05, "loss": 0.0223, "step": 16614 }, { "epoch": 4.096646942800789, "grad_norm": 0.2578125, "learning_rate": 1.9246266219597465e-05, "loss": 0.0255, "step": 16616 }, { "epoch": 4.097140039447732, "grad_norm": 0.263671875, "learning_rate": 1.9225966785780748e-05, "loss": 0.0205, "step": 16618 }, { "epoch": 4.097633136094674, "grad_norm": 0.224609375, "learning_rate": 1.920567692413795e-05, "loss": 0.0214, "step": 16620 }, { "epoch": 4.098126232741618, "grad_norm": 0.298828125, "learning_rate": 1.9185396637073537e-05, "loss": 0.0242, "step": 16622 }, { "epoch": 4.09861932938856, "grad_norm": 0.283203125, "learning_rate": 1.9165125926990834e-05, "loss": 0.0251, "step": 16624 }, { "epoch": 4.099112426035503, "grad_norm": 0.216796875, "learning_rate": 1.914486479629206e-05, "loss": 0.0201, "step": 16626 }, { "epoch": 4.099605522682446, "grad_norm": 0.3203125, "learning_rate": 1.912461324737822e-05, "loss": 0.0191, "step": 16628 }, { "epoch": 4.100098619329389, "grad_norm": 0.310546875, "learning_rate": 1.9104371282649257e-05, "loss": 0.0246, "step": 16630 }, { "epoch": 4.100591715976331, "grad_norm": 0.26171875, "learning_rate": 1.9084138904503966e-05, "loss": 0.0226, "step": 16632 }, { "epoch": 4.101084812623274, "grad_norm": 0.2392578125, "learning_rate": 1.9063916115339984e-05, "loss": 0.0204, "step": 16634 }, { "epoch": 4.101577909270217, "grad_norm": 0.267578125, "learning_rate": 1.9043702917553828e-05, "loss": 0.02, "step": 16636 }, { "epoch": 4.10207100591716, "grad_norm": 0.2294921875, "learning_rate": 1.9023499313540893e-05, "loss": 0.0207, "step": 16638 }, { "epoch": 4.102564102564102, "grad_norm": 0.255859375, "learning_rate": 1.9003305305695395e-05, "loss": 0.0247, "step": 16640 }, { "epoch": 4.103057199211046, "grad_norm": 0.21875, "learning_rate": 1.8983120896410468e-05, "loss": 0.0204, "step": 16642 }, { "epoch": 4.103550295857988, "grad_norm": 0.255859375, "learning_rate": 1.8962946088078037e-05, "loss": 0.0207, "step": 16644 }, { "epoch": 4.104043392504931, "grad_norm": 0.373046875, "learning_rate": 1.894278088308894e-05, "loss": 0.0279, "step": 16646 }, { "epoch": 4.104536489151874, "grad_norm": 0.251953125, "learning_rate": 1.8922625283832886e-05, "loss": 0.0259, "step": 16648 }, { "epoch": 4.105029585798817, "grad_norm": 0.25, "learning_rate": 1.8902479292698405e-05, "loss": 0.0199, "step": 16650 }, { "epoch": 4.105522682445759, "grad_norm": 0.271484375, "learning_rate": 1.8882342912072913e-05, "loss": 0.0177, "step": 16652 }, { "epoch": 4.1060157790927025, "grad_norm": 0.345703125, "learning_rate": 1.8862216144342692e-05, "loss": 0.0265, "step": 16654 }, { "epoch": 4.106508875739645, "grad_norm": 0.34765625, "learning_rate": 1.884209899189291e-05, "loss": 0.0262, "step": 16656 }, { "epoch": 4.107001972386588, "grad_norm": 0.271484375, "learning_rate": 1.882199145710747e-05, "loss": 0.0238, "step": 16658 }, { "epoch": 4.107495069033531, "grad_norm": 0.2392578125, "learning_rate": 1.880189354236932e-05, "loss": 0.0208, "step": 16660 }, { "epoch": 4.1079881656804735, "grad_norm": 0.265625, "learning_rate": 1.8781805250060125e-05, "loss": 0.0283, "step": 16662 }, { "epoch": 4.108481262327416, "grad_norm": 0.29296875, "learning_rate": 1.8761726582560467e-05, "loss": 0.0266, "step": 16664 }, { "epoch": 4.108974358974359, "grad_norm": 0.2373046875, "learning_rate": 1.874165754224979e-05, "loss": 0.0208, "step": 16666 }, { "epoch": 4.109467455621302, "grad_norm": 0.259765625, "learning_rate": 1.872159813150638e-05, "loss": 0.0217, "step": 16668 }, { "epoch": 4.1099605522682445, "grad_norm": 0.27734375, "learning_rate": 1.870154835270742e-05, "loss": 0.0274, "step": 16670 }, { "epoch": 4.110453648915187, "grad_norm": 0.27734375, "learning_rate": 1.8681508208228836e-05, "loss": 0.0307, "step": 16672 }, { "epoch": 4.1109467455621305, "grad_norm": 0.251953125, "learning_rate": 1.8661477700445606e-05, "loss": 0.0228, "step": 16674 }, { "epoch": 4.111439842209073, "grad_norm": 0.33203125, "learning_rate": 1.8641456831731386e-05, "loss": 0.023, "step": 16676 }, { "epoch": 4.1119329388560155, "grad_norm": 0.283203125, "learning_rate": 1.8621445604458765e-05, "loss": 0.0281, "step": 16678 }, { "epoch": 4.112426035502959, "grad_norm": 0.330078125, "learning_rate": 1.8601444020999215e-05, "loss": 0.0298, "step": 16680 }, { "epoch": 4.1129191321499015, "grad_norm": 0.2392578125, "learning_rate": 1.8581452083723007e-05, "loss": 0.0212, "step": 16682 }, { "epoch": 4.113412228796844, "grad_norm": 0.2451171875, "learning_rate": 1.856146979499934e-05, "loss": 0.0188, "step": 16684 }, { "epoch": 4.113905325443787, "grad_norm": 0.30859375, "learning_rate": 1.8541497157196153e-05, "loss": 0.0315, "step": 16686 }, { "epoch": 4.11439842209073, "grad_norm": 0.33203125, "learning_rate": 1.852153417268041e-05, "loss": 0.0281, "step": 16688 }, { "epoch": 4.1148915187376724, "grad_norm": 0.255859375, "learning_rate": 1.8501580843817723e-05, "loss": 0.0214, "step": 16690 }, { "epoch": 4.115384615384615, "grad_norm": 0.2451171875, "learning_rate": 1.848163717297279e-05, "loss": 0.0219, "step": 16692 }, { "epoch": 4.115877712031558, "grad_norm": 0.302734375, "learning_rate": 1.846170316250897e-05, "loss": 0.028, "step": 16694 }, { "epoch": 4.116370808678501, "grad_norm": 0.3203125, "learning_rate": 1.8441778814788578e-05, "loss": 0.0315, "step": 16696 }, { "epoch": 4.116863905325443, "grad_norm": 0.294921875, "learning_rate": 1.8421864132172785e-05, "loss": 0.0183, "step": 16698 }, { "epoch": 4.117357001972387, "grad_norm": 0.2431640625, "learning_rate": 1.8401959117021507e-05, "loss": 0.0263, "step": 16700 }, { "epoch": 4.117850098619329, "grad_norm": 0.287109375, "learning_rate": 1.8382063771693714e-05, "loss": 0.0326, "step": 16702 }, { "epoch": 4.118343195266272, "grad_norm": 0.3125, "learning_rate": 1.8362178098547013e-05, "loss": 0.0287, "step": 16704 }, { "epoch": 4.118836291913215, "grad_norm": 0.29296875, "learning_rate": 1.8342302099938057e-05, "loss": 0.0264, "step": 16706 }, { "epoch": 4.119329388560158, "grad_norm": 0.2490234375, "learning_rate": 1.83224357782222e-05, "loss": 0.0231, "step": 16708 }, { "epoch": 4.1198224852071, "grad_norm": 0.302734375, "learning_rate": 1.830257913575374e-05, "loss": 0.0304, "step": 16710 }, { "epoch": 4.120315581854044, "grad_norm": 0.271484375, "learning_rate": 1.828273217488581e-05, "loss": 0.0284, "step": 16712 }, { "epoch": 4.120808678500986, "grad_norm": 0.2412109375, "learning_rate": 1.8262894897970318e-05, "loss": 0.0232, "step": 16714 }, { "epoch": 4.121301775147929, "grad_norm": 0.30078125, "learning_rate": 1.82430673073582e-05, "loss": 0.0247, "step": 16716 }, { "epoch": 4.121794871794871, "grad_norm": 0.298828125, "learning_rate": 1.822324940539901e-05, "loss": 0.0314, "step": 16718 }, { "epoch": 4.122287968441815, "grad_norm": 0.30078125, "learning_rate": 1.820344119444142e-05, "loss": 0.0296, "step": 16720 }, { "epoch": 4.122781065088757, "grad_norm": 0.248046875, "learning_rate": 1.8183642676832703e-05, "loss": 0.0259, "step": 16722 }, { "epoch": 4.1232741617357, "grad_norm": 0.298828125, "learning_rate": 1.8163853854919134e-05, "loss": 0.0245, "step": 16724 }, { "epoch": 4.123767258382643, "grad_norm": 0.294921875, "learning_rate": 1.8144074731045822e-05, "loss": 0.0305, "step": 16726 }, { "epoch": 4.124260355029586, "grad_norm": 0.298828125, "learning_rate": 1.8124305307556632e-05, "loss": 0.0297, "step": 16728 }, { "epoch": 4.124753451676528, "grad_norm": 0.265625, "learning_rate": 1.8104545586794453e-05, "loss": 0.0286, "step": 16730 }, { "epoch": 4.125246548323472, "grad_norm": 0.3515625, "learning_rate": 1.808479557110081e-05, "loss": 0.0258, "step": 16732 }, { "epoch": 4.125739644970414, "grad_norm": 0.291015625, "learning_rate": 1.806505526281631e-05, "loss": 0.0276, "step": 16734 }, { "epoch": 4.126232741617357, "grad_norm": 0.322265625, "learning_rate": 1.8045324664280195e-05, "loss": 0.0302, "step": 16736 }, { "epoch": 4.1267258382643, "grad_norm": 0.296875, "learning_rate": 1.802560377783068e-05, "loss": 0.0282, "step": 16738 }, { "epoch": 4.127218934911243, "grad_norm": 0.275390625, "learning_rate": 1.8005892605804796e-05, "loss": 0.032, "step": 16740 }, { "epoch": 4.127712031558185, "grad_norm": 0.326171875, "learning_rate": 1.7986191150538456e-05, "loss": 0.0328, "step": 16742 }, { "epoch": 4.128205128205128, "grad_norm": 0.31640625, "learning_rate": 1.796649941436638e-05, "loss": 0.0326, "step": 16744 }, { "epoch": 4.128698224852071, "grad_norm": 0.306640625, "learning_rate": 1.7946817399622097e-05, "loss": 0.0284, "step": 16746 }, { "epoch": 4.129191321499014, "grad_norm": 0.2890625, "learning_rate": 1.792714510863812e-05, "loss": 0.028, "step": 16748 }, { "epoch": 4.129684418145956, "grad_norm": 0.279296875, "learning_rate": 1.7907482543745667e-05, "loss": 0.0293, "step": 16750 }, { "epoch": 4.1301775147929, "grad_norm": 0.27734375, "learning_rate": 1.7887829707274884e-05, "loss": 0.0308, "step": 16752 }, { "epoch": 4.130670611439842, "grad_norm": 0.283203125, "learning_rate": 1.7868186601554726e-05, "loss": 0.027, "step": 16754 }, { "epoch": 4.131163708086785, "grad_norm": 0.275390625, "learning_rate": 1.7848553228913024e-05, "loss": 0.0246, "step": 16756 }, { "epoch": 4.131656804733728, "grad_norm": 0.30859375, "learning_rate": 1.7828929591676458e-05, "loss": 0.0289, "step": 16758 }, { "epoch": 4.132149901380671, "grad_norm": 0.29296875, "learning_rate": 1.780931569217048e-05, "loss": 0.0308, "step": 16760 }, { "epoch": 4.132642998027613, "grad_norm": 0.287109375, "learning_rate": 1.7789711532719532e-05, "loss": 0.0281, "step": 16762 }, { "epoch": 4.133136094674557, "grad_norm": 0.263671875, "learning_rate": 1.7770117115646744e-05, "loss": 0.0281, "step": 16764 }, { "epoch": 4.133629191321499, "grad_norm": 0.337890625, "learning_rate": 1.775053244327419e-05, "loss": 0.032, "step": 16766 }, { "epoch": 4.134122287968442, "grad_norm": 0.365234375, "learning_rate": 1.7730957517922774e-05, "loss": 0.03, "step": 16768 }, { "epoch": 4.134615384615385, "grad_norm": 0.330078125, "learning_rate": 1.7711392341912214e-05, "loss": 0.0355, "step": 16770 }, { "epoch": 4.1351084812623276, "grad_norm": 0.30859375, "learning_rate": 1.7691836917561112e-05, "loss": 0.031, "step": 16772 }, { "epoch": 4.13560157790927, "grad_norm": 0.296875, "learning_rate": 1.7672291247186866e-05, "loss": 0.0296, "step": 16774 }, { "epoch": 4.136094674556213, "grad_norm": 0.298828125, "learning_rate": 1.7652755333105808e-05, "loss": 0.029, "step": 16776 }, { "epoch": 4.136587771203156, "grad_norm": 0.29296875, "learning_rate": 1.7633229177632982e-05, "loss": 0.0287, "step": 16778 }, { "epoch": 4.1370808678500985, "grad_norm": 0.349609375, "learning_rate": 1.761371278308237e-05, "loss": 0.0288, "step": 16780 }, { "epoch": 4.137573964497041, "grad_norm": 0.310546875, "learning_rate": 1.759420615176679e-05, "loss": 0.0329, "step": 16782 }, { "epoch": 4.1380670611439845, "grad_norm": 0.28125, "learning_rate": 1.757470928599787e-05, "loss": 0.0298, "step": 16784 }, { "epoch": 4.138560157790927, "grad_norm": 0.27734375, "learning_rate": 1.7555222188086106e-05, "loss": 0.0258, "step": 16786 }, { "epoch": 4.1390532544378695, "grad_norm": 0.279296875, "learning_rate": 1.753574486034082e-05, "loss": 0.0279, "step": 16788 }, { "epoch": 4.139546351084813, "grad_norm": 0.341796875, "learning_rate": 1.7516277305070196e-05, "loss": 0.028, "step": 16790 }, { "epoch": 4.1400394477317555, "grad_norm": 0.349609375, "learning_rate": 1.7496819524581264e-05, "loss": 0.0362, "step": 16792 }, { "epoch": 4.140532544378698, "grad_norm": 0.28125, "learning_rate": 1.747737152117983e-05, "loss": 0.03, "step": 16794 }, { "epoch": 4.141025641025641, "grad_norm": 0.330078125, "learning_rate": 1.7457933297170615e-05, "loss": 0.0333, "step": 16796 }, { "epoch": 4.141518737672584, "grad_norm": 0.333984375, "learning_rate": 1.7438504854857163e-05, "loss": 0.038, "step": 16798 }, { "epoch": 4.1420118343195265, "grad_norm": 0.296875, "learning_rate": 1.7419086196541855e-05, "loss": 0.0328, "step": 16800 }, { "epoch": 4.142504930966469, "grad_norm": 0.341796875, "learning_rate": 1.7399677324525897e-05, "loss": 0.0322, "step": 16802 }, { "epoch": 4.142998027613412, "grad_norm": 0.330078125, "learning_rate": 1.7380278241109372e-05, "loss": 0.0323, "step": 16804 }, { "epoch": 4.143491124260355, "grad_norm": 0.322265625, "learning_rate": 1.736088894859118e-05, "loss": 0.0353, "step": 16806 }, { "epoch": 4.1439842209072975, "grad_norm": 0.322265625, "learning_rate": 1.734150944926902e-05, "loss": 0.0328, "step": 16808 }, { "epoch": 4.144477317554241, "grad_norm": 0.283203125, "learning_rate": 1.7322139745439492e-05, "loss": 0.0295, "step": 16810 }, { "epoch": 4.144970414201183, "grad_norm": 0.3515625, "learning_rate": 1.7302779839398032e-05, "loss": 0.0293, "step": 16812 }, { "epoch": 4.145463510848126, "grad_norm": 0.359375, "learning_rate": 1.728342973343887e-05, "loss": 0.0335, "step": 16814 }, { "epoch": 4.145956607495069, "grad_norm": 0.34375, "learning_rate": 1.7264089429855112e-05, "loss": 0.0314, "step": 16816 }, { "epoch": 4.146449704142012, "grad_norm": 0.306640625, "learning_rate": 1.724475893093871e-05, "loss": 0.0314, "step": 16818 }, { "epoch": 4.146942800788954, "grad_norm": 0.279296875, "learning_rate": 1.722543823898043e-05, "loss": 0.0315, "step": 16820 }, { "epoch": 4.147435897435898, "grad_norm": 0.337890625, "learning_rate": 1.7206127356269828e-05, "loss": 0.035, "step": 16822 }, { "epoch": 4.14792899408284, "grad_norm": 0.298828125, "learning_rate": 1.7186826285095458e-05, "loss": 0.0348, "step": 16824 }, { "epoch": 4.148422090729783, "grad_norm": 0.3359375, "learning_rate": 1.7167535027744507e-05, "loss": 0.0363, "step": 16826 }, { "epoch": 4.148915187376726, "grad_norm": 0.30078125, "learning_rate": 1.7148253586503127e-05, "loss": 0.0315, "step": 16828 }, { "epoch": 4.149408284023669, "grad_norm": 0.318359375, "learning_rate": 1.712898196365629e-05, "loss": 0.0335, "step": 16830 }, { "epoch": 4.149901380670611, "grad_norm": 0.33203125, "learning_rate": 1.7109720161487785e-05, "loss": 0.0354, "step": 16832 }, { "epoch": 4.150394477317554, "grad_norm": 0.35546875, "learning_rate": 1.709046818228024e-05, "loss": 0.0334, "step": 16834 }, { "epoch": 4.150887573964497, "grad_norm": 0.298828125, "learning_rate": 1.7071226028315113e-05, "loss": 0.0308, "step": 16836 }, { "epoch": 4.15138067061144, "grad_norm": 0.322265625, "learning_rate": 1.7051993701872747e-05, "loss": 0.0332, "step": 16838 }, { "epoch": 4.151873767258382, "grad_norm": 0.33203125, "learning_rate": 1.70327712052322e-05, "loss": 0.0304, "step": 16840 }, { "epoch": 4.152366863905326, "grad_norm": 0.275390625, "learning_rate": 1.701355854067155e-05, "loss": 0.0317, "step": 16842 }, { "epoch": 4.152859960552268, "grad_norm": 0.388671875, "learning_rate": 1.6994355710467524e-05, "loss": 0.0323, "step": 16844 }, { "epoch": 4.153353057199211, "grad_norm": 0.302734375, "learning_rate": 1.6975162716895787e-05, "loss": 0.0332, "step": 16846 }, { "epoch": 4.153846153846154, "grad_norm": 0.296875, "learning_rate": 1.695597956223082e-05, "loss": 0.0351, "step": 16848 }, { "epoch": 4.154339250493097, "grad_norm": 0.294921875, "learning_rate": 1.693680624874593e-05, "loss": 0.0301, "step": 16850 }, { "epoch": 4.154832347140039, "grad_norm": 0.33984375, "learning_rate": 1.6917642778713293e-05, "loss": 0.0333, "step": 16852 }, { "epoch": 4.155325443786983, "grad_norm": 0.294921875, "learning_rate": 1.6898489154403817e-05, "loss": 0.0308, "step": 16854 }, { "epoch": 4.155818540433925, "grad_norm": 0.376953125, "learning_rate": 1.6879345378087396e-05, "loss": 0.036, "step": 16856 }, { "epoch": 4.156311637080868, "grad_norm": 0.2890625, "learning_rate": 1.6860211452032615e-05, "loss": 0.0295, "step": 16858 }, { "epoch": 4.15680473372781, "grad_norm": 0.3671875, "learning_rate": 1.684108737850696e-05, "loss": 0.0307, "step": 16860 }, { "epoch": 4.157297830374754, "grad_norm": 0.337890625, "learning_rate": 1.6821973159776762e-05, "loss": 0.0318, "step": 16862 }, { "epoch": 4.157790927021696, "grad_norm": 0.328125, "learning_rate": 1.6802868798107142e-05, "loss": 0.0339, "step": 16864 }, { "epoch": 4.158284023668639, "grad_norm": 0.29296875, "learning_rate": 1.6783774295762112e-05, "loss": 0.0331, "step": 16866 }, { "epoch": 4.158777120315582, "grad_norm": 0.361328125, "learning_rate": 1.6764689655004394e-05, "loss": 0.0343, "step": 16868 }, { "epoch": 4.159270216962525, "grad_norm": 0.388671875, "learning_rate": 1.674561487809574e-05, "loss": 0.0383, "step": 16870 }, { "epoch": 4.159763313609467, "grad_norm": 0.380859375, "learning_rate": 1.6726549967296522e-05, "loss": 0.0367, "step": 16872 }, { "epoch": 4.160256410256411, "grad_norm": 0.3203125, "learning_rate": 1.6707494924866075e-05, "loss": 0.0299, "step": 16874 }, { "epoch": 4.160749506903353, "grad_norm": 0.36328125, "learning_rate": 1.6688449753062518e-05, "loss": 0.0333, "step": 16876 }, { "epoch": 4.161242603550296, "grad_norm": 0.333984375, "learning_rate": 1.666941445414283e-05, "loss": 0.0366, "step": 16878 }, { "epoch": 4.161735700197239, "grad_norm": 0.34765625, "learning_rate": 1.66503890303628e-05, "loss": 0.0431, "step": 16880 }, { "epoch": 4.162228796844182, "grad_norm": 0.3125, "learning_rate": 1.6631373483976985e-05, "loss": 0.0328, "step": 16882 }, { "epoch": 4.162721893491124, "grad_norm": 0.34375, "learning_rate": 1.661236781723894e-05, "loss": 0.0303, "step": 16884 }, { "epoch": 4.1632149901380675, "grad_norm": 0.3359375, "learning_rate": 1.6593372032400856e-05, "loss": 0.0354, "step": 16886 }, { "epoch": 4.16370808678501, "grad_norm": 0.306640625, "learning_rate": 1.657438613171387e-05, "loss": 0.0368, "step": 16888 }, { "epoch": 4.164201183431953, "grad_norm": 0.3125, "learning_rate": 1.6555410117427926e-05, "loss": 0.0322, "step": 16890 }, { "epoch": 4.164694280078895, "grad_norm": 0.380859375, "learning_rate": 1.6536443991791773e-05, "loss": 0.03, "step": 16892 }, { "epoch": 4.1651873767258385, "grad_norm": 0.314453125, "learning_rate": 1.651748775705303e-05, "loss": 0.0326, "step": 16894 }, { "epoch": 4.165680473372781, "grad_norm": 0.3046875, "learning_rate": 1.6498541415458036e-05, "loss": 0.0341, "step": 16896 }, { "epoch": 4.1661735700197235, "grad_norm": 0.349609375, "learning_rate": 1.647960496925216e-05, "loss": 0.0309, "step": 16898 }, { "epoch": 4.166666666666667, "grad_norm": 0.30859375, "learning_rate": 1.6460678420679377e-05, "loss": 0.0342, "step": 16900 }, { "epoch": 4.1671597633136095, "grad_norm": 0.322265625, "learning_rate": 1.6441761771982632e-05, "loss": 0.0364, "step": 16902 }, { "epoch": 4.167652859960552, "grad_norm": 0.32421875, "learning_rate": 1.6422855025403648e-05, "loss": 0.0356, "step": 16904 }, { "epoch": 4.168145956607495, "grad_norm": 0.3046875, "learning_rate": 1.6403958183182976e-05, "loss": 0.0296, "step": 16906 }, { "epoch": 4.168639053254438, "grad_norm": 0.310546875, "learning_rate": 1.6385071247560023e-05, "loss": 0.0322, "step": 16908 }, { "epoch": 4.1691321499013805, "grad_norm": 0.359375, "learning_rate": 1.6366194220772934e-05, "loss": 0.0355, "step": 16910 }, { "epoch": 4.169625246548324, "grad_norm": 0.322265625, "learning_rate": 1.6347327105058817e-05, "loss": 0.0359, "step": 16912 }, { "epoch": 4.170118343195266, "grad_norm": 0.37109375, "learning_rate": 1.6328469902653454e-05, "loss": 0.0349, "step": 16914 }, { "epoch": 4.170611439842209, "grad_norm": 0.310546875, "learning_rate": 1.6309622615791608e-05, "loss": 0.0342, "step": 16916 }, { "epoch": 4.1711045364891515, "grad_norm": 0.333984375, "learning_rate": 1.629078524670674e-05, "loss": 0.0327, "step": 16918 }, { "epoch": 4.171597633136095, "grad_norm": 0.3828125, "learning_rate": 1.6271957797631175e-05, "loss": 0.0356, "step": 16920 }, { "epoch": 4.172090729783037, "grad_norm": 0.33984375, "learning_rate": 1.625314027079611e-05, "loss": 0.0374, "step": 16922 }, { "epoch": 4.17258382642998, "grad_norm": 0.2890625, "learning_rate": 1.6234332668431485e-05, "loss": 0.0311, "step": 16924 }, { "epoch": 4.173076923076923, "grad_norm": 0.337890625, "learning_rate": 1.621553499276617e-05, "loss": 0.0379, "step": 16926 }, { "epoch": 4.173570019723866, "grad_norm": 0.361328125, "learning_rate": 1.6196747246027688e-05, "loss": 0.0404, "step": 16928 }, { "epoch": 4.174063116370808, "grad_norm": 0.30859375, "learning_rate": 1.6177969430442606e-05, "loss": 0.0331, "step": 16930 }, { "epoch": 4.174556213017752, "grad_norm": 0.333984375, "learning_rate": 1.6159201548236124e-05, "loss": 0.0314, "step": 16932 }, { "epoch": 4.175049309664694, "grad_norm": 0.33984375, "learning_rate": 1.614044360163237e-05, "loss": 0.0408, "step": 16934 }, { "epoch": 4.175542406311637, "grad_norm": 0.3203125, "learning_rate": 1.612169559285426e-05, "loss": 0.0364, "step": 16936 }, { "epoch": 4.17603550295858, "grad_norm": 0.330078125, "learning_rate": 1.610295752412353e-05, "loss": 0.0371, "step": 16938 }, { "epoch": 4.176528599605523, "grad_norm": 0.388671875, "learning_rate": 1.608422939766079e-05, "loss": 0.0366, "step": 16940 }, { "epoch": 4.177021696252465, "grad_norm": 0.33984375, "learning_rate": 1.6065511215685336e-05, "loss": 0.0379, "step": 16942 }, { "epoch": 4.177514792899408, "grad_norm": 0.318359375, "learning_rate": 1.604680298041549e-05, "loss": 0.0381, "step": 16944 }, { "epoch": 4.178007889546351, "grad_norm": 0.294921875, "learning_rate": 1.602810469406819e-05, "loss": 0.0335, "step": 16946 }, { "epoch": 4.178500986193294, "grad_norm": 0.283203125, "learning_rate": 1.600941635885933e-05, "loss": 0.0306, "step": 16948 }, { "epoch": 4.178994082840236, "grad_norm": 0.328125, "learning_rate": 1.5990737977003577e-05, "loss": 0.0322, "step": 16950 }, { "epoch": 4.17948717948718, "grad_norm": 0.318359375, "learning_rate": 1.597206955071442e-05, "loss": 0.0364, "step": 16952 }, { "epoch": 4.179980276134122, "grad_norm": 0.29296875, "learning_rate": 1.5953411082204174e-05, "loss": 0.0334, "step": 16954 }, { "epoch": 4.180473372781065, "grad_norm": 0.333984375, "learning_rate": 1.5934762573683958e-05, "loss": 0.0343, "step": 16956 }, { "epoch": 4.180966469428008, "grad_norm": 0.34375, "learning_rate": 1.5916124027363776e-05, "loss": 0.034, "step": 16958 }, { "epoch": 4.181459566074951, "grad_norm": 0.328125, "learning_rate": 1.5897495445452338e-05, "loss": 0.0341, "step": 16960 }, { "epoch": 4.181952662721893, "grad_norm": 0.3125, "learning_rate": 1.587887683015725e-05, "loss": 0.0319, "step": 16962 }, { "epoch": 4.182445759368837, "grad_norm": 0.310546875, "learning_rate": 1.5860268183684933e-05, "loss": 0.0322, "step": 16964 }, { "epoch": 4.182938856015779, "grad_norm": 0.296875, "learning_rate": 1.584166950824061e-05, "loss": 0.0339, "step": 16966 }, { "epoch": 4.183431952662722, "grad_norm": 0.291015625, "learning_rate": 1.5823080806028333e-05, "loss": 0.0334, "step": 16968 }, { "epoch": 4.183925049309664, "grad_norm": 0.3203125, "learning_rate": 1.580450207925096e-05, "loss": 0.0323, "step": 16970 }, { "epoch": 4.184418145956608, "grad_norm": 0.369140625, "learning_rate": 1.578593333011018e-05, "loss": 0.0335, "step": 16972 }, { "epoch": 4.18491124260355, "grad_norm": 0.41015625, "learning_rate": 1.5767374560806514e-05, "loss": 0.0387, "step": 16974 }, { "epoch": 4.185404339250493, "grad_norm": 0.34765625, "learning_rate": 1.5748825773539245e-05, "loss": 0.0345, "step": 16976 }, { "epoch": 4.185897435897436, "grad_norm": 0.296875, "learning_rate": 1.57302869705065e-05, "loss": 0.0335, "step": 16978 }, { "epoch": 4.186390532544379, "grad_norm": 0.318359375, "learning_rate": 1.5711758153905266e-05, "loss": 0.0328, "step": 16980 }, { "epoch": 4.186883629191321, "grad_norm": 0.314453125, "learning_rate": 1.5693239325931296e-05, "loss": 0.0359, "step": 16982 }, { "epoch": 4.187376725838265, "grad_norm": 0.345703125, "learning_rate": 1.5674730488779166e-05, "loss": 0.038, "step": 16984 }, { "epoch": 4.187869822485207, "grad_norm": 0.306640625, "learning_rate": 1.5656231644642294e-05, "loss": 0.0342, "step": 16986 }, { "epoch": 4.18836291913215, "grad_norm": 0.287109375, "learning_rate": 1.5637742795712908e-05, "loss": 0.0335, "step": 16988 }, { "epoch": 4.188856015779093, "grad_norm": 0.373046875, "learning_rate": 1.5619263944181995e-05, "loss": 0.0359, "step": 16990 }, { "epoch": 4.189349112426036, "grad_norm": 0.34375, "learning_rate": 1.560079509223942e-05, "loss": 0.0383, "step": 16992 }, { "epoch": 4.189842209072978, "grad_norm": 0.318359375, "learning_rate": 1.5582336242073858e-05, "loss": 0.0404, "step": 16994 }, { "epoch": 4.1903353057199215, "grad_norm": 0.306640625, "learning_rate": 1.5563887395872777e-05, "loss": 0.0337, "step": 16996 }, { "epoch": 4.190828402366864, "grad_norm": 0.34375, "learning_rate": 1.5545448555822474e-05, "loss": 0.0353, "step": 16998 }, { "epoch": 4.191321499013807, "grad_norm": 0.337890625, "learning_rate": 1.552701972410805e-05, "loss": 0.0361, "step": 17000 }, { "epoch": 4.191814595660749, "grad_norm": 0.36328125, "learning_rate": 1.550860090291345e-05, "loss": 0.0347, "step": 17002 }, { "epoch": 4.1923076923076925, "grad_norm": 0.328125, "learning_rate": 1.549019209442133e-05, "loss": 0.0375, "step": 17004 }, { "epoch": 4.192800788954635, "grad_norm": 0.353515625, "learning_rate": 1.5471793300813344e-05, "loss": 0.034, "step": 17006 }, { "epoch": 4.193293885601578, "grad_norm": 0.34375, "learning_rate": 1.5453404524269766e-05, "loss": 0.0351, "step": 17008 }, { "epoch": 4.193786982248521, "grad_norm": 0.36328125, "learning_rate": 1.5435025766969803e-05, "loss": 0.0325, "step": 17010 }, { "epoch": 4.1942800788954635, "grad_norm": 0.294921875, "learning_rate": 1.5416657031091432e-05, "loss": 0.0346, "step": 17012 }, { "epoch": 4.194773175542406, "grad_norm": 0.314453125, "learning_rate": 1.5398298318811467e-05, "loss": 0.0387, "step": 17014 }, { "epoch": 4.195266272189349, "grad_norm": 0.33203125, "learning_rate": 1.5379949632305523e-05, "loss": 0.0369, "step": 17016 }, { "epoch": 4.195759368836292, "grad_norm": 0.322265625, "learning_rate": 1.5361610973747954e-05, "loss": 0.0365, "step": 17018 }, { "epoch": 4.1962524654832345, "grad_norm": 0.35546875, "learning_rate": 1.5343282345312093e-05, "loss": 0.039, "step": 17020 }, { "epoch": 4.196745562130178, "grad_norm": 0.34375, "learning_rate": 1.5324963749169917e-05, "loss": 0.0423, "step": 17022 }, { "epoch": 4.19723865877712, "grad_norm": 0.349609375, "learning_rate": 1.5306655187492302e-05, "loss": 0.0424, "step": 17024 }, { "epoch": 4.197731755424063, "grad_norm": 0.30078125, "learning_rate": 1.528835666244892e-05, "loss": 0.0329, "step": 17026 }, { "epoch": 4.1982248520710055, "grad_norm": 0.349609375, "learning_rate": 1.527006817620823e-05, "loss": 0.0306, "step": 17028 }, { "epoch": 4.198717948717949, "grad_norm": 0.33203125, "learning_rate": 1.5251789730937571e-05, "loss": 0.0358, "step": 17030 }, { "epoch": 4.199211045364891, "grad_norm": 0.310546875, "learning_rate": 1.5233521328802947e-05, "loss": 0.0369, "step": 17032 }, { "epoch": 4.199704142011834, "grad_norm": 0.32421875, "learning_rate": 1.5215262971969357e-05, "loss": 0.035, "step": 17034 }, { "epoch": 4.200197238658777, "grad_norm": 0.373046875, "learning_rate": 1.5197014662600451e-05, "loss": 0.0372, "step": 17036 }, { "epoch": 4.20069033530572, "grad_norm": 0.3671875, "learning_rate": 1.517877640285883e-05, "loss": 0.0392, "step": 17038 }, { "epoch": 4.201183431952662, "grad_norm": 0.298828125, "learning_rate": 1.5160548194905765e-05, "loss": 0.0339, "step": 17040 }, { "epoch": 4.201676528599606, "grad_norm": 0.3046875, "learning_rate": 1.514233004090141e-05, "loss": 0.0354, "step": 17042 }, { "epoch": 4.202169625246548, "grad_norm": 0.390625, "learning_rate": 1.5124121943004766e-05, "loss": 0.0387, "step": 17044 }, { "epoch": 4.202662721893491, "grad_norm": 0.33203125, "learning_rate": 1.5105923903373508e-05, "loss": 0.0413, "step": 17046 }, { "epoch": 4.203155818540434, "grad_norm": 0.310546875, "learning_rate": 1.5087735924164303e-05, "loss": 0.0354, "step": 17048 }, { "epoch": 4.203648915187377, "grad_norm": 0.291015625, "learning_rate": 1.5069558007532426e-05, "loss": 0.0313, "step": 17050 }, { "epoch": 4.204142011834319, "grad_norm": 0.34765625, "learning_rate": 1.5051390155632172e-05, "loss": 0.0316, "step": 17052 }, { "epoch": 4.204635108481263, "grad_norm": 0.306640625, "learning_rate": 1.5033232370616445e-05, "loss": 0.0364, "step": 17054 }, { "epoch": 4.205128205128205, "grad_norm": 0.34375, "learning_rate": 1.501508465463708e-05, "loss": 0.0357, "step": 17056 }, { "epoch": 4.205621301775148, "grad_norm": 0.36328125, "learning_rate": 1.4996947009844697e-05, "loss": 0.0342, "step": 17058 }, { "epoch": 4.20611439842209, "grad_norm": 0.32421875, "learning_rate": 1.4978819438388658e-05, "loss": 0.0363, "step": 17060 }, { "epoch": 4.206607495069034, "grad_norm": 0.31640625, "learning_rate": 1.4960701942417243e-05, "loss": 0.0374, "step": 17062 }, { "epoch": 4.207100591715976, "grad_norm": 0.27734375, "learning_rate": 1.4942594524077413e-05, "loss": 0.0388, "step": 17064 }, { "epoch": 4.207593688362919, "grad_norm": 0.384765625, "learning_rate": 1.4924497185515063e-05, "loss": 0.0352, "step": 17066 }, { "epoch": 4.208086785009862, "grad_norm": 0.30078125, "learning_rate": 1.4906409928874787e-05, "loss": 0.0352, "step": 17068 }, { "epoch": 4.208579881656805, "grad_norm": 0.34765625, "learning_rate": 1.4888332756300027e-05, "loss": 0.0368, "step": 17070 }, { "epoch": 4.209072978303747, "grad_norm": 0.337890625, "learning_rate": 1.487026566993307e-05, "loss": 0.0386, "step": 17072 }, { "epoch": 4.209566074950691, "grad_norm": 0.349609375, "learning_rate": 1.485220867191488e-05, "loss": 0.0382, "step": 17074 }, { "epoch": 4.210059171597633, "grad_norm": 0.34375, "learning_rate": 1.4834161764385423e-05, "loss": 0.0391, "step": 17076 }, { "epoch": 4.210552268244576, "grad_norm": 0.353515625, "learning_rate": 1.4816124949483245e-05, "loss": 0.0409, "step": 17078 }, { "epoch": 4.211045364891519, "grad_norm": 0.34375, "learning_rate": 1.4798098229345925e-05, "loss": 0.04, "step": 17080 }, { "epoch": 4.211538461538462, "grad_norm": 0.279296875, "learning_rate": 1.4780081606109642e-05, "loss": 0.0349, "step": 17082 }, { "epoch": 4.212031558185404, "grad_norm": 0.375, "learning_rate": 1.4762075081909487e-05, "loss": 0.0402, "step": 17084 }, { "epoch": 4.212524654832347, "grad_norm": 0.3203125, "learning_rate": 1.4744078658879357e-05, "loss": 0.0383, "step": 17086 }, { "epoch": 4.21301775147929, "grad_norm": 0.3125, "learning_rate": 1.4726092339151908e-05, "loss": 0.0413, "step": 17088 }, { "epoch": 4.213510848126233, "grad_norm": 0.357421875, "learning_rate": 1.4708116124858651e-05, "loss": 0.0355, "step": 17090 }, { "epoch": 4.214003944773175, "grad_norm": 0.318359375, "learning_rate": 1.4690150018129788e-05, "loss": 0.0358, "step": 17092 }, { "epoch": 4.214497041420119, "grad_norm": 0.357421875, "learning_rate": 1.467219402109451e-05, "loss": 0.0344, "step": 17094 }, { "epoch": 4.214990138067061, "grad_norm": 0.328125, "learning_rate": 1.4654248135880621e-05, "loss": 0.038, "step": 17096 }, { "epoch": 4.215483234714004, "grad_norm": 0.31640625, "learning_rate": 1.4636312364614846e-05, "loss": 0.0348, "step": 17098 }, { "epoch": 4.215976331360947, "grad_norm": 0.302734375, "learning_rate": 1.461838670942266e-05, "loss": 0.0386, "step": 17100 }, { "epoch": 4.21646942800789, "grad_norm": 0.33984375, "learning_rate": 1.4600471172428353e-05, "loss": 0.0382, "step": 17102 }, { "epoch": 4.216962524654832, "grad_norm": 0.294921875, "learning_rate": 1.458256575575503e-05, "loss": 0.0358, "step": 17104 }, { "epoch": 4.2174556213017755, "grad_norm": 0.3359375, "learning_rate": 1.4564670461524576e-05, "loss": 0.0359, "step": 17106 }, { "epoch": 4.217948717948718, "grad_norm": 0.34375, "learning_rate": 1.45467852918577e-05, "loss": 0.0364, "step": 17108 }, { "epoch": 4.218441814595661, "grad_norm": 0.3125, "learning_rate": 1.4528910248873862e-05, "loss": 0.0363, "step": 17110 }, { "epoch": 4.218934911242604, "grad_norm": 0.3125, "learning_rate": 1.4511045334691364e-05, "loss": 0.0381, "step": 17112 }, { "epoch": 4.2194280078895465, "grad_norm": 0.400390625, "learning_rate": 1.4493190551427305e-05, "loss": 0.0383, "step": 17114 }, { "epoch": 4.219921104536489, "grad_norm": 0.314453125, "learning_rate": 1.4475345901197568e-05, "loss": 0.0353, "step": 17116 }, { "epoch": 4.220414201183432, "grad_norm": 0.341796875, "learning_rate": 1.445751138611685e-05, "loss": 0.0343, "step": 17118 }, { "epoch": 4.220907297830375, "grad_norm": 0.3203125, "learning_rate": 1.4439687008298652e-05, "loss": 0.0364, "step": 17120 }, { "epoch": 4.2214003944773175, "grad_norm": 0.3046875, "learning_rate": 1.442187276985526e-05, "loss": 0.0378, "step": 17122 }, { "epoch": 4.22189349112426, "grad_norm": 0.3125, "learning_rate": 1.4404068672897741e-05, "loss": 0.0361, "step": 17124 }, { "epoch": 4.2223865877712035, "grad_norm": 0.30859375, "learning_rate": 1.4386274719535986e-05, "loss": 0.0361, "step": 17126 }, { "epoch": 4.222879684418146, "grad_norm": 0.326171875, "learning_rate": 1.4368490911878685e-05, "loss": 0.0369, "step": 17128 }, { "epoch": 4.2233727810650885, "grad_norm": 0.34765625, "learning_rate": 1.4350717252033308e-05, "loss": 0.0375, "step": 17130 }, { "epoch": 4.223865877712032, "grad_norm": 0.30859375, "learning_rate": 1.4332953742106148e-05, "loss": 0.0354, "step": 17132 }, { "epoch": 4.2243589743589745, "grad_norm": 0.337890625, "learning_rate": 1.4315200384202276e-05, "loss": 0.0374, "step": 17134 }, { "epoch": 4.224852071005917, "grad_norm": 0.3203125, "learning_rate": 1.4297457180425555e-05, "loss": 0.0323, "step": 17136 }, { "epoch": 4.22534516765286, "grad_norm": 0.3203125, "learning_rate": 1.427972413287868e-05, "loss": 0.0357, "step": 17138 }, { "epoch": 4.225838264299803, "grad_norm": 0.3515625, "learning_rate": 1.426200124366308e-05, "loss": 0.0348, "step": 17140 }, { "epoch": 4.226331360946745, "grad_norm": 0.310546875, "learning_rate": 1.4244288514879034e-05, "loss": 0.0369, "step": 17142 }, { "epoch": 4.226824457593688, "grad_norm": 0.310546875, "learning_rate": 1.4226585948625592e-05, "loss": 0.035, "step": 17144 }, { "epoch": 4.227317554240631, "grad_norm": 0.32421875, "learning_rate": 1.4208893547000612e-05, "loss": 0.0387, "step": 17146 }, { "epoch": 4.227810650887574, "grad_norm": 0.3515625, "learning_rate": 1.4191211312100739e-05, "loss": 0.0384, "step": 17148 }, { "epoch": 4.228303747534516, "grad_norm": 0.3359375, "learning_rate": 1.4173539246021428e-05, "loss": 0.0407, "step": 17150 }, { "epoch": 4.22879684418146, "grad_norm": 0.29296875, "learning_rate": 1.4155877350856939e-05, "loss": 0.0371, "step": 17152 }, { "epoch": 4.229289940828402, "grad_norm": 0.31640625, "learning_rate": 1.4138225628700242e-05, "loss": 0.0363, "step": 17154 }, { "epoch": 4.229783037475345, "grad_norm": 0.31640625, "learning_rate": 1.4120584081643196e-05, "loss": 0.0363, "step": 17156 }, { "epoch": 4.230276134122288, "grad_norm": 0.341796875, "learning_rate": 1.410295271177644e-05, "loss": 0.0402, "step": 17158 }, { "epoch": 4.230769230769231, "grad_norm": 0.33984375, "learning_rate": 1.408533152118936e-05, "loss": 0.0386, "step": 17160 }, { "epoch": 4.231262327416173, "grad_norm": 0.33984375, "learning_rate": 1.4067720511970195e-05, "loss": 0.0387, "step": 17162 }, { "epoch": 4.231755424063117, "grad_norm": 0.345703125, "learning_rate": 1.405011968620592e-05, "loss": 0.0374, "step": 17164 }, { "epoch": 4.232248520710059, "grad_norm": 0.34765625, "learning_rate": 1.4032529045982379e-05, "loss": 0.0351, "step": 17166 }, { "epoch": 4.232741617357002, "grad_norm": 0.33203125, "learning_rate": 1.4014948593384069e-05, "loss": 0.0379, "step": 17168 }, { "epoch": 4.233234714003944, "grad_norm": 0.365234375, "learning_rate": 1.3997378330494493e-05, "loss": 0.0363, "step": 17170 }, { "epoch": 4.233727810650888, "grad_norm": 0.34765625, "learning_rate": 1.3979818259395728e-05, "loss": 0.0343, "step": 17172 }, { "epoch": 4.23422090729783, "grad_norm": 0.310546875, "learning_rate": 1.3962268382168775e-05, "loss": 0.0344, "step": 17174 }, { "epoch": 4.234714003944773, "grad_norm": 0.3046875, "learning_rate": 1.3944728700893394e-05, "loss": 0.0368, "step": 17176 }, { "epoch": 4.235207100591716, "grad_norm": 0.28125, "learning_rate": 1.3927199217648124e-05, "loss": 0.0343, "step": 17178 }, { "epoch": 4.235700197238659, "grad_norm": 0.328125, "learning_rate": 1.3909679934510345e-05, "loss": 0.0374, "step": 17180 }, { "epoch": 4.236193293885601, "grad_norm": 0.28515625, "learning_rate": 1.389217085355612e-05, "loss": 0.0345, "step": 17182 }, { "epoch": 4.236686390532545, "grad_norm": 0.322265625, "learning_rate": 1.3874671976860454e-05, "loss": 0.0385, "step": 17184 }, { "epoch": 4.237179487179487, "grad_norm": 0.34375, "learning_rate": 1.385718330649698e-05, "loss": 0.0367, "step": 17186 }, { "epoch": 4.23767258382643, "grad_norm": 0.294921875, "learning_rate": 1.3839704844538281e-05, "loss": 0.0376, "step": 17188 }, { "epoch": 4.238165680473373, "grad_norm": 0.34765625, "learning_rate": 1.3822236593055605e-05, "loss": 0.0352, "step": 17190 }, { "epoch": 4.238658777120316, "grad_norm": 0.326171875, "learning_rate": 1.3804778554119036e-05, "loss": 0.0387, "step": 17192 }, { "epoch": 4.239151873767258, "grad_norm": 0.283203125, "learning_rate": 1.3787330729797487e-05, "loss": 0.0349, "step": 17194 }, { "epoch": 4.239644970414201, "grad_norm": 0.318359375, "learning_rate": 1.376989312215855e-05, "loss": 0.0357, "step": 17196 }, { "epoch": 4.240138067061144, "grad_norm": 0.353515625, "learning_rate": 1.375246573326877e-05, "loss": 0.0357, "step": 17198 }, { "epoch": 4.240631163708087, "grad_norm": 0.310546875, "learning_rate": 1.373504856519331e-05, "loss": 0.036, "step": 17200 }, { "epoch": 4.241124260355029, "grad_norm": 0.361328125, "learning_rate": 1.3717641619996268e-05, "loss": 0.0378, "step": 17202 }, { "epoch": 4.241617357001973, "grad_norm": 0.310546875, "learning_rate": 1.3700244899740421e-05, "loss": 0.0354, "step": 17204 }, { "epoch": 4.242110453648915, "grad_norm": 0.337890625, "learning_rate": 1.3682858406487387e-05, "loss": 0.0348, "step": 17206 }, { "epoch": 4.242603550295858, "grad_norm": 0.31640625, "learning_rate": 1.366548214229758e-05, "loss": 0.0362, "step": 17208 }, { "epoch": 4.243096646942801, "grad_norm": 0.333984375, "learning_rate": 1.3648116109230136e-05, "loss": 0.0349, "step": 17210 }, { "epoch": 4.243589743589744, "grad_norm": 0.3125, "learning_rate": 1.3630760309343104e-05, "loss": 0.0349, "step": 17212 }, { "epoch": 4.244082840236686, "grad_norm": 0.328125, "learning_rate": 1.3613414744693143e-05, "loss": 0.0363, "step": 17214 }, { "epoch": 4.2445759368836296, "grad_norm": 0.298828125, "learning_rate": 1.3596079417335916e-05, "loss": 0.0345, "step": 17216 }, { "epoch": 4.245069033530572, "grad_norm": 0.287109375, "learning_rate": 1.357875432932566e-05, "loss": 0.0342, "step": 17218 }, { "epoch": 4.245562130177515, "grad_norm": 0.32421875, "learning_rate": 1.3561439482715532e-05, "loss": 0.0339, "step": 17220 }, { "epoch": 4.246055226824458, "grad_norm": 0.30859375, "learning_rate": 1.3544134879557468e-05, "loss": 0.0384, "step": 17222 }, { "epoch": 4.2465483234714005, "grad_norm": 0.33984375, "learning_rate": 1.3526840521902073e-05, "loss": 0.0321, "step": 17224 }, { "epoch": 4.247041420118343, "grad_norm": 0.328125, "learning_rate": 1.350955641179893e-05, "loss": 0.0374, "step": 17226 }, { "epoch": 4.247534516765286, "grad_norm": 0.40625, "learning_rate": 1.3492282551296209e-05, "loss": 0.0394, "step": 17228 }, { "epoch": 4.248027613412229, "grad_norm": 0.330078125, "learning_rate": 1.3475018942441053e-05, "loss": 0.0358, "step": 17230 }, { "epoch": 4.2485207100591715, "grad_norm": 0.36328125, "learning_rate": 1.3457765587279214e-05, "loss": 0.0393, "step": 17232 }, { "epoch": 4.249013806706114, "grad_norm": 0.3515625, "learning_rate": 1.3440522487855344e-05, "loss": 0.035, "step": 17234 }, { "epoch": 4.2495069033530575, "grad_norm": 0.34765625, "learning_rate": 1.3423289646212856e-05, "loss": 0.0404, "step": 17236 }, { "epoch": 4.25, "grad_norm": 0.32421875, "learning_rate": 1.3406067064393913e-05, "loss": 0.039, "step": 17238 }, { "epoch": 4.2504930966469425, "grad_norm": 0.384765625, "learning_rate": 1.3388854744439538e-05, "loss": 0.0375, "step": 17240 }, { "epoch": 4.250986193293886, "grad_norm": 0.349609375, "learning_rate": 1.3371652688389392e-05, "loss": 0.0353, "step": 17242 }, { "epoch": 4.2514792899408285, "grad_norm": 0.3359375, "learning_rate": 1.3354460898282128e-05, "loss": 0.0366, "step": 17244 }, { "epoch": 4.251972386587771, "grad_norm": 0.376953125, "learning_rate": 1.3337279376154987e-05, "loss": 0.0392, "step": 17246 }, { "epoch": 4.252465483234714, "grad_norm": 0.359375, "learning_rate": 1.3320108124044106e-05, "loss": 0.0385, "step": 17248 }, { "epoch": 4.252958579881657, "grad_norm": 0.330078125, "learning_rate": 1.3302947143984367e-05, "loss": 0.0368, "step": 17250 }, { "epoch": 4.2534516765285995, "grad_norm": 0.361328125, "learning_rate": 1.3285796438009446e-05, "loss": 0.0345, "step": 17252 }, { "epoch": 4.253944773175542, "grad_norm": 0.3203125, "learning_rate": 1.3268656008151815e-05, "loss": 0.0384, "step": 17254 }, { "epoch": 4.254437869822485, "grad_norm": 0.314453125, "learning_rate": 1.3251525856442658e-05, "loss": 0.0374, "step": 17256 }, { "epoch": 4.254930966469428, "grad_norm": 0.31640625, "learning_rate": 1.3234405984912068e-05, "loss": 0.0381, "step": 17258 }, { "epoch": 4.2554240631163704, "grad_norm": 0.314453125, "learning_rate": 1.3217296395588785e-05, "loss": 0.0373, "step": 17260 }, { "epoch": 4.255917159763314, "grad_norm": 0.33984375, "learning_rate": 1.320019709050041e-05, "loss": 0.0372, "step": 17262 }, { "epoch": 4.256410256410256, "grad_norm": 0.330078125, "learning_rate": 1.3183108071673312e-05, "loss": 0.0413, "step": 17264 }, { "epoch": 4.256903353057199, "grad_norm": 0.357421875, "learning_rate": 1.3166029341132624e-05, "loss": 0.0358, "step": 17266 }, { "epoch": 4.257396449704142, "grad_norm": 0.369140625, "learning_rate": 1.3148960900902286e-05, "loss": 0.0402, "step": 17268 }, { "epoch": 4.257889546351085, "grad_norm": 0.318359375, "learning_rate": 1.3131902753004988e-05, "loss": 0.0412, "step": 17270 }, { "epoch": 4.258382642998027, "grad_norm": 0.392578125, "learning_rate": 1.3114854899462248e-05, "loss": 0.04, "step": 17272 }, { "epoch": 4.258875739644971, "grad_norm": 0.416015625, "learning_rate": 1.3097817342294295e-05, "loss": 0.0358, "step": 17274 }, { "epoch": 4.259368836291913, "grad_norm": 0.359375, "learning_rate": 1.3080790083520167e-05, "loss": 0.0415, "step": 17276 }, { "epoch": 4.259861932938856, "grad_norm": 0.33203125, "learning_rate": 1.3063773125157718e-05, "loss": 0.035, "step": 17278 }, { "epoch": 4.260355029585799, "grad_norm": 0.34765625, "learning_rate": 1.3046766469223548e-05, "loss": 0.0397, "step": 17280 }, { "epoch": 4.260848126232742, "grad_norm": 0.349609375, "learning_rate": 1.3029770117733031e-05, "loss": 0.0415, "step": 17282 }, { "epoch": 4.261341222879684, "grad_norm": 0.376953125, "learning_rate": 1.3012784072700334e-05, "loss": 0.0368, "step": 17284 }, { "epoch": 4.261834319526627, "grad_norm": 0.3359375, "learning_rate": 1.2995808336138427e-05, "loss": 0.0403, "step": 17286 }, { "epoch": 4.26232741617357, "grad_norm": 0.34765625, "learning_rate": 1.2978842910058975e-05, "loss": 0.0397, "step": 17288 }, { "epoch": 4.262820512820513, "grad_norm": 0.37109375, "learning_rate": 1.2961887796472505e-05, "loss": 0.0385, "step": 17290 }, { "epoch": 4.263313609467455, "grad_norm": 0.279296875, "learning_rate": 1.2944942997388287e-05, "loss": 0.0333, "step": 17292 }, { "epoch": 4.263806706114399, "grad_norm": 0.34375, "learning_rate": 1.2928008514814371e-05, "loss": 0.04, "step": 17294 }, { "epoch": 4.264299802761341, "grad_norm": 0.314453125, "learning_rate": 1.2911084350757608e-05, "loss": 0.0371, "step": 17296 }, { "epoch": 4.264792899408284, "grad_norm": 0.33984375, "learning_rate": 1.289417050722358e-05, "loss": 0.0381, "step": 17298 }, { "epoch": 4.265285996055227, "grad_norm": 0.30078125, "learning_rate": 1.2877266986216684e-05, "loss": 0.0349, "step": 17300 }, { "epoch": 4.26577909270217, "grad_norm": 0.388671875, "learning_rate": 1.286037378974011e-05, "loss": 0.0386, "step": 17302 }, { "epoch": 4.266272189349112, "grad_norm": 0.375, "learning_rate": 1.2843490919795741e-05, "loss": 0.0368, "step": 17304 }, { "epoch": 4.266765285996055, "grad_norm": 0.4375, "learning_rate": 1.2826618378384315e-05, "loss": 0.0374, "step": 17306 }, { "epoch": 4.267258382642998, "grad_norm": 0.33203125, "learning_rate": 1.2809756167505328e-05, "loss": 0.0403, "step": 17308 }, { "epoch": 4.267751479289941, "grad_norm": 0.3203125, "learning_rate": 1.2792904289157048e-05, "loss": 0.0385, "step": 17310 }, { "epoch": 4.268244575936883, "grad_norm": 0.333984375, "learning_rate": 1.2776062745336504e-05, "loss": 0.0445, "step": 17312 }, { "epoch": 4.268737672583827, "grad_norm": 0.369140625, "learning_rate": 1.2759231538039529e-05, "loss": 0.0375, "step": 17314 }, { "epoch": 4.269230769230769, "grad_norm": 0.37109375, "learning_rate": 1.2742410669260717e-05, "loss": 0.0378, "step": 17316 }, { "epoch": 4.269723865877712, "grad_norm": 0.4765625, "learning_rate": 1.2725600140993388e-05, "loss": 0.042, "step": 17318 }, { "epoch": 4.270216962524655, "grad_norm": 0.296875, "learning_rate": 1.2708799955229766e-05, "loss": 0.034, "step": 17320 }, { "epoch": 4.270710059171598, "grad_norm": 0.341796875, "learning_rate": 1.2692010113960696e-05, "loss": 0.0386, "step": 17322 }, { "epoch": 4.27120315581854, "grad_norm": 0.373046875, "learning_rate": 1.2675230619175893e-05, "loss": 0.0367, "step": 17324 }, { "epoch": 4.271696252465484, "grad_norm": 0.3671875, "learning_rate": 1.2658461472863825e-05, "loss": 0.0328, "step": 17326 }, { "epoch": 4.272189349112426, "grad_norm": 0.322265625, "learning_rate": 1.2641702677011724e-05, "loss": 0.0394, "step": 17328 }, { "epoch": 4.272682445759369, "grad_norm": 0.30078125, "learning_rate": 1.2624954233605613e-05, "loss": 0.0369, "step": 17330 }, { "epoch": 4.273175542406312, "grad_norm": 0.384765625, "learning_rate": 1.2608216144630237e-05, "loss": 0.0386, "step": 17332 }, { "epoch": 4.273668639053255, "grad_norm": 0.318359375, "learning_rate": 1.2591488412069208e-05, "loss": 0.0387, "step": 17334 }, { "epoch": 4.274161735700197, "grad_norm": 0.333984375, "learning_rate": 1.257477103790482e-05, "loss": 0.0397, "step": 17336 }, { "epoch": 4.2746548323471405, "grad_norm": 0.337890625, "learning_rate": 1.2558064024118176e-05, "loss": 0.034, "step": 17338 }, { "epoch": 4.275147928994083, "grad_norm": 0.3046875, "learning_rate": 1.2541367372689172e-05, "loss": 0.0359, "step": 17340 }, { "epoch": 4.2756410256410255, "grad_norm": 0.341796875, "learning_rate": 1.2524681085596423e-05, "loss": 0.0375, "step": 17342 }, { "epoch": 4.276134122287968, "grad_norm": 0.33203125, "learning_rate": 1.2508005164817404e-05, "loss": 0.0359, "step": 17344 }, { "epoch": 4.2766272189349115, "grad_norm": 0.326171875, "learning_rate": 1.2491339612328212e-05, "loss": 0.0369, "step": 17346 }, { "epoch": 4.277120315581854, "grad_norm": 0.328125, "learning_rate": 1.24746844301039e-05, "loss": 0.0343, "step": 17348 }, { "epoch": 4.2776134122287965, "grad_norm": 0.326171875, "learning_rate": 1.2458039620118133e-05, "loss": 0.0368, "step": 17350 }, { "epoch": 4.27810650887574, "grad_norm": 0.3671875, "learning_rate": 1.244140518434348e-05, "loss": 0.0386, "step": 17352 }, { "epoch": 4.2785996055226825, "grad_norm": 0.291015625, "learning_rate": 1.2424781124751151e-05, "loss": 0.0327, "step": 17354 }, { "epoch": 4.279092702169625, "grad_norm": 0.27734375, "learning_rate": 1.2408167443311214e-05, "loss": 0.0322, "step": 17356 }, { "epoch": 4.279585798816568, "grad_norm": 0.3203125, "learning_rate": 1.2391564141992517e-05, "loss": 0.033, "step": 17358 }, { "epoch": 4.280078895463511, "grad_norm": 0.373046875, "learning_rate": 1.2374971222762565e-05, "loss": 0.0378, "step": 17360 }, { "epoch": 4.2805719921104535, "grad_norm": 0.349609375, "learning_rate": 1.2358388687587796e-05, "loss": 0.0351, "step": 17362 }, { "epoch": 4.281065088757396, "grad_norm": 0.37109375, "learning_rate": 1.2341816538433259e-05, "loss": 0.0364, "step": 17364 }, { "epoch": 4.281558185404339, "grad_norm": 0.380859375, "learning_rate": 1.2325254777262917e-05, "loss": 0.0347, "step": 17366 }, { "epoch": 4.282051282051282, "grad_norm": 0.322265625, "learning_rate": 1.230870340603939e-05, "loss": 0.0363, "step": 17368 }, { "epoch": 4.2825443786982245, "grad_norm": 0.369140625, "learning_rate": 1.2292162426724108e-05, "loss": 0.045, "step": 17370 }, { "epoch": 4.283037475345168, "grad_norm": 0.357421875, "learning_rate": 1.2275631841277268e-05, "loss": 0.0378, "step": 17372 }, { "epoch": 4.28353057199211, "grad_norm": 0.412109375, "learning_rate": 1.2259111651657862e-05, "loss": 0.0354, "step": 17374 }, { "epoch": 4.284023668639053, "grad_norm": 0.357421875, "learning_rate": 1.224260185982361e-05, "loss": 0.0369, "step": 17376 }, { "epoch": 4.284516765285996, "grad_norm": 0.361328125, "learning_rate": 1.2226102467730982e-05, "loss": 0.0383, "step": 17378 }, { "epoch": 4.285009861932939, "grad_norm": 0.322265625, "learning_rate": 1.2209613477335324e-05, "loss": 0.0364, "step": 17380 }, { "epoch": 4.285502958579881, "grad_norm": 0.33203125, "learning_rate": 1.2193134890590597e-05, "loss": 0.0367, "step": 17382 }, { "epoch": 4.285996055226825, "grad_norm": 0.3359375, "learning_rate": 1.2176666709449635e-05, "loss": 0.0344, "step": 17384 }, { "epoch": 4.286489151873767, "grad_norm": 0.43359375, "learning_rate": 1.2160208935864014e-05, "loss": 0.0358, "step": 17386 }, { "epoch": 4.28698224852071, "grad_norm": 0.349609375, "learning_rate": 1.2143761571784074e-05, "loss": 0.0403, "step": 17388 }, { "epoch": 4.287475345167653, "grad_norm": 0.37890625, "learning_rate": 1.212732461915893e-05, "loss": 0.0369, "step": 17390 }, { "epoch": 4.287968441814596, "grad_norm": 0.333984375, "learning_rate": 1.2110898079936394e-05, "loss": 0.0387, "step": 17392 }, { "epoch": 4.288461538461538, "grad_norm": 0.34765625, "learning_rate": 1.2094481956063196e-05, "loss": 0.037, "step": 17394 }, { "epoch": 4.288954635108481, "grad_norm": 0.3203125, "learning_rate": 1.207807624948466e-05, "loss": 0.0359, "step": 17396 }, { "epoch": 4.289447731755424, "grad_norm": 0.380859375, "learning_rate": 1.2061680962144983e-05, "loss": 0.0334, "step": 17398 }, { "epoch": 4.289940828402367, "grad_norm": 0.3125, "learning_rate": 1.204529609598709e-05, "loss": 0.0339, "step": 17400 }, { "epoch": 4.290433925049309, "grad_norm": 0.28515625, "learning_rate": 1.2028921652952696e-05, "loss": 0.0341, "step": 17402 }, { "epoch": 4.290927021696253, "grad_norm": 0.33203125, "learning_rate": 1.2012557634982269e-05, "loss": 0.0384, "step": 17404 }, { "epoch": 4.291420118343195, "grad_norm": 0.373046875, "learning_rate": 1.1996204044014981e-05, "loss": 0.036, "step": 17406 }, { "epoch": 4.291913214990138, "grad_norm": 0.314453125, "learning_rate": 1.1979860881988902e-05, "loss": 0.0352, "step": 17408 }, { "epoch": 4.292406311637081, "grad_norm": 0.314453125, "learning_rate": 1.196352815084073e-05, "loss": 0.0344, "step": 17410 }, { "epoch": 4.292899408284024, "grad_norm": 0.326171875, "learning_rate": 1.1947205852506005e-05, "loss": 0.0349, "step": 17412 }, { "epoch": 4.293392504930966, "grad_norm": 0.357421875, "learning_rate": 1.1930893988919011e-05, "loss": 0.038, "step": 17414 }, { "epoch": 4.29388560157791, "grad_norm": 0.306640625, "learning_rate": 1.1914592562012806e-05, "loss": 0.0348, "step": 17416 }, { "epoch": 4.294378698224852, "grad_norm": 0.3203125, "learning_rate": 1.1898301573719195e-05, "loss": 0.0379, "step": 17418 }, { "epoch": 4.294871794871795, "grad_norm": 0.3125, "learning_rate": 1.1882021025968704e-05, "loss": 0.0352, "step": 17420 }, { "epoch": 4.295364891518737, "grad_norm": 0.31640625, "learning_rate": 1.1865750920690755e-05, "loss": 0.0365, "step": 17422 }, { "epoch": 4.295857988165681, "grad_norm": 0.34765625, "learning_rate": 1.1849491259813373e-05, "loss": 0.0354, "step": 17424 }, { "epoch": 4.296351084812623, "grad_norm": 0.34375, "learning_rate": 1.1833242045263459e-05, "loss": 0.0383, "step": 17426 }, { "epoch": 4.296844181459566, "grad_norm": 0.3046875, "learning_rate": 1.181700327896661e-05, "loss": 0.0373, "step": 17428 }, { "epoch": 4.297337278106509, "grad_norm": 0.306640625, "learning_rate": 1.1800774962847228e-05, "loss": 0.035, "step": 17430 }, { "epoch": 4.297830374753452, "grad_norm": 0.36328125, "learning_rate": 1.1784557098828453e-05, "loss": 0.0387, "step": 17432 }, { "epoch": 4.298323471400394, "grad_norm": 0.3046875, "learning_rate": 1.1768349688832203e-05, "loss": 0.0383, "step": 17434 }, { "epoch": 4.298816568047338, "grad_norm": 0.31640625, "learning_rate": 1.1752152734779143e-05, "loss": 0.0363, "step": 17436 }, { "epoch": 4.29930966469428, "grad_norm": 0.365234375, "learning_rate": 1.1735966238588691e-05, "loss": 0.0358, "step": 17438 }, { "epoch": 4.299802761341223, "grad_norm": 0.32421875, "learning_rate": 1.1719790202179026e-05, "loss": 0.0371, "step": 17440 }, { "epoch": 4.300295857988166, "grad_norm": 0.306640625, "learning_rate": 1.1703624627467135e-05, "loss": 0.0343, "step": 17442 }, { "epoch": 4.300788954635109, "grad_norm": 0.294921875, "learning_rate": 1.16874695163687e-05, "loss": 0.0374, "step": 17444 }, { "epoch": 4.301282051282051, "grad_norm": 0.35546875, "learning_rate": 1.1671324870798195e-05, "loss": 0.0385, "step": 17446 }, { "epoch": 4.3017751479289945, "grad_norm": 0.3046875, "learning_rate": 1.165519069266886e-05, "loss": 0.0363, "step": 17448 }, { "epoch": 4.302268244575937, "grad_norm": 0.3203125, "learning_rate": 1.1639066983892687e-05, "loss": 0.0345, "step": 17450 }, { "epoch": 4.30276134122288, "grad_norm": 0.3125, "learning_rate": 1.1622953746380415e-05, "loss": 0.0396, "step": 17452 }, { "epoch": 4.303254437869822, "grad_norm": 0.32421875, "learning_rate": 1.1606850982041583e-05, "loss": 0.0367, "step": 17454 }, { "epoch": 4.3037475345167655, "grad_norm": 0.359375, "learning_rate": 1.159075869278442e-05, "loss": 0.0417, "step": 17456 }, { "epoch": 4.304240631163708, "grad_norm": 0.330078125, "learning_rate": 1.1574676880515955e-05, "loss": 0.0383, "step": 17458 }, { "epoch": 4.304733727810651, "grad_norm": 0.3515625, "learning_rate": 1.1558605547141988e-05, "loss": 0.0375, "step": 17460 }, { "epoch": 4.305226824457594, "grad_norm": 0.36328125, "learning_rate": 1.154254469456706e-05, "loss": 0.0373, "step": 17462 }, { "epoch": 4.3057199211045365, "grad_norm": 0.326171875, "learning_rate": 1.1526494324694458e-05, "loss": 0.0398, "step": 17464 }, { "epoch": 4.306213017751479, "grad_norm": 0.326171875, "learning_rate": 1.1510454439426267e-05, "loss": 0.0354, "step": 17466 }, { "epoch": 4.306706114398422, "grad_norm": 0.341796875, "learning_rate": 1.1494425040663292e-05, "loss": 0.0345, "step": 17468 }, { "epoch": 4.307199211045365, "grad_norm": 0.3359375, "learning_rate": 1.1478406130305097e-05, "loss": 0.0379, "step": 17470 }, { "epoch": 4.3076923076923075, "grad_norm": 0.279296875, "learning_rate": 1.1462397710250005e-05, "loss": 0.0365, "step": 17472 }, { "epoch": 4.308185404339251, "grad_norm": 0.29296875, "learning_rate": 1.1446399782395123e-05, "loss": 0.0396, "step": 17474 }, { "epoch": 4.308678500986193, "grad_norm": 0.333984375, "learning_rate": 1.1430412348636277e-05, "loss": 0.0356, "step": 17476 }, { "epoch": 4.309171597633136, "grad_norm": 0.322265625, "learning_rate": 1.1414435410868086e-05, "loss": 0.0331, "step": 17478 }, { "epoch": 4.3096646942800785, "grad_norm": 0.369140625, "learning_rate": 1.1398468970983888e-05, "loss": 0.0391, "step": 17480 }, { "epoch": 4.310157790927022, "grad_norm": 0.326171875, "learning_rate": 1.1382513030875808e-05, "loss": 0.0404, "step": 17482 }, { "epoch": 4.310650887573964, "grad_norm": 0.328125, "learning_rate": 1.136656759243474e-05, "loss": 0.0366, "step": 17484 }, { "epoch": 4.311143984220907, "grad_norm": 0.353515625, "learning_rate": 1.1350632657550253e-05, "loss": 0.0379, "step": 17486 }, { "epoch": 4.31163708086785, "grad_norm": 0.3515625, "learning_rate": 1.1334708228110747e-05, "loss": 0.037, "step": 17488 }, { "epoch": 4.312130177514793, "grad_norm": 0.3203125, "learning_rate": 1.1318794306003355e-05, "loss": 0.0414, "step": 17490 }, { "epoch": 4.312623274161735, "grad_norm": 0.330078125, "learning_rate": 1.1302890893113982e-05, "loss": 0.0333, "step": 17492 }, { "epoch": 4.313116370808679, "grad_norm": 0.314453125, "learning_rate": 1.1286997991327253e-05, "loss": 0.037, "step": 17494 }, { "epoch": 4.313609467455621, "grad_norm": 0.314453125, "learning_rate": 1.1271115602526572e-05, "loss": 0.0328, "step": 17496 }, { "epoch": 4.314102564102564, "grad_norm": 0.3828125, "learning_rate": 1.1255243728594112e-05, "loss": 0.0342, "step": 17498 }, { "epoch": 4.314595660749507, "grad_norm": 0.318359375, "learning_rate": 1.1239382371410712e-05, "loss": 0.0386, "step": 17500 }, { "epoch": 4.31508875739645, "grad_norm": 0.3203125, "learning_rate": 1.1223531532856125e-05, "loss": 0.0385, "step": 17502 }, { "epoch": 4.315581854043392, "grad_norm": 0.330078125, "learning_rate": 1.120769121480869e-05, "loss": 0.0354, "step": 17504 }, { "epoch": 4.316074950690336, "grad_norm": 0.3515625, "learning_rate": 1.11918614191456e-05, "loss": 0.0349, "step": 17506 }, { "epoch": 4.316568047337278, "grad_norm": 0.310546875, "learning_rate": 1.117604214774277e-05, "loss": 0.0381, "step": 17508 }, { "epoch": 4.317061143984221, "grad_norm": 0.337890625, "learning_rate": 1.1160233402474885e-05, "loss": 0.0338, "step": 17510 }, { "epoch": 4.317554240631163, "grad_norm": 0.3046875, "learning_rate": 1.1144435185215374e-05, "loss": 0.0348, "step": 17512 }, { "epoch": 4.318047337278107, "grad_norm": 0.478515625, "learning_rate": 1.1128647497836353e-05, "loss": 0.0368, "step": 17514 }, { "epoch": 4.318540433925049, "grad_norm": 0.322265625, "learning_rate": 1.1112870342208847e-05, "loss": 0.0361, "step": 17516 }, { "epoch": 4.319033530571992, "grad_norm": 0.32421875, "learning_rate": 1.1097103720202461e-05, "loss": 0.0378, "step": 17518 }, { "epoch": 4.319526627218935, "grad_norm": 0.3203125, "learning_rate": 1.1081347633685657e-05, "loss": 0.0372, "step": 17520 }, { "epoch": 4.320019723865878, "grad_norm": 0.373046875, "learning_rate": 1.1065602084525618e-05, "loss": 0.0363, "step": 17522 }, { "epoch": 4.32051282051282, "grad_norm": 0.3203125, "learning_rate": 1.1049867074588282e-05, "loss": 0.0366, "step": 17524 }, { "epoch": 4.321005917159764, "grad_norm": 0.31640625, "learning_rate": 1.1034142605738351e-05, "loss": 0.0349, "step": 17526 }, { "epoch": 4.321499013806706, "grad_norm": 0.330078125, "learning_rate": 1.1018428679839199e-05, "loss": 0.0366, "step": 17528 }, { "epoch": 4.321992110453649, "grad_norm": 0.322265625, "learning_rate": 1.1002725298753092e-05, "loss": 0.035, "step": 17530 }, { "epoch": 4.322485207100591, "grad_norm": 0.349609375, "learning_rate": 1.0987032464340896e-05, "loss": 0.0369, "step": 17532 }, { "epoch": 4.322978303747535, "grad_norm": 0.36328125, "learning_rate": 1.0971350178462391e-05, "loss": 0.0386, "step": 17534 }, { "epoch": 4.323471400394477, "grad_norm": 0.328125, "learning_rate": 1.0955678442975925e-05, "loss": 0.0354, "step": 17536 }, { "epoch": 4.32396449704142, "grad_norm": 0.384765625, "learning_rate": 1.0940017259738733e-05, "loss": 0.0335, "step": 17538 }, { "epoch": 4.324457593688363, "grad_norm": 0.30078125, "learning_rate": 1.0924366630606763e-05, "loss": 0.0332, "step": 17540 }, { "epoch": 4.324950690335306, "grad_norm": 0.361328125, "learning_rate": 1.0908726557434645e-05, "loss": 0.0378, "step": 17542 }, { "epoch": 4.325443786982248, "grad_norm": 0.3515625, "learning_rate": 1.0893097042075883e-05, "loss": 0.0352, "step": 17544 }, { "epoch": 4.325936883629192, "grad_norm": 0.30859375, "learning_rate": 1.0877478086382597e-05, "loss": 0.0377, "step": 17546 }, { "epoch": 4.326429980276134, "grad_norm": 0.453125, "learning_rate": 1.0861869692205794e-05, "loss": 0.0398, "step": 17548 }, { "epoch": 4.326923076923077, "grad_norm": 0.30078125, "learning_rate": 1.0846271861395086e-05, "loss": 0.0342, "step": 17550 }, { "epoch": 4.32741617357002, "grad_norm": 0.3359375, "learning_rate": 1.0830684595798946e-05, "loss": 0.0338, "step": 17552 }, { "epoch": 4.327909270216963, "grad_norm": 0.33984375, "learning_rate": 1.0815107897264554e-05, "loss": 0.0368, "step": 17554 }, { "epoch": 4.328402366863905, "grad_norm": 0.326171875, "learning_rate": 1.0799541767637777e-05, "loss": 0.0401, "step": 17556 }, { "epoch": 4.3288954635108485, "grad_norm": 0.333984375, "learning_rate": 1.0783986208763386e-05, "loss": 0.0371, "step": 17558 }, { "epoch": 4.329388560157791, "grad_norm": 0.35546875, "learning_rate": 1.0768441222484693e-05, "loss": 0.0335, "step": 17560 }, { "epoch": 4.329881656804734, "grad_norm": 0.34765625, "learning_rate": 1.075290681064397e-05, "loss": 0.0386, "step": 17562 }, { "epoch": 4.330374753451677, "grad_norm": 0.32421875, "learning_rate": 1.0737382975082056e-05, "loss": 0.0352, "step": 17564 }, { "epoch": 4.3308678500986195, "grad_norm": 0.337890625, "learning_rate": 1.0721869717638632e-05, "loss": 0.0382, "step": 17566 }, { "epoch": 4.331360946745562, "grad_norm": 0.376953125, "learning_rate": 1.0706367040152143e-05, "loss": 0.0334, "step": 17568 }, { "epoch": 4.331854043392505, "grad_norm": 0.318359375, "learning_rate": 1.0690874944459673e-05, "loss": 0.0417, "step": 17570 }, { "epoch": 4.332347140039448, "grad_norm": 0.392578125, "learning_rate": 1.0675393432397184e-05, "loss": 0.0377, "step": 17572 }, { "epoch": 4.3328402366863905, "grad_norm": 0.30859375, "learning_rate": 1.065992250579928e-05, "loss": 0.0365, "step": 17574 }, { "epoch": 4.333333333333333, "grad_norm": 0.357421875, "learning_rate": 1.0644462166499392e-05, "loss": 0.0389, "step": 17576 }, { "epoch": 4.3338264299802765, "grad_norm": 0.373046875, "learning_rate": 1.062901241632962e-05, "loss": 0.0312, "step": 17578 }, { "epoch": 4.334319526627219, "grad_norm": 0.29296875, "learning_rate": 1.0613573257120868e-05, "loss": 0.032, "step": 17580 }, { "epoch": 4.3348126232741615, "grad_norm": 0.31640625, "learning_rate": 1.0598144690702749e-05, "loss": 0.035, "step": 17582 }, { "epoch": 4.335305719921105, "grad_norm": 0.361328125, "learning_rate": 1.0582726718903634e-05, "loss": 0.0351, "step": 17584 }, { "epoch": 4.335798816568047, "grad_norm": 0.291015625, "learning_rate": 1.0567319343550675e-05, "loss": 0.033, "step": 17586 }, { "epoch": 4.33629191321499, "grad_norm": 0.306640625, "learning_rate": 1.0551922566469663e-05, "loss": 0.0373, "step": 17588 }, { "epoch": 4.3367850098619325, "grad_norm": 0.3125, "learning_rate": 1.0536536389485275e-05, "loss": 0.0347, "step": 17590 }, { "epoch": 4.337278106508876, "grad_norm": 0.337890625, "learning_rate": 1.0521160814420806e-05, "loss": 0.0355, "step": 17592 }, { "epoch": 4.337771203155818, "grad_norm": 0.3125, "learning_rate": 1.0505795843098366e-05, "loss": 0.0335, "step": 17594 }, { "epoch": 4.338264299802761, "grad_norm": 0.29296875, "learning_rate": 1.0490441477338786e-05, "loss": 0.0391, "step": 17596 }, { "epoch": 4.338757396449704, "grad_norm": 0.314453125, "learning_rate": 1.0475097718961658e-05, "loss": 0.0362, "step": 17598 }, { "epoch": 4.339250493096647, "grad_norm": 0.318359375, "learning_rate": 1.0459764569785314e-05, "loss": 0.0341, "step": 17600 }, { "epoch": 4.339743589743589, "grad_norm": 0.310546875, "learning_rate": 1.0444442031626744e-05, "loss": 0.0366, "step": 17602 }, { "epoch": 4.340236686390533, "grad_norm": 0.30859375, "learning_rate": 1.0429130106301854e-05, "loss": 0.0353, "step": 17604 }, { "epoch": 4.340729783037475, "grad_norm": 0.302734375, "learning_rate": 1.0413828795625125e-05, "loss": 0.0339, "step": 17606 }, { "epoch": 4.341222879684418, "grad_norm": 0.357421875, "learning_rate": 1.0398538101409872e-05, "loss": 0.0383, "step": 17608 }, { "epoch": 4.341715976331361, "grad_norm": 0.29296875, "learning_rate": 1.0383258025468124e-05, "loss": 0.0337, "step": 17610 }, { "epoch": 4.342209072978304, "grad_norm": 0.330078125, "learning_rate": 1.0367988569610653e-05, "loss": 0.0371, "step": 17612 }, { "epoch": 4.342702169625246, "grad_norm": 0.302734375, "learning_rate": 1.0352729735646982e-05, "loss": 0.0313, "step": 17614 }, { "epoch": 4.34319526627219, "grad_norm": 0.326171875, "learning_rate": 1.0337481525385362e-05, "loss": 0.0339, "step": 17616 }, { "epoch": 4.343688362919132, "grad_norm": 0.3515625, "learning_rate": 1.0322243940632814e-05, "loss": 0.0407, "step": 17618 }, { "epoch": 4.344181459566075, "grad_norm": 0.345703125, "learning_rate": 1.0307016983195039e-05, "loss": 0.0369, "step": 17620 }, { "epoch": 4.344674556213017, "grad_norm": 0.341796875, "learning_rate": 1.0291800654876527e-05, "loss": 0.0368, "step": 17622 }, { "epoch": 4.345167652859961, "grad_norm": 0.337890625, "learning_rate": 1.0276594957480502e-05, "loss": 0.0377, "step": 17624 }, { "epoch": 4.345660749506903, "grad_norm": 0.35546875, "learning_rate": 1.0261399892808932e-05, "loss": 0.0384, "step": 17626 }, { "epoch": 4.346153846153846, "grad_norm": 0.34375, "learning_rate": 1.02462154626625e-05, "loss": 0.0406, "step": 17628 }, { "epoch": 4.346646942800789, "grad_norm": 0.34765625, "learning_rate": 1.0231041668840669e-05, "loss": 0.0385, "step": 17630 }, { "epoch": 4.347140039447732, "grad_norm": 0.34765625, "learning_rate": 1.0215878513141597e-05, "loss": 0.0363, "step": 17632 }, { "epoch": 4.347633136094674, "grad_norm": 0.330078125, "learning_rate": 1.0200725997362238e-05, "loss": 0.0367, "step": 17634 }, { "epoch": 4.348126232741618, "grad_norm": 0.34375, "learning_rate": 1.0185584123298198e-05, "loss": 0.0338, "step": 17636 }, { "epoch": 4.34861932938856, "grad_norm": 0.37890625, "learning_rate": 1.017045289274391e-05, "loss": 0.0353, "step": 17638 }, { "epoch": 4.349112426035503, "grad_norm": 0.298828125, "learning_rate": 1.0155332307492482e-05, "loss": 0.0349, "step": 17640 }, { "epoch": 4.349605522682446, "grad_norm": 0.3046875, "learning_rate": 1.0140222369335816e-05, "loss": 0.0329, "step": 17642 }, { "epoch": 4.350098619329389, "grad_norm": 0.28515625, "learning_rate": 1.0125123080064502e-05, "loss": 0.0339, "step": 17644 }, { "epoch": 4.350591715976331, "grad_norm": 0.30859375, "learning_rate": 1.0110034441467908e-05, "loss": 0.0357, "step": 17646 }, { "epoch": 4.351084812623274, "grad_norm": 0.31640625, "learning_rate": 1.0094956455334137e-05, "loss": 0.0373, "step": 17648 }, { "epoch": 4.351577909270217, "grad_norm": 0.326171875, "learning_rate": 1.007988912344996e-05, "loss": 0.0362, "step": 17650 }, { "epoch": 4.35207100591716, "grad_norm": 0.333984375, "learning_rate": 1.0064832447600981e-05, "loss": 0.035, "step": 17652 }, { "epoch": 4.352564102564102, "grad_norm": 0.369140625, "learning_rate": 1.0049786429571484e-05, "loss": 0.0364, "step": 17654 }, { "epoch": 4.353057199211046, "grad_norm": 0.328125, "learning_rate": 1.0034751071144522e-05, "loss": 0.0356, "step": 17656 }, { "epoch": 4.353550295857988, "grad_norm": 0.30859375, "learning_rate": 1.0019726374101845e-05, "loss": 0.0354, "step": 17658 }, { "epoch": 4.354043392504931, "grad_norm": 0.29296875, "learning_rate": 1.0004712340223987e-05, "loss": 0.0341, "step": 17660 }, { "epoch": 4.354536489151874, "grad_norm": 0.357421875, "learning_rate": 9.989708971290213e-06, "loss": 0.0339, "step": 17662 }, { "epoch": 4.355029585798817, "grad_norm": 0.3203125, "learning_rate": 9.974716269078432e-06, "loss": 0.0348, "step": 17664 }, { "epoch": 4.355522682445759, "grad_norm": 0.384765625, "learning_rate": 9.959734235365448e-06, "loss": 0.034, "step": 17666 }, { "epoch": 4.3560157790927025, "grad_norm": 0.439453125, "learning_rate": 9.94476287192666e-06, "loss": 0.0353, "step": 17668 }, { "epoch": 4.356508875739645, "grad_norm": 0.28515625, "learning_rate": 9.929802180536274e-06, "loss": 0.0355, "step": 17670 }, { "epoch": 4.357001972386588, "grad_norm": 0.31640625, "learning_rate": 9.914852162967214e-06, "loss": 0.0362, "step": 17672 }, { "epoch": 4.357495069033531, "grad_norm": 0.384765625, "learning_rate": 9.899912820991152e-06, "loss": 0.0395, "step": 17674 }, { "epoch": 4.3579881656804735, "grad_norm": 0.283203125, "learning_rate": 9.884984156378497e-06, "loss": 0.0316, "step": 17676 }, { "epoch": 4.358481262327416, "grad_norm": 0.310546875, "learning_rate": 9.870066170898307e-06, "loss": 0.0372, "step": 17678 }, { "epoch": 4.358974358974359, "grad_norm": 0.3125, "learning_rate": 9.855158866318548e-06, "loss": 0.0365, "step": 17680 }, { "epoch": 4.359467455621302, "grad_norm": 0.296875, "learning_rate": 9.84026224440574e-06, "loss": 0.0304, "step": 17682 }, { "epoch": 4.3599605522682445, "grad_norm": 0.296875, "learning_rate": 9.825376306925238e-06, "loss": 0.0303, "step": 17684 }, { "epoch": 4.360453648915187, "grad_norm": 0.326171875, "learning_rate": 9.81050105564112e-06, "loss": 0.0384, "step": 17686 }, { "epoch": 4.3609467455621305, "grad_norm": 0.337890625, "learning_rate": 9.795636492316184e-06, "loss": 0.0387, "step": 17688 }, { "epoch": 4.361439842209073, "grad_norm": 0.326171875, "learning_rate": 9.780782618711981e-06, "loss": 0.0359, "step": 17690 }, { "epoch": 4.3619329388560155, "grad_norm": 0.28515625, "learning_rate": 9.765939436588712e-06, "loss": 0.0299, "step": 17692 }, { "epoch": 4.362426035502959, "grad_norm": 0.341796875, "learning_rate": 9.751106947705468e-06, "loss": 0.0356, "step": 17694 }, { "epoch": 4.3629191321499015, "grad_norm": 0.287109375, "learning_rate": 9.73628515381989e-06, "loss": 0.0332, "step": 17696 }, { "epoch": 4.363412228796844, "grad_norm": 0.314453125, "learning_rate": 9.72147405668853e-06, "loss": 0.0373, "step": 17698 }, { "epoch": 4.363905325443787, "grad_norm": 0.306640625, "learning_rate": 9.706673658066533e-06, "loss": 0.0364, "step": 17700 }, { "epoch": 4.36439842209073, "grad_norm": 0.333984375, "learning_rate": 9.691883959707838e-06, "loss": 0.0364, "step": 17702 }, { "epoch": 4.3648915187376724, "grad_norm": 0.29296875, "learning_rate": 9.677104963365125e-06, "loss": 0.0359, "step": 17704 }, { "epoch": 4.365384615384615, "grad_norm": 0.38671875, "learning_rate": 9.662336670789729e-06, "loss": 0.0389, "step": 17706 }, { "epoch": 4.365877712031558, "grad_norm": 0.34765625, "learning_rate": 9.647579083731861e-06, "loss": 0.038, "step": 17708 }, { "epoch": 4.366370808678501, "grad_norm": 0.310546875, "learning_rate": 9.6328322039403e-06, "loss": 0.0394, "step": 17710 }, { "epoch": 4.366863905325443, "grad_norm": 0.32421875, "learning_rate": 9.618096033162705e-06, "loss": 0.0363, "step": 17712 }, { "epoch": 4.367357001972387, "grad_norm": 0.333984375, "learning_rate": 9.603370573145343e-06, "loss": 0.036, "step": 17714 }, { "epoch": 4.367850098619329, "grad_norm": 0.302734375, "learning_rate": 9.588655825633275e-06, "loss": 0.0324, "step": 17716 }, { "epoch": 4.368343195266272, "grad_norm": 0.384765625, "learning_rate": 9.573951792370295e-06, "loss": 0.0336, "step": 17718 }, { "epoch": 4.368836291913215, "grad_norm": 0.294921875, "learning_rate": 9.559258475098876e-06, "loss": 0.0341, "step": 17720 }, { "epoch": 4.369329388560158, "grad_norm": 0.3671875, "learning_rate": 9.544575875560314e-06, "loss": 0.0342, "step": 17722 }, { "epoch": 4.3698224852071, "grad_norm": 0.375, "learning_rate": 9.529903995494516e-06, "loss": 0.0333, "step": 17724 }, { "epoch": 4.370315581854044, "grad_norm": 0.302734375, "learning_rate": 9.515242836640248e-06, "loss": 0.0335, "step": 17726 }, { "epoch": 4.370808678500986, "grad_norm": 0.333984375, "learning_rate": 9.500592400734897e-06, "loss": 0.0366, "step": 17728 }, { "epoch": 4.371301775147929, "grad_norm": 0.37109375, "learning_rate": 9.485952689514622e-06, "loss": 0.0383, "step": 17730 }, { "epoch": 4.371794871794872, "grad_norm": 0.296875, "learning_rate": 9.471323704714341e-06, "loss": 0.0366, "step": 17732 }, { "epoch": 4.372287968441815, "grad_norm": 0.3046875, "learning_rate": 9.456705448067604e-06, "loss": 0.0325, "step": 17734 }, { "epoch": 4.372781065088757, "grad_norm": 0.380859375, "learning_rate": 9.442097921306848e-06, "loss": 0.036, "step": 17736 }, { "epoch": 4.3732741617357, "grad_norm": 0.326171875, "learning_rate": 9.427501126163052e-06, "loss": 0.0391, "step": 17738 }, { "epoch": 4.373767258382643, "grad_norm": 0.3125, "learning_rate": 9.412915064366112e-06, "loss": 0.0368, "step": 17740 }, { "epoch": 4.374260355029586, "grad_norm": 0.34375, "learning_rate": 9.39833973764448e-06, "loss": 0.0365, "step": 17742 }, { "epoch": 4.374753451676528, "grad_norm": 0.2890625, "learning_rate": 9.38377514772546e-06, "loss": 0.0323, "step": 17744 }, { "epoch": 4.375246548323472, "grad_norm": 0.296875, "learning_rate": 9.369221296335006e-06, "loss": 0.037, "step": 17746 }, { "epoch": 4.375739644970414, "grad_norm": 0.3046875, "learning_rate": 9.354678185197852e-06, "loss": 0.0371, "step": 17748 }, { "epoch": 4.376232741617357, "grad_norm": 0.3046875, "learning_rate": 9.340145816037449e-06, "loss": 0.0379, "step": 17750 }, { "epoch": 4.3767258382643, "grad_norm": 0.298828125, "learning_rate": 9.32562419057591e-06, "loss": 0.0329, "step": 17752 }, { "epoch": 4.377218934911243, "grad_norm": 0.306640625, "learning_rate": 9.311113310534203e-06, "loss": 0.0319, "step": 17754 }, { "epoch": 4.377712031558185, "grad_norm": 0.2734375, "learning_rate": 9.296613177631897e-06, "loss": 0.0307, "step": 17756 }, { "epoch": 4.378205128205128, "grad_norm": 0.2734375, "learning_rate": 9.28212379358735e-06, "loss": 0.0319, "step": 17758 }, { "epoch": 4.378698224852071, "grad_norm": 0.3203125, "learning_rate": 9.267645160117632e-06, "loss": 0.0346, "step": 17760 }, { "epoch": 4.379191321499014, "grad_norm": 0.3359375, "learning_rate": 9.253177278938551e-06, "loss": 0.0385, "step": 17762 }, { "epoch": 4.379684418145956, "grad_norm": 0.36328125, "learning_rate": 9.238720151764624e-06, "loss": 0.034, "step": 17764 }, { "epoch": 4.3801775147929, "grad_norm": 0.416015625, "learning_rate": 9.224273780309101e-06, "loss": 0.0365, "step": 17766 }, { "epoch": 4.380670611439842, "grad_norm": 0.28515625, "learning_rate": 9.20983816628399e-06, "loss": 0.0373, "step": 17768 }, { "epoch": 4.381163708086785, "grad_norm": 0.302734375, "learning_rate": 9.195413311399948e-06, "loss": 0.0347, "step": 17770 }, { "epoch": 4.381656804733728, "grad_norm": 0.29296875, "learning_rate": 9.180999217366404e-06, "loss": 0.0329, "step": 17772 }, { "epoch": 4.382149901380671, "grad_norm": 0.30078125, "learning_rate": 9.166595885891527e-06, "loss": 0.0342, "step": 17774 }, { "epoch": 4.382642998027613, "grad_norm": 0.326171875, "learning_rate": 9.152203318682184e-06, "loss": 0.0374, "step": 17776 }, { "epoch": 4.383136094674557, "grad_norm": 0.34375, "learning_rate": 9.13782151744399e-06, "loss": 0.0321, "step": 17778 }, { "epoch": 4.383629191321499, "grad_norm": 0.330078125, "learning_rate": 9.123450483881245e-06, "loss": 0.0328, "step": 17780 }, { "epoch": 4.384122287968442, "grad_norm": 0.3671875, "learning_rate": 9.109090219697026e-06, "loss": 0.0357, "step": 17782 }, { "epoch": 4.384615384615385, "grad_norm": 0.37109375, "learning_rate": 9.094740726593065e-06, "loss": 0.0392, "step": 17784 }, { "epoch": 4.3851084812623276, "grad_norm": 0.37890625, "learning_rate": 9.080402006269884e-06, "loss": 0.0352, "step": 17786 }, { "epoch": 4.38560157790927, "grad_norm": 0.34375, "learning_rate": 9.066074060426688e-06, "loss": 0.0344, "step": 17788 }, { "epoch": 4.386094674556213, "grad_norm": 0.275390625, "learning_rate": 9.051756890761431e-06, "loss": 0.0297, "step": 17790 }, { "epoch": 4.386587771203156, "grad_norm": 0.2890625, "learning_rate": 9.037450498970768e-06, "loss": 0.0317, "step": 17792 }, { "epoch": 4.3870808678500985, "grad_norm": 0.314453125, "learning_rate": 9.023154886750096e-06, "loss": 0.0357, "step": 17794 }, { "epoch": 4.387573964497041, "grad_norm": 0.3046875, "learning_rate": 9.008870055793517e-06, "loss": 0.0315, "step": 17796 }, { "epoch": 4.3880670611439845, "grad_norm": 0.326171875, "learning_rate": 8.994596007793887e-06, "loss": 0.0343, "step": 17798 }, { "epoch": 4.388560157790927, "grad_norm": 0.318359375, "learning_rate": 8.98033274444272e-06, "loss": 0.0384, "step": 17800 }, { "epoch": 4.3890532544378695, "grad_norm": 0.35546875, "learning_rate": 8.96608026743031e-06, "loss": 0.0348, "step": 17802 }, { "epoch": 4.389546351084813, "grad_norm": 0.287109375, "learning_rate": 8.951838578445649e-06, "loss": 0.035, "step": 17804 }, { "epoch": 4.3900394477317555, "grad_norm": 0.287109375, "learning_rate": 8.937607679176474e-06, "loss": 0.0331, "step": 17806 }, { "epoch": 4.390532544378698, "grad_norm": 0.31640625, "learning_rate": 8.923387571309216e-06, "loss": 0.0373, "step": 17808 }, { "epoch": 4.391025641025641, "grad_norm": 0.302734375, "learning_rate": 8.909178256529038e-06, "loss": 0.0345, "step": 17810 }, { "epoch": 4.391518737672584, "grad_norm": 0.314453125, "learning_rate": 8.894979736519837e-06, "loss": 0.035, "step": 17812 }, { "epoch": 4.3920118343195265, "grad_norm": 0.3125, "learning_rate": 8.88079201296419e-06, "loss": 0.0349, "step": 17814 }, { "epoch": 4.392504930966469, "grad_norm": 0.330078125, "learning_rate": 8.866615087543429e-06, "loss": 0.0329, "step": 17816 }, { "epoch": 4.392998027613412, "grad_norm": 0.326171875, "learning_rate": 8.852448961937598e-06, "loss": 0.0346, "step": 17818 }, { "epoch": 4.393491124260355, "grad_norm": 0.328125, "learning_rate": 8.83829363782548e-06, "loss": 0.0358, "step": 17820 }, { "epoch": 4.3939842209072975, "grad_norm": 0.279296875, "learning_rate": 8.82414911688454e-06, "loss": 0.0368, "step": 17822 }, { "epoch": 4.394477317554241, "grad_norm": 0.30078125, "learning_rate": 8.810015400790994e-06, "loss": 0.0361, "step": 17824 }, { "epoch": 4.394970414201183, "grad_norm": 0.30859375, "learning_rate": 8.795892491219781e-06, "loss": 0.0333, "step": 17826 }, { "epoch": 4.395463510848126, "grad_norm": 0.2578125, "learning_rate": 8.781780389844484e-06, "loss": 0.032, "step": 17828 }, { "epoch": 4.395956607495069, "grad_norm": 0.310546875, "learning_rate": 8.767679098337545e-06, "loss": 0.0349, "step": 17830 }, { "epoch": 4.396449704142012, "grad_norm": 0.33203125, "learning_rate": 8.75358861837e-06, "loss": 0.0355, "step": 17832 }, { "epoch": 4.396942800788954, "grad_norm": 0.337890625, "learning_rate": 8.739508951611652e-06, "loss": 0.0352, "step": 17834 }, { "epoch": 4.397435897435898, "grad_norm": 0.3046875, "learning_rate": 8.72544009973103e-06, "loss": 0.035, "step": 17836 }, { "epoch": 4.39792899408284, "grad_norm": 0.275390625, "learning_rate": 8.711382064395368e-06, "loss": 0.0305, "step": 17838 }, { "epoch": 4.398422090729783, "grad_norm": 0.349609375, "learning_rate": 8.697334847270633e-06, "loss": 0.0393, "step": 17840 }, { "epoch": 4.398915187376726, "grad_norm": 0.306640625, "learning_rate": 8.68329845002146e-06, "loss": 0.0356, "step": 17842 }, { "epoch": 4.399408284023669, "grad_norm": 0.33203125, "learning_rate": 8.669272874311296e-06, "loss": 0.0338, "step": 17844 }, { "epoch": 4.399901380670611, "grad_norm": 0.310546875, "learning_rate": 8.65525812180219e-06, "loss": 0.0341, "step": 17846 }, { "epoch": 4.400394477317554, "grad_norm": 0.337890625, "learning_rate": 8.641254194155034e-06, "loss": 0.0328, "step": 17848 }, { "epoch": 4.400887573964497, "grad_norm": 0.330078125, "learning_rate": 8.627261093029327e-06, "loss": 0.0365, "step": 17850 }, { "epoch": 4.40138067061144, "grad_norm": 0.3125, "learning_rate": 8.613278820083326e-06, "loss": 0.0386, "step": 17852 }, { "epoch": 4.401873767258382, "grad_norm": 0.33984375, "learning_rate": 8.599307376974053e-06, "loss": 0.0377, "step": 17854 }, { "epoch": 4.402366863905326, "grad_norm": 0.349609375, "learning_rate": 8.585346765357138e-06, "loss": 0.0385, "step": 17856 }, { "epoch": 4.402859960552268, "grad_norm": 0.296875, "learning_rate": 8.57139698688706e-06, "loss": 0.0351, "step": 17858 }, { "epoch": 4.403353057199211, "grad_norm": 0.353515625, "learning_rate": 8.55745804321687e-06, "loss": 0.0354, "step": 17860 }, { "epoch": 4.403846153846154, "grad_norm": 0.302734375, "learning_rate": 8.543529935998484e-06, "loss": 0.0356, "step": 17862 }, { "epoch": 4.404339250493097, "grad_norm": 0.32421875, "learning_rate": 8.529612666882414e-06, "loss": 0.0312, "step": 17864 }, { "epoch": 4.404832347140039, "grad_norm": 0.27734375, "learning_rate": 8.51570623751795e-06, "loss": 0.033, "step": 17866 }, { "epoch": 4.405325443786983, "grad_norm": 0.2734375, "learning_rate": 8.501810649553099e-06, "loss": 0.0312, "step": 17868 }, { "epoch": 4.405818540433925, "grad_norm": 0.267578125, "learning_rate": 8.487925904634509e-06, "loss": 0.033, "step": 17870 }, { "epoch": 4.406311637080868, "grad_norm": 0.33984375, "learning_rate": 8.474052004407673e-06, "loss": 0.0356, "step": 17872 }, { "epoch": 4.40680473372781, "grad_norm": 0.310546875, "learning_rate": 8.460188950516656e-06, "loss": 0.0343, "step": 17874 }, { "epoch": 4.407297830374754, "grad_norm": 0.294921875, "learning_rate": 8.446336744604378e-06, "loss": 0.0318, "step": 17876 }, { "epoch": 4.407790927021696, "grad_norm": 0.330078125, "learning_rate": 8.432495388312345e-06, "loss": 0.033, "step": 17878 }, { "epoch": 4.408284023668639, "grad_norm": 0.29296875, "learning_rate": 8.418664883280858e-06, "loss": 0.0323, "step": 17880 }, { "epoch": 4.408777120315582, "grad_norm": 0.2890625, "learning_rate": 8.404845231148918e-06, "loss": 0.0334, "step": 17882 }, { "epoch": 4.409270216962525, "grad_norm": 0.28125, "learning_rate": 8.391036433554188e-06, "loss": 0.0344, "step": 17884 }, { "epoch": 4.409763313609467, "grad_norm": 0.27734375, "learning_rate": 8.377238492133155e-06, "loss": 0.0322, "step": 17886 }, { "epoch": 4.410256410256411, "grad_norm": 0.310546875, "learning_rate": 8.363451408520883e-06, "loss": 0.0348, "step": 17888 }, { "epoch": 4.410749506903353, "grad_norm": 0.33984375, "learning_rate": 8.34967518435129e-06, "loss": 0.0344, "step": 17890 }, { "epoch": 4.411242603550296, "grad_norm": 0.296875, "learning_rate": 8.335909821256881e-06, "loss": 0.0285, "step": 17892 }, { "epoch": 4.411735700197239, "grad_norm": 0.287109375, "learning_rate": 8.32215532086894e-06, "loss": 0.0303, "step": 17894 }, { "epoch": 4.412228796844182, "grad_norm": 0.2890625, "learning_rate": 8.308411684817452e-06, "loss": 0.0335, "step": 17896 }, { "epoch": 4.412721893491124, "grad_norm": 0.314453125, "learning_rate": 8.294678914731124e-06, "loss": 0.0331, "step": 17898 }, { "epoch": 4.4132149901380675, "grad_norm": 0.38671875, "learning_rate": 8.28095701223739e-06, "loss": 0.0359, "step": 17900 }, { "epoch": 4.41370808678501, "grad_norm": 0.3125, "learning_rate": 8.26724597896229e-06, "loss": 0.0337, "step": 17902 }, { "epoch": 4.414201183431953, "grad_norm": 0.3125, "learning_rate": 8.253545816530762e-06, "loss": 0.0336, "step": 17904 }, { "epoch": 4.414694280078895, "grad_norm": 0.310546875, "learning_rate": 8.23985652656628e-06, "loss": 0.0397, "step": 17906 }, { "epoch": 4.4151873767258385, "grad_norm": 0.310546875, "learning_rate": 8.226178110691118e-06, "loss": 0.0381, "step": 17908 }, { "epoch": 4.415680473372781, "grad_norm": 0.34375, "learning_rate": 8.212510570526255e-06, "loss": 0.0328, "step": 17910 }, { "epoch": 4.4161735700197235, "grad_norm": 0.3046875, "learning_rate": 8.198853907691362e-06, "loss": 0.0379, "step": 17912 }, { "epoch": 4.416666666666667, "grad_norm": 0.328125, "learning_rate": 8.185208123804844e-06, "loss": 0.035, "step": 17914 }, { "epoch": 4.4171597633136095, "grad_norm": 0.283203125, "learning_rate": 8.171573220483763e-06, "loss": 0.0339, "step": 17916 }, { "epoch": 4.417652859960552, "grad_norm": 0.326171875, "learning_rate": 8.157949199343994e-06, "loss": 0.0318, "step": 17918 }, { "epoch": 4.418145956607495, "grad_norm": 0.34375, "learning_rate": 8.144336062000013e-06, "loss": 0.0358, "step": 17920 }, { "epoch": 4.418639053254438, "grad_norm": 0.302734375, "learning_rate": 8.13073381006506e-06, "loss": 0.0314, "step": 17922 }, { "epoch": 4.4191321499013805, "grad_norm": 0.294921875, "learning_rate": 8.117142445151083e-06, "loss": 0.0304, "step": 17924 }, { "epoch": 4.419625246548323, "grad_norm": 0.27734375, "learning_rate": 8.103561968868733e-06, "loss": 0.0319, "step": 17926 }, { "epoch": 4.420118343195266, "grad_norm": 0.345703125, "learning_rate": 8.089992382827371e-06, "loss": 0.029, "step": 17928 }, { "epoch": 4.420611439842209, "grad_norm": 0.28125, "learning_rate": 8.076433688635077e-06, "loss": 0.0339, "step": 17930 }, { "epoch": 4.4211045364891515, "grad_norm": 0.33203125, "learning_rate": 8.06288588789864e-06, "loss": 0.0317, "step": 17932 }, { "epoch": 4.421597633136095, "grad_norm": 0.314453125, "learning_rate": 8.049348982223526e-06, "loss": 0.0345, "step": 17934 }, { "epoch": 4.422090729783037, "grad_norm": 0.31640625, "learning_rate": 8.035822973213947e-06, "loss": 0.0358, "step": 17936 }, { "epoch": 4.42258382642998, "grad_norm": 0.2890625, "learning_rate": 8.0223078624728e-06, "loss": 0.0329, "step": 17938 }, { "epoch": 4.423076923076923, "grad_norm": 0.310546875, "learning_rate": 8.008803651601704e-06, "loss": 0.0344, "step": 17940 }, { "epoch": 4.423570019723866, "grad_norm": 0.30859375, "learning_rate": 7.99531034220099e-06, "loss": 0.0347, "step": 17942 }, { "epoch": 4.424063116370808, "grad_norm": 0.30859375, "learning_rate": 7.981827935869701e-06, "loss": 0.0348, "step": 17944 }, { "epoch": 4.424556213017752, "grad_norm": 0.37109375, "learning_rate": 7.96835643420557e-06, "loss": 0.0329, "step": 17946 }, { "epoch": 4.425049309664694, "grad_norm": 0.267578125, "learning_rate": 7.954895838805032e-06, "loss": 0.0325, "step": 17948 }, { "epoch": 4.425542406311637, "grad_norm": 0.326171875, "learning_rate": 7.941446151263243e-06, "loss": 0.0355, "step": 17950 }, { "epoch": 4.42603550295858, "grad_norm": 0.37109375, "learning_rate": 7.928007373174073e-06, "loss": 0.0376, "step": 17952 }, { "epoch": 4.426528599605523, "grad_norm": 0.28515625, "learning_rate": 7.914579506130104e-06, "loss": 0.0327, "step": 17954 }, { "epoch": 4.427021696252465, "grad_norm": 0.291015625, "learning_rate": 7.901162551722595e-06, "loss": 0.0346, "step": 17956 }, { "epoch": 4.427514792899409, "grad_norm": 0.3046875, "learning_rate": 7.887756511541544e-06, "loss": 0.0323, "step": 17958 }, { "epoch": 4.428007889546351, "grad_norm": 0.28515625, "learning_rate": 7.874361387175633e-06, "loss": 0.0344, "step": 17960 }, { "epoch": 4.428500986193294, "grad_norm": 0.283203125, "learning_rate": 7.86097718021227e-06, "loss": 0.0316, "step": 17962 }, { "epoch": 4.428994082840236, "grad_norm": 0.31640625, "learning_rate": 7.847603892237543e-06, "loss": 0.0317, "step": 17964 }, { "epoch": 4.42948717948718, "grad_norm": 0.287109375, "learning_rate": 7.834241524836262e-06, "loss": 0.0342, "step": 17966 }, { "epoch": 4.429980276134122, "grad_norm": 0.326171875, "learning_rate": 7.820890079591946e-06, "loss": 0.0323, "step": 17968 }, { "epoch": 4.430473372781065, "grad_norm": 0.322265625, "learning_rate": 7.807549558086824e-06, "loss": 0.0306, "step": 17970 }, { "epoch": 4.430966469428008, "grad_norm": 0.294921875, "learning_rate": 7.794219961901817e-06, "loss": 0.0324, "step": 17972 }, { "epoch": 4.431459566074951, "grad_norm": 0.330078125, "learning_rate": 7.780901292616549e-06, "loss": 0.0341, "step": 17974 }, { "epoch": 4.431952662721893, "grad_norm": 0.267578125, "learning_rate": 7.767593551809382e-06, "loss": 0.0295, "step": 17976 }, { "epoch": 4.432445759368837, "grad_norm": 0.3515625, "learning_rate": 7.75429674105731e-06, "loss": 0.0355, "step": 17978 }, { "epoch": 4.432938856015779, "grad_norm": 0.35546875, "learning_rate": 7.741010861936138e-06, "loss": 0.032, "step": 17980 }, { "epoch": 4.433431952662722, "grad_norm": 0.302734375, "learning_rate": 7.727735916020274e-06, "loss": 0.0341, "step": 17982 }, { "epoch": 4.433925049309664, "grad_norm": 0.34765625, "learning_rate": 7.714471904882881e-06, "loss": 0.0316, "step": 17984 }, { "epoch": 4.434418145956608, "grad_norm": 0.369140625, "learning_rate": 7.701218830095825e-06, "loss": 0.0357, "step": 17986 }, { "epoch": 4.43491124260355, "grad_norm": 0.337890625, "learning_rate": 7.68797669322967e-06, "loss": 0.0369, "step": 17988 }, { "epoch": 4.435404339250493, "grad_norm": 0.294921875, "learning_rate": 7.674745495853686e-06, "loss": 0.0336, "step": 17990 }, { "epoch": 4.435897435897436, "grad_norm": 0.333984375, "learning_rate": 7.661525239535828e-06, "loss": 0.036, "step": 17992 }, { "epoch": 4.436390532544379, "grad_norm": 0.328125, "learning_rate": 7.648315925842809e-06, "loss": 0.0361, "step": 17994 }, { "epoch": 4.436883629191321, "grad_norm": 0.369140625, "learning_rate": 7.635117556339955e-06, "loss": 0.0346, "step": 17996 }, { "epoch": 4.437376725838265, "grad_norm": 0.27734375, "learning_rate": 7.621930132591371e-06, "loss": 0.0313, "step": 17998 }, { "epoch": 4.437869822485207, "grad_norm": 0.34375, "learning_rate": 7.60875365615984e-06, "loss": 0.0304, "step": 18000 }, { "epoch": 4.43836291913215, "grad_norm": 0.3125, "learning_rate": 7.595588128606846e-06, "loss": 0.0343, "step": 18002 }, { "epoch": 4.438856015779093, "grad_norm": 0.2890625, "learning_rate": 7.582433551492574e-06, "loss": 0.0352, "step": 18004 }, { "epoch": 4.439349112426036, "grad_norm": 0.30859375, "learning_rate": 7.569289926375933e-06, "loss": 0.0321, "step": 18006 }, { "epoch": 4.439842209072978, "grad_norm": 0.31640625, "learning_rate": 7.556157254814511e-06, "loss": 0.0325, "step": 18008 }, { "epoch": 4.4403353057199215, "grad_norm": 0.298828125, "learning_rate": 7.5430355383645625e-06, "loss": 0.0324, "step": 18010 }, { "epoch": 4.440828402366864, "grad_norm": 0.30859375, "learning_rate": 7.529924778581143e-06, "loss": 0.0332, "step": 18012 }, { "epoch": 4.441321499013807, "grad_norm": 0.330078125, "learning_rate": 7.516824977017911e-06, "loss": 0.0309, "step": 18014 }, { "epoch": 4.441814595660749, "grad_norm": 0.3515625, "learning_rate": 7.5037361352272904e-06, "loss": 0.0324, "step": 18016 }, { "epoch": 4.4423076923076925, "grad_norm": 0.30078125, "learning_rate": 7.490658254760363e-06, "loss": 0.0345, "step": 18018 }, { "epoch": 4.442800788954635, "grad_norm": 0.30859375, "learning_rate": 7.477591337166934e-06, "loss": 0.031, "step": 18020 }, { "epoch": 4.443293885601578, "grad_norm": 0.28515625, "learning_rate": 7.46453538399553e-06, "loss": 0.032, "step": 18022 }, { "epoch": 4.443786982248521, "grad_norm": 0.294921875, "learning_rate": 7.451490396793304e-06, "loss": 0.0307, "step": 18024 }, { "epoch": 4.4442800788954635, "grad_norm": 0.326171875, "learning_rate": 7.4384563771062176e-06, "loss": 0.034, "step": 18026 }, { "epoch": 4.444773175542406, "grad_norm": 0.287109375, "learning_rate": 7.425433326478837e-06, "loss": 0.0313, "step": 18028 }, { "epoch": 4.445266272189349, "grad_norm": 0.345703125, "learning_rate": 7.412421246454482e-06, "loss": 0.0349, "step": 18030 }, { "epoch": 4.445759368836292, "grad_norm": 0.310546875, "learning_rate": 7.399420138575142e-06, "loss": 0.0341, "step": 18032 }, { "epoch": 4.4462524654832345, "grad_norm": 0.32421875, "learning_rate": 7.386430004381539e-06, "loss": 0.0325, "step": 18034 }, { "epoch": 4.446745562130178, "grad_norm": 0.318359375, "learning_rate": 7.373450845413077e-06, "loss": 0.0304, "step": 18036 }, { "epoch": 4.44723865877712, "grad_norm": 0.298828125, "learning_rate": 7.360482663207824e-06, "loss": 0.0383, "step": 18038 }, { "epoch": 4.447731755424063, "grad_norm": 0.267578125, "learning_rate": 7.34752545930264e-06, "loss": 0.0323, "step": 18040 }, { "epoch": 4.4482248520710055, "grad_norm": 0.291015625, "learning_rate": 7.334579235232975e-06, "loss": 0.0311, "step": 18042 }, { "epoch": 4.448717948717949, "grad_norm": 0.306640625, "learning_rate": 7.321643992533056e-06, "loss": 0.0304, "step": 18044 }, { "epoch": 4.449211045364891, "grad_norm": 0.3515625, "learning_rate": 7.308719732735769e-06, "loss": 0.0323, "step": 18046 }, { "epoch": 4.449704142011834, "grad_norm": 0.33203125, "learning_rate": 7.295806457372711e-06, "loss": 0.0337, "step": 18048 }, { "epoch": 4.450197238658777, "grad_norm": 0.34765625, "learning_rate": 7.282904167974203e-06, "loss": 0.0332, "step": 18050 }, { "epoch": 4.45069033530572, "grad_norm": 0.318359375, "learning_rate": 7.270012866069176e-06, "loss": 0.0362, "step": 18052 }, { "epoch": 4.451183431952662, "grad_norm": 0.28515625, "learning_rate": 7.257132553185408e-06, "loss": 0.0289, "step": 18054 }, { "epoch": 4.451676528599606, "grad_norm": 0.294921875, "learning_rate": 7.244263230849202e-06, "loss": 0.0334, "step": 18056 }, { "epoch": 4.452169625246548, "grad_norm": 0.318359375, "learning_rate": 7.231404900585714e-06, "loss": 0.0354, "step": 18058 }, { "epoch": 4.452662721893491, "grad_norm": 0.279296875, "learning_rate": 7.218557563918693e-06, "loss": 0.035, "step": 18060 }, { "epoch": 4.453155818540434, "grad_norm": 0.337890625, "learning_rate": 7.2057212223706205e-06, "loss": 0.0305, "step": 18062 }, { "epoch": 4.453648915187377, "grad_norm": 0.291015625, "learning_rate": 7.192895877462691e-06, "loss": 0.0336, "step": 18064 }, { "epoch": 4.454142011834319, "grad_norm": 0.265625, "learning_rate": 7.180081530714744e-06, "loss": 0.034, "step": 18066 }, { "epoch": 4.454635108481263, "grad_norm": 0.349609375, "learning_rate": 7.16727818364541e-06, "loss": 0.038, "step": 18068 }, { "epoch": 4.455128205128205, "grad_norm": 0.345703125, "learning_rate": 7.154485837771885e-06, "loss": 0.0339, "step": 18070 }, { "epoch": 4.455621301775148, "grad_norm": 0.322265625, "learning_rate": 7.141704494610202e-06, "loss": 0.0321, "step": 18072 }, { "epoch": 4.45611439842209, "grad_norm": 0.298828125, "learning_rate": 7.128934155674971e-06, "loss": 0.0304, "step": 18074 }, { "epoch": 4.456607495069034, "grad_norm": 0.34765625, "learning_rate": 7.1161748224795825e-06, "loss": 0.033, "step": 18076 }, { "epoch": 4.457100591715976, "grad_norm": 0.390625, "learning_rate": 7.103426496536081e-06, "loss": 0.0342, "step": 18078 }, { "epoch": 4.457593688362919, "grad_norm": 0.28515625, "learning_rate": 7.09068917935517e-06, "loss": 0.0349, "step": 18080 }, { "epoch": 4.458086785009862, "grad_norm": 0.26953125, "learning_rate": 7.077962872446375e-06, "loss": 0.0341, "step": 18082 }, { "epoch": 4.458579881656805, "grad_norm": 0.3046875, "learning_rate": 7.0652475773177464e-06, "loss": 0.0325, "step": 18084 }, { "epoch": 4.459072978303747, "grad_norm": 0.283203125, "learning_rate": 7.052543295476199e-06, "loss": 0.0291, "step": 18086 }, { "epoch": 4.459566074950691, "grad_norm": 0.328125, "learning_rate": 7.039850028427208e-06, "loss": 0.0321, "step": 18088 }, { "epoch": 4.460059171597633, "grad_norm": 0.287109375, "learning_rate": 7.027167777675015e-06, "loss": 0.0311, "step": 18090 }, { "epoch": 4.460552268244576, "grad_norm": 0.2890625, "learning_rate": 7.014496544722526e-06, "loss": 0.0338, "step": 18092 }, { "epoch": 4.461045364891519, "grad_norm": 0.28515625, "learning_rate": 7.001836331071365e-06, "loss": 0.0313, "step": 18094 }, { "epoch": 4.461538461538462, "grad_norm": 0.30859375, "learning_rate": 6.989187138221853e-06, "loss": 0.0309, "step": 18096 }, { "epoch": 4.462031558185404, "grad_norm": 0.298828125, "learning_rate": 6.976548967672936e-06, "loss": 0.033, "step": 18098 }, { "epoch": 4.462524654832347, "grad_norm": 0.298828125, "learning_rate": 6.96392182092237e-06, "loss": 0.0349, "step": 18100 }, { "epoch": 4.46301775147929, "grad_norm": 0.333984375, "learning_rate": 6.951305699466504e-06, "loss": 0.031, "step": 18102 }, { "epoch": 4.463510848126233, "grad_norm": 0.29296875, "learning_rate": 6.93870060480043e-06, "loss": 0.0385, "step": 18104 }, { "epoch": 4.464003944773175, "grad_norm": 0.28125, "learning_rate": 6.92610653841792e-06, "loss": 0.0332, "step": 18106 }, { "epoch": 4.464497041420119, "grad_norm": 0.345703125, "learning_rate": 6.913523501811448e-06, "loss": 0.0348, "step": 18108 }, { "epoch": 4.464990138067061, "grad_norm": 0.33203125, "learning_rate": 6.900951496472163e-06, "loss": 0.0326, "step": 18110 }, { "epoch": 4.465483234714004, "grad_norm": 0.30859375, "learning_rate": 6.888390523889931e-06, "loss": 0.0343, "step": 18112 }, { "epoch": 4.465976331360947, "grad_norm": 0.283203125, "learning_rate": 6.875840585553306e-06, "loss": 0.0322, "step": 18114 }, { "epoch": 4.46646942800789, "grad_norm": 0.34765625, "learning_rate": 6.863301682949486e-06, "loss": 0.033, "step": 18116 }, { "epoch": 4.466962524654832, "grad_norm": 0.28125, "learning_rate": 6.850773817564426e-06, "loss": 0.0335, "step": 18118 }, { "epoch": 4.4674556213017755, "grad_norm": 0.34375, "learning_rate": 6.838256990882752e-06, "loss": 0.0352, "step": 18120 }, { "epoch": 4.467948717948718, "grad_norm": 0.349609375, "learning_rate": 6.8257512043877755e-06, "loss": 0.0362, "step": 18122 }, { "epoch": 4.468441814595661, "grad_norm": 0.349609375, "learning_rate": 6.813256459561501e-06, "loss": 0.0365, "step": 18124 }, { "epoch": 4.468934911242604, "grad_norm": 0.318359375, "learning_rate": 6.800772757884621e-06, "loss": 0.0317, "step": 18126 }, { "epoch": 4.4694280078895465, "grad_norm": 0.30859375, "learning_rate": 6.788300100836542e-06, "loss": 0.0305, "step": 18128 }, { "epoch": 4.469921104536489, "grad_norm": 0.275390625, "learning_rate": 6.775838489895314e-06, "loss": 0.0353, "step": 18130 }, { "epoch": 4.470414201183432, "grad_norm": 0.29296875, "learning_rate": 6.763387926537723e-06, "loss": 0.0345, "step": 18132 }, { "epoch": 4.470907297830375, "grad_norm": 0.275390625, "learning_rate": 6.750948412239222e-06, "loss": 0.0332, "step": 18134 }, { "epoch": 4.4714003944773175, "grad_norm": 0.3046875, "learning_rate": 6.738519948473976e-06, "loss": 0.0326, "step": 18136 }, { "epoch": 4.47189349112426, "grad_norm": 0.2734375, "learning_rate": 6.726102536714829e-06, "loss": 0.031, "step": 18138 }, { "epoch": 4.4723865877712035, "grad_norm": 0.349609375, "learning_rate": 6.713696178433293e-06, "loss": 0.0318, "step": 18140 }, { "epoch": 4.472879684418146, "grad_norm": 0.2890625, "learning_rate": 6.701300875099614e-06, "loss": 0.0356, "step": 18142 }, { "epoch": 4.4733727810650885, "grad_norm": 0.318359375, "learning_rate": 6.688916628182706e-06, "loss": 0.0313, "step": 18144 }, { "epoch": 4.473865877712032, "grad_norm": 0.296875, "learning_rate": 6.67654343915014e-06, "loss": 0.0323, "step": 18146 }, { "epoch": 4.4743589743589745, "grad_norm": 0.271484375, "learning_rate": 6.664181309468232e-06, "loss": 0.0315, "step": 18148 }, { "epoch": 4.474852071005917, "grad_norm": 0.30859375, "learning_rate": 6.6518302406019524e-06, "loss": 0.0357, "step": 18150 }, { "epoch": 4.4753451676528595, "grad_norm": 0.322265625, "learning_rate": 6.639490234014989e-06, "loss": 0.0339, "step": 18152 }, { "epoch": 4.475838264299803, "grad_norm": 0.3671875, "learning_rate": 6.627161291169692e-06, "loss": 0.0312, "step": 18154 }, { "epoch": 4.476331360946745, "grad_norm": 0.349609375, "learning_rate": 6.614843413527106e-06, "loss": 0.0322, "step": 18156 }, { "epoch": 4.476824457593688, "grad_norm": 0.333984375, "learning_rate": 6.602536602546983e-06, "loss": 0.0337, "step": 18158 }, { "epoch": 4.477317554240631, "grad_norm": 0.30078125, "learning_rate": 6.590240859687702e-06, "loss": 0.0317, "step": 18160 }, { "epoch": 4.477810650887574, "grad_norm": 0.26171875, "learning_rate": 6.5779561864064535e-06, "loss": 0.0313, "step": 18162 }, { "epoch": 4.478303747534516, "grad_norm": 0.2890625, "learning_rate": 6.565682584158983e-06, "loss": 0.0349, "step": 18164 }, { "epoch": 4.47879684418146, "grad_norm": 0.306640625, "learning_rate": 6.553420054399794e-06, "loss": 0.0322, "step": 18166 }, { "epoch": 4.479289940828402, "grad_norm": 0.341796875, "learning_rate": 6.54116859858207e-06, "loss": 0.0305, "step": 18168 }, { "epoch": 4.479783037475345, "grad_norm": 0.34375, "learning_rate": 6.528928218157671e-06, "loss": 0.0339, "step": 18170 }, { "epoch": 4.480276134122288, "grad_norm": 0.30859375, "learning_rate": 6.516698914577168e-06, "loss": 0.0347, "step": 18172 }, { "epoch": 4.480769230769231, "grad_norm": 0.310546875, "learning_rate": 6.5044806892897605e-06, "loss": 0.0352, "step": 18174 }, { "epoch": 4.481262327416173, "grad_norm": 0.3203125, "learning_rate": 6.492273543743432e-06, "loss": 0.0355, "step": 18176 }, { "epoch": 4.481755424063117, "grad_norm": 0.29296875, "learning_rate": 6.48007747938475e-06, "loss": 0.0335, "step": 18178 }, { "epoch": 4.482248520710059, "grad_norm": 0.28515625, "learning_rate": 6.467892497659045e-06, "loss": 0.0335, "step": 18180 }, { "epoch": 4.482741617357002, "grad_norm": 0.30859375, "learning_rate": 6.455718600010285e-06, "loss": 0.0307, "step": 18182 }, { "epoch": 4.483234714003945, "grad_norm": 0.34375, "learning_rate": 6.443555787881161e-06, "loss": 0.0346, "step": 18184 }, { "epoch": 4.483727810650888, "grad_norm": 0.294921875, "learning_rate": 6.431404062713042e-06, "loss": 0.0321, "step": 18186 }, { "epoch": 4.48422090729783, "grad_norm": 0.28515625, "learning_rate": 6.419263425945932e-06, "loss": 0.0323, "step": 18188 }, { "epoch": 4.484714003944773, "grad_norm": 0.337890625, "learning_rate": 6.407133879018623e-06, "loss": 0.0339, "step": 18190 }, { "epoch": 4.485207100591716, "grad_norm": 0.302734375, "learning_rate": 6.3950154233684775e-06, "loss": 0.0325, "step": 18192 }, { "epoch": 4.485700197238659, "grad_norm": 0.291015625, "learning_rate": 6.3829080604316585e-06, "loss": 0.0334, "step": 18194 }, { "epoch": 4.486193293885601, "grad_norm": 0.2890625, "learning_rate": 6.370811791642906e-06, "loss": 0.0286, "step": 18196 }, { "epoch": 4.486686390532545, "grad_norm": 0.330078125, "learning_rate": 6.3587266184357176e-06, "loss": 0.0335, "step": 18198 }, { "epoch": 4.487179487179487, "grad_norm": 0.26953125, "learning_rate": 6.34665254224226e-06, "loss": 0.0333, "step": 18200 }, { "epoch": 4.48767258382643, "grad_norm": 0.330078125, "learning_rate": 6.334589564493343e-06, "loss": 0.0344, "step": 18202 }, { "epoch": 4.488165680473373, "grad_norm": 0.294921875, "learning_rate": 6.322537686618557e-06, "loss": 0.0314, "step": 18204 }, { "epoch": 4.488658777120316, "grad_norm": 0.32421875, "learning_rate": 6.310496910046049e-06, "loss": 0.0339, "step": 18206 }, { "epoch": 4.489151873767258, "grad_norm": 0.30859375, "learning_rate": 6.298467236202787e-06, "loss": 0.0326, "step": 18208 }, { "epoch": 4.489644970414201, "grad_norm": 0.328125, "learning_rate": 6.2864486665142996e-06, "loss": 0.0374, "step": 18210 }, { "epoch": 4.490138067061144, "grad_norm": 0.27734375, "learning_rate": 6.274441202404879e-06, "loss": 0.0305, "step": 18212 }, { "epoch": 4.490631163708087, "grad_norm": 0.2890625, "learning_rate": 6.2624448452974885e-06, "loss": 0.0314, "step": 18214 }, { "epoch": 4.491124260355029, "grad_norm": 0.267578125, "learning_rate": 6.250459596613711e-06, "loss": 0.0296, "step": 18216 }, { "epoch": 4.491617357001973, "grad_norm": 0.287109375, "learning_rate": 6.238485457773935e-06, "loss": 0.0315, "step": 18218 }, { "epoch": 4.492110453648915, "grad_norm": 0.2890625, "learning_rate": 6.226522430197102e-06, "loss": 0.0309, "step": 18220 }, { "epoch": 4.492603550295858, "grad_norm": 0.302734375, "learning_rate": 6.214570515300944e-06, "loss": 0.0318, "step": 18222 }, { "epoch": 4.493096646942801, "grad_norm": 0.296875, "learning_rate": 6.202629714501807e-06, "loss": 0.031, "step": 18224 }, { "epoch": 4.493589743589744, "grad_norm": 0.294921875, "learning_rate": 6.1907000292147464e-06, "loss": 0.0332, "step": 18226 }, { "epoch": 4.494082840236686, "grad_norm": 0.271484375, "learning_rate": 6.178781460853511e-06, "loss": 0.0269, "step": 18228 }, { "epoch": 4.4945759368836296, "grad_norm": 0.271484375, "learning_rate": 6.16687401083047e-06, "loss": 0.0311, "step": 18230 }, { "epoch": 4.495069033530572, "grad_norm": 0.27734375, "learning_rate": 6.154977680556783e-06, "loss": 0.0312, "step": 18232 }, { "epoch": 4.495562130177515, "grad_norm": 0.310546875, "learning_rate": 6.14309247144218e-06, "loss": 0.0371, "step": 18234 }, { "epoch": 4.496055226824458, "grad_norm": 0.291015625, "learning_rate": 6.131218384895176e-06, "loss": 0.0334, "step": 18236 }, { "epoch": 4.4965483234714005, "grad_norm": 0.314453125, "learning_rate": 6.11935542232287e-06, "loss": 0.0347, "step": 18238 }, { "epoch": 4.497041420118343, "grad_norm": 0.373046875, "learning_rate": 6.107503585131113e-06, "loss": 0.0322, "step": 18240 }, { "epoch": 4.497534516765286, "grad_norm": 0.287109375, "learning_rate": 6.095662874724417e-06, "loss": 0.0343, "step": 18242 }, { "epoch": 4.498027613412229, "grad_norm": 0.318359375, "learning_rate": 6.083833292505958e-06, "loss": 0.0337, "step": 18244 }, { "epoch": 4.4985207100591715, "grad_norm": 0.310546875, "learning_rate": 6.072014839877627e-06, "loss": 0.0326, "step": 18246 }, { "epoch": 4.499013806706114, "grad_norm": 0.322265625, "learning_rate": 6.060207518239946e-06, "loss": 0.0344, "step": 18248 }, { "epoch": 4.4995069033530575, "grad_norm": 0.361328125, "learning_rate": 6.048411328992187e-06, "loss": 0.0406, "step": 18250 }, { "epoch": 4.5, "grad_norm": 0.333984375, "learning_rate": 6.036626273532231e-06, "loss": 0.0387, "step": 18252 }, { "epoch": 4.5004930966469425, "grad_norm": 0.275390625, "learning_rate": 6.024852353256682e-06, "loss": 0.0327, "step": 18254 }, { "epoch": 4.500986193293886, "grad_norm": 0.263671875, "learning_rate": 6.013089569560825e-06, "loss": 0.0323, "step": 18256 }, { "epoch": 4.5014792899408285, "grad_norm": 0.333984375, "learning_rate": 6.001337923838602e-06, "loss": 0.0344, "step": 18258 }, { "epoch": 4.501972386587771, "grad_norm": 0.30859375, "learning_rate": 5.989597417482684e-06, "loss": 0.0375, "step": 18260 }, { "epoch": 4.5024654832347135, "grad_norm": 0.302734375, "learning_rate": 5.977868051884317e-06, "loss": 0.0321, "step": 18262 }, { "epoch": 4.502958579881657, "grad_norm": 0.267578125, "learning_rate": 5.966149828433576e-06, "loss": 0.033, "step": 18264 }, { "epoch": 4.5034516765285995, "grad_norm": 0.322265625, "learning_rate": 5.954442748519073e-06, "loss": 0.037, "step": 18266 }, { "epoch": 4.503944773175542, "grad_norm": 0.3046875, "learning_rate": 5.9427468135282e-06, "loss": 0.0316, "step": 18268 }, { "epoch": 4.504437869822485, "grad_norm": 0.30078125, "learning_rate": 5.931062024846967e-06, "loss": 0.0326, "step": 18270 }, { "epoch": 4.504930966469428, "grad_norm": 0.296875, "learning_rate": 5.919388383860103e-06, "loss": 0.0323, "step": 18272 }, { "epoch": 4.5054240631163704, "grad_norm": 0.3046875, "learning_rate": 5.907725891950999e-06, "loss": 0.0318, "step": 18274 }, { "epoch": 4.505917159763314, "grad_norm": 0.267578125, "learning_rate": 5.8960745505017175e-06, "loss": 0.0342, "step": 18276 }, { "epoch": 4.506410256410256, "grad_norm": 0.37890625, "learning_rate": 5.884434360893021e-06, "loss": 0.0332, "step": 18278 }, { "epoch": 4.506903353057199, "grad_norm": 0.36328125, "learning_rate": 5.872805324504316e-06, "loss": 0.0336, "step": 18280 }, { "epoch": 4.507396449704142, "grad_norm": 0.26953125, "learning_rate": 5.8611874427137225e-06, "loss": 0.0347, "step": 18282 }, { "epoch": 4.507889546351085, "grad_norm": 0.29296875, "learning_rate": 5.849580716898017e-06, "loss": 0.0307, "step": 18284 }, { "epoch": 4.508382642998027, "grad_norm": 0.322265625, "learning_rate": 5.8379851484326566e-06, "loss": 0.0322, "step": 18286 }, { "epoch": 4.508875739644971, "grad_norm": 0.32421875, "learning_rate": 5.826400738691806e-06, "loss": 0.0335, "step": 18288 }, { "epoch": 4.509368836291913, "grad_norm": 0.296875, "learning_rate": 5.814827489048258e-06, "loss": 0.0324, "step": 18290 }, { "epoch": 4.509861932938856, "grad_norm": 0.3125, "learning_rate": 5.803265400873514e-06, "loss": 0.0284, "step": 18292 }, { "epoch": 4.510355029585799, "grad_norm": 0.2890625, "learning_rate": 5.7917144755377576e-06, "loss": 0.0316, "step": 18294 }, { "epoch": 4.510848126232742, "grad_norm": 0.4140625, "learning_rate": 5.780174714409825e-06, "loss": 0.0343, "step": 18296 }, { "epoch": 4.511341222879684, "grad_norm": 0.2890625, "learning_rate": 5.7686461188572345e-06, "loss": 0.0326, "step": 18298 }, { "epoch": 4.511834319526627, "grad_norm": 0.35546875, "learning_rate": 5.757128690246205e-06, "loss": 0.0347, "step": 18300 }, { "epoch": 4.51232741617357, "grad_norm": 0.306640625, "learning_rate": 5.7456224299415996e-06, "loss": 0.0325, "step": 18302 }, { "epoch": 4.512820512820513, "grad_norm": 0.314453125, "learning_rate": 5.734127339306994e-06, "loss": 0.0333, "step": 18304 }, { "epoch": 4.513313609467455, "grad_norm": 0.43359375, "learning_rate": 5.7226434197046206e-06, "loss": 0.0355, "step": 18306 }, { "epoch": 4.513806706114399, "grad_norm": 0.365234375, "learning_rate": 5.71117067249538e-06, "loss": 0.0317, "step": 18308 }, { "epoch": 4.514299802761341, "grad_norm": 0.263671875, "learning_rate": 5.6997090990388525e-06, "loss": 0.0308, "step": 18310 }, { "epoch": 4.514792899408284, "grad_norm": 0.291015625, "learning_rate": 5.688258700693294e-06, "loss": 0.0323, "step": 18312 }, { "epoch": 4.515285996055227, "grad_norm": 0.287109375, "learning_rate": 5.676819478815654e-06, "loss": 0.0311, "step": 18314 }, { "epoch": 4.51577909270217, "grad_norm": 0.298828125, "learning_rate": 5.665391434761535e-06, "loss": 0.0316, "step": 18316 }, { "epoch": 4.516272189349112, "grad_norm": 0.28515625, "learning_rate": 5.653974569885223e-06, "loss": 0.0315, "step": 18318 }, { "epoch": 4.516765285996055, "grad_norm": 0.30078125, "learning_rate": 5.642568885539701e-06, "loss": 0.031, "step": 18320 }, { "epoch": 4.517258382642998, "grad_norm": 0.32421875, "learning_rate": 5.631174383076587e-06, "loss": 0.0313, "step": 18322 }, { "epoch": 4.517751479289941, "grad_norm": 0.3046875, "learning_rate": 5.61979106384618e-06, "loss": 0.0313, "step": 18324 }, { "epoch": 4.518244575936883, "grad_norm": 0.283203125, "learning_rate": 5.608418929197512e-06, "loss": 0.0351, "step": 18326 }, { "epoch": 4.518737672583827, "grad_norm": 0.291015625, "learning_rate": 5.597057980478193e-06, "loss": 0.0317, "step": 18328 }, { "epoch": 4.519230769230769, "grad_norm": 0.328125, "learning_rate": 5.585708219034591e-06, "loss": 0.0334, "step": 18330 }, { "epoch": 4.519723865877712, "grad_norm": 0.31640625, "learning_rate": 5.574369646211697e-06, "loss": 0.0341, "step": 18332 }, { "epoch": 4.520216962524655, "grad_norm": 0.275390625, "learning_rate": 5.563042263353213e-06, "loss": 0.0334, "step": 18334 }, { "epoch": 4.520710059171598, "grad_norm": 0.283203125, "learning_rate": 5.551726071801511e-06, "loss": 0.0316, "step": 18336 }, { "epoch": 4.52120315581854, "grad_norm": 0.296875, "learning_rate": 5.5404210728975616e-06, "loss": 0.0318, "step": 18338 }, { "epoch": 4.521696252465484, "grad_norm": 0.275390625, "learning_rate": 5.529127267981138e-06, "loss": 0.0312, "step": 18340 }, { "epoch": 4.522189349112426, "grad_norm": 0.32421875, "learning_rate": 5.51784465839058e-06, "loss": 0.0307, "step": 18342 }, { "epoch": 4.522682445759369, "grad_norm": 0.30078125, "learning_rate": 5.506573245462943e-06, "loss": 0.0333, "step": 18344 }, { "epoch": 4.523175542406312, "grad_norm": 0.28515625, "learning_rate": 5.495313030533966e-06, "loss": 0.0311, "step": 18346 }, { "epoch": 4.523668639053255, "grad_norm": 0.337890625, "learning_rate": 5.484064014938051e-06, "loss": 0.0344, "step": 18348 }, { "epoch": 4.524161735700197, "grad_norm": 0.306640625, "learning_rate": 5.472826200008263e-06, "loss": 0.0354, "step": 18350 }, { "epoch": 4.5246548323471405, "grad_norm": 0.3671875, "learning_rate": 5.461599587076316e-06, "loss": 0.0324, "step": 18352 }, { "epoch": 4.525147928994083, "grad_norm": 0.265625, "learning_rate": 5.450384177472678e-06, "loss": 0.0314, "step": 18354 }, { "epoch": 4.5256410256410255, "grad_norm": 0.298828125, "learning_rate": 5.439179972526387e-06, "loss": 0.033, "step": 18356 }, { "epoch": 4.526134122287968, "grad_norm": 0.328125, "learning_rate": 5.427986973565258e-06, "loss": 0.0326, "step": 18358 }, { "epoch": 4.5266272189349115, "grad_norm": 0.306640625, "learning_rate": 5.416805181915685e-06, "loss": 0.0339, "step": 18360 }, { "epoch": 4.527120315581854, "grad_norm": 0.3203125, "learning_rate": 5.405634598902776e-06, "loss": 0.035, "step": 18362 }, { "epoch": 4.5276134122287965, "grad_norm": 0.255859375, "learning_rate": 5.394475225850337e-06, "loss": 0.0298, "step": 18364 }, { "epoch": 4.52810650887574, "grad_norm": 0.322265625, "learning_rate": 5.383327064080757e-06, "loss": 0.0305, "step": 18366 }, { "epoch": 4.5285996055226825, "grad_norm": 0.345703125, "learning_rate": 5.372190114915221e-06, "loss": 0.0334, "step": 18368 }, { "epoch": 4.529092702169625, "grad_norm": 0.296875, "learning_rate": 5.361064379673464e-06, "loss": 0.033, "step": 18370 }, { "epoch": 4.529585798816568, "grad_norm": 0.2890625, "learning_rate": 5.349949859673998e-06, "loss": 0.0295, "step": 18372 }, { "epoch": 4.530078895463511, "grad_norm": 0.31640625, "learning_rate": 5.338846556233923e-06, "loss": 0.0372, "step": 18374 }, { "epoch": 4.5305719921104535, "grad_norm": 0.318359375, "learning_rate": 5.3277544706690555e-06, "loss": 0.035, "step": 18376 }, { "epoch": 4.531065088757396, "grad_norm": 0.267578125, "learning_rate": 5.316673604293876e-06, "loss": 0.0333, "step": 18378 }, { "epoch": 4.531558185404339, "grad_norm": 0.328125, "learning_rate": 5.305603958421479e-06, "loss": 0.0355, "step": 18380 }, { "epoch": 4.532051282051282, "grad_norm": 0.298828125, "learning_rate": 5.294545534363759e-06, "loss": 0.0336, "step": 18382 }, { "epoch": 4.5325443786982245, "grad_norm": 0.294921875, "learning_rate": 5.2834983334311365e-06, "loss": 0.0338, "step": 18384 }, { "epoch": 4.533037475345168, "grad_norm": 0.3125, "learning_rate": 5.272462356932805e-06, "loss": 0.0322, "step": 18386 }, { "epoch": 4.53353057199211, "grad_norm": 0.318359375, "learning_rate": 5.261437606176567e-06, "loss": 0.0309, "step": 18388 }, { "epoch": 4.534023668639053, "grad_norm": 0.31640625, "learning_rate": 5.25042408246893e-06, "loss": 0.0319, "step": 18390 }, { "epoch": 4.534516765285996, "grad_norm": 0.328125, "learning_rate": 5.239421787115062e-06, "loss": 0.0302, "step": 18392 }, { "epoch": 4.535009861932939, "grad_norm": 0.26953125, "learning_rate": 5.228430721418764e-06, "loss": 0.0278, "step": 18394 }, { "epoch": 4.535502958579881, "grad_norm": 0.3046875, "learning_rate": 5.217450886682584e-06, "loss": 0.0354, "step": 18396 }, { "epoch": 4.535996055226825, "grad_norm": 0.28515625, "learning_rate": 5.206482284207647e-06, "loss": 0.0342, "step": 18398 }, { "epoch": 4.536489151873767, "grad_norm": 0.33984375, "learning_rate": 5.195524915293848e-06, "loss": 0.0329, "step": 18400 }, { "epoch": 4.53698224852071, "grad_norm": 0.314453125, "learning_rate": 5.184578781239646e-06, "loss": 0.0309, "step": 18402 }, { "epoch": 4.537475345167653, "grad_norm": 0.283203125, "learning_rate": 5.1736438833422495e-06, "loss": 0.0309, "step": 18404 }, { "epoch": 4.537968441814596, "grad_norm": 0.28515625, "learning_rate": 5.162720222897488e-06, "loss": 0.0321, "step": 18406 }, { "epoch": 4.538461538461538, "grad_norm": 0.271484375, "learning_rate": 5.151807801199892e-06, "loss": 0.0308, "step": 18408 }, { "epoch": 4.538954635108482, "grad_norm": 0.333984375, "learning_rate": 5.14090661954264e-06, "loss": 0.0329, "step": 18410 }, { "epoch": 4.539447731755424, "grad_norm": 0.328125, "learning_rate": 5.130016679217564e-06, "loss": 0.0357, "step": 18412 }, { "epoch": 4.539940828402367, "grad_norm": 0.34765625, "learning_rate": 5.11913798151521e-06, "loss": 0.0328, "step": 18414 }, { "epoch": 4.540433925049309, "grad_norm": 0.3203125, "learning_rate": 5.108270527724745e-06, "loss": 0.0315, "step": 18416 }, { "epoch": 4.540927021696253, "grad_norm": 0.298828125, "learning_rate": 5.097414319134042e-06, "loss": 0.0348, "step": 18418 }, { "epoch": 4.541420118343195, "grad_norm": 0.291015625, "learning_rate": 5.086569357029591e-06, "loss": 0.0312, "step": 18420 }, { "epoch": 4.541913214990138, "grad_norm": 0.2734375, "learning_rate": 5.075735642696611e-06, "loss": 0.0334, "step": 18422 }, { "epoch": 4.542406311637081, "grad_norm": 0.302734375, "learning_rate": 5.064913177418939e-06, "loss": 0.0333, "step": 18424 }, { "epoch": 4.542899408284024, "grad_norm": 0.310546875, "learning_rate": 5.054101962479119e-06, "loss": 0.0353, "step": 18426 }, { "epoch": 4.543392504930966, "grad_norm": 0.291015625, "learning_rate": 5.043301999158323e-06, "loss": 0.0331, "step": 18428 }, { "epoch": 4.543885601577909, "grad_norm": 0.275390625, "learning_rate": 5.032513288736407e-06, "loss": 0.0304, "step": 18430 }, { "epoch": 4.544378698224852, "grad_norm": 0.30859375, "learning_rate": 5.021735832491892e-06, "loss": 0.0311, "step": 18432 }, { "epoch": 4.544871794871795, "grad_norm": 0.2734375, "learning_rate": 5.010969631701978e-06, "loss": 0.031, "step": 18434 }, { "epoch": 4.545364891518737, "grad_norm": 0.291015625, "learning_rate": 5.000214687642513e-06, "loss": 0.0336, "step": 18436 }, { "epoch": 4.545857988165681, "grad_norm": 0.291015625, "learning_rate": 4.989471001588009e-06, "loss": 0.0296, "step": 18438 }, { "epoch": 4.546351084812623, "grad_norm": 0.318359375, "learning_rate": 4.97873857481167e-06, "loss": 0.0334, "step": 18440 }, { "epoch": 4.546844181459566, "grad_norm": 0.291015625, "learning_rate": 4.968017408585357e-06, "loss": 0.0329, "step": 18442 }, { "epoch": 4.547337278106509, "grad_norm": 0.3046875, "learning_rate": 4.957307504179553e-06, "loss": 0.0365, "step": 18444 }, { "epoch": 4.547830374753452, "grad_norm": 0.33984375, "learning_rate": 4.946608862863456e-06, "loss": 0.0312, "step": 18446 }, { "epoch": 4.548323471400394, "grad_norm": 0.3671875, "learning_rate": 4.935921485904926e-06, "loss": 0.0354, "step": 18448 }, { "epoch": 4.548816568047338, "grad_norm": 0.306640625, "learning_rate": 4.925245374570464e-06, "loss": 0.0321, "step": 18450 }, { "epoch": 4.54930966469428, "grad_norm": 0.271484375, "learning_rate": 4.9145805301252565e-06, "loss": 0.0314, "step": 18452 }, { "epoch": 4.549802761341223, "grad_norm": 0.298828125, "learning_rate": 4.903926953833149e-06, "loss": 0.032, "step": 18454 }, { "epoch": 4.550295857988166, "grad_norm": 0.337890625, "learning_rate": 4.893284646956642e-06, "loss": 0.0356, "step": 18456 }, { "epoch": 4.550788954635109, "grad_norm": 0.30078125, "learning_rate": 4.882653610756937e-06, "loss": 0.0315, "step": 18458 }, { "epoch": 4.551282051282051, "grad_norm": 0.26953125, "learning_rate": 4.872033846493818e-06, "loss": 0.0309, "step": 18460 }, { "epoch": 4.5517751479289945, "grad_norm": 0.298828125, "learning_rate": 4.861425355425819e-06, "loss": 0.0319, "step": 18462 }, { "epoch": 4.552268244575937, "grad_norm": 0.373046875, "learning_rate": 4.850828138810104e-06, "loss": 0.0327, "step": 18464 }, { "epoch": 4.55276134122288, "grad_norm": 0.259765625, "learning_rate": 4.8402421979025e-06, "loss": 0.0322, "step": 18466 }, { "epoch": 4.553254437869823, "grad_norm": 0.27734375, "learning_rate": 4.829667533957494e-06, "loss": 0.0366, "step": 18468 }, { "epoch": 4.5537475345167655, "grad_norm": 0.318359375, "learning_rate": 4.819104148228249e-06, "loss": 0.0305, "step": 18470 }, { "epoch": 4.554240631163708, "grad_norm": 0.263671875, "learning_rate": 4.808552041966608e-06, "loss": 0.0304, "step": 18472 }, { "epoch": 4.554733727810651, "grad_norm": 0.283203125, "learning_rate": 4.798011216423004e-06, "loss": 0.0309, "step": 18474 }, { "epoch": 4.555226824457594, "grad_norm": 0.3125, "learning_rate": 4.787481672846616e-06, "loss": 0.0331, "step": 18476 }, { "epoch": 4.5557199211045365, "grad_norm": 0.26953125, "learning_rate": 4.776963412485258e-06, "loss": 0.0327, "step": 18478 }, { "epoch": 4.556213017751479, "grad_norm": 0.267578125, "learning_rate": 4.766456436585376e-06, "loss": 0.0323, "step": 18480 }, { "epoch": 4.556706114398422, "grad_norm": 0.328125, "learning_rate": 4.755960746392129e-06, "loss": 0.033, "step": 18482 }, { "epoch": 4.557199211045365, "grad_norm": 0.287109375, "learning_rate": 4.745476343149313e-06, "loss": 0.031, "step": 18484 }, { "epoch": 4.5576923076923075, "grad_norm": 0.29296875, "learning_rate": 4.735003228099399e-06, "loss": 0.0341, "step": 18486 }, { "epoch": 4.55818540433925, "grad_norm": 0.28125, "learning_rate": 4.724541402483462e-06, "loss": 0.034, "step": 18488 }, { "epoch": 4.558678500986193, "grad_norm": 0.271484375, "learning_rate": 4.714090867541343e-06, "loss": 0.031, "step": 18490 }, { "epoch": 4.559171597633136, "grad_norm": 0.31640625, "learning_rate": 4.7036516245114514e-06, "loss": 0.0342, "step": 18492 }, { "epoch": 4.5596646942800785, "grad_norm": 0.349609375, "learning_rate": 4.693223674630909e-06, "loss": 0.0302, "step": 18494 }, { "epoch": 4.560157790927022, "grad_norm": 0.333984375, "learning_rate": 4.6828070191354935e-06, "loss": 0.0314, "step": 18496 }, { "epoch": 4.560650887573964, "grad_norm": 0.3125, "learning_rate": 4.672401659259629e-06, "loss": 0.0332, "step": 18498 }, { "epoch": 4.561143984220907, "grad_norm": 0.296875, "learning_rate": 4.6620075962364175e-06, "loss": 0.0335, "step": 18500 }, { "epoch": 4.56163708086785, "grad_norm": 0.2890625, "learning_rate": 4.651624831297574e-06, "loss": 0.0306, "step": 18502 }, { "epoch": 4.562130177514793, "grad_norm": 0.310546875, "learning_rate": 4.641253365673581e-06, "loss": 0.0306, "step": 18504 }, { "epoch": 4.562623274161735, "grad_norm": 0.32421875, "learning_rate": 4.630893200593445e-06, "loss": 0.0336, "step": 18506 }, { "epoch": 4.563116370808679, "grad_norm": 0.330078125, "learning_rate": 4.62054433728496e-06, "loss": 0.0339, "step": 18508 }, { "epoch": 4.563609467455621, "grad_norm": 0.30859375, "learning_rate": 4.610206776974502e-06, "loss": 0.0275, "step": 18510 }, { "epoch": 4.564102564102564, "grad_norm": 0.302734375, "learning_rate": 4.599880520887123e-06, "loss": 0.0314, "step": 18512 }, { "epoch": 4.564595660749507, "grad_norm": 0.283203125, "learning_rate": 4.589565570246556e-06, "loss": 0.0322, "step": 18514 }, { "epoch": 4.56508875739645, "grad_norm": 0.330078125, "learning_rate": 4.579261926275158e-06, "loss": 0.0351, "step": 18516 }, { "epoch": 4.565581854043392, "grad_norm": 0.326171875, "learning_rate": 4.568969590194005e-06, "loss": 0.0314, "step": 18518 }, { "epoch": 4.566074950690336, "grad_norm": 0.302734375, "learning_rate": 4.558688563222746e-06, "loss": 0.0321, "step": 18520 }, { "epoch": 4.566568047337278, "grad_norm": 0.283203125, "learning_rate": 4.5484188465797935e-06, "loss": 0.0323, "step": 18522 }, { "epoch": 4.567061143984221, "grad_norm": 0.30859375, "learning_rate": 4.538160441482131e-06, "loss": 0.0322, "step": 18524 }, { "epoch": 4.567554240631163, "grad_norm": 0.296875, "learning_rate": 4.527913349145441e-06, "loss": 0.0316, "step": 18526 }, { "epoch": 4.568047337278107, "grad_norm": 0.302734375, "learning_rate": 4.5176775707840735e-06, "loss": 0.0335, "step": 18528 }, { "epoch": 4.568540433925049, "grad_norm": 0.28515625, "learning_rate": 4.507453107611015e-06, "loss": 0.0335, "step": 18530 }, { "epoch": 4.569033530571992, "grad_norm": 0.31640625, "learning_rate": 4.49723996083794e-06, "loss": 0.0306, "step": 18532 }, { "epoch": 4.569526627218935, "grad_norm": 0.330078125, "learning_rate": 4.487038131675114e-06, "loss": 0.0305, "step": 18534 }, { "epoch": 4.570019723865878, "grad_norm": 0.357421875, "learning_rate": 4.476847621331581e-06, "loss": 0.0341, "step": 18536 }, { "epoch": 4.57051282051282, "grad_norm": 0.28515625, "learning_rate": 4.466668431014931e-06, "loss": 0.0333, "step": 18538 }, { "epoch": 4.571005917159764, "grad_norm": 0.38671875, "learning_rate": 4.456500561931454e-06, "loss": 0.0386, "step": 18540 }, { "epoch": 4.571499013806706, "grad_norm": 0.279296875, "learning_rate": 4.446344015286108e-06, "loss": 0.0367, "step": 18542 }, { "epoch": 4.571992110453649, "grad_norm": 0.30859375, "learning_rate": 4.436198792282498e-06, "loss": 0.0308, "step": 18544 }, { "epoch": 4.572485207100591, "grad_norm": 0.33984375, "learning_rate": 4.426064894122905e-06, "loss": 0.0323, "step": 18546 }, { "epoch": 4.572978303747535, "grad_norm": 0.294921875, "learning_rate": 4.415942322008215e-06, "loss": 0.0337, "step": 18548 }, { "epoch": 4.573471400394477, "grad_norm": 0.32421875, "learning_rate": 4.405831077138056e-06, "loss": 0.0343, "step": 18550 }, { "epoch": 4.57396449704142, "grad_norm": 0.314453125, "learning_rate": 4.3957311607106256e-06, "loss": 0.0306, "step": 18552 }, { "epoch": 4.574457593688363, "grad_norm": 0.283203125, "learning_rate": 4.385642573922843e-06, "loss": 0.0335, "step": 18554 }, { "epoch": 4.574950690335306, "grad_norm": 0.32421875, "learning_rate": 4.375565317970265e-06, "loss": 0.0333, "step": 18556 }, { "epoch": 4.575443786982248, "grad_norm": 0.28515625, "learning_rate": 4.36549939404709e-06, "loss": 0.0315, "step": 18558 }, { "epoch": 4.575936883629192, "grad_norm": 0.2734375, "learning_rate": 4.355444803346198e-06, "loss": 0.0328, "step": 18560 }, { "epoch": 4.576429980276134, "grad_norm": 0.3125, "learning_rate": 4.345401547059091e-06, "loss": 0.0332, "step": 18562 }, { "epoch": 4.576923076923077, "grad_norm": 0.2890625, "learning_rate": 4.335369626375985e-06, "loss": 0.0356, "step": 18564 }, { "epoch": 4.57741617357002, "grad_norm": 0.30078125, "learning_rate": 4.3253490424856935e-06, "loss": 0.0305, "step": 18566 }, { "epoch": 4.577909270216963, "grad_norm": 0.341796875, "learning_rate": 4.315339796575724e-06, "loss": 0.0355, "step": 18568 }, { "epoch": 4.578402366863905, "grad_norm": 0.373046875, "learning_rate": 4.305341889832215e-06, "loss": 0.0362, "step": 18570 }, { "epoch": 4.5788954635108485, "grad_norm": 0.322265625, "learning_rate": 4.295355323439986e-06, "loss": 0.0347, "step": 18572 }, { "epoch": 4.579388560157791, "grad_norm": 0.361328125, "learning_rate": 4.285380098582514e-06, "loss": 0.0368, "step": 18574 }, { "epoch": 4.579881656804734, "grad_norm": 0.291015625, "learning_rate": 4.275416216441863e-06, "loss": 0.0324, "step": 18576 }, { "epoch": 4.580374753451677, "grad_norm": 0.279296875, "learning_rate": 4.265463678198889e-06, "loss": 0.0354, "step": 18578 }, { "epoch": 4.5808678500986195, "grad_norm": 0.2734375, "learning_rate": 4.255522485032959e-06, "loss": 0.0329, "step": 18580 }, { "epoch": 4.581360946745562, "grad_norm": 0.314453125, "learning_rate": 4.2455926381221975e-06, "loss": 0.0355, "step": 18582 }, { "epoch": 4.581854043392505, "grad_norm": 0.3046875, "learning_rate": 4.23567413864332e-06, "loss": 0.0314, "step": 18584 }, { "epoch": 4.582347140039448, "grad_norm": 0.2734375, "learning_rate": 4.22576698777174e-06, "loss": 0.0313, "step": 18586 }, { "epoch": 4.5828402366863905, "grad_norm": 0.265625, "learning_rate": 4.215871186681508e-06, "loss": 0.0324, "step": 18588 }, { "epoch": 4.583333333333333, "grad_norm": 0.28125, "learning_rate": 4.205986736545331e-06, "loss": 0.0293, "step": 18590 }, { "epoch": 4.5838264299802765, "grad_norm": 0.37109375, "learning_rate": 4.196113638534582e-06, "loss": 0.0318, "step": 18592 }, { "epoch": 4.584319526627219, "grad_norm": 0.412109375, "learning_rate": 4.1862518938192375e-06, "loss": 0.0341, "step": 18594 }, { "epoch": 4.5848126232741615, "grad_norm": 0.3046875, "learning_rate": 4.176401503568028e-06, "loss": 0.0366, "step": 18596 }, { "epoch": 4.585305719921105, "grad_norm": 0.263671875, "learning_rate": 4.166562468948243e-06, "loss": 0.0304, "step": 18598 }, { "epoch": 4.585798816568047, "grad_norm": 0.306640625, "learning_rate": 4.156734791125861e-06, "loss": 0.0339, "step": 18600 }, { "epoch": 4.58629191321499, "grad_norm": 0.33203125, "learning_rate": 4.146918471265537e-06, "loss": 0.0365, "step": 18602 }, { "epoch": 4.5867850098619325, "grad_norm": 0.341796875, "learning_rate": 4.137113510530544e-06, "loss": 0.0362, "step": 18604 }, { "epoch": 4.587278106508876, "grad_norm": 0.2890625, "learning_rate": 4.127319910082839e-06, "loss": 0.0297, "step": 18606 }, { "epoch": 4.587771203155818, "grad_norm": 0.28125, "learning_rate": 4.117537671082983e-06, "loss": 0.0322, "step": 18608 }, { "epoch": 4.588264299802761, "grad_norm": 0.310546875, "learning_rate": 4.107766794690282e-06, "loss": 0.0332, "step": 18610 }, { "epoch": 4.588757396449704, "grad_norm": 0.33203125, "learning_rate": 4.0980072820625995e-06, "loss": 0.0303, "step": 18612 }, { "epoch": 4.589250493096647, "grad_norm": 0.30859375, "learning_rate": 4.088259134356498e-06, "loss": 0.0359, "step": 18614 }, { "epoch": 4.589743589743589, "grad_norm": 0.291015625, "learning_rate": 4.078522352727199e-06, "loss": 0.0348, "step": 18616 }, { "epoch": 4.590236686390533, "grad_norm": 0.28125, "learning_rate": 4.068796938328556e-06, "loss": 0.0347, "step": 18618 }, { "epoch": 4.590729783037475, "grad_norm": 0.302734375, "learning_rate": 4.059082892313082e-06, "loss": 0.0312, "step": 18620 }, { "epoch": 4.591222879684418, "grad_norm": 0.26171875, "learning_rate": 4.0493802158319435e-06, "loss": 0.0289, "step": 18622 }, { "epoch": 4.591715976331361, "grad_norm": 0.291015625, "learning_rate": 4.039688910035e-06, "loss": 0.0316, "step": 18624 }, { "epoch": 4.592209072978304, "grad_norm": 0.3515625, "learning_rate": 4.030008976070665e-06, "loss": 0.0347, "step": 18626 }, { "epoch": 4.592702169625246, "grad_norm": 0.294921875, "learning_rate": 4.020340415086099e-06, "loss": 0.0324, "step": 18628 }, { "epoch": 4.59319526627219, "grad_norm": 0.29296875, "learning_rate": 4.0106832282270854e-06, "loss": 0.0331, "step": 18630 }, { "epoch": 4.593688362919132, "grad_norm": 0.34375, "learning_rate": 4.0010374166380315e-06, "loss": 0.035, "step": 18632 }, { "epoch": 4.594181459566075, "grad_norm": 0.291015625, "learning_rate": 3.991402981462045e-06, "loss": 0.0328, "step": 18634 }, { "epoch": 4.594674556213018, "grad_norm": 0.32421875, "learning_rate": 3.981779923840845e-06, "loss": 0.0311, "step": 18636 }, { "epoch": 4.595167652859961, "grad_norm": 0.29296875, "learning_rate": 3.97216824491482e-06, "loss": 0.035, "step": 18638 }, { "epoch": 4.595660749506903, "grad_norm": 0.345703125, "learning_rate": 3.9625679458230145e-06, "loss": 0.0345, "step": 18640 }, { "epoch": 4.596153846153846, "grad_norm": 0.251953125, "learning_rate": 3.952979027703107e-06, "loss": 0.0322, "step": 18642 }, { "epoch": 4.596646942800789, "grad_norm": 0.296875, "learning_rate": 3.9434014916914455e-06, "loss": 0.0342, "step": 18644 }, { "epoch": 4.597140039447732, "grad_norm": 0.271484375, "learning_rate": 3.933835338923009e-06, "loss": 0.0333, "step": 18646 }, { "epoch": 4.597633136094674, "grad_norm": 0.33203125, "learning_rate": 3.9242805705314576e-06, "loss": 0.0327, "step": 18648 }, { "epoch": 4.598126232741618, "grad_norm": 0.2890625, "learning_rate": 3.914737187649076e-06, "loss": 0.0296, "step": 18650 }, { "epoch": 4.59861932938856, "grad_norm": 0.296875, "learning_rate": 3.905205191406802e-06, "loss": 0.031, "step": 18652 }, { "epoch": 4.599112426035503, "grad_norm": 0.2890625, "learning_rate": 3.895684582934256e-06, "loss": 0.0316, "step": 18654 }, { "epoch": 4.599605522682445, "grad_norm": 0.35546875, "learning_rate": 3.886175363359646e-06, "loss": 0.029, "step": 18656 }, { "epoch": 4.600098619329389, "grad_norm": 0.326171875, "learning_rate": 3.876677533809892e-06, "loss": 0.0328, "step": 18658 }, { "epoch": 4.600591715976331, "grad_norm": 0.326171875, "learning_rate": 3.867191095410539e-06, "loss": 0.0329, "step": 18660 }, { "epoch": 4.601084812623274, "grad_norm": 0.28125, "learning_rate": 3.857716049285765e-06, "loss": 0.0324, "step": 18662 }, { "epoch": 4.601577909270217, "grad_norm": 0.3984375, "learning_rate": 3.848252396558438e-06, "loss": 0.0313, "step": 18664 }, { "epoch": 4.60207100591716, "grad_norm": 0.30078125, "learning_rate": 3.8388001383500405e-06, "loss": 0.0343, "step": 18666 }, { "epoch": 4.602564102564102, "grad_norm": 0.28515625, "learning_rate": 3.8293592757807415e-06, "loss": 0.0328, "step": 18668 }, { "epoch": 4.603057199211046, "grad_norm": 0.310546875, "learning_rate": 3.819929809969292e-06, "loss": 0.036, "step": 18670 }, { "epoch": 4.603550295857988, "grad_norm": 0.27734375, "learning_rate": 3.810511742033174e-06, "loss": 0.0323, "step": 18672 }, { "epoch": 4.604043392504931, "grad_norm": 0.291015625, "learning_rate": 3.8011050730884624e-06, "loss": 0.0315, "step": 18674 }, { "epoch": 4.604536489151874, "grad_norm": 0.283203125, "learning_rate": 3.79170980424991e-06, "loss": 0.0319, "step": 18676 }, { "epoch": 4.605029585798817, "grad_norm": 0.271484375, "learning_rate": 3.782325936630893e-06, "loss": 0.0335, "step": 18678 }, { "epoch": 4.605522682445759, "grad_norm": 0.333984375, "learning_rate": 3.7729534713434765e-06, "loss": 0.033, "step": 18680 }, { "epoch": 4.6060157790927025, "grad_norm": 0.2734375, "learning_rate": 3.7635924094983397e-06, "loss": 0.0342, "step": 18682 }, { "epoch": 4.606508875739645, "grad_norm": 0.3203125, "learning_rate": 3.7542427522047953e-06, "loss": 0.0318, "step": 18684 }, { "epoch": 4.607001972386588, "grad_norm": 0.294921875, "learning_rate": 3.7449045005708783e-06, "loss": 0.0307, "step": 18686 }, { "epoch": 4.607495069033531, "grad_norm": 0.3359375, "learning_rate": 3.7355776557031706e-06, "loss": 0.0323, "step": 18688 }, { "epoch": 4.6079881656804735, "grad_norm": 0.29296875, "learning_rate": 3.7262622187070105e-06, "loss": 0.0326, "step": 18690 }, { "epoch": 4.608481262327416, "grad_norm": 0.3203125, "learning_rate": 3.7169581906862817e-06, "loss": 0.0339, "step": 18692 }, { "epoch": 4.608974358974359, "grad_norm": 0.2890625, "learning_rate": 3.7076655727435905e-06, "loss": 0.0306, "step": 18694 }, { "epoch": 4.609467455621302, "grad_norm": 0.3046875, "learning_rate": 3.6983843659801674e-06, "loss": 0.0338, "step": 18696 }, { "epoch": 4.6099605522682445, "grad_norm": 0.32421875, "learning_rate": 3.689114571495844e-06, "loss": 0.0344, "step": 18698 }, { "epoch": 4.610453648915187, "grad_norm": 0.3125, "learning_rate": 3.679856190389208e-06, "loss": 0.0359, "step": 18700 }, { "epoch": 4.6109467455621305, "grad_norm": 0.2734375, "learning_rate": 3.670609223757371e-06, "loss": 0.0288, "step": 18702 }, { "epoch": 4.611439842209073, "grad_norm": 0.306640625, "learning_rate": 3.6613736726962e-06, "loss": 0.0314, "step": 18704 }, { "epoch": 4.6119329388560155, "grad_norm": 0.28515625, "learning_rate": 3.6521495383001204e-06, "loss": 0.0325, "step": 18706 }, { "epoch": 4.612426035502959, "grad_norm": 0.279296875, "learning_rate": 3.6429368216622573e-06, "loss": 0.0345, "step": 18708 }, { "epoch": 4.6129191321499015, "grad_norm": 0.365234375, "learning_rate": 3.6337355238743818e-06, "loss": 0.0324, "step": 18710 }, { "epoch": 4.613412228796844, "grad_norm": 0.306640625, "learning_rate": 3.6245456460268666e-06, "loss": 0.0327, "step": 18712 }, { "epoch": 4.6139053254437865, "grad_norm": 0.287109375, "learning_rate": 3.615367189208807e-06, "loss": 0.0312, "step": 18714 }, { "epoch": 4.61439842209073, "grad_norm": 0.412109375, "learning_rate": 3.606200154507855e-06, "loss": 0.0343, "step": 18716 }, { "epoch": 4.6148915187376724, "grad_norm": 0.25390625, "learning_rate": 3.5970445430103973e-06, "loss": 0.0319, "step": 18718 }, { "epoch": 4.615384615384615, "grad_norm": 0.30859375, "learning_rate": 3.587900355801399e-06, "loss": 0.0331, "step": 18720 }, { "epoch": 4.615877712031558, "grad_norm": 0.267578125, "learning_rate": 3.578767593964505e-06, "loss": 0.031, "step": 18722 }, { "epoch": 4.616370808678501, "grad_norm": 0.33203125, "learning_rate": 3.5696462585820044e-06, "loss": 0.0339, "step": 18724 }, { "epoch": 4.616863905325443, "grad_norm": 0.28515625, "learning_rate": 3.5605363507348112e-06, "loss": 0.0306, "step": 18726 }, { "epoch": 4.617357001972387, "grad_norm": 0.345703125, "learning_rate": 3.5514378715025165e-06, "loss": 0.0306, "step": 18728 }, { "epoch": 4.617850098619329, "grad_norm": 0.3125, "learning_rate": 3.542350821963314e-06, "loss": 0.0311, "step": 18730 }, { "epoch": 4.618343195266272, "grad_norm": 0.296875, "learning_rate": 3.5332752031941197e-06, "loss": 0.0346, "step": 18732 }, { "epoch": 4.618836291913215, "grad_norm": 0.267578125, "learning_rate": 3.5242110162703956e-06, "loss": 0.0319, "step": 18734 }, { "epoch": 4.619329388560158, "grad_norm": 0.294921875, "learning_rate": 3.515158262266327e-06, "loss": 0.0287, "step": 18736 }, { "epoch": 4.6198224852071, "grad_norm": 0.359375, "learning_rate": 3.5061169422547113e-06, "loss": 0.0306, "step": 18738 }, { "epoch": 4.620315581854044, "grad_norm": 0.33984375, "learning_rate": 3.4970870573069693e-06, "loss": 0.0314, "step": 18740 }, { "epoch": 4.620808678500986, "grad_norm": 0.314453125, "learning_rate": 3.4880686084932444e-06, "loss": 0.0373, "step": 18742 }, { "epoch": 4.621301775147929, "grad_norm": 0.359375, "learning_rate": 3.479061596882205e-06, "loss": 0.0309, "step": 18744 }, { "epoch": 4.621794871794872, "grad_norm": 0.298828125, "learning_rate": 3.4700660235413075e-06, "loss": 0.0368, "step": 18746 }, { "epoch": 4.622287968441815, "grad_norm": 0.294921875, "learning_rate": 3.4610818895365217e-06, "loss": 0.0327, "step": 18748 }, { "epoch": 4.622781065088757, "grad_norm": 0.314453125, "learning_rate": 3.45210919593254e-06, "loss": 0.0331, "step": 18750 }, { "epoch": 4.6232741617357, "grad_norm": 0.3046875, "learning_rate": 3.4431479437926683e-06, "loss": 0.0273, "step": 18752 }, { "epoch": 4.623767258382643, "grad_norm": 0.291015625, "learning_rate": 3.434198134178879e-06, "loss": 0.0316, "step": 18754 }, { "epoch": 4.624260355029586, "grad_norm": 0.294921875, "learning_rate": 3.4252597681517674e-06, "loss": 0.0325, "step": 18756 }, { "epoch": 4.624753451676528, "grad_norm": 0.306640625, "learning_rate": 3.4163328467705536e-06, "loss": 0.0309, "step": 18758 }, { "epoch": 4.625246548323472, "grad_norm": 0.2890625, "learning_rate": 3.40741737109318e-06, "loss": 0.0282, "step": 18760 }, { "epoch": 4.625739644970414, "grad_norm": 0.29296875, "learning_rate": 3.398513342176135e-06, "loss": 0.0331, "step": 18762 }, { "epoch": 4.626232741617357, "grad_norm": 0.357421875, "learning_rate": 3.389620761074619e-06, "loss": 0.0309, "step": 18764 }, { "epoch": 4.6267258382643, "grad_norm": 0.2734375, "learning_rate": 3.3807396288424444e-06, "loss": 0.034, "step": 18766 }, { "epoch": 4.627218934911243, "grad_norm": 0.333984375, "learning_rate": 3.371869946532069e-06, "loss": 0.0326, "step": 18768 }, { "epoch": 4.627712031558185, "grad_norm": 0.294921875, "learning_rate": 3.3630117151946194e-06, "loss": 0.0321, "step": 18770 }, { "epoch": 4.628205128205128, "grad_norm": 0.34375, "learning_rate": 3.354164935879822e-06, "loss": 0.0328, "step": 18772 }, { "epoch": 4.628698224852071, "grad_norm": 0.302734375, "learning_rate": 3.3453296096361053e-06, "loss": 0.0361, "step": 18774 }, { "epoch": 4.629191321499014, "grad_norm": 0.302734375, "learning_rate": 3.3365057375104535e-06, "loss": 0.0354, "step": 18776 }, { "epoch": 4.629684418145956, "grad_norm": 0.2890625, "learning_rate": 3.3276933205485862e-06, "loss": 0.0333, "step": 18778 }, { "epoch": 4.6301775147929, "grad_norm": 0.279296875, "learning_rate": 3.3188923597948007e-06, "loss": 0.0317, "step": 18780 }, { "epoch": 4.630670611439842, "grad_norm": 0.2578125, "learning_rate": 3.3101028562920744e-06, "loss": 0.0301, "step": 18782 }, { "epoch": 4.631163708086785, "grad_norm": 0.33984375, "learning_rate": 3.301324811081996e-06, "loss": 0.0318, "step": 18784 }, { "epoch": 4.631656804733728, "grad_norm": 0.29296875, "learning_rate": 3.2925582252048338e-06, "loss": 0.033, "step": 18786 }, { "epoch": 4.632149901380671, "grad_norm": 0.275390625, "learning_rate": 3.2838030996994782e-06, "loss": 0.0325, "step": 18788 }, { "epoch": 4.632642998027613, "grad_norm": 0.330078125, "learning_rate": 3.2750594356034336e-06, "loss": 0.0334, "step": 18790 }, { "epoch": 4.633136094674557, "grad_norm": 0.28515625, "learning_rate": 3.2663272339528926e-06, "loss": 0.0341, "step": 18792 }, { "epoch": 4.633629191321499, "grad_norm": 0.28515625, "learning_rate": 3.2576064957826613e-06, "loss": 0.029, "step": 18794 }, { "epoch": 4.634122287968442, "grad_norm": 0.29296875, "learning_rate": 3.248897222126213e-06, "loss": 0.0326, "step": 18796 }, { "epoch": 4.634615384615385, "grad_norm": 0.314453125, "learning_rate": 3.240199414015621e-06, "loss": 0.0341, "step": 18798 }, { "epoch": 4.6351084812623276, "grad_norm": 0.337890625, "learning_rate": 3.23151307248164e-06, "loss": 0.0302, "step": 18800 }, { "epoch": 4.63560157790927, "grad_norm": 0.302734375, "learning_rate": 3.222838198553646e-06, "loss": 0.0331, "step": 18802 }, { "epoch": 4.6360946745562135, "grad_norm": 0.314453125, "learning_rate": 3.214174793259661e-06, "loss": 0.0344, "step": 18804 }, { "epoch": 4.636587771203156, "grad_norm": 0.279296875, "learning_rate": 3.2055228576263528e-06, "loss": 0.03, "step": 18806 }, { "epoch": 4.6370808678500985, "grad_norm": 0.26953125, "learning_rate": 3.1968823926790014e-06, "loss": 0.0345, "step": 18808 }, { "epoch": 4.637573964497041, "grad_norm": 0.30859375, "learning_rate": 3.1882533994415766e-06, "loss": 0.035, "step": 18810 }, { "epoch": 4.6380670611439845, "grad_norm": 0.40234375, "learning_rate": 3.1796358789366485e-06, "loss": 0.0336, "step": 18812 }, { "epoch": 4.638560157790927, "grad_norm": 0.306640625, "learning_rate": 3.171029832185435e-06, "loss": 0.0325, "step": 18814 }, { "epoch": 4.6390532544378695, "grad_norm": 0.408203125, "learning_rate": 3.1624352602078187e-06, "loss": 0.0351, "step": 18816 }, { "epoch": 4.639546351084813, "grad_norm": 0.373046875, "learning_rate": 3.1538521640223084e-06, "loss": 0.0326, "step": 18818 }, { "epoch": 4.6400394477317555, "grad_norm": 0.279296875, "learning_rate": 3.145280544646012e-06, "loss": 0.0297, "step": 18820 }, { "epoch": 4.640532544378698, "grad_norm": 0.3203125, "learning_rate": 3.1367204030947618e-06, "loss": 0.0332, "step": 18822 }, { "epoch": 4.641025641025641, "grad_norm": 0.2890625, "learning_rate": 3.1281717403829457e-06, "loss": 0.0296, "step": 18824 }, { "epoch": 4.641518737672584, "grad_norm": 0.30078125, "learning_rate": 3.119634557523643e-06, "loss": 0.0348, "step": 18826 }, { "epoch": 4.6420118343195265, "grad_norm": 0.275390625, "learning_rate": 3.111108855528555e-06, "loss": 0.0344, "step": 18828 }, { "epoch": 4.642504930966469, "grad_norm": 0.27734375, "learning_rate": 3.102594635408029e-06, "loss": 0.0318, "step": 18830 }, { "epoch": 4.642998027613412, "grad_norm": 0.2734375, "learning_rate": 3.0940918981710585e-06, "loss": 0.0338, "step": 18832 }, { "epoch": 4.643491124260355, "grad_norm": 0.29296875, "learning_rate": 3.085600644825226e-06, "loss": 0.0309, "step": 18834 }, { "epoch": 4.6439842209072975, "grad_norm": 0.291015625, "learning_rate": 3.077120876376838e-06, "loss": 0.0324, "step": 18836 }, { "epoch": 4.644477317554241, "grad_norm": 0.33203125, "learning_rate": 3.0686525938307566e-06, "loss": 0.0316, "step": 18838 }, { "epoch": 4.644970414201183, "grad_norm": 0.27734375, "learning_rate": 3.060195798190546e-06, "loss": 0.0321, "step": 18840 }, { "epoch": 4.645463510848126, "grad_norm": 0.30859375, "learning_rate": 3.0517504904583714e-06, "loss": 0.0291, "step": 18842 }, { "epoch": 4.645956607495069, "grad_norm": 0.314453125, "learning_rate": 3.0433166716350657e-06, "loss": 0.0299, "step": 18844 }, { "epoch": 4.646449704142012, "grad_norm": 0.25390625, "learning_rate": 3.0348943427200737e-06, "loss": 0.0309, "step": 18846 }, { "epoch": 4.646942800788954, "grad_norm": 0.33984375, "learning_rate": 3.0264835047114637e-06, "loss": 0.0351, "step": 18848 }, { "epoch": 4.647435897435898, "grad_norm": 0.322265625, "learning_rate": 3.018084158606005e-06, "loss": 0.0307, "step": 18850 }, { "epoch": 4.64792899408284, "grad_norm": 0.2890625, "learning_rate": 3.0096963053990345e-06, "loss": 0.0326, "step": 18852 }, { "epoch": 4.648422090729783, "grad_norm": 0.322265625, "learning_rate": 3.001319946084602e-06, "loss": 0.0324, "step": 18854 }, { "epoch": 4.648915187376726, "grad_norm": 0.29296875, "learning_rate": 2.992955081655313e-06, "loss": 0.0284, "step": 18856 }, { "epoch": 4.649408284023669, "grad_norm": 0.302734375, "learning_rate": 2.984601713102464e-06, "loss": 0.0343, "step": 18858 }, { "epoch": 4.649901380670611, "grad_norm": 0.35546875, "learning_rate": 2.9762598414159848e-06, "loss": 0.0351, "step": 18860 }, { "epoch": 4.650394477317555, "grad_norm": 0.2734375, "learning_rate": 2.9679294675844073e-06, "loss": 0.0312, "step": 18862 }, { "epoch": 4.650887573964497, "grad_norm": 0.28515625, "learning_rate": 2.9596105925949745e-06, "loss": 0.0307, "step": 18864 }, { "epoch": 4.65138067061144, "grad_norm": 0.265625, "learning_rate": 2.9513032174334654e-06, "loss": 0.0346, "step": 18866 }, { "epoch": 4.651873767258382, "grad_norm": 0.287109375, "learning_rate": 2.9430073430843917e-06, "loss": 0.0324, "step": 18868 }, { "epoch": 4.652366863905326, "grad_norm": 0.3046875, "learning_rate": 2.9347229705308344e-06, "loss": 0.0343, "step": 18870 }, { "epoch": 4.652859960552268, "grad_norm": 0.30859375, "learning_rate": 2.926450100754552e-06, "loss": 0.0375, "step": 18872 }, { "epoch": 4.653353057199211, "grad_norm": 0.298828125, "learning_rate": 2.918188734735927e-06, "loss": 0.0327, "step": 18874 }, { "epoch": 4.653846153846154, "grad_norm": 0.34765625, "learning_rate": 2.9099388734539543e-06, "loss": 0.0339, "step": 18876 }, { "epoch": 4.654339250493097, "grad_norm": 0.28125, "learning_rate": 2.90170051788633e-06, "loss": 0.0314, "step": 18878 }, { "epoch": 4.654832347140039, "grad_norm": 0.298828125, "learning_rate": 2.8934736690092945e-06, "loss": 0.0348, "step": 18880 }, { "epoch": 4.655325443786982, "grad_norm": 0.287109375, "learning_rate": 2.885258327797835e-06, "loss": 0.0329, "step": 18882 }, { "epoch": 4.655818540433925, "grad_norm": 0.32421875, "learning_rate": 2.8770544952254617e-06, "loss": 0.0341, "step": 18884 }, { "epoch": 4.656311637080868, "grad_norm": 0.28515625, "learning_rate": 2.8688621722643973e-06, "loss": 0.0317, "step": 18886 }, { "epoch": 4.65680473372781, "grad_norm": 0.30859375, "learning_rate": 2.8606813598854864e-06, "loss": 0.0351, "step": 18888 }, { "epoch": 4.657297830374754, "grad_norm": 0.267578125, "learning_rate": 2.8525120590581768e-06, "loss": 0.0326, "step": 18890 }, { "epoch": 4.657790927021696, "grad_norm": 0.310546875, "learning_rate": 2.8443542707506043e-06, "loss": 0.0313, "step": 18892 }, { "epoch": 4.658284023668639, "grad_norm": 0.337890625, "learning_rate": 2.836207995929485e-06, "loss": 0.032, "step": 18894 }, { "epoch": 4.658777120315582, "grad_norm": 0.28515625, "learning_rate": 2.8280732355602247e-06, "loss": 0.0298, "step": 18896 }, { "epoch": 4.659270216962525, "grad_norm": 0.287109375, "learning_rate": 2.8199499906068184e-06, "loss": 0.0325, "step": 18898 }, { "epoch": 4.659763313609467, "grad_norm": 0.2890625, "learning_rate": 2.8118382620319183e-06, "loss": 0.0315, "step": 18900 }, { "epoch": 4.660256410256411, "grad_norm": 0.296875, "learning_rate": 2.803738050796811e-06, "loss": 0.0293, "step": 18902 }, { "epoch": 4.660749506903353, "grad_norm": 0.28125, "learning_rate": 2.7956493578614297e-06, "loss": 0.0319, "step": 18904 }, { "epoch": 4.661242603550296, "grad_norm": 0.287109375, "learning_rate": 2.7875721841843285e-06, "loss": 0.0327, "step": 18906 }, { "epoch": 4.661735700197239, "grad_norm": 0.2734375, "learning_rate": 2.779506530722653e-06, "loss": 0.0299, "step": 18908 }, { "epoch": 4.662228796844182, "grad_norm": 0.2890625, "learning_rate": 2.7714523984322947e-06, "loss": 0.032, "step": 18910 }, { "epoch": 4.662721893491124, "grad_norm": 0.28125, "learning_rate": 2.763409788267668e-06, "loss": 0.0321, "step": 18912 }, { "epoch": 4.6632149901380675, "grad_norm": 0.287109375, "learning_rate": 2.7553787011818877e-06, "loss": 0.0333, "step": 18914 }, { "epoch": 4.66370808678501, "grad_norm": 0.37890625, "learning_rate": 2.7473591381266708e-06, "loss": 0.0356, "step": 18916 }, { "epoch": 4.664201183431953, "grad_norm": 0.349609375, "learning_rate": 2.73935110005239e-06, "loss": 0.0349, "step": 18918 }, { "epoch": 4.664694280078895, "grad_norm": 0.341796875, "learning_rate": 2.7313545879080417e-06, "loss": 0.0335, "step": 18920 }, { "epoch": 4.6651873767258385, "grad_norm": 0.337890625, "learning_rate": 2.723369602641235e-06, "loss": 0.035, "step": 18922 }, { "epoch": 4.665680473372781, "grad_norm": 0.287109375, "learning_rate": 2.7153961451982677e-06, "loss": 0.0354, "step": 18924 }, { "epoch": 4.6661735700197235, "grad_norm": 0.3984375, "learning_rate": 2.7074342165240298e-06, "loss": 0.04, "step": 18926 }, { "epoch": 4.666666666666667, "grad_norm": 0.296875, "learning_rate": 2.699483817562032e-06, "loss": 0.0306, "step": 18928 }, { "epoch": 4.6671597633136095, "grad_norm": 0.265625, "learning_rate": 2.6915449492544765e-06, "loss": 0.0342, "step": 18930 }, { "epoch": 4.667652859960552, "grad_norm": 0.3828125, "learning_rate": 2.6836176125421332e-06, "loss": 0.0306, "step": 18932 }, { "epoch": 4.668145956607495, "grad_norm": 0.34375, "learning_rate": 2.67570180836445e-06, "loss": 0.0357, "step": 18934 }, { "epoch": 4.668639053254438, "grad_norm": 0.275390625, "learning_rate": 2.667797537659489e-06, "loss": 0.0299, "step": 18936 }, { "epoch": 4.6691321499013805, "grad_norm": 0.259765625, "learning_rate": 2.659904801363966e-06, "loss": 0.0312, "step": 18938 }, { "epoch": 4.669625246548323, "grad_norm": 0.263671875, "learning_rate": 2.6520236004131893e-06, "loss": 0.0313, "step": 18940 }, { "epoch": 4.670118343195266, "grad_norm": 0.31640625, "learning_rate": 2.644153935741145e-06, "loss": 0.0322, "step": 18942 }, { "epoch": 4.670611439842209, "grad_norm": 0.31640625, "learning_rate": 2.63629580828042e-06, "loss": 0.0294, "step": 18944 }, { "epoch": 4.6711045364891515, "grad_norm": 0.349609375, "learning_rate": 2.6284492189622478e-06, "loss": 0.0352, "step": 18946 }, { "epoch": 4.671597633136095, "grad_norm": 0.28515625, "learning_rate": 2.6206141687165065e-06, "loss": 0.0354, "step": 18948 }, { "epoch": 4.672090729783037, "grad_norm": 0.29296875, "learning_rate": 2.6127906584716754e-06, "loss": 0.031, "step": 18950 }, { "epoch": 4.67258382642998, "grad_norm": 0.291015625, "learning_rate": 2.604978689154891e-06, "loss": 0.0339, "step": 18952 }, { "epoch": 4.673076923076923, "grad_norm": 0.3125, "learning_rate": 2.597178261691924e-06, "loss": 0.0325, "step": 18954 }, { "epoch": 4.673570019723866, "grad_norm": 0.287109375, "learning_rate": 2.5893893770071566e-06, "loss": 0.0301, "step": 18956 }, { "epoch": 4.674063116370808, "grad_norm": 0.267578125, "learning_rate": 2.5816120360236174e-06, "loss": 0.0341, "step": 18958 }, { "epoch": 4.674556213017752, "grad_norm": 0.302734375, "learning_rate": 2.5738462396629692e-06, "loss": 0.0336, "step": 18960 }, { "epoch": 4.675049309664694, "grad_norm": 0.3125, "learning_rate": 2.566091988845498e-06, "loss": 0.0331, "step": 18962 }, { "epoch": 4.675542406311637, "grad_norm": 0.265625, "learning_rate": 2.558349284490125e-06, "loss": 0.0297, "step": 18964 }, { "epoch": 4.67603550295858, "grad_norm": 0.2890625, "learning_rate": 2.5506181275144035e-06, "loss": 0.0327, "step": 18966 }, { "epoch": 4.676528599605523, "grad_norm": 0.2734375, "learning_rate": 2.5428985188345356e-06, "loss": 0.0292, "step": 18968 }, { "epoch": 4.677021696252465, "grad_norm": 0.29296875, "learning_rate": 2.535190459365311e-06, "loss": 0.0325, "step": 18970 }, { "epoch": 4.677514792899409, "grad_norm": 0.279296875, "learning_rate": 2.527493950020199e-06, "loss": 0.031, "step": 18972 }, { "epoch": 4.678007889546351, "grad_norm": 0.2890625, "learning_rate": 2.5198089917112588e-06, "loss": 0.0339, "step": 18974 }, { "epoch": 4.678500986193294, "grad_norm": 0.265625, "learning_rate": 2.512135585349218e-06, "loss": 0.0334, "step": 18976 }, { "epoch": 4.678994082840236, "grad_norm": 0.326171875, "learning_rate": 2.5044737318434153e-06, "loss": 0.0328, "step": 18978 }, { "epoch": 4.67948717948718, "grad_norm": 0.32421875, "learning_rate": 2.4968234321018246e-06, "loss": 0.0306, "step": 18980 }, { "epoch": 4.679980276134122, "grad_norm": 0.3046875, "learning_rate": 2.489184687031043e-06, "loss": 0.0348, "step": 18982 }, { "epoch": 4.680473372781065, "grad_norm": 0.28515625, "learning_rate": 2.4815574975363023e-06, "loss": 0.0372, "step": 18984 }, { "epoch": 4.680966469428008, "grad_norm": 0.267578125, "learning_rate": 2.4739418645214783e-06, "loss": 0.0306, "step": 18986 }, { "epoch": 4.681459566074951, "grad_norm": 0.357421875, "learning_rate": 2.4663377888890503e-06, "loss": 0.0339, "step": 18988 }, { "epoch": 4.681952662721893, "grad_norm": 0.294921875, "learning_rate": 2.4587452715401638e-06, "loss": 0.0307, "step": 18990 }, { "epoch": 4.682445759368837, "grad_norm": 0.318359375, "learning_rate": 2.451164313374554e-06, "loss": 0.033, "step": 18992 }, { "epoch": 4.682938856015779, "grad_norm": 0.283203125, "learning_rate": 2.4435949152906145e-06, "loss": 0.0319, "step": 18994 }, { "epoch": 4.683431952662722, "grad_norm": 0.326171875, "learning_rate": 2.4360370781853716e-06, "loss": 0.0316, "step": 18996 }, { "epoch": 4.683925049309664, "grad_norm": 0.287109375, "learning_rate": 2.428490802954453e-06, "loss": 0.0289, "step": 18998 }, { "epoch": 4.684418145956608, "grad_norm": 0.302734375, "learning_rate": 2.4209560904921545e-06, "loss": 0.0336, "step": 19000 }, { "epoch": 4.68491124260355, "grad_norm": 0.318359375, "learning_rate": 2.4134329416913517e-06, "loss": 0.0296, "step": 19002 }, { "epoch": 4.685404339250493, "grad_norm": 0.28515625, "learning_rate": 2.405921357443619e-06, "loss": 0.0319, "step": 19004 }, { "epoch": 4.685897435897436, "grad_norm": 0.29296875, "learning_rate": 2.3984213386391006e-06, "loss": 0.0317, "step": 19006 }, { "epoch": 4.686390532544379, "grad_norm": 0.279296875, "learning_rate": 2.390932886166575e-06, "loss": 0.0283, "step": 19008 }, { "epoch": 4.686883629191321, "grad_norm": 0.263671875, "learning_rate": 2.3834560009135087e-06, "loss": 0.03, "step": 19010 }, { "epoch": 4.687376725838265, "grad_norm": 0.28515625, "learning_rate": 2.375990683765894e-06, "loss": 0.032, "step": 19012 }, { "epoch": 4.687869822485207, "grad_norm": 0.32421875, "learning_rate": 2.3685369356084675e-06, "loss": 0.0317, "step": 19014 }, { "epoch": 4.68836291913215, "grad_norm": 0.314453125, "learning_rate": 2.3610947573245e-06, "loss": 0.0313, "step": 19016 }, { "epoch": 4.688856015779093, "grad_norm": 0.3046875, "learning_rate": 2.3536641497959754e-06, "loss": 0.0324, "step": 19018 }, { "epoch": 4.689349112426036, "grad_norm": 0.2734375, "learning_rate": 2.3462451139034226e-06, "loss": 0.0292, "step": 19020 }, { "epoch": 4.689842209072978, "grad_norm": 0.294921875, "learning_rate": 2.338837650526049e-06, "loss": 0.0343, "step": 19022 }, { "epoch": 4.6903353057199215, "grad_norm": 0.265625, "learning_rate": 2.331441760541697e-06, "loss": 0.0267, "step": 19024 }, { "epoch": 4.690828402366864, "grad_norm": 0.2734375, "learning_rate": 2.324057444826788e-06, "loss": 0.0289, "step": 19026 }, { "epoch": 4.691321499013807, "grad_norm": 0.2578125, "learning_rate": 2.3166847042564445e-06, "loss": 0.0299, "step": 19028 }, { "epoch": 4.69181459566075, "grad_norm": 0.259765625, "learning_rate": 2.3093235397043333e-06, "loss": 0.03, "step": 19030 }, { "epoch": 4.6923076923076925, "grad_norm": 0.314453125, "learning_rate": 2.3019739520428464e-06, "loss": 0.0311, "step": 19032 }, { "epoch": 4.692800788954635, "grad_norm": 0.2890625, "learning_rate": 2.2946359421429086e-06, "loss": 0.0295, "step": 19034 }, { "epoch": 4.693293885601578, "grad_norm": 0.34375, "learning_rate": 2.287309510874125e-06, "loss": 0.0348, "step": 19036 }, { "epoch": 4.693786982248521, "grad_norm": 0.25390625, "learning_rate": 2.279994659104734e-06, "loss": 0.0305, "step": 19038 }, { "epoch": 4.6942800788954635, "grad_norm": 0.275390625, "learning_rate": 2.2726913877015534e-06, "loss": 0.0313, "step": 19040 }, { "epoch": 4.694773175542406, "grad_norm": 0.2734375, "learning_rate": 2.2653996975301016e-06, "loss": 0.0334, "step": 19042 }, { "epoch": 4.695266272189349, "grad_norm": 0.28515625, "learning_rate": 2.258119589454455e-06, "loss": 0.0326, "step": 19044 }, { "epoch": 4.695759368836292, "grad_norm": 0.3359375, "learning_rate": 2.250851064337367e-06, "loss": 0.0332, "step": 19046 }, { "epoch": 4.6962524654832345, "grad_norm": 0.30078125, "learning_rate": 2.2435941230401824e-06, "loss": 0.0327, "step": 19048 }, { "epoch": 4.696745562130177, "grad_norm": 0.306640625, "learning_rate": 2.236348766422891e-06, "loss": 0.034, "step": 19050 }, { "epoch": 4.69723865877712, "grad_norm": 0.322265625, "learning_rate": 2.229114995344128e-06, "loss": 0.0324, "step": 19052 }, { "epoch": 4.697731755424063, "grad_norm": 0.279296875, "learning_rate": 2.221892810661108e-06, "loss": 0.0312, "step": 19054 }, { "epoch": 4.6982248520710055, "grad_norm": 0.30078125, "learning_rate": 2.214682213229724e-06, "loss": 0.032, "step": 19056 }, { "epoch": 4.698717948717949, "grad_norm": 0.2890625, "learning_rate": 2.207483203904448e-06, "loss": 0.0327, "step": 19058 }, { "epoch": 4.699211045364891, "grad_norm": 0.28125, "learning_rate": 2.200295783538431e-06, "loss": 0.0311, "step": 19060 }, { "epoch": 4.699704142011834, "grad_norm": 0.384765625, "learning_rate": 2.193119952983391e-06, "loss": 0.0325, "step": 19062 }, { "epoch": 4.700197238658777, "grad_norm": 0.294921875, "learning_rate": 2.1859557130897267e-06, "loss": 0.0328, "step": 19064 }, { "epoch": 4.70069033530572, "grad_norm": 0.314453125, "learning_rate": 2.178803064706436e-06, "loss": 0.0366, "step": 19066 }, { "epoch": 4.701183431952662, "grad_norm": 0.306640625, "learning_rate": 2.1716620086811523e-06, "loss": 0.0345, "step": 19068 }, { "epoch": 4.701676528599606, "grad_norm": 0.32421875, "learning_rate": 2.164532545860121e-06, "loss": 0.0316, "step": 19070 }, { "epoch": 4.702169625246548, "grad_norm": 0.298828125, "learning_rate": 2.1574146770882097e-06, "loss": 0.0304, "step": 19072 }, { "epoch": 4.702662721893491, "grad_norm": 0.357421875, "learning_rate": 2.150308403208967e-06, "loss": 0.0325, "step": 19074 }, { "epoch": 4.703155818540434, "grad_norm": 0.357421875, "learning_rate": 2.1432137250644857e-06, "loss": 0.0318, "step": 19076 }, { "epoch": 4.703648915187377, "grad_norm": 0.259765625, "learning_rate": 2.1361306434955486e-06, "loss": 0.032, "step": 19078 }, { "epoch": 4.704142011834319, "grad_norm": 0.326171875, "learning_rate": 2.1290591593415176e-06, "loss": 0.0359, "step": 19080 }, { "epoch": 4.704635108481263, "grad_norm": 0.287109375, "learning_rate": 2.121999273440434e-06, "loss": 0.033, "step": 19082 }, { "epoch": 4.705128205128205, "grad_norm": 0.28515625, "learning_rate": 2.1149509866289054e-06, "loss": 0.0299, "step": 19084 }, { "epoch": 4.705621301775148, "grad_norm": 0.27734375, "learning_rate": 2.1079142997422084e-06, "loss": 0.0278, "step": 19086 }, { "epoch": 4.706114398422091, "grad_norm": 0.34375, "learning_rate": 2.100889213614243e-06, "loss": 0.0298, "step": 19088 }, { "epoch": 4.706607495069034, "grad_norm": 0.3046875, "learning_rate": 2.0938757290774878e-06, "loss": 0.0324, "step": 19090 }, { "epoch": 4.707100591715976, "grad_norm": 0.2412109375, "learning_rate": 2.086873846963089e-06, "loss": 0.0303, "step": 19092 }, { "epoch": 4.707593688362919, "grad_norm": 0.271484375, "learning_rate": 2.0798835681008264e-06, "loss": 0.0277, "step": 19094 }, { "epoch": 4.708086785009862, "grad_norm": 0.2451171875, "learning_rate": 2.072904893319072e-06, "loss": 0.0288, "step": 19096 }, { "epoch": 4.708579881656805, "grad_norm": 0.28515625, "learning_rate": 2.0659378234448525e-06, "loss": 0.0325, "step": 19098 }, { "epoch": 4.709072978303747, "grad_norm": 0.326171875, "learning_rate": 2.0589823593037847e-06, "loss": 0.036, "step": 19100 }, { "epoch": 4.709566074950691, "grad_norm": 0.28515625, "learning_rate": 2.052038501720155e-06, "loss": 0.0319, "step": 19102 }, { "epoch": 4.710059171597633, "grad_norm": 0.318359375, "learning_rate": 2.045106251516826e-06, "loss": 0.0319, "step": 19104 }, { "epoch": 4.710552268244576, "grad_norm": 0.310546875, "learning_rate": 2.0381856095153086e-06, "loss": 0.0319, "step": 19106 }, { "epoch": 4.711045364891518, "grad_norm": 0.298828125, "learning_rate": 2.0312765765357455e-06, "loss": 0.0346, "step": 19108 }, { "epoch": 4.711538461538462, "grad_norm": 0.279296875, "learning_rate": 2.024379153396905e-06, "loss": 0.0307, "step": 19110 }, { "epoch": 4.712031558185404, "grad_norm": 0.314453125, "learning_rate": 2.0174933409161544e-06, "loss": 0.0304, "step": 19112 }, { "epoch": 4.712524654832347, "grad_norm": 0.26953125, "learning_rate": 2.0106191399095088e-06, "loss": 0.0286, "step": 19114 }, { "epoch": 4.71301775147929, "grad_norm": 0.28515625, "learning_rate": 2.0037565511915936e-06, "loss": 0.0321, "step": 19116 }, { "epoch": 4.713510848126233, "grad_norm": 0.32421875, "learning_rate": 1.9969055755756803e-06, "loss": 0.0325, "step": 19118 }, { "epoch": 4.714003944773175, "grad_norm": 0.296875, "learning_rate": 1.9900662138736204e-06, "loss": 0.0345, "step": 19120 }, { "epoch": 4.714497041420119, "grad_norm": 0.333984375, "learning_rate": 1.9832384668959427e-06, "loss": 0.0338, "step": 19122 }, { "epoch": 4.714990138067061, "grad_norm": 0.41796875, "learning_rate": 1.9764223354517553e-06, "loss": 0.0343, "step": 19124 }, { "epoch": 4.715483234714004, "grad_norm": 0.28125, "learning_rate": 1.9696178203488124e-06, "loss": 0.032, "step": 19126 }, { "epoch": 4.715976331360947, "grad_norm": 0.279296875, "learning_rate": 1.9628249223935023e-06, "loss": 0.0323, "step": 19128 }, { "epoch": 4.71646942800789, "grad_norm": 0.32421875, "learning_rate": 1.9560436423907923e-06, "loss": 0.0306, "step": 19130 }, { "epoch": 4.716962524654832, "grad_norm": 0.2578125, "learning_rate": 1.9492739811443286e-06, "loss": 0.026, "step": 19132 }, { "epoch": 4.7174556213017755, "grad_norm": 0.29296875, "learning_rate": 1.942515939456335e-06, "loss": 0.033, "step": 19134 }, { "epoch": 4.717948717948718, "grad_norm": 0.283203125, "learning_rate": 1.935769518127706e-06, "loss": 0.0349, "step": 19136 }, { "epoch": 4.718441814595661, "grad_norm": 0.298828125, "learning_rate": 1.9290347179578894e-06, "loss": 0.0336, "step": 19138 }, { "epoch": 4.718934911242604, "grad_norm": 0.318359375, "learning_rate": 1.9223115397450254e-06, "loss": 0.0329, "step": 19140 }, { "epoch": 4.7194280078895465, "grad_norm": 0.28125, "learning_rate": 1.915599984285832e-06, "loss": 0.0311, "step": 19142 }, { "epoch": 4.719921104536489, "grad_norm": 0.2890625, "learning_rate": 1.908900052375673e-06, "loss": 0.0326, "step": 19144 }, { "epoch": 4.720414201183432, "grad_norm": 0.30859375, "learning_rate": 1.9022117448085353e-06, "loss": 0.0365, "step": 19146 }, { "epoch": 4.720907297830375, "grad_norm": 0.32421875, "learning_rate": 1.8955350623770184e-06, "loss": 0.0325, "step": 19148 }, { "epoch": 4.7214003944773175, "grad_norm": 0.3125, "learning_rate": 1.888870005872334e-06, "loss": 0.0319, "step": 19150 }, { "epoch": 4.72189349112426, "grad_norm": 0.27734375, "learning_rate": 1.882216576084339e-06, "loss": 0.0316, "step": 19152 }, { "epoch": 4.7223865877712035, "grad_norm": 0.28125, "learning_rate": 1.8755747738015028e-06, "loss": 0.0305, "step": 19154 }, { "epoch": 4.722879684418146, "grad_norm": 0.291015625, "learning_rate": 1.8689445998109067e-06, "loss": 0.031, "step": 19156 }, { "epoch": 4.7233727810650885, "grad_norm": 0.294921875, "learning_rate": 1.8623260548982668e-06, "loss": 0.0322, "step": 19158 }, { "epoch": 4.723865877712032, "grad_norm": 0.30859375, "learning_rate": 1.8557191398479223e-06, "loss": 0.0301, "step": 19160 }, { "epoch": 4.7243589743589745, "grad_norm": 0.275390625, "learning_rate": 1.8491238554428248e-06, "loss": 0.0301, "step": 19162 }, { "epoch": 4.724852071005917, "grad_norm": 0.310546875, "learning_rate": 1.842540202464571e-06, "loss": 0.0321, "step": 19164 }, { "epoch": 4.7253451676528595, "grad_norm": 0.26171875, "learning_rate": 1.835968181693315e-06, "loss": 0.0333, "step": 19166 }, { "epoch": 4.725838264299803, "grad_norm": 0.28515625, "learning_rate": 1.8294077939079336e-06, "loss": 0.0344, "step": 19168 }, { "epoch": 4.726331360946745, "grad_norm": 0.259765625, "learning_rate": 1.822859039885838e-06, "loss": 0.0343, "step": 19170 }, { "epoch": 4.726824457593688, "grad_norm": 0.337890625, "learning_rate": 1.8163219204030856e-06, "loss": 0.0324, "step": 19172 }, { "epoch": 4.727317554240631, "grad_norm": 0.26953125, "learning_rate": 1.809796436234379e-06, "loss": 0.0283, "step": 19174 }, { "epoch": 4.727810650887574, "grad_norm": 0.27734375, "learning_rate": 1.8032825881530213e-06, "loss": 0.0315, "step": 19176 }, { "epoch": 4.728303747534516, "grad_norm": 0.2890625, "learning_rate": 1.7967803769309399e-06, "loss": 0.0333, "step": 19178 }, { "epoch": 4.72879684418146, "grad_norm": 0.3515625, "learning_rate": 1.7902898033386738e-06, "loss": 0.0332, "step": 19180 }, { "epoch": 4.729289940828402, "grad_norm": 0.28125, "learning_rate": 1.783810868145408e-06, "loss": 0.0309, "step": 19182 }, { "epoch": 4.729783037475345, "grad_norm": 0.34375, "learning_rate": 1.7773435721189058e-06, "loss": 0.0321, "step": 19184 }, { "epoch": 4.730276134122288, "grad_norm": 0.287109375, "learning_rate": 1.7708879160256097e-06, "loss": 0.0335, "step": 19186 }, { "epoch": 4.730769230769231, "grad_norm": 0.3125, "learning_rate": 1.7644439006305303e-06, "loss": 0.0328, "step": 19188 }, { "epoch": 4.731262327416173, "grad_norm": 0.283203125, "learning_rate": 1.758011526697334e-06, "loss": 0.0309, "step": 19190 }, { "epoch": 4.731755424063117, "grad_norm": 0.287109375, "learning_rate": 1.7515907949882782e-06, "loss": 0.0294, "step": 19192 }, { "epoch": 4.732248520710059, "grad_norm": 0.267578125, "learning_rate": 1.745181706264254e-06, "loss": 0.0312, "step": 19194 }, { "epoch": 4.732741617357002, "grad_norm": 0.28125, "learning_rate": 1.7387842612847983e-06, "loss": 0.0325, "step": 19196 }, { "epoch": 4.733234714003945, "grad_norm": 0.259765625, "learning_rate": 1.7323984608080267e-06, "loss": 0.0312, "step": 19198 }, { "epoch": 4.733727810650888, "grad_norm": 0.298828125, "learning_rate": 1.726024305590679e-06, "loss": 0.0327, "step": 19200 }, { "epoch": 4.73422090729783, "grad_norm": 0.296875, "learning_rate": 1.71966179638815e-06, "loss": 0.0309, "step": 19202 }, { "epoch": 4.734714003944773, "grad_norm": 0.31640625, "learning_rate": 1.7133109339544263e-06, "loss": 0.0315, "step": 19204 }, { "epoch": 4.735207100591716, "grad_norm": 0.3046875, "learning_rate": 1.7069717190421275e-06, "loss": 0.035, "step": 19206 }, { "epoch": 4.735700197238659, "grad_norm": 0.283203125, "learning_rate": 1.700644152402464e-06, "loss": 0.0318, "step": 19208 }, { "epoch": 4.736193293885601, "grad_norm": 0.28515625, "learning_rate": 1.694328234785314e-06, "loss": 0.0312, "step": 19210 }, { "epoch": 4.736686390532545, "grad_norm": 0.3359375, "learning_rate": 1.6880239669391229e-06, "loss": 0.0335, "step": 19212 }, { "epoch": 4.737179487179487, "grad_norm": 0.2578125, "learning_rate": 1.6817313496110155e-06, "loss": 0.0289, "step": 19214 }, { "epoch": 4.73767258382643, "grad_norm": 0.267578125, "learning_rate": 1.6754503835466727e-06, "loss": 0.029, "step": 19216 }, { "epoch": 4.738165680473373, "grad_norm": 0.279296875, "learning_rate": 1.669181069490422e-06, "loss": 0.0321, "step": 19218 }, { "epoch": 4.738658777120316, "grad_norm": 0.23046875, "learning_rate": 1.6629234081852463e-06, "loss": 0.028, "step": 19220 }, { "epoch": 4.739151873767258, "grad_norm": 0.28515625, "learning_rate": 1.6566774003726637e-06, "loss": 0.0319, "step": 19222 }, { "epoch": 4.739644970414201, "grad_norm": 0.27734375, "learning_rate": 1.6504430467929044e-06, "loss": 0.0347, "step": 19224 }, { "epoch": 4.740138067061144, "grad_norm": 0.29296875, "learning_rate": 1.644220348184755e-06, "loss": 0.0324, "step": 19226 }, { "epoch": 4.740631163708087, "grad_norm": 0.265625, "learning_rate": 1.6380093052856483e-06, "loss": 0.0317, "step": 19228 }, { "epoch": 4.741124260355029, "grad_norm": 0.37109375, "learning_rate": 1.6318099188316061e-06, "loss": 0.0334, "step": 19230 }, { "epoch": 4.741617357001973, "grad_norm": 0.271484375, "learning_rate": 1.6256221895573187e-06, "loss": 0.0331, "step": 19232 }, { "epoch": 4.742110453648915, "grad_norm": 0.310546875, "learning_rate": 1.6194461181960552e-06, "loss": 0.0348, "step": 19234 }, { "epoch": 4.742603550295858, "grad_norm": 0.314453125, "learning_rate": 1.6132817054797079e-06, "loss": 0.029, "step": 19236 }, { "epoch": 4.743096646942801, "grad_norm": 0.26171875, "learning_rate": 1.6071289521388033e-06, "loss": 0.0281, "step": 19238 }, { "epoch": 4.743589743589744, "grad_norm": 0.267578125, "learning_rate": 1.6009878589024696e-06, "loss": 0.0317, "step": 19240 }, { "epoch": 4.744082840236686, "grad_norm": 0.30078125, "learning_rate": 1.5948584264984689e-06, "loss": 0.0308, "step": 19242 }, { "epoch": 4.7445759368836296, "grad_norm": 0.28515625, "learning_rate": 1.5887406556531648e-06, "loss": 0.0333, "step": 19244 }, { "epoch": 4.745069033530572, "grad_norm": 0.287109375, "learning_rate": 1.582634547091555e-06, "loss": 0.0287, "step": 19246 }, { "epoch": 4.745562130177515, "grad_norm": 0.27734375, "learning_rate": 1.5765401015372382e-06, "loss": 0.0304, "step": 19248 }, { "epoch": 4.746055226824458, "grad_norm": 0.287109375, "learning_rate": 1.5704573197124484e-06, "loss": 0.0342, "step": 19250 }, { "epoch": 4.7465483234714005, "grad_norm": 0.2421875, "learning_rate": 1.5643862023380307e-06, "loss": 0.0275, "step": 19252 }, { "epoch": 4.747041420118343, "grad_norm": 0.28515625, "learning_rate": 1.5583267501334208e-06, "loss": 0.0297, "step": 19254 }, { "epoch": 4.7475345167652865, "grad_norm": 0.34375, "learning_rate": 1.5522789638167445e-06, "loss": 0.0318, "step": 19256 }, { "epoch": 4.748027613412229, "grad_norm": 0.2734375, "learning_rate": 1.5462428441046616e-06, "loss": 0.034, "step": 19258 }, { "epoch": 4.7485207100591715, "grad_norm": 0.298828125, "learning_rate": 1.5402183917125002e-06, "loss": 0.0338, "step": 19260 }, { "epoch": 4.749013806706114, "grad_norm": 0.3046875, "learning_rate": 1.534205607354189e-06, "loss": 0.0306, "step": 19262 }, { "epoch": 4.7495069033530575, "grad_norm": 0.2890625, "learning_rate": 1.5282044917422689e-06, "loss": 0.0359, "step": 19264 }, { "epoch": 4.75, "grad_norm": 0.283203125, "learning_rate": 1.5222150455879159e-06, "loss": 0.032, "step": 19266 }, { "epoch": 4.7504930966469425, "grad_norm": 0.3125, "learning_rate": 1.5162372696009175e-06, "loss": 0.0314, "step": 19268 }, { "epoch": 4.750986193293886, "grad_norm": 0.26171875, "learning_rate": 1.5102711644896627e-06, "loss": 0.0341, "step": 19270 }, { "epoch": 4.7514792899408285, "grad_norm": 0.28125, "learning_rate": 1.504316730961175e-06, "loss": 0.0315, "step": 19272 }, { "epoch": 4.751972386587771, "grad_norm": 0.287109375, "learning_rate": 1.4983739697210675e-06, "loss": 0.0346, "step": 19274 }, { "epoch": 4.7524654832347135, "grad_norm": 0.34765625, "learning_rate": 1.4924428814736213e-06, "loss": 0.0329, "step": 19276 }, { "epoch": 4.752958579881657, "grad_norm": 0.302734375, "learning_rate": 1.4865234669216854e-06, "loss": 0.0335, "step": 19278 }, { "epoch": 4.7534516765285995, "grad_norm": 0.306640625, "learning_rate": 1.480615726766743e-06, "loss": 0.0309, "step": 19280 }, { "epoch": 4.753944773175542, "grad_norm": 0.314453125, "learning_rate": 1.474719661708901e-06, "loss": 0.0323, "step": 19282 }, { "epoch": 4.754437869822485, "grad_norm": 0.267578125, "learning_rate": 1.468835272446889e-06, "loss": 0.0307, "step": 19284 }, { "epoch": 4.754930966469428, "grad_norm": 0.291015625, "learning_rate": 1.462962559678005e-06, "loss": 0.0314, "step": 19286 }, { "epoch": 4.7554240631163704, "grad_norm": 0.279296875, "learning_rate": 1.4571015240982256e-06, "loss": 0.0335, "step": 19288 }, { "epoch": 4.755917159763314, "grad_norm": 0.291015625, "learning_rate": 1.4512521664021061e-06, "loss": 0.0323, "step": 19290 }, { "epoch": 4.756410256410256, "grad_norm": 0.32421875, "learning_rate": 1.445414487282837e-06, "loss": 0.0324, "step": 19292 }, { "epoch": 4.756903353057199, "grad_norm": 0.275390625, "learning_rate": 1.4395884874321975e-06, "loss": 0.0262, "step": 19294 }, { "epoch": 4.757396449704142, "grad_norm": 0.283203125, "learning_rate": 1.4337741675406246e-06, "loss": 0.0313, "step": 19296 }, { "epoch": 4.757889546351085, "grad_norm": 0.3125, "learning_rate": 1.4279715282971228e-06, "loss": 0.0312, "step": 19298 }, { "epoch": 4.758382642998027, "grad_norm": 0.3203125, "learning_rate": 1.4221805703893753e-06, "loss": 0.0322, "step": 19300 }, { "epoch": 4.758875739644971, "grad_norm": 0.24609375, "learning_rate": 1.4164012945035998e-06, "loss": 0.0292, "step": 19302 }, { "epoch": 4.759368836291913, "grad_norm": 0.296875, "learning_rate": 1.4106337013246928e-06, "loss": 0.032, "step": 19304 }, { "epoch": 4.759861932938856, "grad_norm": 0.314453125, "learning_rate": 1.404877791536141e-06, "loss": 0.0329, "step": 19306 }, { "epoch": 4.760355029585799, "grad_norm": 0.283203125, "learning_rate": 1.3991335658200655e-06, "loss": 0.0297, "step": 19308 }, { "epoch": 4.760848126232742, "grad_norm": 0.291015625, "learning_rate": 1.3934010248571772e-06, "loss": 0.0306, "step": 19310 }, { "epoch": 4.761341222879684, "grad_norm": 0.33203125, "learning_rate": 1.3876801693268105e-06, "loss": 0.0312, "step": 19312 }, { "epoch": 4.761834319526627, "grad_norm": 0.26953125, "learning_rate": 1.381970999906934e-06, "loss": 0.03, "step": 19314 }, { "epoch": 4.76232741617357, "grad_norm": 0.302734375, "learning_rate": 1.3762735172740959e-06, "loss": 0.0311, "step": 19316 }, { "epoch": 4.762820512820513, "grad_norm": 0.279296875, "learning_rate": 1.370587722103489e-06, "loss": 0.0306, "step": 19318 }, { "epoch": 4.763313609467455, "grad_norm": 0.279296875, "learning_rate": 1.364913615068919e-06, "loss": 0.032, "step": 19320 }, { "epoch": 4.763806706114399, "grad_norm": 0.27734375, "learning_rate": 1.3592511968427923e-06, "loss": 0.03, "step": 19322 }, { "epoch": 4.764299802761341, "grad_norm": 0.35546875, "learning_rate": 1.353600468096139e-06, "loss": 0.0334, "step": 19324 }, { "epoch": 4.764792899408284, "grad_norm": 0.3515625, "learning_rate": 1.3479614294986009e-06, "loss": 0.0332, "step": 19326 }, { "epoch": 4.765285996055227, "grad_norm": 0.29296875, "learning_rate": 1.3423340817184437e-06, "loss": 0.0273, "step": 19328 }, { "epoch": 4.76577909270217, "grad_norm": 0.328125, "learning_rate": 1.3367184254225118e-06, "loss": 0.0342, "step": 19330 }, { "epoch": 4.766272189349112, "grad_norm": 0.2890625, "learning_rate": 1.3311144612763393e-06, "loss": 0.0342, "step": 19332 }, { "epoch": 4.766765285996055, "grad_norm": 0.248046875, "learning_rate": 1.325522189943984e-06, "loss": 0.0291, "step": 19334 }, { "epoch": 4.767258382642998, "grad_norm": 0.306640625, "learning_rate": 1.3199416120881713e-06, "loss": 0.034, "step": 19336 }, { "epoch": 4.767751479289941, "grad_norm": 0.3125, "learning_rate": 1.314372728370239e-06, "loss": 0.0325, "step": 19338 }, { "epoch": 4.768244575936883, "grad_norm": 0.25390625, "learning_rate": 1.3088155394501366e-06, "loss": 0.0315, "step": 19340 }, { "epoch": 4.768737672583827, "grad_norm": 0.25390625, "learning_rate": 1.3032700459864156e-06, "loss": 0.0325, "step": 19342 }, { "epoch": 4.769230769230769, "grad_norm": 0.2890625, "learning_rate": 1.2977362486362277e-06, "loss": 0.0279, "step": 19344 }, { "epoch": 4.769723865877712, "grad_norm": 0.28515625, "learning_rate": 1.292214148055393e-06, "loss": 0.0339, "step": 19346 }, { "epoch": 4.770216962524655, "grad_norm": 0.322265625, "learning_rate": 1.2867037448982877e-06, "loss": 0.0302, "step": 19348 }, { "epoch": 4.770710059171598, "grad_norm": 0.28515625, "learning_rate": 1.2812050398179453e-06, "loss": 0.0303, "step": 19350 }, { "epoch": 4.77120315581854, "grad_norm": 0.291015625, "learning_rate": 1.2757180334659668e-06, "loss": 0.0313, "step": 19352 }, { "epoch": 4.771696252465484, "grad_norm": 0.326171875, "learning_rate": 1.270242726492621e-06, "loss": 0.0327, "step": 19354 }, { "epoch": 4.772189349112426, "grad_norm": 0.29296875, "learning_rate": 1.264779119546744e-06, "loss": 0.0344, "step": 19356 }, { "epoch": 4.772682445759369, "grad_norm": 0.259765625, "learning_rate": 1.2593272132757961e-06, "loss": 0.0313, "step": 19358 }, { "epoch": 4.773175542406312, "grad_norm": 0.26953125, "learning_rate": 1.2538870083258824e-06, "loss": 0.0307, "step": 19360 }, { "epoch": 4.773668639053255, "grad_norm": 0.328125, "learning_rate": 1.2484585053416765e-06, "loss": 0.0297, "step": 19362 }, { "epoch": 4.774161735700197, "grad_norm": 0.33984375, "learning_rate": 1.2430417049665077e-06, "loss": 0.0363, "step": 19364 }, { "epoch": 4.7746548323471405, "grad_norm": 0.337890625, "learning_rate": 1.237636607842274e-06, "loss": 0.0363, "step": 19366 }, { "epoch": 4.775147928994083, "grad_norm": 0.283203125, "learning_rate": 1.2322432146095297e-06, "loss": 0.0328, "step": 19368 }, { "epoch": 4.7756410256410255, "grad_norm": 0.28515625, "learning_rate": 1.226861525907419e-06, "loss": 0.0332, "step": 19370 }, { "epoch": 4.776134122287968, "grad_norm": 0.322265625, "learning_rate": 1.221491542373676e-06, "loss": 0.0296, "step": 19372 }, { "epoch": 4.7766272189349115, "grad_norm": 0.310546875, "learning_rate": 1.2161332646447143e-06, "loss": 0.031, "step": 19374 }, { "epoch": 4.777120315581854, "grad_norm": 0.376953125, "learning_rate": 1.2107866933554813e-06, "loss": 0.0315, "step": 19376 }, { "epoch": 4.7776134122287965, "grad_norm": 0.302734375, "learning_rate": 1.2054518291396033e-06, "loss": 0.0312, "step": 19378 }, { "epoch": 4.77810650887574, "grad_norm": 0.30078125, "learning_rate": 1.200128672629275e-06, "loss": 0.0341, "step": 19380 }, { "epoch": 4.7785996055226825, "grad_norm": 0.28515625, "learning_rate": 1.1948172244553357e-06, "loss": 0.0325, "step": 19382 }, { "epoch": 4.779092702169625, "grad_norm": 0.31640625, "learning_rate": 1.1895174852472157e-06, "loss": 0.0306, "step": 19384 }, { "epoch": 4.779585798816568, "grad_norm": 0.3203125, "learning_rate": 1.1842294556329458e-06, "loss": 0.0338, "step": 19386 }, { "epoch": 4.780078895463511, "grad_norm": 0.291015625, "learning_rate": 1.1789531362392136e-06, "loss": 0.0307, "step": 19388 }, { "epoch": 4.7805719921104535, "grad_norm": 0.26953125, "learning_rate": 1.1736885276912747e-06, "loss": 0.0308, "step": 19390 }, { "epoch": 4.781065088757396, "grad_norm": 0.30859375, "learning_rate": 1.1684356306130295e-06, "loss": 0.0309, "step": 19392 }, { "epoch": 4.781558185404339, "grad_norm": 0.3125, "learning_rate": 1.1631944456269583e-06, "loss": 0.0314, "step": 19394 }, { "epoch": 4.782051282051282, "grad_norm": 0.30078125, "learning_rate": 1.1579649733541752e-06, "loss": 0.034, "step": 19396 }, { "epoch": 4.7825443786982245, "grad_norm": 0.369140625, "learning_rate": 1.152747214414418e-06, "loss": 0.0338, "step": 19398 }, { "epoch": 4.783037475345168, "grad_norm": 0.302734375, "learning_rate": 1.147541169426003e-06, "loss": 0.0299, "step": 19400 }, { "epoch": 4.78353057199211, "grad_norm": 0.3359375, "learning_rate": 1.1423468390058812e-06, "loss": 0.0345, "step": 19402 }, { "epoch": 4.784023668639053, "grad_norm": 0.345703125, "learning_rate": 1.1371642237696044e-06, "loss": 0.0295, "step": 19404 }, { "epoch": 4.784516765285996, "grad_norm": 0.33984375, "learning_rate": 1.1319933243313596e-06, "loss": 0.0312, "step": 19406 }, { "epoch": 4.785009861932939, "grad_norm": 0.318359375, "learning_rate": 1.1268341413039008e-06, "loss": 0.0364, "step": 19408 }, { "epoch": 4.785502958579881, "grad_norm": 0.2578125, "learning_rate": 1.1216866752986389e-06, "loss": 0.0285, "step": 19410 }, { "epoch": 4.785996055226825, "grad_norm": 0.267578125, "learning_rate": 1.1165509269255748e-06, "loss": 0.0295, "step": 19412 }, { "epoch": 4.786489151873767, "grad_norm": 0.26953125, "learning_rate": 1.1114268967933105e-06, "loss": 0.03, "step": 19414 }, { "epoch": 4.78698224852071, "grad_norm": 0.279296875, "learning_rate": 1.1063145855090939e-06, "loss": 0.0325, "step": 19416 }, { "epoch": 4.787475345167653, "grad_norm": 0.271484375, "learning_rate": 1.1012139936787402e-06, "loss": 0.0323, "step": 19418 }, { "epoch": 4.787968441814596, "grad_norm": 0.3125, "learning_rate": 1.0961251219067214e-06, "loss": 0.0329, "step": 19420 }, { "epoch": 4.788461538461538, "grad_norm": 0.287109375, "learning_rate": 1.0910479707960663e-06, "loss": 0.035, "step": 19422 }, { "epoch": 4.788954635108482, "grad_norm": 0.322265625, "learning_rate": 1.0859825409484713e-06, "loss": 0.0304, "step": 19424 }, { "epoch": 4.789447731755424, "grad_norm": 0.259765625, "learning_rate": 1.080928832964212e-06, "loss": 0.0346, "step": 19426 }, { "epoch": 4.789940828402367, "grad_norm": 0.2890625, "learning_rate": 1.0758868474421868e-06, "loss": 0.0326, "step": 19428 }, { "epoch": 4.790433925049309, "grad_norm": 0.296875, "learning_rate": 1.0708565849798846e-06, "loss": 0.0295, "step": 19430 }, { "epoch": 4.790927021696253, "grad_norm": 0.318359375, "learning_rate": 1.0658380461734175e-06, "loss": 0.0342, "step": 19432 }, { "epoch": 4.791420118343195, "grad_norm": 0.275390625, "learning_rate": 1.0608312316175317e-06, "loss": 0.0305, "step": 19434 }, { "epoch": 4.791913214990138, "grad_norm": 0.287109375, "learning_rate": 1.055836141905553e-06, "loss": 0.0325, "step": 19436 }, { "epoch": 4.792406311637081, "grad_norm": 0.2890625, "learning_rate": 1.0508527776294186e-06, "loss": 0.0325, "step": 19438 }, { "epoch": 4.792899408284024, "grad_norm": 0.2890625, "learning_rate": 1.0458811393796896e-06, "loss": 0.0299, "step": 19440 }, { "epoch": 4.793392504930966, "grad_norm": 0.29296875, "learning_rate": 1.0409212277455394e-06, "loss": 0.0302, "step": 19442 }, { "epoch": 4.793885601577909, "grad_norm": 0.33984375, "learning_rate": 1.0359730433147308e-06, "loss": 0.0317, "step": 19444 }, { "epoch": 4.794378698224852, "grad_norm": 0.337890625, "learning_rate": 1.0310365866736727e-06, "loss": 0.0302, "step": 19446 }, { "epoch": 4.794871794871795, "grad_norm": 0.26953125, "learning_rate": 1.0261118584073526e-06, "loss": 0.0314, "step": 19448 }, { "epoch": 4.795364891518737, "grad_norm": 0.302734375, "learning_rate": 1.0211988590993704e-06, "loss": 0.0323, "step": 19450 }, { "epoch": 4.795857988165681, "grad_norm": 0.298828125, "learning_rate": 1.0162975893319492e-06, "loss": 0.0314, "step": 19452 }, { "epoch": 4.796351084812623, "grad_norm": 0.3046875, "learning_rate": 1.0114080496859136e-06, "loss": 0.0314, "step": 19454 }, { "epoch": 4.796844181459566, "grad_norm": 0.291015625, "learning_rate": 1.0065302407407106e-06, "loss": 0.0333, "step": 19456 }, { "epoch": 4.797337278106509, "grad_norm": 0.3046875, "learning_rate": 1.0016641630743894e-06, "loss": 0.0303, "step": 19458 }, { "epoch": 4.797830374753452, "grad_norm": 0.29296875, "learning_rate": 9.968098172635887e-07, "loss": 0.0309, "step": 19460 }, { "epoch": 4.798323471400394, "grad_norm": 0.287109375, "learning_rate": 9.919672038835925e-07, "loss": 0.0317, "step": 19462 }, { "epoch": 4.798816568047338, "grad_norm": 0.267578125, "learning_rate": 9.871363235082865e-07, "loss": 0.0276, "step": 19464 }, { "epoch": 4.79930966469428, "grad_norm": 0.341796875, "learning_rate": 9.823171767101236e-07, "loss": 0.0317, "step": 19466 }, { "epoch": 4.799802761341223, "grad_norm": 0.2578125, "learning_rate": 9.77509764060225e-07, "loss": 0.0288, "step": 19468 }, { "epoch": 4.800295857988166, "grad_norm": 0.345703125, "learning_rate": 9.727140861282902e-07, "loss": 0.0328, "step": 19470 }, { "epoch": 4.800788954635109, "grad_norm": 0.37890625, "learning_rate": 9.679301434826426e-07, "loss": 0.0329, "step": 19472 }, { "epoch": 4.801282051282051, "grad_norm": 0.357421875, "learning_rate": 9.631579366901843e-07, "loss": 0.0338, "step": 19474 }, { "epoch": 4.8017751479289945, "grad_norm": 0.294921875, "learning_rate": 9.58397466316463e-07, "loss": 0.0315, "step": 19476 }, { "epoch": 4.802268244575937, "grad_norm": 0.27734375, "learning_rate": 9.536487329256271e-07, "loss": 0.03, "step": 19478 }, { "epoch": 4.80276134122288, "grad_norm": 0.287109375, "learning_rate": 9.489117370804157e-07, "loss": 0.031, "step": 19480 }, { "epoch": 4.803254437869823, "grad_norm": 0.279296875, "learning_rate": 9.441864793422017e-07, "loss": 0.0356, "step": 19482 }, { "epoch": 4.8037475345167655, "grad_norm": 0.30859375, "learning_rate": 9.394729602709374e-07, "loss": 0.0302, "step": 19484 }, { "epoch": 4.804240631163708, "grad_norm": 0.283203125, "learning_rate": 9.34771180425209e-07, "loss": 0.0339, "step": 19486 }, { "epoch": 4.804733727810651, "grad_norm": 0.287109375, "learning_rate": 9.300811403622156e-07, "loss": 0.0303, "step": 19488 }, { "epoch": 4.805226824457594, "grad_norm": 0.26953125, "learning_rate": 9.254028406377346e-07, "loss": 0.0316, "step": 19490 }, { "epoch": 4.8057199211045365, "grad_norm": 0.302734375, "learning_rate": 9.207362818061783e-07, "loss": 0.0358, "step": 19492 }, { "epoch": 4.806213017751479, "grad_norm": 0.318359375, "learning_rate": 9.160814644205595e-07, "loss": 0.0354, "step": 19494 }, { "epoch": 4.806706114398422, "grad_norm": 0.30859375, "learning_rate": 9.11438389032504e-07, "loss": 0.0346, "step": 19496 }, { "epoch": 4.807199211045365, "grad_norm": 0.296875, "learning_rate": 9.068070561922382e-07, "loss": 0.0313, "step": 19498 }, { "epoch": 4.8076923076923075, "grad_norm": 0.337890625, "learning_rate": 9.021874664486008e-07, "loss": 0.0333, "step": 19500 }, { "epoch": 4.80818540433925, "grad_norm": 0.361328125, "learning_rate": 8.975796203490428e-07, "loss": 0.0293, "step": 19502 }, { "epoch": 4.808678500986193, "grad_norm": 0.26171875, "learning_rate": 8.929835184396162e-07, "loss": 0.0304, "step": 19504 }, { "epoch": 4.809171597633136, "grad_norm": 0.3046875, "learning_rate": 8.883991612649967e-07, "loss": 0.0322, "step": 19506 }, { "epoch": 4.8096646942800785, "grad_norm": 0.31640625, "learning_rate": 8.838265493684272e-07, "loss": 0.034, "step": 19508 }, { "epoch": 4.810157790927022, "grad_norm": 0.29296875, "learning_rate": 8.792656832918078e-07, "loss": 0.0317, "step": 19510 }, { "epoch": 4.810650887573964, "grad_norm": 0.283203125, "learning_rate": 8.747165635756283e-07, "loss": 0.0326, "step": 19512 }, { "epoch": 4.811143984220907, "grad_norm": 0.306640625, "learning_rate": 8.701791907589796e-07, "loss": 0.0305, "step": 19514 }, { "epoch": 4.81163708086785, "grad_norm": 0.37890625, "learning_rate": 8.65653565379565e-07, "loss": 0.0316, "step": 19516 }, { "epoch": 4.812130177514793, "grad_norm": 0.31640625, "learning_rate": 8.611396879736999e-07, "loss": 0.0307, "step": 19518 }, { "epoch": 4.812623274161735, "grad_norm": 0.2734375, "learning_rate": 8.56637559076312e-07, "loss": 0.0326, "step": 19520 }, { "epoch": 4.813116370808679, "grad_norm": 0.310546875, "learning_rate": 8.521471792209079e-07, "loss": 0.0312, "step": 19522 }, { "epoch": 4.813609467455621, "grad_norm": 0.28515625, "learning_rate": 8.476685489396397e-07, "loss": 0.0308, "step": 19524 }, { "epoch": 4.814102564102564, "grad_norm": 0.345703125, "learning_rate": 8.432016687632271e-07, "loss": 0.0327, "step": 19526 }, { "epoch": 4.814595660749507, "grad_norm": 0.404296875, "learning_rate": 8.387465392210581e-07, "loss": 0.0328, "step": 19528 }, { "epoch": 4.81508875739645, "grad_norm": 0.341796875, "learning_rate": 8.343031608410656e-07, "loss": 0.0337, "step": 19530 }, { "epoch": 4.815581854043392, "grad_norm": 0.25390625, "learning_rate": 8.298715341498175e-07, "loss": 0.0314, "step": 19532 }, { "epoch": 4.816074950690336, "grad_norm": 0.29296875, "learning_rate": 8.254516596724826e-07, "loss": 0.0349, "step": 19534 }, { "epoch": 4.816568047337278, "grad_norm": 0.2890625, "learning_rate": 8.210435379328418e-07, "loss": 0.0317, "step": 19536 }, { "epoch": 4.817061143984221, "grad_norm": 0.2578125, "learning_rate": 8.166471694532996e-07, "loss": 0.0322, "step": 19538 }, { "epoch": 4.817554240631163, "grad_norm": 0.3203125, "learning_rate": 8.12262554754828e-07, "loss": 0.0329, "step": 19540 }, { "epoch": 4.818047337278107, "grad_norm": 0.30859375, "learning_rate": 8.078896943570446e-07, "loss": 0.0332, "step": 19542 }, { "epoch": 4.818540433925049, "grad_norm": 0.416015625, "learning_rate": 8.035285887781463e-07, "loss": 0.0325, "step": 19544 }, { "epoch": 4.819033530571992, "grad_norm": 0.3203125, "learning_rate": 7.991792385349528e-07, "loss": 0.0343, "step": 19546 }, { "epoch": 4.819526627218935, "grad_norm": 0.283203125, "learning_rate": 7.948416441428852e-07, "loss": 0.03, "step": 19548 }, { "epoch": 4.820019723865878, "grad_norm": 0.3359375, "learning_rate": 7.905158061159656e-07, "loss": 0.0308, "step": 19550 }, { "epoch": 4.82051282051282, "grad_norm": 0.2734375, "learning_rate": 7.862017249668507e-07, "loss": 0.0307, "step": 19552 }, { "epoch": 4.821005917159764, "grad_norm": 0.291015625, "learning_rate": 7.818994012067649e-07, "loss": 0.0342, "step": 19554 }, { "epoch": 4.821499013806706, "grad_norm": 0.26953125, "learning_rate": 7.776088353455668e-07, "loss": 0.0278, "step": 19556 }, { "epoch": 4.821992110453649, "grad_norm": 0.291015625, "learning_rate": 7.733300278917055e-07, "loss": 0.0338, "step": 19558 }, { "epoch": 4.822485207100591, "grad_norm": 0.3359375, "learning_rate": 7.690629793522419e-07, "loss": 0.0335, "step": 19560 }, { "epoch": 4.822978303747535, "grad_norm": 0.28515625, "learning_rate": 7.648076902328494e-07, "loss": 0.0314, "step": 19562 }, { "epoch": 4.823471400394477, "grad_norm": 0.275390625, "learning_rate": 7.605641610378133e-07, "loss": 0.0325, "step": 19564 }, { "epoch": 4.82396449704142, "grad_norm": 0.294921875, "learning_rate": 7.563323922699983e-07, "loss": 0.0336, "step": 19566 }, { "epoch": 4.824457593688363, "grad_norm": 0.296875, "learning_rate": 7.52112384430903e-07, "loss": 0.0357, "step": 19568 }, { "epoch": 4.824950690335306, "grad_norm": 0.302734375, "learning_rate": 7.479041380206275e-07, "loss": 0.0345, "step": 19570 }, { "epoch": 4.825443786982248, "grad_norm": 0.361328125, "learning_rate": 7.437076535378507e-07, "loss": 0.032, "step": 19572 }, { "epoch": 4.825936883629192, "grad_norm": 0.28515625, "learning_rate": 7.395229314798968e-07, "loss": 0.0324, "step": 19574 }, { "epoch": 4.826429980276134, "grad_norm": 0.271484375, "learning_rate": 7.353499723426804e-07, "loss": 0.0326, "step": 19576 }, { "epoch": 4.826923076923077, "grad_norm": 0.283203125, "learning_rate": 7.31188776620706e-07, "loss": 0.0308, "step": 19578 }, { "epoch": 4.82741617357002, "grad_norm": 0.28515625, "learning_rate": 7.270393448071233e-07, "loss": 0.0307, "step": 19580 }, { "epoch": 4.827909270216963, "grad_norm": 0.287109375, "learning_rate": 7.229016773936281e-07, "loss": 0.0301, "step": 19582 }, { "epoch": 4.828402366863905, "grad_norm": 0.255859375, "learning_rate": 7.187757748705948e-07, "loss": 0.0304, "step": 19584 }, { "epoch": 4.8288954635108485, "grad_norm": 0.345703125, "learning_rate": 7.146616377269432e-07, "loss": 0.031, "step": 19586 }, { "epoch": 4.829388560157791, "grad_norm": 0.267578125, "learning_rate": 7.105592664502169e-07, "loss": 0.0312, "step": 19588 }, { "epoch": 4.829881656804734, "grad_norm": 0.296875, "learning_rate": 7.064686615265826e-07, "loss": 0.0305, "step": 19590 }, { "epoch": 4.830374753451677, "grad_norm": 0.28125, "learning_rate": 7.023898234407967e-07, "loss": 0.0308, "step": 19592 }, { "epoch": 4.8308678500986195, "grad_norm": 0.314453125, "learning_rate": 6.98322752676217e-07, "loss": 0.0308, "step": 19594 }, { "epoch": 4.831360946745562, "grad_norm": 0.271484375, "learning_rate": 6.94267449714836e-07, "loss": 0.0337, "step": 19596 }, { "epoch": 4.831854043392505, "grad_norm": 0.296875, "learning_rate": 6.902239150372025e-07, "loss": 0.0309, "step": 19598 }, { "epoch": 4.832347140039448, "grad_norm": 0.333984375, "learning_rate": 6.86192149122522e-07, "loss": 0.0336, "step": 19600 }, { "epoch": 4.8328402366863905, "grad_norm": 0.298828125, "learning_rate": 6.821721524485569e-07, "loss": 0.0291, "step": 19602 }, { "epoch": 4.833333333333333, "grad_norm": 0.28515625, "learning_rate": 6.781639254917149e-07, "loss": 0.0342, "step": 19604 }, { "epoch": 4.8338264299802765, "grad_norm": 0.298828125, "learning_rate": 6.741674687269827e-07, "loss": 0.035, "step": 19606 }, { "epoch": 4.834319526627219, "grad_norm": 0.302734375, "learning_rate": 6.701827826279816e-07, "loss": 0.0316, "step": 19608 }, { "epoch": 4.8348126232741615, "grad_norm": 0.296875, "learning_rate": 6.662098676669005e-07, "loss": 0.0328, "step": 19610 }, { "epoch": 4.835305719921105, "grad_norm": 0.29296875, "learning_rate": 6.622487243145625e-07, "loss": 0.0344, "step": 19612 }, { "epoch": 4.835798816568047, "grad_norm": 0.314453125, "learning_rate": 6.582993530403813e-07, "loss": 0.0314, "step": 19614 }, { "epoch": 4.83629191321499, "grad_norm": 0.30859375, "learning_rate": 6.543617543123826e-07, "loss": 0.0345, "step": 19616 }, { "epoch": 4.8367850098619325, "grad_norm": 0.330078125, "learning_rate": 6.504359285971817e-07, "loss": 0.0338, "step": 19618 }, { "epoch": 4.837278106508876, "grad_norm": 0.275390625, "learning_rate": 6.465218763600289e-07, "loss": 0.0332, "step": 19620 }, { "epoch": 4.837771203155818, "grad_norm": 0.34375, "learning_rate": 6.426195980647532e-07, "loss": 0.0323, "step": 19622 }, { "epoch": 4.838264299802761, "grad_norm": 0.3125, "learning_rate": 6.387290941737845e-07, "loss": 0.0366, "step": 19624 }, { "epoch": 4.838757396449704, "grad_norm": 0.318359375, "learning_rate": 6.348503651481985e-07, "loss": 0.033, "step": 19626 }, { "epoch": 4.839250493096647, "grad_norm": 0.291015625, "learning_rate": 6.309834114476165e-07, "loss": 0.0326, "step": 19628 }, { "epoch": 4.839743589743589, "grad_norm": 0.337890625, "learning_rate": 6.271282335303052e-07, "loss": 0.0321, "step": 19630 }, { "epoch": 4.840236686390533, "grad_norm": 0.341796875, "learning_rate": 6.232848318531326e-07, "loss": 0.0315, "step": 19632 }, { "epoch": 4.840729783037475, "grad_norm": 0.3125, "learning_rate": 6.194532068715453e-07, "loss": 0.0312, "step": 19634 }, { "epoch": 4.841222879684418, "grad_norm": 0.2890625, "learning_rate": 6.156333590396246e-07, "loss": 0.036, "step": 19636 }, { "epoch": 4.841715976331361, "grad_norm": 0.298828125, "learning_rate": 6.118252888100528e-07, "loss": 0.0316, "step": 19638 }, { "epoch": 4.842209072978304, "grad_norm": 0.314453125, "learning_rate": 6.080289966340802e-07, "loss": 0.033, "step": 19640 }, { "epoch": 4.842702169625246, "grad_norm": 0.314453125, "learning_rate": 6.042444829616245e-07, "loss": 0.0351, "step": 19642 }, { "epoch": 4.84319526627219, "grad_norm": 0.341796875, "learning_rate": 6.004717482411381e-07, "loss": 0.0332, "step": 19644 }, { "epoch": 4.843688362919132, "grad_norm": 0.298828125, "learning_rate": 5.967107929197302e-07, "loss": 0.0334, "step": 19646 }, { "epoch": 4.844181459566075, "grad_norm": 0.2890625, "learning_rate": 5.929616174430885e-07, "loss": 0.0297, "step": 19648 }, { "epoch": 4.844674556213018, "grad_norm": 0.275390625, "learning_rate": 5.892242222555133e-07, "loss": 0.0327, "step": 19650 }, { "epoch": 4.845167652859961, "grad_norm": 0.267578125, "learning_rate": 5.854986077999169e-07, "loss": 0.0322, "step": 19652 }, { "epoch": 4.845660749506903, "grad_norm": 0.283203125, "learning_rate": 5.817847745177796e-07, "loss": 0.0327, "step": 19654 }, { "epoch": 4.846153846153846, "grad_norm": 0.275390625, "learning_rate": 5.78082722849238e-07, "loss": 0.0292, "step": 19656 }, { "epoch": 4.846646942800789, "grad_norm": 0.33203125, "learning_rate": 5.743924532329858e-07, "loss": 0.0345, "step": 19658 }, { "epoch": 4.847140039447732, "grad_norm": 0.287109375, "learning_rate": 5.70713966106351e-07, "loss": 0.0315, "step": 19660 }, { "epoch": 4.847633136094674, "grad_norm": 0.287109375, "learning_rate": 5.670472619052514e-07, "loss": 0.0309, "step": 19662 }, { "epoch": 4.848126232741618, "grad_norm": 0.314453125, "learning_rate": 5.633923410642283e-07, "loss": 0.0326, "step": 19664 }, { "epoch": 4.84861932938856, "grad_norm": 0.3515625, "learning_rate": 5.597492040163799e-07, "loss": 0.0292, "step": 19666 }, { "epoch": 4.849112426035503, "grad_norm": 0.30078125, "learning_rate": 5.561178511934607e-07, "loss": 0.0313, "step": 19668 }, { "epoch": 4.849605522682445, "grad_norm": 0.28125, "learning_rate": 5.52498283025793e-07, "loss": 0.03, "step": 19670 }, { "epoch": 4.850098619329389, "grad_norm": 0.298828125, "learning_rate": 5.488904999423339e-07, "loss": 0.0297, "step": 19672 }, { "epoch": 4.850591715976331, "grad_norm": 0.2578125, "learning_rate": 5.45294502370608e-07, "loss": 0.0298, "step": 19674 }, { "epoch": 4.851084812623274, "grad_norm": 0.330078125, "learning_rate": 5.417102907367632e-07, "loss": 0.0323, "step": 19676 }, { "epoch": 4.851577909270217, "grad_norm": 0.337890625, "learning_rate": 5.3813786546556e-07, "loss": 0.0337, "step": 19678 }, { "epoch": 4.85207100591716, "grad_norm": 0.27734375, "learning_rate": 5.345772269803484e-07, "loss": 0.0327, "step": 19680 }, { "epoch": 4.852564102564102, "grad_norm": 0.33984375, "learning_rate": 5.310283757030688e-07, "loss": 0.0336, "step": 19682 }, { "epoch": 4.853057199211046, "grad_norm": 0.2275390625, "learning_rate": 5.274913120542957e-07, "loss": 0.0288, "step": 19684 }, { "epoch": 4.853550295857988, "grad_norm": 0.2451171875, "learning_rate": 5.239660364531829e-07, "loss": 0.0317, "step": 19686 }, { "epoch": 4.854043392504931, "grad_norm": 0.3203125, "learning_rate": 5.204525493174961e-07, "loss": 0.0337, "step": 19688 }, { "epoch": 4.854536489151874, "grad_norm": 0.369140625, "learning_rate": 5.169508510636023e-07, "loss": 0.0346, "step": 19690 }, { "epoch": 4.855029585798817, "grad_norm": 0.287109375, "learning_rate": 5.134609421064918e-07, "loss": 0.0334, "step": 19692 }, { "epoch": 4.855522682445759, "grad_norm": 0.33984375, "learning_rate": 5.099828228597004e-07, "loss": 0.0349, "step": 19694 }, { "epoch": 4.8560157790927025, "grad_norm": 0.265625, "learning_rate": 5.065164937354428e-07, "loss": 0.0361, "step": 19696 }, { "epoch": 4.856508875739645, "grad_norm": 0.306640625, "learning_rate": 5.030619551444682e-07, "loss": 0.0324, "step": 19698 }, { "epoch": 4.857001972386588, "grad_norm": 0.275390625, "learning_rate": 4.996192074961714e-07, "loss": 0.0322, "step": 19700 }, { "epoch": 4.857495069033531, "grad_norm": 0.328125, "learning_rate": 4.961882511985483e-07, "loss": 0.0346, "step": 19702 }, { "epoch": 4.8579881656804735, "grad_norm": 0.275390625, "learning_rate": 4.927690866581736e-07, "loss": 0.0338, "step": 19704 }, { "epoch": 4.858481262327416, "grad_norm": 0.365234375, "learning_rate": 4.893617142802342e-07, "loss": 0.0295, "step": 19706 }, { "epoch": 4.858974358974359, "grad_norm": 0.279296875, "learning_rate": 4.859661344685296e-07, "loss": 0.0298, "step": 19708 }, { "epoch": 4.859467455621302, "grad_norm": 0.248046875, "learning_rate": 4.825823476254488e-07, "loss": 0.0298, "step": 19710 }, { "epoch": 4.8599605522682445, "grad_norm": 0.296875, "learning_rate": 4.792103541520044e-07, "loss": 0.0309, "step": 19712 }, { "epoch": 4.860453648915187, "grad_norm": 0.33984375, "learning_rate": 4.758501544477767e-07, "loss": 0.0328, "step": 19714 }, { "epoch": 4.8609467455621305, "grad_norm": 0.265625, "learning_rate": 4.7250174891098067e-07, "loss": 0.0316, "step": 19716 }, { "epoch": 4.861439842209073, "grad_norm": 0.326171875, "learning_rate": 4.6916513793840986e-07, "loss": 0.03, "step": 19718 }, { "epoch": 4.8619329388560155, "grad_norm": 0.328125, "learning_rate": 4.6584032192549254e-07, "loss": 0.0314, "step": 19720 }, { "epoch": 4.862426035502959, "grad_norm": 0.318359375, "learning_rate": 4.625273012662246e-07, "loss": 0.0303, "step": 19722 }, { "epoch": 4.8629191321499015, "grad_norm": 0.328125, "learning_rate": 4.5922607635321414e-07, "loss": 0.0311, "step": 19724 }, { "epoch": 4.863412228796844, "grad_norm": 0.291015625, "learning_rate": 4.559366475776705e-07, "loss": 0.029, "step": 19726 }, { "epoch": 4.8639053254437865, "grad_norm": 0.251953125, "learning_rate": 4.5265901532942636e-07, "loss": 0.0306, "step": 19728 }, { "epoch": 4.86439842209073, "grad_norm": 0.3984375, "learning_rate": 4.4939317999688204e-07, "loss": 0.0351, "step": 19730 }, { "epoch": 4.8648915187376724, "grad_norm": 0.2890625, "learning_rate": 4.461391419670724e-07, "loss": 0.0308, "step": 19732 }, { "epoch": 4.865384615384615, "grad_norm": 0.25390625, "learning_rate": 4.4289690162561124e-07, "loss": 0.0323, "step": 19734 }, { "epoch": 4.865877712031558, "grad_norm": 0.287109375, "learning_rate": 4.3966645935671346e-07, "loss": 0.0337, "step": 19736 }, { "epoch": 4.866370808678501, "grad_norm": 0.3203125, "learning_rate": 4.364478155432283e-07, "loss": 0.0301, "step": 19738 }, { "epoch": 4.866863905325443, "grad_norm": 0.306640625, "learning_rate": 4.3324097056656187e-07, "loss": 0.0328, "step": 19740 }, { "epoch": 4.867357001972387, "grad_norm": 0.294921875, "learning_rate": 4.3004592480674346e-07, "loss": 0.03, "step": 19742 }, { "epoch": 4.867850098619329, "grad_norm": 0.328125, "learning_rate": 4.268626786424146e-07, "loss": 0.031, "step": 19744 }, { "epoch": 4.868343195266272, "grad_norm": 0.294921875, "learning_rate": 4.236912324508069e-07, "loss": 0.0327, "step": 19746 }, { "epoch": 4.868836291913215, "grad_norm": 0.30859375, "learning_rate": 4.20531586607753e-07, "loss": 0.0259, "step": 19748 }, { "epoch": 4.869329388560158, "grad_norm": 0.345703125, "learning_rate": 4.173837414876869e-07, "loss": 0.0359, "step": 19750 }, { "epoch": 4.8698224852071, "grad_norm": 0.3046875, "learning_rate": 4.1424769746364334e-07, "loss": 0.0326, "step": 19752 }, { "epoch": 4.870315581854044, "grad_norm": 0.25390625, "learning_rate": 4.1112345490725845e-07, "loss": 0.0273, "step": 19754 }, { "epoch": 4.870808678500986, "grad_norm": 0.2734375, "learning_rate": 4.080110141887805e-07, "loss": 0.0319, "step": 19756 }, { "epoch": 4.871301775147929, "grad_norm": 0.400390625, "learning_rate": 4.049103756770478e-07, "loss": 0.0326, "step": 19758 }, { "epoch": 4.871794871794872, "grad_norm": 0.294921875, "learning_rate": 4.0182153973949974e-07, "loss": 0.0281, "step": 19760 }, { "epoch": 4.872287968441815, "grad_norm": 0.328125, "learning_rate": 3.987445067421991e-07, "loss": 0.029, "step": 19762 }, { "epoch": 4.872781065088757, "grad_norm": 0.265625, "learning_rate": 3.9567927704975416e-07, "loss": 0.0304, "step": 19764 }, { "epoch": 4.8732741617357, "grad_norm": 0.294921875, "learning_rate": 3.9262585102545215e-07, "loss": 0.0317, "step": 19766 }, { "epoch": 4.873767258382643, "grad_norm": 0.322265625, "learning_rate": 3.895842290311147e-07, "loss": 0.0328, "step": 19768 }, { "epoch": 4.874260355029586, "grad_norm": 0.326171875, "learning_rate": 3.8655441142720905e-07, "loss": 0.0321, "step": 19770 }, { "epoch": 4.874753451676528, "grad_norm": 0.26171875, "learning_rate": 3.8353639857275914e-07, "loss": 0.0304, "step": 19772 }, { "epoch": 4.875246548323472, "grad_norm": 0.30859375, "learning_rate": 3.805301908254455e-07, "loss": 0.0336, "step": 19774 }, { "epoch": 4.875739644970414, "grad_norm": 0.3046875, "learning_rate": 3.775357885415054e-07, "loss": 0.0347, "step": 19776 }, { "epoch": 4.876232741617357, "grad_norm": 0.318359375, "learning_rate": 3.745531920757994e-07, "loss": 0.0306, "step": 19778 }, { "epoch": 4.8767258382643, "grad_norm": 0.31640625, "learning_rate": 3.7158240178176707e-07, "loss": 0.0321, "step": 19780 }, { "epoch": 4.877218934911243, "grad_norm": 0.3125, "learning_rate": 3.6862341801148226e-07, "loss": 0.0331, "step": 19782 }, { "epoch": 4.877712031558185, "grad_norm": 0.283203125, "learning_rate": 3.6567624111558675e-07, "loss": 0.0304, "step": 19784 }, { "epoch": 4.878205128205128, "grad_norm": 0.275390625, "learning_rate": 3.6274087144335665e-07, "loss": 0.0342, "step": 19786 }, { "epoch": 4.878698224852071, "grad_norm": 0.373046875, "learning_rate": 3.598173093426249e-07, "loss": 0.0342, "step": 19788 }, { "epoch": 4.879191321499014, "grad_norm": 0.314453125, "learning_rate": 3.569055551598588e-07, "loss": 0.0308, "step": 19790 }, { "epoch": 4.879684418145956, "grad_norm": 0.298828125, "learning_rate": 3.5400560924012673e-07, "loss": 0.0324, "step": 19792 }, { "epoch": 4.8801775147929, "grad_norm": 0.294921875, "learning_rate": 3.51117471927076e-07, "loss": 0.0316, "step": 19794 }, { "epoch": 4.880670611439842, "grad_norm": 0.36328125, "learning_rate": 3.4824114356297734e-07, "loss": 0.0309, "step": 19796 }, { "epoch": 4.881163708086785, "grad_norm": 0.271484375, "learning_rate": 3.4537662448869134e-07, "loss": 0.0308, "step": 19798 }, { "epoch": 4.881656804733728, "grad_norm": 0.306640625, "learning_rate": 3.4252391504366877e-07, "loss": 0.0288, "step": 19800 }, { "epoch": 4.882149901380671, "grad_norm": 0.2734375, "learning_rate": 3.3968301556598356e-07, "loss": 0.0304, "step": 19802 }, { "epoch": 4.882642998027613, "grad_norm": 0.353515625, "learning_rate": 3.3685392639228875e-07, "loss": 0.0296, "step": 19804 }, { "epoch": 4.883136094674557, "grad_norm": 0.283203125, "learning_rate": 3.340366478578494e-07, "loss": 0.0352, "step": 19806 }, { "epoch": 4.883629191321499, "grad_norm": 0.291015625, "learning_rate": 3.312311802965429e-07, "loss": 0.0315, "step": 19808 }, { "epoch": 4.884122287968442, "grad_norm": 0.3046875, "learning_rate": 3.284375240408144e-07, "loss": 0.0281, "step": 19810 }, { "epoch": 4.884615384615385, "grad_norm": 0.3359375, "learning_rate": 3.2565567942172136e-07, "loss": 0.0324, "step": 19812 }, { "epoch": 4.8851084812623276, "grad_norm": 0.296875, "learning_rate": 3.228856467689556e-07, "loss": 0.0359, "step": 19814 }, { "epoch": 4.88560157790927, "grad_norm": 0.28125, "learning_rate": 3.2012742641076564e-07, "loss": 0.0315, "step": 19816 }, { "epoch": 4.8860946745562135, "grad_norm": 0.2734375, "learning_rate": 3.1738101867401227e-07, "loss": 0.0306, "step": 19818 }, { "epoch": 4.886587771203156, "grad_norm": 0.271484375, "learning_rate": 3.146464238841573e-07, "loss": 0.0281, "step": 19820 }, { "epoch": 4.8870808678500985, "grad_norm": 0.3203125, "learning_rate": 3.1192364236528606e-07, "loss": 0.0361, "step": 19822 }, { "epoch": 4.887573964497041, "grad_norm": 0.34375, "learning_rate": 3.092126744400514e-07, "loss": 0.0339, "step": 19824 }, { "epoch": 4.8880670611439845, "grad_norm": 0.35546875, "learning_rate": 3.065135204296965e-07, "loss": 0.0294, "step": 19826 }, { "epoch": 4.888560157790927, "grad_norm": 0.318359375, "learning_rate": 3.0382618065413205e-07, "loss": 0.0331, "step": 19828 }, { "epoch": 4.8890532544378695, "grad_norm": 0.318359375, "learning_rate": 3.011506554317811e-07, "loss": 0.0293, "step": 19830 }, { "epoch": 4.889546351084813, "grad_norm": 0.28515625, "learning_rate": 2.984869450797345e-07, "loss": 0.0319, "step": 19832 }, { "epoch": 4.8900394477317555, "grad_norm": 0.349609375, "learning_rate": 2.958350499136398e-07, "loss": 0.0335, "step": 19834 }, { "epoch": 4.890532544378698, "grad_norm": 0.2734375, "learning_rate": 2.9319497024776764e-07, "loss": 0.0334, "step": 19836 }, { "epoch": 4.891025641025641, "grad_norm": 0.29296875, "learning_rate": 2.9056670639499015e-07, "loss": 0.031, "step": 19838 }, { "epoch": 4.891518737672584, "grad_norm": 0.302734375, "learning_rate": 2.879502586667582e-07, "loss": 0.0326, "step": 19840 }, { "epoch": 4.8920118343195265, "grad_norm": 0.40625, "learning_rate": 2.853456273731459e-07, "loss": 0.0354, "step": 19842 }, { "epoch": 4.892504930966469, "grad_norm": 0.291015625, "learning_rate": 2.827528128228063e-07, "loss": 0.0322, "step": 19844 }, { "epoch": 4.892998027613412, "grad_norm": 0.283203125, "learning_rate": 2.8017181532300486e-07, "loss": 0.0326, "step": 19846 }, { "epoch": 4.893491124260355, "grad_norm": 0.296875, "learning_rate": 2.7760263517961903e-07, "loss": 0.03, "step": 19848 }, { "epoch": 4.8939842209072975, "grad_norm": 0.25, "learning_rate": 2.750452726970942e-07, "loss": 0.032, "step": 19850 }, { "epoch": 4.894477317554241, "grad_norm": 0.33984375, "learning_rate": 2.724997281784991e-07, "loss": 0.035, "step": 19852 }, { "epoch": 4.894970414201183, "grad_norm": 0.2734375, "learning_rate": 2.6996600192549237e-07, "loss": 0.0298, "step": 19854 }, { "epoch": 4.895463510848126, "grad_norm": 0.318359375, "learning_rate": 2.6744409423833384e-07, "loss": 0.0328, "step": 19856 }, { "epoch": 4.895956607495069, "grad_norm": 0.255859375, "learning_rate": 2.6493400541588444e-07, "loss": 0.0313, "step": 19858 }, { "epoch": 4.896449704142012, "grad_norm": 0.322265625, "learning_rate": 2.624357357556062e-07, "loss": 0.033, "step": 19860 }, { "epoch": 4.896942800788954, "grad_norm": 0.279296875, "learning_rate": 2.5994928555356234e-07, "loss": 0.0298, "step": 19862 }, { "epoch": 4.897435897435898, "grad_norm": 0.287109375, "learning_rate": 2.5747465510439494e-07, "loss": 0.0293, "step": 19864 }, { "epoch": 4.89792899408284, "grad_norm": 0.31640625, "learning_rate": 2.550118447013805e-07, "loss": 0.0319, "step": 19866 }, { "epoch": 4.898422090729783, "grad_norm": 0.28125, "learning_rate": 2.5256085463636337e-07, "loss": 0.0316, "step": 19868 }, { "epoch": 4.898915187376726, "grad_norm": 0.28125, "learning_rate": 2.501216851998001e-07, "loss": 0.0296, "step": 19870 }, { "epoch": 4.899408284023669, "grad_norm": 0.3125, "learning_rate": 2.476943366807483e-07, "loss": 0.0334, "step": 19872 }, { "epoch": 4.899901380670611, "grad_norm": 0.328125, "learning_rate": 2.4527880936687787e-07, "loss": 0.0321, "step": 19874 }, { "epoch": 4.900394477317555, "grad_norm": 0.34765625, "learning_rate": 2.4287510354441543e-07, "loss": 0.0349, "step": 19876 }, { "epoch": 4.900887573964497, "grad_norm": 0.287109375, "learning_rate": 2.4048321949822205e-07, "loss": 0.0278, "step": 19878 }, { "epoch": 4.90138067061144, "grad_norm": 0.259765625, "learning_rate": 2.3810315751175982e-07, "loss": 0.03, "step": 19880 }, { "epoch": 4.901873767258382, "grad_norm": 0.34765625, "learning_rate": 2.357349178670698e-07, "loss": 0.031, "step": 19882 }, { "epoch": 4.902366863905326, "grad_norm": 0.28125, "learning_rate": 2.3337850084480528e-07, "loss": 0.0261, "step": 19884 }, { "epoch": 4.902859960552268, "grad_norm": 0.275390625, "learning_rate": 2.3103390672422064e-07, "loss": 0.0321, "step": 19886 }, { "epoch": 4.903353057199211, "grad_norm": 0.30078125, "learning_rate": 2.287011357831492e-07, "loss": 0.0324, "step": 19888 }, { "epoch": 4.903846153846154, "grad_norm": 0.359375, "learning_rate": 2.2638018829804764e-07, "loss": 0.032, "step": 19890 }, { "epoch": 4.904339250493097, "grad_norm": 0.34375, "learning_rate": 2.2407106454395145e-07, "loss": 0.038, "step": 19892 }, { "epoch": 4.904832347140039, "grad_norm": 0.259765625, "learning_rate": 2.217737647945195e-07, "loss": 0.0304, "step": 19894 }, { "epoch": 4.905325443786982, "grad_norm": 0.28515625, "learning_rate": 2.194882893219785e-07, "loss": 0.0332, "step": 19896 }, { "epoch": 4.905818540433925, "grad_norm": 0.26953125, "learning_rate": 2.172146383971896e-07, "loss": 0.0321, "step": 19898 }, { "epoch": 4.906311637080868, "grad_norm": 0.294921875, "learning_rate": 2.1495281228957053e-07, "loss": 0.0322, "step": 19900 }, { "epoch": 4.90680473372781, "grad_norm": 0.3125, "learning_rate": 2.127028112671847e-07, "loss": 0.0328, "step": 19902 }, { "epoch": 4.907297830374754, "grad_norm": 0.283203125, "learning_rate": 2.1046463559664109e-07, "loss": 0.0335, "step": 19904 }, { "epoch": 4.907790927021696, "grad_norm": 0.296875, "learning_rate": 2.0823828554320524e-07, "loss": 0.0328, "step": 19906 }, { "epoch": 4.908284023668639, "grad_norm": 0.310546875, "learning_rate": 2.0602376137068835e-07, "loss": 0.0292, "step": 19908 }, { "epoch": 4.908777120315582, "grad_norm": 0.28515625, "learning_rate": 2.0382106334153605e-07, "loss": 0.0326, "step": 19910 }, { "epoch": 4.909270216962525, "grad_norm": 0.287109375, "learning_rate": 2.01630191716784e-07, "loss": 0.032, "step": 19912 }, { "epoch": 4.909763313609467, "grad_norm": 0.349609375, "learning_rate": 1.994511467560467e-07, "loss": 0.0351, "step": 19914 }, { "epoch": 4.910256410256411, "grad_norm": 0.31640625, "learning_rate": 1.9728392871757318e-07, "loss": 0.0323, "step": 19916 }, { "epoch": 4.910749506903353, "grad_norm": 0.2734375, "learning_rate": 1.951285378581802e-07, "loss": 0.0337, "step": 19918 }, { "epoch": 4.911242603550296, "grad_norm": 0.26171875, "learning_rate": 1.9298497443329678e-07, "loss": 0.0312, "step": 19920 }, { "epoch": 4.911735700197239, "grad_norm": 0.2490234375, "learning_rate": 1.9085323869694195e-07, "loss": 0.0276, "step": 19922 }, { "epoch": 4.912228796844182, "grad_norm": 0.287109375, "learning_rate": 1.8873333090173585e-07, "loss": 0.0327, "step": 19924 }, { "epoch": 4.912721893491124, "grad_norm": 0.310546875, "learning_rate": 1.8662525129891084e-07, "loss": 0.0298, "step": 19926 }, { "epoch": 4.9132149901380675, "grad_norm": 0.279296875, "learning_rate": 1.8452900013827823e-07, "loss": 0.0314, "step": 19928 }, { "epoch": 4.91370808678501, "grad_norm": 0.267578125, "learning_rate": 1.824445776682504e-07, "loss": 0.0305, "step": 19930 }, { "epoch": 4.914201183431953, "grad_norm": 0.294921875, "learning_rate": 1.803719841358631e-07, "loss": 0.0313, "step": 19932 }, { "epoch": 4.914694280078895, "grad_norm": 0.2734375, "learning_rate": 1.7831121978670873e-07, "loss": 0.032, "step": 19934 }, { "epoch": 4.9151873767258385, "grad_norm": 0.25, "learning_rate": 1.7626228486500306e-07, "loss": 0.0287, "step": 19936 }, { "epoch": 4.915680473372781, "grad_norm": 0.283203125, "learning_rate": 1.742251796135741e-07, "loss": 0.0309, "step": 19938 }, { "epoch": 4.9161735700197235, "grad_norm": 0.296875, "learning_rate": 1.721999042738065e-07, "loss": 0.033, "step": 19940 }, { "epoch": 4.916666666666667, "grad_norm": 0.296875, "learning_rate": 1.701864590857194e-07, "loss": 0.0292, "step": 19942 }, { "epoch": 4.9171597633136095, "grad_norm": 0.265625, "learning_rate": 1.6818484428792191e-07, "loss": 0.031, "step": 19944 }, { "epoch": 4.917652859960552, "grad_norm": 0.28125, "learning_rate": 1.661950601176021e-07, "loss": 0.0331, "step": 19946 }, { "epoch": 4.918145956607495, "grad_norm": 0.2734375, "learning_rate": 1.6421710681057134e-07, "loss": 0.0303, "step": 19948 }, { "epoch": 4.918639053254438, "grad_norm": 0.251953125, "learning_rate": 1.62250984601231e-07, "loss": 0.03, "step": 19950 }, { "epoch": 4.9191321499013805, "grad_norm": 0.333984375, "learning_rate": 1.602966937225614e-07, "loss": 0.0336, "step": 19952 }, { "epoch": 4.919625246548323, "grad_norm": 0.298828125, "learning_rate": 1.5835423440617725e-07, "loss": 0.0342, "step": 19954 }, { "epoch": 4.920118343195266, "grad_norm": 0.265625, "learning_rate": 1.5642360688225e-07, "loss": 0.0313, "step": 19956 }, { "epoch": 4.920611439842209, "grad_norm": 0.267578125, "learning_rate": 1.545048113795855e-07, "loss": 0.0299, "step": 19958 }, { "epoch": 4.9211045364891515, "grad_norm": 0.33984375, "learning_rate": 1.5259784812556854e-07, "loss": 0.0307, "step": 19960 }, { "epoch": 4.921597633136095, "grad_norm": 0.271484375, "learning_rate": 1.5070271734618501e-07, "loss": 0.0326, "step": 19962 }, { "epoch": 4.922090729783037, "grad_norm": 0.28515625, "learning_rate": 1.4881941926601084e-07, "loss": 0.0307, "step": 19964 }, { "epoch": 4.92258382642998, "grad_norm": 0.32421875, "learning_rate": 1.4694795410823415e-07, "loss": 0.0321, "step": 19966 }, { "epoch": 4.923076923076923, "grad_norm": 0.30859375, "learning_rate": 1.4508832209463307e-07, "loss": 0.0287, "step": 19968 }, { "epoch": 4.923570019723866, "grad_norm": 0.298828125, "learning_rate": 1.4324052344558692e-07, "loss": 0.0318, "step": 19970 }, { "epoch": 4.924063116370808, "grad_norm": 0.31640625, "learning_rate": 1.4140455838006495e-07, "loss": 0.0333, "step": 19972 }, { "epoch": 4.924556213017752, "grad_norm": 0.322265625, "learning_rate": 1.3958042711563757e-07, "loss": 0.0365, "step": 19974 }, { "epoch": 4.925049309664694, "grad_norm": 0.259765625, "learning_rate": 1.3776812986848743e-07, "loss": 0.0292, "step": 19976 }, { "epoch": 4.925542406311637, "grad_norm": 0.380859375, "learning_rate": 1.3596766685336494e-07, "loss": 0.0309, "step": 19978 }, { "epoch": 4.92603550295858, "grad_norm": 0.271484375, "learning_rate": 1.3417903828364386e-07, "loss": 0.0309, "step": 19980 }, { "epoch": 4.926528599605523, "grad_norm": 0.2890625, "learning_rate": 1.3240224437129912e-07, "loss": 0.0336, "step": 19982 }, { "epoch": 4.927021696252465, "grad_norm": 0.279296875, "learning_rate": 1.3063728532686227e-07, "loss": 0.0326, "step": 19984 }, { "epoch": 4.927514792899409, "grad_norm": 0.28125, "learning_rate": 1.2888416135951044e-07, "loss": 0.0295, "step": 19986 }, { "epoch": 4.928007889546351, "grad_norm": 0.337890625, "learning_rate": 1.2714287267699963e-07, "loss": 0.0328, "step": 19988 }, { "epoch": 4.928500986193294, "grad_norm": 0.30078125, "learning_rate": 1.2541341948567598e-07, "loss": 0.0323, "step": 19990 }, { "epoch": 4.928994082840236, "grad_norm": 0.294921875, "learning_rate": 1.2369580199048657e-07, "loss": 0.0338, "step": 19992 }, { "epoch": 4.92948717948718, "grad_norm": 0.28125, "learning_rate": 1.2199002039497976e-07, "loss": 0.0292, "step": 19994 }, { "epoch": 4.929980276134122, "grad_norm": 0.337890625, "learning_rate": 1.20296074901316e-07, "loss": 0.0305, "step": 19996 }, { "epoch": 4.930473372781065, "grad_norm": 0.26171875, "learning_rate": 1.1861396571021255e-07, "loss": 0.0319, "step": 19998 }, { "epoch": 4.930966469428008, "grad_norm": 0.283203125, "learning_rate": 1.1694369302102104e-07, "loss": 0.0306, "step": 20000 }, { "epoch": 4.931459566074951, "grad_norm": 0.337890625, "learning_rate": 1.1528525703168313e-07, "loss": 0.0363, "step": 20002 }, { "epoch": 4.931952662721893, "grad_norm": 0.29296875, "learning_rate": 1.1363865793871942e-07, "loss": 0.0327, "step": 20004 }, { "epoch": 4.932445759368837, "grad_norm": 0.28515625, "learning_rate": 1.1200389593727378e-07, "loss": 0.0314, "step": 20006 }, { "epoch": 4.932938856015779, "grad_norm": 0.291015625, "learning_rate": 1.1038097122106905e-07, "loss": 0.0324, "step": 20008 }, { "epoch": 4.933431952662722, "grad_norm": 0.287109375, "learning_rate": 1.0876988398242915e-07, "loss": 0.0314, "step": 20010 }, { "epoch": 4.933925049309664, "grad_norm": 0.267578125, "learning_rate": 1.0717063441229025e-07, "loss": 0.0322, "step": 20012 }, { "epoch": 4.934418145956608, "grad_norm": 0.275390625, "learning_rate": 1.055832227001452e-07, "loss": 0.0327, "step": 20014 }, { "epoch": 4.93491124260355, "grad_norm": 0.279296875, "learning_rate": 1.0400764903414351e-07, "loss": 0.0357, "step": 20016 }, { "epoch": 4.935404339250493, "grad_norm": 0.380859375, "learning_rate": 1.0244391360096916e-07, "loss": 0.0313, "step": 20018 }, { "epoch": 4.935897435897436, "grad_norm": 0.291015625, "learning_rate": 1.0089201658595171e-07, "loss": 0.0305, "step": 20020 }, { "epoch": 4.936390532544379, "grad_norm": 0.34765625, "learning_rate": 9.935195817299958e-08, "loss": 0.0343, "step": 20022 }, { "epoch": 4.936883629191321, "grad_norm": 0.263671875, "learning_rate": 9.782373854461125e-08, "loss": 0.0314, "step": 20024 }, { "epoch": 4.937376725838265, "grad_norm": 0.3125, "learning_rate": 9.630735788188628e-08, "loss": 0.0328, "step": 20026 }, { "epoch": 4.937869822485207, "grad_norm": 0.267578125, "learning_rate": 9.480281636453647e-08, "loss": 0.0282, "step": 20028 }, { "epoch": 4.93836291913215, "grad_norm": 0.26953125, "learning_rate": 9.331011417085256e-08, "loss": 0.0301, "step": 20030 }, { "epoch": 4.938856015779093, "grad_norm": 0.291015625, "learning_rate": 9.182925147772636e-08, "loss": 0.0357, "step": 20032 }, { "epoch": 4.939349112426036, "grad_norm": 0.28125, "learning_rate": 9.036022846065084e-08, "loss": 0.0304, "step": 20034 }, { "epoch": 4.939842209072978, "grad_norm": 0.26171875, "learning_rate": 8.890304529370897e-08, "loss": 0.0318, "step": 20036 }, { "epoch": 4.9403353057199215, "grad_norm": 0.302734375, "learning_rate": 8.745770214959592e-08, "loss": 0.0326, "step": 20038 }, { "epoch": 4.940828402366864, "grad_norm": 0.2890625, "learning_rate": 8.602419919957472e-08, "loss": 0.0317, "step": 20040 }, { "epoch": 4.941321499013807, "grad_norm": 0.3359375, "learning_rate": 8.460253661354279e-08, "loss": 0.0317, "step": 20042 }, { "epoch": 4.94181459566075, "grad_norm": 0.263671875, "learning_rate": 8.319271455996536e-08, "loss": 0.0304, "step": 20044 }, { "epoch": 4.9423076923076925, "grad_norm": 0.296875, "learning_rate": 8.17947332059088e-08, "loss": 0.0324, "step": 20046 }, { "epoch": 4.942800788954635, "grad_norm": 0.31640625, "learning_rate": 8.04085927170628e-08, "loss": 0.0339, "step": 20048 }, { "epoch": 4.943293885601578, "grad_norm": 0.328125, "learning_rate": 7.903429325766265e-08, "loss": 0.0316, "step": 20050 }, { "epoch": 4.943786982248521, "grad_norm": 0.28515625, "learning_rate": 7.767183499058917e-08, "loss": 0.0337, "step": 20052 }, { "epoch": 4.9442800788954635, "grad_norm": 0.287109375, "learning_rate": 7.632121807730208e-08, "loss": 0.0305, "step": 20054 }, { "epoch": 4.944773175542406, "grad_norm": 0.318359375, "learning_rate": 7.498244267785114e-08, "loss": 0.0343, "step": 20056 }, { "epoch": 4.945266272189349, "grad_norm": 0.275390625, "learning_rate": 7.365550895088724e-08, "loss": 0.0305, "step": 20058 }, { "epoch": 4.945759368836292, "grad_norm": 0.28125, "learning_rate": 7.234041705367345e-08, "loss": 0.0335, "step": 20060 }, { "epoch": 4.9462524654832345, "grad_norm": 0.30859375, "learning_rate": 7.103716714204068e-08, "loss": 0.0312, "step": 20062 }, { "epoch": 4.946745562130177, "grad_norm": 0.279296875, "learning_rate": 6.974575937042094e-08, "loss": 0.0316, "step": 20064 }, { "epoch": 4.94723865877712, "grad_norm": 0.302734375, "learning_rate": 6.846619389188069e-08, "loss": 0.0328, "step": 20066 }, { "epoch": 4.947731755424063, "grad_norm": 0.267578125, "learning_rate": 6.719847085804309e-08, "loss": 0.0296, "step": 20068 }, { "epoch": 4.9482248520710055, "grad_norm": 0.359375, "learning_rate": 6.594259041914353e-08, "loss": 0.0376, "step": 20070 }, { "epoch": 4.948717948717949, "grad_norm": 0.26953125, "learning_rate": 6.469855272399628e-08, "loss": 0.031, "step": 20072 }, { "epoch": 4.949211045364891, "grad_norm": 0.337890625, "learning_rate": 6.346635792005006e-08, "loss": 0.0335, "step": 20074 }, { "epoch": 4.949704142011834, "grad_norm": 0.291015625, "learning_rate": 6.224600615331034e-08, "loss": 0.0323, "step": 20076 }, { "epoch": 4.950197238658777, "grad_norm": 0.318359375, "learning_rate": 6.103749756839472e-08, "loss": 0.0331, "step": 20078 }, { "epoch": 4.95069033530572, "grad_norm": 0.3046875, "learning_rate": 5.9840832308522e-08, "loss": 0.0344, "step": 20080 }, { "epoch": 4.951183431952662, "grad_norm": 0.328125, "learning_rate": 5.865601051551206e-08, "loss": 0.0342, "step": 20082 }, { "epoch": 4.951676528599606, "grad_norm": 0.326171875, "learning_rate": 5.748303232976371e-08, "loss": 0.0341, "step": 20084 }, { "epoch": 4.952169625246548, "grad_norm": 0.306640625, "learning_rate": 5.632189789027687e-08, "loss": 0.0322, "step": 20086 }, { "epoch": 4.952662721893491, "grad_norm": 0.322265625, "learning_rate": 5.517260733467477e-08, "loss": 0.036, "step": 20088 }, { "epoch": 4.953155818540434, "grad_norm": 0.2890625, "learning_rate": 5.40351607991263e-08, "loss": 0.0316, "step": 20090 }, { "epoch": 4.953648915187377, "grad_norm": 0.31640625, "learning_rate": 5.29095584184347e-08, "loss": 0.0336, "step": 20092 }, { "epoch": 4.954142011834319, "grad_norm": 0.296875, "learning_rate": 5.17958003260044e-08, "loss": 0.0281, "step": 20094 }, { "epoch": 4.954635108481263, "grad_norm": 0.287109375, "learning_rate": 5.069388665380759e-08, "loss": 0.031, "step": 20096 }, { "epoch": 4.955128205128205, "grad_norm": 0.359375, "learning_rate": 4.960381753242871e-08, "loss": 0.0331, "step": 20098 }, { "epoch": 4.955621301775148, "grad_norm": 0.28515625, "learning_rate": 4.852559309105331e-08, "loss": 0.0305, "step": 20100 }, { "epoch": 4.956114398422091, "grad_norm": 0.328125, "learning_rate": 4.745921345744586e-08, "loss": 0.0279, "step": 20102 }, { "epoch": 4.956607495069034, "grad_norm": 0.3671875, "learning_rate": 4.640467875799415e-08, "loss": 0.032, "step": 20104 }, { "epoch": 4.957100591715976, "grad_norm": 0.291015625, "learning_rate": 4.536198911765377e-08, "loss": 0.0344, "step": 20106 }, { "epoch": 4.957593688362919, "grad_norm": 0.3046875, "learning_rate": 4.433114465999255e-08, "loss": 0.0296, "step": 20108 }, { "epoch": 4.958086785009862, "grad_norm": 0.294921875, "learning_rate": 4.3312145507168337e-08, "loss": 0.0327, "step": 20110 }, { "epoch": 4.958579881656805, "grad_norm": 0.365234375, "learning_rate": 4.230499177994007e-08, "loss": 0.0393, "step": 20112 }, { "epoch": 4.959072978303747, "grad_norm": 0.27734375, "learning_rate": 4.130968359766785e-08, "loss": 0.032, "step": 20114 }, { "epoch": 4.959566074950691, "grad_norm": 0.28515625, "learning_rate": 4.032622107829065e-08, "loss": 0.0331, "step": 20116 }, { "epoch": 4.960059171597633, "grad_norm": 0.302734375, "learning_rate": 3.935460433837079e-08, "loss": 0.036, "step": 20118 }, { "epoch": 4.960552268244576, "grad_norm": 0.42578125, "learning_rate": 3.83948334930273e-08, "loss": 0.0348, "step": 20120 }, { "epoch": 4.961045364891518, "grad_norm": 0.328125, "learning_rate": 3.744690865601363e-08, "loss": 0.0336, "step": 20122 }, { "epoch": 4.961538461538462, "grad_norm": 0.3125, "learning_rate": 3.651082993966215e-08, "loss": 0.0345, "step": 20124 }, { "epoch": 4.962031558185404, "grad_norm": 0.30859375, "learning_rate": 3.558659745490634e-08, "loss": 0.0324, "step": 20126 }, { "epoch": 4.962524654832347, "grad_norm": 0.310546875, "learning_rate": 3.467421131125859e-08, "loss": 0.0356, "step": 20128 }, { "epoch": 4.96301775147929, "grad_norm": 0.310546875, "learning_rate": 3.377367161686573e-08, "loss": 0.0326, "step": 20130 }, { "epoch": 4.963510848126233, "grad_norm": 0.330078125, "learning_rate": 3.28849784784202e-08, "loss": 0.0332, "step": 20132 }, { "epoch": 4.964003944773175, "grad_norm": 0.3828125, "learning_rate": 3.200813200124886e-08, "loss": 0.034, "step": 20134 }, { "epoch": 4.964497041420119, "grad_norm": 0.287109375, "learning_rate": 3.114313228926857e-08, "loss": 0.0308, "step": 20136 }, { "epoch": 4.964990138067061, "grad_norm": 0.28125, "learning_rate": 3.0289979444986236e-08, "loss": 0.032, "step": 20138 }, { "epoch": 4.965483234714004, "grad_norm": 0.27734375, "learning_rate": 2.9448673569487663e-08, "loss": 0.0321, "step": 20140 }, { "epoch": 4.965976331360947, "grad_norm": 0.326171875, "learning_rate": 2.8619214762504176e-08, "loss": 0.0328, "step": 20142 }, { "epoch": 4.96646942800789, "grad_norm": 0.259765625, "learning_rate": 2.7801603122301624e-08, "loss": 0.0318, "step": 20144 }, { "epoch": 4.966962524654832, "grad_norm": 0.28515625, "learning_rate": 2.6995838745780267e-08, "loss": 0.0296, "step": 20146 }, { "epoch": 4.9674556213017755, "grad_norm": 0.283203125, "learning_rate": 2.620192172843039e-08, "loss": 0.0311, "step": 20148 }, { "epoch": 4.967948717948718, "grad_norm": 0.279296875, "learning_rate": 2.541985216433229e-08, "loss": 0.0303, "step": 20150 }, { "epoch": 4.968441814595661, "grad_norm": 0.26953125, "learning_rate": 2.4649630146178494e-08, "loss": 0.0285, "step": 20152 }, { "epoch": 4.968934911242604, "grad_norm": 0.30078125, "learning_rate": 2.3891255765229325e-08, "loss": 0.0332, "step": 20154 }, { "epoch": 4.9694280078895465, "grad_norm": 0.328125, "learning_rate": 2.3144729111357343e-08, "loss": 0.0381, "step": 20156 }, { "epoch": 4.969921104536489, "grad_norm": 0.275390625, "learning_rate": 2.241005027303622e-08, "loss": 0.0312, "step": 20158 }, { "epoch": 4.970414201183432, "grad_norm": 0.2890625, "learning_rate": 2.1687219337329645e-08, "loss": 0.0299, "step": 20160 }, { "epoch": 4.970907297830375, "grad_norm": 0.283203125, "learning_rate": 2.0976236389891325e-08, "loss": 0.0288, "step": 20162 }, { "epoch": 4.9714003944773175, "grad_norm": 0.31640625, "learning_rate": 2.0277101514987184e-08, "loss": 0.0324, "step": 20164 }, { "epoch": 4.97189349112426, "grad_norm": 0.28125, "learning_rate": 1.958981479546207e-08, "loss": 0.0291, "step": 20166 }, { "epoch": 4.9723865877712035, "grad_norm": 0.3046875, "learning_rate": 1.8914376312750836e-08, "loss": 0.0316, "step": 20168 }, { "epoch": 4.972879684418146, "grad_norm": 0.2578125, "learning_rate": 1.8250786146922772e-08, "loss": 0.0284, "step": 20170 }, { "epoch": 4.9733727810650885, "grad_norm": 0.353515625, "learning_rate": 1.7599044376592766e-08, "loss": 0.0331, "step": 20172 }, { "epoch": 4.973865877712032, "grad_norm": 0.25390625, "learning_rate": 1.695915107901014e-08, "loss": 0.0318, "step": 20174 }, { "epoch": 4.9743589743589745, "grad_norm": 0.29296875, "learning_rate": 1.633110633000312e-08, "loss": 0.0331, "step": 20176 }, { "epoch": 4.974852071005917, "grad_norm": 0.291015625, "learning_rate": 1.5714910204001066e-08, "loss": 0.0299, "step": 20178 }, { "epoch": 4.9753451676528595, "grad_norm": 0.28125, "learning_rate": 1.511056277402334e-08, "loss": 0.0299, "step": 20180 }, { "epoch": 4.975838264299803, "grad_norm": 0.265625, "learning_rate": 1.451806411167933e-08, "loss": 0.0326, "step": 20182 }, { "epoch": 4.976331360946745, "grad_norm": 0.267578125, "learning_rate": 1.3937414287201745e-08, "loss": 0.032, "step": 20184 }, { "epoch": 4.976824457593688, "grad_norm": 0.314453125, "learning_rate": 1.336861336938e-08, "loss": 0.0285, "step": 20186 }, { "epoch": 4.977317554240631, "grad_norm": 0.306640625, "learning_rate": 1.2811661425637944e-08, "loss": 0.0306, "step": 20188 }, { "epoch": 4.977810650887574, "grad_norm": 0.26953125, "learning_rate": 1.226655852196723e-08, "loss": 0.0296, "step": 20190 }, { "epoch": 4.978303747534516, "grad_norm": 0.2578125, "learning_rate": 1.1733304722971738e-08, "loss": 0.028, "step": 20192 }, { "epoch": 4.97879684418146, "grad_norm": 0.314453125, "learning_rate": 1.1211900091845362e-08, "loss": 0.0331, "step": 20194 }, { "epoch": 4.979289940828402, "grad_norm": 0.28125, "learning_rate": 1.0702344690372013e-08, "loss": 0.0311, "step": 20196 }, { "epoch": 4.979783037475345, "grad_norm": 0.283203125, "learning_rate": 1.0204638578947823e-08, "loss": 0.0318, "step": 20198 }, { "epoch": 4.980276134122288, "grad_norm": 0.267578125, "learning_rate": 9.718781816536738e-09, "loss": 0.0317, "step": 20200 }, { "epoch": 4.980769230769231, "grad_norm": 0.28515625, "learning_rate": 9.244774460726024e-09, "loss": 0.033, "step": 20202 }, { "epoch": 4.981262327416173, "grad_norm": 0.267578125, "learning_rate": 8.782616567681867e-09, "loss": 0.03, "step": 20204 }, { "epoch": 4.981755424063117, "grad_norm": 0.32421875, "learning_rate": 8.332308192182669e-09, "loss": 0.0308, "step": 20206 }, { "epoch": 4.982248520710059, "grad_norm": 0.341796875, "learning_rate": 7.89384938758575e-09, "loss": 0.0348, "step": 20208 }, { "epoch": 4.982741617357002, "grad_norm": 0.27734375, "learning_rate": 7.467240205849545e-09, "loss": 0.0307, "step": 20210 }, { "epoch": 4.983234714003945, "grad_norm": 0.271484375, "learning_rate": 7.052480697533614e-09, "loss": 0.0342, "step": 20212 }, { "epoch": 4.983727810650888, "grad_norm": 0.279296875, "learning_rate": 6.6495709117875286e-09, "loss": 0.0331, "step": 20214 }, { "epoch": 4.98422090729783, "grad_norm": 0.328125, "learning_rate": 6.25851089636198e-09, "loss": 0.0365, "step": 20216 }, { "epoch": 4.984714003944773, "grad_norm": 0.337890625, "learning_rate": 5.879300697597678e-09, "loss": 0.034, "step": 20218 }, { "epoch": 4.985207100591716, "grad_norm": 0.296875, "learning_rate": 5.511940360425349e-09, "loss": 0.0324, "step": 20220 }, { "epoch": 4.985700197238659, "grad_norm": 0.302734375, "learning_rate": 5.15642992838794e-09, "loss": 0.031, "step": 20222 }, { "epoch": 4.986193293885601, "grad_norm": 0.2734375, "learning_rate": 4.812769443618415e-09, "loss": 0.0297, "step": 20224 }, { "epoch": 4.986686390532545, "grad_norm": 0.287109375, "learning_rate": 4.480958946828651e-09, "loss": 0.0266, "step": 20226 }, { "epoch": 4.987179487179487, "grad_norm": 0.3125, "learning_rate": 4.16099847735385e-09, "loss": 0.0331, "step": 20228 }, { "epoch": 4.98767258382643, "grad_norm": 0.400390625, "learning_rate": 3.85288807310813e-09, "loss": 0.0274, "step": 20230 }, { "epoch": 4.988165680473373, "grad_norm": 0.267578125, "learning_rate": 3.556627770595622e-09, "loss": 0.0286, "step": 20232 }, { "epoch": 4.988658777120316, "grad_norm": 0.283203125, "learning_rate": 3.2722176049437836e-09, "loss": 0.0333, "step": 20234 }, { "epoch": 4.989151873767258, "grad_norm": 0.287109375, "learning_rate": 2.999657609825679e-09, "loss": 0.0298, "step": 20236 }, { "epoch": 4.989644970414201, "grad_norm": 0.287109375, "learning_rate": 2.7389478175710025e-09, "loss": 0.033, "step": 20238 }, { "epoch": 4.990138067061144, "grad_norm": 0.2890625, "learning_rate": 2.490088259066159e-09, "loss": 0.0304, "step": 20240 }, { "epoch": 4.990631163708087, "grad_norm": 0.326171875, "learning_rate": 2.2530789637986716e-09, "loss": 0.0329, "step": 20242 }, { "epoch": 4.991124260355029, "grad_norm": 0.298828125, "learning_rate": 2.027919959857183e-09, "loss": 0.0323, "step": 20244 }, { "epoch": 4.991617357001973, "grad_norm": 0.27734375, "learning_rate": 1.8146112739203525e-09, "loss": 0.0306, "step": 20246 }, { "epoch": 4.992110453648915, "grad_norm": 0.33984375, "learning_rate": 1.6131529312790605e-09, "loss": 0.0299, "step": 20248 }, { "epoch": 4.992603550295858, "grad_norm": 0.3125, "learning_rate": 1.423544955792e-09, "loss": 0.0301, "step": 20250 }, { "epoch": 4.993096646942801, "grad_norm": 0.283203125, "learning_rate": 1.2457873699300848e-09, "loss": 0.0318, "step": 20252 }, { "epoch": 4.993589743589744, "grad_norm": 0.310546875, "learning_rate": 1.0798801947764502e-09, "loss": 0.0346, "step": 20254 }, { "epoch": 4.994082840236686, "grad_norm": 0.30859375, "learning_rate": 9.258234499709417e-10, "loss": 0.0335, "step": 20256 }, { "epoch": 4.9945759368836296, "grad_norm": 0.326171875, "learning_rate": 7.836171537878301e-10, "loss": 0.029, "step": 20258 }, { "epoch": 4.995069033530572, "grad_norm": 0.33984375, "learning_rate": 6.532613230580964e-10, "loss": 0.0346, "step": 20260 }, { "epoch": 4.995562130177515, "grad_norm": 0.373046875, "learning_rate": 5.347559732471474e-10, "loss": 0.0307, "step": 20262 }, { "epoch": 4.996055226824458, "grad_norm": 0.287109375, "learning_rate": 4.2810111839930447e-10, "loss": 0.0305, "step": 20264 }, { "epoch": 4.9965483234714005, "grad_norm": 0.296875, "learning_rate": 3.3329677113780323e-10, "loss": 0.0321, "step": 20266 }, { "epoch": 4.997041420118343, "grad_norm": 0.306640625, "learning_rate": 2.50342942709203e-10, "loss": 0.0318, "step": 20268 }, { "epoch": 4.9975345167652865, "grad_norm": 0.275390625, "learning_rate": 1.7923964293897754e-10, "loss": 0.0316, "step": 20270 }, { "epoch": 4.998027613412229, "grad_norm": 0.2890625, "learning_rate": 1.1998688026482185e-10, "loss": 0.0336, "step": 20272 }, { "epoch": 4.9985207100591715, "grad_norm": 0.349609375, "learning_rate": 7.258466170334544e-11, "loss": 0.0354, "step": 20274 }, { "epoch": 4.999013806706114, "grad_norm": 0.27734375, "learning_rate": 3.7032992861174565e-11, "loss": 0.0314, "step": 20276 }, { "epoch": 4.9995069033530575, "grad_norm": 0.26953125, "learning_rate": 1.3331877957156735e-11, "loss": 0.0323, "step": 20278 }, { "epoch": 5.0, "grad_norm": 0.34765625, "learning_rate": 1.4813198001562e-12, "loss": 0.0257, "step": 20280 }, { "epoch": 5.0, "eval_loss": 0.060740407556295395, "eval_runtime": 69.8761, "eval_samples_per_second": 227.961, "eval_steps_per_second": 1.789, "step": 20280 }, { "epoch": 5.0, "step": 20280, "total_flos": 7.48763679670272e+20, "train_loss": 0.06658803844747282, "train_runtime": 14605.9054, "train_samples_per_second": 177.725, "train_steps_per_second": 1.388 } ], "logging_steps": 2, "max_steps": 20280, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 4056, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.48763679670272e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }