{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9090644635792413, "eval_steps": 500, "global_step": 15625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.432150460863581e-05, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.9123, "step": 10 }, { "epoch": 0.00012864300921727162, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 0.5268, "step": 20 }, { "epoch": 0.00019296451382590742, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.5193, "step": 30 }, { "epoch": 0.00025728601843454324, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.5167, "step": 40 }, { "epoch": 0.00032160752304317904, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.5129, "step": 50 }, { "epoch": 0.00038592902765181484, "grad_norm": 2.78125, "learning_rate": 2e-05, "loss": 0.5131, "step": 60 }, { "epoch": 0.00045025053226045063, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.5065, "step": 70 }, { "epoch": 0.0005145720368690865, "grad_norm": 2.421875, "learning_rate": 2e-05, "loss": 0.5123, "step": 80 }, { "epoch": 0.0005788935414777223, "grad_norm": 2.703125, "learning_rate": 2e-05, "loss": 0.4953, "step": 90 }, { "epoch": 0.0006432150460863581, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.4935, "step": 100 }, { "epoch": 0.0007075365506949939, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 0.487, "step": 110 }, { "epoch": 0.0007718580553036297, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.4784, "step": 120 }, { "epoch": 0.0008361795599122655, "grad_norm": 2.3125, "learning_rate": 2e-05, "loss": 0.4745, "step": 130 }, { "epoch": 0.0009005010645209013, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 0.4811, "step": 140 }, { "epoch": 0.0009648225691295371, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.4681, "step": 150 }, { "epoch": 0.001029144073738173, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.4657, "step": 160 }, { "epoch": 0.0010934655783468087, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 0.4622, "step": 170 }, { "epoch": 0.0011577870829554446, "grad_norm": 2.671875, "learning_rate": 2e-05, "loss": 0.4551, "step": 180 }, { "epoch": 0.0012221085875640803, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 0.4548, "step": 190 }, { "epoch": 0.0012864300921727162, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 0.443, "step": 200 }, { "epoch": 0.0013507515967813518, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 0.4357, "step": 210 }, { "epoch": 0.0014150731013899878, "grad_norm": 3.296875, "learning_rate": 2e-05, "loss": 0.4224, "step": 220 }, { "epoch": 0.0014793946059986234, "grad_norm": 2.859375, "learning_rate": 2e-05, "loss": 0.4226, "step": 230 }, { "epoch": 0.0015437161106072593, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.4331, "step": 240 }, { "epoch": 0.001608037615215895, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 0.4166, "step": 250 }, { "epoch": 0.001672359119824531, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 0.4132, "step": 260 }, { "epoch": 0.0017366806244331668, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.4078, "step": 270 }, { "epoch": 0.0018010021290418025, "grad_norm": 3.28125, "learning_rate": 2e-05, "loss": 0.4001, "step": 280 }, { "epoch": 0.0018653236336504384, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 0.3942, "step": 290 }, { "epoch": 0.0019296451382590741, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 0.3827, "step": 300 }, { "epoch": 0.00199396664286771, "grad_norm": 3.109375, "learning_rate": 2e-05, "loss": 0.3871, "step": 310 }, { "epoch": 0.002058288147476346, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.3694, "step": 320 }, { "epoch": 0.0021226096520849816, "grad_norm": 2.890625, "learning_rate": 2e-05, "loss": 0.377, "step": 330 }, { "epoch": 0.0021869311566936173, "grad_norm": 2.65625, "learning_rate": 2e-05, "loss": 0.3603, "step": 340 }, { "epoch": 0.002251252661302253, "grad_norm": 3.421875, "learning_rate": 2e-05, "loss": 0.3604, "step": 350 }, { "epoch": 0.002315574165910889, "grad_norm": 5.3125, "learning_rate": 2e-05, "loss": 0.3606, "step": 360 }, { "epoch": 0.002379895670519525, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.3552, "step": 370 }, { "epoch": 0.0024442171751281605, "grad_norm": 4.09375, "learning_rate": 2e-05, "loss": 0.3466, "step": 380 }, { "epoch": 0.002508538679736796, "grad_norm": 3.21875, "learning_rate": 2e-05, "loss": 0.3506, "step": 390 }, { "epoch": 0.0025728601843454323, "grad_norm": 3.6875, "learning_rate": 2e-05, "loss": 0.3492, "step": 400 }, { "epoch": 0.002637181688954068, "grad_norm": 3.65625, "learning_rate": 2e-05, "loss": 0.3467, "step": 410 }, { "epoch": 0.0027015031935627037, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 0.3269, "step": 420 }, { "epoch": 0.00276582469817134, "grad_norm": 2.90625, "learning_rate": 2e-05, "loss": 0.3392, "step": 430 }, { "epoch": 0.0028301462027799755, "grad_norm": 3.328125, "learning_rate": 2e-05, "loss": 0.3333, "step": 440 }, { "epoch": 0.002894467707388611, "grad_norm": 3.546875, "learning_rate": 2e-05, "loss": 0.3236, "step": 450 }, { "epoch": 0.002958789211997247, "grad_norm": 3.765625, "learning_rate": 2e-05, "loss": 0.3229, "step": 460 }, { "epoch": 0.003023110716605883, "grad_norm": 3.3125, "learning_rate": 2e-05, "loss": 0.321, "step": 470 }, { "epoch": 0.0030874322212145187, "grad_norm": 2.921875, "learning_rate": 2e-05, "loss": 0.3187, "step": 480 }, { "epoch": 0.0031517537258231544, "grad_norm": 2.59375, "learning_rate": 2e-05, "loss": 0.3205, "step": 490 }, { "epoch": 0.00321607523043179, "grad_norm": 4.09375, "learning_rate": 2e-05, "loss": 0.3166, "step": 500 }, { "epoch": 0.003280396735040426, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.3093, "step": 510 }, { "epoch": 0.003344718239649062, "grad_norm": 3.03125, "learning_rate": 2e-05, "loss": 0.3118, "step": 520 }, { "epoch": 0.0034090397442576976, "grad_norm": 3.125, "learning_rate": 2e-05, "loss": 0.3092, "step": 530 }, { "epoch": 0.0034733612488663337, "grad_norm": 2.96875, "learning_rate": 2e-05, "loss": 0.3123, "step": 540 }, { "epoch": 0.0035376827534749694, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.3019, "step": 550 }, { "epoch": 0.003602004258083605, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 0.301, "step": 560 }, { "epoch": 0.0036663257626922408, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 0.3011, "step": 570 }, { "epoch": 0.003730647267300877, "grad_norm": 3.21875, "learning_rate": 2e-05, "loss": 0.2949, "step": 580 }, { "epoch": 0.0037949687719095126, "grad_norm": 2.96875, "learning_rate": 2e-05, "loss": 0.3023, "step": 590 }, { "epoch": 0.0038592902765181483, "grad_norm": 3.109375, "learning_rate": 2e-05, "loss": 0.3025, "step": 600 }, { "epoch": 0.003923611781126784, "grad_norm": 3.140625, "learning_rate": 2e-05, "loss": 0.297, "step": 610 }, { "epoch": 0.00398793328573542, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 0.2962, "step": 620 }, { "epoch": 0.004052254790344056, "grad_norm": 3.171875, "learning_rate": 2e-05, "loss": 0.2929, "step": 630 }, { "epoch": 0.004116576294952692, "grad_norm": 3.03125, "learning_rate": 2e-05, "loss": 0.3011, "step": 640 }, { "epoch": 0.004180897799561327, "grad_norm": 3.609375, "learning_rate": 2e-05, "loss": 0.2968, "step": 650 }, { "epoch": 0.004245219304169963, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 0.2969, "step": 660 }, { "epoch": 0.004309540808778599, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2965, "step": 670 }, { "epoch": 0.004373862313387235, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 0.2882, "step": 680 }, { "epoch": 0.004438183817995871, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2901, "step": 690 }, { "epoch": 0.004502505322604506, "grad_norm": 2.59375, "learning_rate": 2e-05, "loss": 0.2945, "step": 700 }, { "epoch": 0.004566826827213142, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 0.2879, "step": 710 }, { "epoch": 0.004631148331821778, "grad_norm": 3.5, "learning_rate": 2e-05, "loss": 0.2824, "step": 720 }, { "epoch": 0.0046954698364304135, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2934, "step": 730 }, { "epoch": 0.00475979134103905, "grad_norm": 3.671875, "learning_rate": 2e-05, "loss": 0.2977, "step": 740 }, { "epoch": 0.004824112845647686, "grad_norm": 2.671875, "learning_rate": 2e-05, "loss": 0.2924, "step": 750 }, { "epoch": 0.004888434350256321, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.287, "step": 760 }, { "epoch": 0.004952755854864957, "grad_norm": 2.9375, "learning_rate": 2e-05, "loss": 0.2881, "step": 770 }, { "epoch": 0.005017077359473592, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 0.289, "step": 780 }, { "epoch": 0.0050813988640822285, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 0.2873, "step": 790 }, { "epoch": 0.005145720368690865, "grad_norm": 3.484375, "learning_rate": 2e-05, "loss": 0.2805, "step": 800 }, { "epoch": 0.0052100418732995, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.2847, "step": 810 }, { "epoch": 0.005274363377908136, "grad_norm": 2.9375, "learning_rate": 2e-05, "loss": 0.2881, "step": 820 }, { "epoch": 0.005338684882516772, "grad_norm": 2.296875, "learning_rate": 2e-05, "loss": 0.2876, "step": 830 }, { "epoch": 0.005403006387125407, "grad_norm": 2.828125, "learning_rate": 2e-05, "loss": 0.2773, "step": 840 }, { "epoch": 0.0054673278917340435, "grad_norm": 2.9375, "learning_rate": 2e-05, "loss": 0.2873, "step": 850 }, { "epoch": 0.00553164939634268, "grad_norm": 2.625, "learning_rate": 2e-05, "loss": 0.284, "step": 860 }, { "epoch": 0.005595970900951315, "grad_norm": 3.109375, "learning_rate": 2e-05, "loss": 0.2856, "step": 870 }, { "epoch": 0.005660292405559951, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.2769, "step": 880 }, { "epoch": 0.005724613910168586, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 0.2811, "step": 890 }, { "epoch": 0.005788935414777222, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2757, "step": 900 }, { "epoch": 0.0058532569193858585, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 0.2798, "step": 910 }, { "epoch": 0.005917578423994494, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 0.2801, "step": 920 }, { "epoch": 0.00598189992860313, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.285, "step": 930 }, { "epoch": 0.006046221433211766, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.2713, "step": 940 }, { "epoch": 0.006110542937820401, "grad_norm": 2.3125, "learning_rate": 2e-05, "loss": 0.2736, "step": 950 }, { "epoch": 0.006174864442429037, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.279, "step": 960 }, { "epoch": 0.0062391859470376735, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.2795, "step": 970 }, { "epoch": 0.006303507451646309, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 0.2772, "step": 980 }, { "epoch": 0.006367828956254945, "grad_norm": 2.546875, "learning_rate": 2e-05, "loss": 0.2763, "step": 990 }, { "epoch": 0.00643215046086358, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2732, "step": 1000 }, { "epoch": 0.006496471965472216, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.2658, "step": 1010 }, { "epoch": 0.006560793470080852, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2762, "step": 1020 }, { "epoch": 0.006625114974689488, "grad_norm": 2.5, "learning_rate": 2e-05, "loss": 0.2647, "step": 1030 }, { "epoch": 0.006689436479298124, "grad_norm": 2.609375, "learning_rate": 2e-05, "loss": 0.2738, "step": 1040 }, { "epoch": 0.00675375798390676, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2768, "step": 1050 }, { "epoch": 0.006818079488515395, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.2667, "step": 1060 }, { "epoch": 0.006882400993124031, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2734, "step": 1070 }, { "epoch": 0.006946722497732667, "grad_norm": 2.3125, "learning_rate": 2e-05, "loss": 0.27, "step": 1080 }, { "epoch": 0.007011044002341303, "grad_norm": 3.09375, "learning_rate": 2e-05, "loss": 0.2683, "step": 1090 }, { "epoch": 0.007075365506949939, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2683, "step": 1100 }, { "epoch": 0.007139687011558574, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 0.2715, "step": 1110 }, { "epoch": 0.00720400851616721, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2756, "step": 1120 }, { "epoch": 0.007268330020775846, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.2689, "step": 1130 }, { "epoch": 0.0073326515253844815, "grad_norm": 2.1875, "learning_rate": 2e-05, "loss": 0.2684, "step": 1140 }, { "epoch": 0.007396973029993118, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2662, "step": 1150 }, { "epoch": 0.007461294534601754, "grad_norm": 2.609375, "learning_rate": 2e-05, "loss": 0.2605, "step": 1160 }, { "epoch": 0.007525616039210389, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.2664, "step": 1170 }, { "epoch": 0.007589937543819025, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2659, "step": 1180 }, { "epoch": 0.007654259048427661, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2703, "step": 1190 }, { "epoch": 0.0077185805530362965, "grad_norm": 2.921875, "learning_rate": 2e-05, "loss": 0.2714, "step": 1200 }, { "epoch": 0.007782902057644933, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 0.2651, "step": 1210 }, { "epoch": 0.007847223562253569, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2624, "step": 1220 }, { "epoch": 0.007911545066862204, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2681, "step": 1230 }, { "epoch": 0.00797586657147084, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2654, "step": 1240 }, { "epoch": 0.008040188076079476, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.2575, "step": 1250 }, { "epoch": 0.008104509580688112, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 0.255, "step": 1260 }, { "epoch": 0.008168831085296747, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.2593, "step": 1270 }, { "epoch": 0.008233152589905384, "grad_norm": 2.40625, "learning_rate": 2e-05, "loss": 0.2623, "step": 1280 }, { "epoch": 0.008297474094514019, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.2591, "step": 1290 }, { "epoch": 0.008361795599122654, "grad_norm": 2.1875, "learning_rate": 2e-05, "loss": 0.2573, "step": 1300 }, { "epoch": 0.008426117103731291, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2611, "step": 1310 }, { "epoch": 0.008490438608339927, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2591, "step": 1320 }, { "epoch": 0.008554760112948562, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.2599, "step": 1330 }, { "epoch": 0.008619081617557199, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.259, "step": 1340 }, { "epoch": 0.008683403122165834, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2661, "step": 1350 }, { "epoch": 0.00874772462677447, "grad_norm": 2.40625, "learning_rate": 2e-05, "loss": 0.2639, "step": 1360 }, { "epoch": 0.008812046131383105, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2621, "step": 1370 }, { "epoch": 0.008876367635991742, "grad_norm": 2.65625, "learning_rate": 2e-05, "loss": 0.2537, "step": 1380 }, { "epoch": 0.008940689140600377, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2596, "step": 1390 }, { "epoch": 0.009005010645209012, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.255, "step": 1400 }, { "epoch": 0.009069332149817649, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2601, "step": 1410 }, { "epoch": 0.009133653654426284, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 0.2516, "step": 1420 }, { "epoch": 0.00919797515903492, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.26, "step": 1430 }, { "epoch": 0.009262296663643556, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.2524, "step": 1440 }, { "epoch": 0.009326618168252192, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.2551, "step": 1450 }, { "epoch": 0.009390939672860827, "grad_norm": 2.296875, "learning_rate": 2e-05, "loss": 0.2593, "step": 1460 }, { "epoch": 0.009455261177469464, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2612, "step": 1470 }, { "epoch": 0.0095195826820781, "grad_norm": 2.515625, "learning_rate": 2e-05, "loss": 0.2579, "step": 1480 }, { "epoch": 0.009583904186686735, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2564, "step": 1490 }, { "epoch": 0.009648225691295371, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.252, "step": 1500 }, { "epoch": 0.009712547195904007, "grad_norm": 2.703125, "learning_rate": 2e-05, "loss": 0.259, "step": 1510 }, { "epoch": 0.009776868700512642, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 0.2543, "step": 1520 }, { "epoch": 0.009841190205121279, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2513, "step": 1530 }, { "epoch": 0.009905511709729914, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2601, "step": 1540 }, { "epoch": 0.00996983321433855, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2475, "step": 1550 }, { "epoch": 0.010034154718947185, "grad_norm": 2.6875, "learning_rate": 2e-05, "loss": 0.2579, "step": 1560 }, { "epoch": 0.010098476223555822, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2547, "step": 1570 }, { "epoch": 0.010162797728164457, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2558, "step": 1580 }, { "epoch": 0.010227119232773092, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2532, "step": 1590 }, { "epoch": 0.01029144073738173, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.249, "step": 1600 }, { "epoch": 0.010355762241990365, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 0.2578, "step": 1610 }, { "epoch": 0.010420083746599, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 0.2536, "step": 1620 }, { "epoch": 0.010484405251207637, "grad_norm": 2.78125, "learning_rate": 2e-05, "loss": 0.2551, "step": 1630 }, { "epoch": 0.010548726755816272, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.2538, "step": 1640 }, { "epoch": 0.010613048260424907, "grad_norm": 2.5, "learning_rate": 2e-05, "loss": 0.2616, "step": 1650 }, { "epoch": 0.010677369765033544, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2421, "step": 1660 }, { "epoch": 0.01074169126964218, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 0.2531, "step": 1670 }, { "epoch": 0.010806012774250815, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 0.2531, "step": 1680 }, { "epoch": 0.010870334278859452, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2456, "step": 1690 }, { "epoch": 0.010934655783468087, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2455, "step": 1700 }, { "epoch": 0.010998977288076722, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2521, "step": 1710 }, { "epoch": 0.01106329879268536, "grad_norm": 2.625, "learning_rate": 2e-05, "loss": 0.2469, "step": 1720 }, { "epoch": 0.011127620297293995, "grad_norm": 2.421875, "learning_rate": 2e-05, "loss": 0.2522, "step": 1730 }, { "epoch": 0.01119194180190263, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2552, "step": 1740 }, { "epoch": 0.011256263306511267, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.2596, "step": 1750 }, { "epoch": 0.011320584811119902, "grad_norm": 3.984375, "learning_rate": 2e-05, "loss": 0.2493, "step": 1760 }, { "epoch": 0.011384906315728537, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2504, "step": 1770 }, { "epoch": 0.011449227820337173, "grad_norm": 2.1875, "learning_rate": 2e-05, "loss": 0.2472, "step": 1780 }, { "epoch": 0.01151354932494581, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2513, "step": 1790 }, { "epoch": 0.011577870829554445, "grad_norm": 2.796875, "learning_rate": 2e-05, "loss": 0.2456, "step": 1800 }, { "epoch": 0.01164219233416308, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 0.2467, "step": 1810 }, { "epoch": 0.011706513838771717, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2566, "step": 1820 }, { "epoch": 0.011770835343380352, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 0.2547, "step": 1830 }, { "epoch": 0.011835156847988988, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2538, "step": 1840 }, { "epoch": 0.011899478352597625, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2501, "step": 1850 }, { "epoch": 0.01196379985720626, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 0.2499, "step": 1860 }, { "epoch": 0.012028121361814895, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2383, "step": 1870 }, { "epoch": 0.012092442866423532, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.2397, "step": 1880 }, { "epoch": 0.012156764371032167, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2449, "step": 1890 }, { "epoch": 0.012221085875640803, "grad_norm": 2.59375, "learning_rate": 2e-05, "loss": 0.2503, "step": 1900 }, { "epoch": 0.01228540738024944, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2447, "step": 1910 }, { "epoch": 0.012349728884858075, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 0.2509, "step": 1920 }, { "epoch": 0.01241405038946671, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 0.2462, "step": 1930 }, { "epoch": 0.012478371894075347, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2487, "step": 1940 }, { "epoch": 0.012542693398683982, "grad_norm": 2.953125, "learning_rate": 2e-05, "loss": 0.2439, "step": 1950 }, { "epoch": 0.012607014903292618, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.2536, "step": 1960 }, { "epoch": 0.012671336407901253, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2498, "step": 1970 }, { "epoch": 0.01273565791250989, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2521, "step": 1980 }, { "epoch": 0.012799979417118525, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2522, "step": 1990 }, { "epoch": 0.01286430092172716, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.2494, "step": 2000 }, { "epoch": 0.012928622426335797, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 0.2402, "step": 2010 }, { "epoch": 0.012992943930944433, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2477, "step": 2020 }, { "epoch": 0.013057265435553068, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2429, "step": 2030 }, { "epoch": 0.013121586940161705, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.2416, "step": 2040 }, { "epoch": 0.01318590844477034, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2443, "step": 2050 }, { "epoch": 0.013250229949378975, "grad_norm": 2.953125, "learning_rate": 2e-05, "loss": 0.2457, "step": 2060 }, { "epoch": 0.013314551453987612, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2428, "step": 2070 }, { "epoch": 0.013378872958596248, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2491, "step": 2080 }, { "epoch": 0.013443194463204883, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2471, "step": 2090 }, { "epoch": 0.01350751596781352, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2398, "step": 2100 }, { "epoch": 0.013571837472422155, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2489, "step": 2110 }, { "epoch": 0.01363615897703079, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2425, "step": 2120 }, { "epoch": 0.013700480481639427, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.244, "step": 2130 }, { "epoch": 0.013764801986248063, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.24, "step": 2140 }, { "epoch": 0.013829123490856698, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2399, "step": 2150 }, { "epoch": 0.013893444995465335, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2418, "step": 2160 }, { "epoch": 0.01395776650007397, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2427, "step": 2170 }, { "epoch": 0.014022088004682605, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 0.2403, "step": 2180 }, { "epoch": 0.01408640950929124, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2457, "step": 2190 }, { "epoch": 0.014150731013899878, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.2491, "step": 2200 }, { "epoch": 0.014419093228245763, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.2406, "step": 2210 }, { "epoch": 0.014484337993984433, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2421, "step": 2220 }, { "epoch": 0.0145495827597231, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2342, "step": 2230 }, { "epoch": 0.01461482752546177, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.2452, "step": 2240 }, { "epoch": 0.014680072291200438, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2409, "step": 2250 }, { "epoch": 0.014745317056939107, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.2417, "step": 2260 }, { "epoch": 0.014810561822677776, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 0.2374, "step": 2270 }, { "epoch": 0.014875806588416444, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2435, "step": 2280 }, { "epoch": 0.014941051354155114, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2358, "step": 2290 }, { "epoch": 0.015006296119893781, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2439, "step": 2300 }, { "epoch": 0.01507154088563245, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2432, "step": 2310 }, { "epoch": 0.015136785651371118, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2418, "step": 2320 }, { "epoch": 0.015202030417109788, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.235, "step": 2330 }, { "epoch": 0.015267275182848455, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2367, "step": 2340 }, { "epoch": 0.015332519948587125, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.2425, "step": 2350 }, { "epoch": 0.015397764714325793, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 0.2417, "step": 2360 }, { "epoch": 0.015463009480064462, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2352, "step": 2370 }, { "epoch": 0.01552825424580313, "grad_norm": 2.3125, "learning_rate": 2e-05, "loss": 0.2418, "step": 2380 }, { "epoch": 0.015593499011541799, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2358, "step": 2390 }, { "epoch": 0.01565874377728047, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2424, "step": 2400 }, { "epoch": 0.015723988543019138, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2359, "step": 2410 }, { "epoch": 0.015789233308757804, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2435, "step": 2420 }, { "epoch": 0.015854478074496473, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2349, "step": 2430 }, { "epoch": 0.015919722840235143, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2382, "step": 2440 }, { "epoch": 0.015984967605973812, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 0.2419, "step": 2450 }, { "epoch": 0.016050212371712478, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2395, "step": 2460 }, { "epoch": 0.016115457137451147, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2358, "step": 2470 }, { "epoch": 0.016180701903189817, "grad_norm": 2.609375, "learning_rate": 2e-05, "loss": 0.2371, "step": 2480 }, { "epoch": 0.016245946668928486, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2401, "step": 2490 }, { "epoch": 0.016311191434667152, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.2348, "step": 2500 }, { "epoch": 0.01637643620040582, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2318, "step": 2510 }, { "epoch": 0.01644168096614449, "grad_norm": 2.1875, "learning_rate": 2e-05, "loss": 0.2375, "step": 2520 }, { "epoch": 0.01650692573188316, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2409, "step": 2530 }, { "epoch": 0.01657217049762183, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2397, "step": 2540 }, { "epoch": 0.016637415263360496, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2346, "step": 2550 }, { "epoch": 0.016702660029099165, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2462, "step": 2560 }, { "epoch": 0.016767904794837835, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2367, "step": 2570 }, { "epoch": 0.016833149560576504, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2401, "step": 2580 }, { "epoch": 0.01689839432631517, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2372, "step": 2590 }, { "epoch": 0.01696363909205384, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2372, "step": 2600 }, { "epoch": 0.01702888385779251, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.2307, "step": 2610 }, { "epoch": 0.017094128623531178, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2411, "step": 2620 }, { "epoch": 0.017159373389269844, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2332, "step": 2630 }, { "epoch": 0.017224618155008514, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2387, "step": 2640 }, { "epoch": 0.017289862920747183, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2399, "step": 2650 }, { "epoch": 0.017355107686485852, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2386, "step": 2660 }, { "epoch": 0.017420352452224522, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2388, "step": 2670 }, { "epoch": 0.017485597217963188, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2361, "step": 2680 }, { "epoch": 0.017550841983701857, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.237, "step": 2690 }, { "epoch": 0.017616086749440527, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2332, "step": 2700 }, { "epoch": 0.017739317134478426, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2319, "step": 2710 }, { "epoch": 0.017804775869291998, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2451, "step": 2720 }, { "epoch": 0.017870234604105573, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2381, "step": 2730 }, { "epoch": 0.017935693338919145, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2286, "step": 2740 }, { "epoch": 0.018001152073732717, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 0.2274, "step": 2750 }, { "epoch": 0.018066610808546293, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2359, "step": 2760 }, { "epoch": 0.018132069543359865, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2348, "step": 2770 }, { "epoch": 0.01819752827817344, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2371, "step": 2780 }, { "epoch": 0.018262987012987012, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2417, "step": 2790 }, { "epoch": 0.018328445747800588, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2347, "step": 2800 }, { "epoch": 0.01839390448261416, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2355, "step": 2810 }, { "epoch": 0.018459363217427735, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.2443, "step": 2820 }, { "epoch": 0.018524821952241307, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2333, "step": 2830 }, { "epoch": 0.01859028068705488, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.2326, "step": 2840 }, { "epoch": 0.018655739421868454, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2357, "step": 2850 }, { "epoch": 0.018721198156682026, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2298, "step": 2860 }, { "epoch": 0.018786656891495602, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 0.2286, "step": 2870 }, { "epoch": 0.018852115626309174, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.231, "step": 2880 }, { "epoch": 0.01891757436112275, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.2347, "step": 2890 }, { "epoch": 0.01898303309593632, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 0.2294, "step": 2900 }, { "epoch": 0.019048491830749897, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2298, "step": 2910 }, { "epoch": 0.01911395056556347, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2277, "step": 2920 }, { "epoch": 0.01917940930037704, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2363, "step": 2930 }, { "epoch": 0.019244868035190616, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.2303, "step": 2940 }, { "epoch": 0.019310326770004188, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.238, "step": 2950 }, { "epoch": 0.019375785504817764, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2296, "step": 2960 }, { "epoch": 0.019441244239631335, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2328, "step": 2970 }, { "epoch": 0.01950670297444491, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2331, "step": 2980 }, { "epoch": 0.019572161709258483, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.2358, "step": 2990 }, { "epoch": 0.01963762044407206, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2312, "step": 3000 }, { "epoch": 0.019741717988574728, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2357, "step": 3010 }, { "epoch": 0.0198073050915268, "grad_norm": 3.140625, "learning_rate": 2e-05, "loss": 0.2317, "step": 3020 }, { "epoch": 0.019872892194478877, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2379, "step": 3030 }, { "epoch": 0.019938479297430953, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2291, "step": 3040 }, { "epoch": 0.02000406640038303, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2318, "step": 3050 }, { "epoch": 0.020069653503335103, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2317, "step": 3060 }, { "epoch": 0.02013524060628718, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2308, "step": 3070 }, { "epoch": 0.020200827709239255, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2313, "step": 3080 }, { "epoch": 0.02026641481219133, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2299, "step": 3090 }, { "epoch": 0.020332001915143408, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2319, "step": 3100 }, { "epoch": 0.02039758901809548, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2327, "step": 3110 }, { "epoch": 0.020463176121047557, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2278, "step": 3120 }, { "epoch": 0.020528763223999633, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 0.2313, "step": 3130 }, { "epoch": 0.02059435032695171, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 0.2291, "step": 3140 }, { "epoch": 0.020659937429903782, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2295, "step": 3150 }, { "epoch": 0.02072552453285586, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2335, "step": 3160 }, { "epoch": 0.020791111635807935, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2271, "step": 3170 }, { "epoch": 0.02085669873876001, "grad_norm": 3.34375, "learning_rate": 2e-05, "loss": 0.237, "step": 3180 }, { "epoch": 0.020922285841712084, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 0.2256, "step": 3190 }, { "epoch": 0.02098787294466416, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2293, "step": 3200 }, { "epoch": 0.021053460047616237, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2291, "step": 3210 }, { "epoch": 0.021119047150568313, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2243, "step": 3220 }, { "epoch": 0.021184634253520386, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2261, "step": 3230 }, { "epoch": 0.021250221356472462, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2289, "step": 3240 }, { "epoch": 0.02131580845942454, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2386, "step": 3250 }, { "epoch": 0.021381395562376615, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2255, "step": 3260 }, { "epoch": 0.02144698266532869, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.2214, "step": 3270 }, { "epoch": 0.021512569768280764, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2278, "step": 3280 }, { "epoch": 0.02157815687123284, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2233, "step": 3290 }, { "epoch": 0.021643743974184917, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2244, "step": 3300 }, { "epoch": 0.021709331077136993, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2291, "step": 3310 }, { "epoch": 0.021774918180089066, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2297, "step": 3320 }, { "epoch": 0.021840505283041142, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2239, "step": 3330 }, { "epoch": 0.02190609238599322, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2287, "step": 3340 }, { "epoch": 0.021971679488945295, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2269, "step": 3350 }, { "epoch": 0.022037266591897368, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2356, "step": 3360 }, { "epoch": 0.022102853694849444, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.23, "step": 3370 }, { "epoch": 0.02216844079780152, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2339, "step": 3380 }, { "epoch": 0.022234027900753597, "grad_norm": 4.15625, "learning_rate": 2e-05, "loss": 0.2293, "step": 3390 }, { "epoch": 0.022299615003705673, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2296, "step": 3400 }, { "epoch": 0.022365202106657746, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2345, "step": 3410 }, { "epoch": 0.022430789209609822, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2311, "step": 3420 }, { "epoch": 0.0224963763125619, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 0.2231, "step": 3430 }, { "epoch": 0.022561963415513975, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2298, "step": 3440 }, { "epoch": 0.022627550518466048, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.2273, "step": 3450 }, { "epoch": 0.022693137621418124, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2333, "step": 3460 }, { "epoch": 0.0227587247243702, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2202, "step": 3470 }, { "epoch": 0.022824311827322277, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2343, "step": 3480 }, { "epoch": 0.02288989893027435, "grad_norm": 2.296875, "learning_rate": 2e-05, "loss": 0.2285, "step": 3490 }, { "epoch": 0.022955486033226426, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2276, "step": 3500 }, { "epoch": 0.20421224109844077, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2247, "step": 3510 }, { "epoch": 0.2047940423551315, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2226, "step": 3520 }, { "epoch": 0.2053758436118222, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2225, "step": 3530 }, { "epoch": 0.20595764486851292, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2274, "step": 3540 }, { "epoch": 0.20653944612520364, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2256, "step": 3550 }, { "epoch": 0.20712124738189436, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2218, "step": 3560 }, { "epoch": 0.20770304863858505, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2284, "step": 3570 }, { "epoch": 0.20828484989527576, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2217, "step": 3580 }, { "epoch": 0.20886665115196648, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2186, "step": 3590 }, { "epoch": 0.2094484524086572, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.225, "step": 3600 }, { "epoch": 0.21003025366534792, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2289, "step": 3610 }, { "epoch": 0.21061205492203863, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2227, "step": 3620 }, { "epoch": 0.21119385617872935, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2286, "step": 3630 }, { "epoch": 0.21177565743542007, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2192, "step": 3640 }, { "epoch": 0.21235745869211078, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.227, "step": 3650 }, { "epoch": 0.2129392599488015, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2275, "step": 3660 }, { "epoch": 0.2135210612054922, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2183, "step": 3670 }, { "epoch": 0.2141028624621829, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.2294, "step": 3680 }, { "epoch": 0.21468466371887363, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2215, "step": 3690 }, { "epoch": 0.21526646497556434, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2256, "step": 3700 }, { "epoch": 0.21584826623225506, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2344, "step": 3710 }, { "epoch": 0.21643006748894578, "grad_norm": 2.296875, "learning_rate": 2e-05, "loss": 0.2215, "step": 3720 }, { "epoch": 0.2170118687456365, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2224, "step": 3730 }, { "epoch": 0.2175936700023272, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2228, "step": 3740 }, { "epoch": 0.21817547125901793, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2241, "step": 3750 }, { "epoch": 0.21875727251570865, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.2257, "step": 3760 }, { "epoch": 0.21933907377239936, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.218, "step": 3770 }, { "epoch": 0.21992087502909005, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.2204, "step": 3780 }, { "epoch": 0.22050267628578077, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.2274, "step": 3790 }, { "epoch": 0.22108447754247149, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2224, "step": 3800 }, { "epoch": 0.2216662787991622, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2237, "step": 3810 }, { "epoch": 0.22224808005585292, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2268, "step": 3820 }, { "epoch": 0.22282988131254364, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2249, "step": 3830 }, { "epoch": 0.22341168256923435, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2334, "step": 3840 }, { "epoch": 0.22399348382592507, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2211, "step": 3850 }, { "epoch": 0.2245752850826158, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2173, "step": 3860 }, { "epoch": 0.2251570863393065, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2233, "step": 3870 }, { "epoch": 0.2257388875959972, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.2258, "step": 3880 }, { "epoch": 0.2263206888526879, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2222, "step": 3890 }, { "epoch": 0.22690249010937863, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2223, "step": 3900 }, { "epoch": 0.22748429136606935, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.2226, "step": 3910 }, { "epoch": 0.22806609262276006, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2295, "step": 3920 }, { "epoch": 0.22864789387945078, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.229, "step": 3930 }, { "epoch": 0.2292296951361415, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2219, "step": 3940 }, { "epoch": 0.22981149639283222, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 0.2253, "step": 3950 }, { "epoch": 0.23039329764952293, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2229, "step": 3960 }, { "epoch": 0.23097509890621365, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2242, "step": 3970 }, { "epoch": 0.23155690016290434, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2212, "step": 3980 }, { "epoch": 0.23213870141959506, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2234, "step": 3990 }, { "epoch": 0.23272050267628577, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2216, "step": 4000 }, { "epoch": 0.2333023039329765, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2213, "step": 4010 }, { "epoch": 0.2338841051896672, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2243, "step": 4020 }, { "epoch": 0.23446590644635792, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 0.2163, "step": 4030 }, { "epoch": 0.23504770770304864, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2256, "step": 4040 }, { "epoch": 0.23562950895973936, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2178, "step": 4050 }, { "epoch": 0.23621131021643008, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2191, "step": 4060 }, { "epoch": 0.2367931114731208, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2236, "step": 4070 }, { "epoch": 0.2373749127298115, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2297, "step": 4080 }, { "epoch": 0.2379567139865022, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2242, "step": 4090 }, { "epoch": 0.23853851524319292, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.2249, "step": 4100 }, { "epoch": 0.23912031649988363, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2225, "step": 4110 }, { "epoch": 0.23970211775657435, "grad_norm": 1.984375, "learning_rate": 2e-05, "loss": 0.2232, "step": 4120 }, { "epoch": 0.24028391901326507, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2235, "step": 4130 }, { "epoch": 0.24086572026995579, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.2183, "step": 4140 }, { "epoch": 0.2414475215266465, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2216, "step": 4150 }, { "epoch": 0.24202932278333722, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2218, "step": 4160 }, { "epoch": 0.24261112404002794, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2167, "step": 4170 }, { "epoch": 0.24319292529671865, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2258, "step": 4180 }, { "epoch": 0.24377472655340934, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2174, "step": 4190 }, { "epoch": 0.24435652781010006, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2179, "step": 4200 }, { "epoch": 0.24493832906679078, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.2234, "step": 4210 }, { "epoch": 0.2455201303234815, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.2168, "step": 4220 }, { "epoch": 0.2461019315801722, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2173, "step": 4230 }, { "epoch": 0.24668373283686293, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2187, "step": 4240 }, { "epoch": 0.24726553409355365, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.2195, "step": 4250 }, { "epoch": 0.24784733535024436, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2169, "step": 4260 }, { "epoch": 0.24842913660693508, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.219, "step": 4270 }, { "epoch": 0.2490109378636258, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 0.2176, "step": 4280 }, { "epoch": 0.2495927391203165, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.2157, "step": 4290 }, { "epoch": 0.2501745403770072, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2242, "step": 4300 }, { "epoch": 0.2507563416336979, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2243, "step": 4310 }, { "epoch": 0.25133814289038864, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2192, "step": 4320 }, { "epoch": 0.25191994414707936, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2216, "step": 4330 }, { "epoch": 0.2525017454037701, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2163, "step": 4340 }, { "epoch": 0.2530835466604608, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2249, "step": 4350 }, { "epoch": 0.2536653479171515, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 0.2249, "step": 4360 }, { "epoch": 0.2542471491738422, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2227, "step": 4370 }, { "epoch": 0.25482895043053294, "grad_norm": 3.953125, "learning_rate": 2e-05, "loss": 0.2163, "step": 4380 }, { "epoch": 0.25541075168722366, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2168, "step": 4390 }, { "epoch": 0.2559925529439144, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2164, "step": 4400 }, { "epoch": 0.2565743542006051, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.213, "step": 4410 }, { "epoch": 0.2571561554572958, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.2203, "step": 4420 }, { "epoch": 0.2577379567139865, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2154, "step": 4430 }, { "epoch": 0.25831975797067724, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2214, "step": 4440 }, { "epoch": 0.2589015592273679, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 0.2136, "step": 4450 }, { "epoch": 0.2594833604840586, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2204, "step": 4460 }, { "epoch": 0.26006516174074934, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.219, "step": 4470 }, { "epoch": 0.26064696299744006, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.2112, "step": 4480 }, { "epoch": 0.2612287642541308, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2184, "step": 4490 }, { "epoch": 0.2618105655108215, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2219, "step": 4500 }, { "epoch": 0.2623923667675122, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2205, "step": 4510 }, { "epoch": 0.2629741680242029, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.216, "step": 4520 }, { "epoch": 0.26355596928089364, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2275, "step": 4530 }, { "epoch": 0.26413777053758436, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.2199, "step": 4540 }, { "epoch": 0.2647195717942751, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2193, "step": 4550 }, { "epoch": 0.2653013730509658, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2123, "step": 4560 }, { "epoch": 0.2658831743076565, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.2244, "step": 4570 }, { "epoch": 0.26646497556434723, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2165, "step": 4580 }, { "epoch": 0.26704677682103795, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2167, "step": 4590 }, { "epoch": 0.26762857807772866, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2146, "step": 4600 }, { "epoch": 0.2682103793344194, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2166, "step": 4610 }, { "epoch": 0.2687921805911101, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2148, "step": 4620 }, { "epoch": 0.2693739818478008, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.2174, "step": 4630 }, { "epoch": 0.26995578310449153, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2229, "step": 4640 }, { "epoch": 0.2705375843611822, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2139, "step": 4650 }, { "epoch": 0.2711193856178729, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.213, "step": 4660 }, { "epoch": 0.2717011868745636, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2109, "step": 4670 }, { "epoch": 0.27228298813125434, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2168, "step": 4680 }, { "epoch": 0.27286478938794506, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.2185, "step": 4690 }, { "epoch": 0.2734465906446358, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2209, "step": 4700 }, { "epoch": 0.2740283919013265, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2132, "step": 4710 }, { "epoch": 0.2746101931580172, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2109, "step": 4720 }, { "epoch": 0.27519199441470793, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2212, "step": 4730 }, { "epoch": 0.27577379567139865, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2227, "step": 4740 }, { "epoch": 0.27635559692808936, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.2187, "step": 4750 }, { "epoch": 0.2769373981847801, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2176, "step": 4760 }, { "epoch": 0.2775191994414708, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.217, "step": 4770 }, { "epoch": 0.2781010006981615, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.2158, "step": 4780 }, { "epoch": 0.27868280195485223, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2147, "step": 4790 }, { "epoch": 0.27926460321154295, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.221, "step": 4800 }, { "epoch": 0.27984640446823367, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2137, "step": 4810 }, { "epoch": 0.2804282057249244, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2216, "step": 4820 }, { "epoch": 0.2810100069816151, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2122, "step": 4830 }, { "epoch": 0.2815918082383058, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2156, "step": 4840 }, { "epoch": 0.28217360949499654, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2129, "step": 4850 }, { "epoch": 0.2827554107516872, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 0.2167, "step": 4860 }, { "epoch": 0.2833372120083779, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.2152, "step": 4870 }, { "epoch": 0.28391901326506863, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2119, "step": 4880 }, { "epoch": 0.28450081452175935, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.2137, "step": 4890 }, { "epoch": 0.28508261577845007, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.22, "step": 4900 }, { "epoch": 0.2856644170351408, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2184, "step": 4910 }, { "epoch": 0.2862462182918315, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.2243, "step": 4920 }, { "epoch": 0.2868280195485222, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.2173, "step": 4930 }, { "epoch": 0.28740982080521293, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.2135, "step": 4940 }, { "epoch": 0.28799162206190365, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2184, "step": 4950 }, { "epoch": 0.28857342331859437, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2161, "step": 4960 }, { "epoch": 0.2891552245752851, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2181, "step": 4970 }, { "epoch": 0.2897370258319758, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 0.2139, "step": 4980 }, { "epoch": 0.2903188270886665, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.2135, "step": 4990 }, { "epoch": 0.29090062834535724, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2145, "step": 5000 }, { "epoch": 0.29148242960204795, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.215, "step": 5010 }, { "epoch": 0.29206423085873867, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2147, "step": 5020 }, { "epoch": 0.2926460321154294, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2142, "step": 5030 }, { "epoch": 0.2932278333721201, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2148, "step": 5040 }, { "epoch": 0.2938096346288108, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2099, "step": 5050 }, { "epoch": 0.29439143588550154, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2139, "step": 5060 }, { "epoch": 0.2949732371421922, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.213, "step": 5070 }, { "epoch": 0.2955550383988829, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2142, "step": 5080 }, { "epoch": 0.29613683965557364, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.216, "step": 5090 }, { "epoch": 0.29671864091226435, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2194, "step": 5100 }, { "epoch": 0.29730044216895507, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2099, "step": 5110 }, { "epoch": 0.2978822434256458, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.216, "step": 5120 }, { "epoch": 0.2984640446823365, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2113, "step": 5130 }, { "epoch": 0.2990458459390272, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2155, "step": 5140 }, { "epoch": 0.29962764719571794, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2083, "step": 5150 }, { "epoch": 0.30020944845240866, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2179, "step": 5160 }, { "epoch": 0.3007912497090994, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2137, "step": 5170 }, { "epoch": 0.3013730509657901, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2129, "step": 5180 }, { "epoch": 0.3019548522224808, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2079, "step": 5190 }, { "epoch": 0.3025366534791715, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2175, "step": 5200 }, { "epoch": 0.30311845473586224, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2186, "step": 5210 }, { "epoch": 0.30370025599255296, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.2091, "step": 5220 }, { "epoch": 0.3042820572492437, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2132, "step": 5230 }, { "epoch": 0.3048638585059344, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.2139, "step": 5240 }, { "epoch": 0.3054456597626251, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2201, "step": 5250 }, { "epoch": 0.3060274610193158, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.2159, "step": 5260 }, { "epoch": 0.3066092622760065, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.2146, "step": 5270 }, { "epoch": 0.3071910635326972, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2075, "step": 5280 }, { "epoch": 0.3077728647893879, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.215, "step": 5290 }, { "epoch": 0.30835466604607864, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.2197, "step": 5300 }, { "epoch": 0.30893646730276936, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2151, "step": 5310 }, { "epoch": 0.3095182685594601, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.2208, "step": 5320 }, { "epoch": 0.3101000698161508, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2121, "step": 5330 }, { "epoch": 0.3106818710728415, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2063, "step": 5340 }, { "epoch": 0.3112636723295322, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2118, "step": 5350 }, { "epoch": 0.31184547358622294, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 0.2156, "step": 5360 }, { "epoch": 0.31242727484291366, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2199, "step": 5370 }, { "epoch": 0.3130090760996044, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.211, "step": 5380 }, { "epoch": 0.3135908773562951, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2158, "step": 5390 }, { "epoch": 0.3141726786129858, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2078, "step": 5400 }, { "epoch": 0.31475447986967653, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2168, "step": 5410 }, { "epoch": 0.31533628112636725, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2191, "step": 5420 }, { "epoch": 0.31591808238305796, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2117, "step": 5430 }, { "epoch": 0.3164998836397487, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2034, "step": 5440 }, { "epoch": 0.3170816848964394, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 0.2146, "step": 5450 }, { "epoch": 0.3176634861531301, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 0.2086, "step": 5460 }, { "epoch": 0.31824528740982083, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2155, "step": 5470 }, { "epoch": 0.3188270886665115, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2146, "step": 5480 }, { "epoch": 0.3194088899232022, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2133, "step": 5490 }, { "epoch": 0.3199906911798929, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2191, "step": 5500 }, { "epoch": 0.32057249243658364, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 0.2131, "step": 5510 }, { "epoch": 0.32115429369327436, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.214, "step": 5520 }, { "epoch": 0.3217360949499651, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2091, "step": 5530 }, { "epoch": 0.3223178962066558, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.2077, "step": 5540 }, { "epoch": 0.3228996974633465, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2137, "step": 5550 }, { "epoch": 0.32348149872003723, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 0.2149, "step": 5560 }, { "epoch": 0.32406329997672795, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2111, "step": 5570 }, { "epoch": 0.32464510123341866, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2136, "step": 5580 }, { "epoch": 0.3252269024901094, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2099, "step": 5590 }, { "epoch": 0.3258087037468001, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 0.2114, "step": 5600 }, { "epoch": 0.3263905050034908, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.2152, "step": 5610 }, { "epoch": 0.32697230626018153, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2153, "step": 5620 }, { "epoch": 0.32755410751687225, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.207, "step": 5630 }, { "epoch": 0.32813590877356297, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 0.215, "step": 5640 }, { "epoch": 0.3287177100302537, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.2084, "step": 5650 }, { "epoch": 0.3292995112869444, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.217, "step": 5660 }, { "epoch": 0.3298813125436351, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2094, "step": 5670 }, { "epoch": 0.33046311380032584, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.2118, "step": 5680 }, { "epoch": 0.3310449150570165, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2092, "step": 5690 }, { "epoch": 0.3316267163137072, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2086, "step": 5700 }, { "epoch": 0.33220851757039793, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.2126, "step": 5710 }, { "epoch": 0.33279031882708865, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2143, "step": 5720 }, { "epoch": 0.33337212008377937, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2077, "step": 5730 }, { "epoch": 0.3339539213404701, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2101, "step": 5740 }, { "epoch": 0.3345357225971608, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2088, "step": 5750 }, { "epoch": 0.3351175238538515, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2186, "step": 5760 }, { "epoch": 0.33569932511054223, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 0.2052, "step": 5770 }, { "epoch": 0.33628112636723295, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2074, "step": 5780 }, { "epoch": 0.33686292762392367, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2087, "step": 5790 }, { "epoch": 0.3374447288806144, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.2156, "step": 5800 }, { "epoch": 0.3380265301373051, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2118, "step": 5810 }, { "epoch": 0.3386083313939958, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.2093, "step": 5820 }, { "epoch": 0.33919013265068654, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2071, "step": 5830 }, { "epoch": 0.33977193390737725, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2179, "step": 5840 }, { "epoch": 0.34035373516406797, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2113, "step": 5850 }, { "epoch": 0.3409355364207587, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2038, "step": 5860 }, { "epoch": 0.3415173376774494, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2165, "step": 5870 }, { "epoch": 0.3420991389341401, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2154, "step": 5880 }, { "epoch": 0.3426809401908308, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.2146, "step": 5890 }, { "epoch": 0.3432627414475215, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2077, "step": 5900 }, { "epoch": 0.3438445427042122, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.2105, "step": 5910 }, { "epoch": 0.34442634396090294, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.2076, "step": 5920 }, { "epoch": 0.34500814521759365, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.2107, "step": 5930 }, { "epoch": 0.34558994647428437, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.209, "step": 5940 }, { "epoch": 0.3461717477309751, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 0.2101, "step": 5950 }, { "epoch": 0.3467535489876658, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2137, "step": 5960 }, { "epoch": 0.3473353502443565, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2067, "step": 5970 }, { "epoch": 0.34791715150104724, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2203, "step": 5980 }, { "epoch": 0.34849895275773796, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2115, "step": 5990 }, { "epoch": 0.3490807540144287, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.21, "step": 6000 }, { "epoch": 0.3496625552711194, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 0.2184, "step": 6010 }, { "epoch": 0.3502443565278101, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2033, "step": 6020 }, { "epoch": 0.3508261577845008, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.2103, "step": 6030 }, { "epoch": 0.35140795904119154, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.2085, "step": 6040 }, { "epoch": 0.35198976029788226, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.213, "step": 6050 }, { "epoch": 0.352571561554573, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2067, "step": 6060 }, { "epoch": 0.3531533628112637, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2107, "step": 6070 }, { "epoch": 0.3537351640679544, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2057, "step": 6080 }, { "epoch": 0.35431696532464513, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.2135, "step": 6090 }, { "epoch": 0.3548987665813358, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2066, "step": 6100 }, { "epoch": 0.3554805678380265, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2162, "step": 6110 }, { "epoch": 0.3560623690947172, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2062, "step": 6120 }, { "epoch": 0.35664417035140794, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2115, "step": 6130 }, { "epoch": 0.35722597160809866, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2108, "step": 6140 }, { "epoch": 0.3578077728647894, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 0.2096, "step": 6150 }, { "epoch": 0.3583895741214801, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.207, "step": 6160 }, { "epoch": 0.3589713753781708, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2071, "step": 6170 }, { "epoch": 0.3595531766348615, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2109, "step": 6180 }, { "epoch": 0.36013497789155224, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2118, "step": 6190 }, { "epoch": 0.36071677914824296, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2092, "step": 6200 }, { "epoch": 0.3612985804049337, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2032, "step": 6210 }, { "epoch": 0.3618803816616244, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2169, "step": 6220 }, { "epoch": 0.3624621829183151, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.2051, "step": 6230 }, { "epoch": 0.36304398417500583, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2112, "step": 6240 }, { "epoch": 0.36362578543169655, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2064, "step": 6250 }, { "epoch": 0.36420758668838726, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2055, "step": 6260 }, { "epoch": 0.364789387945078, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2018, "step": 6270 }, { "epoch": 0.3653711892017687, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2051, "step": 6280 }, { "epoch": 0.3659529904584594, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2094, "step": 6290 }, { "epoch": 0.36653479171515013, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2119, "step": 6300 }, { "epoch": 0.3671165929718408, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2134, "step": 6310 }, { "epoch": 0.3676983942285315, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.2078, "step": 6320 }, { "epoch": 0.36828019548522223, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.2012, "step": 6330 }, { "epoch": 0.36886199674191295, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.2051, "step": 6340 }, { "epoch": 0.36944379799860366, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 0.2086, "step": 6350 }, { "epoch": 0.3700255992552944, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.2118, "step": 6360 }, { "epoch": 0.3706074005119851, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.2086, "step": 6370 }, { "epoch": 0.3711892017686758, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2185, "step": 6380 }, { "epoch": 0.37177100302536653, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2094, "step": 6390 }, { "epoch": 0.37235280428205725, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2116, "step": 6400 }, { "epoch": 0.37293460553874797, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.2097, "step": 6410 }, { "epoch": 0.3735164067954387, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2116, "step": 6420 }, { "epoch": 0.3740982080521294, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2068, "step": 6430 }, { "epoch": 0.3746800093088201, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2094, "step": 6440 }, { "epoch": 0.37526181056551083, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2103, "step": 6450 }, { "epoch": 0.37584361182220155, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.201, "step": 6460 }, { "epoch": 0.37642541307889227, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2099, "step": 6470 }, { "epoch": 0.377007214335583, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2075, "step": 6480 }, { "epoch": 0.3775890155922737, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.2045, "step": 6490 }, { "epoch": 0.3781708168489644, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2085, "step": 6500 }, { "epoch": 0.3787526181056551, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2156, "step": 6510 }, { "epoch": 0.3793344193623458, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2152, "step": 6520 }, { "epoch": 0.3799162206190365, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 0.2085, "step": 6530 }, { "epoch": 0.38049802187572723, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2068, "step": 6540 }, { "epoch": 0.38107982313241795, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2148, "step": 6550 }, { "epoch": 0.38166162438910867, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2052, "step": 6560 }, { "epoch": 0.3822434256457994, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.2098, "step": 6570 }, { "epoch": 0.3828252269024901, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.2048, "step": 6580 }, { "epoch": 0.3834070281591808, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2031, "step": 6590 }, { "epoch": 0.38398882941587154, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2028, "step": 6600 }, { "epoch": 0.38457063067256225, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2072, "step": 6610 }, { "epoch": 0.38515243192925297, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2026, "step": 6620 }, { "epoch": 0.3857342331859437, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2031, "step": 6630 }, { "epoch": 0.3863160344426344, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2104, "step": 6640 }, { "epoch": 0.3868978356993251, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.2065, "step": 6650 }, { "epoch": 0.38747963695601584, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2067, "step": 6660 }, { "epoch": 0.38806143821270656, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.2109, "step": 6670 }, { "epoch": 0.3886432394693973, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2045, "step": 6680 }, { "epoch": 0.389225040726088, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2026, "step": 6690 }, { "epoch": 0.3898068419827787, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2036, "step": 6700 }, { "epoch": 0.3903886432394694, "grad_norm": 1.984375, "learning_rate": 2e-05, "loss": 0.2152, "step": 6710 }, { "epoch": 0.3909704444961601, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 0.2053, "step": 6720 }, { "epoch": 0.3915522457528508, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.21, "step": 6730 }, { "epoch": 0.3921340470095415, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2068, "step": 6740 }, { "epoch": 0.39271584826623224, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.2063, "step": 6750 }, { "epoch": 0.39329764952292295, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2093, "step": 6760 }, { "epoch": 0.39387945077961367, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2043, "step": 6770 }, { "epoch": 0.3944612520363044, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 0.211, "step": 6780 }, { "epoch": 0.3950430532929951, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2111, "step": 6790 }, { "epoch": 0.3956248545496858, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2022, "step": 6800 }, { "epoch": 0.39620665580637654, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2028, "step": 6810 }, { "epoch": 0.39678845706306726, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2143, "step": 6820 }, { "epoch": 0.397370258319758, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.2124, "step": 6830 }, { "epoch": 0.3979520595764487, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.2185, "step": 6840 }, { "epoch": 0.3985338608331394, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.2118, "step": 6850 }, { "epoch": 0.3991156620898301, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.2091, "step": 6860 }, { "epoch": 0.39969746334652084, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2086, "step": 6870 }, { "epoch": 0.40027926460321156, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2025, "step": 6880 }, { "epoch": 0.4008610658599023, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2035, "step": 6890 }, { "epoch": 0.401442867116593, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2086, "step": 6900 }, { "epoch": 0.4020246683732837, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.2096, "step": 6910 }, { "epoch": 0.40260646962997443, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.207, "step": 6920 }, { "epoch": 0.4031882708866651, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2113, "step": 6930 }, { "epoch": 0.4037700721433558, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.2077, "step": 6940 }, { "epoch": 0.4043518734000465, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.1999, "step": 6950 }, { "epoch": 0.40493367465673724, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2014, "step": 6960 }, { "epoch": 0.40551547591342796, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2058, "step": 6970 }, { "epoch": 0.4060972771701187, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.209, "step": 6980 }, { "epoch": 0.4066790784268094, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2103, "step": 6990 }, { "epoch": 0.4072608796835001, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2117, "step": 7000 }, { "epoch": 0.4078426809401908, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.2055, "step": 7010 }, { "epoch": 0.40842448219688154, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2129, "step": 7020 }, { "epoch": 0.40900628345357226, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.2038, "step": 7030 }, { "epoch": 0.409588084710263, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2028, "step": 7040 }, { "epoch": 0.4101698859669537, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 0.2085, "step": 7050 }, { "epoch": 0.4107516872236444, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2076, "step": 7060 }, { "epoch": 0.41133348848033513, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2047, "step": 7070 }, { "epoch": 0.41191528973702585, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2079, "step": 7080 }, { "epoch": 0.41249709099371656, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2016, "step": 7090 }, { "epoch": 0.4130788922504073, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2084, "step": 7100 }, { "epoch": 0.413660693507098, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2076, "step": 7110 }, { "epoch": 0.4142424947637887, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2084, "step": 7120 }, { "epoch": 0.4148242960204794, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2003, "step": 7130 }, { "epoch": 0.4154060972771701, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2059, "step": 7140 }, { "epoch": 0.4159878985338608, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.2067, "step": 7150 }, { "epoch": 0.41656969979055153, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2028, "step": 7160 }, { "epoch": 0.41715150104724225, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.2015, "step": 7170 }, { "epoch": 0.41773330230393296, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2007, "step": 7180 }, { "epoch": 0.4183151035606237, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.2061, "step": 7190 }, { "epoch": 0.4188969048173144, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.2026, "step": 7200 }, { "epoch": 0.4194787060740051, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2088, "step": 7210 }, { "epoch": 0.42006050733069583, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2037, "step": 7220 }, { "epoch": 0.42064230858738655, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 0.2063, "step": 7230 }, { "epoch": 0.42122410984407727, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2086, "step": 7240 }, { "epoch": 0.421805911100768, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2048, "step": 7250 }, { "epoch": 0.4223877123574587, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.2002, "step": 7260 }, { "epoch": 0.4229695136141494, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1988, "step": 7270 }, { "epoch": 0.42355131487084013, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2082, "step": 7280 }, { "epoch": 0.42413311612753085, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2086, "step": 7290 }, { "epoch": 0.42471491738422157, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1968, "step": 7300 }, { "epoch": 0.4252967186409123, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1984, "step": 7310 }, { "epoch": 0.425878519897603, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 0.2068, "step": 7320 }, { "epoch": 0.4264603211542937, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.2029, "step": 7330 }, { "epoch": 0.4270421224109844, "grad_norm": 2.40625, "learning_rate": 2e-05, "loss": 0.2093, "step": 7340 }, { "epoch": 0.4276239236676751, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2055, "step": 7350 }, { "epoch": 0.4282057249243658, "grad_norm": 3.625, "learning_rate": 2e-05, "loss": 0.2018, "step": 7360 }, { "epoch": 0.42878752618105653, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2011, "step": 7370 }, { "epoch": 0.42936932743774725, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2059, "step": 7380 }, { "epoch": 0.42995112869443797, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2051, "step": 7390 }, { "epoch": 0.4305329299511287, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1991, "step": 7400 }, { "epoch": 0.4311147312078194, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2114, "step": 7410 }, { "epoch": 0.4316965324645101, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1999, "step": 7420 }, { "epoch": 0.43227833372120084, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.2031, "step": 7430 }, { "epoch": 0.43286013497789155, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.2055, "step": 7440 }, { "epoch": 0.43344193623458227, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2042, "step": 7450 }, { "epoch": 0.434023737491273, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2036, "step": 7460 }, { "epoch": 0.4346055387479637, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2049, "step": 7470 }, { "epoch": 0.4351873400046544, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2002, "step": 7480 }, { "epoch": 0.43576914126134514, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2091, "step": 7490 }, { "epoch": 0.43635094251803586, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 0.2114, "step": 7500 }, { "epoch": 0.4369327437747266, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2053, "step": 7510 }, { "epoch": 0.4375145450314173, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.1966, "step": 7520 }, { "epoch": 0.438096346288108, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1968, "step": 7530 }, { "epoch": 0.4386781475447987, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 0.2038, "step": 7540 }, { "epoch": 0.4392599488014894, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.2028, "step": 7550 }, { "epoch": 0.4398417500581801, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2027, "step": 7560 }, { "epoch": 0.4404235513148708, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.201, "step": 7570 }, { "epoch": 0.44100535257156154, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2057, "step": 7580 }, { "epoch": 0.44158715382825225, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2059, "step": 7590 }, { "epoch": 0.44216895508494297, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.1953, "step": 7600 }, { "epoch": 0.4427507563416337, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2039, "step": 7610 }, { "epoch": 0.4433325575983244, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2079, "step": 7620 }, { "epoch": 0.4439143588550151, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.202, "step": 7630 }, { "epoch": 0.44449616011170584, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2022, "step": 7640 }, { "epoch": 0.44507796136839656, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2013, "step": 7650 }, { "epoch": 0.4456597626250873, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 0.2021, "step": 7660 }, { "epoch": 0.446241563881778, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.206, "step": 7670 }, { "epoch": 0.4468233651384687, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.2018, "step": 7680 }, { "epoch": 0.4474051663951594, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1979, "step": 7690 }, { "epoch": 0.44798696765185014, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 0.2063, "step": 7700 }, { "epoch": 0.44856876890854086, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2001, "step": 7710 }, { "epoch": 0.4491505701652316, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.2038, "step": 7720 }, { "epoch": 0.4497323714219223, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2037, "step": 7730 }, { "epoch": 0.450314172678613, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.209, "step": 7740 }, { "epoch": 0.4508959739353037, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.2001, "step": 7750 }, { "epoch": 0.4514777751919944, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2054, "step": 7760 }, { "epoch": 0.4520595764486851, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.204, "step": 7770 }, { "epoch": 0.4526413777053758, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2025, "step": 7780 }, { "epoch": 0.45322317896206654, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.1992, "step": 7790 }, { "epoch": 0.45380498021875726, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 0.1998, "step": 7800 }, { "epoch": 0.454386781475448, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.2055, "step": 7810 }, { "epoch": 0.4549685827321387, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.206, "step": 7820 }, { "epoch": 0.4555503839888294, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2024, "step": 7830 }, { "epoch": 0.4561321852455201, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1985, "step": 7840 }, { "epoch": 0.45671398650221084, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2086, "step": 7850 }, { "epoch": 0.45729578775890156, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2052, "step": 7860 }, { "epoch": 0.4578775890155923, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.204, "step": 7870 }, { "epoch": 0.458459390272283, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2028, "step": 7880 }, { "epoch": 0.4590411915289737, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1982, "step": 7890 }, { "epoch": 0.45962299278566443, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1986, "step": 7900 }, { "epoch": 0.46020479404235515, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.1971, "step": 7910 }, { "epoch": 0.46078659529904586, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.2011, "step": 7920 }, { "epoch": 0.4613683965557366, "grad_norm": 2.40625, "learning_rate": 2e-05, "loss": 0.207, "step": 7930 }, { "epoch": 0.4619501978124273, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.2032, "step": 7940 }, { "epoch": 0.462531999069118, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2053, "step": 7950 }, { "epoch": 0.4631138003258087, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.1991, "step": 7960 }, { "epoch": 0.4636956015824994, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2059, "step": 7970 }, { "epoch": 0.4642774028391901, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1986, "step": 7980 }, { "epoch": 0.46485920409588083, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2033, "step": 7990 }, { "epoch": 0.46544100535257155, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.2005, "step": 8000 }, { "epoch": 0.46602280660926226, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2101, "step": 8010 }, { "epoch": 0.466604607865953, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1976, "step": 8020 }, { "epoch": 0.4671864091226437, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.1992, "step": 8030 }, { "epoch": 0.4677682103793344, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2069, "step": 8040 }, { "epoch": 0.46835001163602513, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2055, "step": 8050 }, { "epoch": 0.46893181289271585, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.203, "step": 8060 }, { "epoch": 0.46951361414940657, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.2023, "step": 8070 }, { "epoch": 0.4700954154060973, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.2084, "step": 8080 }, { "epoch": 0.470677216662788, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2094, "step": 8090 }, { "epoch": 0.4712590179194787, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 0.202, "step": 8100 }, { "epoch": 0.47184081917616943, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 0.1977, "step": 8110 }, { "epoch": 0.47242262043286015, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2017, "step": 8120 }, { "epoch": 0.47300442168955087, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.193, "step": 8130 }, { "epoch": 0.4735862229462416, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2109, "step": 8140 }, { "epoch": 0.4741680242029323, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 0.1937, "step": 8150 }, { "epoch": 0.474749825459623, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2015, "step": 8160 }, { "epoch": 0.4753316267163137, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.2024, "step": 8170 }, { "epoch": 0.4759134279730044, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2028, "step": 8180 }, { "epoch": 0.4764952292296951, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 0.1949, "step": 8190 }, { "epoch": 0.47707703048638583, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 0.2067, "step": 8200 }, { "epoch": 0.47765883174307655, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1973, "step": 8210 }, { "epoch": 0.47824063299976727, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.197, "step": 8220 }, { "epoch": 0.478822434256458, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1956, "step": 8230 }, { "epoch": 0.4794042355131487, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1982, "step": 8240 }, { "epoch": 0.4799860367698394, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 0.1975, "step": 8250 }, { "epoch": 0.48056783802653014, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1969, "step": 8260 }, { "epoch": 0.48114963928322085, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2013, "step": 8270 }, { "epoch": 0.48173144053991157, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1965, "step": 8280 }, { "epoch": 0.4823132417966023, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2035, "step": 8290 }, { "epoch": 0.482895043053293, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1969, "step": 8300 }, { "epoch": 0.4834768443099837, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1949, "step": 8310 }, { "epoch": 0.48405864556667444, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1987, "step": 8320 }, { "epoch": 0.48464044682336516, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.2047, "step": 8330 }, { "epoch": 0.4852222480800559, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.2075, "step": 8340 }, { "epoch": 0.4858040493367466, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.2021, "step": 8350 }, { "epoch": 0.4863858505934373, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2014, "step": 8360 }, { "epoch": 0.48696765185012797, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.2019, "step": 8370 }, { "epoch": 0.4875494531068187, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.2022, "step": 8380 }, { "epoch": 0.4881312543635094, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2055, "step": 8390 }, { "epoch": 0.4887130556202001, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2015, "step": 8400 }, { "epoch": 0.48929485687689084, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.2027, "step": 8410 }, { "epoch": 0.48987665813358156, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2009, "step": 8420 }, { "epoch": 0.49045845939027227, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.2031, "step": 8430 }, { "epoch": 0.491040260646963, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.2014, "step": 8440 }, { "epoch": 0.4916220619036537, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.2051, "step": 8450 }, { "epoch": 0.4922038631603444, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.2, "step": 8460 }, { "epoch": 0.49278566441703514, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.1986, "step": 8470 }, { "epoch": 0.49336746567372586, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.1997, "step": 8480 }, { "epoch": 0.4939492669304166, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.2012, "step": 8490 }, { "epoch": 0.4945310681871073, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 0.1993, "step": 8500 }, { "epoch": 0.495112869443798, "grad_norm": 3.03125, "learning_rate": 2e-05, "loss": 0.2046, "step": 8510 }, { "epoch": 0.4956946707004887, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2006, "step": 8520 }, { "epoch": 0.49627647195717944, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2005, "step": 8530 }, { "epoch": 0.49685827321387016, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2065, "step": 8540 }, { "epoch": 0.4974400744705609, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.2061, "step": 8550 }, { "epoch": 0.4980218757272516, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.2032, "step": 8560 }, { "epoch": 0.4986036769839423, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.2103, "step": 8570 }, { "epoch": 0.499185478240633, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1934, "step": 8580 }, { "epoch": 0.4997672794973237, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 0.203, "step": 8590 }, { "epoch": 0.5003490807540144, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.2059, "step": 8600 }, { "epoch": 0.5009308820107051, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1978, "step": 8610 }, { "epoch": 0.5015126832673958, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1987, "step": 8620 }, { "epoch": 0.5020944845240866, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1923, "step": 8630 }, { "epoch": 0.5026762857807773, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1964, "step": 8640 }, { "epoch": 0.503258087037468, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.2076, "step": 8650 }, { "epoch": 0.5038398882941587, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.2042, "step": 8660 }, { "epoch": 0.5044216895508494, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2003, "step": 8670 }, { "epoch": 0.5050034908075401, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1981, "step": 8680 }, { "epoch": 0.5055852920642309, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1957, "step": 8690 }, { "epoch": 0.5061670933209216, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2042, "step": 8700 }, { "epoch": 0.5067488945776123, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1998, "step": 8710 }, { "epoch": 0.507330695834303, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.2107, "step": 8720 }, { "epoch": 0.5079124970909937, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.199, "step": 8730 }, { "epoch": 0.5084942983476844, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2059, "step": 8740 }, { "epoch": 0.5090760996043752, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.2057, "step": 8750 }, { "epoch": 0.5096579008610659, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.198, "step": 8760 }, { "epoch": 0.5102397021177566, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1962, "step": 8770 }, { "epoch": 0.5108215033744473, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.199, "step": 8780 }, { "epoch": 0.511403304631138, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2023, "step": 8790 }, { "epoch": 0.5119851058878288, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.2043, "step": 8800 }, { "epoch": 0.5125669071445195, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1959, "step": 8810 }, { "epoch": 0.5131487084012102, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.2019, "step": 8820 }, { "epoch": 0.5137305096579009, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2009, "step": 8830 }, { "epoch": 0.5143123109145916, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2055, "step": 8840 }, { "epoch": 0.5148941121712823, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 0.2037, "step": 8850 }, { "epoch": 0.515475913427973, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1999, "step": 8860 }, { "epoch": 0.5160577146846638, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1957, "step": 8870 }, { "epoch": 0.5166395159413545, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.2015, "step": 8880 }, { "epoch": 0.5172213171980451, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2038, "step": 8890 }, { "epoch": 0.5178031184547358, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 0.1999, "step": 8900 }, { "epoch": 0.5183849197114265, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1965, "step": 8910 }, { "epoch": 0.5189667209681172, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1953, "step": 8920 }, { "epoch": 0.519548522224808, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.2033, "step": 8930 }, { "epoch": 0.5201303234814987, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1939, "step": 8940 }, { "epoch": 0.5207121247381894, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.2023, "step": 8950 }, { "epoch": 0.5212939259948801, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2116, "step": 8960 }, { "epoch": 0.5218757272515708, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1955, "step": 8970 }, { "epoch": 0.5224575285082615, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1979, "step": 8980 }, { "epoch": 0.5230393297649523, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.2002, "step": 8990 }, { "epoch": 0.523621131021643, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1995, "step": 9000 }, { "epoch": 0.5242029322783337, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.202, "step": 9010 }, { "epoch": 0.5247847335350244, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.1997, "step": 9020 }, { "epoch": 0.5253665347917151, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1984, "step": 9030 }, { "epoch": 0.5259483360484059, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.1953, "step": 9040 }, { "epoch": 0.5265301373050966, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1966, "step": 9050 }, { "epoch": 0.5271119385617873, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.2021, "step": 9060 }, { "epoch": 0.527693739818478, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2032, "step": 9070 }, { "epoch": 0.5282755410751687, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.1982, "step": 9080 }, { "epoch": 0.5288573423318594, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1945, "step": 9090 }, { "epoch": 0.5294391435885502, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.1889, "step": 9100 }, { "epoch": 0.5300209448452409, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.204, "step": 9110 }, { "epoch": 0.5306027461019316, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.2003, "step": 9120 }, { "epoch": 0.5311845473586223, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.2015, "step": 9130 }, { "epoch": 0.531766348615313, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.2016, "step": 9140 }, { "epoch": 0.5323481498720037, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2051, "step": 9150 }, { "epoch": 0.5329299511286945, "grad_norm": 2.6875, "learning_rate": 2e-05, "loss": 0.2016, "step": 9160 }, { "epoch": 0.5335117523853852, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.2005, "step": 9170 }, { "epoch": 0.5340935536420759, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2001, "step": 9180 }, { "epoch": 0.5346753548987666, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.196, "step": 9190 }, { "epoch": 0.5352571561554573, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.1957, "step": 9200 }, { "epoch": 0.535838957412148, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1972, "step": 9210 }, { "epoch": 0.5364207586688388, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1928, "step": 9220 }, { "epoch": 0.5370025599255295, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 0.1996, "step": 9230 }, { "epoch": 0.5375843611822202, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1943, "step": 9240 }, { "epoch": 0.5381661624389109, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.1901, "step": 9250 }, { "epoch": 0.5387479636956016, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.2016, "step": 9260 }, { "epoch": 0.5393297649522923, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1996, "step": 9270 }, { "epoch": 0.5399115662089831, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1972, "step": 9280 }, { "epoch": 0.5404933674656738, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.2063, "step": 9290 }, { "epoch": 0.5410751687223644, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 0.1986, "step": 9300 }, { "epoch": 0.5416569699790551, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 0.2049, "step": 9310 }, { "epoch": 0.5422387712357458, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1968, "step": 9320 }, { "epoch": 0.5428205724924365, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1941, "step": 9330 }, { "epoch": 0.5434023737491273, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2014, "step": 9340 }, { "epoch": 0.543984175005818, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1961, "step": 9350 }, { "epoch": 0.5445659762625087, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.205, "step": 9360 }, { "epoch": 0.5451477775191994, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2042, "step": 9370 }, { "epoch": 0.5457295787758901, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1927, "step": 9380 }, { "epoch": 0.5463113800325808, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.205, "step": 9390 }, { "epoch": 0.5468931812892716, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1973, "step": 9400 }, { "epoch": 0.5474749825459623, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.2069, "step": 9410 }, { "epoch": 0.548056783802653, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1974, "step": 9420 }, { "epoch": 0.5486385850593437, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.194, "step": 9430 }, { "epoch": 0.5492203863160344, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.1969, "step": 9440 }, { "epoch": 0.5498021875727251, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2019, "step": 9450 }, { "epoch": 0.5503839888294159, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1983, "step": 9460 }, { "epoch": 0.5509657900861066, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1945, "step": 9470 }, { "epoch": 0.5515475913427973, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.2043, "step": 9480 }, { "epoch": 0.552129392599488, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.1923, "step": 9490 }, { "epoch": 0.5527111938561787, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.203, "step": 9500 }, { "epoch": 0.5532929951128694, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.2024, "step": 9510 }, { "epoch": 0.5538747963695602, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1938, "step": 9520 }, { "epoch": 0.5544565976262509, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1949, "step": 9530 }, { "epoch": 0.5550383988829416, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1989, "step": 9540 }, { "epoch": 0.5556202001396323, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1998, "step": 9550 }, { "epoch": 0.556202001396323, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2009, "step": 9560 }, { "epoch": 0.5567838026530137, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1976, "step": 9570 }, { "epoch": 0.5573656039097045, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.1979, "step": 9580 }, { "epoch": 0.5579474051663952, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1874, "step": 9590 }, { "epoch": 0.5585292064230859, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.2006, "step": 9600 }, { "epoch": 0.5591110076797766, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.1976, "step": 9610 }, { "epoch": 0.5596928089364673, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1918, "step": 9620 }, { "epoch": 0.560274610193158, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1916, "step": 9630 }, { "epoch": 0.5608564114498488, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1925, "step": 9640 }, { "epoch": 0.5614382127065395, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.2056, "step": 9650 }, { "epoch": 0.5620200139632302, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1989, "step": 9660 }, { "epoch": 0.5626018152199209, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.2061, "step": 9670 }, { "epoch": 0.5631836164766116, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.1993, "step": 9680 }, { "epoch": 0.5637654177333024, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.1957, "step": 9690 }, { "epoch": 0.5643472189899931, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1978, "step": 9700 }, { "epoch": 0.5649290202466837, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1918, "step": 9710 }, { "epoch": 0.5655108215033744, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.2032, "step": 9720 }, { "epoch": 0.5660926227600651, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.1951, "step": 9730 }, { "epoch": 0.5666744240167558, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1994, "step": 9740 }, { "epoch": 0.5672562252734465, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 0.2014, "step": 9750 }, { "epoch": 0.5678380265301373, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1957, "step": 9760 }, { "epoch": 0.568419827786828, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2084, "step": 9770 }, { "epoch": 0.5690016290435187, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1968, "step": 9780 }, { "epoch": 0.5695834303002094, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.196, "step": 9790 }, { "epoch": 0.5701652315569001, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.1956, "step": 9800 }, { "epoch": 0.5707470328135908, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.2006, "step": 9810 }, { "epoch": 0.5713288340702816, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.1954, "step": 9820 }, { "epoch": 0.5719106353269723, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.2012, "step": 9830 }, { "epoch": 0.572492436583663, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1908, "step": 9840 }, { "epoch": 0.5730742378403537, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1983, "step": 9850 }, { "epoch": 0.5736560390970444, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2031, "step": 9860 }, { "epoch": 0.5742378403537352, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1971, "step": 9870 }, { "epoch": 0.5748196416104259, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1965, "step": 9880 }, { "epoch": 0.5754014428671166, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1993, "step": 9890 }, { "epoch": 0.5759832441238073, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.1979, "step": 9900 }, { "epoch": 0.576565045380498, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1966, "step": 9910 }, { "epoch": 0.5771468466371887, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1889, "step": 9920 }, { "epoch": 0.5777286478938795, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1955, "step": 9930 }, { "epoch": 0.5783104491505702, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.1942, "step": 9940 }, { "epoch": 0.5788922504072609, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.2002, "step": 9950 }, { "epoch": 0.5794740516639516, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1961, "step": 9960 }, { "epoch": 0.5800558529206423, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1976, "step": 9970 }, { "epoch": 0.580637654177333, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1981, "step": 9980 }, { "epoch": 0.5812194554340238, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1927, "step": 9990 }, { "epoch": 0.5818012566907145, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.1866, "step": 10000 }, { "epoch": 0.5823830579474052, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1975, "step": 10010 }, { "epoch": 0.5829648592040959, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1944, "step": 10020 }, { "epoch": 0.5835466604607866, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.1929, "step": 10030 }, { "epoch": 0.5841284617174773, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1926, "step": 10040 }, { "epoch": 0.5847102629741681, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1965, "step": 10050 }, { "epoch": 0.5852920642308588, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 0.1919, "step": 10060 }, { "epoch": 0.5858738654875495, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1991, "step": 10070 }, { "epoch": 0.5864556667442402, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1966, "step": 10080 }, { "epoch": 0.5870374680009309, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1921, "step": 10090 }, { "epoch": 0.5876192692576216, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.202, "step": 10100 }, { "epoch": 0.5882010705143124, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1992, "step": 10110 }, { "epoch": 0.5887828717710031, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.1981, "step": 10120 }, { "epoch": 0.5893646730276937, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1926, "step": 10130 }, { "epoch": 0.5899464742843844, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1949, "step": 10140 }, { "epoch": 0.5905282755410751, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1994, "step": 10150 }, { "epoch": 0.5911100767977658, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1982, "step": 10160 }, { "epoch": 0.5916918780544566, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1963, "step": 10170 }, { "epoch": 0.5922736793111473, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1936, "step": 10180 }, { "epoch": 0.592855480567838, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1975, "step": 10190 }, { "epoch": 0.5934372818245287, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.2004, "step": 10200 }, { "epoch": 0.5940190830812194, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.1942, "step": 10210 }, { "epoch": 0.5946008843379101, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1967, "step": 10220 }, { "epoch": 0.5951826855946009, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1941, "step": 10230 }, { "epoch": 0.5957644868512916, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1945, "step": 10240 }, { "epoch": 0.5963462881079823, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1944, "step": 10250 }, { "epoch": 0.596928089364673, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 0.2036, "step": 10260 }, { "epoch": 0.5975098906213637, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1988, "step": 10270 }, { "epoch": 0.5980916918780544, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 0.1969, "step": 10280 }, { "epoch": 0.5986734931347452, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1881, "step": 10290 }, { "epoch": 0.5992552943914359, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.1914, "step": 10300 }, { "epoch": 0.5998370956481266, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.199, "step": 10310 }, { "epoch": 0.6004188969048173, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.2019, "step": 10320 }, { "epoch": 0.601000698161508, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1993, "step": 10330 }, { "epoch": 0.6015824994181987, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.2006, "step": 10340 }, { "epoch": 0.6021643006748895, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1949, "step": 10350 }, { "epoch": 0.6027461019315802, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1885, "step": 10360 }, { "epoch": 0.6033279031882709, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.2039, "step": 10370 }, { "epoch": 0.6039097044449616, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1868, "step": 10380 }, { "epoch": 0.6044915057016523, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2036, "step": 10390 }, { "epoch": 0.605073306958343, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.2035, "step": 10400 }, { "epoch": 0.6056551082150338, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1957, "step": 10410 }, { "epoch": 0.6062369094717245, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.195, "step": 10420 }, { "epoch": 0.6068187107284152, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1988, "step": 10430 }, { "epoch": 0.6074005119851059, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1918, "step": 10440 }, { "epoch": 0.6079823132417966, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1978, "step": 10450 }, { "epoch": 0.6085641144984874, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.1996, "step": 10460 }, { "epoch": 0.6091459157551781, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.2012, "step": 10470 }, { "epoch": 0.6097277170118688, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1911, "step": 10480 }, { "epoch": 0.6103095182685595, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1915, "step": 10490 }, { "epoch": 0.6108913195252502, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1956, "step": 10500 }, { "epoch": 0.6114731207819409, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.1987, "step": 10510 }, { "epoch": 0.6120549220386317, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.2038, "step": 10520 }, { "epoch": 0.6126367232953224, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1991, "step": 10530 }, { "epoch": 0.613218524552013, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1942, "step": 10540 }, { "epoch": 0.6138003258087037, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.1916, "step": 10550 }, { "epoch": 0.6143821270653944, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1926, "step": 10560 }, { "epoch": 0.6149639283220851, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1904, "step": 10570 }, { "epoch": 0.6155457295787758, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.19, "step": 10580 }, { "epoch": 0.6161275308354666, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1928, "step": 10590 }, { "epoch": 0.6167093320921573, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 0.1963, "step": 10600 }, { "epoch": 0.617291133348848, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1951, "step": 10610 }, { "epoch": 0.6178729346055387, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1987, "step": 10620 }, { "epoch": 0.6184547358622294, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1948, "step": 10630 }, { "epoch": 0.6190365371189201, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1929, "step": 10640 }, { "epoch": 0.6196183383756109, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.1983, "step": 10650 }, { "epoch": 0.6202001396323016, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.2026, "step": 10660 }, { "epoch": 0.6207819408889923, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1898, "step": 10670 }, { "epoch": 0.621363742145683, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1921, "step": 10680 }, { "epoch": 0.6219455434023737, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.194, "step": 10690 }, { "epoch": 0.6225273446590645, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1968, "step": 10700 }, { "epoch": 0.6231091459157552, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.196, "step": 10710 }, { "epoch": 0.6236909471724459, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1954, "step": 10720 }, { "epoch": 0.6242727484291366, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.2009, "step": 10730 }, { "epoch": 0.6248545496858273, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1904, "step": 10740 }, { "epoch": 0.625436350942518, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.1912, "step": 10750 }, { "epoch": 0.6260181521992088, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1967, "step": 10760 }, { "epoch": 0.6265999534558995, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2025, "step": 10770 }, { "epoch": 0.6271817547125902, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1928, "step": 10780 }, { "epoch": 0.6277635559692809, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1945, "step": 10790 }, { "epoch": 0.6283453572259716, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1964, "step": 10800 }, { "epoch": 0.6289271584826623, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.194, "step": 10810 }, { "epoch": 0.6295089597393531, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.2006, "step": 10820 }, { "epoch": 0.6300907609960438, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1941, "step": 10830 }, { "epoch": 0.6306725622527345, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1911, "step": 10840 }, { "epoch": 0.6312543635094252, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1847, "step": 10850 }, { "epoch": 0.6318361647661159, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1973, "step": 10860 }, { "epoch": 0.6324179660228066, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.1971, "step": 10870 }, { "epoch": 0.6329997672794974, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.1932, "step": 10880 }, { "epoch": 0.6335815685361881, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1976, "step": 10890 }, { "epoch": 0.6341633697928788, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 0.1946, "step": 10900 }, { "epoch": 0.6347451710495695, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1941, "step": 10910 }, { "epoch": 0.6353269723062602, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1982, "step": 10920 }, { "epoch": 0.635908773562951, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.1945, "step": 10930 }, { "epoch": 0.6364905748196417, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 0.1909, "step": 10940 }, { "epoch": 0.6370723760763323, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1939, "step": 10950 }, { "epoch": 0.637654177333023, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1909, "step": 10960 }, { "epoch": 0.6382359785897137, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1961, "step": 10970 }, { "epoch": 0.6388177798464044, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1965, "step": 10980 }, { "epoch": 0.6393995811030951, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.194, "step": 10990 }, { "epoch": 0.6399813823597859, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1931, "step": 11000 }, { "epoch": 0.6405631836164766, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.1987, "step": 11010 }, { "epoch": 0.6411449848731673, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1927, "step": 11020 }, { "epoch": 0.641726786129858, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.2, "step": 11030 }, { "epoch": 0.6423085873865487, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1961, "step": 11040 }, { "epoch": 0.6428903886432394, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1944, "step": 11050 }, { "epoch": 0.6434721898999302, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 0.2021, "step": 11060 }, { "epoch": 0.6440539911566209, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1967, "step": 11070 }, { "epoch": 0.6446357924133116, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1922, "step": 11080 }, { "epoch": 0.6452175936700023, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1909, "step": 11090 }, { "epoch": 0.645799394926693, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.185, "step": 11100 }, { "epoch": 0.6463811961833837, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1926, "step": 11110 }, { "epoch": 0.6469629974400745, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1938, "step": 11120 }, { "epoch": 0.6475447986967652, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1929, "step": 11130 }, { "epoch": 0.6481265999534559, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.2016, "step": 11140 }, { "epoch": 0.6487084012101466, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 0.1955, "step": 11150 }, { "epoch": 0.6492902024668373, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.2007, "step": 11160 }, { "epoch": 0.649872003723528, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1922, "step": 11170 }, { "epoch": 0.6504538049802188, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.199, "step": 11180 }, { "epoch": 0.6510356062369095, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1949, "step": 11190 }, { "epoch": 0.6516174074936002, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1989, "step": 11200 }, { "epoch": 0.6521992087502909, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.1896, "step": 11210 }, { "epoch": 0.6527810100069816, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1991, "step": 11220 }, { "epoch": 0.6533628112636723, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.2002, "step": 11230 }, { "epoch": 0.6539446125203631, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.1963, "step": 11240 }, { "epoch": 0.6545264137770538, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1899, "step": 11250 }, { "epoch": 0.6551082150337445, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.2011, "step": 11260 }, { "epoch": 0.6556900162904352, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.1949, "step": 11270 }, { "epoch": 0.6562718175471259, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1867, "step": 11280 }, { "epoch": 0.6568536188038167, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.1981, "step": 11290 }, { "epoch": 0.6574354200605074, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 0.1922, "step": 11300 }, { "epoch": 0.6580172213171981, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.2005, "step": 11310 }, { "epoch": 0.6585990225738888, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1973, "step": 11320 }, { "epoch": 0.6591808238305795, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.1991, "step": 11330 }, { "epoch": 0.6597626250872702, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1994, "step": 11340 }, { "epoch": 0.660344426343961, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.1977, "step": 11350 }, { "epoch": 0.6609262276006517, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1952, "step": 11360 }, { "epoch": 0.6615080288573423, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1918, "step": 11370 }, { "epoch": 0.662089830114033, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1874, "step": 11380 }, { "epoch": 0.6626716313707237, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.1966, "step": 11390 }, { "epoch": 0.6632534326274144, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1953, "step": 11400 }, { "epoch": 0.6638352338841051, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.1884, "step": 11410 }, { "epoch": 0.6644170351407959, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1903, "step": 11420 }, { "epoch": 0.6649988363974866, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.1918, "step": 11430 }, { "epoch": 0.6655806376541773, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1956, "step": 11440 }, { "epoch": 0.666162438910868, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1924, "step": 11450 }, { "epoch": 0.6667442401675587, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1874, "step": 11460 }, { "epoch": 0.6673260414242494, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.1948, "step": 11470 }, { "epoch": 0.6679078426809402, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1908, "step": 11480 }, { "epoch": 0.6684896439376309, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1921, "step": 11490 }, { "epoch": 0.6690714451943216, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.1897, "step": 11500 }, { "epoch": 0.6696532464510123, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 0.1964, "step": 11510 }, { "epoch": 0.670235047707703, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.1938, "step": 11520 }, { "epoch": 0.6708168489643938, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1921, "step": 11530 }, { "epoch": 0.6713986502210845, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1881, "step": 11540 }, { "epoch": 0.6719804514777752, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1926, "step": 11550 }, { "epoch": 0.6725622527344659, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1948, "step": 11560 }, { "epoch": 0.6731440539911566, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1885, "step": 11570 }, { "epoch": 0.6737258552478473, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1906, "step": 11580 }, { "epoch": 0.674307656504538, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1991, "step": 11590 }, { "epoch": 0.6748894577612288, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.189, "step": 11600 }, { "epoch": 0.6754712590179195, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1926, "step": 11610 }, { "epoch": 0.6760530602746102, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.2006, "step": 11620 }, { "epoch": 0.6766348615313009, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1909, "step": 11630 }, { "epoch": 0.6772166627879916, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1944, "step": 11640 }, { "epoch": 0.6777984640446824, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1982, "step": 11650 }, { "epoch": 0.6783802653013731, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1961, "step": 11660 }, { "epoch": 0.6789620665580638, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1877, "step": 11670 }, { "epoch": 0.6795438678147545, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1929, "step": 11680 }, { "epoch": 0.6801256690714452, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1915, "step": 11690 }, { "epoch": 0.6807074703281359, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.19, "step": 11700 }, { "epoch": 0.6812892715848267, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1923, "step": 11710 }, { "epoch": 0.6818710728415174, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1969, "step": 11720 }, { "epoch": 0.6824528740982081, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1876, "step": 11730 }, { "epoch": 0.6830346753548988, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1919, "step": 11740 }, { "epoch": 0.6836164766115895, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1939, "step": 11750 }, { "epoch": 0.6841982778682802, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1953, "step": 11760 }, { "epoch": 0.684780079124971, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.196, "step": 11770 }, { "epoch": 0.6853618803816616, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1975, "step": 11780 }, { "epoch": 0.6859436816383523, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1965, "step": 11790 }, { "epoch": 0.686525482895043, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.1993, "step": 11800 }, { "epoch": 0.6871072841517337, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1959, "step": 11810 }, { "epoch": 0.6876890854084244, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1962, "step": 11820 }, { "epoch": 0.6882708866651152, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.1973, "step": 11830 }, { "epoch": 0.6888526879218059, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 0.1908, "step": 11840 }, { "epoch": 0.6894344891784966, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1892, "step": 11850 }, { "epoch": 0.6900162904351873, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.1985, "step": 11860 }, { "epoch": 0.690598091691878, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.1862, "step": 11870 }, { "epoch": 0.6911798929485687, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1974, "step": 11880 }, { "epoch": 0.6917616942052595, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1938, "step": 11890 }, { "epoch": 0.6923434954619502, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.1933, "step": 11900 }, { "epoch": 0.6929252967186409, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.194, "step": 11910 }, { "epoch": 0.6935070979753316, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1906, "step": 11920 }, { "epoch": 0.6940888992320223, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1994, "step": 11930 }, { "epoch": 0.694670700488713, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1944, "step": 11940 }, { "epoch": 0.6952525017454038, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1994, "step": 11950 }, { "epoch": 0.6958343030020945, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1926, "step": 11960 }, { "epoch": 0.6964161042587852, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1899, "step": 11970 }, { "epoch": 0.6969979055154759, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 0.1932, "step": 11980 }, { "epoch": 0.6975797067721666, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.2018, "step": 11990 }, { "epoch": 0.6981615080288573, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1967, "step": 12000 }, { "epoch": 0.6987433092855481, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1952, "step": 12010 }, { "epoch": 0.6993251105422388, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1867, "step": 12020 }, { "epoch": 0.6999069117989295, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.1951, "step": 12030 }, { "epoch": 0.7004887130556202, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.1906, "step": 12040 }, { "epoch": 0.7010705143123109, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1943, "step": 12050 }, { "epoch": 0.7016523155690016, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.1913, "step": 12060 }, { "epoch": 0.7022341168256924, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1924, "step": 12070 }, { "epoch": 0.7028159180823831, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 0.1917, "step": 12080 }, { "epoch": 0.7033977193390738, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1951, "step": 12090 }, { "epoch": 0.7039795205957645, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.192, "step": 12100 }, { "epoch": 0.7045613218524552, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1914, "step": 12110 }, { "epoch": 0.705143123109146, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.189, "step": 12120 }, { "epoch": 0.7057249243658367, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1956, "step": 12130 }, { "epoch": 0.7063067256225274, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1871, "step": 12140 }, { "epoch": 0.7068885268792181, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1923, "step": 12150 }, { "epoch": 0.7074703281359088, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1964, "step": 12160 }, { "epoch": 0.7080521293925995, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1907, "step": 12170 }, { "epoch": 0.7086339306492903, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.189, "step": 12180 }, { "epoch": 0.7092157319059809, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.1992, "step": 12190 }, { "epoch": 0.7097975331626716, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.193, "step": 12200 }, { "epoch": 0.7103793344193623, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.1986, "step": 12210 }, { "epoch": 0.710961135676053, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1948, "step": 12220 }, { "epoch": 0.7115429369327437, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1925, "step": 12230 }, { "epoch": 0.7121247381894344, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1895, "step": 12240 }, { "epoch": 0.7127065394461252, "grad_norm": 3.5625, "learning_rate": 2e-05, "loss": 0.1859, "step": 12250 }, { "epoch": 0.7132883407028159, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.187, "step": 12260 }, { "epoch": 0.7138701419595066, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1889, "step": 12270 }, { "epoch": 0.7144519432161973, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.1868, "step": 12280 }, { "epoch": 0.715033744472888, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.2027, "step": 12290 }, { "epoch": 0.7156155457295788, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.1981, "step": 12300 }, { "epoch": 0.7161973469862695, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.189, "step": 12310 }, { "epoch": 0.7167791482429602, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1937, "step": 12320 }, { "epoch": 0.7173609494996509, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.1842, "step": 12330 }, { "epoch": 0.7179427507563416, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.2, "step": 12340 }, { "epoch": 0.7185245520130323, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1907, "step": 12350 }, { "epoch": 0.719106353269723, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1923, "step": 12360 }, { "epoch": 0.7196881545264138, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1911, "step": 12370 }, { "epoch": 0.7202699557831045, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1887, "step": 12380 }, { "epoch": 0.7208517570397952, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1894, "step": 12390 }, { "epoch": 0.7214335582964859, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1926, "step": 12400 }, { "epoch": 0.7220153595531766, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1942, "step": 12410 }, { "epoch": 0.7225971608098674, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.1935, "step": 12420 }, { "epoch": 0.7231789620665581, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1869, "step": 12430 }, { "epoch": 0.7237607633232488, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.1995, "step": 12440 }, { "epoch": 0.7243425645799395, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1874, "step": 12450 }, { "epoch": 0.7249243658366302, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.191, "step": 12460 }, { "epoch": 0.7255061670933209, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1912, "step": 12470 }, { "epoch": 0.7260879683500117, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1865, "step": 12480 }, { "epoch": 0.7266697696067024, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.192, "step": 12490 }, { "epoch": 0.7272515708633931, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1869, "step": 12500 }, { "epoch": 0.7278333721200838, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1923, "step": 12510 }, { "epoch": 0.7284151733767745, "grad_norm": 1.875, "learning_rate": 2e-05, "loss": 0.1896, "step": 12520 }, { "epoch": 0.7289969746334652, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 0.1864, "step": 12530 }, { "epoch": 0.729578775890156, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1922, "step": 12540 }, { "epoch": 0.7301605771468467, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1926, "step": 12550 }, { "epoch": 0.7307423784035374, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1973, "step": 12560 }, { "epoch": 0.7313241796602281, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.186, "step": 12570 }, { "epoch": 0.7319059809169188, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1958, "step": 12580 }, { "epoch": 0.7324877821736095, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.191, "step": 12590 }, { "epoch": 0.7330695834303003, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 0.1886, "step": 12600 }, { "epoch": 0.7336513846869909, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1925, "step": 12610 }, { "epoch": 0.7342331859436816, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1922, "step": 12620 }, { "epoch": 0.7348149872003723, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1931, "step": 12630 }, { "epoch": 0.735396788457063, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1939, "step": 12640 }, { "epoch": 0.7359785897137537, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1827, "step": 12650 }, { "epoch": 0.7365603909704445, "grad_norm": 2.3125, "learning_rate": 2e-05, "loss": 0.1921, "step": 12660 }, { "epoch": 0.7371421922271352, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1901, "step": 12670 }, { "epoch": 0.7377239934838259, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1859, "step": 12680 }, { "epoch": 0.7383057947405166, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.196, "step": 12690 }, { "epoch": 0.7388875959972073, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1915, "step": 12700 }, { "epoch": 0.739469397253898, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1862, "step": 12710 }, { "epoch": 0.7400511985105888, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.1976, "step": 12720 }, { "epoch": 0.7406329997672795, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1869, "step": 12730 }, { "epoch": 0.7412148010239702, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 0.1931, "step": 12740 }, { "epoch": 0.7417966022806609, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1889, "step": 12750 }, { "epoch": 0.7423784035373516, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1878, "step": 12760 }, { "epoch": 0.7429602047940423, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1846, "step": 12770 }, { "epoch": 0.7435420060507331, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1889, "step": 12780 }, { "epoch": 0.7441238073074238, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1863, "step": 12790 }, { "epoch": 0.7447056085641145, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.198, "step": 12800 }, { "epoch": 0.7452874098208052, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1848, "step": 12810 }, { "epoch": 0.7458692110774959, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1983, "step": 12820 }, { "epoch": 0.7464510123341866, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1902, "step": 12830 }, { "epoch": 0.7470328135908774, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1951, "step": 12840 }, { "epoch": 0.7476146148475681, "grad_norm": 2.796875, "learning_rate": 2e-05, "loss": 0.1877, "step": 12850 }, { "epoch": 0.7481964161042588, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1916, "step": 12860 }, { "epoch": 0.7487782173609495, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1944, "step": 12870 }, { "epoch": 0.7493600186176402, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 0.1874, "step": 12880 }, { "epoch": 0.749941819874331, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 0.1953, "step": 12890 }, { "epoch": 0.7505236211310217, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 0.1946, "step": 12900 }, { "epoch": 0.7511054223877124, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 0.1905, "step": 12910 }, { "epoch": 0.7516872236444031, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1876, "step": 12920 }, { "epoch": 0.7522690249010938, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1909, "step": 12930 }, { "epoch": 0.7528508261577845, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.1899, "step": 12940 }, { "epoch": 0.7534326274144753, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1895, "step": 12950 }, { "epoch": 0.754014428671166, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1822, "step": 12960 }, { "epoch": 0.7545962299278567, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1909, "step": 12970 }, { "epoch": 0.7551780311845474, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 0.1845, "step": 12980 }, { "epoch": 0.7557598324412381, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1923, "step": 12990 }, { "epoch": 0.7563416336979288, "grad_norm": 1.953125, "learning_rate": 2e-05, "loss": 0.1929, "step": 13000 }, { "epoch": 0.7569234349546196, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1915, "step": 13010 }, { "epoch": 0.7575052362113102, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1893, "step": 13020 }, { "epoch": 0.7580870374680009, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.2014, "step": 13030 }, { "epoch": 0.7586688387246916, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.1985, "step": 13040 }, { "epoch": 0.7592506399813823, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1919, "step": 13050 }, { "epoch": 0.759832441238073, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1892, "step": 13060 }, { "epoch": 0.7604142424947637, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1849, "step": 13070 }, { "epoch": 0.7609960437514545, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.1904, "step": 13080 }, { "epoch": 0.7615778450081452, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1896, "step": 13090 }, { "epoch": 0.7621596462648359, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 0.1891, "step": 13100 }, { "epoch": 0.7627414475215266, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1914, "step": 13110 }, { "epoch": 0.7633232487782173, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.188, "step": 13120 }, { "epoch": 0.763905050034908, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.2015, "step": 13130 }, { "epoch": 0.7644868512915988, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.1905, "step": 13140 }, { "epoch": 0.7650686525482895, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1894, "step": 13150 }, { "epoch": 0.7656504538049802, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.1938, "step": 13160 }, { "epoch": 0.7662322550616709, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1898, "step": 13170 }, { "epoch": 0.7668140563183616, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1891, "step": 13180 }, { "epoch": 0.7673958575750524, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1957, "step": 13190 }, { "epoch": 0.7679776588317431, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1943, "step": 13200 }, { "epoch": 0.7685594600884338, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 0.1898, "step": 13210 }, { "epoch": 0.7691412613451245, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1906, "step": 13220 }, { "epoch": 0.7697230626018152, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1904, "step": 13230 }, { "epoch": 0.7703048638585059, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 0.1967, "step": 13240 }, { "epoch": 0.7708866651151967, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.1881, "step": 13250 }, { "epoch": 0.7714684663718874, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.193, "step": 13260 }, { "epoch": 0.7720502676285781, "grad_norm": 4.96875, "learning_rate": 2e-05, "loss": 0.1843, "step": 13270 }, { "epoch": 0.7726320688852688, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1997, "step": 13280 }, { "epoch": 0.7732138701419595, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1926, "step": 13290 }, { "epoch": 0.7737956713986502, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 0.1971, "step": 13300 }, { "epoch": 0.774377472655341, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1891, "step": 13310 }, { "epoch": 0.7749592739120317, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.19, "step": 13320 }, { "epoch": 0.7755410751687224, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 0.1967, "step": 13330 }, { "epoch": 0.7761228764254131, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1902, "step": 13340 }, { "epoch": 0.7767046776821038, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1891, "step": 13350 }, { "epoch": 0.7772864789387945, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 0.1926, "step": 13360 }, { "epoch": 0.7778682801954853, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1928, "step": 13370 }, { "epoch": 0.778450081452176, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1904, "step": 13380 }, { "epoch": 0.7790318827088667, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1981, "step": 13390 }, { "epoch": 0.7796136839655574, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1882, "step": 13400 }, { "epoch": 0.7801954852222481, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1871, "step": 13410 }, { "epoch": 0.7807772864789388, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1967, "step": 13420 }, { "epoch": 0.7813590877356295, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1957, "step": 13430 }, { "epoch": 0.7819408889923202, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1907, "step": 13440 }, { "epoch": 0.7825226902490109, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1937, "step": 13450 }, { "epoch": 0.7831044915057016, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1971, "step": 13460 }, { "epoch": 0.7836862927623923, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1944, "step": 13470 }, { "epoch": 0.784268094019083, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1879, "step": 13480 }, { "epoch": 0.7848498952757738, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1884, "step": 13490 }, { "epoch": 0.7854316965324645, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1849, "step": 13500 }, { "epoch": 0.7860134977891552, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1871, "step": 13510 }, { "epoch": 0.7865952990458459, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1954, "step": 13520 }, { "epoch": 0.7871771003025366, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1938, "step": 13530 }, { "epoch": 0.7877589015592273, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 0.1921, "step": 13540 }, { "epoch": 0.7883407028159181, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1841, "step": 13550 }, { "epoch": 0.7889225040726088, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1892, "step": 13560 }, { "epoch": 0.7895043053292995, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 0.1865, "step": 13570 }, { "epoch": 0.7900861065859902, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1921, "step": 13580 }, { "epoch": 0.7906679078426809, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1855, "step": 13590 }, { "epoch": 0.7912497090993716, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1925, "step": 13600 }, { "epoch": 0.7918315103560624, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.193, "step": 13610 }, { "epoch": 0.7924133116127531, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1905, "step": 13620 }, { "epoch": 0.7929951128694438, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 0.1866, "step": 13630 }, { "epoch": 0.7935769141261345, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1917, "step": 13640 }, { "epoch": 0.7941587153828252, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 0.1843, "step": 13650 }, { "epoch": 0.794740516639516, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.1933, "step": 13660 }, { "epoch": 0.7953223178962067, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1946, "step": 13670 }, { "epoch": 0.7959041191528974, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.197, "step": 13680 }, { "epoch": 0.7964859204095881, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1929, "step": 13690 }, { "epoch": 0.7970677216662788, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1894, "step": 13700 }, { "epoch": 0.7976495229229695, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.193, "step": 13710 }, { "epoch": 0.7982313241796603, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 0.1891, "step": 13720 }, { "epoch": 0.798813125436351, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1817, "step": 13730 }, { "epoch": 0.7993949266930417, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1896, "step": 13740 }, { "epoch": 0.7999767279497324, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 0.1872, "step": 13750 }, { "epoch": 0.8005585292064231, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 0.1987, "step": 13760 }, { "epoch": 0.8011403304631138, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.1835, "step": 13770 }, { "epoch": 0.8017221317198046, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1898, "step": 13780 }, { "epoch": 0.8023039329764953, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.193, "step": 13790 }, { "epoch": 0.802885734233186, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1867, "step": 13800 }, { "epoch": 0.8034675354898767, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 0.1959, "step": 13810 }, { "epoch": 0.8040493367465674, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1922, "step": 13820 }, { "epoch": 0.8046311380032581, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 0.1799, "step": 13830 }, { "epoch": 0.8052129392599489, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.196, "step": 13840 }, { "epoch": 0.8057947405166395, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1941, "step": 13850 }, { "epoch": 0.8063765417733302, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1913, "step": 13860 }, { "epoch": 0.8069583430300209, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1911, "step": 13870 }, { "epoch": 0.8075401442867116, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1894, "step": 13880 }, { "epoch": 0.8081219455434023, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1824, "step": 13890 }, { "epoch": 0.808703746800093, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1895, "step": 13900 }, { "epoch": 0.8092855480567838, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1869, "step": 13910 }, { "epoch": 0.8098673493134745, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1888, "step": 13920 }, { "epoch": 0.8104491505701652, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1905, "step": 13930 }, { "epoch": 0.8110309518268559, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1926, "step": 13940 }, { "epoch": 0.8116127530835466, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1901, "step": 13950 }, { "epoch": 0.8121945543402374, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 0.1904, "step": 13960 }, { "epoch": 0.8127763555969281, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1936, "step": 13970 }, { "epoch": 0.8133581568536188, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 0.1947, "step": 13980 }, { "epoch": 0.8139399581103095, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1926, "step": 13990 }, { "epoch": 0.8145217593670002, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.1941, "step": 14000 }, { "epoch": 0.8151035606236909, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1898, "step": 14010 }, { "epoch": 0.8156853618803817, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 0.1926, "step": 14020 }, { "epoch": 0.8162671631370724, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1872, "step": 14030 }, { "epoch": 0.8168489643937631, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 0.1846, "step": 14040 }, { "epoch": 0.8174307656504538, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 0.1928, "step": 14050 }, { "epoch": 0.8180125669071445, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1904, "step": 14060 }, { "epoch": 0.8185943681638352, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1855, "step": 14070 }, { "epoch": 0.819176169420526, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1879, "step": 14080 }, { "epoch": 0.8197579706772167, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1915, "step": 14090 }, { "epoch": 0.8203397719339074, "grad_norm": 1.8671875, "learning_rate": 2e-05, "loss": 0.1897, "step": 14100 }, { "epoch": 0.8209215731905981, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1784, "step": 14110 }, { "epoch": 0.8215033744472888, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1934, "step": 14120 }, { "epoch": 0.8220851757039795, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 0.1926, "step": 14130 }, { "epoch": 0.8226669769606703, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 0.1863, "step": 14140 }, { "epoch": 0.823248778217361, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1911, "step": 14150 }, { "epoch": 0.8238305794740517, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 0.1875, "step": 14160 }, { "epoch": 0.8244123807307424, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1837, "step": 14170 }, { "epoch": 0.8249941819874331, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1912, "step": 14180 }, { "epoch": 0.8255759832441238, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 0.187, "step": 14190 }, { "epoch": 0.8261577845008146, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1863, "step": 14200 }, { "epoch": 0.8267395857575053, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.2013, "step": 14210 }, { "epoch": 0.827321387014196, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1905, "step": 14220 }, { "epoch": 0.8279031882708867, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1896, "step": 14230 }, { "epoch": 0.8284849895275774, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 0.1895, "step": 14240 }, { "epoch": 0.8290667907842681, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1905, "step": 14250 }, { "epoch": 0.8296485920409588, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 0.1919, "step": 14260 }, { "epoch": 0.8302303932976495, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 0.1869, "step": 14270 }, { "epoch": 0.8308121945543402, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1921, "step": 14280 }, { "epoch": 0.8313939958110309, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 0.1859, "step": 14290 }, { "epoch": 0.8319757970677216, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1941, "step": 14300 }, { "epoch": 0.8325575983244123, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1849, "step": 14310 }, { "epoch": 0.8331393995811031, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1966, "step": 14320 }, { "epoch": 0.8337212008377938, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.1867, "step": 14330 }, { "epoch": 0.8343030020944845, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1891, "step": 14340 }, { "epoch": 0.8348848033511752, "grad_norm": 3.15625, "learning_rate": 2e-05, "loss": 0.1881, "step": 14350 }, { "epoch": 0.8354666046078659, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1873, "step": 14360 }, { "epoch": 0.8360484058645566, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 0.1908, "step": 14370 }, { "epoch": 0.8366302071212474, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.194, "step": 14380 }, { "epoch": 0.8372120083779381, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 0.1854, "step": 14390 }, { "epoch": 0.8377938096346288, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1855, "step": 14400 }, { "epoch": 0.8383756108913195, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1926, "step": 14410 }, { "epoch": 0.8389574121480102, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.1874, "step": 14420 }, { "epoch": 0.839539213404701, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 0.1883, "step": 14430 }, { "epoch": 0.8401210146613917, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1828, "step": 14440 }, { "epoch": 0.8407028159180824, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 0.1864, "step": 14450 }, { "epoch": 0.8412846171747731, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1918, "step": 14460 }, { "epoch": 0.8418664184314638, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1891, "step": 14470 }, { "epoch": 0.8424482196881545, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 0.1836, "step": 14480 }, { "epoch": 0.8430300209448452, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1869, "step": 14490 }, { "epoch": 0.843611822201536, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1906, "step": 14500 }, { "epoch": 0.8441936234582267, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1908, "step": 14510 }, { "epoch": 0.8447754247149174, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1932, "step": 14520 }, { "epoch": 0.8453572259716081, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1902, "step": 14530 }, { "epoch": 0.8459390272282988, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 0.1856, "step": 14540 }, { "epoch": 0.8465208284849896, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1905, "step": 14550 }, { "epoch": 0.8471026297416803, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1897, "step": 14560 }, { "epoch": 0.847684430998371, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 0.1955, "step": 14570 }, { "epoch": 0.8482662322550617, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1886, "step": 14580 }, { "epoch": 0.8488480335117524, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.186, "step": 14590 }, { "epoch": 0.8494298347684431, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 0.1864, "step": 14600 }, { "epoch": 0.8500116360251339, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1841, "step": 14610 }, { "epoch": 0.8505934372818246, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1906, "step": 14620 }, { "epoch": 0.8511752385385153, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.19, "step": 14630 }, { "epoch": 0.851757039795206, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1846, "step": 14640 }, { "epoch": 0.8523388410518967, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1845, "step": 14650 }, { "epoch": 0.8529206423085874, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1926, "step": 14660 }, { "epoch": 0.853502443565278, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 0.1855, "step": 14670 }, { "epoch": 0.8540842448219688, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 0.187, "step": 14680 }, { "epoch": 0.8546660460786595, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 0.1936, "step": 14690 }, { "epoch": 0.8552478473353502, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1933, "step": 14700 }, { "epoch": 0.8558296485920409, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 0.1855, "step": 14710 }, { "epoch": 0.8564114498487316, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 0.1878, "step": 14720 }, { "epoch": 0.8569932511054223, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1958, "step": 14730 }, { "epoch": 0.8575750523621131, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1857, "step": 14740 }, { "epoch": 0.8581568536188038, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 0.1895, "step": 14750 }, { "epoch": 0.8587386548754945, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 0.193, "step": 14760 }, { "epoch": 0.8593204561321852, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1879, "step": 14770 }, { "epoch": 0.8599022573888759, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 0.188, "step": 14780 }, { "epoch": 0.8604840586455667, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1912, "step": 14790 }, { "epoch": 0.8610658599022574, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1904, "step": 14800 }, { "epoch": 0.8616476611589481, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 0.1896, "step": 14810 }, { "epoch": 0.8622294624156388, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1844, "step": 14820 }, { "epoch": 0.8628112636723295, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 0.1924, "step": 14830 }, { "epoch": 0.8633930649290202, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 0.1774, "step": 14840 }, { "epoch": 0.863974866185711, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1862, "step": 14850 }, { "epoch": 0.8645566674424017, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1914, "step": 14860 }, { "epoch": 0.8651384686990924, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1817, "step": 14870 }, { "epoch": 0.8657202699557831, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.1876, "step": 14880 }, { "epoch": 0.8663020712124738, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1912, "step": 14890 }, { "epoch": 0.8668838724691645, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 0.1924, "step": 14900 }, { "epoch": 0.8674656737258553, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 0.1894, "step": 14910 }, { "epoch": 0.868047474982546, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1847, "step": 14920 }, { "epoch": 0.8686292762392367, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1827, "step": 14930 }, { "epoch": 0.8692110774959274, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1867, "step": 14940 }, { "epoch": 0.8697928787526181, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1858, "step": 14950 }, { "epoch": 0.8703746800093088, "grad_norm": 1.6953125, "learning_rate": 2e-05, "loss": 0.1802, "step": 14960 }, { "epoch": 0.8709564812659996, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 0.181, "step": 14970 }, { "epoch": 0.8715382825226903, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1909, "step": 14980 }, { "epoch": 0.872120083779381, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1865, "step": 14990 }, { "epoch": 0.8727018850360717, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1878, "step": 15000 }, { "epoch": 0.8732836862927624, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 0.1841, "step": 15010 }, { "epoch": 0.8738654875494531, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1893, "step": 15020 }, { "epoch": 0.8744472888061439, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 0.1857, "step": 15030 }, { "epoch": 0.8750290900628346, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 0.189, "step": 15040 }, { "epoch": 0.8756108913195253, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 0.1863, "step": 15050 }, { "epoch": 0.876192692576216, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1923, "step": 15060 }, { "epoch": 0.8767744938329067, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1885, "step": 15070 }, { "epoch": 0.8773562950895974, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1851, "step": 15080 }, { "epoch": 0.877938096346288, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 0.1876, "step": 15090 }, { "epoch": 0.8785198976029788, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 0.1883, "step": 15100 }, { "epoch": 0.8791016988596695, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 0.185, "step": 15110 }, { "epoch": 0.8796835001163602, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1854, "step": 15120 }, { "epoch": 0.8802653013730509, "grad_norm": 1.796875, "learning_rate": 2e-05, "loss": 0.1848, "step": 15130 }, { "epoch": 0.8808471026297416, "grad_norm": 2.328125, "learning_rate": 2e-05, "loss": 0.1893, "step": 15140 }, { "epoch": 0.8814289038864324, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.1881, "step": 15150 }, { "epoch": 0.8820107051431231, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 0.1857, "step": 15160 }, { "epoch": 0.8825925063998138, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 0.191, "step": 15170 }, { "epoch": 0.8831743076565045, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1885, "step": 15180 }, { "epoch": 0.8837561089131952, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1863, "step": 15190 }, { "epoch": 0.8843379101698859, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 0.196, "step": 15200 }, { "epoch": 0.8849197114265767, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 0.1847, "step": 15210 }, { "epoch": 0.8855015126832674, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 0.1823, "step": 15220 }, { "epoch": 0.8860833139399581, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 0.1871, "step": 15230 }, { "epoch": 0.8866651151966488, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 0.1875, "step": 15240 }, { "epoch": 0.8872469164533395, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.1899, "step": 15250 }, { "epoch": 0.8878287177100302, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 0.1801, "step": 15260 }, { "epoch": 0.888410518966721, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1883, "step": 15270 }, { "epoch": 0.8889923202234117, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 0.1833, "step": 15280 }, { "epoch": 0.8895741214801024, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 0.193, "step": 15290 }, { "epoch": 0.8901559227367931, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 0.177, "step": 15300 }, { "epoch": 0.8907377239934838, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1885, "step": 15310 }, { "epoch": 0.8913195252501745, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 0.1882, "step": 15320 }, { "epoch": 0.8919013265068653, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.1845, "step": 15330 }, { "epoch": 0.892483127763556, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1844, "step": 15340 }, { "epoch": 0.8930649290202467, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 0.1823, "step": 15350 }, { "epoch": 0.8936467302769374, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1835, "step": 15360 }, { "epoch": 0.8942285315336281, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 0.1876, "step": 15370 }, { "epoch": 0.8948103327903189, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.1857, "step": 15380 }, { "epoch": 0.8953921340470096, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1879, "step": 15390 }, { "epoch": 0.8959739353037003, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 0.1862, "step": 15400 }, { "epoch": 0.896555736560391, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.185, "step": 15410 }, { "epoch": 0.8971375378170817, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1875, "step": 15420 }, { "epoch": 0.8977193390737724, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 0.1819, "step": 15430 }, { "epoch": 0.8983011403304632, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 0.1877, "step": 15440 }, { "epoch": 0.8988829415871539, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 0.187, "step": 15450 }, { "epoch": 0.8994647428438446, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 0.1893, "step": 15460 }, { "epoch": 0.9000465441005353, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 0.192, "step": 15470 }, { "epoch": 0.900628345357226, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 0.1831, "step": 15480 }, { "epoch": 0.9012101466139167, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 0.1891, "step": 15490 }, { "epoch": 0.9017919478706073, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 0.1916, "step": 15500 }, { "epoch": 0.9023737491272981, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 0.1886, "step": 15510 }, { "epoch": 0.9029555503839888, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 0.1851, "step": 15520 }, { "epoch": 0.9035373516406795, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 0.1871, "step": 15530 }, { "epoch": 0.9041191528973702, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 0.1876, "step": 15540 }, { "epoch": 0.9047009541540609, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 0.1896, "step": 15550 }, { "epoch": 0.9052827554107516, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.185, "step": 15560 }, { "epoch": 0.9058645566674424, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 0.1844, "step": 15570 }, { "epoch": 0.9064463579241331, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 0.1853, "step": 15580 }, { "epoch": 0.9070281591808238, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 0.1806, "step": 15590 }, { "epoch": 0.9076099604375145, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 0.1859, "step": 15600 }, { "epoch": 0.9081917616942052, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 0.1796, "step": 15610 }, { "epoch": 0.908773562950896, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 0.1854, "step": 15620 } ], "logging_steps": 10, "max_steps": 15625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4853137926124995e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }