diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10968 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9090644635792413, + "eval_steps": 500, + "global_step": 15625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.432150460863581e-05, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.9123, + "step": 10 + }, + { + "epoch": 0.00012864300921727162, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 0.5268, + "step": 20 + }, + { + "epoch": 0.00019296451382590742, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.5193, + "step": 30 + }, + { + "epoch": 0.00025728601843454324, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.5167, + "step": 40 + }, + { + "epoch": 0.00032160752304317904, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.5129, + "step": 50 + }, + { + "epoch": 0.00038592902765181484, + "grad_norm": 2.78125, + "learning_rate": 2e-05, + "loss": 0.5131, + "step": 60 + }, + { + "epoch": 0.00045025053226045063, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.5065, + "step": 70 + }, + { + "epoch": 0.0005145720368690865, + "grad_norm": 2.421875, + "learning_rate": 2e-05, + "loss": 0.5123, + "step": 80 + }, + { + "epoch": 0.0005788935414777223, + "grad_norm": 2.703125, + "learning_rate": 2e-05, + "loss": 0.4953, + "step": 90 + }, + { + "epoch": 0.0006432150460863581, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.4935, + "step": 100 + }, + { + "epoch": 0.0007075365506949939, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 0.487, + "step": 110 + }, + { + "epoch": 0.0007718580553036297, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.4784, + "step": 120 + }, + { + "epoch": 0.0008361795599122655, + "grad_norm": 2.3125, + "learning_rate": 2e-05, + "loss": 0.4745, + "step": 130 + }, + { + "epoch": 0.0009005010645209013, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 0.4811, + "step": 140 + }, + { + "epoch": 0.0009648225691295371, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.4681, + "step": 150 + }, + { + "epoch": 0.001029144073738173, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.4657, + "step": 160 + }, + { + "epoch": 0.0010934655783468087, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 0.4622, + "step": 170 + }, + { + "epoch": 0.0011577870829554446, + "grad_norm": 2.671875, + "learning_rate": 2e-05, + "loss": 0.4551, + "step": 180 + }, + { + "epoch": 0.0012221085875640803, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 0.4548, + "step": 190 + }, + { + "epoch": 0.0012864300921727162, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 0.443, + "step": 200 + }, + { + "epoch": 0.0013507515967813518, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 0.4357, + "step": 210 + }, + { + "epoch": 0.0014150731013899878, + "grad_norm": 3.296875, + "learning_rate": 2e-05, + "loss": 0.4224, + "step": 220 + }, + { + "epoch": 0.0014793946059986234, + "grad_norm": 2.859375, + "learning_rate": 2e-05, + "loss": 0.4226, + "step": 230 + }, + { + "epoch": 0.0015437161106072593, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.4331, + "step": 240 + }, + { + "epoch": 0.001608037615215895, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 0.4166, + "step": 250 + }, + { + "epoch": 0.001672359119824531, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 0.4132, + "step": 260 + }, + { + "epoch": 0.0017366806244331668, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.4078, + "step": 270 + }, + { + "epoch": 0.0018010021290418025, + "grad_norm": 3.28125, + "learning_rate": 2e-05, + "loss": 0.4001, + "step": 280 + }, + { + "epoch": 0.0018653236336504384, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 0.3942, + "step": 290 + }, + { + "epoch": 0.0019296451382590741, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 0.3827, + "step": 300 + }, + { + "epoch": 0.00199396664286771, + "grad_norm": 3.109375, + "learning_rate": 2e-05, + "loss": 0.3871, + "step": 310 + }, + { + "epoch": 0.002058288147476346, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.3694, + "step": 320 + }, + { + "epoch": 0.0021226096520849816, + "grad_norm": 2.890625, + "learning_rate": 2e-05, + "loss": 0.377, + "step": 330 + }, + { + "epoch": 0.0021869311566936173, + "grad_norm": 2.65625, + "learning_rate": 2e-05, + "loss": 0.3603, + "step": 340 + }, + { + "epoch": 0.002251252661302253, + "grad_norm": 3.421875, + "learning_rate": 2e-05, + "loss": 0.3604, + "step": 350 + }, + { + "epoch": 0.002315574165910889, + "grad_norm": 5.3125, + "learning_rate": 2e-05, + "loss": 0.3606, + "step": 360 + }, + { + "epoch": 0.002379895670519525, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.3552, + "step": 370 + }, + { + "epoch": 0.0024442171751281605, + "grad_norm": 4.09375, + "learning_rate": 2e-05, + "loss": 0.3466, + "step": 380 + }, + { + "epoch": 0.002508538679736796, + "grad_norm": 3.21875, + "learning_rate": 2e-05, + "loss": 0.3506, + "step": 390 + }, + { + "epoch": 0.0025728601843454323, + "grad_norm": 3.6875, + "learning_rate": 2e-05, + "loss": 0.3492, + "step": 400 + }, + { + "epoch": 0.002637181688954068, + "grad_norm": 3.65625, + "learning_rate": 2e-05, + "loss": 0.3467, + "step": 410 + }, + { + "epoch": 0.0027015031935627037, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 0.3269, + "step": 420 + }, + { + "epoch": 0.00276582469817134, + "grad_norm": 2.90625, + "learning_rate": 2e-05, + "loss": 0.3392, + "step": 430 + }, + { + "epoch": 0.0028301462027799755, + "grad_norm": 3.328125, + "learning_rate": 2e-05, + "loss": 0.3333, + "step": 440 + }, + { + "epoch": 0.002894467707388611, + "grad_norm": 3.546875, + "learning_rate": 2e-05, + "loss": 0.3236, + "step": 450 + }, + { + "epoch": 0.002958789211997247, + "grad_norm": 3.765625, + "learning_rate": 2e-05, + "loss": 0.3229, + "step": 460 + }, + { + "epoch": 0.003023110716605883, + "grad_norm": 3.3125, + "learning_rate": 2e-05, + "loss": 0.321, + "step": 470 + }, + { + "epoch": 0.0030874322212145187, + "grad_norm": 2.921875, + "learning_rate": 2e-05, + "loss": 0.3187, + "step": 480 + }, + { + "epoch": 0.0031517537258231544, + "grad_norm": 2.59375, + "learning_rate": 2e-05, + "loss": 0.3205, + "step": 490 + }, + { + "epoch": 0.00321607523043179, + "grad_norm": 4.09375, + "learning_rate": 2e-05, + "loss": 0.3166, + "step": 500 + }, + { + "epoch": 0.003280396735040426, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.3093, + "step": 510 + }, + { + "epoch": 0.003344718239649062, + "grad_norm": 3.03125, + "learning_rate": 2e-05, + "loss": 0.3118, + "step": 520 + }, + { + "epoch": 0.0034090397442576976, + "grad_norm": 3.125, + "learning_rate": 2e-05, + "loss": 0.3092, + "step": 530 + }, + { + "epoch": 0.0034733612488663337, + "grad_norm": 2.96875, + "learning_rate": 2e-05, + "loss": 0.3123, + "step": 540 + }, + { + "epoch": 0.0035376827534749694, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.3019, + "step": 550 + }, + { + "epoch": 0.003602004258083605, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 0.301, + "step": 560 + }, + { + "epoch": 0.0036663257626922408, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 0.3011, + "step": 570 + }, + { + "epoch": 0.003730647267300877, + "grad_norm": 3.21875, + "learning_rate": 2e-05, + "loss": 0.2949, + "step": 580 + }, + { + "epoch": 0.0037949687719095126, + "grad_norm": 2.96875, + "learning_rate": 2e-05, + "loss": 0.3023, + "step": 590 + }, + { + "epoch": 0.0038592902765181483, + "grad_norm": 3.109375, + "learning_rate": 2e-05, + "loss": 0.3025, + "step": 600 + }, + { + "epoch": 0.003923611781126784, + "grad_norm": 3.140625, + "learning_rate": 2e-05, + "loss": 0.297, + "step": 610 + }, + { + "epoch": 0.00398793328573542, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 0.2962, + "step": 620 + }, + { + "epoch": 0.004052254790344056, + "grad_norm": 3.171875, + "learning_rate": 2e-05, + "loss": 0.2929, + "step": 630 + }, + { + "epoch": 0.004116576294952692, + "grad_norm": 3.03125, + "learning_rate": 2e-05, + "loss": 0.3011, + "step": 640 + }, + { + "epoch": 0.004180897799561327, + "grad_norm": 3.609375, + "learning_rate": 2e-05, + "loss": 0.2968, + "step": 650 + }, + { + "epoch": 0.004245219304169963, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 0.2969, + "step": 660 + }, + { + "epoch": 0.004309540808778599, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2965, + "step": 670 + }, + { + "epoch": 0.004373862313387235, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 0.2882, + "step": 680 + }, + { + "epoch": 0.004438183817995871, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2901, + "step": 690 + }, + { + "epoch": 0.004502505322604506, + "grad_norm": 2.59375, + "learning_rate": 2e-05, + "loss": 0.2945, + "step": 700 + }, + { + "epoch": 0.004566826827213142, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 0.2879, + "step": 710 + }, + { + "epoch": 0.004631148331821778, + "grad_norm": 3.5, + "learning_rate": 2e-05, + "loss": 0.2824, + "step": 720 + }, + { + "epoch": 0.0046954698364304135, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2934, + "step": 730 + }, + { + "epoch": 0.00475979134103905, + "grad_norm": 3.671875, + "learning_rate": 2e-05, + "loss": 0.2977, + "step": 740 + }, + { + "epoch": 0.004824112845647686, + "grad_norm": 2.671875, + "learning_rate": 2e-05, + "loss": 0.2924, + "step": 750 + }, + { + "epoch": 0.004888434350256321, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.287, + "step": 760 + }, + { + "epoch": 0.004952755854864957, + "grad_norm": 2.9375, + "learning_rate": 2e-05, + "loss": 0.2881, + "step": 770 + }, + { + "epoch": 0.005017077359473592, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 0.289, + "step": 780 + }, + { + "epoch": 0.0050813988640822285, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 0.2873, + "step": 790 + }, + { + "epoch": 0.005145720368690865, + "grad_norm": 3.484375, + "learning_rate": 2e-05, + "loss": 0.2805, + "step": 800 + }, + { + "epoch": 0.0052100418732995, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.2847, + "step": 810 + }, + { + "epoch": 0.005274363377908136, + "grad_norm": 2.9375, + "learning_rate": 2e-05, + "loss": 0.2881, + "step": 820 + }, + { + "epoch": 0.005338684882516772, + "grad_norm": 2.296875, + "learning_rate": 2e-05, + "loss": 0.2876, + "step": 830 + }, + { + "epoch": 0.005403006387125407, + "grad_norm": 2.828125, + "learning_rate": 2e-05, + "loss": 0.2773, + "step": 840 + }, + { + "epoch": 0.0054673278917340435, + "grad_norm": 2.9375, + "learning_rate": 2e-05, + "loss": 0.2873, + "step": 850 + }, + { + "epoch": 0.00553164939634268, + "grad_norm": 2.625, + "learning_rate": 2e-05, + "loss": 0.284, + "step": 860 + }, + { + "epoch": 0.005595970900951315, + "grad_norm": 3.109375, + "learning_rate": 2e-05, + "loss": 0.2856, + "step": 870 + }, + { + "epoch": 0.005660292405559951, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.2769, + "step": 880 + }, + { + "epoch": 0.005724613910168586, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 0.2811, + "step": 890 + }, + { + "epoch": 0.005788935414777222, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2757, + "step": 900 + }, + { + "epoch": 0.0058532569193858585, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 0.2798, + "step": 910 + }, + { + "epoch": 0.005917578423994494, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 0.2801, + "step": 920 + }, + { + "epoch": 0.00598189992860313, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.285, + "step": 930 + }, + { + "epoch": 0.006046221433211766, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.2713, + "step": 940 + }, + { + "epoch": 0.006110542937820401, + "grad_norm": 2.3125, + "learning_rate": 2e-05, + "loss": 0.2736, + "step": 950 + }, + { + "epoch": 0.006174864442429037, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.279, + "step": 960 + }, + { + "epoch": 0.0062391859470376735, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.2795, + "step": 970 + }, + { + "epoch": 0.006303507451646309, + "grad_norm": 3.015625, + "learning_rate": 2e-05, + "loss": 0.2772, + "step": 980 + }, + { + "epoch": 0.006367828956254945, + "grad_norm": 2.546875, + "learning_rate": 2e-05, + "loss": 0.2763, + "step": 990 + }, + { + "epoch": 0.00643215046086358, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2732, + "step": 1000 + }, + { + "epoch": 0.006496471965472216, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.2658, + "step": 1010 + }, + { + "epoch": 0.006560793470080852, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2762, + "step": 1020 + }, + { + "epoch": 0.006625114974689488, + "grad_norm": 2.5, + "learning_rate": 2e-05, + "loss": 0.2647, + "step": 1030 + }, + { + "epoch": 0.006689436479298124, + "grad_norm": 2.609375, + "learning_rate": 2e-05, + "loss": 0.2738, + "step": 1040 + }, + { + "epoch": 0.00675375798390676, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2768, + "step": 1050 + }, + { + "epoch": 0.006818079488515395, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.2667, + "step": 1060 + }, + { + "epoch": 0.006882400993124031, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2734, + "step": 1070 + }, + { + "epoch": 0.006946722497732667, + "grad_norm": 2.3125, + "learning_rate": 2e-05, + "loss": 0.27, + "step": 1080 + }, + { + "epoch": 0.007011044002341303, + "grad_norm": 3.09375, + "learning_rate": 2e-05, + "loss": 0.2683, + "step": 1090 + }, + { + "epoch": 0.007075365506949939, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2683, + "step": 1100 + }, + { + "epoch": 0.007139687011558574, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 0.2715, + "step": 1110 + }, + { + "epoch": 0.00720400851616721, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2756, + "step": 1120 + }, + { + "epoch": 0.007268330020775846, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.2689, + "step": 1130 + }, + { + "epoch": 0.0073326515253844815, + "grad_norm": 2.1875, + "learning_rate": 2e-05, + "loss": 0.2684, + "step": 1140 + }, + { + "epoch": 0.007396973029993118, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2662, + "step": 1150 + }, + { + "epoch": 0.007461294534601754, + "grad_norm": 2.609375, + "learning_rate": 2e-05, + "loss": 0.2605, + "step": 1160 + }, + { + "epoch": 0.007525616039210389, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.2664, + "step": 1170 + }, + { + "epoch": 0.007589937543819025, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.007654259048427661, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2703, + "step": 1190 + }, + { + "epoch": 0.0077185805530362965, + "grad_norm": 2.921875, + "learning_rate": 2e-05, + "loss": 0.2714, + "step": 1200 + }, + { + "epoch": 0.007782902057644933, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 0.2651, + "step": 1210 + }, + { + "epoch": 0.007847223562253569, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2624, + "step": 1220 + }, + { + "epoch": 0.007911545066862204, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2681, + "step": 1230 + }, + { + "epoch": 0.00797586657147084, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2654, + "step": 1240 + }, + { + "epoch": 0.008040188076079476, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.2575, + "step": 1250 + }, + { + "epoch": 0.008104509580688112, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 0.255, + "step": 1260 + }, + { + "epoch": 0.008168831085296747, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.2593, + "step": 1270 + }, + { + "epoch": 0.008233152589905384, + "grad_norm": 2.40625, + "learning_rate": 2e-05, + "loss": 0.2623, + "step": 1280 + }, + { + "epoch": 0.008297474094514019, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.2591, + "step": 1290 + }, + { + "epoch": 0.008361795599122654, + "grad_norm": 2.1875, + "learning_rate": 2e-05, + "loss": 0.2573, + "step": 1300 + }, + { + "epoch": 0.008426117103731291, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2611, + "step": 1310 + }, + { + "epoch": 0.008490438608339927, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2591, + "step": 1320 + }, + { + "epoch": 0.008554760112948562, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.2599, + "step": 1330 + }, + { + "epoch": 0.008619081617557199, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.259, + "step": 1340 + }, + { + "epoch": 0.008683403122165834, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2661, + "step": 1350 + }, + { + "epoch": 0.00874772462677447, + "grad_norm": 2.40625, + "learning_rate": 2e-05, + "loss": 0.2639, + "step": 1360 + }, + { + "epoch": 0.008812046131383105, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2621, + "step": 1370 + }, + { + "epoch": 0.008876367635991742, + "grad_norm": 2.65625, + "learning_rate": 2e-05, + "loss": 0.2537, + "step": 1380 + }, + { + "epoch": 0.008940689140600377, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2596, + "step": 1390 + }, + { + "epoch": 0.009005010645209012, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.255, + "step": 1400 + }, + { + "epoch": 0.009069332149817649, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2601, + "step": 1410 + }, + { + "epoch": 0.009133653654426284, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 0.2516, + "step": 1420 + }, + { + "epoch": 0.00919797515903492, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.26, + "step": 1430 + }, + { + "epoch": 0.009262296663643556, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.2524, + "step": 1440 + }, + { + "epoch": 0.009326618168252192, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.2551, + "step": 1450 + }, + { + "epoch": 0.009390939672860827, + "grad_norm": 2.296875, + "learning_rate": 2e-05, + "loss": 0.2593, + "step": 1460 + }, + { + "epoch": 0.009455261177469464, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2612, + "step": 1470 + }, + { + "epoch": 0.0095195826820781, + "grad_norm": 2.515625, + "learning_rate": 2e-05, + "loss": 0.2579, + "step": 1480 + }, + { + "epoch": 0.009583904186686735, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2564, + "step": 1490 + }, + { + "epoch": 0.009648225691295371, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.252, + "step": 1500 + }, + { + "epoch": 0.009712547195904007, + "grad_norm": 2.703125, + "learning_rate": 2e-05, + "loss": 0.259, + "step": 1510 + }, + { + "epoch": 0.009776868700512642, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 0.2543, + "step": 1520 + }, + { + "epoch": 0.009841190205121279, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2513, + "step": 1530 + }, + { + "epoch": 0.009905511709729914, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2601, + "step": 1540 + }, + { + "epoch": 0.00996983321433855, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2475, + "step": 1550 + }, + { + "epoch": 0.010034154718947185, + "grad_norm": 2.6875, + "learning_rate": 2e-05, + "loss": 0.2579, + "step": 1560 + }, + { + "epoch": 0.010098476223555822, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2547, + "step": 1570 + }, + { + "epoch": 0.010162797728164457, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2558, + "step": 1580 + }, + { + "epoch": 0.010227119232773092, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2532, + "step": 1590 + }, + { + "epoch": 0.01029144073738173, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.249, + "step": 1600 + }, + { + "epoch": 0.010355762241990365, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 0.2578, + "step": 1610 + }, + { + "epoch": 0.010420083746599, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 0.2536, + "step": 1620 + }, + { + "epoch": 0.010484405251207637, + "grad_norm": 2.78125, + "learning_rate": 2e-05, + "loss": 0.2551, + "step": 1630 + }, + { + "epoch": 0.010548726755816272, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.2538, + "step": 1640 + }, + { + "epoch": 0.010613048260424907, + "grad_norm": 2.5, + "learning_rate": 2e-05, + "loss": 0.2616, + "step": 1650 + }, + { + "epoch": 0.010677369765033544, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2421, + "step": 1660 + }, + { + "epoch": 0.01074169126964218, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 0.2531, + "step": 1670 + }, + { + "epoch": 0.010806012774250815, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 0.2531, + "step": 1680 + }, + { + "epoch": 0.010870334278859452, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2456, + "step": 1690 + }, + { + "epoch": 0.010934655783468087, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2455, + "step": 1700 + }, + { + "epoch": 0.010998977288076722, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2521, + "step": 1710 + }, + { + "epoch": 0.01106329879268536, + "grad_norm": 2.625, + "learning_rate": 2e-05, + "loss": 0.2469, + "step": 1720 + }, + { + "epoch": 0.011127620297293995, + "grad_norm": 2.421875, + "learning_rate": 2e-05, + "loss": 0.2522, + "step": 1730 + }, + { + "epoch": 0.01119194180190263, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2552, + "step": 1740 + }, + { + "epoch": 0.011256263306511267, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.2596, + "step": 1750 + }, + { + "epoch": 0.011320584811119902, + "grad_norm": 3.984375, + "learning_rate": 2e-05, + "loss": 0.2493, + "step": 1760 + }, + { + "epoch": 0.011384906315728537, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2504, + "step": 1770 + }, + { + "epoch": 0.011449227820337173, + "grad_norm": 2.1875, + "learning_rate": 2e-05, + "loss": 0.2472, + "step": 1780 + }, + { + "epoch": 0.01151354932494581, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2513, + "step": 1790 + }, + { + "epoch": 0.011577870829554445, + "grad_norm": 2.796875, + "learning_rate": 2e-05, + "loss": 0.2456, + "step": 1800 + }, + { + "epoch": 0.01164219233416308, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 0.2467, + "step": 1810 + }, + { + "epoch": 0.011706513838771717, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2566, + "step": 1820 + }, + { + "epoch": 0.011770835343380352, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 0.2547, + "step": 1830 + }, + { + "epoch": 0.011835156847988988, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2538, + "step": 1840 + }, + { + "epoch": 0.011899478352597625, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2501, + "step": 1850 + }, + { + "epoch": 0.01196379985720626, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 0.2499, + "step": 1860 + }, + { + "epoch": 0.012028121361814895, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2383, + "step": 1870 + }, + { + "epoch": 0.012092442866423532, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.2397, + "step": 1880 + }, + { + "epoch": 0.012156764371032167, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2449, + "step": 1890 + }, + { + "epoch": 0.012221085875640803, + "grad_norm": 2.59375, + "learning_rate": 2e-05, + "loss": 0.2503, + "step": 1900 + }, + { + "epoch": 0.01228540738024944, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2447, + "step": 1910 + }, + { + "epoch": 0.012349728884858075, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 0.2509, + "step": 1920 + }, + { + "epoch": 0.01241405038946671, + "grad_norm": 2.765625, + "learning_rate": 2e-05, + "loss": 0.2462, + "step": 1930 + }, + { + "epoch": 0.012478371894075347, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2487, + "step": 1940 + }, + { + "epoch": 0.012542693398683982, + "grad_norm": 2.953125, + "learning_rate": 2e-05, + "loss": 0.2439, + "step": 1950 + }, + { + "epoch": 0.012607014903292618, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.2536, + "step": 1960 + }, + { + "epoch": 0.012671336407901253, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2498, + "step": 1970 + }, + { + "epoch": 0.01273565791250989, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2521, + "step": 1980 + }, + { + "epoch": 0.012799979417118525, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2522, + "step": 1990 + }, + { + "epoch": 0.01286430092172716, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.2494, + "step": 2000 + }, + { + "epoch": 0.012928622426335797, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 0.2402, + "step": 2010 + }, + { + "epoch": 0.012992943930944433, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2477, + "step": 2020 + }, + { + "epoch": 0.013057265435553068, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2429, + "step": 2030 + }, + { + "epoch": 0.013121586940161705, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.2416, + "step": 2040 + }, + { + "epoch": 0.01318590844477034, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2443, + "step": 2050 + }, + { + "epoch": 0.013250229949378975, + "grad_norm": 2.953125, + "learning_rate": 2e-05, + "loss": 0.2457, + "step": 2060 + }, + { + "epoch": 0.013314551453987612, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2428, + "step": 2070 + }, + { + "epoch": 0.013378872958596248, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2491, + "step": 2080 + }, + { + "epoch": 0.013443194463204883, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2471, + "step": 2090 + }, + { + "epoch": 0.01350751596781352, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2398, + "step": 2100 + }, + { + "epoch": 0.013571837472422155, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2489, + "step": 2110 + }, + { + "epoch": 0.01363615897703079, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2425, + "step": 2120 + }, + { + "epoch": 0.013700480481639427, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.244, + "step": 2130 + }, + { + "epoch": 0.013764801986248063, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.24, + "step": 2140 + }, + { + "epoch": 0.013829123490856698, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2399, + "step": 2150 + }, + { + "epoch": 0.013893444995465335, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2418, + "step": 2160 + }, + { + "epoch": 0.01395776650007397, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2427, + "step": 2170 + }, + { + "epoch": 0.014022088004682605, + "grad_norm": 2.4375, + "learning_rate": 2e-05, + "loss": 0.2403, + "step": 2180 + }, + { + "epoch": 0.01408640950929124, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2457, + "step": 2190 + }, + { + "epoch": 0.014150731013899878, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.2491, + "step": 2200 + }, + { + "epoch": 0.014419093228245763, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.2406, + "step": 2210 + }, + { + "epoch": 0.014484337993984433, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2421, + "step": 2220 + }, + { + "epoch": 0.0145495827597231, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2342, + "step": 2230 + }, + { + "epoch": 0.01461482752546177, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.2452, + "step": 2240 + }, + { + "epoch": 0.014680072291200438, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2409, + "step": 2250 + }, + { + "epoch": 0.014745317056939107, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.2417, + "step": 2260 + }, + { + "epoch": 0.014810561822677776, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 0.2374, + "step": 2270 + }, + { + "epoch": 0.014875806588416444, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2435, + "step": 2280 + }, + { + "epoch": 0.014941051354155114, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2358, + "step": 2290 + }, + { + "epoch": 0.015006296119893781, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2439, + "step": 2300 + }, + { + "epoch": 0.01507154088563245, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2432, + "step": 2310 + }, + { + "epoch": 0.015136785651371118, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2418, + "step": 2320 + }, + { + "epoch": 0.015202030417109788, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.235, + "step": 2330 + }, + { + "epoch": 0.015267275182848455, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2367, + "step": 2340 + }, + { + "epoch": 0.015332519948587125, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.2425, + "step": 2350 + }, + { + "epoch": 0.015397764714325793, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 0.2417, + "step": 2360 + }, + { + "epoch": 0.015463009480064462, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2352, + "step": 2370 + }, + { + "epoch": 0.01552825424580313, + "grad_norm": 2.3125, + "learning_rate": 2e-05, + "loss": 0.2418, + "step": 2380 + }, + { + "epoch": 0.015593499011541799, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2358, + "step": 2390 + }, + { + "epoch": 0.01565874377728047, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2424, + "step": 2400 + }, + { + "epoch": 0.015723988543019138, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2359, + "step": 2410 + }, + { + "epoch": 0.015789233308757804, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2435, + "step": 2420 + }, + { + "epoch": 0.015854478074496473, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2349, + "step": 2430 + }, + { + "epoch": 0.015919722840235143, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2382, + "step": 2440 + }, + { + "epoch": 0.015984967605973812, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 0.2419, + "step": 2450 + }, + { + "epoch": 0.016050212371712478, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2395, + "step": 2460 + }, + { + "epoch": 0.016115457137451147, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2358, + "step": 2470 + }, + { + "epoch": 0.016180701903189817, + "grad_norm": 2.609375, + "learning_rate": 2e-05, + "loss": 0.2371, + "step": 2480 + }, + { + "epoch": 0.016245946668928486, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2401, + "step": 2490 + }, + { + "epoch": 0.016311191434667152, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.2348, + "step": 2500 + }, + { + "epoch": 0.01637643620040582, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2318, + "step": 2510 + }, + { + "epoch": 0.01644168096614449, + "grad_norm": 2.1875, + "learning_rate": 2e-05, + "loss": 0.2375, + "step": 2520 + }, + { + "epoch": 0.01650692573188316, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2409, + "step": 2530 + }, + { + "epoch": 0.01657217049762183, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2397, + "step": 2540 + }, + { + "epoch": 0.016637415263360496, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2346, + "step": 2550 + }, + { + "epoch": 0.016702660029099165, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2462, + "step": 2560 + }, + { + "epoch": 0.016767904794837835, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2367, + "step": 2570 + }, + { + "epoch": 0.016833149560576504, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2401, + "step": 2580 + }, + { + "epoch": 0.01689839432631517, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2372, + "step": 2590 + }, + { + "epoch": 0.01696363909205384, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2372, + "step": 2600 + }, + { + "epoch": 0.01702888385779251, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.2307, + "step": 2610 + }, + { + "epoch": 0.017094128623531178, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2411, + "step": 2620 + }, + { + "epoch": 0.017159373389269844, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2332, + "step": 2630 + }, + { + "epoch": 0.017224618155008514, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2387, + "step": 2640 + }, + { + "epoch": 0.017289862920747183, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2399, + "step": 2650 + }, + { + "epoch": 0.017355107686485852, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2386, + "step": 2660 + }, + { + "epoch": 0.017420352452224522, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2388, + "step": 2670 + }, + { + "epoch": 0.017485597217963188, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2361, + "step": 2680 + }, + { + "epoch": 0.017550841983701857, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.237, + "step": 2690 + }, + { + "epoch": 0.017616086749440527, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2332, + "step": 2700 + }, + { + "epoch": 0.017739317134478426, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2319, + "step": 2710 + }, + { + "epoch": 0.017804775869291998, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2451, + "step": 2720 + }, + { + "epoch": 0.017870234604105573, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2381, + "step": 2730 + }, + { + "epoch": 0.017935693338919145, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2286, + "step": 2740 + }, + { + "epoch": 0.018001152073732717, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 0.2274, + "step": 2750 + }, + { + "epoch": 0.018066610808546293, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2359, + "step": 2760 + }, + { + "epoch": 0.018132069543359865, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2348, + "step": 2770 + }, + { + "epoch": 0.01819752827817344, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2371, + "step": 2780 + }, + { + "epoch": 0.018262987012987012, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2417, + "step": 2790 + }, + { + "epoch": 0.018328445747800588, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2347, + "step": 2800 + }, + { + "epoch": 0.01839390448261416, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2355, + "step": 2810 + }, + { + "epoch": 0.018459363217427735, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.2443, + "step": 2820 + }, + { + "epoch": 0.018524821952241307, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2333, + "step": 2830 + }, + { + "epoch": 0.01859028068705488, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.2326, + "step": 2840 + }, + { + "epoch": 0.018655739421868454, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2357, + "step": 2850 + }, + { + "epoch": 0.018721198156682026, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2298, + "step": 2860 + }, + { + "epoch": 0.018786656891495602, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 0.2286, + "step": 2870 + }, + { + "epoch": 0.018852115626309174, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.231, + "step": 2880 + }, + { + "epoch": 0.01891757436112275, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.2347, + "step": 2890 + }, + { + "epoch": 0.01898303309593632, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 0.2294, + "step": 2900 + }, + { + "epoch": 0.019048491830749897, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2298, + "step": 2910 + }, + { + "epoch": 0.01911395056556347, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2277, + "step": 2920 + }, + { + "epoch": 0.01917940930037704, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2363, + "step": 2930 + }, + { + "epoch": 0.019244868035190616, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.2303, + "step": 2940 + }, + { + "epoch": 0.019310326770004188, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.238, + "step": 2950 + }, + { + "epoch": 0.019375785504817764, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2296, + "step": 2960 + }, + { + "epoch": 0.019441244239631335, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2328, + "step": 2970 + }, + { + "epoch": 0.01950670297444491, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2331, + "step": 2980 + }, + { + "epoch": 0.019572161709258483, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.2358, + "step": 2990 + }, + { + "epoch": 0.01963762044407206, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2312, + "step": 3000 + }, + { + "epoch": 0.019741717988574728, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2357, + "step": 3010 + }, + { + "epoch": 0.0198073050915268, + "grad_norm": 3.140625, + "learning_rate": 2e-05, + "loss": 0.2317, + "step": 3020 + }, + { + "epoch": 0.019872892194478877, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2379, + "step": 3030 + }, + { + "epoch": 0.019938479297430953, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2291, + "step": 3040 + }, + { + "epoch": 0.02000406640038303, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2318, + "step": 3050 + }, + { + "epoch": 0.020069653503335103, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2317, + "step": 3060 + }, + { + "epoch": 0.02013524060628718, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2308, + "step": 3070 + }, + { + "epoch": 0.020200827709239255, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2313, + "step": 3080 + }, + { + "epoch": 0.02026641481219133, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2299, + "step": 3090 + }, + { + "epoch": 0.020332001915143408, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2319, + "step": 3100 + }, + { + "epoch": 0.02039758901809548, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2327, + "step": 3110 + }, + { + "epoch": 0.020463176121047557, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2278, + "step": 3120 + }, + { + "epoch": 0.020528763223999633, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 0.2313, + "step": 3130 + }, + { + "epoch": 0.02059435032695171, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 0.2291, + "step": 3140 + }, + { + "epoch": 0.020659937429903782, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2295, + "step": 3150 + }, + { + "epoch": 0.02072552453285586, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2335, + "step": 3160 + }, + { + "epoch": 0.020791111635807935, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2271, + "step": 3170 + }, + { + "epoch": 0.02085669873876001, + "grad_norm": 3.34375, + "learning_rate": 2e-05, + "loss": 0.237, + "step": 3180 + }, + { + "epoch": 0.020922285841712084, + "grad_norm": 2.34375, + "learning_rate": 2e-05, + "loss": 0.2256, + "step": 3190 + }, + { + "epoch": 0.02098787294466416, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2293, + "step": 3200 + }, + { + "epoch": 0.021053460047616237, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2291, + "step": 3210 + }, + { + "epoch": 0.021119047150568313, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2243, + "step": 3220 + }, + { + "epoch": 0.021184634253520386, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2261, + "step": 3230 + }, + { + "epoch": 0.021250221356472462, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2289, + "step": 3240 + }, + { + "epoch": 0.02131580845942454, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2386, + "step": 3250 + }, + { + "epoch": 0.021381395562376615, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2255, + "step": 3260 + }, + { + "epoch": 0.02144698266532869, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.2214, + "step": 3270 + }, + { + "epoch": 0.021512569768280764, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2278, + "step": 3280 + }, + { + "epoch": 0.02157815687123284, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2233, + "step": 3290 + }, + { + "epoch": 0.021643743974184917, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2244, + "step": 3300 + }, + { + "epoch": 0.021709331077136993, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2291, + "step": 3310 + }, + { + "epoch": 0.021774918180089066, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2297, + "step": 3320 + }, + { + "epoch": 0.021840505283041142, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2239, + "step": 3330 + }, + { + "epoch": 0.02190609238599322, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2287, + "step": 3340 + }, + { + "epoch": 0.021971679488945295, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2269, + "step": 3350 + }, + { + "epoch": 0.022037266591897368, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2356, + "step": 3360 + }, + { + "epoch": 0.022102853694849444, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.23, + "step": 3370 + }, + { + "epoch": 0.02216844079780152, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2339, + "step": 3380 + }, + { + "epoch": 0.022234027900753597, + "grad_norm": 4.15625, + "learning_rate": 2e-05, + "loss": 0.2293, + "step": 3390 + }, + { + "epoch": 0.022299615003705673, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2296, + "step": 3400 + }, + { + "epoch": 0.022365202106657746, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2345, + "step": 3410 + }, + { + "epoch": 0.022430789209609822, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2311, + "step": 3420 + }, + { + "epoch": 0.0224963763125619, + "grad_norm": 2.234375, + "learning_rate": 2e-05, + "loss": 0.2231, + "step": 3430 + }, + { + "epoch": 0.022561963415513975, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2298, + "step": 3440 + }, + { + "epoch": 0.022627550518466048, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.2273, + "step": 3450 + }, + { + "epoch": 0.022693137621418124, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2333, + "step": 3460 + }, + { + "epoch": 0.0227587247243702, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2202, + "step": 3470 + }, + { + "epoch": 0.022824311827322277, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2343, + "step": 3480 + }, + { + "epoch": 0.02288989893027435, + "grad_norm": 2.296875, + "learning_rate": 2e-05, + "loss": 0.2285, + "step": 3490 + }, + { + "epoch": 0.022955486033226426, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2276, + "step": 3500 + }, + { + "epoch": 0.20421224109844077, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2247, + "step": 3510 + }, + { + "epoch": 0.2047940423551315, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2226, + "step": 3520 + }, + { + "epoch": 0.2053758436118222, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2225, + "step": 3530 + }, + { + "epoch": 0.20595764486851292, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2274, + "step": 3540 + }, + { + "epoch": 0.20653944612520364, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2256, + "step": 3550 + }, + { + "epoch": 0.20712124738189436, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2218, + "step": 3560 + }, + { + "epoch": 0.20770304863858505, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2284, + "step": 3570 + }, + { + "epoch": 0.20828484989527576, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2217, + "step": 3580 + }, + { + "epoch": 0.20886665115196648, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2186, + "step": 3590 + }, + { + "epoch": 0.2094484524086572, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.225, + "step": 3600 + }, + { + "epoch": 0.21003025366534792, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2289, + "step": 3610 + }, + { + "epoch": 0.21061205492203863, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2227, + "step": 3620 + }, + { + "epoch": 0.21119385617872935, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2286, + "step": 3630 + }, + { + "epoch": 0.21177565743542007, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2192, + "step": 3640 + }, + { + "epoch": 0.21235745869211078, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.227, + "step": 3650 + }, + { + "epoch": 0.2129392599488015, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2275, + "step": 3660 + }, + { + "epoch": 0.2135210612054922, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2183, + "step": 3670 + }, + { + "epoch": 0.2141028624621829, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.2294, + "step": 3680 + }, + { + "epoch": 0.21468466371887363, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2215, + "step": 3690 + }, + { + "epoch": 0.21526646497556434, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2256, + "step": 3700 + }, + { + "epoch": 0.21584826623225506, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.21643006748894578, + "grad_norm": 2.296875, + "learning_rate": 2e-05, + "loss": 0.2215, + "step": 3720 + }, + { + "epoch": 0.2170118687456365, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2224, + "step": 3730 + }, + { + "epoch": 0.2175936700023272, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2228, + "step": 3740 + }, + { + "epoch": 0.21817547125901793, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2241, + "step": 3750 + }, + { + "epoch": 0.21875727251570865, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.2257, + "step": 3760 + }, + { + "epoch": 0.21933907377239936, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.218, + "step": 3770 + }, + { + "epoch": 0.21992087502909005, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.2204, + "step": 3780 + }, + { + "epoch": 0.22050267628578077, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.2274, + "step": 3790 + }, + { + "epoch": 0.22108447754247149, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2224, + "step": 3800 + }, + { + "epoch": 0.2216662787991622, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2237, + "step": 3810 + }, + { + "epoch": 0.22224808005585292, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2268, + "step": 3820 + }, + { + "epoch": 0.22282988131254364, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2249, + "step": 3830 + }, + { + "epoch": 0.22341168256923435, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2334, + "step": 3840 + }, + { + "epoch": 0.22399348382592507, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2211, + "step": 3850 + }, + { + "epoch": 0.2245752850826158, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2173, + "step": 3860 + }, + { + "epoch": 0.2251570863393065, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2233, + "step": 3870 + }, + { + "epoch": 0.2257388875959972, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.2258, + "step": 3880 + }, + { + "epoch": 0.2263206888526879, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2222, + "step": 3890 + }, + { + "epoch": 0.22690249010937863, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2223, + "step": 3900 + }, + { + "epoch": 0.22748429136606935, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.2226, + "step": 3910 + }, + { + "epoch": 0.22806609262276006, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2295, + "step": 3920 + }, + { + "epoch": 0.22864789387945078, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.229, + "step": 3930 + }, + { + "epoch": 0.2292296951361415, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2219, + "step": 3940 + }, + { + "epoch": 0.22981149639283222, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 0.2253, + "step": 3950 + }, + { + "epoch": 0.23039329764952293, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2229, + "step": 3960 + }, + { + "epoch": 0.23097509890621365, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2242, + "step": 3970 + }, + { + "epoch": 0.23155690016290434, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2212, + "step": 3980 + }, + { + "epoch": 0.23213870141959506, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2234, + "step": 3990 + }, + { + "epoch": 0.23272050267628577, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2216, + "step": 4000 + }, + { + "epoch": 0.2333023039329765, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2213, + "step": 4010 + }, + { + "epoch": 0.2338841051896672, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2243, + "step": 4020 + }, + { + "epoch": 0.23446590644635792, + "grad_norm": 3.25, + "learning_rate": 2e-05, + "loss": 0.2163, + "step": 4030 + }, + { + "epoch": 0.23504770770304864, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2256, + "step": 4040 + }, + { + "epoch": 0.23562950895973936, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2178, + "step": 4050 + }, + { + "epoch": 0.23621131021643008, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2191, + "step": 4060 + }, + { + "epoch": 0.2367931114731208, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2236, + "step": 4070 + }, + { + "epoch": 0.2373749127298115, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2297, + "step": 4080 + }, + { + "epoch": 0.2379567139865022, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2242, + "step": 4090 + }, + { + "epoch": 0.23853851524319292, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.2249, + "step": 4100 + }, + { + "epoch": 0.23912031649988363, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2225, + "step": 4110 + }, + { + "epoch": 0.23970211775657435, + "grad_norm": 1.984375, + "learning_rate": 2e-05, + "loss": 0.2232, + "step": 4120 + }, + { + "epoch": 0.24028391901326507, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2235, + "step": 4130 + }, + { + "epoch": 0.24086572026995579, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.2183, + "step": 4140 + }, + { + "epoch": 0.2414475215266465, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2216, + "step": 4150 + }, + { + "epoch": 0.24202932278333722, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2218, + "step": 4160 + }, + { + "epoch": 0.24261112404002794, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2167, + "step": 4170 + }, + { + "epoch": 0.24319292529671865, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2258, + "step": 4180 + }, + { + "epoch": 0.24377472655340934, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2174, + "step": 4190 + }, + { + "epoch": 0.24435652781010006, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2179, + "step": 4200 + }, + { + "epoch": 0.24493832906679078, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.2234, + "step": 4210 + }, + { + "epoch": 0.2455201303234815, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.2168, + "step": 4220 + }, + { + "epoch": 0.2461019315801722, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2173, + "step": 4230 + }, + { + "epoch": 0.24668373283686293, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2187, + "step": 4240 + }, + { + "epoch": 0.24726553409355365, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.2195, + "step": 4250 + }, + { + "epoch": 0.24784733535024436, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2169, + "step": 4260 + }, + { + "epoch": 0.24842913660693508, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.219, + "step": 4270 + }, + { + "epoch": 0.2490109378636258, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 0.2176, + "step": 4280 + }, + { + "epoch": 0.2495927391203165, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.2157, + "step": 4290 + }, + { + "epoch": 0.2501745403770072, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2242, + "step": 4300 + }, + { + "epoch": 0.2507563416336979, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2243, + "step": 4310 + }, + { + "epoch": 0.25133814289038864, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2192, + "step": 4320 + }, + { + "epoch": 0.25191994414707936, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2216, + "step": 4330 + }, + { + "epoch": 0.2525017454037701, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2163, + "step": 4340 + }, + { + "epoch": 0.2530835466604608, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2249, + "step": 4350 + }, + { + "epoch": 0.2536653479171515, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 0.2249, + "step": 4360 + }, + { + "epoch": 0.2542471491738422, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2227, + "step": 4370 + }, + { + "epoch": 0.25482895043053294, + "grad_norm": 3.953125, + "learning_rate": 2e-05, + "loss": 0.2163, + "step": 4380 + }, + { + "epoch": 0.25541075168722366, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2168, + "step": 4390 + }, + { + "epoch": 0.2559925529439144, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2164, + "step": 4400 + }, + { + "epoch": 0.2565743542006051, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 4410 + }, + { + "epoch": 0.2571561554572958, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.2203, + "step": 4420 + }, + { + "epoch": 0.2577379567139865, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2154, + "step": 4430 + }, + { + "epoch": 0.25831975797067724, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2214, + "step": 4440 + }, + { + "epoch": 0.2589015592273679, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 0.2136, + "step": 4450 + }, + { + "epoch": 0.2594833604840586, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2204, + "step": 4460 + }, + { + "epoch": 0.26006516174074934, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.219, + "step": 4470 + }, + { + "epoch": 0.26064696299744006, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.2112, + "step": 4480 + }, + { + "epoch": 0.2612287642541308, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2184, + "step": 4490 + }, + { + "epoch": 0.2618105655108215, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2219, + "step": 4500 + }, + { + "epoch": 0.2623923667675122, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2205, + "step": 4510 + }, + { + "epoch": 0.2629741680242029, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.216, + "step": 4520 + }, + { + "epoch": 0.26355596928089364, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2275, + "step": 4530 + }, + { + "epoch": 0.26413777053758436, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.2199, + "step": 4540 + }, + { + "epoch": 0.2647195717942751, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2193, + "step": 4550 + }, + { + "epoch": 0.2653013730509658, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2123, + "step": 4560 + }, + { + "epoch": 0.2658831743076565, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.2244, + "step": 4570 + }, + { + "epoch": 0.26646497556434723, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2165, + "step": 4580 + }, + { + "epoch": 0.26704677682103795, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2167, + "step": 4590 + }, + { + "epoch": 0.26762857807772866, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2146, + "step": 4600 + }, + { + "epoch": 0.2682103793344194, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2166, + "step": 4610 + }, + { + "epoch": 0.2687921805911101, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2148, + "step": 4620 + }, + { + "epoch": 0.2693739818478008, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.2174, + "step": 4630 + }, + { + "epoch": 0.26995578310449153, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2229, + "step": 4640 + }, + { + "epoch": 0.2705375843611822, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2139, + "step": 4650 + }, + { + "epoch": 0.2711193856178729, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 4660 + }, + { + "epoch": 0.2717011868745636, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2109, + "step": 4670 + }, + { + "epoch": 0.27228298813125434, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2168, + "step": 4680 + }, + { + "epoch": 0.27286478938794506, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.2185, + "step": 4690 + }, + { + "epoch": 0.2734465906446358, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2209, + "step": 4700 + }, + { + "epoch": 0.2740283919013265, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2132, + "step": 4710 + }, + { + "epoch": 0.2746101931580172, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2109, + "step": 4720 + }, + { + "epoch": 0.27519199441470793, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2212, + "step": 4730 + }, + { + "epoch": 0.27577379567139865, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2227, + "step": 4740 + }, + { + "epoch": 0.27635559692808936, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.2187, + "step": 4750 + }, + { + "epoch": 0.2769373981847801, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2176, + "step": 4760 + }, + { + "epoch": 0.2775191994414708, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.217, + "step": 4770 + }, + { + "epoch": 0.2781010006981615, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.2158, + "step": 4780 + }, + { + "epoch": 0.27868280195485223, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2147, + "step": 4790 + }, + { + "epoch": 0.27926460321154295, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.221, + "step": 4800 + }, + { + "epoch": 0.27984640446823367, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 4810 + }, + { + "epoch": 0.2804282057249244, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2216, + "step": 4820 + }, + { + "epoch": 0.2810100069816151, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2122, + "step": 4830 + }, + { + "epoch": 0.2815918082383058, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2156, + "step": 4840 + }, + { + "epoch": 0.28217360949499654, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2129, + "step": 4850 + }, + { + "epoch": 0.2827554107516872, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 0.2167, + "step": 4860 + }, + { + "epoch": 0.2833372120083779, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.2152, + "step": 4870 + }, + { + "epoch": 0.28391901326506863, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2119, + "step": 4880 + }, + { + "epoch": 0.28450081452175935, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 4890 + }, + { + "epoch": 0.28508261577845007, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.22, + "step": 4900 + }, + { + "epoch": 0.2856644170351408, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2184, + "step": 4910 + }, + { + "epoch": 0.2862462182918315, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.2243, + "step": 4920 + }, + { + "epoch": 0.2868280195485222, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.2173, + "step": 4930 + }, + { + "epoch": 0.28740982080521293, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.2135, + "step": 4940 + }, + { + "epoch": 0.28799162206190365, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2184, + "step": 4950 + }, + { + "epoch": 0.28857342331859437, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2161, + "step": 4960 + }, + { + "epoch": 0.2891552245752851, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2181, + "step": 4970 + }, + { + "epoch": 0.2897370258319758, + "grad_norm": 1.9140625, + "learning_rate": 2e-05, + "loss": 0.2139, + "step": 4980 + }, + { + "epoch": 0.2903188270886665, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.2135, + "step": 4990 + }, + { + "epoch": 0.29090062834535724, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2145, + "step": 5000 + }, + { + "epoch": 0.29148242960204795, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.215, + "step": 5010 + }, + { + "epoch": 0.29206423085873867, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2147, + "step": 5020 + }, + { + "epoch": 0.2926460321154294, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2142, + "step": 5030 + }, + { + "epoch": 0.2932278333721201, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2148, + "step": 5040 + }, + { + "epoch": 0.2938096346288108, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 5050 + }, + { + "epoch": 0.29439143588550154, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2139, + "step": 5060 + }, + { + "epoch": 0.2949732371421922, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 5070 + }, + { + "epoch": 0.2955550383988829, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2142, + "step": 5080 + }, + { + "epoch": 0.29613683965557364, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.216, + "step": 5090 + }, + { + "epoch": 0.29671864091226435, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2194, + "step": 5100 + }, + { + "epoch": 0.29730044216895507, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 5110 + }, + { + "epoch": 0.2978822434256458, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.216, + "step": 5120 + }, + { + "epoch": 0.2984640446823365, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2113, + "step": 5130 + }, + { + "epoch": 0.2990458459390272, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2155, + "step": 5140 + }, + { + "epoch": 0.29962764719571794, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2083, + "step": 5150 + }, + { + "epoch": 0.30020944845240866, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2179, + "step": 5160 + }, + { + "epoch": 0.3007912497090994, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 5170 + }, + { + "epoch": 0.3013730509657901, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2129, + "step": 5180 + }, + { + "epoch": 0.3019548522224808, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2079, + "step": 5190 + }, + { + "epoch": 0.3025366534791715, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2175, + "step": 5200 + }, + { + "epoch": 0.30311845473586224, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2186, + "step": 5210 + }, + { + "epoch": 0.30370025599255296, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.2091, + "step": 5220 + }, + { + "epoch": 0.3042820572492437, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2132, + "step": 5230 + }, + { + "epoch": 0.3048638585059344, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.2139, + "step": 5240 + }, + { + "epoch": 0.3054456597626251, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2201, + "step": 5250 + }, + { + "epoch": 0.3060274610193158, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.2159, + "step": 5260 + }, + { + "epoch": 0.3066092622760065, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.2146, + "step": 5270 + }, + { + "epoch": 0.3071910635326972, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2075, + "step": 5280 + }, + { + "epoch": 0.3077728647893879, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.215, + "step": 5290 + }, + { + "epoch": 0.30835466604607864, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.2197, + "step": 5300 + }, + { + "epoch": 0.30893646730276936, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2151, + "step": 5310 + }, + { + "epoch": 0.3095182685594601, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.2208, + "step": 5320 + }, + { + "epoch": 0.3101000698161508, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2121, + "step": 5330 + }, + { + "epoch": 0.3106818710728415, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2063, + "step": 5340 + }, + { + "epoch": 0.3112636723295322, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 5350 + }, + { + "epoch": 0.31184547358622294, + "grad_norm": 2.0, + "learning_rate": 2e-05, + "loss": 0.2156, + "step": 5360 + }, + { + "epoch": 0.31242727484291366, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2199, + "step": 5370 + }, + { + "epoch": 0.3130090760996044, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.211, + "step": 5380 + }, + { + "epoch": 0.3135908773562951, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2158, + "step": 5390 + }, + { + "epoch": 0.3141726786129858, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2078, + "step": 5400 + }, + { + "epoch": 0.31475447986967653, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2168, + "step": 5410 + }, + { + "epoch": 0.31533628112636725, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2191, + "step": 5420 + }, + { + "epoch": 0.31591808238305796, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2117, + "step": 5430 + }, + { + "epoch": 0.3164998836397487, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2034, + "step": 5440 + }, + { + "epoch": 0.3170816848964394, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 0.2146, + "step": 5450 + }, + { + "epoch": 0.3176634861531301, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 5460 + }, + { + "epoch": 0.31824528740982083, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2155, + "step": 5470 + }, + { + "epoch": 0.3188270886665115, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2146, + "step": 5480 + }, + { + "epoch": 0.3194088899232022, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2133, + "step": 5490 + }, + { + "epoch": 0.3199906911798929, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2191, + "step": 5500 + }, + { + "epoch": 0.32057249243658364, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 0.2131, + "step": 5510 + }, + { + "epoch": 0.32115429369327436, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.214, + "step": 5520 + }, + { + "epoch": 0.3217360949499651, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2091, + "step": 5530 + }, + { + "epoch": 0.3223178962066558, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.2077, + "step": 5540 + }, + { + "epoch": 0.3228996974633465, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 5550 + }, + { + "epoch": 0.32348149872003723, + "grad_norm": 2.8125, + "learning_rate": 2e-05, + "loss": 0.2149, + "step": 5560 + }, + { + "epoch": 0.32406329997672795, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2111, + "step": 5570 + }, + { + "epoch": 0.32464510123341866, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2136, + "step": 5580 + }, + { + "epoch": 0.3252269024901094, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 5590 + }, + { + "epoch": 0.3258087037468001, + "grad_norm": 2.359375, + "learning_rate": 2e-05, + "loss": 0.2114, + "step": 5600 + }, + { + "epoch": 0.3263905050034908, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.2152, + "step": 5610 + }, + { + "epoch": 0.32697230626018153, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2153, + "step": 5620 + }, + { + "epoch": 0.32755410751687225, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.207, + "step": 5630 + }, + { + "epoch": 0.32813590877356297, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 0.215, + "step": 5640 + }, + { + "epoch": 0.3287177100302537, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 5650 + }, + { + "epoch": 0.3292995112869444, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.217, + "step": 5660 + }, + { + "epoch": 0.3298813125436351, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2094, + "step": 5670 + }, + { + "epoch": 0.33046311380032584, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 5680 + }, + { + "epoch": 0.3310449150570165, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2092, + "step": 5690 + }, + { + "epoch": 0.3316267163137072, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 5700 + }, + { + "epoch": 0.33220851757039793, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.2126, + "step": 5710 + }, + { + "epoch": 0.33279031882708865, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2143, + "step": 5720 + }, + { + "epoch": 0.33337212008377937, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2077, + "step": 5730 + }, + { + "epoch": 0.3339539213404701, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2101, + "step": 5740 + }, + { + "epoch": 0.3345357225971608, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2088, + "step": 5750 + }, + { + "epoch": 0.3351175238538515, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2186, + "step": 5760 + }, + { + "epoch": 0.33569932511054223, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 0.2052, + "step": 5770 + }, + { + "epoch": 0.33628112636723295, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2074, + "step": 5780 + }, + { + "epoch": 0.33686292762392367, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2087, + "step": 5790 + }, + { + "epoch": 0.3374447288806144, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.2156, + "step": 5800 + }, + { + "epoch": 0.3380265301373051, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 5810 + }, + { + "epoch": 0.3386083313939958, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.2093, + "step": 5820 + }, + { + "epoch": 0.33919013265068654, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2071, + "step": 5830 + }, + { + "epoch": 0.33977193390737725, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2179, + "step": 5840 + }, + { + "epoch": 0.34035373516406797, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2113, + "step": 5850 + }, + { + "epoch": 0.3409355364207587, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 5860 + }, + { + "epoch": 0.3415173376774494, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2165, + "step": 5870 + }, + { + "epoch": 0.3420991389341401, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2154, + "step": 5880 + }, + { + "epoch": 0.3426809401908308, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.2146, + "step": 5890 + }, + { + "epoch": 0.3432627414475215, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2077, + "step": 5900 + }, + { + "epoch": 0.3438445427042122, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.2105, + "step": 5910 + }, + { + "epoch": 0.34442634396090294, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.2076, + "step": 5920 + }, + { + "epoch": 0.34500814521759365, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.2107, + "step": 5930 + }, + { + "epoch": 0.34558994647428437, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.209, + "step": 5940 + }, + { + "epoch": 0.3461717477309751, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 0.2101, + "step": 5950 + }, + { + "epoch": 0.3467535489876658, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 5960 + }, + { + "epoch": 0.3473353502443565, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2067, + "step": 5970 + }, + { + "epoch": 0.34791715150104724, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2203, + "step": 5980 + }, + { + "epoch": 0.34849895275773796, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2115, + "step": 5990 + }, + { + "epoch": 0.3490807540144287, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.21, + "step": 6000 + }, + { + "epoch": 0.3496625552711194, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 0.2184, + "step": 6010 + }, + { + "epoch": 0.3502443565278101, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2033, + "step": 6020 + }, + { + "epoch": 0.3508261577845008, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.2103, + "step": 6030 + }, + { + "epoch": 0.35140795904119154, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.2085, + "step": 6040 + }, + { + "epoch": 0.35198976029788226, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 6050 + }, + { + "epoch": 0.352571561554573, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2067, + "step": 6060 + }, + { + "epoch": 0.3531533628112637, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2107, + "step": 6070 + }, + { + "epoch": 0.3537351640679544, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2057, + "step": 6080 + }, + { + "epoch": 0.35431696532464513, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.2135, + "step": 6090 + }, + { + "epoch": 0.3548987665813358, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2066, + "step": 6100 + }, + { + "epoch": 0.3554805678380265, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2162, + "step": 6110 + }, + { + "epoch": 0.3560623690947172, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2062, + "step": 6120 + }, + { + "epoch": 0.35664417035140794, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2115, + "step": 6130 + }, + { + "epoch": 0.35722597160809866, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2108, + "step": 6140 + }, + { + "epoch": 0.3578077728647894, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 0.2096, + "step": 6150 + }, + { + "epoch": 0.3583895741214801, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.207, + "step": 6160 + }, + { + "epoch": 0.3589713753781708, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2071, + "step": 6170 + }, + { + "epoch": 0.3595531766348615, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2109, + "step": 6180 + }, + { + "epoch": 0.36013497789155224, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 6190 + }, + { + "epoch": 0.36071677914824296, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2092, + "step": 6200 + }, + { + "epoch": 0.3612985804049337, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 6210 + }, + { + "epoch": 0.3618803816616244, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2169, + "step": 6220 + }, + { + "epoch": 0.3624621829183151, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 6230 + }, + { + "epoch": 0.36304398417500583, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2112, + "step": 6240 + }, + { + "epoch": 0.36362578543169655, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2064, + "step": 6250 + }, + { + "epoch": 0.36420758668838726, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 6260 + }, + { + "epoch": 0.364789387945078, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2018, + "step": 6270 + }, + { + "epoch": 0.3653711892017687, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 6280 + }, + { + "epoch": 0.3659529904584594, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2094, + "step": 6290 + }, + { + "epoch": 0.36653479171515013, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2119, + "step": 6300 + }, + { + "epoch": 0.3671165929718408, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2134, + "step": 6310 + }, + { + "epoch": 0.3676983942285315, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.2078, + "step": 6320 + }, + { + "epoch": 0.36828019548522223, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.2012, + "step": 6330 + }, + { + "epoch": 0.36886199674191295, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 6340 + }, + { + "epoch": 0.36944379799860366, + "grad_norm": 2.015625, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 6350 + }, + { + "epoch": 0.3700255992552944, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 6360 + }, + { + "epoch": 0.3706074005119851, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 6370 + }, + { + "epoch": 0.3711892017686758, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2185, + "step": 6380 + }, + { + "epoch": 0.37177100302536653, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2094, + "step": 6390 + }, + { + "epoch": 0.37235280428205725, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2116, + "step": 6400 + }, + { + "epoch": 0.37293460553874797, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.2097, + "step": 6410 + }, + { + "epoch": 0.3735164067954387, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2116, + "step": 6420 + }, + { + "epoch": 0.3740982080521294, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2068, + "step": 6430 + }, + { + "epoch": 0.3746800093088201, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2094, + "step": 6440 + }, + { + "epoch": 0.37526181056551083, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2103, + "step": 6450 + }, + { + "epoch": 0.37584361182220155, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.201, + "step": 6460 + }, + { + "epoch": 0.37642541307889227, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 6470 + }, + { + "epoch": 0.377007214335583, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2075, + "step": 6480 + }, + { + "epoch": 0.3775890155922737, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.2045, + "step": 6490 + }, + { + "epoch": 0.3781708168489644, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2085, + "step": 6500 + }, + { + "epoch": 0.3787526181056551, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2156, + "step": 6510 + }, + { + "epoch": 0.3793344193623458, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2152, + "step": 6520 + }, + { + "epoch": 0.3799162206190365, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 0.2085, + "step": 6530 + }, + { + "epoch": 0.38049802187572723, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2068, + "step": 6540 + }, + { + "epoch": 0.38107982313241795, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2148, + "step": 6550 + }, + { + "epoch": 0.38166162438910867, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2052, + "step": 6560 + }, + { + "epoch": 0.3822434256457994, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.2098, + "step": 6570 + }, + { + "epoch": 0.3828252269024901, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.2048, + "step": 6580 + }, + { + "epoch": 0.3834070281591808, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2031, + "step": 6590 + }, + { + "epoch": 0.38398882941587154, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 6600 + }, + { + "epoch": 0.38457063067256225, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2072, + "step": 6610 + }, + { + "epoch": 0.38515243192925297, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2026, + "step": 6620 + }, + { + "epoch": 0.3857342331859437, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2031, + "step": 6630 + }, + { + "epoch": 0.3863160344426344, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2104, + "step": 6640 + }, + { + "epoch": 0.3868978356993251, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.2065, + "step": 6650 + }, + { + "epoch": 0.38747963695601584, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2067, + "step": 6660 + }, + { + "epoch": 0.38806143821270656, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.2109, + "step": 6670 + }, + { + "epoch": 0.3886432394693973, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2045, + "step": 6680 + }, + { + "epoch": 0.389225040726088, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2026, + "step": 6690 + }, + { + "epoch": 0.3898068419827787, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2036, + "step": 6700 + }, + { + "epoch": 0.3903886432394694, + "grad_norm": 1.984375, + "learning_rate": 2e-05, + "loss": 0.2152, + "step": 6710 + }, + { + "epoch": 0.3909704444961601, + "grad_norm": 3.0, + "learning_rate": 2e-05, + "loss": 0.2053, + "step": 6720 + }, + { + "epoch": 0.3915522457528508, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.21, + "step": 6730 + }, + { + "epoch": 0.3921340470095415, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2068, + "step": 6740 + }, + { + "epoch": 0.39271584826623224, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.2063, + "step": 6750 + }, + { + "epoch": 0.39329764952292295, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2093, + "step": 6760 + }, + { + "epoch": 0.39387945077961367, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2043, + "step": 6770 + }, + { + "epoch": 0.3944612520363044, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 0.211, + "step": 6780 + }, + { + "epoch": 0.3950430532929951, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2111, + "step": 6790 + }, + { + "epoch": 0.3956248545496858, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2022, + "step": 6800 + }, + { + "epoch": 0.39620665580637654, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 6810 + }, + { + "epoch": 0.39678845706306726, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2143, + "step": 6820 + }, + { + "epoch": 0.397370258319758, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.2124, + "step": 6830 + }, + { + "epoch": 0.3979520595764487, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.2185, + "step": 6840 + }, + { + "epoch": 0.3985338608331394, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.2118, + "step": 6850 + }, + { + "epoch": 0.3991156620898301, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.2091, + "step": 6860 + }, + { + "epoch": 0.39969746334652084, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 6870 + }, + { + "epoch": 0.40027926460321156, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2025, + "step": 6880 + }, + { + "epoch": 0.4008610658599023, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2035, + "step": 6890 + }, + { + "epoch": 0.401442867116593, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 6900 + }, + { + "epoch": 0.4020246683732837, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.2096, + "step": 6910 + }, + { + "epoch": 0.40260646962997443, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.207, + "step": 6920 + }, + { + "epoch": 0.4031882708866651, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2113, + "step": 6930 + }, + { + "epoch": 0.4037700721433558, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.2077, + "step": 6940 + }, + { + "epoch": 0.4043518734000465, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.1999, + "step": 6950 + }, + { + "epoch": 0.40493367465673724, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 6960 + }, + { + "epoch": 0.40551547591342796, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2058, + "step": 6970 + }, + { + "epoch": 0.4060972771701187, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.209, + "step": 6980 + }, + { + "epoch": 0.4066790784268094, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2103, + "step": 6990 + }, + { + "epoch": 0.4072608796835001, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2117, + "step": 7000 + }, + { + "epoch": 0.4078426809401908, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 7010 + }, + { + "epoch": 0.40842448219688154, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2129, + "step": 7020 + }, + { + "epoch": 0.40900628345357226, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 7030 + }, + { + "epoch": 0.409588084710263, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 7040 + }, + { + "epoch": 0.4101698859669537, + "grad_norm": 2.203125, + "learning_rate": 2e-05, + "loss": 0.2085, + "step": 7050 + }, + { + "epoch": 0.4107516872236444, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2076, + "step": 7060 + }, + { + "epoch": 0.41133348848033513, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2047, + "step": 7070 + }, + { + "epoch": 0.41191528973702585, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2079, + "step": 7080 + }, + { + "epoch": 0.41249709099371656, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2016, + "step": 7090 + }, + { + "epoch": 0.4130788922504073, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 7100 + }, + { + "epoch": 0.413660693507098, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2076, + "step": 7110 + }, + { + "epoch": 0.4142424947637887, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 7120 + }, + { + "epoch": 0.4148242960204794, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2003, + "step": 7130 + }, + { + "epoch": 0.4154060972771701, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 7140 + }, + { + "epoch": 0.4159878985338608, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.2067, + "step": 7150 + }, + { + "epoch": 0.41656969979055153, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 7160 + }, + { + "epoch": 0.41715150104724225, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 7170 + }, + { + "epoch": 0.41773330230393296, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2007, + "step": 7180 + }, + { + "epoch": 0.4183151035606237, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.2061, + "step": 7190 + }, + { + "epoch": 0.4188969048173144, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.2026, + "step": 7200 + }, + { + "epoch": 0.4194787060740051, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2088, + "step": 7210 + }, + { + "epoch": 0.42006050733069583, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2037, + "step": 7220 + }, + { + "epoch": 0.42064230858738655, + "grad_norm": 2.578125, + "learning_rate": 2e-05, + "loss": 0.2063, + "step": 7230 + }, + { + "epoch": 0.42122410984407727, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 7240 + }, + { + "epoch": 0.421805911100768, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2048, + "step": 7250 + }, + { + "epoch": 0.4223877123574587, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 7260 + }, + { + "epoch": 0.4229695136141494, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1988, + "step": 7270 + }, + { + "epoch": 0.42355131487084013, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2082, + "step": 7280 + }, + { + "epoch": 0.42413311612753085, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 7290 + }, + { + "epoch": 0.42471491738422157, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1968, + "step": 7300 + }, + { + "epoch": 0.4252967186409123, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1984, + "step": 7310 + }, + { + "epoch": 0.425878519897603, + "grad_norm": 2.46875, + "learning_rate": 2e-05, + "loss": 0.2068, + "step": 7320 + }, + { + "epoch": 0.4264603211542937, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.2029, + "step": 7330 + }, + { + "epoch": 0.4270421224109844, + "grad_norm": 2.40625, + "learning_rate": 2e-05, + "loss": 0.2093, + "step": 7340 + }, + { + "epoch": 0.4276239236676751, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 7350 + }, + { + "epoch": 0.4282057249243658, + "grad_norm": 3.625, + "learning_rate": 2e-05, + "loss": 0.2018, + "step": 7360 + }, + { + "epoch": 0.42878752618105653, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2011, + "step": 7370 + }, + { + "epoch": 0.42936932743774725, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 7380 + }, + { + "epoch": 0.42995112869443797, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 7390 + }, + { + "epoch": 0.4305329299511287, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 7400 + }, + { + "epoch": 0.4311147312078194, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2114, + "step": 7410 + }, + { + "epoch": 0.4316965324645101, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1999, + "step": 7420 + }, + { + "epoch": 0.43227833372120084, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.2031, + "step": 7430 + }, + { + "epoch": 0.43286013497789155, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 7440 + }, + { + "epoch": 0.43344193623458227, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2042, + "step": 7450 + }, + { + "epoch": 0.434023737491273, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2036, + "step": 7460 + }, + { + "epoch": 0.4346055387479637, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2049, + "step": 7470 + }, + { + "epoch": 0.4351873400046544, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 7480 + }, + { + "epoch": 0.43576914126134514, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2091, + "step": 7490 + }, + { + "epoch": 0.43635094251803586, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 0.2114, + "step": 7500 + }, + { + "epoch": 0.4369327437747266, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2053, + "step": 7510 + }, + { + "epoch": 0.4375145450314173, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 7520 + }, + { + "epoch": 0.438096346288108, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1968, + "step": 7530 + }, + { + "epoch": 0.4386781475447987, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 7540 + }, + { + "epoch": 0.4392599488014894, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 7550 + }, + { + "epoch": 0.4398417500581801, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2027, + "step": 7560 + }, + { + "epoch": 0.4404235513148708, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.201, + "step": 7570 + }, + { + "epoch": 0.44100535257156154, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2057, + "step": 7580 + }, + { + "epoch": 0.44158715382825225, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 7590 + }, + { + "epoch": 0.44216895508494297, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 7600 + }, + { + "epoch": 0.4427507563416337, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2039, + "step": 7610 + }, + { + "epoch": 0.4433325575983244, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2079, + "step": 7620 + }, + { + "epoch": 0.4439143588550151, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.202, + "step": 7630 + }, + { + "epoch": 0.44449616011170584, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2022, + "step": 7640 + }, + { + "epoch": 0.44507796136839656, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2013, + "step": 7650 + }, + { + "epoch": 0.4456597626250873, + "grad_norm": 1.921875, + "learning_rate": 2e-05, + "loss": 0.2021, + "step": 7660 + }, + { + "epoch": 0.446241563881778, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.206, + "step": 7670 + }, + { + "epoch": 0.4468233651384687, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.2018, + "step": 7680 + }, + { + "epoch": 0.4474051663951594, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1979, + "step": 7690 + }, + { + "epoch": 0.44798696765185014, + "grad_norm": 2.0625, + "learning_rate": 2e-05, + "loss": 0.2063, + "step": 7700 + }, + { + "epoch": 0.44856876890854086, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2001, + "step": 7710 + }, + { + "epoch": 0.4491505701652316, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 7720 + }, + { + "epoch": 0.4497323714219223, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2037, + "step": 7730 + }, + { + "epoch": 0.450314172678613, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.209, + "step": 7740 + }, + { + "epoch": 0.4508959739353037, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.2001, + "step": 7750 + }, + { + "epoch": 0.4514777751919944, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2054, + "step": 7760 + }, + { + "epoch": 0.4520595764486851, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.204, + "step": 7770 + }, + { + "epoch": 0.4526413777053758, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2025, + "step": 7780 + }, + { + "epoch": 0.45322317896206654, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.1992, + "step": 7790 + }, + { + "epoch": 0.45380498021875726, + "grad_norm": 2.25, + "learning_rate": 2e-05, + "loss": 0.1998, + "step": 7800 + }, + { + "epoch": 0.454386781475448, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 7810 + }, + { + "epoch": 0.4549685827321387, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.206, + "step": 7820 + }, + { + "epoch": 0.4555503839888294, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2024, + "step": 7830 + }, + { + "epoch": 0.4561321852455201, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1985, + "step": 7840 + }, + { + "epoch": 0.45671398650221084, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 7850 + }, + { + "epoch": 0.45729578775890156, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2052, + "step": 7860 + }, + { + "epoch": 0.4578775890155923, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.204, + "step": 7870 + }, + { + "epoch": 0.458459390272283, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 7880 + }, + { + "epoch": 0.4590411915289737, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 7890 + }, + { + "epoch": 0.45962299278566443, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1986, + "step": 7900 + }, + { + "epoch": 0.46020479404235515, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.1971, + "step": 7910 + }, + { + "epoch": 0.46078659529904586, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.2011, + "step": 7920 + }, + { + "epoch": 0.4613683965557366, + "grad_norm": 2.40625, + "learning_rate": 2e-05, + "loss": 0.207, + "step": 7930 + }, + { + "epoch": 0.4619501978124273, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 7940 + }, + { + "epoch": 0.462531999069118, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2053, + "step": 7950 + }, + { + "epoch": 0.4631138003258087, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 7960 + }, + { + "epoch": 0.4636956015824994, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 7970 + }, + { + "epoch": 0.4642774028391901, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1986, + "step": 7980 + }, + { + "epoch": 0.46485920409588083, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2033, + "step": 7990 + }, + { + "epoch": 0.46544100535257155, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.2005, + "step": 8000 + }, + { + "epoch": 0.46602280660926226, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2101, + "step": 8010 + }, + { + "epoch": 0.466604607865953, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 8020 + }, + { + "epoch": 0.4671864091226437, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.1992, + "step": 8030 + }, + { + "epoch": 0.4677682103793344, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2069, + "step": 8040 + }, + { + "epoch": 0.46835001163602513, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 8050 + }, + { + "epoch": 0.46893181289271585, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.203, + "step": 8060 + }, + { + "epoch": 0.46951361414940657, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.2023, + "step": 8070 + }, + { + "epoch": 0.4700954154060973, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 8080 + }, + { + "epoch": 0.470677216662788, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2094, + "step": 8090 + }, + { + "epoch": 0.4712590179194787, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 0.202, + "step": 8100 + }, + { + "epoch": 0.47184081917616943, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 0.1977, + "step": 8110 + }, + { + "epoch": 0.47242262043286015, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2017, + "step": 8120 + }, + { + "epoch": 0.47300442168955087, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 8130 + }, + { + "epoch": 0.4735862229462416, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2109, + "step": 8140 + }, + { + "epoch": 0.4741680242029323, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 0.1937, + "step": 8150 + }, + { + "epoch": 0.474749825459623, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 8160 + }, + { + "epoch": 0.4753316267163137, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.2024, + "step": 8170 + }, + { + "epoch": 0.4759134279730044, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2028, + "step": 8180 + }, + { + "epoch": 0.4764952292296951, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 8190 + }, + { + "epoch": 0.47707703048638583, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 0.2067, + "step": 8200 + }, + { + "epoch": 0.47765883174307655, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 8210 + }, + { + "epoch": 0.47824063299976727, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.197, + "step": 8220 + }, + { + "epoch": 0.478822434256458, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1956, + "step": 8230 + }, + { + "epoch": 0.4794042355131487, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 8240 + }, + { + "epoch": 0.4799860367698394, + "grad_norm": 2.15625, + "learning_rate": 2e-05, + "loss": 0.1975, + "step": 8250 + }, + { + "epoch": 0.48056783802653014, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1969, + "step": 8260 + }, + { + "epoch": 0.48114963928322085, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2013, + "step": 8270 + }, + { + "epoch": 0.48173144053991157, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 8280 + }, + { + "epoch": 0.4823132417966023, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2035, + "step": 8290 + }, + { + "epoch": 0.482895043053293, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1969, + "step": 8300 + }, + { + "epoch": 0.4834768443099837, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 8310 + }, + { + "epoch": 0.48405864556667444, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 8320 + }, + { + "epoch": 0.48464044682336516, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.2047, + "step": 8330 + }, + { + "epoch": 0.4852222480800559, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.2075, + "step": 8340 + }, + { + "epoch": 0.4858040493367466, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.2021, + "step": 8350 + }, + { + "epoch": 0.4863858505934373, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 8360 + }, + { + "epoch": 0.48696765185012797, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 8370 + }, + { + "epoch": 0.4875494531068187, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.2022, + "step": 8380 + }, + { + "epoch": 0.4881312543635094, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 8390 + }, + { + "epoch": 0.4887130556202001, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 8400 + }, + { + "epoch": 0.48929485687689084, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.2027, + "step": 8410 + }, + { + "epoch": 0.48987665813358156, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2009, + "step": 8420 + }, + { + "epoch": 0.49045845939027227, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.2031, + "step": 8430 + }, + { + "epoch": 0.491040260646963, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 8440 + }, + { + "epoch": 0.4916220619036537, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 8450 + }, + { + "epoch": 0.4922038631603444, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.2, + "step": 8460 + }, + { + "epoch": 0.49278566441703514, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.1986, + "step": 8470 + }, + { + "epoch": 0.49336746567372586, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.1997, + "step": 8480 + }, + { + "epoch": 0.4939492669304166, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.2012, + "step": 8490 + }, + { + "epoch": 0.4945310681871073, + "grad_norm": 2.28125, + "learning_rate": 2e-05, + "loss": 0.1993, + "step": 8500 + }, + { + "epoch": 0.495112869443798, + "grad_norm": 3.03125, + "learning_rate": 2e-05, + "loss": 0.2046, + "step": 8510 + }, + { + "epoch": 0.4956946707004887, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 8520 + }, + { + "epoch": 0.49627647195717944, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2005, + "step": 8530 + }, + { + "epoch": 0.49685827321387016, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2065, + "step": 8540 + }, + { + "epoch": 0.4974400744705609, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.2061, + "step": 8550 + }, + { + "epoch": 0.4980218757272516, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 8560 + }, + { + "epoch": 0.4986036769839423, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.2103, + "step": 8570 + }, + { + "epoch": 0.499185478240633, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1934, + "step": 8580 + }, + { + "epoch": 0.4997672794973237, + "grad_norm": 2.140625, + "learning_rate": 2e-05, + "loss": 0.203, + "step": 8590 + }, + { + "epoch": 0.5003490807540144, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 8600 + }, + { + "epoch": 0.5009308820107051, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1978, + "step": 8610 + }, + { + "epoch": 0.5015126832673958, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 8620 + }, + { + "epoch": 0.5020944845240866, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 8630 + }, + { + "epoch": 0.5026762857807773, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1964, + "step": 8640 + }, + { + "epoch": 0.503258087037468, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.2076, + "step": 8650 + }, + { + "epoch": 0.5038398882941587, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.2042, + "step": 8660 + }, + { + "epoch": 0.5044216895508494, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2003, + "step": 8670 + }, + { + "epoch": 0.5050034908075401, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 8680 + }, + { + "epoch": 0.5055852920642309, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 8690 + }, + { + "epoch": 0.5061670933209216, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2042, + "step": 8700 + }, + { + "epoch": 0.5067488945776123, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1998, + "step": 8710 + }, + { + "epoch": 0.507330695834303, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.2107, + "step": 8720 + }, + { + "epoch": 0.5079124970909937, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.199, + "step": 8730 + }, + { + "epoch": 0.5084942983476844, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2059, + "step": 8740 + }, + { + "epoch": 0.5090760996043752, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.2057, + "step": 8750 + }, + { + "epoch": 0.5096579008610659, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.198, + "step": 8760 + }, + { + "epoch": 0.5102397021177566, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1962, + "step": 8770 + }, + { + "epoch": 0.5108215033744473, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.199, + "step": 8780 + }, + { + "epoch": 0.511403304631138, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2023, + "step": 8790 + }, + { + "epoch": 0.5119851058878288, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.2043, + "step": 8800 + }, + { + "epoch": 0.5125669071445195, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1959, + "step": 8810 + }, + { + "epoch": 0.5131487084012102, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 8820 + }, + { + "epoch": 0.5137305096579009, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2009, + "step": 8830 + }, + { + "epoch": 0.5143123109145916, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2055, + "step": 8840 + }, + { + "epoch": 0.5148941121712823, + "grad_norm": 2.046875, + "learning_rate": 2e-05, + "loss": 0.2037, + "step": 8850 + }, + { + "epoch": 0.515475913427973, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1999, + "step": 8860 + }, + { + "epoch": 0.5160577146846638, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 8870 + }, + { + "epoch": 0.5166395159413545, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 8880 + }, + { + "epoch": 0.5172213171980451, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 8890 + }, + { + "epoch": 0.5178031184547358, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 0.1999, + "step": 8900 + }, + { + "epoch": 0.5183849197114265, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 8910 + }, + { + "epoch": 0.5189667209681172, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 8920 + }, + { + "epoch": 0.519548522224808, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.2033, + "step": 8930 + }, + { + "epoch": 0.5201303234814987, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1939, + "step": 8940 + }, + { + "epoch": 0.5207121247381894, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.2023, + "step": 8950 + }, + { + "epoch": 0.5212939259948801, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2116, + "step": 8960 + }, + { + "epoch": 0.5218757272515708, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1955, + "step": 8970 + }, + { + "epoch": 0.5224575285082615, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1979, + "step": 8980 + }, + { + "epoch": 0.5230393297649523, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 8990 + }, + { + "epoch": 0.523621131021643, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1995, + "step": 9000 + }, + { + "epoch": 0.5242029322783337, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.202, + "step": 9010 + }, + { + "epoch": 0.5247847335350244, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.1997, + "step": 9020 + }, + { + "epoch": 0.5253665347917151, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1984, + "step": 9030 + }, + { + "epoch": 0.5259483360484059, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 9040 + }, + { + "epoch": 0.5265301373050966, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 9050 + }, + { + "epoch": 0.5271119385617873, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.2021, + "step": 9060 + }, + { + "epoch": 0.527693739818478, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 9070 + }, + { + "epoch": 0.5282755410751687, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 9080 + }, + { + "epoch": 0.5288573423318594, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1945, + "step": 9090 + }, + { + "epoch": 0.5294391435885502, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.1889, + "step": 9100 + }, + { + "epoch": 0.5300209448452409, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.204, + "step": 9110 + }, + { + "epoch": 0.5306027461019316, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.2003, + "step": 9120 + }, + { + "epoch": 0.5311845473586223, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 9130 + }, + { + "epoch": 0.531766348615313, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.2016, + "step": 9140 + }, + { + "epoch": 0.5323481498720037, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2051, + "step": 9150 + }, + { + "epoch": 0.5329299511286945, + "grad_norm": 2.6875, + "learning_rate": 2e-05, + "loss": 0.2016, + "step": 9160 + }, + { + "epoch": 0.5335117523853852, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.2005, + "step": 9170 + }, + { + "epoch": 0.5340935536420759, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2001, + "step": 9180 + }, + { + "epoch": 0.5346753548987666, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 9190 + }, + { + "epoch": 0.5352571561554573, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 9200 + }, + { + "epoch": 0.535838957412148, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1972, + "step": 9210 + }, + { + "epoch": 0.5364207586688388, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1928, + "step": 9220 + }, + { + "epoch": 0.5370025599255295, + "grad_norm": 1.9375, + "learning_rate": 2e-05, + "loss": 0.1996, + "step": 9230 + }, + { + "epoch": 0.5375843611822202, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1943, + "step": 9240 + }, + { + "epoch": 0.5381661624389109, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.1901, + "step": 9250 + }, + { + "epoch": 0.5387479636956016, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.2016, + "step": 9260 + }, + { + "epoch": 0.5393297649522923, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1996, + "step": 9270 + }, + { + "epoch": 0.5399115662089831, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1972, + "step": 9280 + }, + { + "epoch": 0.5404933674656738, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.2063, + "step": 9290 + }, + { + "epoch": 0.5410751687223644, + "grad_norm": 1.8359375, + "learning_rate": 2e-05, + "loss": 0.1986, + "step": 9300 + }, + { + "epoch": 0.5416569699790551, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 0.2049, + "step": 9310 + }, + { + "epoch": 0.5422387712357458, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1968, + "step": 9320 + }, + { + "epoch": 0.5428205724924365, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 9330 + }, + { + "epoch": 0.5434023737491273, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 9340 + }, + { + "epoch": 0.543984175005818, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1961, + "step": 9350 + }, + { + "epoch": 0.5445659762625087, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.205, + "step": 9360 + }, + { + "epoch": 0.5451477775191994, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2042, + "step": 9370 + }, + { + "epoch": 0.5457295787758901, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1927, + "step": 9380 + }, + { + "epoch": 0.5463113800325808, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.205, + "step": 9390 + }, + { + "epoch": 0.5468931812892716, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 9400 + }, + { + "epoch": 0.5474749825459623, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.2069, + "step": 9410 + }, + { + "epoch": 0.548056783802653, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1974, + "step": 9420 + }, + { + "epoch": 0.5486385850593437, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 9430 + }, + { + "epoch": 0.5492203863160344, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.1969, + "step": 9440 + }, + { + "epoch": 0.5498021875727251, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 9450 + }, + { + "epoch": 0.5503839888294159, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1983, + "step": 9460 + }, + { + "epoch": 0.5509657900861066, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1945, + "step": 9470 + }, + { + "epoch": 0.5515475913427973, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.2043, + "step": 9480 + }, + { + "epoch": 0.552129392599488, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 9490 + }, + { + "epoch": 0.5527111938561787, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.203, + "step": 9500 + }, + { + "epoch": 0.5532929951128694, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.2024, + "step": 9510 + }, + { + "epoch": 0.5538747963695602, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 9520 + }, + { + "epoch": 0.5544565976262509, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 9530 + }, + { + "epoch": 0.5550383988829416, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1989, + "step": 9540 + }, + { + "epoch": 0.5556202001396323, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1998, + "step": 9550 + }, + { + "epoch": 0.556202001396323, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2009, + "step": 9560 + }, + { + "epoch": 0.5567838026530137, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 9570 + }, + { + "epoch": 0.5573656039097045, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.1979, + "step": 9580 + }, + { + "epoch": 0.5579474051663952, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 9590 + }, + { + "epoch": 0.5585292064230859, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 9600 + }, + { + "epoch": 0.5591110076797766, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 9610 + }, + { + "epoch": 0.5596928089364673, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 9620 + }, + { + "epoch": 0.560274610193158, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1916, + "step": 9630 + }, + { + "epoch": 0.5608564114498488, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1925, + "step": 9640 + }, + { + "epoch": 0.5614382127065395, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.2056, + "step": 9650 + }, + { + "epoch": 0.5620200139632302, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1989, + "step": 9660 + }, + { + "epoch": 0.5626018152199209, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.2061, + "step": 9670 + }, + { + "epoch": 0.5631836164766116, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.1993, + "step": 9680 + }, + { + "epoch": 0.5637654177333024, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 9690 + }, + { + "epoch": 0.5643472189899931, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1978, + "step": 9700 + }, + { + "epoch": 0.5649290202466837, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 9710 + }, + { + "epoch": 0.5655108215033744, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 9720 + }, + { + "epoch": 0.5660926227600651, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 9730 + }, + { + "epoch": 0.5666744240167558, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1994, + "step": 9740 + }, + { + "epoch": 0.5672562252734465, + "grad_norm": 1.8984375, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 9750 + }, + { + "epoch": 0.5678380265301373, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 9760 + }, + { + "epoch": 0.568419827786828, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 9770 + }, + { + "epoch": 0.5690016290435187, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1968, + "step": 9780 + }, + { + "epoch": 0.5695834303002094, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 9790 + }, + { + "epoch": 0.5701652315569001, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.1956, + "step": 9800 + }, + { + "epoch": 0.5707470328135908, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 9810 + }, + { + "epoch": 0.5713288340702816, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.1954, + "step": 9820 + }, + { + "epoch": 0.5719106353269723, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.2012, + "step": 9830 + }, + { + "epoch": 0.572492436583663, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1908, + "step": 9840 + }, + { + "epoch": 0.5730742378403537, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1983, + "step": 9850 + }, + { + "epoch": 0.5736560390970444, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2031, + "step": 9860 + }, + { + "epoch": 0.5742378403537352, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1971, + "step": 9870 + }, + { + "epoch": 0.5748196416104259, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 9880 + }, + { + "epoch": 0.5754014428671166, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1993, + "step": 9890 + }, + { + "epoch": 0.5759832441238073, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.1979, + "step": 9900 + }, + { + "epoch": 0.576565045380498, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 9910 + }, + { + "epoch": 0.5771468466371887, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1889, + "step": 9920 + }, + { + "epoch": 0.5777286478938795, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1955, + "step": 9930 + }, + { + "epoch": 0.5783104491505702, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.1942, + "step": 9940 + }, + { + "epoch": 0.5788922504072609, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 9950 + }, + { + "epoch": 0.5794740516639516, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1961, + "step": 9960 + }, + { + "epoch": 0.5800558529206423, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 9970 + }, + { + "epoch": 0.580637654177333, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 9980 + }, + { + "epoch": 0.5812194554340238, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1927, + "step": 9990 + }, + { + "epoch": 0.5818012566907145, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.1866, + "step": 10000 + }, + { + "epoch": 0.5823830579474052, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1975, + "step": 10010 + }, + { + "epoch": 0.5829648592040959, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 10020 + }, + { + "epoch": 0.5835466604607866, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 10030 + }, + { + "epoch": 0.5841284617174773, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 10040 + }, + { + "epoch": 0.5847102629741681, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 10050 + }, + { + "epoch": 0.5852920642308588, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 0.1919, + "step": 10060 + }, + { + "epoch": 0.5858738654875495, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 10070 + }, + { + "epoch": 0.5864556667442402, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 10080 + }, + { + "epoch": 0.5870374680009309, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 10090 + }, + { + "epoch": 0.5876192692576216, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.202, + "step": 10100 + }, + { + "epoch": 0.5882010705143124, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1992, + "step": 10110 + }, + { + "epoch": 0.5887828717710031, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 10120 + }, + { + "epoch": 0.5893646730276937, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 10130 + }, + { + "epoch": 0.5899464742843844, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 10140 + }, + { + "epoch": 0.5905282755410751, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1994, + "step": 10150 + }, + { + "epoch": 0.5911100767977658, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 10160 + }, + { + "epoch": 0.5916918780544566, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1963, + "step": 10170 + }, + { + "epoch": 0.5922736793111473, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1936, + "step": 10180 + }, + { + "epoch": 0.592855480567838, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1975, + "step": 10190 + }, + { + "epoch": 0.5934372818245287, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.2004, + "step": 10200 + }, + { + "epoch": 0.5940190830812194, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.1942, + "step": 10210 + }, + { + "epoch": 0.5946008843379101, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 10220 + }, + { + "epoch": 0.5951826855946009, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 10230 + }, + { + "epoch": 0.5957644868512916, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1945, + "step": 10240 + }, + { + "epoch": 0.5963462881079823, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 10250 + }, + { + "epoch": 0.596928089364673, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 0.2036, + "step": 10260 + }, + { + "epoch": 0.5975098906213637, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1988, + "step": 10270 + }, + { + "epoch": 0.5980916918780544, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 0.1969, + "step": 10280 + }, + { + "epoch": 0.5986734931347452, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 10290 + }, + { + "epoch": 0.5992552943914359, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.1914, + "step": 10300 + }, + { + "epoch": 0.5998370956481266, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.199, + "step": 10310 + }, + { + "epoch": 0.6004188969048173, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 10320 + }, + { + "epoch": 0.601000698161508, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1993, + "step": 10330 + }, + { + "epoch": 0.6015824994181987, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 10340 + }, + { + "epoch": 0.6021643006748895, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 10350 + }, + { + "epoch": 0.6027461019315802, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1885, + "step": 10360 + }, + { + "epoch": 0.6033279031882709, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.2039, + "step": 10370 + }, + { + "epoch": 0.6039097044449616, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1868, + "step": 10380 + }, + { + "epoch": 0.6044915057016523, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2036, + "step": 10390 + }, + { + "epoch": 0.605073306958343, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.2035, + "step": 10400 + }, + { + "epoch": 0.6056551082150338, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 10410 + }, + { + "epoch": 0.6062369094717245, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.195, + "step": 10420 + }, + { + "epoch": 0.6068187107284152, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1988, + "step": 10430 + }, + { + "epoch": 0.6074005119851059, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 10440 + }, + { + "epoch": 0.6079823132417966, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1978, + "step": 10450 + }, + { + "epoch": 0.6085641144984874, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.1996, + "step": 10460 + }, + { + "epoch": 0.6091459157551781, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.2012, + "step": 10470 + }, + { + "epoch": 0.6097277170118688, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1911, + "step": 10480 + }, + { + "epoch": 0.6103095182685595, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1915, + "step": 10490 + }, + { + "epoch": 0.6108913195252502, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1956, + "step": 10500 + }, + { + "epoch": 0.6114731207819409, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 10510 + }, + { + "epoch": 0.6120549220386317, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.2038, + "step": 10520 + }, + { + "epoch": 0.6126367232953224, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 10530 + }, + { + "epoch": 0.613218524552013, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1942, + "step": 10540 + }, + { + "epoch": 0.6138003258087037, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.1916, + "step": 10550 + }, + { + "epoch": 0.6143821270653944, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 10560 + }, + { + "epoch": 0.6149639283220851, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 10570 + }, + { + "epoch": 0.6155457295787758, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.19, + "step": 10580 + }, + { + "epoch": 0.6161275308354666, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1928, + "step": 10590 + }, + { + "epoch": 0.6167093320921573, + "grad_norm": 2.53125, + "learning_rate": 2e-05, + "loss": 0.1963, + "step": 10600 + }, + { + "epoch": 0.617291133348848, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 10610 + }, + { + "epoch": 0.6178729346055387, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 10620 + }, + { + "epoch": 0.6184547358622294, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1948, + "step": 10630 + }, + { + "epoch": 0.6190365371189201, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 10640 + }, + { + "epoch": 0.6196183383756109, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.1983, + "step": 10650 + }, + { + "epoch": 0.6202001396323016, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.2026, + "step": 10660 + }, + { + "epoch": 0.6207819408889923, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1898, + "step": 10670 + }, + { + "epoch": 0.621363742145683, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 10680 + }, + { + "epoch": 0.6219455434023737, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 10690 + }, + { + "epoch": 0.6225273446590645, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1968, + "step": 10700 + }, + { + "epoch": 0.6231091459157552, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 10710 + }, + { + "epoch": 0.6236909471724459, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1954, + "step": 10720 + }, + { + "epoch": 0.6242727484291366, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.2009, + "step": 10730 + }, + { + "epoch": 0.6248545496858273, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 10740 + }, + { + "epoch": 0.625436350942518, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.1912, + "step": 10750 + }, + { + "epoch": 0.6260181521992088, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 10760 + }, + { + "epoch": 0.6265999534558995, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2025, + "step": 10770 + }, + { + "epoch": 0.6271817547125902, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1928, + "step": 10780 + }, + { + "epoch": 0.6277635559692809, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1945, + "step": 10790 + }, + { + "epoch": 0.6283453572259716, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1964, + "step": 10800 + }, + { + "epoch": 0.6289271584826623, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 10810 + }, + { + "epoch": 0.6295089597393531, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 10820 + }, + { + "epoch": 0.6300907609960438, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 10830 + }, + { + "epoch": 0.6306725622527345, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1911, + "step": 10840 + }, + { + "epoch": 0.6312543635094252, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1847, + "step": 10850 + }, + { + "epoch": 0.6318361647661159, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 10860 + }, + { + "epoch": 0.6324179660228066, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.1971, + "step": 10870 + }, + { + "epoch": 0.6329997672794974, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.1932, + "step": 10880 + }, + { + "epoch": 0.6335815685361881, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 10890 + }, + { + "epoch": 0.6341633697928788, + "grad_norm": 2.09375, + "learning_rate": 2e-05, + "loss": 0.1946, + "step": 10900 + }, + { + "epoch": 0.6347451710495695, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 10910 + }, + { + "epoch": 0.6353269723062602, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 10920 + }, + { + "epoch": 0.635908773562951, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.1945, + "step": 10930 + }, + { + "epoch": 0.6364905748196417, + "grad_norm": 1.9921875, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 10940 + }, + { + "epoch": 0.6370723760763323, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1939, + "step": 10950 + }, + { + "epoch": 0.637654177333023, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 10960 + }, + { + "epoch": 0.6382359785897137, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1961, + "step": 10970 + }, + { + "epoch": 0.6388177798464044, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 10980 + }, + { + "epoch": 0.6393995811030951, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 10990 + }, + { + "epoch": 0.6399813823597859, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1931, + "step": 11000 + }, + { + "epoch": 0.6405631836164766, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 11010 + }, + { + "epoch": 0.6411449848731673, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1927, + "step": 11020 + }, + { + "epoch": 0.641726786129858, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.2, + "step": 11030 + }, + { + "epoch": 0.6423085873865487, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1961, + "step": 11040 + }, + { + "epoch": 0.6428903886432394, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 11050 + }, + { + "epoch": 0.6434721898999302, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 0.2021, + "step": 11060 + }, + { + "epoch": 0.6440539911566209, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 11070 + }, + { + "epoch": 0.6446357924133116, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 11080 + }, + { + "epoch": 0.6452175936700023, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 11090 + }, + { + "epoch": 0.645799394926693, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.185, + "step": 11100 + }, + { + "epoch": 0.6463811961833837, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 11110 + }, + { + "epoch": 0.6469629974400745, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 11120 + }, + { + "epoch": 0.6475447986967652, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 11130 + }, + { + "epoch": 0.6481265999534559, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.2016, + "step": 11140 + }, + { + "epoch": 0.6487084012101466, + "grad_norm": 1.9765625, + "learning_rate": 2e-05, + "loss": 0.1955, + "step": 11150 + }, + { + "epoch": 0.6492902024668373, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.2007, + "step": 11160 + }, + { + "epoch": 0.649872003723528, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 11170 + }, + { + "epoch": 0.6504538049802188, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.199, + "step": 11180 + }, + { + "epoch": 0.6510356062369095, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 11190 + }, + { + "epoch": 0.6516174074936002, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1989, + "step": 11200 + }, + { + "epoch": 0.6521992087502909, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 11210 + }, + { + "epoch": 0.6527810100069816, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 11220 + }, + { + "epoch": 0.6533628112636723, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 11230 + }, + { + "epoch": 0.6539446125203631, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.1963, + "step": 11240 + }, + { + "epoch": 0.6545264137770538, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1899, + "step": 11250 + }, + { + "epoch": 0.6551082150337445, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.2011, + "step": 11260 + }, + { + "epoch": 0.6556900162904352, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 11270 + }, + { + "epoch": 0.6562718175471259, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1867, + "step": 11280 + }, + { + "epoch": 0.6568536188038167, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 11290 + }, + { + "epoch": 0.6574354200605074, + "grad_norm": 1.90625, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 11300 + }, + { + "epoch": 0.6580172213171981, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.2005, + "step": 11310 + }, + { + "epoch": 0.6585990225738888, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 11320 + }, + { + "epoch": 0.6591808238305795, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 11330 + }, + { + "epoch": 0.6597626250872702, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1994, + "step": 11340 + }, + { + "epoch": 0.660344426343961, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.1977, + "step": 11350 + }, + { + "epoch": 0.6609262276006517, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1952, + "step": 11360 + }, + { + "epoch": 0.6615080288573423, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 11370 + }, + { + "epoch": 0.662089830114033, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 11380 + }, + { + "epoch": 0.6626716313707237, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 11390 + }, + { + "epoch": 0.6632534326274144, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 11400 + }, + { + "epoch": 0.6638352338841051, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.1884, + "step": 11410 + }, + { + "epoch": 0.6644170351407959, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1903, + "step": 11420 + }, + { + "epoch": 0.6649988363974866, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 11430 + }, + { + "epoch": 0.6655806376541773, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1956, + "step": 11440 + }, + { + "epoch": 0.666162438910868, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1924, + "step": 11450 + }, + { + "epoch": 0.6667442401675587, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 11460 + }, + { + "epoch": 0.6673260414242494, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.1948, + "step": 11470 + }, + { + "epoch": 0.6679078426809402, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1908, + "step": 11480 + }, + { + "epoch": 0.6684896439376309, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 11490 + }, + { + "epoch": 0.6690714451943216, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.1897, + "step": 11500 + }, + { + "epoch": 0.6696532464510123, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 0.1964, + "step": 11510 + }, + { + "epoch": 0.670235047707703, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 11520 + }, + { + "epoch": 0.6708168489643938, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 11530 + }, + { + "epoch": 0.6713986502210845, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 11540 + }, + { + "epoch": 0.6719804514777752, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 11550 + }, + { + "epoch": 0.6725622527344659, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1948, + "step": 11560 + }, + { + "epoch": 0.6731440539911566, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1885, + "step": 11570 + }, + { + "epoch": 0.6737258552478473, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 11580 + }, + { + "epoch": 0.674307656504538, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 11590 + }, + { + "epoch": 0.6748894577612288, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.189, + "step": 11600 + }, + { + "epoch": 0.6754712590179195, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 11610 + }, + { + "epoch": 0.6760530602746102, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.2006, + "step": 11620 + }, + { + "epoch": 0.6766348615313009, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 11630 + }, + { + "epoch": 0.6772166627879916, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 11640 + }, + { + "epoch": 0.6777984640446824, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1982, + "step": 11650 + }, + { + "epoch": 0.6783802653013731, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1961, + "step": 11660 + }, + { + "epoch": 0.6789620665580638, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1877, + "step": 11670 + }, + { + "epoch": 0.6795438678147545, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 11680 + }, + { + "epoch": 0.6801256690714452, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1915, + "step": 11690 + }, + { + "epoch": 0.6807074703281359, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.19, + "step": 11700 + }, + { + "epoch": 0.6812892715848267, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 11710 + }, + { + "epoch": 0.6818710728415174, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1969, + "step": 11720 + }, + { + "epoch": 0.6824528740982081, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 11730 + }, + { + "epoch": 0.6830346753548988, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1919, + "step": 11740 + }, + { + "epoch": 0.6836164766115895, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1939, + "step": 11750 + }, + { + "epoch": 0.6841982778682802, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 11760 + }, + { + "epoch": 0.684780079124971, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 11770 + }, + { + "epoch": 0.6853618803816616, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1975, + "step": 11780 + }, + { + "epoch": 0.6859436816383523, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1965, + "step": 11790 + }, + { + "epoch": 0.686525482895043, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.1993, + "step": 11800 + }, + { + "epoch": 0.6871072841517337, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1959, + "step": 11810 + }, + { + "epoch": 0.6876890854084244, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1962, + "step": 11820 + }, + { + "epoch": 0.6882708866651152, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 11830 + }, + { + "epoch": 0.6888526879218059, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 0.1908, + "step": 11840 + }, + { + "epoch": 0.6894344891784966, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1892, + "step": 11850 + }, + { + "epoch": 0.6900162904351873, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.1985, + "step": 11860 + }, + { + "epoch": 0.690598091691878, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.1862, + "step": 11870 + }, + { + "epoch": 0.6911798929485687, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1974, + "step": 11880 + }, + { + "epoch": 0.6917616942052595, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 11890 + }, + { + "epoch": 0.6923434954619502, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.1933, + "step": 11900 + }, + { + "epoch": 0.6929252967186409, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 11910 + }, + { + "epoch": 0.6935070979753316, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 11920 + }, + { + "epoch": 0.6940888992320223, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1994, + "step": 11930 + }, + { + "epoch": 0.694670700488713, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 11940 + }, + { + "epoch": 0.6952525017454038, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1994, + "step": 11950 + }, + { + "epoch": 0.6958343030020945, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 11960 + }, + { + "epoch": 0.6964161042587852, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1899, + "step": 11970 + }, + { + "epoch": 0.6969979055154759, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 0.1932, + "step": 11980 + }, + { + "epoch": 0.6975797067721666, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.2018, + "step": 11990 + }, + { + "epoch": 0.6981615080288573, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 12000 + }, + { + "epoch": 0.6987433092855481, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1952, + "step": 12010 + }, + { + "epoch": 0.6993251105422388, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1867, + "step": 12020 + }, + { + "epoch": 0.6999069117989295, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 12030 + }, + { + "epoch": 0.7004887130556202, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 12040 + }, + { + "epoch": 0.7010705143123109, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1943, + "step": 12050 + }, + { + "epoch": 0.7016523155690016, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.1913, + "step": 12060 + }, + { + "epoch": 0.7022341168256924, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1924, + "step": 12070 + }, + { + "epoch": 0.7028159180823831, + "grad_norm": 1.8046875, + "learning_rate": 2e-05, + "loss": 0.1917, + "step": 12080 + }, + { + "epoch": 0.7033977193390738, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 12090 + }, + { + "epoch": 0.7039795205957645, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.192, + "step": 12100 + }, + { + "epoch": 0.7045613218524552, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1914, + "step": 12110 + }, + { + "epoch": 0.705143123109146, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.189, + "step": 12120 + }, + { + "epoch": 0.7057249243658367, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1956, + "step": 12130 + }, + { + "epoch": 0.7063067256225274, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 12140 + }, + { + "epoch": 0.7068885268792181, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 12150 + }, + { + "epoch": 0.7074703281359088, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1964, + "step": 12160 + }, + { + "epoch": 0.7080521293925995, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1907, + "step": 12170 + }, + { + "epoch": 0.7086339306492903, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.189, + "step": 12180 + }, + { + "epoch": 0.7092157319059809, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.1992, + "step": 12190 + }, + { + "epoch": 0.7097975331626716, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 12200 + }, + { + "epoch": 0.7103793344193623, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.1986, + "step": 12210 + }, + { + "epoch": 0.710961135676053, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1948, + "step": 12220 + }, + { + "epoch": 0.7115429369327437, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1925, + "step": 12230 + }, + { + "epoch": 0.7121247381894344, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1895, + "step": 12240 + }, + { + "epoch": 0.7127065394461252, + "grad_norm": 3.5625, + "learning_rate": 2e-05, + "loss": 0.1859, + "step": 12250 + }, + { + "epoch": 0.7132883407028159, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.187, + "step": 12260 + }, + { + "epoch": 0.7138701419595066, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1889, + "step": 12270 + }, + { + "epoch": 0.7144519432161973, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.1868, + "step": 12280 + }, + { + "epoch": 0.715033744472888, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.2027, + "step": 12290 + }, + { + "epoch": 0.7156155457295788, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 12300 + }, + { + "epoch": 0.7161973469862695, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.189, + "step": 12310 + }, + { + "epoch": 0.7167791482429602, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1937, + "step": 12320 + }, + { + "epoch": 0.7173609494996509, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.1842, + "step": 12330 + }, + { + "epoch": 0.7179427507563416, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.2, + "step": 12340 + }, + { + "epoch": 0.7185245520130323, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1907, + "step": 12350 + }, + { + "epoch": 0.719106353269723, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 12360 + }, + { + "epoch": 0.7196881545264138, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1911, + "step": 12370 + }, + { + "epoch": 0.7202699557831045, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1887, + "step": 12380 + }, + { + "epoch": 0.7208517570397952, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1894, + "step": 12390 + }, + { + "epoch": 0.7214335582964859, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 12400 + }, + { + "epoch": 0.7220153595531766, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1942, + "step": 12410 + }, + { + "epoch": 0.7225971608098674, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.1935, + "step": 12420 + }, + { + "epoch": 0.7231789620665581, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 12430 + }, + { + "epoch": 0.7237607633232488, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.1995, + "step": 12440 + }, + { + "epoch": 0.7243425645799395, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 12450 + }, + { + "epoch": 0.7249243658366302, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.191, + "step": 12460 + }, + { + "epoch": 0.7255061670933209, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1912, + "step": 12470 + }, + { + "epoch": 0.7260879683500117, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1865, + "step": 12480 + }, + { + "epoch": 0.7266697696067024, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.192, + "step": 12490 + }, + { + "epoch": 0.7272515708633931, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 12500 + }, + { + "epoch": 0.7278333721200838, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 12510 + }, + { + "epoch": 0.7284151733767745, + "grad_norm": 1.875, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 12520 + }, + { + "epoch": 0.7289969746334652, + "grad_norm": 1.9296875, + "learning_rate": 2e-05, + "loss": 0.1864, + "step": 12530 + }, + { + "epoch": 0.729578775890156, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 12540 + }, + { + "epoch": 0.7301605771468467, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 12550 + }, + { + "epoch": 0.7307423784035374, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1973, + "step": 12560 + }, + { + "epoch": 0.7313241796602281, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.186, + "step": 12570 + }, + { + "epoch": 0.7319059809169188, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1958, + "step": 12580 + }, + { + "epoch": 0.7324877821736095, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.191, + "step": 12590 + }, + { + "epoch": 0.7330695834303003, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 0.1886, + "step": 12600 + }, + { + "epoch": 0.7336513846869909, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1925, + "step": 12610 + }, + { + "epoch": 0.7342331859436816, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 12620 + }, + { + "epoch": 0.7348149872003723, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1931, + "step": 12630 + }, + { + "epoch": 0.735396788457063, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1939, + "step": 12640 + }, + { + "epoch": 0.7359785897137537, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1827, + "step": 12650 + }, + { + "epoch": 0.7365603909704445, + "grad_norm": 2.3125, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 12660 + }, + { + "epoch": 0.7371421922271352, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1901, + "step": 12670 + }, + { + "epoch": 0.7377239934838259, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1859, + "step": 12680 + }, + { + "epoch": 0.7383057947405166, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 12690 + }, + { + "epoch": 0.7388875959972073, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1915, + "step": 12700 + }, + { + "epoch": 0.739469397253898, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1862, + "step": 12710 + }, + { + "epoch": 0.7400511985105888, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.1976, + "step": 12720 + }, + { + "epoch": 0.7406329997672795, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 12730 + }, + { + "epoch": 0.7412148010239702, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 0.1931, + "step": 12740 + }, + { + "epoch": 0.7417966022806609, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1889, + "step": 12750 + }, + { + "epoch": 0.7423784035373516, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1878, + "step": 12760 + }, + { + "epoch": 0.7429602047940423, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1846, + "step": 12770 + }, + { + "epoch": 0.7435420060507331, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1889, + "step": 12780 + }, + { + "epoch": 0.7441238073074238, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1863, + "step": 12790 + }, + { + "epoch": 0.7447056085641145, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.198, + "step": 12800 + }, + { + "epoch": 0.7452874098208052, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1848, + "step": 12810 + }, + { + "epoch": 0.7458692110774959, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1983, + "step": 12820 + }, + { + "epoch": 0.7464510123341866, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1902, + "step": 12830 + }, + { + "epoch": 0.7470328135908774, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 12840 + }, + { + "epoch": 0.7476146148475681, + "grad_norm": 2.796875, + "learning_rate": 2e-05, + "loss": 0.1877, + "step": 12850 + }, + { + "epoch": 0.7481964161042588, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1916, + "step": 12860 + }, + { + "epoch": 0.7487782173609495, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 12870 + }, + { + "epoch": 0.7493600186176402, + "grad_norm": 1.8203125, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 12880 + }, + { + "epoch": 0.749941819874331, + "grad_norm": 2.484375, + "learning_rate": 2e-05, + "loss": 0.1953, + "step": 12890 + }, + { + "epoch": 0.7505236211310217, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 0.1946, + "step": 12900 + }, + { + "epoch": 0.7511054223877124, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 12910 + }, + { + "epoch": 0.7516872236444031, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 12920 + }, + { + "epoch": 0.7522690249010938, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 12930 + }, + { + "epoch": 0.7528508261577845, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.1899, + "step": 12940 + }, + { + "epoch": 0.7534326274144753, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1895, + "step": 12950 + }, + { + "epoch": 0.754014428671166, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1822, + "step": 12960 + }, + { + "epoch": 0.7545962299278567, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 12970 + }, + { + "epoch": 0.7551780311845474, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 0.1845, + "step": 12980 + }, + { + "epoch": 0.7557598324412381, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 12990 + }, + { + "epoch": 0.7563416336979288, + "grad_norm": 1.953125, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 13000 + }, + { + "epoch": 0.7569234349546196, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1915, + "step": 13010 + }, + { + "epoch": 0.7575052362113102, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1893, + "step": 13020 + }, + { + "epoch": 0.7580870374680009, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.2014, + "step": 13030 + }, + { + "epoch": 0.7586688387246916, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.1985, + "step": 13040 + }, + { + "epoch": 0.7592506399813823, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1919, + "step": 13050 + }, + { + "epoch": 0.759832441238073, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1892, + "step": 13060 + }, + { + "epoch": 0.7604142424947637, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1849, + "step": 13070 + }, + { + "epoch": 0.7609960437514545, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 13080 + }, + { + "epoch": 0.7615778450081452, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 13090 + }, + { + "epoch": 0.7621596462648359, + "grad_norm": 1.7890625, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 13100 + }, + { + "epoch": 0.7627414475215266, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1914, + "step": 13110 + }, + { + "epoch": 0.7633232487782173, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.188, + "step": 13120 + }, + { + "epoch": 0.763905050034908, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 13130 + }, + { + "epoch": 0.7644868512915988, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 13140 + }, + { + "epoch": 0.7650686525482895, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1894, + "step": 13150 + }, + { + "epoch": 0.7656504538049802, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 13160 + }, + { + "epoch": 0.7662322550616709, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1898, + "step": 13170 + }, + { + "epoch": 0.7668140563183616, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 13180 + }, + { + "epoch": 0.7673958575750524, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 13190 + }, + { + "epoch": 0.7679776588317431, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1943, + "step": 13200 + }, + { + "epoch": 0.7685594600884338, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 0.1898, + "step": 13210 + }, + { + "epoch": 0.7691412613451245, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 13220 + }, + { + "epoch": 0.7697230626018152, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 13230 + }, + { + "epoch": 0.7703048638585059, + "grad_norm": 2.171875, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 13240 + }, + { + "epoch": 0.7708866651151967, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 13250 + }, + { + "epoch": 0.7714684663718874, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 13260 + }, + { + "epoch": 0.7720502676285781, + "grad_norm": 4.96875, + "learning_rate": 2e-05, + "loss": 0.1843, + "step": 13270 + }, + { + "epoch": 0.7726320688852688, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1997, + "step": 13280 + }, + { + "epoch": 0.7732138701419595, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 13290 + }, + { + "epoch": 0.7737956713986502, + "grad_norm": 2.734375, + "learning_rate": 2e-05, + "loss": 0.1971, + "step": 13300 + }, + { + "epoch": 0.774377472655341, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 13310 + }, + { + "epoch": 0.7749592739120317, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.19, + "step": 13320 + }, + { + "epoch": 0.7755410751687224, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 13330 + }, + { + "epoch": 0.7761228764254131, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1902, + "step": 13340 + }, + { + "epoch": 0.7767046776821038, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 13350 + }, + { + "epoch": 0.7772864789387945, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 13360 + }, + { + "epoch": 0.7778682801954853, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1928, + "step": 13370 + }, + { + "epoch": 0.778450081452176, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 13380 + }, + { + "epoch": 0.7790318827088667, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1981, + "step": 13390 + }, + { + "epoch": 0.7796136839655574, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1882, + "step": 13400 + }, + { + "epoch": 0.7801954852222481, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 13410 + }, + { + "epoch": 0.7807772864789388, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1967, + "step": 13420 + }, + { + "epoch": 0.7813590877356295, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1957, + "step": 13430 + }, + { + "epoch": 0.7819408889923202, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1907, + "step": 13440 + }, + { + "epoch": 0.7825226902490109, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1937, + "step": 13450 + }, + { + "epoch": 0.7831044915057016, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1971, + "step": 13460 + }, + { + "epoch": 0.7836862927623923, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1944, + "step": 13470 + }, + { + "epoch": 0.784268094019083, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1879, + "step": 13480 + }, + { + "epoch": 0.7848498952757738, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1884, + "step": 13490 + }, + { + "epoch": 0.7854316965324645, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1849, + "step": 13500 + }, + { + "epoch": 0.7860134977891552, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 13510 + }, + { + "epoch": 0.7865952990458459, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1954, + "step": 13520 + }, + { + "epoch": 0.7871771003025366, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1938, + "step": 13530 + }, + { + "epoch": 0.7877589015592273, + "grad_norm": 2.125, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 13540 + }, + { + "epoch": 0.7883407028159181, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1841, + "step": 13550 + }, + { + "epoch": 0.7889225040726088, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1892, + "step": 13560 + }, + { + "epoch": 0.7895043053292995, + "grad_norm": 2.03125, + "learning_rate": 2e-05, + "loss": 0.1865, + "step": 13570 + }, + { + "epoch": 0.7900861065859902, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 13580 + }, + { + "epoch": 0.7906679078426809, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1855, + "step": 13590 + }, + { + "epoch": 0.7912497090993716, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1925, + "step": 13600 + }, + { + "epoch": 0.7918315103560624, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 13610 + }, + { + "epoch": 0.7924133116127531, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 13620 + }, + { + "epoch": 0.7929951128694438, + "grad_norm": 1.3984375, + "learning_rate": 2e-05, + "loss": 0.1866, + "step": 13630 + }, + { + "epoch": 0.7935769141261345, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1917, + "step": 13640 + }, + { + "epoch": 0.7941587153828252, + "grad_norm": 1.8125, + "learning_rate": 2e-05, + "loss": 0.1843, + "step": 13650 + }, + { + "epoch": 0.794740516639516, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.1933, + "step": 13660 + }, + { + "epoch": 0.7953223178962067, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1946, + "step": 13670 + }, + { + "epoch": 0.7959041191528974, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.197, + "step": 13680 + }, + { + "epoch": 0.7964859204095881, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1929, + "step": 13690 + }, + { + "epoch": 0.7970677216662788, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1894, + "step": 13700 + }, + { + "epoch": 0.7976495229229695, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 13710 + }, + { + "epoch": 0.7982313241796603, + "grad_norm": 1.828125, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 13720 + }, + { + "epoch": 0.798813125436351, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1817, + "step": 13730 + }, + { + "epoch": 0.7993949266930417, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 13740 + }, + { + "epoch": 0.7999767279497324, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 0.1872, + "step": 13750 + }, + { + "epoch": 0.8005585292064231, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 0.1987, + "step": 13760 + }, + { + "epoch": 0.8011403304631138, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.1835, + "step": 13770 + }, + { + "epoch": 0.8017221317198046, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1898, + "step": 13780 + }, + { + "epoch": 0.8023039329764953, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 13790 + }, + { + "epoch": 0.802885734233186, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1867, + "step": 13800 + }, + { + "epoch": 0.8034675354898767, + "grad_norm": 2.71875, + "learning_rate": 2e-05, + "loss": 0.1959, + "step": 13810 + }, + { + "epoch": 0.8040493367465674, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1922, + "step": 13820 + }, + { + "epoch": 0.8046311380032581, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 0.1799, + "step": 13830 + }, + { + "epoch": 0.8052129392599489, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 13840 + }, + { + "epoch": 0.8057947405166395, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 13850 + }, + { + "epoch": 0.8063765417733302, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1913, + "step": 13860 + }, + { + "epoch": 0.8069583430300209, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1911, + "step": 13870 + }, + { + "epoch": 0.8075401442867116, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1894, + "step": 13880 + }, + { + "epoch": 0.8081219455434023, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1824, + "step": 13890 + }, + { + "epoch": 0.808703746800093, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1895, + "step": 13900 + }, + { + "epoch": 0.8092855480567838, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 13910 + }, + { + "epoch": 0.8098673493134745, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1888, + "step": 13920 + }, + { + "epoch": 0.8104491505701652, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 13930 + }, + { + "epoch": 0.8110309518268559, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 13940 + }, + { + "epoch": 0.8116127530835466, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1901, + "step": 13950 + }, + { + "epoch": 0.8121945543402374, + "grad_norm": 1.96875, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 13960 + }, + { + "epoch": 0.8127763555969281, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1936, + "step": 13970 + }, + { + "epoch": 0.8133581568536188, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 0.1947, + "step": 13980 + }, + { + "epoch": 0.8139399581103095, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 13990 + }, + { + "epoch": 0.8145217593670002, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 14000 + }, + { + "epoch": 0.8151035606236909, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1898, + "step": 14010 + }, + { + "epoch": 0.8156853618803817, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 14020 + }, + { + "epoch": 0.8162671631370724, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1872, + "step": 14030 + }, + { + "epoch": 0.8168489643937631, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 0.1846, + "step": 14040 + }, + { + "epoch": 0.8174307656504538, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 0.1928, + "step": 14050 + }, + { + "epoch": 0.8180125669071445, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 14060 + }, + { + "epoch": 0.8185943681638352, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1855, + "step": 14070 + }, + { + "epoch": 0.819176169420526, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1879, + "step": 14080 + }, + { + "epoch": 0.8197579706772167, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1915, + "step": 14090 + }, + { + "epoch": 0.8203397719339074, + "grad_norm": 1.8671875, + "learning_rate": 2e-05, + "loss": 0.1897, + "step": 14100 + }, + { + "epoch": 0.8209215731905981, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1784, + "step": 14110 + }, + { + "epoch": 0.8215033744472888, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1934, + "step": 14120 + }, + { + "epoch": 0.8220851757039795, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 14130 + }, + { + "epoch": 0.8226669769606703, + "grad_norm": 1.7734375, + "learning_rate": 2e-05, + "loss": 0.1863, + "step": 14140 + }, + { + "epoch": 0.823248778217361, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1911, + "step": 14150 + }, + { + "epoch": 0.8238305794740517, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 0.1875, + "step": 14160 + }, + { + "epoch": 0.8244123807307424, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1837, + "step": 14170 + }, + { + "epoch": 0.8249941819874331, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1912, + "step": 14180 + }, + { + "epoch": 0.8255759832441238, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 0.187, + "step": 14190 + }, + { + "epoch": 0.8261577845008146, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1863, + "step": 14200 + }, + { + "epoch": 0.8267395857575053, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.2013, + "step": 14210 + }, + { + "epoch": 0.827321387014196, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 14220 + }, + { + "epoch": 0.8279031882708867, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 14230 + }, + { + "epoch": 0.8284849895275774, + "grad_norm": 1.765625, + "learning_rate": 2e-05, + "loss": 0.1895, + "step": 14240 + }, + { + "epoch": 0.8290667907842681, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 14250 + }, + { + "epoch": 0.8296485920409588, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 0.1919, + "step": 14260 + }, + { + "epoch": 0.8302303932976495, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 14270 + }, + { + "epoch": 0.8308121945543402, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1921, + "step": 14280 + }, + { + "epoch": 0.8313939958110309, + "grad_norm": 1.6875, + "learning_rate": 2e-05, + "loss": 0.1859, + "step": 14290 + }, + { + "epoch": 0.8319757970677216, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1941, + "step": 14300 + }, + { + "epoch": 0.8325575983244123, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1849, + "step": 14310 + }, + { + "epoch": 0.8331393995811031, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1966, + "step": 14320 + }, + { + "epoch": 0.8337212008377938, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.1867, + "step": 14330 + }, + { + "epoch": 0.8343030020944845, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 14340 + }, + { + "epoch": 0.8348848033511752, + "grad_norm": 3.15625, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 14350 + }, + { + "epoch": 0.8354666046078659, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1873, + "step": 14360 + }, + { + "epoch": 0.8360484058645566, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 0.1908, + "step": 14370 + }, + { + "epoch": 0.8366302071212474, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.194, + "step": 14380 + }, + { + "epoch": 0.8372120083779381, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 0.1854, + "step": 14390 + }, + { + "epoch": 0.8377938096346288, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1855, + "step": 14400 + }, + { + "epoch": 0.8383756108913195, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 14410 + }, + { + "epoch": 0.8389574121480102, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.1874, + "step": 14420 + }, + { + "epoch": 0.839539213404701, + "grad_norm": 2.390625, + "learning_rate": 2e-05, + "loss": 0.1883, + "step": 14430 + }, + { + "epoch": 0.8401210146613917, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1828, + "step": 14440 + }, + { + "epoch": 0.8407028159180824, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 0.1864, + "step": 14450 + }, + { + "epoch": 0.8412846171747731, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1918, + "step": 14460 + }, + { + "epoch": 0.8418664184314638, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 14470 + }, + { + "epoch": 0.8424482196881545, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 0.1836, + "step": 14480 + }, + { + "epoch": 0.8430300209448452, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1869, + "step": 14490 + }, + { + "epoch": 0.843611822201536, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 14500 + }, + { + "epoch": 0.8441936234582267, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1908, + "step": 14510 + }, + { + "epoch": 0.8447754247149174, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1932, + "step": 14520 + }, + { + "epoch": 0.8453572259716081, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1902, + "step": 14530 + }, + { + "epoch": 0.8459390272282988, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 0.1856, + "step": 14540 + }, + { + "epoch": 0.8465208284849896, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1905, + "step": 14550 + }, + { + "epoch": 0.8471026297416803, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1897, + "step": 14560 + }, + { + "epoch": 0.847684430998371, + "grad_norm": 1.9609375, + "learning_rate": 2e-05, + "loss": 0.1955, + "step": 14570 + }, + { + "epoch": 0.8482662322550617, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1886, + "step": 14580 + }, + { + "epoch": 0.8488480335117524, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.186, + "step": 14590 + }, + { + "epoch": 0.8494298347684431, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 0.1864, + "step": 14600 + }, + { + "epoch": 0.8500116360251339, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1841, + "step": 14610 + }, + { + "epoch": 0.8505934372818246, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1906, + "step": 14620 + }, + { + "epoch": 0.8511752385385153, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.19, + "step": 14630 + }, + { + "epoch": 0.851757039795206, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1846, + "step": 14640 + }, + { + "epoch": 0.8523388410518967, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1845, + "step": 14650 + }, + { + "epoch": 0.8529206423085874, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1926, + "step": 14660 + }, + { + "epoch": 0.853502443565278, + "grad_norm": 1.6796875, + "learning_rate": 2e-05, + "loss": 0.1855, + "step": 14670 + }, + { + "epoch": 0.8540842448219688, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 0.187, + "step": 14680 + }, + { + "epoch": 0.8546660460786595, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 0.1936, + "step": 14690 + }, + { + "epoch": 0.8552478473353502, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1933, + "step": 14700 + }, + { + "epoch": 0.8558296485920409, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 0.1855, + "step": 14710 + }, + { + "epoch": 0.8564114498487316, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 0.1878, + "step": 14720 + }, + { + "epoch": 0.8569932511054223, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1958, + "step": 14730 + }, + { + "epoch": 0.8575750523621131, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1857, + "step": 14740 + }, + { + "epoch": 0.8581568536188038, + "grad_norm": 1.9453125, + "learning_rate": 2e-05, + "loss": 0.1895, + "step": 14750 + }, + { + "epoch": 0.8587386548754945, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 14760 + }, + { + "epoch": 0.8593204561321852, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1879, + "step": 14770 + }, + { + "epoch": 0.8599022573888759, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 0.188, + "step": 14780 + }, + { + "epoch": 0.8604840586455667, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1912, + "step": 14790 + }, + { + "epoch": 0.8610658599022574, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1904, + "step": 14800 + }, + { + "epoch": 0.8616476611589481, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 14810 + }, + { + "epoch": 0.8622294624156388, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1844, + "step": 14820 + }, + { + "epoch": 0.8628112636723295, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 0.1924, + "step": 14830 + }, + { + "epoch": 0.8633930649290202, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 0.1774, + "step": 14840 + }, + { + "epoch": 0.863974866185711, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1862, + "step": 14850 + }, + { + "epoch": 0.8645566674424017, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1914, + "step": 14860 + }, + { + "epoch": 0.8651384686990924, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1817, + "step": 14870 + }, + { + "epoch": 0.8657202699557831, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 14880 + }, + { + "epoch": 0.8663020712124738, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1912, + "step": 14890 + }, + { + "epoch": 0.8668838724691645, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 0.1924, + "step": 14900 + }, + { + "epoch": 0.8674656737258553, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 0.1894, + "step": 14910 + }, + { + "epoch": 0.868047474982546, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1847, + "step": 14920 + }, + { + "epoch": 0.8686292762392367, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1827, + "step": 14930 + }, + { + "epoch": 0.8692110774959274, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1867, + "step": 14940 + }, + { + "epoch": 0.8697928787526181, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1858, + "step": 14950 + }, + { + "epoch": 0.8703746800093088, + "grad_norm": 1.6953125, + "learning_rate": 2e-05, + "loss": 0.1802, + "step": 14960 + }, + { + "epoch": 0.8709564812659996, + "grad_norm": 1.8828125, + "learning_rate": 2e-05, + "loss": 0.181, + "step": 14970 + }, + { + "epoch": 0.8715382825226903, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1909, + "step": 14980 + }, + { + "epoch": 0.872120083779381, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1865, + "step": 14990 + }, + { + "epoch": 0.8727018850360717, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1878, + "step": 15000 + }, + { + "epoch": 0.8732836862927624, + "grad_norm": 1.359375, + "learning_rate": 2e-05, + "loss": 0.1841, + "step": 15010 + }, + { + "epoch": 0.8738654875494531, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1893, + "step": 15020 + }, + { + "epoch": 0.8744472888061439, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 0.1857, + "step": 15030 + }, + { + "epoch": 0.8750290900628346, + "grad_norm": 1.671875, + "learning_rate": 2e-05, + "loss": 0.189, + "step": 15040 + }, + { + "epoch": 0.8756108913195253, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 0.1863, + "step": 15050 + }, + { + "epoch": 0.876192692576216, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1923, + "step": 15060 + }, + { + "epoch": 0.8767744938329067, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1885, + "step": 15070 + }, + { + "epoch": 0.8773562950895974, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1851, + "step": 15080 + }, + { + "epoch": 0.877938096346288, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 15090 + }, + { + "epoch": 0.8785198976029788, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 0.1883, + "step": 15100 + }, + { + "epoch": 0.8791016988596695, + "grad_norm": 1.6640625, + "learning_rate": 2e-05, + "loss": 0.185, + "step": 15110 + }, + { + "epoch": 0.8796835001163602, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1854, + "step": 15120 + }, + { + "epoch": 0.8802653013730509, + "grad_norm": 1.796875, + "learning_rate": 2e-05, + "loss": 0.1848, + "step": 15130 + }, + { + "epoch": 0.8808471026297416, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 0.1893, + "step": 15140 + }, + { + "epoch": 0.8814289038864324, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 15150 + }, + { + "epoch": 0.8820107051431231, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 0.1857, + "step": 15160 + }, + { + "epoch": 0.8825925063998138, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 0.191, + "step": 15170 + }, + { + "epoch": 0.8831743076565045, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1885, + "step": 15180 + }, + { + "epoch": 0.8837561089131952, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1863, + "step": 15190 + }, + { + "epoch": 0.8843379101698859, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 0.196, + "step": 15200 + }, + { + "epoch": 0.8849197114265767, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 0.1847, + "step": 15210 + }, + { + "epoch": 0.8855015126832674, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 0.1823, + "step": 15220 + }, + { + "epoch": 0.8860833139399581, + "grad_norm": 1.7265625, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 15230 + }, + { + "epoch": 0.8866651151966488, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 0.1875, + "step": 15240 + }, + { + "epoch": 0.8872469164533395, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.1899, + "step": 15250 + }, + { + "epoch": 0.8878287177100302, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 0.1801, + "step": 15260 + }, + { + "epoch": 0.888410518966721, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1883, + "step": 15270 + }, + { + "epoch": 0.8889923202234117, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 0.1833, + "step": 15280 + }, + { + "epoch": 0.8895741214801024, + "grad_norm": 1.703125, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 15290 + }, + { + "epoch": 0.8901559227367931, + "grad_norm": 1.859375, + "learning_rate": 2e-05, + "loss": 0.177, + "step": 15300 + }, + { + "epoch": 0.8907377239934838, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1885, + "step": 15310 + }, + { + "epoch": 0.8913195252501745, + "grad_norm": 1.6484375, + "learning_rate": 2e-05, + "loss": 0.1882, + "step": 15320 + }, + { + "epoch": 0.8919013265068653, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.1845, + "step": 15330 + }, + { + "epoch": 0.892483127763556, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1844, + "step": 15340 + }, + { + "epoch": 0.8930649290202467, + "grad_norm": 2.109375, + "learning_rate": 2e-05, + "loss": 0.1823, + "step": 15350 + }, + { + "epoch": 0.8936467302769374, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1835, + "step": 15360 + }, + { + "epoch": 0.8942285315336281, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 15370 + }, + { + "epoch": 0.8948103327903189, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 0.1857, + "step": 15380 + }, + { + "epoch": 0.8953921340470096, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1879, + "step": 15390 + }, + { + "epoch": 0.8959739353037003, + "grad_norm": 1.890625, + "learning_rate": 2e-05, + "loss": 0.1862, + "step": 15400 + }, + { + "epoch": 0.896555736560391, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.185, + "step": 15410 + }, + { + "epoch": 0.8971375378170817, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1875, + "step": 15420 + }, + { + "epoch": 0.8977193390737724, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 0.1819, + "step": 15430 + }, + { + "epoch": 0.8983011403304632, + "grad_norm": 1.6015625, + "learning_rate": 2e-05, + "loss": 0.1877, + "step": 15440 + }, + { + "epoch": 0.8988829415871539, + "grad_norm": 1.6171875, + "learning_rate": 2e-05, + "loss": 0.187, + "step": 15450 + }, + { + "epoch": 0.8994647428438446, + "grad_norm": 1.5859375, + "learning_rate": 2e-05, + "loss": 0.1893, + "step": 15460 + }, + { + "epoch": 0.9000465441005353, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 0.192, + "step": 15470 + }, + { + "epoch": 0.900628345357226, + "grad_norm": 1.5703125, + "learning_rate": 2e-05, + "loss": 0.1831, + "step": 15480 + }, + { + "epoch": 0.9012101466139167, + "grad_norm": 1.7421875, + "learning_rate": 2e-05, + "loss": 0.1891, + "step": 15490 + }, + { + "epoch": 0.9017919478706073, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 0.1916, + "step": 15500 + }, + { + "epoch": 0.9023737491272981, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 0.1886, + "step": 15510 + }, + { + "epoch": 0.9029555503839888, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 0.1851, + "step": 15520 + }, + { + "epoch": 0.9035373516406795, + "grad_norm": 2.078125, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 15530 + }, + { + "epoch": 0.9041191528973702, + "grad_norm": 1.7578125, + "learning_rate": 2e-05, + "loss": 0.1876, + "step": 15540 + }, + { + "epoch": 0.9047009541540609, + "grad_norm": 1.3828125, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 15550 + }, + { + "epoch": 0.9052827554107516, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.185, + "step": 15560 + }, + { + "epoch": 0.9058645566674424, + "grad_norm": 2.375, + "learning_rate": 2e-05, + "loss": 0.1844, + "step": 15570 + }, + { + "epoch": 0.9064463579241331, + "grad_norm": 1.515625, + "learning_rate": 2e-05, + "loss": 0.1853, + "step": 15580 + }, + { + "epoch": 0.9070281591808238, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 0.1806, + "step": 15590 + }, + { + "epoch": 0.9076099604375145, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 0.1859, + "step": 15600 + }, + { + "epoch": 0.9081917616942052, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 0.1796, + "step": 15610 + }, + { + "epoch": 0.908773562950896, + "grad_norm": 1.71875, + "learning_rate": 2e-05, + "loss": 0.1854, + "step": 15620 + } + ], + "logging_steps": 10, + "max_steps": 15625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4853137926124995e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}