{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.025, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 1.5970183610916138, "learning_rate": 1.8e-06, "loss": 1.0764, "step": 10 }, { "grad_norm": 0.9294151663780212, "learning_rate": 3.8e-06, "loss": 1.0659, "step": 20 }, { "grad_norm": 0.24522702395915985, "learning_rate": 5.8e-06, "loss": 1.0449, "step": 30 }, { "grad_norm": 0.2660885155200958, "learning_rate": 7.8e-06, "loss": 1.0464, "step": 40 }, { "grad_norm": 0.2017054259777069, "learning_rate": 9.800000000000001e-06, "loss": 1.0367, "step": 50 }, { "grad_norm": 0.2722417414188385, "learning_rate": 1.18e-05, "loss": 1.0338, "step": 60 }, { "grad_norm": 0.25059282779693604, "learning_rate": 1.3800000000000002e-05, "loss": 1.0364, "step": 70 }, { "grad_norm": 0.5216037034988403, "learning_rate": 1.58e-05, "loss": 1.0319, "step": 80 }, { "grad_norm": 1.4266352653503418, "learning_rate": 1.78e-05, "loss": 0.9991, "step": 90 }, { "grad_norm": 1.753671646118164, "learning_rate": 1.9800000000000004e-05, "loss": 0.9436, "step": 100 }, { "grad_norm": 1.8120691776275635, "learning_rate": 2.18e-05, "loss": 0.9, "step": 110 }, { "grad_norm": 2.7624964714050293, "learning_rate": 2.38e-05, "loss": 0.8231, "step": 120 }, { "grad_norm": 3.362637758255005, "learning_rate": 2.58e-05, "loss": 0.7328, "step": 130 }, { "grad_norm": 2.3826422691345215, "learning_rate": 2.7800000000000005e-05, "loss": 0.6657, "step": 140 }, { "grad_norm": 2.7328379154205322, "learning_rate": 2.98e-05, "loss": 0.5824, "step": 150 }, { "grad_norm": 2.083045244216919, "learning_rate": 3.18e-05, "loss": 0.4792, "step": 160 }, { "grad_norm": 2.2116315364837646, "learning_rate": 3.38e-05, "loss": 0.3948, "step": 170 }, { "grad_norm": 2.1600606441497803, "learning_rate": 3.58e-05, "loss": 0.3372, "step": 180 }, { "grad_norm": 2.114063024520874, "learning_rate": 3.7800000000000004e-05, "loss": 0.2888, "step": 190 }, { "grad_norm": 2.7655060291290283, "learning_rate": 3.9800000000000005e-05, "loss": 0.2676, "step": 200 }, { "grad_norm": 2.0653321743011475, "learning_rate": 4.18e-05, "loss": 0.2501, "step": 210 }, { "grad_norm": 1.653233528137207, "learning_rate": 4.38e-05, "loss": 0.2453, "step": 220 }, { "grad_norm": 1.624769687652588, "learning_rate": 4.58e-05, "loss": 0.2247, "step": 230 }, { "grad_norm": 1.4861382246017456, "learning_rate": 4.78e-05, "loss": 0.1982, "step": 240 }, { "grad_norm": 1.3590528964996338, "learning_rate": 4.9800000000000004e-05, "loss": 0.2075, "step": 250 }, { "grad_norm": 1.9476397037506104, "learning_rate": 5.1800000000000005e-05, "loss": 0.1913, "step": 260 }, { "grad_norm": 1.324093222618103, "learning_rate": 5.380000000000001e-05, "loss": 0.1952, "step": 270 }, { "grad_norm": 1.080151081085205, "learning_rate": 5.580000000000001e-05, "loss": 0.1771, "step": 280 }, { "grad_norm": 1.369262456893921, "learning_rate": 5.7799999999999995e-05, "loss": 0.1857, "step": 290 }, { "grad_norm": 1.3636747598648071, "learning_rate": 5.9800000000000003e-05, "loss": 0.1706, "step": 300 }, { "grad_norm": 1.040611982345581, "learning_rate": 6.18e-05, "loss": 0.1756, "step": 310 }, { "grad_norm": 1.0529470443725586, "learning_rate": 6.38e-05, "loss": 0.1593, "step": 320 }, { "grad_norm": 1.201231598854065, "learning_rate": 6.58e-05, "loss": 0.1551, "step": 330 }, { "grad_norm": 1.190171241760254, "learning_rate": 6.780000000000001e-05, "loss": 0.1581, "step": 340 }, { "grad_norm": 0.657278835773468, "learning_rate": 6.98e-05, "loss": 0.1464, "step": 350 }, { "grad_norm": 0.8992099165916443, "learning_rate": 7.18e-05, "loss": 0.1565, "step": 360 }, { "grad_norm": 0.8571988940238953, "learning_rate": 7.38e-05, "loss": 0.1402, "step": 370 }, { "grad_norm": 0.859052300453186, "learning_rate": 7.58e-05, "loss": 0.1499, "step": 380 }, { "grad_norm": 1.1119016408920288, "learning_rate": 7.780000000000001e-05, "loss": 0.1268, "step": 390 }, { "grad_norm": 0.8884002566337585, "learning_rate": 7.98e-05, "loss": 0.1296, "step": 400 }, { "grad_norm": 0.6597028970718384, "learning_rate": 8.18e-05, "loss": 0.1295, "step": 410 }, { "grad_norm": 0.8965606689453125, "learning_rate": 8.38e-05, "loss": 0.1312, "step": 420 }, { "grad_norm": 0.9830987453460693, "learning_rate": 8.58e-05, "loss": 0.1425, "step": 430 }, { "grad_norm": 1.0157232284545898, "learning_rate": 8.78e-05, "loss": 0.1355, "step": 440 }, { "grad_norm": 1.2113580703735352, "learning_rate": 8.98e-05, "loss": 0.1184, "step": 450 }, { "grad_norm": 0.6973419189453125, "learning_rate": 9.180000000000001e-05, "loss": 0.119, "step": 460 }, { "grad_norm": 0.7367122769355774, "learning_rate": 9.38e-05, "loss": 0.1292, "step": 470 }, { "grad_norm": 0.7014634609222412, "learning_rate": 9.58e-05, "loss": 0.1225, "step": 480 }, { "grad_norm": 0.6766982674598694, "learning_rate": 9.78e-05, "loss": 0.1196, "step": 490 }, { "grad_norm": 0.6201929450035095, "learning_rate": 9.98e-05, "loss": 0.1191, "step": 500 }, { "grad_norm": 0.6271793842315674, "learning_rate": 3.6e-06, "loss": 0.1153, "step": 510 }, { "grad_norm": 0.20930686593055725, "learning_rate": 7.6e-06, "loss": 0.0768, "step": 520 }, { "grad_norm": 0.143227219581604, "learning_rate": 1.16e-05, "loss": 0.0601, "step": 530 }, { "grad_norm": 0.1910770833492279, "learning_rate": 1.56e-05, "loss": 0.063, "step": 540 }, { "grad_norm": 0.36501529812812805, "learning_rate": 1.9600000000000002e-05, "loss": 0.0598, "step": 550 }, { "grad_norm": 0.29628098011016846, "learning_rate": 2.36e-05, "loss": 0.0599, "step": 560 }, { "grad_norm": 0.3480481505393982, "learning_rate": 2.7600000000000003e-05, "loss": 0.0651, "step": 570 }, { "grad_norm": 0.4538305103778839, "learning_rate": 3.16e-05, "loss": 0.0649, "step": 580 }, { "grad_norm": 0.5898908972740173, "learning_rate": 3.56e-05, "loss": 0.0692, "step": 590 }, { "grad_norm": 0.5158395171165466, "learning_rate": 3.960000000000001e-05, "loss": 0.0692, "step": 600 }, { "grad_norm": 0.5166841149330139, "learning_rate": 4.36e-05, "loss": 0.0702, "step": 610 }, { "grad_norm": 0.5158836841583252, "learning_rate": 4.76e-05, "loss": 0.0731, "step": 620 }, { "grad_norm": 0.6472628712654114, "learning_rate": 5.16e-05, "loss": 0.0771, "step": 630 }, { "grad_norm": 0.5659353733062744, "learning_rate": 5.560000000000001e-05, "loss": 0.0841, "step": 640 }, { "grad_norm": 0.6240392923355103, "learning_rate": 5.96e-05, "loss": 0.0847, "step": 650 }, { "grad_norm": 0.6373980641365051, "learning_rate": 6.36e-05, "loss": 0.0843, "step": 660 }, { "grad_norm": 0.6327357292175293, "learning_rate": 6.76e-05, "loss": 0.0934, "step": 670 }, { "grad_norm": 0.8802633285522461, "learning_rate": 7.16e-05, "loss": 0.0908, "step": 680 }, { "grad_norm": 0.6709054112434387, "learning_rate": 7.560000000000001e-05, "loss": 0.0934, "step": 690 }, { "grad_norm": 0.7937743663787842, "learning_rate": 7.960000000000001e-05, "loss": 0.0952, "step": 700 }, { "grad_norm": 0.6417607069015503, "learning_rate": 8.36e-05, "loss": 0.1026, "step": 710 }, { "grad_norm": 0.6258336305618286, "learning_rate": 8.76e-05, "loss": 0.0998, "step": 720 }, { "grad_norm": 0.7310349941253662, "learning_rate": 9.16e-05, "loss": 0.1048, "step": 730 }, { "grad_norm": 0.9520322680473328, "learning_rate": 9.56e-05, "loss": 0.1085, "step": 740 }, { "grad_norm": 0.5814223885536194, "learning_rate": 9.960000000000001e-05, "loss": 0.108, "step": 750 }, { "grad_norm": 0.7229575514793396, "learning_rate": 9.999911419878559e-05, "loss": 0.1092, "step": 760 }, { "grad_norm": 0.5426738858222961, "learning_rate": 9.999605221019081e-05, "loss": 0.1029, "step": 770 }, { "grad_norm": 0.7140430808067322, "learning_rate": 9.999080323230761e-05, "loss": 0.0989, "step": 780 }, { "grad_norm": 0.5421532392501831, "learning_rate": 9.998336749474329e-05, "loss": 0.0929, "step": 790 }, { "grad_norm": 0.6705226302146912, "learning_rate": 9.997374532276107e-05, "loss": 0.0941, "step": 800 }, { "grad_norm": 0.5059484839439392, "learning_rate": 9.996193713726596e-05, "loss": 0.0791, "step": 810 }, { "grad_norm": 0.6469954252243042, "learning_rate": 9.994794345478624e-05, "loss": 0.0901, "step": 820 }, { "grad_norm": 0.458099901676178, "learning_rate": 9.99317648874509e-05, "loss": 0.0849, "step": 830 }, { "grad_norm": 0.4811062514781952, "learning_rate": 9.991340214296292e-05, "loss": 0.0805, "step": 840 }, { "grad_norm": 0.5744126439094543, "learning_rate": 9.989285602456819e-05, "loss": 0.0938, "step": 850 }, { "grad_norm": 0.5527174472808838, "learning_rate": 9.98701274310205e-05, "loss": 0.0793, "step": 860 }, { "grad_norm": 0.5400684475898743, "learning_rate": 9.984521735654218e-05, "loss": 0.0748, "step": 870 }, { "grad_norm": 0.38753241300582886, "learning_rate": 9.981812689078057e-05, "loss": 0.0846, "step": 880 }, { "grad_norm": 0.5142819285392761, "learning_rate": 9.978885721876041e-05, "loss": 0.0837, "step": 890 }, { "grad_norm": 0.4644053876399994, "learning_rate": 9.975740962083198e-05, "loss": 0.0788, "step": 900 }, { "grad_norm": 0.5433564782142639, "learning_rate": 9.972378547261504e-05, "loss": 0.0812, "step": 910 }, { "grad_norm": 0.5788527131080627, "learning_rate": 9.968798624493885e-05, "loss": 0.0748, "step": 920 }, { "grad_norm": 0.7408686876296997, "learning_rate": 9.965001350377753e-05, "loss": 0.0766, "step": 930 }, { "grad_norm": 0.4285391867160797, "learning_rate": 9.960986891018183e-05, "loss": 0.0752, "step": 940 }, { "grad_norm": 0.5498768091201782, "learning_rate": 9.95675542202063e-05, "loss": 0.0752, "step": 950 }, { "grad_norm": 0.5009428262710571, "learning_rate": 9.952307128483256e-05, "loss": 0.0812, "step": 960 }, { "grad_norm": 0.43610042333602905, "learning_rate": 9.947642204988835e-05, "loss": 0.0729, "step": 970 }, { "grad_norm": 0.6121730804443359, "learning_rate": 9.942760855596226e-05, "loss": 0.0715, "step": 980 }, { "grad_norm": 0.31105107069015503, "learning_rate": 9.937663293831471e-05, "loss": 0.0736, "step": 990 }, { "grad_norm": 0.5373350977897644, "learning_rate": 9.932349742678433e-05, "loss": 0.0734, "step": 1000 }, { "grad_norm": 0.3666306138038635, "learning_rate": 3.6e-06, "loss": 0.0704, "step": 1010 }, { "grad_norm": 0.16294004023075104, "learning_rate": 7.6e-06, "loss": 0.0557, "step": 1020 }, { "grad_norm": 0.11897388100624084, "learning_rate": 1.16e-05, "loss": 0.0459, "step": 1030 }, { "grad_norm": 0.16593188047409058, "learning_rate": 1.56e-05, "loss": 0.0432, "step": 1040 }, { "grad_norm": 0.18971645832061768, "learning_rate": 1.9600000000000002e-05, "loss": 0.0453, "step": 1050 }, { "grad_norm": 0.294993132352829, "learning_rate": 2.36e-05, "loss": 0.0427, "step": 1060 }, { "grad_norm": 0.2616293728351593, "learning_rate": 2.7600000000000003e-05, "loss": 0.0446, "step": 1070 }, { "grad_norm": 0.3469824194908142, "learning_rate": 3.16e-05, "loss": 0.0439, "step": 1080 }, { "grad_norm": 0.3402693271636963, "learning_rate": 3.56e-05, "loss": 0.0433, "step": 1090 }, { "grad_norm": 0.39783069491386414, "learning_rate": 3.960000000000001e-05, "loss": 0.0491, "step": 1100 }, { "grad_norm": 0.37544241547584534, "learning_rate": 4.36e-05, "loss": 0.0509, "step": 1110 }, { "grad_norm": 0.4338155686855316, "learning_rate": 4.76e-05, "loss": 0.0473, "step": 1120 }, { "grad_norm": 0.33971983194351196, "learning_rate": 5.16e-05, "loss": 0.0518, "step": 1130 }, { "grad_norm": 0.39340782165527344, "learning_rate": 5.560000000000001e-05, "loss": 0.0501, "step": 1140 }, { "grad_norm": 0.4534987509250641, "learning_rate": 5.96e-05, "loss": 0.0568, "step": 1150 }, { "grad_norm": 0.5981581807136536, "learning_rate": 6.36e-05, "loss": 0.0559, "step": 1160 }, { "grad_norm": 0.3891669809818268, "learning_rate": 6.76e-05, "loss": 0.0559, "step": 1170 }, { "grad_norm": 0.47197505831718445, "learning_rate": 7.16e-05, "loss": 0.0619, "step": 1180 }, { "grad_norm": 0.4584357738494873, "learning_rate": 7.560000000000001e-05, "loss": 0.0618, "step": 1190 }, { "grad_norm": 0.5792739391326904, "learning_rate": 7.960000000000001e-05, "loss": 0.0663, "step": 1200 }, { "grad_norm": 0.3875551223754883, "learning_rate": 8.36e-05, "loss": 0.0652, "step": 1210 }, { "grad_norm": 0.3619835078716278, "learning_rate": 8.76e-05, "loss": 0.072, "step": 1220 }, { "grad_norm": 0.6116339564323425, "learning_rate": 9.16e-05, "loss": 0.0686, "step": 1230 }, { "grad_norm": 0.575858473777771, "learning_rate": 9.56e-05, "loss": 0.0721, "step": 1240 }, { "grad_norm": 0.5876694321632385, "learning_rate": 9.960000000000001e-05, "loss": 0.0774, "step": 1250 }, { "grad_norm": 0.4337020516395569, "learning_rate": 9.999911419878559e-05, "loss": 0.0761, "step": 1260 }, { "grad_norm": 0.5564613342285156, "learning_rate": 9.999605221019081e-05, "loss": 0.0694, "step": 1270 }, { "grad_norm": 0.45390748977661133, "learning_rate": 9.999080323230761e-05, "loss": 0.073, "step": 1280 }, { "grad_norm": 0.33817148208618164, "learning_rate": 9.998336749474329e-05, "loss": 0.067, "step": 1290 }, { "grad_norm": 0.4409405589103699, "learning_rate": 9.997374532276107e-05, "loss": 0.0682, "step": 1300 }, { "grad_norm": 0.4145369231700897, "learning_rate": 9.996193713726596e-05, "loss": 0.0673, "step": 1310 }, { "grad_norm": 0.4724181294441223, "learning_rate": 9.994794345478624e-05, "loss": 0.0665, "step": 1320 }, { "grad_norm": 0.5166686773300171, "learning_rate": 9.99317648874509e-05, "loss": 0.0671, "step": 1330 }, { "grad_norm": 0.41797006130218506, "learning_rate": 9.991340214296292e-05, "loss": 0.0614, "step": 1340 }, { "grad_norm": 0.364961177110672, "learning_rate": 9.989285602456819e-05, "loss": 0.0646, "step": 1350 }, { "grad_norm": 0.5224328637123108, "learning_rate": 9.98701274310205e-05, "loss": 0.0648, "step": 1360 }, { "grad_norm": 0.47416940331459045, "learning_rate": 9.984521735654218e-05, "loss": 0.0609, "step": 1370 }, { "grad_norm": 0.3529892861843109, "learning_rate": 9.981812689078057e-05, "loss": 0.0585, "step": 1380 }, { "grad_norm": 0.4784582853317261, "learning_rate": 9.978885721876041e-05, "loss": 0.0599, "step": 1390 }, { "grad_norm": 0.27717214822769165, "learning_rate": 9.975740962083198e-05, "loss": 0.0636, "step": 1400 }, { "grad_norm": 0.41375869512557983, "learning_rate": 9.972378547261504e-05, "loss": 0.0645, "step": 1410 }, { "grad_norm": 0.5045386552810669, "learning_rate": 9.968798624493885e-05, "loss": 0.0625, "step": 1420 }, { "grad_norm": 0.42626622319221497, "learning_rate": 9.965001350377753e-05, "loss": 0.0669, "step": 1430 }, { "grad_norm": 0.5226677656173706, "learning_rate": 9.960986891018183e-05, "loss": 0.062, "step": 1440 }, { "grad_norm": 0.5228267312049866, "learning_rate": 9.95675542202063e-05, "loss": 0.0637, "step": 1450 }, { "grad_norm": 0.4277467727661133, "learning_rate": 9.952307128483256e-05, "loss": 0.0596, "step": 1460 }, { "grad_norm": 0.40805479884147644, "learning_rate": 9.947642204988835e-05, "loss": 0.0623, "step": 1470 }, { "grad_norm": 0.5066496133804321, "learning_rate": 9.942760855596226e-05, "loss": 0.0608, "step": 1480 }, { "grad_norm": 0.46946775913238525, "learning_rate": 9.937663293831471e-05, "loss": 0.0629, "step": 1490 }, { "grad_norm": 0.4907722473144531, "learning_rate": 9.932349742678433e-05, "loss": 0.0562, "step": 1500 }, { "grad_norm": 0.3012191355228424, "learning_rate": 3.6e-06, "loss": 0.0513, "step": 1510 }, { "grad_norm": 0.11255031824111938, "learning_rate": 7.6e-06, "loss": 0.0429, "step": 1520 }, { "grad_norm": 0.10130707174539566, "learning_rate": 1.16e-05, "loss": 0.0368, "step": 1530 }, { "grad_norm": 0.11848820745944977, "learning_rate": 1.56e-05, "loss": 0.0345, "step": 1540 }, { "grad_norm": 0.1964358389377594, "learning_rate": 1.9600000000000002e-05, "loss": 0.0357, "step": 1550 }, { "grad_norm": 0.19960114359855652, "learning_rate": 2.36e-05, "loss": 0.0344, "step": 1560 }, { "grad_norm": 0.2515262961387634, "learning_rate": 2.7600000000000003e-05, "loss": 0.0364, "step": 1570 }, { "grad_norm": 0.30423256754875183, "learning_rate": 3.16e-05, "loss": 0.0353, "step": 1580 }, { "grad_norm": 0.3767169117927551, "learning_rate": 3.56e-05, "loss": 0.0362, "step": 1590 }, { "grad_norm": 0.2941502034664154, "learning_rate": 3.960000000000001e-05, "loss": 0.038, "step": 1600 }, { "grad_norm": 0.1843196004629135, "learning_rate": 3.6e-06, "loss": 0.034, "step": 1610 }, { "grad_norm": 0.09200790524482727, "learning_rate": 7.6e-06, "loss": 0.0297, "step": 1620 }, { "grad_norm": 0.12036135047674179, "learning_rate": 1.16e-05, "loss": 0.0296, "step": 1630 }, { "grad_norm": 0.17890624701976776, "learning_rate": 1.56e-05, "loss": 0.0309, "step": 1640 }, { "grad_norm": 0.3501648008823395, "learning_rate": 1.9600000000000002e-05, "loss": 0.0321, "step": 1650 }, { "grad_norm": 0.2992416322231293, "learning_rate": 2.36e-05, "loss": 0.0324, "step": 1660 }, { "grad_norm": 0.24689897894859314, "learning_rate": 2.7600000000000003e-05, "loss": 0.032, "step": 1670 }, { "grad_norm": 0.3009512424468994, "learning_rate": 3.16e-05, "loss": 0.0336, "step": 1680 }, { "grad_norm": 0.3029879927635193, "learning_rate": 3.56e-05, "loss": 0.0349, "step": 1690 }, { "grad_norm": 0.3682500123977661, "learning_rate": 3.960000000000001e-05, "loss": 0.0338, "step": 1700 }, { "grad_norm": 0.3186107575893402, "learning_rate": 4.36e-05, "loss": 0.0338, "step": 1710 }, { "grad_norm": 0.28322625160217285, "learning_rate": 4.76e-05, "loss": 0.038, "step": 1720 }, { "grad_norm": 0.321772962808609, "learning_rate": 5.16e-05, "loss": 0.0418, "step": 1730 }, { "grad_norm": 0.39493709802627563, "learning_rate": 5.560000000000001e-05, "loss": 0.0384, "step": 1740 }, { "grad_norm": 0.46459177136421204, "learning_rate": 5.96e-05, "loss": 0.0405, "step": 1750 }, { "grad_norm": 0.5326083302497864, "learning_rate": 6.36e-05, "loss": 0.0442, "step": 1760 }, { "grad_norm": 0.33989307284355164, "learning_rate": 6.76e-05, "loss": 0.0456, "step": 1770 }, { "grad_norm": 0.3888494372367859, "learning_rate": 7.16e-05, "loss": 0.0439, "step": 1780 }, { "grad_norm": 0.2806129455566406, "learning_rate": 7.560000000000001e-05, "loss": 0.0474, "step": 1790 }, { "grad_norm": 0.5664889216423035, "learning_rate": 7.960000000000001e-05, "loss": 0.0529, "step": 1800 }, { "grad_norm": 0.40685325860977173, "learning_rate": 8.36e-05, "loss": 0.0492, "step": 1810 }, { "grad_norm": 0.3915274441242218, "learning_rate": 8.76e-05, "loss": 0.0531, "step": 1820 }, { "grad_norm": 0.4390338063240051, "learning_rate": 9.16e-05, "loss": 0.0541, "step": 1830 }, { "grad_norm": 0.4535897970199585, "learning_rate": 9.56e-05, "loss": 0.0556, "step": 1840 }, { "grad_norm": 0.40557733178138733, "learning_rate": 9.960000000000001e-05, "loss": 0.0598, "step": 1850 }, { "grad_norm": 0.4307502806186676, "learning_rate": 9.999911419878559e-05, "loss": 0.0613, "step": 1860 }, { "grad_norm": 0.4397332966327667, "learning_rate": 9.999605221019081e-05, "loss": 0.057, "step": 1870 }, { "grad_norm": 0.4753943681716919, "learning_rate": 9.999080323230761e-05, "loss": 0.0609, "step": 1880 }, { "grad_norm": 0.44830775260925293, "learning_rate": 9.998336749474329e-05, "loss": 0.0551, "step": 1890 }, { "grad_norm": 0.3477669656276703, "learning_rate": 9.997374532276107e-05, "loss": 0.0515, "step": 1900 }, { "grad_norm": 0.5197297930717468, "learning_rate": 9.996193713726596e-05, "loss": 0.054, "step": 1910 }, { "grad_norm": 0.52423095703125, "learning_rate": 9.994794345478624e-05, "loss": 0.0534, "step": 1920 }, { "grad_norm": 0.38030654191970825, "learning_rate": 9.99317648874509e-05, "loss": 0.0529, "step": 1930 }, { "grad_norm": 0.6342213749885559, "learning_rate": 9.991340214296292e-05, "loss": 0.0529, "step": 1940 }, { "grad_norm": 0.48398685455322266, "learning_rate": 9.989285602456819e-05, "loss": 0.0536, "step": 1950 }, { "grad_norm": 0.43288710713386536, "learning_rate": 9.98701274310205e-05, "loss": 0.0538, "step": 1960 }, { "grad_norm": 0.45618921518325806, "learning_rate": 9.984521735654218e-05, "loss": 0.0541, "step": 1970 }, { "grad_norm": 0.595313549041748, "learning_rate": 9.981812689078057e-05, "loss": 0.055, "step": 1980 }, { "grad_norm": 0.37541764974594116, "learning_rate": 9.978885721876041e-05, "loss": 0.0561, "step": 1990 }, { "grad_norm": 0.3914823532104492, "learning_rate": 9.975740962083198e-05, "loss": 0.0542, "step": 2000 }, { "grad_norm": 0.48534271121025085, "learning_rate": 9.972378547261504e-05, "loss": 0.051, "step": 2010 }, { "grad_norm": 0.3523121178150177, "learning_rate": 9.968798624493885e-05, "loss": 0.0499, "step": 2020 }, { "grad_norm": 0.3841063678264618, "learning_rate": 9.965001350377753e-05, "loss": 0.0485, "step": 2030 }, { "grad_norm": 0.3822041451931, "learning_rate": 9.960986891018183e-05, "loss": 0.0523, "step": 2040 }, { "grad_norm": 0.3567885458469391, "learning_rate": 9.95675542202063e-05, "loss": 0.0489, "step": 2050 }, { "grad_norm": 0.43500715494155884, "learning_rate": 9.952307128483256e-05, "loss": 0.0516, "step": 2060 }, { "grad_norm": 0.415130078792572, "learning_rate": 9.947642204988835e-05, "loss": 0.0505, "step": 2070 }, { "grad_norm": 0.41638925671577454, "learning_rate": 9.942760855596226e-05, "loss": 0.0529, "step": 2080 }, { "grad_norm": 0.41729235649108887, "learning_rate": 9.937663293831471e-05, "loss": 0.0485, "step": 2090 }, { "grad_norm": 0.5248743295669556, "learning_rate": 9.932349742678433e-05, "loss": 0.0484, "step": 2100 }, { "grad_norm": 0.3727494776248932, "learning_rate": 3.6e-06, "loss": 0.0503, "step": 2110 }, { "grad_norm": 0.22962310910224915, "learning_rate": 7.6e-06, "loss": 0.0399, "step": 2120 }, { "grad_norm": 0.08213402330875397, "learning_rate": 1.16e-05, "loss": 0.0318, "step": 2130 }, { "grad_norm": 0.10316774249076843, "learning_rate": 1.56e-05, "loss": 0.0283, "step": 2140 }, { "grad_norm": 0.16202598810195923, "learning_rate": 1.9600000000000002e-05, "loss": 0.0291, "step": 2150 }, { "grad_norm": 0.2037431299686432, "learning_rate": 2.36e-05, "loss": 0.0285, "step": 2160 }, { "grad_norm": 0.21782545745372772, "learning_rate": 2.7600000000000003e-05, "loss": 0.0305, "step": 2170 }, { "grad_norm": 0.2840912640094757, "learning_rate": 3.16e-05, "loss": 0.0273, "step": 2180 }, { "grad_norm": 0.21943439543247223, "learning_rate": 3.56e-05, "loss": 0.0299, "step": 2190 }, { "grad_norm": 0.30279168486595154, "learning_rate": 3.960000000000001e-05, "loss": 0.0335, "step": 2200 }, { "grad_norm": 0.2775246500968933, "learning_rate": 4.36e-05, "loss": 0.0324, "step": 2210 }, { "grad_norm": 0.42877551913261414, "learning_rate": 4.76e-05, "loss": 0.034, "step": 2220 }, { "grad_norm": 0.3622698187828064, "learning_rate": 5.16e-05, "loss": 0.0304, "step": 2230 }, { "grad_norm": 0.3154076635837555, "learning_rate": 5.560000000000001e-05, "loss": 0.0343, "step": 2240 }, { "grad_norm": 0.40900129079818726, "learning_rate": 5.96e-05, "loss": 0.0365, "step": 2250 }, { "grad_norm": 0.2469077706336975, "learning_rate": 6.36e-05, "loss": 0.0342, "step": 2260 }, { "grad_norm": 0.3536086976528168, "learning_rate": 6.76e-05, "loss": 0.0345, "step": 2270 }, { "grad_norm": 0.3022139072418213, "learning_rate": 7.16e-05, "loss": 0.0396, "step": 2280 }, { "grad_norm": 0.322871595621109, "learning_rate": 7.560000000000001e-05, "loss": 0.041, "step": 2290 }, { "grad_norm": 0.36403512954711914, "learning_rate": 7.960000000000001e-05, "loss": 0.0438, "step": 2300 }, { "grad_norm": 0.38627511262893677, "learning_rate": 8.36e-05, "loss": 0.0444, "step": 2310 }, { "grad_norm": 0.46570056676864624, "learning_rate": 8.76e-05, "loss": 0.046, "step": 2320 }, { "grad_norm": 0.4484180808067322, "learning_rate": 9.16e-05, "loss": 0.0493, "step": 2330 }, { "grad_norm": 0.32608455419540405, "learning_rate": 9.56e-05, "loss": 0.0492, "step": 2340 }, { "grad_norm": 0.5241735577583313, "learning_rate": 9.960000000000001e-05, "loss": 0.0534, "step": 2350 }, { "grad_norm": 0.509925127029419, "learning_rate": 9.999911419878559e-05, "loss": 0.0506, "step": 2360 }, { "grad_norm": 0.41749557852745056, "learning_rate": 9.999605221019081e-05, "loss": 0.0491, "step": 2370 }, { "grad_norm": 0.41675058007240295, "learning_rate": 9.999080323230761e-05, "loss": 0.0523, "step": 2380 }, { "grad_norm": 0.43030524253845215, "learning_rate": 9.998336749474329e-05, "loss": 0.0516, "step": 2390 }, { "grad_norm": 0.40477266907691956, "learning_rate": 9.997374532276107e-05, "loss": 0.0465, "step": 2400 }, { "grad_norm": 0.37097233533859253, "learning_rate": 9.996193713726596e-05, "loss": 0.0486, "step": 2410 }, { "grad_norm": 0.3993120491504669, "learning_rate": 9.994794345478624e-05, "loss": 0.0483, "step": 2420 }, { "grad_norm": 0.3423318862915039, "learning_rate": 9.99317648874509e-05, "loss": 0.0432, "step": 2430 }, { "grad_norm": 0.41292425990104675, "learning_rate": 9.991340214296292e-05, "loss": 0.0482, "step": 2440 }, { "grad_norm": 0.3618304431438446, "learning_rate": 9.989285602456819e-05, "loss": 0.0445, "step": 2450 }, { "grad_norm": 0.2800253927707672, "learning_rate": 9.98701274310205e-05, "loss": 0.0472, "step": 2460 }, { "grad_norm": 0.47700509428977966, "learning_rate": 9.984521735654218e-05, "loss": 0.0479, "step": 2470 }, { "grad_norm": 0.3231373131275177, "learning_rate": 9.981812689078057e-05, "loss": 0.047, "step": 2480 }, { "grad_norm": 0.30088120698928833, "learning_rate": 9.978885721876041e-05, "loss": 0.0451, "step": 2490 }, { "grad_norm": 0.3034248352050781, "learning_rate": 9.975740962083198e-05, "loss": 0.0452, "step": 2500 }, { "grad_norm": 0.3968607187271118, "learning_rate": 9.972378547261504e-05, "loss": 0.0462, "step": 2510 }, { "grad_norm": 0.402818500995636, "learning_rate": 9.968798624493885e-05, "loss": 0.0452, "step": 2520 }, { "grad_norm": 0.4880514144897461, "learning_rate": 9.965001350377753e-05, "loss": 0.0462, "step": 2530 }, { "grad_norm": 0.5451019406318665, "learning_rate": 9.960986891018183e-05, "loss": 0.0414, "step": 2540 }, { "grad_norm": 0.39336031675338745, "learning_rate": 9.95675542202063e-05, "loss": 0.0468, "step": 2550 }, { "grad_norm": 0.3647398352622986, "learning_rate": 9.952307128483256e-05, "loss": 0.0463, "step": 2560 }, { "grad_norm": 0.36479389667510986, "learning_rate": 9.947642204988835e-05, "loss": 0.045, "step": 2570 }, { "grad_norm": 0.38387492299079895, "learning_rate": 9.942760855596226e-05, "loss": 0.0438, "step": 2580 }, { "grad_norm": 0.313623309135437, "learning_rate": 9.937663293831471e-05, "loss": 0.0379, "step": 2590 }, { "grad_norm": 0.38519930839538574, "learning_rate": 9.932349742678433e-05, "loss": 0.0462, "step": 2600 }, { "grad_norm": 0.2513759136199951, "learning_rate": 9.926820434569051e-05, "loss": 0.0419, "step": 2610 }, { "grad_norm": 0.43225133419036865, "learning_rate": 9.921075611373179e-05, "loss": 0.042, "step": 2620 }, { "grad_norm": 0.35011181235313416, "learning_rate": 9.915115524387988e-05, "loss": 0.0436, "step": 2630 }, { "grad_norm": 0.4772772789001465, "learning_rate": 9.908940434326997e-05, "loss": 0.0441, "step": 2640 }, { "grad_norm": 0.5008149743080139, "learning_rate": 9.902550611308645e-05, "loss": 0.0418, "step": 2650 }, { "grad_norm": 0.33304187655448914, "learning_rate": 9.895946334844494e-05, "loss": 0.044, "step": 2660 }, { "grad_norm": 0.3896394371986389, "learning_rate": 9.889127893826989e-05, "loss": 0.0429, "step": 2670 }, { "grad_norm": 0.384151816368103, "learning_rate": 9.882095586516831e-05, "loss": 0.0404, "step": 2680 }, { "grad_norm": 0.5263709425926208, "learning_rate": 9.874849720529921e-05, "loss": 0.0414, "step": 2690 }, { "grad_norm": 0.37322402000427246, "learning_rate": 9.867390612823914e-05, "loss": 0.0408, "step": 2700 }, { "grad_norm": 0.3406134247779846, "learning_rate": 3.6e-06, "loss": 0.0418, "step": 2710 }, { "grad_norm": 0.09609717130661011, "learning_rate": 7.6e-06, "loss": 0.0316, "step": 2720 }, { "grad_norm": 0.07473892718553543, "learning_rate": 1.16e-05, "loss": 0.0268, "step": 2730 }, { "grad_norm": 0.09371735900640488, "learning_rate": 1.56e-05, "loss": 0.0261, "step": 2740 }, { "grad_norm": 0.09075760841369629, "learning_rate": 1.9600000000000002e-05, "loss": 0.0263, "step": 2750 }, { "grad_norm": 0.21899884939193726, "learning_rate": 2.36e-05, "loss": 0.0246, "step": 2760 }, { "grad_norm": 0.20407100021839142, "learning_rate": 2.7600000000000003e-05, "loss": 0.0241, "step": 2770 }, { "grad_norm": 0.25961631536483765, "learning_rate": 3.16e-05, "loss": 0.0262, "step": 2780 }, { "grad_norm": 0.23574396967887878, "learning_rate": 3.56e-05, "loss": 0.029, "step": 2790 }, { "grad_norm": 0.20600852370262146, "learning_rate": 3.960000000000001e-05, "loss": 0.0292, "step": 2800 }, { "grad_norm": 0.28286001086235046, "learning_rate": 4.36e-05, "loss": 0.0287, "step": 2810 }, { "grad_norm": 0.33955004811286926, "learning_rate": 4.76e-05, "loss": 0.0283, "step": 2820 }, { "grad_norm": 0.3014861047267914, "learning_rate": 5.16e-05, "loss": 0.0284, "step": 2830 }, { "grad_norm": 0.3722647726535797, "learning_rate": 5.560000000000001e-05, "loss": 0.0303, "step": 2840 }, { "grad_norm": 0.2521559000015259, "learning_rate": 5.96e-05, "loss": 0.0303, "step": 2850 }, { "grad_norm": 0.3782460391521454, "learning_rate": 6.36e-05, "loss": 0.0321, "step": 2860 }, { "grad_norm": 0.45622745156288147, "learning_rate": 6.76e-05, "loss": 0.0342, "step": 2870 }, { "grad_norm": 0.32308587431907654, "learning_rate": 7.16e-05, "loss": 0.0312, "step": 2880 }, { "grad_norm": 0.3646230399608612, "learning_rate": 7.560000000000001e-05, "loss": 0.0347, "step": 2890 }, { "grad_norm": 0.38201573491096497, "learning_rate": 7.960000000000001e-05, "loss": 0.0352, "step": 2900 }, { "grad_norm": 0.43542519211769104, "learning_rate": 8.36e-05, "loss": 0.0388, "step": 2910 }, { "grad_norm": 0.4268929362297058, "learning_rate": 8.76e-05, "loss": 0.0382, "step": 2920 }, { "grad_norm": 0.4492916166782379, "learning_rate": 9.16e-05, "loss": 0.0399, "step": 2930 }, { "grad_norm": 0.4167806804180145, "learning_rate": 9.56e-05, "loss": 0.0419, "step": 2940 }, { "grad_norm": 0.48346272110939026, "learning_rate": 9.960000000000001e-05, "loss": 0.0424, "step": 2950 }, { "grad_norm": 0.34249964356422424, "learning_rate": 9.999911419878559e-05, "loss": 0.0451, "step": 2960 }, { "grad_norm": 0.4727078974246979, "learning_rate": 9.999605221019081e-05, "loss": 0.0408, "step": 2970 }, { "grad_norm": 0.4595201909542084, "learning_rate": 9.999080323230761e-05, "loss": 0.0448, "step": 2980 }, { "grad_norm": 0.5211357474327087, "learning_rate": 9.998336749474329e-05, "loss": 0.0404, "step": 2990 }, { "grad_norm": 0.5116234421730042, "learning_rate": 9.997374532276107e-05, "loss": 0.0419, "step": 3000 }, { "grad_norm": 0.3019302785396576, "learning_rate": 9.996193713726596e-05, "loss": 0.0394, "step": 3010 }, { "grad_norm": 0.4081250727176666, "learning_rate": 9.994794345478624e-05, "loss": 0.0399, "step": 3020 }, { "grad_norm": 0.26774004101753235, "learning_rate": 9.99317648874509e-05, "loss": 0.0408, "step": 3030 }, { "grad_norm": 0.2589474618434906, "learning_rate": 9.991340214296292e-05, "loss": 0.0406, "step": 3040 }, { "grad_norm": 0.514599621295929, "learning_rate": 9.989285602456819e-05, "loss": 0.0402, "step": 3050 }, { "grad_norm": 0.40104562044143677, "learning_rate": 9.98701274310205e-05, "loss": 0.0406, "step": 3060 }, { "grad_norm": 0.36765149235725403, "learning_rate": 9.984521735654218e-05, "loss": 0.0394, "step": 3070 }, { "grad_norm": 0.33816102147102356, "learning_rate": 9.981812689078057e-05, "loss": 0.0412, "step": 3080 }, { "grad_norm": 0.32110002636909485, "learning_rate": 9.978885721876041e-05, "loss": 0.0371, "step": 3090 }, { "grad_norm": 0.3860984146595001, "learning_rate": 9.975740962083198e-05, "loss": 0.0405, "step": 3100 }, { "grad_norm": 0.26341062784194946, "learning_rate": 9.972378547261504e-05, "loss": 0.0393, "step": 3110 }, { "grad_norm": 0.4155246615409851, "learning_rate": 9.968798624493885e-05, "loss": 0.0388, "step": 3120 }, { "grad_norm": 0.49742957949638367, "learning_rate": 9.965001350377753e-05, "loss": 0.0391, "step": 3130 }, { "grad_norm": 0.26576173305511475, "learning_rate": 9.960986891018183e-05, "loss": 0.0371, "step": 3140 }, { "grad_norm": 0.3517167270183563, "learning_rate": 9.95675542202063e-05, "loss": 0.0394, "step": 3150 }, { "grad_norm": 0.320377916097641, "learning_rate": 9.952307128483256e-05, "loss": 0.0401, "step": 3160 }, { "grad_norm": 0.2738504707813263, "learning_rate": 9.947642204988835e-05, "loss": 0.0383, "step": 3170 }, { "grad_norm": 0.4829009473323822, "learning_rate": 9.942760855596226e-05, "loss": 0.0392, "step": 3180 }, { "grad_norm": 0.42097535729408264, "learning_rate": 9.937663293831471e-05, "loss": 0.0404, "step": 3190 }, { "grad_norm": 0.45917344093322754, "learning_rate": 9.932349742678433e-05, "loss": 0.0419, "step": 3200 }, { "grad_norm": 0.36991745233535767, "learning_rate": 2.25e-06, "loss": 0.0393, "step": 3210 }, { "grad_norm": 0.19931316375732422, "learning_rate": 4.75e-06, "loss": 0.0316, "step": 3220 }, { "grad_norm": 0.09721539169549942, "learning_rate": 7.25e-06, "loss": 0.0254, "step": 3230 }, { "grad_norm": 0.08070161193609238, "learning_rate": 9.750000000000002e-06, "loss": 0.0228, "step": 3240 }, { "grad_norm": 0.09927989542484283, "learning_rate": 1.225e-05, "loss": 0.0251, "step": 3250 }, { "grad_norm": 0.07708446681499481, "learning_rate": 1.475e-05, "loss": 0.0249, "step": 3260 }, { "grad_norm": 0.12243490666151047, "learning_rate": 1.725e-05, "loss": 0.0208, "step": 3270 }, { "grad_norm": 0.16835889220237732, "learning_rate": 1.9750000000000002e-05, "loss": 0.023, "step": 3280 }, { "grad_norm": 0.12951454520225525, "learning_rate": 2.2250000000000002e-05, "loss": 0.0211, "step": 3290 }, { "grad_norm": 0.17981648445129395, "learning_rate": 2.4750000000000002e-05, "loss": 0.0216, "step": 3300 }, { "grad_norm": 0.10566560924053192, "learning_rate": 2.25e-06, "loss": 0.0243, "step": 3310 }, { "grad_norm": 0.07440933585166931, "learning_rate": 4.75e-06, "loss": 0.0236, "step": 3320 }, { "grad_norm": 0.06981145590543747, "learning_rate": 7.25e-06, "loss": 0.0201, "step": 3330 }, { "grad_norm": 0.09702398627996445, "learning_rate": 9.750000000000002e-06, "loss": 0.0197, "step": 3340 }, { "grad_norm": 0.09483855962753296, "learning_rate": 1.225e-05, "loss": 0.0189, "step": 3350 }, { "grad_norm": 0.10017610341310501, "learning_rate": 1.475e-05, "loss": 0.022, "step": 3360 }, { "grad_norm": 0.1346549093723297, "learning_rate": 1.725e-05, "loss": 0.019, "step": 3370 }, { "grad_norm": 0.15081147849559784, "learning_rate": 1.9750000000000002e-05, "loss": 0.0223, "step": 3380 }, { "grad_norm": 0.22095446288585663, "learning_rate": 2.2250000000000002e-05, "loss": 0.0221, "step": 3390 }, { "grad_norm": 0.15094400942325592, "learning_rate": 2.4750000000000002e-05, "loss": 0.0211, "step": 3400 }, { "grad_norm": 0.21096032857894897, "learning_rate": 2.725e-05, "loss": 0.0231, "step": 3410 }, { "grad_norm": 0.23008853197097778, "learning_rate": 2.975e-05, "loss": 0.0219, "step": 3420 }, { "grad_norm": 0.21590107679367065, "learning_rate": 3.2250000000000005e-05, "loss": 0.0223, "step": 3430 }, { "grad_norm": 0.31127309799194336, "learning_rate": 3.475e-05, "loss": 0.0232, "step": 3440 }, { "grad_norm": 0.1888926923274994, "learning_rate": 3.7250000000000004e-05, "loss": 0.0211, "step": 3450 }, { "grad_norm": 0.224850594997406, "learning_rate": 3.9750000000000004e-05, "loss": 0.0235, "step": 3460 }, { "grad_norm": 0.25683915615081787, "learning_rate": 4.2250000000000004e-05, "loss": 0.023, "step": 3470 }, { "grad_norm": 0.4701042175292969, "learning_rate": 4.4750000000000004e-05, "loss": 0.0251, "step": 3480 }, { "grad_norm": 0.2404465228319168, "learning_rate": 4.7249999999999997e-05, "loss": 0.0236, "step": 3490 }, { "grad_norm": 0.30311474204063416, "learning_rate": 4.975e-05, "loss": 0.0241, "step": 3500 }, { "grad_norm": 0.3277303874492645, "learning_rate": 5.2249999999999996e-05, "loss": 0.0252, "step": 3510 }, { "grad_norm": 0.3229980766773224, "learning_rate": 5.475e-05, "loss": 0.0253, "step": 3520 }, { "grad_norm": 0.4051625728607178, "learning_rate": 5.725e-05, "loss": 0.0239, "step": 3530 }, { "grad_norm": 0.3571256995201111, "learning_rate": 5.975000000000001e-05, "loss": 0.0272, "step": 3540 }, { "grad_norm": 0.34256404638290405, "learning_rate": 6.225000000000001e-05, "loss": 0.0273, "step": 3550 }, { "grad_norm": 0.3582518398761749, "learning_rate": 6.475e-05, "loss": 0.029, "step": 3560 }, { "grad_norm": 0.2795099914073944, "learning_rate": 6.725000000000001e-05, "loss": 0.0283, "step": 3570 }, { "grad_norm": 0.3371533453464508, "learning_rate": 6.975e-05, "loss": 0.0298, "step": 3580 }, { "grad_norm": 0.3718169331550598, "learning_rate": 7.225000000000001e-05, "loss": 0.0315, "step": 3590 }, { "grad_norm": 0.41186198592185974, "learning_rate": 7.475000000000001e-05, "loss": 0.0304, "step": 3600 }, { "grad_norm": 0.3523057699203491, "learning_rate": 2.25e-06, "loss": 0.0352, "step": 3610 }, { "grad_norm": 0.18908748030662537, "learning_rate": 4.75e-06, "loss": 0.0313, "step": 3620 }, { "grad_norm": 0.08770550787448883, "learning_rate": 7.25e-06, "loss": 0.0199, "step": 3630 }, { "grad_norm": 0.059360720217227936, "learning_rate": 9.750000000000002e-06, "loss": 0.0217, "step": 3640 }, { "grad_norm": 0.08034844696521759, "learning_rate": 1.225e-05, "loss": 0.0208, "step": 3650 }, { "grad_norm": 0.10670112818479538, "learning_rate": 1.475e-05, "loss": 0.0206, "step": 3660 }, { "grad_norm": 0.11115647107362747, "learning_rate": 1.725e-05, "loss": 0.0174, "step": 3670 }, { "grad_norm": 0.12775209546089172, "learning_rate": 1.9750000000000002e-05, "loss": 0.0201, "step": 3680 }, { "grad_norm": 0.16139353811740875, "learning_rate": 2.2250000000000002e-05, "loss": 0.0203, "step": 3690 }, { "grad_norm": 0.16247326135635376, "learning_rate": 2.4750000000000002e-05, "loss": 0.0186, "step": 3700 }, { "grad_norm": 0.17848622798919678, "learning_rate": 2.725e-05, "loss": 0.0183, "step": 3710 }, { "grad_norm": 0.17426474392414093, "learning_rate": 2.975e-05, "loss": 0.0186, "step": 3720 }, { "grad_norm": 0.2004312127828598, "learning_rate": 3.2250000000000005e-05, "loss": 0.0208, "step": 3730 }, { "grad_norm": 0.2544923424720764, "learning_rate": 3.475e-05, "loss": 0.0203, "step": 3740 }, { "grad_norm": 0.24687324464321136, "learning_rate": 3.7250000000000004e-05, "loss": 0.0205, "step": 3750 }, { "grad_norm": 0.24185478687286377, "learning_rate": 3.9750000000000004e-05, "loss": 0.0208, "step": 3760 }, { "grad_norm": 0.2701793313026428, "learning_rate": 4.2250000000000004e-05, "loss": 0.0206, "step": 3770 }, { "grad_norm": 0.2676536738872528, "learning_rate": 4.4750000000000004e-05, "loss": 0.0235, "step": 3780 }, { "grad_norm": 0.2788873612880707, "learning_rate": 4.7249999999999997e-05, "loss": 0.0226, "step": 3790 }, { "grad_norm": 0.2759779393672943, "learning_rate": 4.975e-05, "loss": 0.0231, "step": 3800 }, { "grad_norm": 0.24446116387844086, "learning_rate": 5.2249999999999996e-05, "loss": 0.0224, "step": 3810 }, { "grad_norm": 0.3492976427078247, "learning_rate": 5.475e-05, "loss": 0.024, "step": 3820 }, { "grad_norm": 0.28700438141822815, "learning_rate": 5.725e-05, "loss": 0.0276, "step": 3830 }, { "grad_norm": 0.31673601269721985, "learning_rate": 5.975000000000001e-05, "loss": 0.0263, "step": 3840 }, { "grad_norm": 0.4141864478588104, "learning_rate": 6.225000000000001e-05, "loss": 0.0276, "step": 3850 }, { "grad_norm": 0.2552778720855713, "learning_rate": 6.475e-05, "loss": 0.0279, "step": 3860 }, { "grad_norm": 0.3868294060230255, "learning_rate": 6.725000000000001e-05, "loss": 0.0264, "step": 3870 }, { "grad_norm": 0.3118395507335663, "learning_rate": 6.975e-05, "loss": 0.0299, "step": 3880 }, { "grad_norm": 0.3634006381034851, "learning_rate": 7.225000000000001e-05, "loss": 0.0305, "step": 3890 }, { "grad_norm": 0.3221040666103363, "learning_rate": 7.475000000000001e-05, "loss": 0.0286, "step": 3900 }, { "grad_norm": 0.2916862666606903, "learning_rate": 2.25e-06, "loss": 0.0317, "step": 3910 }, { "grad_norm": 0.14725710451602936, "learning_rate": 4.75e-06, "loss": 0.0246, "step": 3920 }, { "grad_norm": 0.08059115707874298, "learning_rate": 7.25e-06, "loss": 0.0201, "step": 3930 }, { "grad_norm": 0.08749918639659882, "learning_rate": 9.750000000000002e-06, "loss": 0.0192, "step": 3940 }, { "grad_norm": 0.0790024921298027, "learning_rate": 1.225e-05, "loss": 0.0198, "step": 3950 }, { "grad_norm": 0.09417396783828735, "learning_rate": 1.475e-05, "loss": 0.0177, "step": 3960 }, { "grad_norm": 0.10084370523691177, "learning_rate": 1.725e-05, "loss": 0.0191, "step": 3970 }, { "grad_norm": 0.1147816851735115, "learning_rate": 1.9750000000000002e-05, "loss": 0.019, "step": 3980 }, { "grad_norm": 0.12255968898534775, "learning_rate": 2.2250000000000002e-05, "loss": 0.0208, "step": 3990 }, { "grad_norm": 0.21064911782741547, "learning_rate": 2.4750000000000002e-05, "loss": 0.0178, "step": 4000 }, { "grad_norm": 0.16038502752780914, "learning_rate": 2.25e-06, "loss": 0.0163, "step": 4010 }, { "grad_norm": 0.06728145480155945, "learning_rate": 4.75e-06, "loss": 0.0173, "step": 4020 }, { "grad_norm": 0.08970823884010315, "learning_rate": 7.25e-06, "loss": 0.0163, "step": 4030 }, { "grad_norm": 0.08368533849716187, "learning_rate": 9.750000000000002e-06, "loss": 0.0167, "step": 4040 }, { "grad_norm": 0.14921842515468597, "learning_rate": 1.225e-05, "loss": 0.0188, "step": 4050 }, { "grad_norm": 0.11368093639612198, "learning_rate": 1.475e-05, "loss": 0.0163, "step": 4060 }, { "grad_norm": 0.11728593707084656, "learning_rate": 1.725e-05, "loss": 0.0161, "step": 4070 }, { "grad_norm": 0.13769982755184174, "learning_rate": 1.9750000000000002e-05, "loss": 0.0179, "step": 4080 }, { "grad_norm": 0.13639125227928162, "learning_rate": 2.2250000000000002e-05, "loss": 0.0178, "step": 4090 }, { "grad_norm": 0.18457946181297302, "learning_rate": 2.4750000000000002e-05, "loss": 0.0181, "step": 4100 }, { "grad_norm": 0.11619122326374054, "learning_rate": 2.25e-06, "loss": 0.0168, "step": 4110 }, { "grad_norm": 0.07324390113353729, "learning_rate": 4.75e-06, "loss": 0.0155, "step": 4120 }, { "grad_norm": 0.06126878410577774, "learning_rate": 7.25e-06, "loss": 0.0153, "step": 4130 }, { "grad_norm": 0.0816902294754982, "learning_rate": 9.750000000000002e-06, "loss": 0.0156, "step": 4140 }, { "grad_norm": 0.10470571368932724, "learning_rate": 1.225e-05, "loss": 0.0164, "step": 4150 }, { "grad_norm": 0.14720933139324188, "learning_rate": 1.475e-05, "loss": 0.0161, "step": 4160 }, { "grad_norm": 0.1340453177690506, "learning_rate": 1.725e-05, "loss": 0.0159, "step": 4170 }, { "grad_norm": 0.17479538917541504, "learning_rate": 1.9750000000000002e-05, "loss": 0.0168, "step": 4180 }, { "grad_norm": 0.20140662789344788, "learning_rate": 2.2250000000000002e-05, "loss": 0.0153, "step": 4190 }, { "grad_norm": 0.16525106132030487, "learning_rate": 2.4750000000000002e-05, "loss": 0.0165, "step": 4200 }, { "grad_norm": 0.20493212342262268, "learning_rate": 2.725e-05, "loss": 0.0177, "step": 4210 }, { "grad_norm": 0.18822354078292847, "learning_rate": 2.975e-05, "loss": 0.017, "step": 4220 }, { "grad_norm": 0.21991147100925446, "learning_rate": 3.2250000000000005e-05, "loss": 0.0212, "step": 4230 }, { "grad_norm": 0.1739887148141861, "learning_rate": 3.475e-05, "loss": 0.018, "step": 4240 }, { "grad_norm": 0.20195457339286804, "learning_rate": 3.7250000000000004e-05, "loss": 0.0204, "step": 4250 }, { "grad_norm": 0.27041324973106384, "learning_rate": 3.9750000000000004e-05, "loss": 0.0195, "step": 4260 }, { "grad_norm": 0.22696758806705475, "learning_rate": 4.2250000000000004e-05, "loss": 0.0214, "step": 4270 }, { "grad_norm": 0.3410264551639557, "learning_rate": 4.4750000000000004e-05, "loss": 0.0218, "step": 4280 }, { "grad_norm": 0.298494428396225, "learning_rate": 4.7249999999999997e-05, "loss": 0.023, "step": 4290 }, { "grad_norm": 0.30843472480773926, "learning_rate": 4.975e-05, "loss": 0.0213, "step": 4300 }, { "grad_norm": 0.32909199595451355, "learning_rate": 5.2249999999999996e-05, "loss": 0.0229, "step": 4310 }, { "grad_norm": 0.3116457462310791, "learning_rate": 5.475e-05, "loss": 0.0236, "step": 4320 }, { "grad_norm": 0.2727232277393341, "learning_rate": 5.725e-05, "loss": 0.0244, "step": 4330 }, { "grad_norm": 0.23119600117206573, "learning_rate": 5.975000000000001e-05, "loss": 0.0258, "step": 4340 }, { "grad_norm": 0.26668670773506165, "learning_rate": 6.225000000000001e-05, "loss": 0.024, "step": 4350 }, { "grad_norm": 0.36657702922821045, "learning_rate": 6.475e-05, "loss": 0.0267, "step": 4360 }, { "grad_norm": 0.24812264740467072, "learning_rate": 6.725000000000001e-05, "loss": 0.0274, "step": 4370 }, { "grad_norm": 0.459659606218338, "learning_rate": 6.975e-05, "loss": 0.0266, "step": 4380 }, { "grad_norm": 0.2510148584842682, "learning_rate": 7.225000000000001e-05, "loss": 0.029, "step": 4390 }, { "grad_norm": 0.4088396728038788, "learning_rate": 7.475000000000001e-05, "loss": 0.0279, "step": 4400 }, { "grad_norm": 0.34848520159721375, "learning_rate": 2.25e-06, "loss": 0.0284, "step": 4410 }, { "grad_norm": 0.15743011236190796, "learning_rate": 4.75e-06, "loss": 0.025, "step": 4420 }, { "grad_norm": 0.08282707631587982, "learning_rate": 7.25e-06, "loss": 0.0207, "step": 4430 }, { "grad_norm": 0.09306411445140839, "learning_rate": 9.750000000000002e-06, "loss": 0.0194, "step": 4440 }, { "grad_norm": 0.07907520979642868, "learning_rate": 1.225e-05, "loss": 0.0185, "step": 4450 }, { "grad_norm": 0.09091110527515411, "learning_rate": 1.475e-05, "loss": 0.0178, "step": 4460 }, { "grad_norm": 0.14218714833259583, "learning_rate": 1.725e-05, "loss": 0.0185, "step": 4470 }, { "grad_norm": 0.12567996978759766, "learning_rate": 1.9750000000000002e-05, "loss": 0.017, "step": 4480 }, { "grad_norm": 0.16725608706474304, "learning_rate": 2.2250000000000002e-05, "loss": 0.0179, "step": 4490 }, { "grad_norm": 0.19683462381362915, "learning_rate": 2.4750000000000002e-05, "loss": 0.0192, "step": 4500 }, { "grad_norm": 0.21441160142421722, "learning_rate": 2.725e-05, "loss": 0.0209, "step": 4510 }, { "grad_norm": 0.2019820213317871, "learning_rate": 2.975e-05, "loss": 0.0186, "step": 4520 }, { "grad_norm": 0.19397051632404327, "learning_rate": 3.2250000000000005e-05, "loss": 0.0185, "step": 4530 }, { "grad_norm": 0.19159729778766632, "learning_rate": 3.475e-05, "loss": 0.0188, "step": 4540 }, { "grad_norm": 0.18088988959789276, "learning_rate": 3.7250000000000004e-05, "loss": 0.0179, "step": 4550 }, { "grad_norm": 0.19023217260837555, "learning_rate": 3.9750000000000004e-05, "loss": 0.0194, "step": 4560 }, { "grad_norm": 0.24779464304447174, "learning_rate": 4.2250000000000004e-05, "loss": 0.0182, "step": 4570 }, { "grad_norm": 0.35198694467544556, "learning_rate": 4.4750000000000004e-05, "loss": 0.0221, "step": 4580 }, { "grad_norm": 0.2919461727142334, "learning_rate": 4.7249999999999997e-05, "loss": 0.0206, "step": 4590 }, { "grad_norm": 0.14702706038951874, "learning_rate": 4.975e-05, "loss": 0.0233, "step": 4600 }, { "grad_norm": 0.46422693133354187, "learning_rate": 5.2249999999999996e-05, "loss": 0.0241, "step": 4610 }, { "grad_norm": 0.3457808196544647, "learning_rate": 5.475e-05, "loss": 0.0258, "step": 4620 }, { "grad_norm": 0.36458754539489746, "learning_rate": 5.725e-05, "loss": 0.0255, "step": 4630 }, { "grad_norm": 0.2495424449443817, "learning_rate": 5.975000000000001e-05, "loss": 0.0255, "step": 4640 }, { "grad_norm": 0.2907949388027191, "learning_rate": 6.225000000000001e-05, "loss": 0.0248, "step": 4650 }, { "grad_norm": 0.312218576669693, "learning_rate": 6.475e-05, "loss": 0.0232, "step": 4660 }, { "grad_norm": 0.32994410395622253, "learning_rate": 6.725000000000001e-05, "loss": 0.0284, "step": 4670 }, { "grad_norm": 0.39620542526245117, "learning_rate": 6.975e-05, "loss": 0.0286, "step": 4680 }, { "grad_norm": 0.35751959681510925, "learning_rate": 7.225000000000001e-05, "loss": 0.0259, "step": 4690 }, { "grad_norm": 0.3610899746417999, "learning_rate": 7.475000000000001e-05, "loss": 0.0289, "step": 4700 }, { "grad_norm": 0.38666731119155884, "learning_rate": 7.725e-05, "loss": 0.0278, "step": 4710 }, { "grad_norm": 0.3191392421722412, "learning_rate": 7.975e-05, "loss": 0.0299, "step": 4720 }, { "grad_norm": 0.2874530255794525, "learning_rate": 8.225000000000001e-05, "loss": 0.0306, "step": 4730 }, { "grad_norm": 0.282899409532547, "learning_rate": 8.475000000000001e-05, "loss": 0.029, "step": 4740 }, { "grad_norm": 0.39327189326286316, "learning_rate": 8.725e-05, "loss": 0.0306, "step": 4750 }, { "grad_norm": 0.41812261939048767, "learning_rate": 8.975e-05, "loss": 0.0343, "step": 4760 }, { "grad_norm": 0.37506914138793945, "learning_rate": 9.225e-05, "loss": 0.0304, "step": 4770 }, { "grad_norm": 0.34700652956962585, "learning_rate": 9.475e-05, "loss": 0.0337, "step": 4780 }, { "grad_norm": 0.39856013655662537, "learning_rate": 9.725e-05, "loss": 0.0334, "step": 4790 }, { "grad_norm": 0.3455595076084137, "learning_rate": 9.975000000000001e-05, "loss": 0.0355, "step": 4800 }, { "grad_norm": 0.32341066002845764, "learning_rate": 2.25e-06, "loss": 0.0319, "step": 4810 }, { "grad_norm": 0.18348988890647888, "learning_rate": 4.75e-06, "loss": 0.0278, "step": 4820 }, { "grad_norm": 0.0759195014834404, "learning_rate": 7.25e-06, "loss": 0.0205, "step": 4830 }, { "grad_norm": 0.06178135424852371, "learning_rate": 9.750000000000002e-06, "loss": 0.0207, "step": 4840 }, { "grad_norm": 0.061087287962436676, "learning_rate": 1.225e-05, "loss": 0.0191, "step": 4850 }, { "grad_norm": 0.08582450449466705, "learning_rate": 1.475e-05, "loss": 0.0178, "step": 4860 }, { "grad_norm": 0.11223307996988297, "learning_rate": 1.725e-05, "loss": 0.0187, "step": 4870 }, { "grad_norm": 0.11273021996021271, "learning_rate": 1.9750000000000002e-05, "loss": 0.0181, "step": 4880 }, { "grad_norm": 0.12196135520935059, "learning_rate": 2.2250000000000002e-05, "loss": 0.02, "step": 4890 }, { "grad_norm": 0.15081153810024261, "learning_rate": 2.4750000000000002e-05, "loss": 0.0181, "step": 4900 }, { "grad_norm": 0.2181614339351654, "learning_rate": 2.725e-05, "loss": 0.0163, "step": 4910 }, { "grad_norm": 0.2155621200799942, "learning_rate": 2.975e-05, "loss": 0.0192, "step": 4920 }, { "grad_norm": 0.22211797535419464, "learning_rate": 3.2250000000000005e-05, "loss": 0.0195, "step": 4930 }, { "grad_norm": 0.13025997579097748, "learning_rate": 3.475e-05, "loss": 0.0207, "step": 4940 }, { "grad_norm": 0.21118290722370148, "learning_rate": 3.7250000000000004e-05, "loss": 0.02, "step": 4950 }, { "grad_norm": 0.21603266894817352, "learning_rate": 3.9750000000000004e-05, "loss": 0.0207, "step": 4960 }, { "grad_norm": 0.2582048177719116, "learning_rate": 4.2250000000000004e-05, "loss": 0.0196, "step": 4970 }, { "grad_norm": 0.2178124189376831, "learning_rate": 4.4750000000000004e-05, "loss": 0.0198, "step": 4980 }, { "grad_norm": 0.3020895719528198, "learning_rate": 4.7249999999999997e-05, "loss": 0.0202, "step": 4990 }, { "grad_norm": 0.1668292135000229, "learning_rate": 4.975e-05, "loss": 0.021, "step": 5000 }, { "grad_norm": 0.14442867040634155, "learning_rate": 2.25e-06, "loss": 0.0177, "step": 5010 }, { "grad_norm": 0.054793987423181534, "learning_rate": 4.75e-06, "loss": 0.0162, "step": 5020 }, { "grad_norm": 0.0670381411910057, "learning_rate": 7.25e-06, "loss": 0.0144, "step": 5030 }, { "grad_norm": 0.0958305150270462, "learning_rate": 9.750000000000002e-06, "loss": 0.0155, "step": 5040 }, { "grad_norm": 0.12960484623908997, "learning_rate": 1.225e-05, "loss": 0.018, "step": 5050 }, { "grad_norm": 0.15243755280971527, "learning_rate": 1.475e-05, "loss": 0.0154, "step": 5060 }, { "grad_norm": 0.12196221947669983, "learning_rate": 1.725e-05, "loss": 0.0141, "step": 5070 }, { "grad_norm": 0.16550977528095245, "learning_rate": 1.9750000000000002e-05, "loss": 0.0136, "step": 5080 }, { "grad_norm": 0.20836178958415985, "learning_rate": 2.2250000000000002e-05, "loss": 0.0177, "step": 5090 }, { "grad_norm": 0.14733245968818665, "learning_rate": 2.4750000000000002e-05, "loss": 0.0146, "step": 5100 }, { "grad_norm": 0.14296764135360718, "learning_rate": 2.725e-05, "loss": 0.0155, "step": 5110 }, { "grad_norm": 0.21847884356975555, "learning_rate": 2.975e-05, "loss": 0.0156, "step": 5120 }, { "grad_norm": 0.22458617389202118, "learning_rate": 3.2250000000000005e-05, "loss": 0.0171, "step": 5130 }, { "grad_norm": 0.21109852194786072, "learning_rate": 3.475e-05, "loss": 0.0179, "step": 5140 }, { "grad_norm": 0.2382119745016098, "learning_rate": 3.7250000000000004e-05, "loss": 0.0193, "step": 5150 }, { "grad_norm": 0.17990069091320038, "learning_rate": 3.9750000000000004e-05, "loss": 0.0179, "step": 5160 }, { "grad_norm": 0.3462783396244049, "learning_rate": 4.2250000000000004e-05, "loss": 0.0198, "step": 5170 }, { "grad_norm": 0.23819074034690857, "learning_rate": 4.4750000000000004e-05, "loss": 0.0186, "step": 5180 }, { "grad_norm": 0.2674412727355957, "learning_rate": 4.7249999999999997e-05, "loss": 0.019, "step": 5190 }, { "grad_norm": 0.25820428133010864, "learning_rate": 4.975e-05, "loss": 0.0213, "step": 5200 }, { "grad_norm": 0.2059033066034317, "learning_rate": 5.2249999999999996e-05, "loss": 0.0209, "step": 5210 }, { "grad_norm": 0.2888059914112091, "learning_rate": 5.475e-05, "loss": 0.0205, "step": 5220 }, { "grad_norm": 0.32638850808143616, "learning_rate": 5.725e-05, "loss": 0.023, "step": 5230 }, { "grad_norm": 0.21178038418293, "learning_rate": 5.975000000000001e-05, "loss": 0.022, "step": 5240 }, { "grad_norm": 0.3222191631793976, "learning_rate": 6.225000000000001e-05, "loss": 0.0258, "step": 5250 }, { "grad_norm": 0.3032192587852478, "learning_rate": 6.475e-05, "loss": 0.0237, "step": 5260 }, { "grad_norm": 0.25703561305999756, "learning_rate": 6.725000000000001e-05, "loss": 0.0278, "step": 5270 }, { "grad_norm": 0.3034331202507019, "learning_rate": 6.975e-05, "loss": 0.0256, "step": 5280 }, { "grad_norm": 0.39209669828414917, "learning_rate": 7.225000000000001e-05, "loss": 0.0258, "step": 5290 }, { "grad_norm": 0.35498857498168945, "learning_rate": 7.475000000000001e-05, "loss": 0.0268, "step": 5300 }, { "grad_norm": 0.3415033519268036, "learning_rate": 7.725e-05, "loss": 0.0272, "step": 5310 }, { "grad_norm": 0.3076319992542267, "learning_rate": 7.975e-05, "loss": 0.0284, "step": 5320 }, { "grad_norm": 0.3017461895942688, "learning_rate": 8.225000000000001e-05, "loss": 0.0273, "step": 5330 }, { "grad_norm": 0.3462255299091339, "learning_rate": 8.475000000000001e-05, "loss": 0.0311, "step": 5340 }, { "grad_norm": 0.34766459465026855, "learning_rate": 8.725e-05, "loss": 0.0299, "step": 5350 }, { "grad_norm": 0.34003376960754395, "learning_rate": 8.975e-05, "loss": 0.0322, "step": 5360 }, { "grad_norm": 0.3623153865337372, "learning_rate": 9.225e-05, "loss": 0.0323, "step": 5370 }, { "grad_norm": 0.27785149216651917, "learning_rate": 9.475e-05, "loss": 0.0322, "step": 5380 }, { "grad_norm": 0.3811187446117401, "learning_rate": 9.725e-05, "loss": 0.0327, "step": 5390 }, { "grad_norm": 0.41551879048347473, "learning_rate": 9.975000000000001e-05, "loss": 0.0345, "step": 5400 }, { "grad_norm": 0.38140812516212463, "learning_rate": 9.999965398327804e-05, "loss": 0.0362, "step": 5410 }, { "grad_norm": 0.3732677102088928, "learning_rate": 9.999845788223949e-05, "loss": 0.0326, "step": 5420 }, { "grad_norm": 0.3232232332229614, "learning_rate": 9.999640744550616e-05, "loss": 0.0324, "step": 5430 }, { "grad_norm": 0.3319040536880493, "learning_rate": 9.999350270811438e-05, "loss": 0.0315, "step": 5440 }, { "grad_norm": 0.43266093730926514, "learning_rate": 9.99897437196981e-05, "loss": 0.0332, "step": 5450 }, { "grad_norm": 0.3082657754421234, "learning_rate": 9.998513054448802e-05, "loss": 0.0318, "step": 5460 }, { "grad_norm": 0.4122726321220398, "learning_rate": 9.99796632613106e-05, "loss": 0.0348, "step": 5470 }, { "grad_norm": 0.29128387570381165, "learning_rate": 9.997334196358664e-05, "loss": 0.0303, "step": 5480 }, { "grad_norm": 0.2702691853046417, "learning_rate": 9.996616675932966e-05, "loss": 0.0314, "step": 5490 }, { "grad_norm": 0.37886449694633484, "learning_rate": 9.995813777114411e-05, "loss": 0.0332, "step": 5500 }, { "grad_norm": 0.24644127488136292, "learning_rate": 9.994925513622324e-05, "loss": 0.0324, "step": 5510 }, { "grad_norm": 0.29427140951156616, "learning_rate": 9.993951900634678e-05, "loss": 0.0314, "step": 5520 }, { "grad_norm": 0.1623036116361618, "learning_rate": 9.992892954787831e-05, "loss": 0.0313, "step": 5530 }, { "grad_norm": 0.32525572180747986, "learning_rate": 9.991748694176246e-05, "loss": 0.0317, "step": 5540 }, { "grad_norm": 0.39700940251350403, "learning_rate": 9.990519138352184e-05, "loss": 0.0319, "step": 5550 }, { "grad_norm": 0.371387779712677, "learning_rate": 9.989204308325356e-05, "loss": 0.0316, "step": 5560 }, { "grad_norm": 0.382245272397995, "learning_rate": 9.987804226562582e-05, "loss": 0.0321, "step": 5570 }, { "grad_norm": 0.3336215615272522, "learning_rate": 9.986318916987395e-05, "loss": 0.03, "step": 5580 }, { "grad_norm": 0.4367651641368866, "learning_rate": 9.984748404979642e-05, "loss": 0.0319, "step": 5590 }, { "grad_norm": 0.2899431586265564, "learning_rate": 9.983092717375033e-05, "loss": 0.0315, "step": 5600 }, { "grad_norm": 0.43919163942337036, "learning_rate": 9.981351882464706e-05, "loss": 0.0309, "step": 5610 }, { "grad_norm": 0.3506055772304535, "learning_rate": 9.979525929994727e-05, "loss": 0.0313, "step": 5620 }, { "grad_norm": 0.28003016114234924, "learning_rate": 9.977614891165581e-05, "loss": 0.0306, "step": 5630 }, { "grad_norm": 0.4560019075870514, "learning_rate": 9.975618798631653e-05, "loss": 0.0304, "step": 5640 }, { "grad_norm": 0.36266371607780457, "learning_rate": 9.973537686500656e-05, "loss": 0.0289, "step": 5650 }, { "grad_norm": 0.27017301321029663, "learning_rate": 9.971371590333053e-05, "loss": 0.0294, "step": 5660 }, { "grad_norm": 0.3721235692501068, "learning_rate": 9.969120547141453e-05, "loss": 0.0296, "step": 5670 }, { "grad_norm": 0.27416470646858215, "learning_rate": 9.966784595389971e-05, "loss": 0.0322, "step": 5680 }, { "grad_norm": 0.311551958322525, "learning_rate": 9.96436377499358e-05, "loss": 0.0285, "step": 5690 }, { "grad_norm": 0.3233130872249603, "learning_rate": 9.961858127317419e-05, "loss": 0.0302, "step": 5700 }, { "grad_norm": 0.2842532992362976, "learning_rate": 9.959267695176096e-05, "loss": 0.0314, "step": 5710 }, { "grad_norm": 0.4247515797615051, "learning_rate": 9.956592522832946e-05, "loss": 0.0315, "step": 5720 }, { "grad_norm": 0.46176719665527344, "learning_rate": 9.953832655999284e-05, "loss": 0.0301, "step": 5730 }, { "grad_norm": 0.3245146572589874, "learning_rate": 9.950988141833621e-05, "loss": 0.0284, "step": 5740 }, { "grad_norm": 0.3081766963005066, "learning_rate": 9.948059028940857e-05, "loss": 0.0286, "step": 5750 }, { "grad_norm": 0.30103135108947754, "learning_rate": 9.945045367371446e-05, "loss": 0.0292, "step": 5760 }, { "grad_norm": 0.45709553360939026, "learning_rate": 9.941947208620551e-05, "loss": 0.0334, "step": 5770 }, { "grad_norm": 0.3403961658477783, "learning_rate": 9.938764605627158e-05, "loss": 0.033, "step": 5780 }, { "grad_norm": 0.3128257989883423, "learning_rate": 9.935497612773167e-05, "loss": 0.0297, "step": 5790 }, { "grad_norm": 0.32093456387519836, "learning_rate": 9.932146285882477e-05, "loss": 0.0277, "step": 5800 }, { "grad_norm": 0.3948577046394348, "learning_rate": 9.928710682220017e-05, "loss": 0.0299, "step": 5810 }, { "grad_norm": 0.3564872741699219, "learning_rate": 9.925190860490772e-05, "loss": 0.0315, "step": 5820 }, { "grad_norm": 0.2942199110984802, "learning_rate": 9.921586880838784e-05, "loss": 0.0306, "step": 5830 }, { "grad_norm": 0.36606618762016296, "learning_rate": 9.917898804846124e-05, "loss": 0.0284, "step": 5840 }, { "grad_norm": 0.2491133213043213, "learning_rate": 9.914126695531833e-05, "loss": 0.0296, "step": 5850 }, { "grad_norm": 0.356919527053833, "learning_rate": 9.910270617350853e-05, "loss": 0.0299, "step": 5860 }, { "grad_norm": 0.28878551721572876, "learning_rate": 9.90633063619292e-05, "loss": 0.0298, "step": 5870 }, { "grad_norm": 0.23867709934711456, "learning_rate": 9.90230681938144e-05, "loss": 0.0299, "step": 5880 }, { "grad_norm": 0.2136927992105484, "learning_rate": 9.898199235672341e-05, "loss": 0.0276, "step": 5890 }, { "grad_norm": 0.28000661730766296, "learning_rate": 9.894007955252898e-05, "loss": 0.0298, "step": 5900 }, { "grad_norm": 0.24435938894748688, "learning_rate": 2.25e-06, "loss": 0.0261, "step": 5910 }, { "grad_norm": 0.1255520135164261, "learning_rate": 4.75e-06, "loss": 0.022, "step": 5920 }, { "grad_norm": 0.07361706346273422, "learning_rate": 7.25e-06, "loss": 0.0187, "step": 5930 }, { "grad_norm": 0.07405020296573639, "learning_rate": 9.750000000000002e-06, "loss": 0.0195, "step": 5940 }, { "grad_norm": 0.0707310363650322, "learning_rate": 1.225e-05, "loss": 0.0159, "step": 5950 }, { "grad_norm": 0.08231353759765625, "learning_rate": 1.475e-05, "loss": 0.0179, "step": 5960 }, { "grad_norm": 0.101494200527668, "learning_rate": 1.725e-05, "loss": 0.0217, "step": 5970 }, { "grad_norm": 0.13773274421691895, "learning_rate": 1.9750000000000002e-05, "loss": 0.0186, "step": 5980 }, { "grad_norm": 0.144323468208313, "learning_rate": 2.2250000000000002e-05, "loss": 0.0155, "step": 5990 }, { "grad_norm": 0.11110465973615646, "learning_rate": 2.4750000000000002e-05, "loss": 0.0167, "step": 6000 }, { "grad_norm": 0.15767154097557068, "learning_rate": 2.725e-05, "loss": 0.0181, "step": 6010 }, { "grad_norm": 0.20103546977043152, "learning_rate": 2.975e-05, "loss": 0.0169, "step": 6020 }, { "grad_norm": 0.13784171640872955, "learning_rate": 3.2250000000000005e-05, "loss": 0.018, "step": 6030 }, { "grad_norm": 0.10552939772605896, "learning_rate": 3.475e-05, "loss": 0.0169, "step": 6040 }, { "grad_norm": 0.19135749340057373, "learning_rate": 3.7250000000000004e-05, "loss": 0.0182, "step": 6050 }, { "grad_norm": 0.1571836769580841, "learning_rate": 3.9750000000000004e-05, "loss": 0.0169, "step": 6060 }, { "grad_norm": 0.21296827495098114, "learning_rate": 4.2250000000000004e-05, "loss": 0.0186, "step": 6070 }, { "grad_norm": 0.1799466907978058, "learning_rate": 4.4750000000000004e-05, "loss": 0.0192, "step": 6080 }, { "grad_norm": 0.2527002692222595, "learning_rate": 4.7249999999999997e-05, "loss": 0.0194, "step": 6090 }, { "grad_norm": 0.18937557935714722, "learning_rate": 4.975e-05, "loss": 0.0204, "step": 6100 }, { "grad_norm": 0.3399811387062073, "learning_rate": 5.2249999999999996e-05, "loss": 0.0191, "step": 6110 }, { "grad_norm": 0.3196389675140381, "learning_rate": 5.475e-05, "loss": 0.0216, "step": 6120 }, { "grad_norm": 0.22987596690654755, "learning_rate": 5.725e-05, "loss": 0.0193, "step": 6130 }, { "grad_norm": 0.25724199414253235, "learning_rate": 5.975000000000001e-05, "loss": 0.0219, "step": 6140 }, { "grad_norm": 0.18450762331485748, "learning_rate": 6.225000000000001e-05, "loss": 0.0216, "step": 6150 }, { "grad_norm": 0.2240147590637207, "learning_rate": 6.475e-05, "loss": 0.0204, "step": 6160 }, { "grad_norm": 0.2838953137397766, "learning_rate": 6.725000000000001e-05, "loss": 0.0231, "step": 6170 }, { "grad_norm": 0.2061644345521927, "learning_rate": 6.975e-05, "loss": 0.0238, "step": 6180 }, { "grad_norm": 0.277007520198822, "learning_rate": 7.225000000000001e-05, "loss": 0.0211, "step": 6190 }, { "grad_norm": 0.38259080052375793, "learning_rate": 7.475000000000001e-05, "loss": 0.0242, "step": 6200 }, { "grad_norm": 0.29872778058052063, "learning_rate": 7.725e-05, "loss": 0.0249, "step": 6210 }, { "grad_norm": 0.3128993809223175, "learning_rate": 7.975e-05, "loss": 0.0251, "step": 6220 }, { "grad_norm": 0.35145434737205505, "learning_rate": 8.225000000000001e-05, "loss": 0.0215, "step": 6230 }, { "grad_norm": 0.35203564167022705, "learning_rate": 8.475000000000001e-05, "loss": 0.0282, "step": 6240 }, { "grad_norm": 0.4482344388961792, "learning_rate": 8.725e-05, "loss": 0.0272, "step": 6250 }, { "grad_norm": 0.3477097451686859, "learning_rate": 8.975e-05, "loss": 0.0264, "step": 6260 }, { "grad_norm": 0.2589689791202545, "learning_rate": 9.225e-05, "loss": 0.0279, "step": 6270 }, { "grad_norm": 0.2147420048713684, "learning_rate": 9.475e-05, "loss": 0.0305, "step": 6280 }, { "grad_norm": 0.37397146224975586, "learning_rate": 9.725e-05, "loss": 0.0296, "step": 6290 }, { "grad_norm": 0.32392406463623047, "learning_rate": 9.975000000000001e-05, "loss": 0.0298, "step": 6300 }, { "grad_norm": 0.28752976655960083, "learning_rate": 2.25e-06, "loss": 0.0281, "step": 6310 }, { "grad_norm": 0.14773952960968018, "learning_rate": 4.75e-06, "loss": 0.0259, "step": 6320 }, { "grad_norm": 0.07288530468940735, "learning_rate": 7.25e-06, "loss": 0.019, "step": 6330 }, { "grad_norm": 0.056989990174770355, "learning_rate": 9.750000000000002e-06, "loss": 0.0177, "step": 6340 }, { "grad_norm": 0.0840739980340004, "learning_rate": 1.225e-05, "loss": 0.0183, "step": 6350 }, { "grad_norm": 0.07818028330802917, "learning_rate": 1.475e-05, "loss": 0.0169, "step": 6360 }, { "grad_norm": 0.0806255117058754, "learning_rate": 1.725e-05, "loss": 0.017, "step": 6370 }, { "grad_norm": 0.11104797571897507, "learning_rate": 1.9750000000000002e-05, "loss": 0.0163, "step": 6380 }, { "grad_norm": 0.1076383888721466, "learning_rate": 2.2250000000000002e-05, "loss": 0.0169, "step": 6390 }, { "grad_norm": 0.13541467487812042, "learning_rate": 2.4750000000000002e-05, "loss": 0.0171, "step": 6400 }, { "grad_norm": 0.1577892303466797, "learning_rate": 2.725e-05, "loss": 0.0159, "step": 6410 }, { "grad_norm": 0.14505299925804138, "learning_rate": 2.975e-05, "loss": 0.0151, "step": 6420 }, { "grad_norm": 0.1457807570695877, "learning_rate": 3.2250000000000005e-05, "loss": 0.0166, "step": 6430 }, { "grad_norm": 0.15191592276096344, "learning_rate": 3.475e-05, "loss": 0.0171, "step": 6440 }, { "grad_norm": 0.17364244163036346, "learning_rate": 3.7250000000000004e-05, "loss": 0.0167, "step": 6450 }, { "grad_norm": 0.12844876945018768, "learning_rate": 3.9750000000000004e-05, "loss": 0.0161, "step": 6460 }, { "grad_norm": 0.19524745643138885, "learning_rate": 4.2250000000000004e-05, "loss": 0.0182, "step": 6470 }, { "grad_norm": 0.2746567130088806, "learning_rate": 4.4750000000000004e-05, "loss": 0.0181, "step": 6480 }, { "grad_norm": 0.15469631552696228, "learning_rate": 4.7249999999999997e-05, "loss": 0.0164, "step": 6490 }, { "grad_norm": 0.3207453787326813, "learning_rate": 4.975e-05, "loss": 0.0187, "step": 6500 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }