{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 11.0, "eval_steps": 500, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15625, "grad_norm": 1.4347870349884033, "learning_rate": 0.0004, "loss": 2.5907, "step": 5 }, { "epoch": 0.3125, "grad_norm": 1.8193588256835938, "learning_rate": 0.0009000000000000001, "loss": 2.0401, "step": 10 }, { "epoch": 0.46875, "grad_norm": 3.8659496307373047, "learning_rate": 0.0009936507936507937, "loss": 1.7812, "step": 15 }, { "epoch": 0.625, "grad_norm": 1.8431330919265747, "learning_rate": 0.0009857142857142857, "loss": 1.7989, "step": 20 }, { "epoch": 0.78125, "grad_norm": 1.5119725465774536, "learning_rate": 0.0009777777777777777, "loss": 1.8825, "step": 25 }, { "epoch": 0.9375, "grad_norm": 2.630089044570923, "learning_rate": 0.0009698412698412698, "loss": 1.8248, "step": 30 }, { "epoch": 1.09375, "grad_norm": 1.824265956878662, "learning_rate": 0.0009619047619047619, "loss": 1.4467, "step": 35 }, { "epoch": 1.25, "grad_norm": 2.9590811729431152, "learning_rate": 0.000953968253968254, "loss": 1.2601, "step": 40 }, { "epoch": 1.40625, "grad_norm": 2.812793731689453, "learning_rate": 0.000946031746031746, "loss": 1.2122, "step": 45 }, { "epoch": 1.5625, "grad_norm": 3.0169219970703125, "learning_rate": 0.0009380952380952381, "loss": 1.2601, "step": 50 }, { "epoch": 1.71875, "grad_norm": 3.0815634727478027, "learning_rate": 0.0009301587301587302, "loss": 1.2289, "step": 55 }, { "epoch": 1.875, "grad_norm": 2.0147032737731934, "learning_rate": 0.0009222222222222223, "loss": 1.2768, "step": 60 }, { "epoch": 2.03125, "grad_norm": 1.5649322271347046, "learning_rate": 0.0009142857142857143, "loss": 1.0438, "step": 65 }, { "epoch": 2.1875, "grad_norm": 1.8557994365692139, "learning_rate": 0.0009063492063492064, "loss": 0.5735, "step": 70 }, { "epoch": 2.34375, "grad_norm": 2.2182095050811768, "learning_rate": 0.0008984126984126985, "loss": 0.6358, "step": 75 }, { "epoch": 2.5, "grad_norm": 3.01589298248291, "learning_rate": 0.0008904761904761904, "loss": 0.5855, "step": 80 }, { "epoch": 2.65625, "grad_norm": 5.799967288970947, "learning_rate": 0.0008825396825396825, "loss": 0.6252, "step": 85 }, { "epoch": 2.8125, "grad_norm": 2.3106298446655273, "learning_rate": 0.0008746031746031746, "loss": 0.5751, "step": 90 }, { "epoch": 2.96875, "grad_norm": 5.094876766204834, "learning_rate": 0.0008666666666666667, "loss": 0.5947, "step": 95 }, { "epoch": 3.125, "grad_norm": 2.225015878677368, "learning_rate": 0.0008587301587301587, "loss": 0.3564, "step": 100 }, { "epoch": 3.28125, "grad_norm": 2.587599515914917, "learning_rate": 0.0008507936507936508, "loss": 0.2237, "step": 105 }, { "epoch": 3.4375, "grad_norm": 1.6697884798049927, "learning_rate": 0.0008428571428571429, "loss": 0.3015, "step": 110 }, { "epoch": 3.59375, "grad_norm": 1.5882270336151123, "learning_rate": 0.000834920634920635, "loss": 0.3354, "step": 115 }, { "epoch": 3.75, "grad_norm": 2.361110210418701, "learning_rate": 0.000826984126984127, "loss": 0.3349, "step": 120 }, { "epoch": 3.90625, "grad_norm": 2.789064884185791, "learning_rate": 0.0008190476190476191, "loss": 0.3435, "step": 125 }, { "epoch": 4.0625, "grad_norm": 1.9216350317001343, "learning_rate": 0.0008111111111111111, "loss": 0.3043, "step": 130 }, { "epoch": 4.21875, "grad_norm": 2.5817618370056152, "learning_rate": 0.0008031746031746032, "loss": 0.1865, "step": 135 }, { "epoch": 4.375, "grad_norm": 1.6777920722961426, "learning_rate": 0.0007952380952380952, "loss": 0.1989, "step": 140 }, { "epoch": 4.53125, "grad_norm": 1.9233051538467407, "learning_rate": 0.0007873015873015873, "loss": 0.1912, "step": 145 }, { "epoch": 4.6875, "grad_norm": 1.441566824913025, "learning_rate": 0.0007793650793650794, "loss": 0.1941, "step": 150 }, { "epoch": 4.84375, "grad_norm": 1.7641961574554443, "learning_rate": 0.0007714285714285715, "loss": 0.1927, "step": 155 }, { "epoch": 5.0, "grad_norm": 2.937122344970703, "learning_rate": 0.0007634920634920634, "loss": 0.2052, "step": 160 }, { "epoch": 5.15625, "grad_norm": 2.509890079498291, "learning_rate": 0.0007555555555555555, "loss": 0.1141, "step": 165 }, { "epoch": 5.3125, "grad_norm": 2.4012136459350586, "learning_rate": 0.0007476190476190476, "loss": 0.1059, "step": 170 }, { "epoch": 5.46875, "grad_norm": 1.8514283895492554, "learning_rate": 0.0007396825396825397, "loss": 0.1349, "step": 175 }, { "epoch": 5.625, "grad_norm": 3.880389451980591, "learning_rate": 0.0007317460317460317, "loss": 0.1334, "step": 180 }, { "epoch": 5.78125, "grad_norm": 1.4267611503601074, "learning_rate": 0.0007238095238095238, "loss": 0.1403, "step": 185 }, { "epoch": 5.9375, "grad_norm": 2.336212635040283, "learning_rate": 0.0007158730158730159, "loss": 0.1702, "step": 190 }, { "epoch": 6.09375, "grad_norm": 1.6925745010375977, "learning_rate": 0.000707936507936508, "loss": 0.1224, "step": 195 }, { "epoch": 6.25, "grad_norm": 2.5146541595458984, "learning_rate": 0.0007, "loss": 0.0903, "step": 200 }, { "epoch": 6.40625, "grad_norm": 1.3388704061508179, "learning_rate": 0.0006920634920634921, "loss": 0.0846, "step": 205 }, { "epoch": 6.5625, "grad_norm": 1.7252851724624634, "learning_rate": 0.0006841269841269842, "loss": 0.1077, "step": 210 }, { "epoch": 6.71875, "grad_norm": 1.524242639541626, "learning_rate": 0.0006761904761904763, "loss": 0.07, "step": 215 }, { "epoch": 6.875, "grad_norm": 2.5351924896240234, "learning_rate": 0.0006682539682539683, "loss": 0.1065, "step": 220 }, { "epoch": 7.03125, "grad_norm": 0.7822954654693604, "learning_rate": 0.0006603174603174604, "loss": 0.097, "step": 225 }, { "epoch": 7.1875, "grad_norm": 0.8932663202285767, "learning_rate": 0.0006523809523809525, "loss": 0.0564, "step": 230 }, { "epoch": 7.34375, "grad_norm": 2.202021360397339, "learning_rate": 0.0006444444444444444, "loss": 0.0899, "step": 235 }, { "epoch": 7.5, "grad_norm": 5.009730815887451, "learning_rate": 0.0006365079365079364, "loss": 0.0894, "step": 240 }, { "epoch": 7.65625, "grad_norm": 1.9714306592941284, "learning_rate": 0.0006285714285714285, "loss": 0.1092, "step": 245 }, { "epoch": 7.8125, "grad_norm": 1.9612032175064087, "learning_rate": 0.0006206349206349206, "loss": 0.0769, "step": 250 }, { "epoch": 7.96875, "grad_norm": 2.019763231277466, "learning_rate": 0.0006126984126984127, "loss": 0.066, "step": 255 }, { "epoch": 8.125, "grad_norm": 1.9064483642578125, "learning_rate": 0.0006047619047619047, "loss": 0.0798, "step": 260 }, { "epoch": 8.28125, "grad_norm": 1.2875497341156006, "learning_rate": 0.0005968253968253968, "loss": 0.0609, "step": 265 }, { "epoch": 8.4375, "grad_norm": 0.9999807476997375, "learning_rate": 0.0005888888888888889, "loss": 0.087, "step": 270 }, { "epoch": 8.59375, "grad_norm": 1.205248236656189, "learning_rate": 0.000580952380952381, "loss": 0.0791, "step": 275 }, { "epoch": 8.75, "grad_norm": 1.612770915031433, "learning_rate": 0.000573015873015873, "loss": 0.0768, "step": 280 }, { "epoch": 8.90625, "grad_norm": 1.8061549663543701, "learning_rate": 0.0005650793650793651, "loss": 0.0714, "step": 285 }, { "epoch": 9.0625, "grad_norm": 0.8511625528335571, "learning_rate": 0.0005571428571428572, "loss": 0.066, "step": 290 }, { "epoch": 9.21875, "grad_norm": 2.199141025543213, "learning_rate": 0.0005492063492063493, "loss": 0.0404, "step": 295 }, { "epoch": 9.375, "grad_norm": 1.1211730241775513, "learning_rate": 0.0005412698412698413, "loss": 0.0645, "step": 300 }, { "epoch": 9.53125, "grad_norm": 0.6761476993560791, "learning_rate": 0.0005333333333333334, "loss": 0.0731, "step": 305 }, { "epoch": 9.6875, "grad_norm": 0.9598909020423889, "learning_rate": 0.0005253968253968255, "loss": 0.053, "step": 310 }, { "epoch": 9.84375, "grad_norm": 1.074629545211792, "learning_rate": 0.0005174603174603176, "loss": 0.0606, "step": 315 }, { "epoch": 10.0, "grad_norm": 1.9772604703903198, "learning_rate": 0.0005095238095238095, "loss": 0.0523, "step": 320 }, { "epoch": 10.15625, "grad_norm": 0.6214720010757446, "learning_rate": 0.0005015873015873016, "loss": 0.0401, "step": 325 }, { "epoch": 10.3125, "grad_norm": 1.0092564821243286, "learning_rate": 0.0004936507936507937, "loss": 0.0343, "step": 330 }, { "epoch": 10.46875, "grad_norm": 0.7020597457885742, "learning_rate": 0.0004857142857142857, "loss": 0.0285, "step": 335 }, { "epoch": 10.625, "grad_norm": 0.5726391077041626, "learning_rate": 0.0004777777777777778, "loss": 0.042, "step": 340 }, { "epoch": 10.78125, "grad_norm": 0.4384658634662628, "learning_rate": 0.00046984126984126986, "loss": 0.043, "step": 345 }, { "epoch": 10.9375, "grad_norm": 0.40464839339256287, "learning_rate": 0.00046190476190476195, "loss": 0.0344, "step": 350 } ], "logging_steps": 5, "max_steps": 640, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.217928864934707e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }