{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 224, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15625, "grad_norm": 1.4347870349884033, "learning_rate": 0.0004, "loss": 2.5907, "step": 5 }, { "epoch": 0.3125, "grad_norm": 1.8193588256835938, "learning_rate": 0.0009000000000000001, "loss": 2.0401, "step": 10 }, { "epoch": 0.46875, "grad_norm": 3.8659496307373047, "learning_rate": 0.0009936507936507937, "loss": 1.7812, "step": 15 }, { "epoch": 0.625, "grad_norm": 1.8431330919265747, "learning_rate": 0.0009857142857142857, "loss": 1.7989, "step": 20 }, { "epoch": 0.78125, "grad_norm": 1.5119725465774536, "learning_rate": 0.0009777777777777777, "loss": 1.8825, "step": 25 }, { "epoch": 0.9375, "grad_norm": 2.630089044570923, "learning_rate": 0.0009698412698412698, "loss": 1.8248, "step": 30 }, { "epoch": 1.09375, "grad_norm": 1.824265956878662, "learning_rate": 0.0009619047619047619, "loss": 1.4467, "step": 35 }, { "epoch": 1.25, "grad_norm": 2.9590811729431152, "learning_rate": 0.000953968253968254, "loss": 1.2601, "step": 40 }, { "epoch": 1.40625, "grad_norm": 2.812793731689453, "learning_rate": 0.000946031746031746, "loss": 1.2122, "step": 45 }, { "epoch": 1.5625, "grad_norm": 3.0169219970703125, "learning_rate": 0.0009380952380952381, "loss": 1.2601, "step": 50 }, { "epoch": 1.71875, "grad_norm": 3.0815634727478027, "learning_rate": 0.0009301587301587302, "loss": 1.2289, "step": 55 }, { "epoch": 1.875, "grad_norm": 2.0147032737731934, "learning_rate": 0.0009222222222222223, "loss": 1.2768, "step": 60 }, { "epoch": 2.03125, "grad_norm": 1.5649322271347046, "learning_rate": 0.0009142857142857143, "loss": 1.0438, "step": 65 }, { "epoch": 2.1875, "grad_norm": 1.8557994365692139, "learning_rate": 0.0009063492063492064, "loss": 0.5735, "step": 70 }, { "epoch": 2.34375, "grad_norm": 2.2182095050811768, "learning_rate": 0.0008984126984126985, "loss": 0.6358, "step": 75 }, { "epoch": 2.5, "grad_norm": 3.01589298248291, "learning_rate": 0.0008904761904761904, "loss": 0.5855, "step": 80 }, { "epoch": 2.65625, "grad_norm": 5.799967288970947, "learning_rate": 0.0008825396825396825, "loss": 0.6252, "step": 85 }, { "epoch": 2.8125, "grad_norm": 2.3106298446655273, "learning_rate": 0.0008746031746031746, "loss": 0.5751, "step": 90 }, { "epoch": 2.96875, "grad_norm": 5.094876766204834, "learning_rate": 0.0008666666666666667, "loss": 0.5947, "step": 95 }, { "epoch": 3.125, "grad_norm": 2.225015878677368, "learning_rate": 0.0008587301587301587, "loss": 0.3564, "step": 100 }, { "epoch": 3.28125, "grad_norm": 2.587599515914917, "learning_rate": 0.0008507936507936508, "loss": 0.2237, "step": 105 }, { "epoch": 3.4375, "grad_norm": 1.6697884798049927, "learning_rate": 0.0008428571428571429, "loss": 0.3015, "step": 110 }, { "epoch": 3.59375, "grad_norm": 1.5882270336151123, "learning_rate": 0.000834920634920635, "loss": 0.3354, "step": 115 }, { "epoch": 3.75, "grad_norm": 2.361110210418701, "learning_rate": 0.000826984126984127, "loss": 0.3349, "step": 120 }, { "epoch": 3.90625, "grad_norm": 2.789064884185791, "learning_rate": 0.0008190476190476191, "loss": 0.3435, "step": 125 }, { "epoch": 4.0625, "grad_norm": 1.9216350317001343, "learning_rate": 0.0008111111111111111, "loss": 0.3043, "step": 130 }, { "epoch": 4.21875, "grad_norm": 2.5817618370056152, "learning_rate": 0.0008031746031746032, "loss": 0.1865, "step": 135 }, { "epoch": 4.375, "grad_norm": 1.6777920722961426, "learning_rate": 0.0007952380952380952, "loss": 0.1989, "step": 140 }, { "epoch": 4.53125, "grad_norm": 1.9233051538467407, "learning_rate": 0.0007873015873015873, "loss": 0.1912, "step": 145 }, { "epoch": 4.6875, "grad_norm": 1.441566824913025, "learning_rate": 0.0007793650793650794, "loss": 0.1941, "step": 150 }, { "epoch": 4.84375, "grad_norm": 1.7641961574554443, "learning_rate": 0.0007714285714285715, "loss": 0.1927, "step": 155 }, { "epoch": 5.0, "grad_norm": 2.937122344970703, "learning_rate": 0.0007634920634920634, "loss": 0.2052, "step": 160 }, { "epoch": 5.15625, "grad_norm": 2.509890079498291, "learning_rate": 0.0007555555555555555, "loss": 0.1141, "step": 165 }, { "epoch": 5.3125, "grad_norm": 2.4012136459350586, "learning_rate": 0.0007476190476190476, "loss": 0.1059, "step": 170 }, { "epoch": 5.46875, "grad_norm": 1.8514283895492554, "learning_rate": 0.0007396825396825397, "loss": 0.1349, "step": 175 }, { "epoch": 5.625, "grad_norm": 3.880389451980591, "learning_rate": 0.0007317460317460317, "loss": 0.1334, "step": 180 }, { "epoch": 5.78125, "grad_norm": 1.4267611503601074, "learning_rate": 0.0007238095238095238, "loss": 0.1403, "step": 185 }, { "epoch": 5.9375, "grad_norm": 2.336212635040283, "learning_rate": 0.0007158730158730159, "loss": 0.1702, "step": 190 }, { "epoch": 6.09375, "grad_norm": 1.6925745010375977, "learning_rate": 0.000707936507936508, "loss": 0.1224, "step": 195 }, { "epoch": 6.25, "grad_norm": 2.5146541595458984, "learning_rate": 0.0007, "loss": 0.0903, "step": 200 }, { "epoch": 6.40625, "grad_norm": 1.3388704061508179, "learning_rate": 0.0006920634920634921, "loss": 0.0846, "step": 205 }, { "epoch": 6.5625, "grad_norm": 1.7252851724624634, "learning_rate": 0.0006841269841269842, "loss": 0.1077, "step": 210 }, { "epoch": 6.71875, "grad_norm": 1.524242639541626, "learning_rate": 0.0006761904761904763, "loss": 0.07, "step": 215 }, { "epoch": 6.875, "grad_norm": 2.5351924896240234, "learning_rate": 0.0006682539682539683, "loss": 0.1065, "step": 220 } ], "logging_steps": 5, "max_steps": 640, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4149163280973824e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }