{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06407176037161622, "grad_norm": 2816.0, "learning_rate": 0.00019982578397212544, "loss": 264.8085, "step": 50 }, { "epoch": 0.12814352074323243, "grad_norm": 310.0, "learning_rate": 0.00019547038327526132, "loss": 93.7181, "step": 100 }, { "epoch": 0.19221528111484862, "grad_norm": 187.0, "learning_rate": 0.0001911149825783972, "loss": 73.6341, "step": 150 }, { "epoch": 0.25628704148646486, "grad_norm": 180.0, "learning_rate": 0.0001867595818815331, "loss": 63.3109, "step": 200 }, { "epoch": 0.3203588018580811, "grad_norm": 152.0, "learning_rate": 0.000182404181184669, "loss": 58.1474, "step": 250 }, { "epoch": 0.38443056222969724, "grad_norm": 131.0, "learning_rate": 0.00017804878048780488, "loss": 53.3058, "step": 300 }, { "epoch": 0.44850232260131345, "grad_norm": 135.0, "learning_rate": 0.00017369337979094077, "loss": 50.364, "step": 350 }, { "epoch": 0.5125740829729297, "grad_norm": 133.0, "learning_rate": 0.00016933797909407668, "loss": 48.1949, "step": 400 }, { "epoch": 0.5766458433445459, "grad_norm": 116.0, "learning_rate": 0.00016498257839721257, "loss": 45.4348, "step": 450 }, { "epoch": 0.6407176037161622, "grad_norm": 108.5, "learning_rate": 0.00016062717770034843, "loss": 43.7144, "step": 500 }, { "epoch": 0.6407176037161622, "eval_loss": 1.6361274719238281, "eval_runtime": 55.126, "eval_samples_per_second": 190.727, "eval_steps_per_second": 2.993, "step": 500 }, { "epoch": 0.7047893640877783, "grad_norm": 106.0, "learning_rate": 0.00015627177700348432, "loss": 42.7101, "step": 550 }, { "epoch": 0.7688611244593945, "grad_norm": 114.0, "learning_rate": 0.0001519163763066202, "loss": 40.339, "step": 600 }, { "epoch": 0.8329328848310107, "grad_norm": 110.0, "learning_rate": 0.0001475609756097561, "loss": 38.7366, "step": 650 }, { "epoch": 0.8970046452026269, "grad_norm": 103.0, "learning_rate": 0.000143205574912892, "loss": 37.5428, "step": 700 }, { "epoch": 0.9610764055742431, "grad_norm": 99.5, "learning_rate": 0.00013885017421602788, "loss": 36.8851, "step": 750 }, { "epoch": 1.024347268941214, "grad_norm": 85.5, "learning_rate": 0.00013449477351916376, "loss": 31.5681, "step": 800 }, { "epoch": 1.0884190293128304, "grad_norm": 93.0, "learning_rate": 0.00013013937282229965, "loss": 25.5343, "step": 850 }, { "epoch": 1.1524907896844465, "grad_norm": 83.0, "learning_rate": 0.00012578397212543557, "loss": 24.5147, "step": 900 }, { "epoch": 1.2165625500560628, "grad_norm": 93.5, "learning_rate": 0.00012142857142857143, "loss": 24.456, "step": 950 }, { "epoch": 1.280634310427679, "grad_norm": 73.5, "learning_rate": 0.00011707317073170732, "loss": 23.506, "step": 1000 }, { "epoch": 1.280634310427679, "eval_loss": 1.2552359104156494, "eval_runtime": 55.6544, "eval_samples_per_second": 188.916, "eval_steps_per_second": 2.965, "step": 1000 }, { "epoch": 1.3447060707992953, "grad_norm": 81.5, "learning_rate": 0.0001127177700348432, "loss": 23.5644, "step": 1050 }, { "epoch": 1.4087778311709114, "grad_norm": 78.0, "learning_rate": 0.0001083623693379791, "loss": 23.0172, "step": 1100 }, { "epoch": 1.4728495915425277, "grad_norm": 80.5, "learning_rate": 0.00010400696864111498, "loss": 22.9003, "step": 1150 }, { "epoch": 1.5369213519141438, "grad_norm": 79.0, "learning_rate": 9.965156794425087e-05, "loss": 22.6571, "step": 1200 }, { "epoch": 1.6009931122857601, "grad_norm": 78.5, "learning_rate": 9.529616724738677e-05, "loss": 21.9341, "step": 1250 }, { "epoch": 1.6650648726573762, "grad_norm": 68.0, "learning_rate": 9.094076655052265e-05, "loss": 21.3732, "step": 1300 }, { "epoch": 1.7291366330289923, "grad_norm": 71.0, "learning_rate": 8.658536585365854e-05, "loss": 20.9112, "step": 1350 }, { "epoch": 1.7932083934006087, "grad_norm": 83.5, "learning_rate": 8.222996515679443e-05, "loss": 20.7182, "step": 1400 }, { "epoch": 1.857280153772225, "grad_norm": 71.0, "learning_rate": 7.787456445993033e-05, "loss": 20.6334, "step": 1450 }, { "epoch": 1.921351914143841, "grad_norm": 87.5, "learning_rate": 7.35191637630662e-05, "loss": 20.1598, "step": 1500 }, { "epoch": 1.921351914143841, "eval_loss": 1.0505120754241943, "eval_runtime": 55.8397, "eval_samples_per_second": 188.289, "eval_steps_per_second": 2.955, "step": 1500 }, { "epoch": 1.9854236745154572, "grad_norm": 70.0, "learning_rate": 6.916376306620209e-05, "loss": 19.4976, "step": 1550 }, { "epoch": 2.048694537882428, "grad_norm": 65.5, "learning_rate": 6.480836236933798e-05, "loss": 12.484, "step": 1600 }, { "epoch": 2.1127662982540447, "grad_norm": 61.25, "learning_rate": 6.0452961672473875e-05, "loss": 9.8898, "step": 1650 }, { "epoch": 2.176838058625661, "grad_norm": 61.25, "learning_rate": 5.6097560975609764e-05, "loss": 9.8554, "step": 1700 }, { "epoch": 2.240909818997277, "grad_norm": 59.0, "learning_rate": 5.1742160278745646e-05, "loss": 9.6838, "step": 1750 }, { "epoch": 2.304981579368893, "grad_norm": 70.0, "learning_rate": 4.7386759581881534e-05, "loss": 9.5104, "step": 1800 }, { "epoch": 2.3690533397405096, "grad_norm": 66.0, "learning_rate": 4.303135888501742e-05, "loss": 9.4922, "step": 1850 }, { "epoch": 2.4331251001121257, "grad_norm": 66.5, "learning_rate": 3.867595818815331e-05, "loss": 9.3802, "step": 1900 }, { "epoch": 2.497196860483742, "grad_norm": 61.0, "learning_rate": 3.43205574912892e-05, "loss": 9.135, "step": 1950 }, { "epoch": 2.561268620855358, "grad_norm": 64.5, "learning_rate": 2.9965156794425088e-05, "loss": 9.0416, "step": 2000 }, { "epoch": 2.561268620855358, "eval_loss": 1.1224490404129028, "eval_runtime": 53.1746, "eval_samples_per_second": 197.726, "eval_steps_per_second": 3.103, "step": 2000 }, { "epoch": 2.625340381226974, "grad_norm": 59.25, "learning_rate": 2.5609756097560977e-05, "loss": 9.083, "step": 2050 }, { "epoch": 2.6894121415985905, "grad_norm": 62.25, "learning_rate": 2.1254355400696865e-05, "loss": 9.0678, "step": 2100 }, { "epoch": 2.7534839019702066, "grad_norm": 53.5, "learning_rate": 1.6898954703832754e-05, "loss": 8.979, "step": 2150 }, { "epoch": 2.8175556623418228, "grad_norm": 57.25, "learning_rate": 1.2543554006968642e-05, "loss": 9.0874, "step": 2200 }, { "epoch": 2.8816274227134393, "grad_norm": 54.0, "learning_rate": 8.188153310104529e-06, "loss": 9.0697, "step": 2250 }, { "epoch": 2.9456991830850554, "grad_norm": 62.0, "learning_rate": 3.832752613240418e-06, "loss": 9.0172, "step": 2300 } ], "logging_steps": 50, "max_steps": 2343, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0212138843436483e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }